]> git.ipfire.org Git - thirdparty/mdadm.git/commitdiff
Merge branch 'master' into devel-3.0
authorNeilBrown <neilb@suse.de>
Tue, 2 Jun 2009 05:28:36 +0000 (15:28 +1000)
committerNeilBrown <neilb@suse.de>
Tue, 2 Jun 2009 05:28:36 +0000 (15:28 +1000)
Conflicts:
super0.c
super1.c

69 files changed:
.gitignore
ANNOUNCE-3.0-devel1 [new file with mode: 0644]
ANNOUNCE-3.0-devel2 [new file with mode: 0644]
ANNOUNCE-3.0-devel3 [new file with mode: 0644]
ANNOUNCE-3.0-rc1 [new file with mode: 0644]
Assemble.c
Build.c
Create.c
Detail.c
Examine.c
Grow.c
Incremental.c
Kill.c
Makefile
Manage.c
Monitor.c
Query.c
ReadMe.c
TODO
bitmap.c
config.c
crc32.c [new file with mode: 0644]
crc32.h [new file with mode: 0644]
inventory
kernel-patch-2.6.25 [new file with mode: 0644]
kernel-patch-2.6.27 [new file with mode: 0644]
makedist
managemon.c [new file with mode: 0644]
mapfile.c
md.4
mdadm.8
mdadm.c
mdadm.conf.5
mdadm.h
mdadm.spec
mdassemble.8
mdassemble.c
mdmon.8 [new file with mode: 0644]
mdmon.c [new file with mode: 0644]
mdmon.h [new file with mode: 0644]
mdopen.c
mdstat.c
monitor.c [new file with mode: 0644]
msg.c [new file with mode: 0644]
msg.h [new file with mode: 0644]
platform-intel.c [new file with mode: 0644]
platform-intel.h [new file with mode: 0644]
probe_roms.c [new file with mode: 0644]
probe_roms.h [new file with mode: 0644]
restripe.c
sg_io.c [new file with mode: 0644]
super-ddf.c [new file with mode: 0644]
super-intel.c [new file with mode: 0644]
super0.c
super1.c
sysfs.c
test
tests/01r5integ [new file with mode: 0644]
tests/01raid6integ [new file with mode: 0644]
tests/03r0assem
tests/03r5assemV1
tests/06name
tests/08imsm-overlap [new file with mode: 0644]
tests/09imsm-create-fail-rebuild [new file with mode: 0644]
tests/10ddf-create [new file with mode: 0644]
tests/env-08imsm-overlap [new file with mode: 0644]
tests/env-09imsm-create-fail-rebuild [new file with mode: 0644]
udev-md-raid.rules [new file with mode: 0644]
util.c

index 86e075ed6f7780b0e13fb2bd6310196e01bf8dcc..2503bd8bce6ed1b3d68ab350d9a84d0751a1f9f4 100644 (file)
@@ -3,3 +3,7 @@
 /*-stamp
 /mdadm
 /mdadm.udeb
+/mdmon
+/swap_super
+/test_stripe
+/TAGS
diff --git a/ANNOUNCE-3.0-devel1 b/ANNOUNCE-3.0-devel1
new file mode 100644 (file)
index 0000000..89ed2e3
--- /dev/null
@@ -0,0 +1,84 @@
+Subject:  ANNOUNCE: mdadm 3.0-devel1 - A tool for managing Soft RAID under Linux
+
+I am pleased to announce the availability of
+   mdadm version 3.0-devel1
+
+It is available at the usual places:
+   countrycode=xx.
+   http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+   git://neil.brown.name/mdadm
+   http://neil.brown.name/git?p=mdadm
+
+Note that this is a "devel" release.  It is not intended for
+production use yet, but rather for testing and ongoing development.
+
+The significant change which justifies the new major version number is
+that mdadm can now handle metadata updates entirely in userspace.
+This allows mdadm to support metadata formats that the kernel knows
+nothing about.
+
+Currently two such metadata formats are supported:
+  - DDF  - The SNIA standard format
+  - Intel Matrix - The metadata used by recent Intel ICH controllers.
+
+The manual pages have not yet been updated, but here is a brief outline.
+
+Externally managed metadata introduces the concept of a 'container'.
+A container is a collection of (normally) physical devices which have
+a common set of metadata.  A container is assembled as an md array, but
+is left 'inactive'.
+
+A container can contain one or more data arrays.  These are composed from
+slices (partitions?) of various devices in the container.
+
+For example, a 5-device DDF set can contain a RAID1 using the first
+half of two devices, a RAID0 using the first half of the remaining 3 devices,
+and a RAID5 over the second half of all 5 devices.
+
+A container can be created with
+
+   mdadm --create /dev/md0 -e ddf -n5 /dev/sd[abcde]
+
+or "-e imsm" to use the Intel Matrix Storage Manager.
+
+An array can be created within a container either by giving the
+container name and the only member:
+
+   mdadm -C /dev/md1 --level raid1 -n 2 /dev/md0
+
+or by listing the component devices
+
+   mdadm -C /dev/md2 --level raid0 -n 3 /dev/sd[cde]
+
+To assemble a container, it is easiest just to pass each device in turn to
+mdadm -I
+
+  for i in /dev/sd[abcde]
+  do mdadm -I $i
+  done
+
+This will assemble the container and the components.
+
+Alternately the container can be assembled explicitly
+
+   mdadm -A /dev/md0 /dev/sd[abcde]
+
+Then the components can all be assembled with
+
+   mdadm -I /dev/md0
+
+For each container, mdadm will start a program called "mdmon" which will
+monitor the array and effect any metadata updates needed.  The array is
+initially assembled readonly. It is up to "mdmon" to mark the metadata 
+as 'dirty' and switch the array to 'read-write'.
+
+The version 0.90 and 1.x metadata formats supported by previous
+versions of mdadm are still supported and the kernel still performs
+the same updates it used to.  The new 'mdmon' approach is only used for
+newly introduced metadata types.
+
+Any testing and feedback will be greatly appreciated.
+
+NeilBrown  18th September 2008
+
diff --git a/ANNOUNCE-3.0-devel2 b/ANNOUNCE-3.0-devel2
new file mode 100644 (file)
index 0000000..0f2924c
--- /dev/null
@@ -0,0 +1,98 @@
+Subject:  ANNOUNCE: mdadm 3.0-devel2 - A tool for managing Soft RAID under Linux
+
+I am pleased to announce the availability of
+   mdadm version 3.0-devel2
+
+It is available at the usual places:
+   countrycode=xx.
+   http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+   git://neil.brown.name/mdadm
+   http://neil.brown.name/git?p=mdadm
+
+Note that this is a "devel" release.  It should be used with
+caution, though it is believed to be close to release-candidate stage.
+
+The significant change which justifies the new major version number is
+that mdadm can now handle metadata updates entirely in userspace.
+This allows mdadm to support metadata formats that the kernel knows
+nothing about.
+
+Currently two such metadata formats are supported:
+  - DDF  - The SNIA standard format
+  - Intel Matrix - The metadata used by recent Intel ICH controllers.
+
+Also the approach to device names has changed significantly.
+
+If udev is installed on the system, mdadm will not create any devices
+in /dev.  Rather it allows udev to manage those devices.  For this to work
+as expected, the included udev rules file should be installed.
+
+If udev is not installed, mdadm will still create devices and symlinks
+as required, and will also remove them when the array is stopped.
+
+mdadm now requires all devices which do not have a standard name (mdX
+or md_dX) to live in the directory /dev/md/.  Names in this directory
+will always be created as symlinks back to the standard name in /dev.
+
+The man pages contain some information about the new externally managed
+metadata.  However see below for a more condensed overview.
+
+Externally managed metadata introduces the concept of a 'container'.
+A container is a collection of (normally) physical devices which have
+a common set of metadata.  A container is assembled as an md array, but
+is left 'inactive'.
+
+A container can contain one or more data arrays.  These are composed from
+slices (partitions?) of various devices in the container.
+
+For example, a 5-device DDF set can contain a RAID1 using the first
+half of two devices, a RAID0 using the first half of the remaining 3 devices,
+and a RAID5 over the second half of all 5 devices.
+
+A container can be created with
+
+   mdadm --create /dev/md0 -e ddf -n5 /dev/sd[abcde]
+
+or "-e imsm" to use the Intel Matrix Storage Manager.
+
+An array can be created within a container either by giving the
+container name and the only member:
+
+   mdadm -C /dev/md1 --level raid1 -n 2 /dev/md0
+
+or by listing the component devices
+
+   mdadm -C /dev/md2 --level raid0 -n 3 /dev/sd[cde]
+
+To assemble a container, it is easiest just to pass each device in turn to 
+mdadm -I
+
+  for i in /dev/sd[abcde]
+  do mdadm -I $i
+  done
+
+This will assemble the container and the components.
+
+Alternately the container can be assembled explicitly
+
+   mdadm -A /dev/md0 /dev/sd[abcde]
+
+Then the components can all be assembled with
+
+   mdadm -I /dev/md0
+
+For each container, mdadm will start a program called "mdmon" which will
+monitor the array and effect any metadata updates needed.  The array is
+initially assembled readonly. It is up to "mdmon" to mark the metadata 
+as 'dirty' and switch the array to 'read-write'.
+
+The version 0.90 and 1.x metadata formats supported by previous
+versions of mdadm are still supported and the kernel still performs
+the same updates it used to.  The new 'mdmon' approach is only used for
+newly introduced metadata types.
+
+Any testing and feedback will be greatly appreciated.
+
+NeilBrown  5th November 2008
+
diff --git a/ANNOUNCE-3.0-devel3 b/ANNOUNCE-3.0-devel3
new file mode 100644 (file)
index 0000000..078be07
--- /dev/null
@@ -0,0 +1,113 @@
+Subject:  ANNOUNCE: mdadm 3.0-devel3 - A tool for managing Soft RAID under Linux
+
+I am pleased to announce the availability of
+   mdadm version 3.0-devel3
+
+It is available at the usual places:
+   countrycode=xx.
+   http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+   git://neil.brown.name/mdadm
+   http://neil.brown.name/git?p=mdadm
+
+Note that this is a "devel" release.  It should be used with
+caution, though it is believed to be close to release-candidate stage.
+
+There have been numerous improvements and additions since -devel2.
+I think we are close to a release of 3.0.
+
+I need to add lots of tests to the test suite to test the new
+functionality.  And I need to review the man pages.
+
+After that I will release -rc1 followed by -final.
+
+
+The following is the same introduction to 3.x as appeared in
+previous announcements.
+
+
+Any testing and feedback will be greatly appreciated.
+
+NeilBrown  10th March 2009
+
+
+=====================================================
+
+The significant change which justifies the new major version number is
+that mdadm can now handle metadata updates entirely in userspace.
+This allows mdadm to support metadata formats that the kernel knows
+nothing about.
+
+Currently two such metadata formats are supported:
+  - DDF  - The SNIA standard format
+  - Intel Matrix - The metadata used by recent Intel ICH controllers.
+
+Also the approach to device names has changed significantly.
+
+If udev is installed on the system, mdadm will not create any devices
+in /dev.  Rather it allows udev to manage those devices.  For this to work
+as expected, the included udev rules file should be installed.
+
+If udev is not installed, mdadm will still create devices and symlinks
+as required, and will also remove them when the array is stopped.
+
+mdadm now requires all devices which do not have a standard name (mdX
+or md_dX) to live in the directory /dev/md/.  Names in this directory
+will always be created as symlinks back to the standard name in /dev.
+
+The man pages contain some information about the new externally managed
+metadata.  However see below for a more condensed overview.
+
+Externally managed metadata introduces the concept of a 'container'.
+A container is a collection of (normally) physical devices which have
+a common set of metadata.  A container is assembled as an md array, but
+is left 'inactive'.
+
+A container can contain one or more data arrays.  These are composed from
+slices (partitions?) of various devices in the container.
+
+For example, a 5-device DDF set can contain a RAID1 using the first
+half of two devices, a RAID0 using the first half of the remaining 3 devices,
+and a RAID5 over the second half of all 5 devices.
+
+A container can be created with
+
+   mdadm --create /dev/md0 -e ddf -n5 /dev/sd[abcde]
+
+or "-e imsm" to use the Intel Matrix Storage Manager.
+
+An array can be created within a container either by giving the
+container name and the only member:
+
+   mdadm -C /dev/md1 --level raid1 -n 2 /dev/md0
+
+or by listing the component devices
+
+   mdadm -C /dev/md2 --level raid0 -n 3 /dev/sd[cde]
+
+To assemble a container, it is easiest just to pass each device in turn to 
+mdadm -I
+
+  for i in /dev/sd[abcde]
+  do mdadm -I $i
+  done
+
+This will assemble the container and the components.
+
+Alternately the container can be assembled explicitly
+
+   mdadm -A /dev/md0 /dev/sd[abcde]
+
+Then the components can all be assembled with
+
+   mdadm -I /dev/md0
+
+For each container, mdadm will start a program called "mdmon" which will
+monitor the array and effect any metadata updates needed.  The array is
+initially assembled readonly. It is up to "mdmon" to mark the metadata 
+as 'dirty' and switch the array to 'read-write'.
+
+The version 0.90 and 1.x metadata formats supported by previous
+versions of mdadm are still supported and the kernel still performs
+the same updates it used to.  The new 'mdmon' approach is only used for
+newly introduced metadata types.
diff --git a/ANNOUNCE-3.0-rc1 b/ANNOUNCE-3.0-rc1
new file mode 100644 (file)
index 0000000..c6269d4
--- /dev/null
@@ -0,0 +1,139 @@
+Subject:  ANNOUNCE: mdadm 3.0-rc1 - A tool for managing Soft RAID under Linux
+
+I am pleased to announce the availability of
+   mdadm version 3.0-rc1
+
+It is available at the usual places:
+   countrycode=xx.
+   http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+   git://neil.brown.name/mdadm
+   http://neil.brown.name/git?p=mdadm
+
+This is a "release candidate" which means that I think it is safe
+to use and that there will be no significant change in functionality
+before release.
+
+The man pages aren't really "release candidate" yet but I will be
+working on them before the final release.
+
+The most significant changes since -devel3 relate to the names of md
+devices as they appear in /dev and /dev/md/, and in particular the names
+that are used when an array is assembled with "--incremental" or with
+"mdadm --assemble --scan" when there are no ARRAY lines in mdadm.conf.
+In these cases mdadm needs to deduce a name to use, and to try to
+avoid using a name that a different array might have a stronger claim to.
+The rules are:
+  - if the array is mentioned in mdadm.conf, use the name given there.
+  - if the array appears to have been created for "this host" using the
+    "homehost" concept, trust the name given in the metadata
+  - if the new setting "HOMEHOST <ignore>" is given (can be in mdadm.conf
+    or on command line) and the name given in the metadata is not
+    associated with some other array by mdadm.conf, then trust the
+    name given in the metadata
+  - otherwise use the name in the metadata, but in an untrusted manner.
+
+If a name is untrusted, or if the name is already in use by another
+array, then a numeric suffix like "_0", "_1" is appended to create
+a unique name for the array.
+
+That name is then used to create a device file in /dev/md/.
+
+So if all arrays needed for boot will always be listed in
+/etc/mdadm.conf, then it is appropriate to add "HOMEHOST <ignore>" to
+mdadm.conf and there is no risk of conflicting names.  However if you
+want auto-assemble to assemble all arrays at boot time and you don't
+want to list them in mdadm.conf, then don't give "HOMEHOST <ignore>"
+either, or else there could be a risk of the wrong array being assembled
+for a given name.
+
+
+
+The following is the same introduction to 3.x as appeared in
+previous announcements.
+
+Any testing and feedback will be greatly appreciated.
+
+NeilBrown  11th May 2009
+
+
+=====================================================
+
+The significant change which justifies the new major version number is
+that mdadm can now handle metadata updates entirely in userspace.
+This allows mdadm to support metadata formats that the kernel knows
+nothing about.
+
+Currently two such metadata formats are supported:
+  - DDF  - The SNIA standard format
+  - Intel Matrix - The metadata used by recent Intel ICH controllers.
+
+Also the approach to device names has changed significantly.
+
+If udev is installed on the system, mdadm will not create any devices
+in /dev.  Rather it allows udev to manage those devices.  For this to work
+as expected, the included udev rules file should be installed.
+
+If udev is not installed, mdadm will still create devices and symlinks
+as required, and will also remove them when the array is stopped.
+
+mdadm now requires all devices which do not have a standard name (mdX
+or md_dX) to live in the directory /dev/md/.  Names in this directory
+will always be created as symlinks back to the standard name in /dev.
+
+The man pages contain some information about the new externally managed
+metadata.  However see below for a more condensed overview.
+
+Externally managed metadata introduces the concept of a 'container'.
+A container is a collection of (normally) physical devices which have
+a common set of metadata.  A container is assembled as an md array, but
+is left 'inactive'.
+
+A container can contain one or more data arrays.  These are composed from
+slices (partitions?) of various devices in the container.
+
+For example, a 5-device DDF set can contain a RAID1 using the first
+half of two devices, a RAID0 using the first half of the remaining 3 devices,
+and a RAID5 over the second half of all 5 devices.
+
+A container can be created with
+
+   mdadm --create /dev/md0 -e ddf -n5 /dev/sd[abcde]
+
+or "-e imsm" to use the Intel Matrix Storage Manager.
+
+An array can be created within a container either by giving the
+container name and the only member:
+
+   mdadm -C /dev/md1 --level raid1 -n 2 /dev/md0
+
+or by listing the component devices
+
+   mdadm -C /dev/md2 --level raid0 -n 3 /dev/sd[cde]
+
+To assemble a container, it is easiest just to pass each device in turn to 
+mdadm -I
+
+  for i in /dev/sd[abcde]
+  do mdadm -I $i
+  done
+
+This will assemble the container and the components.
+
+Alternately the container can be assembled explicitly
+
+   mdadm -A /dev/md0 /dev/sd[abcde]
+
+Then the components can all be assembled with
+
+   mdadm -I /dev/md0
+
+For each container, mdadm will start a program called "mdmon" which will
+monitor the array and effect any metadata updates needed.  The array is
+initially assembled readonly. It is up to "mdmon" to mark the metadata 
+as 'dirty' and switch the array to 'read-write'.
+
+The version 0.90 and 1.x metadata formats supported by previous
+versions of mdadm are still supported and the kernel still performs
+the same updates it used to.  The new 'mdmon' approach is only used for
+newly introduced metadata types.
index ab8faeddf1d142cfa448d76859c51619f27cecfa..3c3a004fc8045a67b16351a4bde7d7d3b67f8c04 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * mdadm - manage Linux "md" devices aka RAID arrays.
  *
- * Copyright (C) 2001-2006 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
  *
  *
  *    This program is free software; you can redistribute it and/or modify
  *    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  *
  *    Author: Neil Brown
- *    Email: <neilb@cse.unsw.edu.au>
- *    Paper: Neil Brown
- *           School of Computer Science and Engineering
- *           The University of New South Wales
- *           Sydney, 2052
- *           Australia
+ *    Email: <neilb@suse.de>
  */
 
 #include       "mdadm.h"
@@ -50,11 +45,36 @@ static int name_matches(char *found, char *required, char *homehost)
        return 0;
 }
 
-int Assemble(struct supertype *st, char *mddev, int mdfd,
+static int is_member_busy(char *metadata_version)
+{
+       /* check if the given member array is active.
+        * Walks the list returned by mdstat_read() looking for an entry
+        * whose metadata_version is "external:" followed by a subarray
+        * reference (as judged by is_subarray()) that equals the caller's
+        * metadata_version once the first character of each is ignored.
+        * Returns 1 if such an entry is found and 0 otherwise; the mdstat
+        * list is freed before returning in either case.
+        */
+       struct mdstat_ent *mdstat = mdstat_read(1, 0);
+       struct mdstat_ent *ent;
+       int busy = 0;
+
+       for (ent = mdstat; ent; ent = ent->next) {
+               if (ent->metadata_version == NULL)
+                       continue;
+               if (strncmp(ent->metadata_version, "external:", 9) != 0)
+                       continue;
+               if (!is_subarray(&ent->metadata_version[9]))
+                       continue;
+               /* Skip first char - it can be '/' or '-'.
+                * Index 10 is one past the 9-character "external:" prefix,
+                * mirroring the +1 applied to the caller's string.
+                */
+               if (strcmp(&ent->metadata_version[10], metadata_version+1) == 0) {
+                       busy = 1;
+                       break;
+               }
+       }
+       free_mdstat(mdstat);
+
+       return busy;
+}
+
+int Assemble(struct supertype *st, char *mddev,
             mddev_ident_t ident,
             mddev_dev_t devlist, char *backup_file,
             int readonly, int runstop,
-            char *update, char *homehost,
+            char *update, char *homehost, int require_homehost,
             int verbose, int force)
 {
        /*
@@ -111,10 +131,13 @@ int Assemble(struct supertype *st, char *mddev, int mdfd,
         *    START_ARRAY
         *
         */
-       int clean = 0;
-       int must_close = 0;
+       int mdfd;
+       int clean;
+       int auto_assem = (mddev == NULL && !ident->uuid_set &&
+                         ident->super_minor == UnSet && ident->name[0] == 0
+                         && (ident->container == NULL || ident->member == NULL));
        int old_linux = 0;
-       int vers = 0; /* Keep gcc quite - it really is initialised */
+       int vers = vers; /* Keep gcc quite - it really is initialised */
        struct {
                char *devname;
                int uptodate; /* set once we decide that this device is as
@@ -132,36 +155,23 @@ int Assemble(struct supertype *st, char *mddev, int mdfd,
        int chosen_drive;
        int change = 0;
        int inargv = 0;
+       int report_missmatch;
        int bitmap_done;
-       int start_partial_ok = (runstop >= 0) && (force || devlist==NULL || mdfd < 0);
+       int start_partial_ok = (runstop >= 0) && 
+               (force || devlist==NULL || auto_assem);
        unsigned int num_devs;
        mddev_dev_t tmpdev;
        struct mdinfo info;
+       struct mdinfo *content = NULL;
        char *avail;
        int nextspare = 0;
+       char *name = NULL;
+       int trustworthy;
+       char chosen_name[1024];
 
        if (get_linux_version() < 2004000)
                old_linux = 1;
 
-       if (mdfd >= 0) {
-               vers = md_get_version(mdfd);
-               if (vers <= 0) {
-                       fprintf(stderr, Name ": %s appears not to be an md device.\n", mddev);
-                       return 1;
-               }
-               if (vers < 9000) {
-                       fprintf(stderr, Name ": Assemble requires driver version 0.90.0 or later.\n"
-                               "    Upgrade your kernel or try --build\n");
-                       return 1;
-               }
-
-               if (ioctl(mdfd, GET_ARRAY_INFO, &info.array)>=0) {
-                       fprintf(stderr, Name ": device %s already active - cannot assemble it\n",
-                               mddev);
-                       return 1;
-               }
-               ioctl(mdfd, STOP_ARRAY, NULL); /* just incase it was started but has no content */
-       }
        /*
         * If any subdevs are listed, then any that don't
         * match ident are discarded.  Remainder must all match and
@@ -173,17 +183,25 @@ int Assemble(struct supertype *st, char *mddev, int mdfd,
        if (!devlist &&
            ident->uuid_set == 0 &&
            ident->super_minor < 0 &&
+           ident->name[0] == 0 &&
+           (ident->container == NULL || ident->member == NULL) &&
            ident->devices == NULL) {
                fprintf(stderr, Name ": No identity information available for %s - cannot assemble.\n",
                        mddev ? mddev : "further assembly");
                return 1;
        }
+
        if (devlist == NULL)
                devlist = conf_get_devs();
-       else if (mdfd >= 0)
+       else if (mddev)
                inargv = 1;
 
+       report_missmatch = ((inargv && verbose >= 0) || verbose > 0);
  try_again:
+       /* We come back here when doing auto-assembly and attempting some
+        * set of devices failed.  Those are now marked as ->used==2 and
+        * we ignore them and try again
+        */
 
        tmpdev = devlist; num_devs = 0;
        while (tmpdev) {
@@ -203,7 +221,7 @@ int Assemble(struct supertype *st, char *mddev, int mdfd,
 
        /* first walk the list of devices to find a consistent set
         * that match the criterea, if that is possible.
-        * We flag the one we like with 'used'.
+        * We flag the ones we like with 'used'.
         */
        for (tmpdev = devlist;
             tmpdev;
@@ -217,14 +235,14 @@ int Assemble(struct supertype *st, char *mddev, int mdfd,
 
                if (ident->devices &&
                    !match_oneof(ident->devices, devname)) {
-                       if ((inargv && verbose>=0) || verbose > 0)
+                       if (report_missmatch)
                                fprintf(stderr, Name ": %s is not one of %s\n", devname, ident->devices);
                        continue;
                }
 
                dfd = dev_open(devname, O_RDONLY|O_EXCL);
                if (dfd < 0) {
-                       if ((inargv && verbose >= 0) || verbose > 0)
+                       if (report_missmatch)
                                fprintf(stderr, Name ": cannot open device %s: %s\n",
                                        devname, strerror(errno));
                        tmpdev->used = 2;
@@ -238,72 +256,127 @@ int Assemble(struct supertype *st, char *mddev, int mdfd,
                                devname);
                        tmpdev->used = 2;
                } else if (!tst && (tst = guess_super(dfd)) == NULL) {
-                       if ((inargv && verbose >= 0) || verbose > 0)
+                       if (report_missmatch)
                                fprintf(stderr, Name ": no recogniseable superblock on %s\n",
                                        devname);
                        tmpdev->used = 2;
+               } else if (auto_assem && st == NULL &&
+                          !conf_test_metadata(tst->ss->name)) {
+                       if (report_missmatch)
+                               fprintf(stderr, Name ": %s has metadata type %s for which "
+                                       "auto-assembly is disabled\n",
+                                       devname, tst->ss->name);
+                       tmpdev->used = 2;
                } else if (tst->ss->load_super(tst,dfd, NULL)) {
-                       if ((inargv && verbose >= 0) || verbose > 0)
+                       if (report_missmatch)
                                fprintf( stderr, Name ": no RAID superblock on %s\n",
                                         devname);
                } else {
-                       tst->ss->getinfo_super(tst, &info);
+                       content = &info;
+                       memset(content, 0, sizeof(*content));
+                       tst->ss->getinfo_super(tst, content);
                }
                if (dfd >= 0) close(dfd);
 
+               if (tst && tst->sb && tst->ss->container_content
+                   && tst->loaded_container) {
+                       /* tmpdev is a container.  We need to be either
+                        * looking for a member, or auto-assembling
+                        */
+                       if (st) {
+                               /* already found some components, this cannot
+                                * be another one.
+                                */
+                               if (report_missmatch)
+                                       fprintf(stderr, Name ": %s is a container, but we are looking for components\n",
+                                               devname);
+                               goto loop;
+                       }
+
+                       if (ident->container) {
+                               if (ident->container[0] == '/' &&
+                                   !same_dev(ident->container, devname)) {
+                                       if (report_missmatch)
+                                               fprintf(stderr, Name ": %s is not the container required (%s)\n",
+                                                       devname, ident->container);
+                                       goto loop;
+                               }
+                               if (ident->container[0] != '/') {
+                                       /* we have a uuid */
+                                       int uuid[4];
+                                       if (!parse_uuid(ident->container, uuid) ||
+                                           !same_uuid(content->uuid, uuid, tst->ss->swapuuid)) {
+                                               if (report_missmatch)
+                                                       fprintf(stderr, Name ": %s has wrong UUID to be required container\n",
+                                                               devname);
+                                               goto loop;
+                                       }
+                               }
+                       }
+                       /* It is worth looking inside this container.
+                        */
+               next_member:
+                       if (tmpdev->content)
+                               content = tmpdev->content;
+                       else
+                               content = tst->ss->container_content(tst);
+
+                       tmpdev->content = content->next;
+                       if (tmpdev->content == NULL)
+                               tmpdev->used = 2;
+
+               } else if (ident->container || ident->member) {
+                       /* No chance of this matching if we don't have
+                        * a container */
+                       if (report_missmatch)
+                               fprintf(stderr, Name "%s is not a container, and one is required.\n",
+                                       devname);
+                       goto loop;
+               }
+
                if (ident->uuid_set && (!update || strcmp(update, "uuid")!= 0) &&
                    (!tst || !tst->sb ||
-                    same_uuid(info.uuid, ident->uuid, tst->ss->swapuuid)==0)) {
-                       if ((inargv && verbose >= 0) || verbose > 0)
+                    same_uuid(content->uuid, ident->uuid, tst->ss->swapuuid)==0)) {
+                       if (report_missmatch)
                                fprintf(stderr, Name ": %s has wrong uuid.\n",
                                        devname);
                        goto loop;
                }
                if (ident->name[0] && (!update || strcmp(update, "name")!= 0) &&
                    (!tst || !tst->sb ||
-                    name_matches(info.name, ident->name, homehost)==0)) {
-                       if ((inargv && verbose >= 0) || verbose > 0)
+                    name_matches(content->name, ident->name, homehost)==0)) {
+                       if (report_missmatch)
                                fprintf(stderr, Name ": %s has wrong name.\n",
                                        devname);
                        goto loop;
                }
                if (ident->super_minor != UnSet &&
                    (!tst || !tst->sb ||
-                    ident->super_minor != info.array.md_minor)) {
-                       if ((inargv && verbose >= 0) || verbose > 0)
+                    ident->super_minor != content->array.md_minor)) {
+                       if (report_missmatch)
                                fprintf(stderr, Name ": %s has wrong super-minor.\n",
                                        devname);
                        goto loop;
                }
                if (ident->level != UnSet &&
                    (!tst || !tst->sb ||
-                    ident->level != info.array.level)) {
-                       if ((inargv && verbose >= 0) || verbose > 0)
+                    ident->level != content->array.level)) {
+                       if (report_missmatch)
                                fprintf(stderr, Name ": %s has wrong raid level.\n",
                                        devname);
                        goto loop;
                }
                if (ident->raid_disks != UnSet &&
                    (!tst || !tst->sb ||
-                    ident->raid_disks!= info.array.raid_disks)) {
-                       if ((inargv && verbose >= 0) || verbose > 0)
+                    ident->raid_disks!= content->array.raid_disks)) {
+                       if (report_missmatch)
                                fprintf(stderr, Name ": %s requires wrong number of drives.\n",
                                        devname);
                        goto loop;
                }
-               if (mdfd < 0) {
+               if (auto_assem) {
                        if (tst == NULL || tst->sb == NULL)
                                continue;
-                       if (update == NULL &&
-                           tst->ss->match_home(tst, homehost)==0) {
-                               if ((inargv && verbose >= 0) || verbose > 0)
-                                       fprintf(stderr, Name ": %s is not built for host %s.\n",
-                                               devname, homehost);
-                               /* Auto-assemble, and this is not a usable host */
-                               /* if update != NULL, we are updating the host
-                                * name... */
-                               goto loop;
-                       }
                }
                /* If we are this far, then we are nearly commited to this device.
                 * If the super_block doesn't exist, or doesn't match others,
@@ -320,6 +393,33 @@ int Assemble(struct supertype *st, char *mddev, int mdfd,
                        return 1;
                }
 
+               if (tst && tst->sb && tst->ss->container_content
+                   && tst->loaded_container) {
+                       /* we have the one container we need, don't keep
+                        * looking.  If the chosen member is active, skip.
+                        */
+                       if (is_member_busy(content->text_version)) {
+                               if (report_missmatch)
+                                       fprintf(stderr, Name ": member %s in %s is already assembled\n",
+                                               content->text_version,
+                                               devname);
+                               tst->ss->free_super(tst);
+                               tst = NULL;
+                               content = NULL;
+                               if (auto_assem)
+                                       goto loop;
+                               return 1;
+                       }
+                       st = tst; tst = NULL;
+                       if (!auto_assem && tmpdev->next != NULL) {
+                               fprintf(stderr, Name ": %s is a container, but is not "
+                                       "only device given: confused and aborting\n",
+                                       devname);
+                               st->ss->free_super(st);
+                               return 1;
+                       }
+                       break;
+               }
                if (st == NULL)
                        st = dup_super(tst);
                if (st->minor_version == -1)
@@ -332,21 +432,22 @@ int Assemble(struct supertype *st, char *mddev, int mdfd,
                         * Or, if we are auto assembling, we just ignore the second
                         * for now.
                         */
-                       if (mdfd < 0)
+                       if (auto_assem)
                                goto loop;
                        if (homehost) {
                                int first = st->ss->match_home(st, homehost);
                                int last = tst->ss->match_home(tst, homehost);
-                               if (first+last == 1) {
+                               if (first != last &&
+                                   (first == 1 || last == 1)) {
                                        /* We can do something */
                                        if (first) {/* just ignore this one */
-                                               if ((inargv && verbose >= 0) || verbose > 0)
+                                               if (report_missmatch)
                                                        fprintf(stderr, Name ": %s misses out due to wrong homehost\n",
                                                                devname);
                                                goto loop;
                                        } else { /* reject all those sofar */
                                                mddev_dev_t td;
-                                               if ((inargv && verbose >= 0) || verbose > 0)
+                                               if (report_missmatch)
                                                        fprintf(stderr, Name ": %s overrides previous devices due to good homehost\n",
                                                                devname);
                                                for (td=devlist; td != tmpdev; td=td->next)
@@ -367,53 +468,99 @@ int Assemble(struct supertype *st, char *mddev, int mdfd,
                tmpdev->used = 1;
 
        loop:
+               if (tmpdev->content)
+                       goto next_member;
                if (tst)
                        tst->ss->free_super(tst);
        }
 
-       if (mdfd < 0) {
-               /* So... it is up to me to open the device.
-                * We create a name '/dev/md/XXX' based on the info in the
-                * superblock, and call open_mddev on that
-                */
-               mdu_array_info_t inf;
-               char *c;
-               if (!st || !st->sb) {
-                       return 2;
-               }
-               st->ss->getinfo_super(st, &info);
-               c = strchr(info.name, ':');
-               if (c) c++; else c= info.name;
-               if (isdigit(*c) && ((ident->autof & 7)==4 || (ident->autof&7)==6))
-                       /* /dev/md/d0 style for partitionable */
-                       xasprintf(&mddev, "/dev/md/d%s", c);
+       if (!st || !st->sb || !content)
+               return 2;
+
+       /* Now need to open the array device.  Use create_mddev */
+       if (content == &info)
+               st->ss->getinfo_super(st, content);
+
+       trustworthy = FOREIGN;
+       name = content->name;
+       switch (st->ss->match_home(st, homehost)
+               ?: st->ss->match_home(st, "any")) {
+       case 1:
+               trustworthy = LOCAL;
+               name = strchr(content->name, ':');
+               if (name)
+                       name++;
                else
-                       xasprintf(&mddev, "/dev/md/%s", c);
-               mdfd = open_mddev(mddev, ident->autof);
-               if (mdfd < 0) {
-                       st->ss->free_super(st);
-                       free(devices);
+                       name = content->name;
+               break;
+       }
+       if (!auto_assem)
+               /* If the array is listed in mdadm.conf or on
+                * command line, then we trust the name
+                * even if the array doesn't look local
+                */
+               trustworthy = LOCAL;
+
+       if (name[0] == 0 &&
+           content->array.level == LEVEL_CONTAINER) {
+               name = content->text_version;
+               trustworthy = METADATA;
+       }
+
+       if (name[0] && trustworthy != LOCAL &&
+           ! require_homehost &&
+           conf_name_is_free(name))
+               trustworthy = LOCAL;
+
+       if (trustworthy == LOCAL &&
+           strchr(name, ':'))
+               /* Ignore 'host:' prefix of name */
+               name = strchr(name, ':')+1;
+
+       mdfd = create_mddev(mddev, name, ident->autof, trustworthy,
+                           chosen_name);
+       if (mdfd < 0) {
+               st->ss->free_super(st);
+               free(devices);
+               if (auto_assem)
                        goto try_again;
-               }
-               vers = md_get_version(mdfd);
-               if (ioctl(mdfd, GET_ARRAY_INFO, &inf)==0) {
-                       for (tmpdev = devlist ;
-                            tmpdev && tmpdev->used != 1;
-                            tmpdev = tmpdev->next)
-                               ;
-                       fprintf(stderr, Name ": %s already active, cannot restart it!\n", mddev);
-                       if (tmpdev)
-                               fprintf(stderr, Name ":   %s needed for %s...\n",
-                                       mddev, tmpdev->devname);
-                       close(mdfd);
-                       mdfd = -1;
-                       st->ss->free_super(st);
-                       free(devices);
+               return 1;
+       }
+       mddev = chosen_name;
+       vers = md_get_version(mdfd);
+       if (vers < 9000) {
+               fprintf(stderr, Name ": Assemble requires driver version 0.90.0 or later.\n"
+                       "    Upgrade your kernel or try --build\n");
+               close(mdfd);
+               return 1;
+       }
+       if (mddev_busy(fd2devnum(mdfd))) {
+               fprintf(stderr, Name ": %s already active, cannot restart it!\n",
+                       mddev);
+               for (tmpdev = devlist ;
+                    tmpdev && tmpdev->used != 1;
+                    tmpdev = tmpdev->next)
+                       ;
+               if (tmpdev && auto_assem)
+                       fprintf(stderr, Name ":   %s needed for %s...\n",
+                               mddev, tmpdev->devname);
+               close(mdfd);
+               mdfd = -3;
+               st->ss->free_super(st);
+               free(devices);
+               if (auto_assem)
                        goto try_again;
-               }
-               must_close = 1;
+               return 1;
        }
+       ioctl(mdfd, STOP_ARRAY, NULL); /* just incase it was started but has no content */
 
+#ifndef MDASSEMBLE
+       if (content != &info) {
+               /* This is a member of a container.  Try starting the array. */
+               return assemble_container_content(st, mdfd, content, runstop,
+                                          chosen_name, verbose);
+       }
+#endif
        /* Ok, no bad inconsistancy, we can try updating etc */
        bitmap_done = 0;
        for (tmpdev = devlist; tmpdev; tmpdev=tmpdev->next) if (tmpdev->used == 1) {
@@ -446,19 +593,19 @@ int Assemble(struct supertype *st, char *mddev, int mdfd,
 
                        tst = dup_super(st);
                        tst->ss->load_super(tst, dfd, NULL);
-                       tst->ss->getinfo_super(tst, &info);
+                       tst->ss->getinfo_super(tst, content);
 
-                       memcpy(info.uuid, ident->uuid, 16);
-                       strcpy(info.name, ident->name);
-                       info.array.md_minor = minor(stb2.st_rdev);
+                       memcpy(content->uuid, ident->uuid, 16);
+                       strcpy(content->name, ident->name);
+                       content->array.md_minor = minor(stb2.st_rdev);
 
-                       tst->ss->update_super(tst, &info, update,
+                       tst->ss->update_super(tst, content, update,
                                              devname, verbose,
                                              ident->uuid_set, homehost);
                        if (strcmp(update, "uuid")==0 &&
                            !ident->uuid_set) {
                                ident->uuid_set = 1;
-                               memcpy(ident->uuid, info.uuid, 16);
+                               memcpy(ident->uuid, content->uuid, 16);
                        }
                        if (dfd < 0)
                                fprintf(stderr, Name ": Cannot open %s for superblock update\n",
@@ -472,7 +619,7 @@ int Assemble(struct supertype *st, char *mddev, int mdfd,
                        if (strcmp(update, "uuid")==0 &&
                            ident->bitmap_fd >= 0 && !bitmap_done) {
                                if (bitmap_update_uuid(ident->bitmap_fd,
-                                                      info.uuid,
+                                                      content->uuid,
                                                       tst->ss->swapuuid) != 0)
                                        fprintf(stderr, Name ": Could not update uuid on external bitmap.\n");
                                else
@@ -489,7 +636,7 @@ int Assemble(struct supertype *st, char *mddev, int mdfd,
                        remove_partitions(dfd);
 
                        tst->ss->load_super(tst, dfd, NULL);
-                       tst->ss->getinfo_super(tst, &info);
+                       tst->ss->getinfo_super(tst, content);
                        tst->ss->free_super(tst);
                        close(dfd);
                }
@@ -498,10 +645,10 @@ int Assemble(struct supertype *st, char *mddev, int mdfd,
 
                if (verbose > 0)
                        fprintf(stderr, Name ": %s is identified as a member of %s, slot %d.\n",
-                               devname, mddev, info.disk.raid_disk);
+                               devname, mddev, content->disk.raid_disk);
                devices[devcnt].devname = devname;
                devices[devcnt].uptodate = 0;
-               devices[devcnt].i = info;
+               devices[devcnt].i = *content;
                devices[devcnt].i.disk.major = major(stb.st_rdev);
                devices[devcnt].i.disk.minor = minor(stb.st_rdev);
                if (most_recent < devcnt) {
@@ -509,17 +656,17 @@ int Assemble(struct supertype *st, char *mddev, int mdfd,
                            > devices[most_recent].i.events)
                                most_recent = devcnt;
                }
-               if (info.array.level == -4)
+               if (content->array.level == -4)
                        /* with multipath, the raid_disk from the superblock is meaningless */
                        i = devcnt;
                else
                        i = devices[devcnt].i.disk.raid_disk;
                if (i+1 == 0) {
-                       if (nextspare < info.array.raid_disks)
-                               nextspare = info.array.raid_disks;
+                       if (nextspare < content->array.raid_disks)
+                               nextspare = content->array.raid_disks;
                        i = nextspare++;
                } else {
-                       if (i >= info.array.raid_disks &&
+                       if (i >= content->array.raid_disks &&
                            i >= nextspare)
                                nextspare = i+1;
                }
@@ -542,8 +689,8 @@ int Assemble(struct supertype *st, char *mddev, int mdfd,
                            == devices[devcnt].i.events
                            && (devices[best[i]].i.disk.minor
                                != devices[devcnt].i.disk.minor)
-                           && st->ss->major == 0
-                           && info.array.level != -4) {
+                           && st->ss == &super0
+                           && content->array.level != LEVEL_MULTIPATH) {
                                /* two different devices with identical superblock.
                                 * Could be a mis-detection caused by overlapping
                                 * partitions.  fail-safe.
@@ -558,7 +705,7 @@ int Assemble(struct supertype *st, char *mddev, int mdfd,
                                        inargv ? "the list" :
                                           "the\n      DEVICE list in mdadm.conf"
                                        );
-                               if (must_close) close(mdfd);
+                               close(mdfd);
                                return 1;
                        }
                        if (best[i] == -1
@@ -574,21 +721,21 @@ int Assemble(struct supertype *st, char *mddev, int mdfd,
                        mddev);
                if (st)
                        st->ss->free_super(st);
-               if (must_close) close(mdfd);
+               close(mdfd);
                return 1;
        }
 
        if (update && strcmp(update, "byteorder")==0)
                st->minor_version = 90;
 
-       st->ss->getinfo_super(st, &info);
-       clean = info.array.state & 1;
+       st->ss->getinfo_super(st, content);
+       clean = content->array.state & 1;
 
        /* now we have some devices that might be suitable.
         * I wonder how many
         */
-       avail = malloc(info.array.raid_disks);
-       memset(avail, 0, info.array.raid_disks);
+       avail = malloc(content->array.raid_disks);
+       memset(avail, 0, content->array.raid_disks);
        okcnt = 0;
        sparecnt=0;
        for (i=0; i< bestcnt ;i++) {
@@ -600,7 +747,7 @@ int Assemble(struct supertype *st, char *mddev, int mdfd,
                /* note: we ignore error flags in multipath arrays
                 * as they don't make sense
                 */
-               if (info.array.level != -4)
+               if (content->array.level != -4)
                        if (!(devices[j].i.disk.state & (1<<MD_DISK_SYNC))) {
                                if (!(devices[j].i.disk.state
                                      & (1<<MD_DISK_FAULTY)))
@@ -610,15 +757,15 @@ int Assemble(struct supertype *st, char *mddev, int mdfd,
                if (devices[j].i.events+event_margin >=
                    devices[most_recent].i.events) {
                        devices[j].uptodate = 1;
-                       if (i < info.array.raid_disks) {
+                       if (i < content->array.raid_disks) {
                                okcnt++;
                                avail[i]=1;
                        } else
                                sparecnt++;
                }
        }
-       while (force && !enough(info.array.level, info.array.raid_disks,
-                               info.array.layout, 1,
+       while (force && !enough(content->array.level, content->array.raid_disks,
+                               content->array.layout, 1,
                                avail, okcnt)) {
                /* Choose the newest best drive which is
                 * not up-to-date, update the superblock
@@ -628,7 +775,7 @@ int Assemble(struct supertype *st, char *mddev, int mdfd,
                struct supertype *tst;
                long long current_events;
                chosen_drive = -1;
-               for (i=0; i<info.array.raid_disks && i < bestcnt; i++) {
+               for (i=0; i<content->array.raid_disks && i < bestcnt; i++) {
                        int j = best[i];
                        if (j>=0 &&
                            !devices[j].uptodate &&
@@ -662,8 +809,8 @@ int Assemble(struct supertype *st, char *mddev, int mdfd,
                        devices[chosen_drive].i.events = 0;
                        continue;
                }
-               info.events = devices[most_recent].i.events;
-               tst->ss->update_super(tst, &info, "force-one",
+               content->events = devices[most_recent].i.events;
+               tst->ss->update_super(tst, content, "force-one",
                                     devices[chosen_drive].devname, verbose,
                                     0, NULL);
 
@@ -685,7 +832,7 @@ int Assemble(struct supertype *st, char *mddev, int mdfd,
                /* If there are any other drives of the same vintage,
                 * add them in as well.  We can't lose and we might gain
                 */
-               for (i=0; i<info.array.raid_disks && i < bestcnt ; i++) {
+               for (i=0; i<content->array.raid_disks && i < bestcnt ; i++) {
                        int j = best[i];
                        if (j >= 0 &&
                            !devices[j].uptodate &&
@@ -716,29 +863,32 @@ int Assemble(struct supertype *st, char *mddev, int mdfd,
                if ((fd=dev_open(devices[j].devname, O_RDONLY|O_EXCL))< 0) {
                        fprintf(stderr, Name ": Cannot open %s: %s\n",
                                devices[j].devname, strerror(errno));
-                       if (must_close) close(mdfd);
+                       close(mdfd);
                        return 1;
                }
                if (st->ss->load_super(st,fd, NULL)) {
                        close(fd);
                        fprintf(stderr, Name ": RAID superblock has disappeared from %s\n",
                                devices[j].devname);
-                       if (must_close) close(mdfd);
+                       close(mdfd);
                        return 1;
                }
                close(fd);
        }
        if (st->sb == NULL) {
                fprintf(stderr, Name ": No suitable drives found for %s\n", mddev);
-               if (must_close) close(mdfd);
+               close(mdfd);
                return 1;
        }
-       st->ss->getinfo_super(st, &info);
+       st->ss->getinfo_super(st, content);
+#ifndef MDASSEMBLE
+       sysfs_init(content, mdfd, 0);
+#endif
        for (i=0; i<bestcnt; i++) {
                int j = best[i];
                unsigned int desired_state;
 
-               if (i < info.array.raid_disks)
+               if (i < content->array.raid_disks)
                        desired_state = (1<<MD_DISK_ACTIVE) | (1<<MD_DISK_SYNC);
                else
                        desired_state = 0;
@@ -775,10 +925,10 @@ int Assemble(struct supertype *st, char *mddev, int mdfd,
 #endif
        }
        if (force && !clean &&
-           !enough(info.array.level, info.array.raid_disks,
-                   info.array.layout, clean,
+           !enough(content->array.level, content->array.raid_disks,
+                   content->array.layout, clean,
                    avail, okcnt)) {
-               change += st->ss->update_super(st, &info, "force-array",
+               change += st->ss->update_super(st, content, "force-array",
                                        devices[chosen_drive].devname, verbose,
                                               0, NULL);
                clean = 1;
@@ -790,14 +940,14 @@ int Assemble(struct supertype *st, char *mddev, int mdfd,
                if (fd < 0) {
                        fprintf(stderr, Name ": Could not open %s for write - cannot Assemble array.\n",
                                devices[chosen_drive].devname);
-                       if (must_close) close(mdfd);
+                       close(mdfd);
                        return 1;
                }
                if (st->ss->store_super(st, fd)) {
                        close(fd);
                        fprintf(stderr, Name ": Could not re-write superblock on %s\n",
                                devices[chosen_drive].devname);
-                       if (must_close) close(mdfd);
+                       close(mdfd);
                        return 1;
                }
                close(fd);
@@ -808,7 +958,7 @@ int Assemble(struct supertype *st, char *mddev, int mdfd,
         * The code of doing this lives in Grow.c
         */
 #ifndef MDASSEMBLE
-       if (info.reshape_active) {
+       if (content->reshape_active) {
                int err = 0;
                int *fdlist = malloc(sizeof(int)* bestcnt);
                for (i=0; i<bestcnt; i++) {
@@ -825,14 +975,14 @@ int Assemble(struct supertype *st, char *mddev, int mdfd,
                                fdlist[i] = -1;
                }
                if (!err)
-                       err = Grow_restart(st, &info, fdlist, bestcnt, backup_file);
+                       err = Grow_restart(st, content, fdlist, bestcnt, backup_file);
                while (i>0) {
                        i--;
                        if (fdlist[i]>=0) close(fdlist[i]);
                }
                if (err) {
                        fprintf(stderr, Name ": Failed to restore critical section for reshape, sorry.\n");
-                       if (must_close) close(mdfd);
+                       close(mdfd);
                        return err;
                }
        }
@@ -840,30 +990,29 @@ int Assemble(struct supertype *st, char *mddev, int mdfd,
        /* count number of in-sync devices according to the superblock.
         * We must have this number to start the array without -s or -R
         */
-       req_cnt = info.array.working_disks;
+       req_cnt = content->array.working_disks;
 
        /* Almost ready to actually *do* something */
        if (!old_linux) {
                int rv;
-               if ((vers % 100) >= 1) { /* can use different versions */
-                       mdu_array_info_t inf;
-                       memset(&inf, 0, sizeof(inf));
-                       inf.major_version = st->ss->major;
-                       inf.minor_version = st->minor_version;
-                       rv = ioctl(mdfd, SET_ARRAY_INFO, &inf);
-               } else
-                       rv = ioctl(mdfd, SET_ARRAY_INFO, NULL);
 
+               /* First, fill in the map, so that udev can find our name
+                * as soon as we become active.
+                */
+               map_update(NULL, fd2devnum(mdfd), content->text_version,
+                          content->uuid, chosen_name);
+
+               rv = set_array_info(mdfd, st, content);
                if (rv) {
-                       fprintf(stderr, Name ": SET_ARRAY_INFO failed for %s: %s\n",
+                       fprintf(stderr, Name ": failed to set array info for %s: %s\n",
                                mddev, strerror(errno));
-                       if (must_close) close(mdfd);
+                       close(mdfd);
                        return 1;
                }
                if (ident->bitmap_fd >= 0) {
                        if (ioctl(mdfd, SET_BITMAP_FILE, ident->bitmap_fd) != 0) {
                                fprintf(stderr, Name ": SET_BITMAP_FILE failed.\n");
-                               if (must_close) close(mdfd);
+                               close(mdfd);
                                return 1;
                        }
                } else if (ident->bitmap_file) {
@@ -872,13 +1021,13 @@ int Assemble(struct supertype *st, char *mddev, int mdfd,
                        if (bmfd < 0) {
                                fprintf(stderr, Name ": Could not open bitmap file %s\n",
                                        ident->bitmap_file);
-                               if (must_close) close(mdfd);
+                               close(mdfd);
                                return 1;
                        }
                        if (ioctl(mdfd, SET_BITMAP_FILE, bmfd) != 0) {
                                fprintf(stderr, Name ": Failed to set bitmapfile for %s\n", mddev);
                                close(bmfd);
-                               if (must_close) close(mdfd);
+                               close(mdfd);
                                return 1;
                        }
                        close(bmfd);
@@ -895,14 +1044,15 @@ int Assemble(struct supertype *st, char *mddev, int mdfd,
                                j = chosen_drive;
 
                        if (j >= 0 /* && devices[j].uptodate */) {
-                               if (ioctl(mdfd, ADD_NEW_DISK,
-                                         &devices[j].i.disk)!=0) {
+                               rv = add_disk(mdfd, st, content, &devices[j].i);
+
+                               if (rv) {
                                        fprintf(stderr, Name ": failed to add "
                                                        "%s to %s: %s\n",
                                                devices[j].devname,
                                                mddev,
                                                strerror(errno));
-                                       if (i < info.array.raid_disks
+                                       if (i < content->array.raid_disks
                                            || i == bestcnt)
                                                okcnt--;
                                        else
@@ -912,49 +1062,67 @@ int Assemble(struct supertype *st, char *mddev, int mdfd,
                                                        "to %s as %d\n",
                                                devices[j].devname, mddev,
                                                devices[j].i.disk.raid_disk);
-                       } else if (verbose > 0 && i < info.array.raid_disks)
+                       } else if (verbose > 0 && i < content->array.raid_disks)
                                fprintf(stderr, Name ": no uptodate device for "
                                                "slot %d of %s\n",
                                        i, mddev);
                }
 
+               if (content->array.level == LEVEL_CONTAINER) {
+                       if (verbose >= 0) {
+                               fprintf(stderr, Name ": Container %s has been "
+                                       "assembled with %d drive%s",
+                                       mddev, okcnt+sparecnt, okcnt+sparecnt==1?"":"s");
+                               if (okcnt < content->array.raid_disks)
+                                       fprintf(stderr, " (out of %d)",
+                                               content->array.raid_disks);
+                               fprintf(stderr, "\n");
+                       }
+                       sysfs_uevent(content, "change");
+                       wait_for(chosen_name, mdfd);
+                       close(mdfd);
+                       return 0;
+               }
+
                if (runstop == 1 ||
                    (runstop <= 0 &&
-                    ( enough(info.array.level, info.array.raid_disks,
-                             info.array.layout, clean, avail, okcnt) &&
+                    ( enough(content->array.level, content->array.raid_disks,
+                             content->array.layout, clean, avail, okcnt) &&
                       (okcnt >= req_cnt || start_partial_ok)
                             ))) {
                        if (ioctl(mdfd, RUN_ARRAY, NULL)==0) {
                                if (verbose >= 0) {
                                        fprintf(stderr, Name ": %s has been started with %d drive%s",
                                                mddev, okcnt, okcnt==1?"":"s");
-                                       if (okcnt < info.array.raid_disks)
-                                               fprintf(stderr, " (out of %d)", info.array.raid_disks);
+                                       if (okcnt < content->array.raid_disks)
+                                               fprintf(stderr, " (out of %d)", content->array.raid_disks);
                                        if (sparecnt)
                                                fprintf(stderr, " and %d spare%s", sparecnt, sparecnt==1?"":"s");
                                        fprintf(stderr, ".\n");
                                }
-                               if (info.reshape_active &&
-                                   info.array.level >= 4 &&
-                                   info.array.level <= 6) {
+                               if (content->reshape_active &&
+                                   content->array.level >= 4 &&
+                                   content->array.level <= 6) {
                                        /* might need to increase the size
                                         * of the stripe cache - default is 256
                                         */
-                                       if (256 < 4 * (info.array.chunk_size/4096)) {
+                                       if (256 < 4 * (content->array.chunk_size/4096)) {
                                                struct mdinfo *sra = sysfs_read(mdfd, 0, 0);
                                                if (sra)
                                                        sysfs_set_num(sra, NULL,
                                                                      "stripe_cache_size",
-                                                                     (4 * info.array.chunk_size / 4096) + 1);
+                                                                     (4 * content->array.chunk_size / 4096) + 1);
                                        }
                                }
-                               if (must_close) {
+                               wait_for(mddev, mdfd);
+                               close(mdfd);
+                               if (auto_assem) {
                                        int usecs = 1;
-                                       close(mdfd);
                                        /* There is a nasty race with 'mdadm --monitor'.
                                         * If it opens this device before we close it,
                                         * it gets an incomplete open on which IO
-                                        * doesn't work and the capacity if wrong.
+                                        * doesn't work and the capacity is
+                                        * wrong.
                                         * If we reopen (to check for layered devices)
                                         * before --monitor closes, we loose.
                                         *
@@ -979,59 +1147,57 @@ int Assemble(struct supertype *st, char *mddev, int mdfd,
                        fprintf(stderr, Name ": failed to RUN_ARRAY %s: %s\n",
                                mddev, strerror(errno));
 
-                       if (!enough(info.array.level, info.array.raid_disks,
-                                   info.array.layout, 1, avail, okcnt))
+                       if (!enough(content->array.level, content->array.raid_disks,
+                                   content->array.layout, 1, avail, okcnt))
                                fprintf(stderr, Name ": Not enough devices to "
                                        "start the array.\n");
-                       else if (!enough(info.array.level,
-                                        info.array.raid_disks,
-                                        info.array.layout, clean,
+                       else if (!enough(content->array.level,
+                                        content->array.raid_disks,
+                                        content->array.layout, clean,
                                         avail, okcnt))
                                fprintf(stderr, Name ": Not enough devices to "
                                        "start the array while not clean "
                                        "- consider --force.\n");
 
-                       if (must_close) {
+                       if (auto_assem)
                                ioctl(mdfd, STOP_ARRAY, NULL);
-                               close(mdfd);
-                       }
+                       close(mdfd);
                        return 1;
                }
                if (runstop == -1) {
                        fprintf(stderr, Name ": %s assembled from %d drive%s",
                                mddev, okcnt, okcnt==1?"":"s");
-                       if (okcnt != info.array.raid_disks)
-                               fprintf(stderr, " (out of %d)", info.array.raid_disks);
+                       if (okcnt != content->array.raid_disks)
+                               fprintf(stderr, " (out of %d)", content->array.raid_disks);
                        fprintf(stderr, ", but not started.\n");
-                       if (must_close) close(mdfd);
+                       close(mdfd);
                        return 0;
                }
                if (verbose >= -1) {
                        fprintf(stderr, Name ": %s assembled from %d drive%s", mddev, okcnt, okcnt==1?"":"s");
                        if (sparecnt)
                                fprintf(stderr, " and %d spare%s", sparecnt, sparecnt==1?"":"s");
-                       if (!enough(info.array.level, info.array.raid_disks,
-                                   info.array.layout, 1, avail, okcnt))
+                       if (!enough(content->array.level, content->array.raid_disks,
+                                   content->array.layout, 1, avail, okcnt))
                                fprintf(stderr, " - not enough to start the array.\n");
-                       else if (!enough(info.array.level,
-                                        info.array.raid_disks,
-                                        info.array.layout, clean,
+                       else if (!enough(content->array.level,
+                                        content->array.raid_disks,
+                                        content->array.layout, clean,
                                         avail, okcnt))
                                fprintf(stderr, " - not enough to start the "
                                        "array while not clean - consider "
                                        "--force.\n");
                        else {
-                               if (req_cnt == info.array.raid_disks)
+                               if (req_cnt == content->array.raid_disks)
                                        fprintf(stderr, " - need all %d to start it", req_cnt);
                                else
-                                       fprintf(stderr, " - need %d of %d to start", req_cnt, info.array.raid_disks);
+                                       fprintf(stderr, " - need %d of %d to start", req_cnt, content->array.raid_disks);
                                fprintf(stderr, " (use --run to insist).\n");
                        }
                }
-               if (must_close) {
+               if (auto_assem)
                        ioctl(mdfd, STOP_ARRAY, NULL);
-                       close(mdfd);
-               }
+               close(mdfd);
                return 1;
        } else {
                /* The "chosen_drive" is a good choice, and if necessary, the superblock has
@@ -1047,6 +1213,95 @@ int Assemble(struct supertype *st, char *mddev, int mdfd,
                }
 
        }
-       if (must_close) close(mdfd);
+       close(mdfd);
        return 0;
 }
+
+#ifndef MDASSEMBLE
+int assemble_container_content(struct supertype *st, int mdfd,
+                              struct mdinfo *content, int runstop,
+                              char *chosen_name, int verbose)
+{ /* Add content->devs to the array open on mdfd and start it when allowed; mdfd is closed on every return path. */
+       struct mdinfo *dev, *sra;
+       int working = 0, preexist = 0; /* working: disks added by this call; preexist: adds that failed with EEXIST */
+       struct map_ent *map = NULL;
+
+       sysfs_init(content, mdfd, 0);
+
+       sra = sysfs_read(mdfd, 0, GET_VERSION);
+       if (sra == NULL || strcmp(sra->text_version, content->text_version) != 0) /* sysfs_set_array is skipped when sysfs already reports the same text_version */
+               if (sysfs_set_array(content, md_get_version(mdfd)) != 0) {
+                       close(mdfd);
+                       return 1; /* NOTE(review): a non-NULL sra is not passed to sysfs_free() on this path */
+               }
+       if (sra)
+               sysfs_free(sra);
+
+       for (dev = content->devs; dev; dev = dev->next)
+               if (sysfs_add_disk(content, dev, 1) == 0)
+                       working++;
+               else if (errno == EEXIST) /* NOTE(review): relies on sysfs_add_disk leaving errno set on failure - confirm */
+                       preexist++;
+       if (working == 0) {
+               close(mdfd);
+               return 1;/* Nothing new, don't try to start */
+       }
+       
+       map_update(&map, fd2devnum(mdfd),
+                  content->text_version,
+                  content->uuid, chosen_name);
+
+       if (runstop > 0 ||
+                (working + preexist) >= content->array.working_disks) { /* start when runstop > 0, or when new + pre-existing disks reach working_disks */
+               int err;
+
+               switch(content->array.level) {
+               case LEVEL_LINEAR:
+               case LEVEL_MULTIPATH:
+               case 0: /* these three levels are set straight to "active"; mdmon is not touched here */
+                       err = sysfs_set_str(content, NULL, "array_state",
+                                           "active");
+                       break;
+               default: /* every other level is set to "readonly" first */
+                       err = sysfs_set_str(content, NULL, "array_state",
+                                     "readonly");
+                       /* start mdmon if needed. */
+                       if (!err) {
+                               if (!mdmon_running(st->container_dev))
+                                       start_mdmon(st->container_dev);
+                               ping_monitor(devnum2devname(st->container_dev));
+                       }
+                       break;
+               }
+               if (!err)
+                       sysfs_set_safemode(content, content->safe_mode_delay);
+               if (verbose >= 0) {
+                       if (err) /* array_state write failed: report the device count without claiming a start */
+                               fprintf(stderr, Name
+                                       ": array %s now has %d devices",
+                                       chosen_name, working + preexist);
+                       else
+                               fprintf(stderr, Name
+                                       ": Started %s with %d devices",
+                                       chosen_name, working + preexist);
+                       if (preexist)
+                               fprintf(stderr, " (%d new)", working);
+                       fprintf(stderr, "\n");
+               }
+               if (!err)
+                       wait_for(chosen_name, mdfd);
+               close(mdfd);
+               return 0; /* NOTE(review): 0 is returned even when err != 0 */
+               /* FIXME should have an O_EXCL and wait for read-auto */
+       } else {
+               if (verbose >= 0)
+                       fprintf(stderr, Name
+                               ": %s assembled with %d devices but "
+                               "not started\n",
+                               chosen_name, working); /* counts only the newly added disks, not working + preexist */
+               close(mdfd);
+               return 1;
+       }
+}
+#endif
+
diff --git a/Build.c b/Build.c
index 21fe2a51d3fd0b25d1ed5712b69de39818b8ca61..7f3925864731bf63e0c1c1a10357a3476a95a2fe 100644 (file)
--- a/Build.c
+++ b/Build.c
@@ -1,7 +1,7 @@
 /*
  * mdadm - manage Linux "md" devices aka RAID arrays.
  *
- * Copyright (C) 2001-2006 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
  *
  *
  *    This program is free software; you can redistribute it and/or modify
  *    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  *
  *    Author: Neil Brown
- *    Email: <neilb@cse.unsw.edu.au>
- *    Paper: Neil Brown
- *           School of Computer Science and Engineering
- *           The University of New South Wales
- *           Sydney, 2052
- *           Australia
+ *    Email: <neilb@suse.de>
  */
 
 #include "mdadm.h"
 #define START_MD               _IO (MD_MAJOR, 2)
 #define STOP_MD                _IO (MD_MAJOR, 3)
 
-int Build(char *mddev, int mdfd, int chunk, int level, int layout,
-         int raiddisks,
-         mddev_dev_t devlist, int assume_clean,
-         char *bitmap_file, int bitmap_chunk, int write_behind, int delay,
-         int verbose, unsigned long long size)
+int Build(char *mddev, int chunk, int level, int layout,
+         int raiddisks, mddev_dev_t devlist, int assume_clean,
+         char *bitmap_file, int bitmap_chunk, int write_behind,
+         int delay, int verbose, int autof, unsigned long long size)
 {
        /* Build a linear or raid0 arrays without superblocks
         * We cannot really do any checks, we just do it.
@@ -59,6 +53,10 @@ int Build(char *mddev, int mdfd, int chunk, int level, int layout,
        mddev_dev_t dv;
        int bitmap_fd;
        unsigned long long bitmapsize;
+       int mdfd;
+       char chosen_name[1024];
+       int uuid[4] = {0,0,0,0};
+       struct map_ent *map = NULL;
 
        /* scan all devices, make sure they really are block devices */
        for (dv = devlist; dv; dv=dv->next) {
@@ -112,6 +110,18 @@ int Build(char *mddev, int mdfd, int chunk, int level, int layout,
                        break;
                }
 
+       /* We need to create the device.  It can have no name. */
+       map_lock(&map);
+       mdfd = create_mddev(mddev, NULL, autof, LOCAL,
+                           chosen_name);
+       if (mdfd < 0) {
+               map_unlock(&map);
+               return 1;
+       }
+       mddev = chosen_name;
+
+       map_update(&map, fd2devnum(mdfd), "none", uuid, chosen_name);
+       map_unlock(&map);
 
        vers = md_get_version(mdfd);
 
@@ -140,17 +150,17 @@ int Build(char *mddev, int mdfd, int chunk, int level, int layout,
                if (ioctl(mdfd, SET_ARRAY_INFO, &array)) {
                        fprintf(stderr, Name ": SET_ARRAY_INFO failed for %s: %s\n",
                                mddev, strerror(errno));
-                       return 1;
+                       goto abort;
                }
        } else if (bitmap_file) {
                fprintf(stderr, Name ": bitmaps not supported with this kernel\n");
-               return 1;
+               goto abort;
        }
 
        if (bitmap_file && level <= 0) {
                fprintf(stderr, Name ": bitmaps not meaningful with level %s\n",
                        map_num(pers, level)?:"given");
-               return 1;
+               goto abort;
        }
        /* now add the devices */
        for ((i=0), (dv = devlist) ; dv ; i++, dv=dv->next) {
@@ -211,7 +221,7 @@ int Build(char *mddev, int mdfd, int chunk, int level, int layout,
                                if (bitmap_chunk == UnSet) {
                                        fprintf(stderr, Name ": %s cannot be openned.",
                                                bitmap_file);
-                                       return 1;
+                                       goto abort;
                                }
 #endif
                                if (vers < 9003) {
@@ -224,20 +234,20 @@ int Build(char *mddev, int mdfd, int chunk, int level, int layout,
                                bitmapsize = size>>9; /* FIXME wrong for RAID10 */
                                if (CreateBitmap(bitmap_file, 1, NULL, bitmap_chunk,
                                                 delay, write_behind, bitmapsize, major)) {
-                                       return 1;
+                                       goto abort;
                                }
                                bitmap_fd = open(bitmap_file, O_RDWR);
                                if (bitmap_fd < 0) {
                                        fprintf(stderr, Name ": %s cannot be openned.",
                                                bitmap_file);
-                                       return 1;
+                                       goto abort;
                                }
                        }
                        if (bitmap_fd >= 0) {
                                if (ioctl(mdfd, SET_BITMAP_FILE, bitmap_fd) < 0) {
                                        fprintf(stderr, Name ": Cannot set bitmap file for %s: %s\n",
                                                mddev, strerror(errno));
-                                       return 1;
+                                       goto abort;
                                }
                        }
                }
@@ -265,6 +275,8 @@ int Build(char *mddev, int mdfd, int chunk, int level, int layout,
        if (verbose >= 0)
                fprintf(stderr, Name ": array %s built and started.\n",
                        mddev);
+       wait_for(mddev, mdfd);
+       close(mdfd);
        return 0;
 
  abort:
@@ -272,5 +284,6 @@ int Build(char *mddev, int mdfd, int chunk, int level, int layout,
            ioctl(mdfd, STOP_ARRAY, 0);
        else
            ioctl(mdfd, STOP_MD, 0);
+       close(mdfd);
        return 1;
 }
index 9e65d0a9f6516d125cfb5f1c96efb7106a678f46..8a73799c4448c727308228dc51c34049093ebe65 100644 (file)
--- a/Create.c
+++ b/Create.c
@@ -1,7 +1,7 @@
 /*
  * mdadm - manage Linux "md" devices aka RAID arrays.
  *
- * Copyright (C) 2001-2006 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
  *
  *
  *    This program is free software; you can redistribute it and/or modify
  *    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  *
  *    Author: Neil Brown
- *    Email: <neilb@cse.unsw.edu.au>
- *    Paper: Neil Brown
- *           School of Computer Science and Engineering
- *           The University of New South Wales
- *           Sydney, 2052
- *           Australia
+ *    Email: <neilb@suse.de>
  */
 
 #include "mdadm.h"
 #include       "md_p.h"
 #include       <ctype.h>
 
-int Create(struct supertype *st, char *mddev, int mdfd,
+static int default_layout(struct supertype *st, int level, int verbose)
+{ /* Choose a layout for 'level': the metadata handler's default_layout hook first, then the built-in per-level defaults. */
+       int layout = UnSet;
+
+       if (st && st->ss->default_layout) /* both st and the hook may be absent */
+               layout = st->ss->default_layout(level);
+
+       if (layout == UnSet) /* no hook was called, or the hook returned UnSet */
+               switch(level) {
+               default: /* no layout */
+                       layout = 0;
+                       break;
+               case 10:
+                       layout = 0x102; /* near=2, far=1 */
+                       if (verbose > 0)
+                               fprintf(stderr,
+                                       Name ": layout defaults to n1\n"); /* NOTE(review): message says "n1" but the value above is commented as near=2 - confirm */
+                       break;
+               case 5:
+               case 6: /* levels 5 and 6 share the r5layout table's "default" entry */
+                       layout = map_name(r5layout, "default");
+                       if (verbose > 0)
+                               fprintf(stderr,
+                                       Name ": layout defaults to %s\n", map_num(r5layout, layout));
+                       break;
+               case LEVEL_FAULTY:
+                       layout = map_name(faultylayout, "default");
+
+                       if (verbose > 0)
+                               fprintf(stderr,
+                                       Name ": layout defaults to %s\n", map_num(faultylayout, layout));
+                       break;
+               }
+
+       return layout; /* the handler's value, or the built-in default chosen above */
+}
+
+
+int Create(struct supertype *st, char *mddev,
           int chunk, int level, int layout, unsigned long long size, int raiddisks, int sparedisks,
           char *name, char *homehost, int *uuid,
           int subdevs, mddev_dev_t devlist,
           int runstop, int verbose, int force, int assume_clean,
-          char *bitmap_file, int bitmap_chunk, int write_behind, int delay)
+          char *bitmap_file, int bitmap_chunk, int write_behind, int delay, int autof)
 {
        /*
         * Create a new raid array.
@@ -55,6 +88,7 @@ int Create(struct supertype *st, char *mddev, int mdfd,
         * if runstop==run, or raiddisks disks were used,
         * RUN_ARRAY
         */
+       int mdfd;
        unsigned long long minsize=0, maxsize=0;
        char *mindisc = NULL;
        char *maxdisc = NULL;
@@ -66,31 +100,35 @@ int Create(struct supertype *st, char *mddev, int mdfd,
        int second_missing = subdevs * 2;
        int missing_disks = 0;
        int insert_point = subdevs * 2; /* where to insert a missing drive */
+       int total_slots;
        int pass;
        int vers;
        int rv;
        int bitmap_fd;
+       int have_container = 0;
+       int container_fd = -1;
+       int need_mdmon = 0;
        unsigned long long bitmapsize;
-       struct mdinfo info;
+       struct mdinfo info, *infos;
+       int did_default = 0;
+       int do_default_layout = 0;
+       unsigned long safe_mode_delay = 0;
+       char chosen_name[1024];
+       struct map_ent *map = NULL;
+       unsigned long long newsize;
 
        int major_num = BITMAP_MAJOR_HI;
 
        memset(&info, 0, sizeof(info));
 
-       vers = md_get_version(mdfd);
-       if (vers < 9000) {
-               fprintf(stderr, Name ": Create requires md driver version 0.90.0 or later\n");
-               return 1;
-       } else {
-               mdu_array_info_t inf;
-               memset(&inf, 0, sizeof(inf));
-               ioctl(mdfd, GET_ARRAY_INFO, &inf);
-               if (inf.working_disks != 0) {
-                       fprintf(stderr, Name ": another array by this name"
-                               " is already running.\n");
-                       return 1;
-               }
+       if (level == UnSet) {
+               /* "ddf" and "imsm" metadata only supports one level - should possibly
+                * push this into metadata handler??
+                */
+               if (st && (st->ss == &super_ddf || st->ss == &super_imsm))
+                       level = LEVEL_CONTAINER;
        }
+
        if (level == UnSet) {
                fprintf(stderr,
                        Name ": a RAID level is needed to create an array.\n");
@@ -116,11 +154,55 @@ int Create(struct supertype *st, char *mddev, int mdfd,
                        Name ": This level does not support spare devices\n");
                return 1;
        }
+
+       if (subdevs == 1 && strcmp(devlist->devname, "missing") != 0) {
+               /* If given a single device, it might be a container, and we can
+                * extract a device list from there
+                */
+               mdu_array_info_t inf;
+               int fd;
+
+               memset(&inf, 0, sizeof(inf));
+               fd = open(devlist->devname, O_RDONLY);
+               if (fd >= 0 &&
+                   ioctl(fd, GET_ARRAY_INFO, &inf) == 0 &&
+                   inf.raid_disks == 0) {
+                       /* yep, looks like a container */
+                       if (st) {
+                               rv = st->ss->load_super(st, fd,
+                                                       devlist->devname);
+                               if (rv == 0)
+                                       have_container = 1;
+                       } else {
+                               st = guess_super(fd);
+                               if (st && !(rv = st->ss->
+                                           load_super(st, fd,
+                                                      devlist->devname)))
+                                       have_container = 1;
+                               else
+                                       st = NULL;
+                       }
+                       if (have_container) {
+                               subdevs = raiddisks;
+                               first_missing = subdevs * 2;
+                               second_missing = subdevs * 2;
+                               insert_point = subdevs * 2;
+                       }
+               }
+               if (fd >= 0)
+                       close(fd);
+       }
+       if (st && st->ss->external && sparedisks) {
+               fprintf(stderr,
+                       Name ": This metadata type does not support "
+                       "spare disks are create time\n");
+               return 1;
+       }
        if (subdevs > raiddisks+sparedisks) {
                fprintf(stderr, Name ": You have listed more devices (%d) than are in the array(%d)!\n", subdevs, raiddisks+sparedisks);
                return 1;
        }
-       if (subdevs < raiddisks+sparedisks) {
+       if (!have_container && subdevs < raiddisks+sparedisks) {
                fprintf(stderr, Name ": You haven't given enough devices (real or missing) to create this array\n");
                return 1;
        }
@@ -131,32 +213,12 @@ int Create(struct supertype *st, char *mddev, int mdfd,
        }
 
        /* now set some defaults */
-       if (layout == UnSet)
-               switch(level) {
-               default: /* no layout */
-                       layout = 0;
-                       break;
-               case 10:
-                       layout = 0x102; /* near=2, far=1 */
-                       if (verbose > 0)
-                               fprintf(stderr,
-                                       Name ": layout defaults to n1\n");
-                       break;
-               case 5:
-               case 6:
-                       layout = map_name(r5layout, "default");
-                       if (verbose > 0)
-                               fprintf(stderr,
-                                       Name ": layout defaults to %s\n", map_num(r5layout, layout));
-                       break;
-               case LEVEL_FAULTY:
-                       layout = map_name(faultylayout, "default");
 
-                       if (verbose > 0)
-                               fprintf(stderr,
-                                       Name ": layout defaults to %s\n", map_num(faultylayout, layout));
-                       break;
-               }
+
+       if (layout == UnSet) {
+               do_default_layout = 1;
+               layout = default_layout(st, level, verbose);
+       }
 
        if (level == 10)
                /* check layout fits in array*/
@@ -182,6 +244,7 @@ int Create(struct supertype *st, char *mddev, int mdfd,
        case 1:
        case LEVEL_FAULTY:
        case LEVEL_MULTIPATH:
+       case LEVEL_CONTAINER:
                if (chunk) {
                        chunk = 0;
                        if (verbose > 0)
@@ -192,15 +255,27 @@ int Create(struct supertype *st, char *mddev, int mdfd,
                fprintf(stderr, Name ": unknown level %d\n", level);
                return 1;
        }
+       
+       if (size && chunk)
+               size &= ~(unsigned long long)(chunk - 1);
+       newsize = size * 2;
+       if (st && ! st->ss->validate_geometry(st, level, layout, raiddisks,
+                                             chunk, size*2, NULL, &newsize, verbose>=0))
+               return 1;
+       if (size == 0) {
+               size = newsize / 2;
+               if (size && verbose > 0)
+                       fprintf(stderr, Name ": setting size to %lluK\n",
+                               (unsigned long long)size);
+       }
 
        /* now look at the subdevs */
        info.array.active_disks = 0;
        info.array.working_disks = 0;
        dnum = 0;
-       for (dv=devlist; dv; dv=dv->next, dnum++) {
+       for (dv=devlist; dv && !have_container; dv=dv->next, dnum++) {
                char *dname = dv->devname;
-               unsigned long long ldsize, freesize;
-               int fd;
+               unsigned long long freesize;
                if (strcasecmp(dname, "missing")==0) {
                        if (first_missing > dnum)
                                first_missing = dnum;
@@ -212,18 +287,6 @@ int Create(struct supertype *st, char *mddev, int mdfd,
                info.array.working_disks++;
                if (dnum < raiddisks)
                        info.array.active_disks++;
-               fd = open(dname, O_RDONLY|O_EXCL);
-               if (fd <0 ) {
-                       fprintf(stderr, Name ": Cannot open %s: %s\n",
-                               dname, strerror(errno));
-                       fail=1;
-                       continue;
-               }
-               if (!get_dev_size(fd, dname, &ldsize)) {
-                       fail = 1;
-                       close(fd);
-                       continue;
-               }
                if (st == NULL) {
                        struct createinfo *ci = conf_get_create_info();
                        if (ci)
@@ -231,33 +294,46 @@ int Create(struct supertype *st, char *mddev, int mdfd,
                }
                if (st == NULL) {
                        /* Need to choose a default metadata, which is different
-                        * depending on the sizes of devices
+                        * depending on geometry of array.
                         */
                        int i;
                        char *name = "default";
-                       if (level >= 1 && ldsize > (0x7fffffffULL<<10))
-                               name = "default/large";
-                       for(i=0; !st && superlist[i]; i++)
+                       for(i=0; !st && superlist[i]; i++) {
                                st = superlist[i]->match_metadata_desc(name);
+                               if (do_default_layout)
+                                       layout = default_layout(st, level, verbose);
+                               if (st && !st->ss->validate_geometry
+                                               (st, level, layout, raiddisks,
+                                                chunk, size*2, dname, &freesize,
+                                                verbose > 0))
+                                       st = NULL;
+                       }
 
                        if (!st) {
-                               fprintf(stderr, Name ": internal error - no default metadata style\n");
+                               fprintf(stderr, Name ": device %s not suitable "
+                                       "for any style of array\n",
+                                       dname);
                                exit(2);
                        }
-                       if (st->ss->major != 0 ||
+                       if (st->ss != &super0 ||
                            st->minor_version != 90)
-                               fprintf(stderr, Name ": Defaulting to version"
-                                       " %d.%d metadata\n",
-                                       st->ss->major,
-                                       st->minor_version);
-               }
-               freesize = st->ss->avail_size(st, ldsize >> 9);
-               if (freesize == 0) {
-                       fprintf(stderr, Name ": %s is too small: %luK\n",
-                               dname, (unsigned long)(ldsize>>10));
-                       fail = 1;
-                       close(fd);
-                       continue;
+                               did_default = 1;
+               } else {
+                       if (do_default_layout)
+                               layout = default_layout(st, level, verbose);
+                       if (!st->ss->validate_geometry(st, level, layout,
+                                                      raiddisks,
+                                                      chunk, size*2, dname,
+                                                      &freesize,
+                                                      verbose > 0)) {
+
+                               fprintf(stderr,
+                                       Name ": %s is not suitable for "
+                                       "this array.\n",
+                                       dname);
+                               fail = 1;
+                               continue;
+                       }
                }
 
                freesize /= 2; /* convert to K */
@@ -267,10 +343,10 @@ int Create(struct supertype *st, char *mddev, int mdfd,
                }
 
                if (size && freesize < size) {
-                       fprintf(stderr, Name ": %s is smaller that given size."
-                               " %lluK < %lluK + superblock\n", dname, freesize, size);
+                       fprintf(stderr, Name ": %s is smaller than given size."
+                               " %lluK < %lluK + metadata\n",
+                               dname, freesize, size);
                        fail = 1;
-                       close(fd);
                        continue;
                }
                if (maxdisc == NULL || (maxdisc && freesize > maxsize)) {
@@ -282,24 +358,38 @@ int Create(struct supertype *st, char *mddev, int mdfd,
                        minsize = freesize;
                }
                if (runstop != 1 || verbose >= 0) {
+                       int fd = open(dname, O_RDONLY);
+                       if (fd <0 ) {
+                               fprintf(stderr, Name ": Cannot open %s: %s\n",
+                                       dname, strerror(errno));
+                               fail=1;
+                               continue;
+                       }
                        warn |= check_ext2(fd, dname);
                        warn |= check_reiser(fd, dname);
                        warn |= check_raid(fd, dname);
+                       close(fd);
                }
-               close(fd);
        }
+       if (have_container)
+               info.array.working_disks = raiddisks;
        if (fail) {
                fprintf(stderr, Name ": create aborted\n");
                return 1;
        }
        if (size == 0) {
-               if (mindisc == NULL) {
+               if (mindisc == NULL && !have_container) {
                        fprintf(stderr, Name ": no size and no drives given - aborting create.\n");
                        return 1;
                }
-               if (level > 0 || level == LEVEL_MULTIPATH || level == LEVEL_FAULTY) {
+               if (level > 0 || level == LEVEL_MULTIPATH
+                   || level == LEVEL_FAULTY
+                   || st->ss->external ) {
                        /* size is meaningful */
-                       if (minsize > 0x100000000ULL && st->ss->major == 0) {
+                       if (!st->ss->validate_geometry(st, level, layout,
+                                                      raiddisks,
+                                                      chunk, minsize*2,
+                                                      NULL, NULL, 0)) {
                                fprintf(stderr, Name ": devices too large for RAID level %d\n", level);
                                return 1;
                        }
@@ -308,13 +398,21 @@ int Create(struct supertype *st, char *mddev, int mdfd,
                                fprintf(stderr, Name ": size set to %lluK\n", size);
                }
        }
-       if (level > 0 && ((maxsize-size)*100 > maxsize)) {
+       if (!have_container && level > 0 && ((maxsize-size)*100 > maxsize)) {
                if (runstop != 1 || verbose >= 0)
-                       fprintf(stderr, Name ": largest drive (%s) exceed size (%lluK) by more than 1%%\n",
+                       fprintf(stderr, Name ": largest drive (%s) exceeds size (%lluK) by more than 1%%\n",
                                maxdisc, size);
                warn = 1;
        }
 
+       if (st->ss->detail_platform && st->ss->detail_platform(0, 1) != 0) {
+               if (runstop != 1 || verbose >= 0)
+                       fprintf(stderr, Name ": %s unable to enumerate platform support\n"
+                               "    array may not be compatible with hardware/firmware\n",
+                               st->ss->name);
+               warn = 1;
+       }
+
        if (warn) {
                if (runstop!= 1) {
                        if (!ask("Continue creating array? ")) {
@@ -331,7 +429,8 @@ int Create(struct supertype *st, char *mddev, int mdfd,
         * as missing, so that a reconstruct happens (faster than re-parity)
         * FIX: Can we do this for raid6 as well?
         */
-       if (assume_clean==0 && force == 0 && first_missing >= raiddisks) {
+       if (st->ss->external == 0 &&
+           assume_clean==0 && force == 0 && first_missing >= raiddisks) {
                switch ( level ) {
                case 4:
                case 5:
@@ -348,6 +447,7 @@ int Create(struct supertype *st, char *mddev, int mdfd,
         * into a spare, else the create will fail
         */
        if (assume_clean == 0 && force == 0 && first_missing < raiddisks &&
+           st->ss->external == 0 &&
            second_missing >= raiddisks && level == 6) {
                insert_point = raiddisks - 1;
                if (insert_point == first_missing)
@@ -357,12 +457,34 @@ int Create(struct supertype *st, char *mddev, int mdfd,
                missing_disks++;
        }
 
-       if (level <= 0 && first_missing != subdevs * 2) {
+       if (level <= 0 && first_missing < subdevs * 2) {
                fprintf(stderr,
                        Name ": This level does not support missing devices\n");
                return 1;
        }
 
+       /* We need to create the device */
+       map_lock(&map);
+       mdfd = create_mddev(mddev, name, autof, LOCAL, chosen_name);
+       if (mdfd < 0)
+               return 1;
+       mddev = chosen_name;
+
+       vers = md_get_version(mdfd);
+       if (vers < 9000) {
+               fprintf(stderr, Name ": Create requires md driver version 0.90.0 or later\n");
+               goto abort;
+       } else {
+               mdu_array_info_t inf;
+               memset(&inf, 0, sizeof(inf));
+               ioctl(mdfd, GET_ARRAY_INFO, &inf);
+               if (inf.working_disks != 0) {
+                       fprintf(stderr, Name ": another array by this name"
+                               " is already running.\n");
+                       goto abort;
+               }
+       }
+
        /* Ok, lets try some ioctls */
 
        info.array.level = level;
@@ -382,12 +504,16 @@ int Create(struct supertype *st, char *mddev, int mdfd,
             ( level == 6 && (insert_point < raiddisks
                              || second_missing < raiddisks))
             ||
+            ( level <= 0 )
+            ||
             assume_clean
-               )
+               ) {
                info.array.state = 1; /* clean, but one+ drive will be missing*/
-       else
+               info.resync_start = ~0ULL;
+       } else {
                info.array.state = 0; /* not clean, but no errors */
-
+               info.resync_start = 0;
+       }
        if (level == 10) {
                /* for raid10, the bitmap size is the capacity of the array,
                 * which is array.size * raid_disks / ncopies;
@@ -424,7 +550,6 @@ int Create(struct supertype *st, char *mddev, int mdfd,
                + info.array.failed_disks;
        info.array.layout = layout;
        info.array.chunk_size = chunk*1024;
-       info.array.major_version = st->ss->major;
 
        if (name == NULL || *name == 0) {
                /* base name on mddev */
@@ -435,6 +560,7 @@ int Create(struct supertype *st, char *mddev, int mdfd,
                 *  /dev/md/home -> home
                 *  /dev/mdhome -> home
                 */
+               /* FIXME compare this with rules in create_mddev */
                name = strrchr(mddev, '/');
                if (name) {
                        name++;
@@ -451,7 +577,37 @@ int Create(struct supertype *st, char *mddev, int mdfd,
                }
        }
        if (!st->ss->init_super(st, &info.array, size, name, homehost, uuid))
-               return 1;
+               goto abort;
+
+       total_slots = info.array.nr_disks;
+       sysfs_init(&info, mdfd, 0);
+       st->ss->getinfo_super(st, &info);
+
+       if (did_default && verbose >= 0) {
+               if (is_subarray(info.text_version)) {
+                       int dnum = devname2devnum(info.text_version+1);
+                       char *path;
+                       int mdp = get_mdp_major();
+                       struct mdinfo *mdi;
+                       if (dnum > 0)
+                               path = map_dev(MD_MAJOR, dnum, 1);
+                       else
+                               path = map_dev(mdp, (-1-dnum)<< 6, 1);
+
+                       mdi = sysfs_read(-1, dnum, GET_VERSION);
+
+                       fprintf(stderr, Name ": Creating array inside "
+                               "%s container %s\n", 
+                               mdi?mdi->text_version:"managed", path);
+                       sysfs_free(mdi);
+               } else
+                       fprintf(stderr, Name ": Defaulting to version"
+                               " %s metadata\n", info.text_version);
+       }
+
+       map_update(&map, fd2devnum(mdfd), info.text_version,
+                  info.uuid, chosen_name);
+       map_unlock(&map);
 
        if (bitmap_file && vers < 9003) {
                major_num = BITMAP_MAJOR_HOSTENDIAN;
@@ -464,31 +620,55 @@ int Create(struct supertype *st, char *mddev, int mdfd,
        if (bitmap_file && strcmp(bitmap_file, "internal")==0) {
                if ((vers%100) < 2) {
                        fprintf(stderr, Name ": internal bitmaps not supported by this kernel.\n");
-                       return 1;
+                       goto abort;
                }
                if (!st->ss->add_internal_bitmap(st, &bitmap_chunk,
                                                 delay, write_behind,
                                                 bitmapsize, 1, major_num)) {
                        fprintf(stderr, Name ": Given bitmap chunk size not supported.\n");
-                       return 1;
+                       goto abort;
                }
                bitmap_file = NULL;
        }
 
 
+       sysfs_init(&info, mdfd, 0);
 
-       if ((vers % 100) >= 1) { /* can use different versions */
-               mdu_array_info_t inf;
-               memset(&inf, 0, sizeof(inf));
-               inf.major_version = st->ss->major;
-               inf.minor_version = st->minor_version;
-               rv = ioctl(mdfd, SET_ARRAY_INFO, &inf);
-       } else
-               rv = ioctl(mdfd, SET_ARRAY_INFO, NULL);
+       if (st->ss->external && st->subarray[0]) {
+               /* member */
+
+               /* When creating a member, we need to be careful
+                * to negotiate with mdmon properly.
+                * If it is already running, we cannot write to
+                * the devices and must ask it to do that part.
+                * If it isn't running, we write to the devices,
+                * and then start it.
+                * We hold an exclusive open on the container
+                * device to make sure mdmon doesn't exit after
+                * we checked that it is running.
+                *
+                * For now, fail if it is already running.
+                */
+               container_fd = open_dev_excl(st->container_dev);
+               if (container_fd < 0) {
+                       fprintf(stderr, Name ": Cannot get exclusive "
+                               "open on container - weird.\n");
+                       goto abort;
+               }
+               if (mdmon_running(st->container_dev)) {
+                       if (verbose)
+                               fprintf(stderr, Name ": reusing mdmon "
+                                       "for %s.\n",
+                                       devnum2devname(st->container_dev));
+                       st->update_tail = &st->updates;
+               } else
+                       need_mdmon = 1;
+       }
+       rv = set_array_info(mdfd, st, &info);
        if (rv) {
-               fprintf(stderr, Name ": SET_ARRAY_INFO failed for %s: %s\n",
+               fprintf(stderr, Name ": failed to set array info for %s: %s\n",
                        mddev, strerror(errno));
-               return 1;
+               goto abort;
        }
 
        if (bitmap_file) {
@@ -499,22 +679,22 @@ int Create(struct supertype *st, char *mddev, int mdfd,
                                 delay, write_behind,
                                 bitmapsize,
                                 major_num)) {
-                       return 1;
+                       goto abort;
                }
                bitmap_fd = open(bitmap_file, O_RDWR);
                if (bitmap_fd < 0) {
                        fprintf(stderr, Name ": weird: %s cannot be openned\n",
                                bitmap_file);
-                       return 1;
+                       goto abort;
                }
                if (ioctl(mdfd, SET_BITMAP_FILE, bitmap_fd) < 0) {
                        fprintf(stderr, Name ": Cannot set bitmap file for %s: %s\n",
                                mddev, strerror(errno));
-                       return 1;
+                       goto abort;
                }
        }
 
-
+       infos = malloc(sizeof(*infos) * total_slots);
 
        for (pass=1; pass <=2 ; pass++) {
                mddev_dev_t moved_disk = NULL; /* the disk that was moved out of the insert point */
@@ -523,76 +703,153 @@ int Create(struct supertype *st, char *mddev, int mdfd,
                     dv=(dv->next)?(dv->next):moved_disk, dnum++) {
                        int fd;
                        struct stat stb;
+                       struct mdinfo *inf = &infos[dnum];
 
-                       info.disk.number = dnum;
+                       if (dnum >= total_slots)
+                               abort();
                        if (dnum == insert_point) {
                                moved_disk = dv;
+                               continue;
                        }
-                       info.disk.raid_disk = info.disk.number;
-                       if (info.disk.raid_disk < raiddisks)
-                               info.disk.state = (1<<MD_DISK_ACTIVE) |
+                       if (strcasecmp(dv->devname, "missing")==0)
+                               continue;
+                       if (have_container)
+                               moved_disk = NULL;
+                       if (have_container && dnum < info.array.raid_disks - 1)
+                               /* repeatedly use the container */
+                               moved_disk = dv;
+
+                       switch(pass) {
+                       case 1:
+                               *inf = info;
+
+                               inf->disk.number = dnum;
+                               inf->disk.raid_disk = dnum;
+                               if (inf->disk.raid_disk < raiddisks)
+                                       inf->disk.state = (1<<MD_DISK_ACTIVE) |
                                                (1<<MD_DISK_SYNC);
-                       else
-                               info.disk.state = 0;
-                       if (dv->writemostly == 1)
-                               info.disk.state |= (1<<MD_DISK_WRITEMOSTLY);
-
-                       if (dnum == insert_point ||
-                           strcasecmp(dv->devname, "missing")==0) {
-                               info.disk.major = 0;
-                               info.disk.minor = 0;
-                               info.disk.state = (1<<MD_DISK_FAULTY);
-                       } else {
-                               fd = open(dv->devname, O_RDONLY|O_EXCL);
-                               if (fd < 0) {
-                                       fprintf(stderr, Name ": failed to open %s after earlier success - aborting\n",
-                                               dv->devname);
-                                       return 1;
+                               else
+                                       inf->disk.state = 0;
+
+                               if (dv->writemostly == 1)
+                                       inf->disk.state |= (1<<MD_DISK_WRITEMOSTLY);
+
+                               if (have_container)
+                                       fd = -1;
+                               else {
+                                       if (st->ss->external && st->subarray[0])
+                                               fd = open(dv->devname, O_RDWR);
+                                       else
+                                               fd = open(dv->devname, O_RDWR|O_EXCL);
+
+                                       if (fd < 0) {
+                                               fprintf(stderr, Name ": failed to open %s "
+                                                       "after earlier success - aborting\n",
+                                                       dv->devname);
+                                               goto abort;
+                                       }
+                                       fstat(fd, &stb);
+                                       inf->disk.major = major(stb.st_rdev);
+                                       inf->disk.minor = minor(stb.st_rdev);
+                               }
+                               if (fd >= 0)
+                                       remove_partitions(fd);
+                               if (st->ss->add_to_super(st, &inf->disk,
+                                                        fd, dv->devname))
+                                       goto abort;
+                               st->ss->getinfo_super(st, inf);
+                               safe_mode_delay = inf->safe_mode_delay;
+
+                               if (have_container && verbose > 0)
+                                       fprintf(stderr, Name ": Using %s for device %d\n",
+                                               map_dev(inf->disk.major,
+                                                       inf->disk.minor,
+                                                       0), dnum);
+
+                               if (!have_container) {
+                                       /* getinfo_super might have lost these ... */
+                                       inf->disk.major = major(stb.st_rdev);
+                                       inf->disk.minor = minor(stb.st_rdev);
                                }
-                               fstat(fd, &stb);
-                               info.disk.major = major(stb.st_rdev);
-                               info.disk.minor = minor(stb.st_rdev);
-                               remove_partitions(fd);
-                               close(fd);
-                       }
-                       switch(pass){
-                       case 1:
-                               st->ss->add_to_super(st, &info.disk);
                                break;
                        case 2:
-                               if (info.disk.state == 1) break;
-                               Kill(dv->devname, 0, 1); /* Just be sure it is clean */
-                               Kill(dv->devname, 0, 1); /* and again, there could be two superblocks */
-                               st->ss->write_init_super(st, &info.disk,
-                                                        dv->devname);
-
-                               if (ioctl(mdfd, ADD_NEW_DISK, &info.disk)) {
-                                       fprintf(stderr, Name ": ADD_NEW_DISK for %s failed: %s\n",
+                               inf->errors = 0;
+                               rv = 0;
+
+                               rv = add_disk(mdfd, st, &info, inf);
+
+                               if (rv) {
+                                       fprintf(stderr,
+                                               Name ": ADD_NEW_DISK for %s "
+                                               "failed: %s\n",
                                                dv->devname, strerror(errno));
                                        st->ss->free_super(st);
-                                       return 1;
+                                       goto abort;
                                }
-
                                break;
                        }
-                       if (dv == moved_disk && dnum != insert_point) break;
+                       if (!have_container &&
+                           dv == moved_disk && dnum != insert_point) break;
+               }
+               if (pass == 1) {
+                       st->ss->write_init_super(st);
+                       flush_metadata_updates(st);
                }
        }
+       free(infos);
        st->ss->free_super(st);
 
-       /* param is not actually used */
-       if (runstop == 1 || subdevs >= raiddisks) {
-               mdu_param_t param;
-               if (ioctl(mdfd, RUN_ARRAY, &param)) {
-                       fprintf(stderr, Name ": RUN_ARRAY failed: %s\n",
-                               strerror(errno));
-                       Manage_runstop(mddev, mdfd, -1, 0);
-                       return 1;
+       if (level == LEVEL_CONTAINER) {
+               /* No need to start.  But we should signal udev to
+                * create links */
+               sysfs_uevent(&info, "change");
+               if (verbose >= 0)
+                       fprintf(stderr, Name ": container %s prepared.\n", mddev);
+               wait_for(chosen_name, mdfd);
+       } else if (runstop == 1 || subdevs >= raiddisks) {
+               if (st->ss->external) {
+                       switch(level) {
+                       case LEVEL_LINEAR:
+                       case LEVEL_MULTIPATH:
+                       case 0:
+                               sysfs_set_str(&info, NULL, "array_state",
+                                             "active");
+                               need_mdmon = 0;
+                               break;
+                       default:
+                               sysfs_set_str(&info, NULL, "array_state",
+                                             "readonly");
+                               break;
+                       }
+                       sysfs_set_safemode(&info, safe_mode_delay);
+               } else {
+                       /* param is not actually used */
+                       mdu_param_t param;
+                       if (ioctl(mdfd, RUN_ARRAY, &param)) {
+                               fprintf(stderr, Name ": RUN_ARRAY failed: %s\n",
+                                       strerror(errno));
+                               Manage_runstop(mddev, mdfd, -1, 0);
+                               goto abort;
+                       }
                }
                if (verbose >= 0)
                        fprintf(stderr, Name ": array %s started.\n", mddev);
+               if (st->ss->external && st->subarray[0]) {
+                       if (need_mdmon)
+                               start_mdmon(st->container_dev);
+
+                       ping_monitor(devnum2devname(st->container_dev));
+                       close(container_fd);
+               }
+               wait_for(chosen_name, mdfd);
        } else {
                fprintf(stderr, Name ": not starting array - not enough devices.\n");
        }
+       close(mdfd);
        return 0;
+
+ abort:
+       if (mdfd >= 0)
+               close(mdfd);
+       return 1;
 }
index 8f86ead88c1ef995d791ddd008907741ea2cc2fe..ab01cfb4b59092cfbd84c19808135ed995aa681a 100644 (file)
--- a/Detail.c
+++ b/Detail.c
@@ -1,7 +1,7 @@
 /*
  * mdadm - manage Linux "md" devices aka RAID arrays.
  *
- * Copyright (C) 2001-2006 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
  *
  *
  *    This program is free software; you can redistribute it and/or modify
  *    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  *
  *    Author: Neil Brown
- *    Email: <neilb@cse.unsw.edu.au>
- *    Paper: Neil Brown
- *           School of Computer Science and Engineering
- *           The University of New South Wales
- *           Sydney, 2052
- *           Australia
+ *    Email: <neilb@suse.de>
  */
 
 #include       "mdadm.h"
 #include       "md_p.h"
 #include       "md_u.h"
+#include       <dirent.h>
 
 int Detail(char *dev, int brief, int export, int test, char *homehost)
 {
@@ -56,6 +52,8 @@ int Detail(char *dev, int brief, int export, int test, char *homehost)
        int max_disks = MD_SB_DISKS; /* just a default */
        struct mdinfo info;
        struct mdinfo *sra;
+       char *member = NULL;
+       char *container = NULL;
 
        int rv = test ? 4 : 1;
        int avail_disks = 0;
@@ -96,7 +94,21 @@ int Detail(char *dev, int brief, int export, int test, char *homehost)
                stb.st_rdev = 0;
        rv = 0;
 
-       if (st) max_disks = st->max_devs;
+       if (st)
+               max_disks = st->max_devs;
+
+       if (sra && is_subarray(sra->text_version) &&
+               strchr(sra->text_version+1, '/')) {
+               /* This is a subarray of some container.
+                * We want the name of the container, and the member
+                */
+               char *s = strchr(sra->text_version+1, '/');
+               int dn;
+               *s++ = '\0';
+               member = s;
+               dn = devname2devnum(sra->text_version+1);
+               container = map_dev(dev2major(dn), dev2minor(dn), 1);
+       }
 
        /* try to load a superblock */
        for (d= 0; d<max_disks; d++) {
@@ -111,7 +123,8 @@ int Detail(char *dev, int brief, int export, int test, char *homehost)
                        continue;
                if ((dv=map_dev(disk.major, disk.minor, 1))) {
                        if ((!st || !st->sb) &&
-                           (disk.state & (1<<MD_DISK_ACTIVE))) {
+                           (array.raid_disks == 0 || 
+                            (disk.state & (1<<MD_DISK_ACTIVE)))) {
                                /* try to read the superblock from this device
                                 * to get more info
                                 */
@@ -119,8 +132,9 @@ int Detail(char *dev, int brief, int export, int test, char *homehost)
                                if (fd2 >=0 && st &&
                                    st->ss->load_super(st, fd2, NULL) == 0) {
                                        st->ss->getinfo_super(st, &info);
-                                       if (info.array.ctime != array.ctime ||
-                                           info.array.level != array.level)
+                                       if (array.raid_disks != 0 && /* container */
+                                           (info.array.ctime != array.ctime ||
+                                            info.array.level != array.level))
                                                st->ss->free_super(st);
                                }
                                if (fd2 >= 0) close(fd2);
@@ -132,30 +146,71 @@ int Detail(char *dev, int brief, int export, int test, char *homehost)
        c = map_num(pers, array.level);
 
        if (export) {
-               if (c)
-                       printf("MD_LEVEL=%s\n", c);
-               printf("MD_DEVICES=%d\n", array.raid_disks);
-               if (sra && sra->array.major_version < 0)
-                       printf("MD_METADATA=%s\n", sra->text_version);
-               else
-                       printf("MD_METADATA=%d.%02d\n",
-                              array.major_version, array.minor_version);
+               if (array.raid_disks) {
+                       if (c)
+                               printf("MD_LEVEL=%s\n", c);
+                       printf("MD_DEVICES=%d\n", array.raid_disks);
+               } else {
+                       printf("MD_LEVEL=container\n");
+                       printf("MD_DEVICES=%d\n", array.nr_disks);
+               }
+               if (container) {
+                       printf("MD_CONTAINER=%s\n", container);
+                       printf("MD_MEMBER=%s\n", member);
+               } else {
+                       if (sra && sra->array.major_version < 0)
+                               printf("MD_METADATA=%s\n", sra->text_version);
+                       else
+                               printf("MD_METADATA=%d.%02d\n",
+                                      array.major_version, array.minor_version);
+               }
+               
+               if (st && st->sb) {
+                       struct mdinfo info;
+                       char nbuf[64];
+                       struct map_ent *mp, *map = NULL;
+                       st->ss->getinfo_super(st, &info);
+                       fname_from_uuid(st, &info, nbuf, ':');
+                       printf("MD_UUID=%s\n", nbuf+5);
+                       mp = map_by_uuid(&map, info.uuid);
+                       if (mp && mp->path &&
+                           strncmp(mp->path, "/dev/md/", 8) == 0)
+                               printf("MD_DEVNAME=%s\n", mp->path+8);
 
-               if (st && st->sb)
-                       st->ss->export_detail_super(st);
+                       if (st->ss->export_detail_super)
+                               st->ss->export_detail_super(st);
+               } else {
+                       struct map_ent *mp, *map = NULL;
+                       mp = map_by_devnum(&map, fd2devnum(fd));
+                       if (mp && mp->path &&
+                           strncmp(mp->path, "/dev/md/", 8) == 0)
+                               printf("MD_DEVNAME=%s\n", mp->path+8);
+               }
                goto out;
        }
 
        if (brief) {
                mdu_bitmap_file_t bmf;
-               printf("ARRAY %s level=%s num-devices=%d", dev,
-                      c?c:"-unknown-",
-                      array.raid_disks );
-               if (sra && sra->array.major_version < 0)
-                       printf(" metadata=%s", sra->text_version);
-               else
-                       printf(" metadata=%d.%02d",
-                              array.major_version, array.minor_version);
+               printf("ARRAY %s", dev);
+               if (brief > 1) {
+                       if (array.raid_disks)
+                               printf("level=%s num-devices=%d",
+                                      c?c:"-unknown-",
+                                      array.raid_disks );
+                       else
+                               printf("level=container num-devices=%d",
+                                      array.nr_disks);
+               }
+               if (container) {
+                       printf(" container=%s", container);
+                       printf(" member=%s", member);
+               } else {
+                       if (sra && sra->array.major_version < 0)
+                               printf(" metadata=%s", sra->text_version);
+                       else
+                               printf(" metadata=%d.%02d",
+                                      array.major_version, array.minor_version);
+               }
 
                /* Only try GET_BITMAP_FILE for 0.90.01 and later */
                if (vers >= 9001 &&
@@ -180,14 +235,19 @@ int Detail(char *dev, int brief, int export, int test, char *homehost)
 
                printf("%s:\n", dev);
 
+               if (container)
+                       printf("      Container : %s, member %s\n", container, member);
+               else {
                if (sra && sra->array.major_version < 0)
                        printf("        Version : %s\n", sra->text_version);
                else
                        printf("        Version : %d.%02d\n",
                               array.major_version, array.minor_version);
+               }
 
                atime = array.ctime;
-               printf("  Creation Time : %.24s\n", ctime(&atime));
+               if (atime)
+                       printf("  Creation Time : %.24s\n", ctime(&atime));
                if (array.raid_disks == 0) c = "container";
                printf("     Raid Level : %s\n", c?c:"-unknown-");
                if (larray_size)
@@ -206,9 +266,13 @@ int Detail(char *dev, int brief, int export, int test, char *homehost)
                                printf("  Used Dev Size : %d%s\n", array.size,
                                       human_size((long long)array.size<<10));
                }
-               printf("   Raid Devices : %d\n", array.raid_disks);
+               if (array.raid_disks)
+                       printf("   Raid Devices : %d\n", array.raid_disks);
                printf("  Total Devices : %d\n", array.nr_disks);
-               printf("Preferred Minor : %d\n", array.md_minor);
+               if (!container && 
+                   ((sra == NULL && array.major_version == 0) ||
+                    (sra && sra->array.major_version == 0)))
+                       printf("Preferred Minor : %d\n", array.md_minor);
                if (sra == NULL || sra->array.major_version >= 0)
                        printf("    Persistence : Superblock is %spersistent\n",
                               array.not_persistent?"not ":"");
@@ -222,17 +286,22 @@ int Detail(char *dev, int brief, int export, int test, char *homehost)
                } else if (array.state & (1<<MD_SB_BITMAP_PRESENT))
                        printf("  Intent Bitmap : Internal\n\n");
                atime = array.utime;
-               printf("    Update Time : %.24s\n", ctime(&atime));
-               printf("          State : %s%s%s%s\n",
-                      (array.state&(1<<MD_SB_CLEAN))?"clean":"active",
-                      array.active_disks < array.raid_disks? ", degraded":"",
-                      (!e || e->percent < 0) ? "" :
-                       (e->resync) ? ", resyncing": ", recovering",
-                      larray_size ? "": ", Not Started");
-               printf(" Active Devices : %d\n", array.active_disks);
+               if (atime)
+                       printf("    Update Time : %.24s\n", ctime(&atime));
+               if (array.raid_disks)
+                       printf("          State : %s%s%s%s\n",
+                              (array.state&(1<<MD_SB_CLEAN))?"clean":"active",
+                              array.active_disks < array.raid_disks? ", degraded":"",
+                              (!e || e->percent < 0) ? "" :
+                              (e->resync) ? ", resyncing": ", recovering",
+                              larray_size ? "": ", Not Started");
+               if (array.raid_disks)
+                       printf(" Active Devices : %d\n", array.active_disks);
                printf("Working Devices : %d\n", array.working_disks);
-               printf(" Failed Devices : %d\n", array.failed_disks);
-               printf("  Spare Devices : %d\n", array.spare_disks);
+               if (array.raid_disks) {
+                       printf(" Failed Devices : %d\n", array.failed_disks);
+                       printf("  Spare Devices : %d\n", array.spare_disks);
+               }
                printf("\n");
                if (array.level == 5) {
                        c = map_num(r5layout, array.layout);
@@ -306,7 +375,45 @@ This is pretty boring
                if (st && st->sb)
                        st->ss->detail_super(st, homehost);
 
-               printf("    Number   Major   Minor   RaidDevice State\n");
+               if (array.raid_disks == 0 && sra && sra->array.major_version == -1
+                   && sra->array.minor_version == -2 && sra->text_version[0] != '/') {
+                       /* This looks like a container.  Find any active arrays
+                        * That claim to be a member.
+                        */
+                       DIR *dir = opendir("/sys/block");
+                       struct dirent *de;
+
+                       printf("  Member Arrays :");
+
+                       while (dir && (de = readdir(dir)) != NULL) {
+                               char path[200];
+                               char vbuf[1024];
+                               int nlen = strlen(sra->sys_name);
+                               int dn;
+                               if (de->d_name[0] == '.')
+                                       continue;
+                               sprintf(path, "/sys/block/%s/md/metadata_version",
+                                       de->d_name);
+                               if (load_sys(path, vbuf) < 0)
+                                       continue;
+                               if (strncmp(vbuf, "external:", 9) != 0 ||
+                                   !is_subarray(sra->sys_name+9) ||
+                                   strncmp(vbuf+10, sra->sys_name, nlen) != 0 ||
+                                   vbuf[10+nlen] != '/')
+                                       continue;
+                               dn = devname2devnum(de->d_name);
+                               printf(" %s", map_dev(dev2major(dn),
+                                                     dev2minor(dn), 1));
+                       }
+                       if (dir)
+                               closedir(dir);
+                       printf("\n\n");
+               }
+
+               if (array.raid_disks)
+                       printf("    Number   Major   Minor   RaidDevice State\n");
+               else
+                       printf("    Number   Major   Minor   RaidDevice\n");
        }
        disks = malloc(max_disks * sizeof(mdu_disk_info_t));
        for (d=0; d<max_disks; d++) {
@@ -350,6 +457,9 @@ This is pretty boring
                        else
                                printf("   %5d   %5d    %5d    %5d     ",
                                       disk.number, disk.major, disk.minor, disk.raid_disk);
+               }
+               if (!brief && array.raid_disks) {
+
                        if (disk.state & (1<<MD_DISK_FAULTY)) {
                                printf(" faulty");
                                if (disk.raid_disk < array.raid_disks &&
@@ -401,7 +511,7 @@ This is pretty boring
                }
                if (!brief) printf("\n");
        }
-       if (spares && brief) printf(" spares=%d", spares);
+       if (spares && brief && array.raid_disks) printf(" spares=%d", spares);
        if (brief && st && st->sb)
                st->ss->brief_detail_super(st);
        st->ss->free_super(st);
@@ -417,3 +527,44 @@ out:
        close(fd);
        return rv;
 }
+
+int Detail_Platform(struct superswitch *ss, int scan, int verbose)
+{
+       /* Display platform capabilities for the given metadata format 'ss' (may be NULL).
+        * 'scan' in this context means iterate over every metadata type in superlist[].
+        */
+       int i;
+       int err = 1; /* returned as-is unless ss->detail_platform() below replaces it */
+
+       if (ss && ss->detail_platform)
+               err = ss->detail_platform(verbose, 0); /* requested format has a handler: take its result */
+       else if (ss) { /* a format was given, but it provides no detail_platform handler */
+               if (verbose)
+                       fprintf(stderr, Name ": %s metadata is platform independent\n",
+                               ss->name ? : "[no name]"); /* GNU "a ?: b": the name when non-NULL, else the fallback */
+       } else if (!scan) { /* neither a metadata format nor a scan was requested */
+               if (verbose)
+                       fprintf(stderr, Name ": specify a metadata type or --scan\n");
+       }
+
+       if (!scan) /* single-format request: nothing further to iterate */
+               return err;
+
+       for (i = 0; superlist[i]; i++) { /* walk superlist[] up to its NULL entry */
+               struct superswitch *meta = superlist[i];
+
+               if (meta == ss) /* already handled above; do not report it twice */
+                       continue;
+               if (verbose)
+                       fprintf(stderr, Name ": checking metadata %s\n",
+                               meta->name ? : "[no name]");
+               if (!meta->detail_platform) {
+                       if (verbose)
+                               fprintf(stderr, Name ": %s metadata is platform independent\n",
+                                       meta->name ? : "[no name]");
+               } else
+                       err |= meta->detail_platform(verbose, 0); /* OR-ed in: NOTE(review) if err is still 1 here the scan can never return 0 - confirm intended */
+       }
+
+       return err;
+}
index 5de92028acd4c0bf971a859203bd363ca371ddae..f0e98f974d2e58885ebc238cdd796690bf4a9dee 100644 (file)
--- a/Examine.c
+++ b/Examine.c
@@ -1,7 +1,7 @@
 /*
  * mdadm - manage Linux "md" devices aka RAID arrays.
  *
- * Copyright (C) 2001-2006 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
  *
  *
  *    This program is free software; you can redistribute it and/or modify
  *    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  *
  *    Author: Neil Brown
- *    Email: <neilb@cse.unsw.edu.au>
- *    Paper: Neil Brown
- *           School of Computer Science and Engineering
- *           The University of New South Wales
- *           Sydney, 2052
- *           Australia
+ *    Email: <neilb@suse.de>
  */
 
 #include       "mdadm.h"
@@ -123,12 +118,13 @@ int Examine(mddev_dev_t devlist, int brief, int export, int scan,
                                st->ss->getinfo_super(st, &ap->info);
                                st->ss->free_super(st);
                        }
-                       if (!(ap->info.disk.state & MD_DISK_SYNC))
+                       if (!(ap->info.disk.state & (1<<MD_DISK_SYNC)))
                                ap->spares++;
                        d = dl_strdup(devlist->devname);
                        dl_add(ap->devs, d);
                } else if (export) {
-                       st->ss->export_examine_super(st);
+                       if (st->ss->export_examine_super)
+                               st->ss->export_examine_super(st);
                } else {
                        printf("%s:\n",devlist->devname);
                        st->ss->examine_super(st, homehost);
@@ -140,7 +136,7 @@ int Examine(mddev_dev_t devlist, int brief, int export, int scan,
                for (ap=arrays; ap; ap=ap->next) {
                        char sep='=';
                        char *d;
-                       ap->st->ss->brief_examine_super(ap->st);
+                       ap->st->ss->brief_examine_super(ap->st, brief > 1);
                        if (ap->spares) printf("   spares=%d", ap->spares);
                        if (brief > 1) {
                                printf("   devices");
diff --git a/Grow.c b/Grow.c
index a8194bf05b69e3e86b5eefcc88241bdb837ea398..18056047819ad4d6affe74b42b7b950151bc2913 100644 (file)
--- a/Grow.c
+++ b/Grow.c
@@ -1,7 +1,7 @@
 /*
  * mdadm - manage Linux "md" devices aka RAID arrays.
  *
- * Copyright (C) 2001-2006 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
  *
  *
  *    This program is free software; you can redistribute it and/or modify
  *    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  *
  *    Author: Neil Brown
- *    Email: <neilb@cse.unsw.edu.au>
- *    Paper: Neil Brown
- *           School of Computer Science and Engineering
- *           The University of New South Wales
- *           Sydney, 2052
- *           Australia
+ *    Email: <neilb@suse.de>
  */
 #include       "mdadm.h"
 #include       "dlink.h"
@@ -69,7 +64,7 @@ int Grow_Add_device(char *devname, int fd, char *newdev)
                return 1;
        }
 
-       nfd = open(newdev, O_RDWR|O_EXCL);
+       nfd = open(newdev, O_RDWR|O_EXCL|O_DIRECT);
        if (nfd < 0) {
                fprintf(stderr, Name ": cannot open %s\n", newdev);
                return 1;
@@ -396,7 +391,8 @@ struct mdp_backup_super {
        __u64   arraystart;
        __u64   length;
        __u32   sb_csum;        /* csum of preceeding bytes. */
-};
+       __u8 pad[512-68];
+} __attribute__((aligned(512))) bsb;
 
 int bsb_csum(char *buf, int len)
 {
@@ -420,7 +416,6 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
        struct mdu_array_info_s array;
        char *c;
 
-       struct mdp_backup_super bsb;
        struct supertype *st;
 
        int nlevel, olevel;
@@ -720,7 +715,8 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
                 * a leading superblock 4K earlier.
                 */
                for (i=array.raid_disks; i<d; i++) {
-                       char buf[4096];
+                       char abuf[4096+512];
+                       char *buf = (char*)(((unsigned long)abuf+511)& ~511);
                        if (i==d-1 && backup_file) {
                                /* This is the backup file */
                                offsets[i] = 8;
@@ -731,7 +727,7 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
                                fprintf(stderr, Name ": could not seek...\n");
                                goto abort;
                        }
-                       memset(buf, 0, sizeof(buf));
+                       memset(buf, 0, 4096);
                        bsb.devstart = __cpu_to_le64(offsets[i]);
                        bsb.sb_csum = bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum)-((char*)&bsb));
                        memcpy(buf, &bsb, sizeof(bsb));
@@ -793,7 +789,7 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
                        if (lseek64(fdlist[i], (offsets[i]+last_block)<<9, 0) < 0 ||
                            write(fdlist[i], &bsb, sizeof(bsb)) != sizeof(bsb) ||
                            fsync(fdlist[i]) != 0) {
-                               fprintf(stderr, Name ": %s: fail to save metadata for critical region backups.\n",
+                               fprintf(stderr, Name ": %s: failed to save metadata for critical region backups.\n",
                                        devname);
                                goto abort_resume;
                        }
@@ -808,12 +804,21 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
                /* wait for reshape to pass the critical region */
                while(1) {
                        unsigned long long comp;
+
                        if (sysfs_get_ll(sra, NULL, "sync_completed", &comp)<0) {
                                sleep(5);
                                break;
                        }
                        if (comp >= nstripe)
                                break;
+                       if (comp == 0) {
+                               /* Maybe it finished already */
+                               char action[20];
+                               if (sysfs_get_str(sra, NULL, "sync_action",
+                                                 action, 20) > 0 &&
+                                   strncmp(action, "reshape", 7) != 0)
+                                       break;
+                       }
                        sleep(1);
                }
 
@@ -882,7 +887,6 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt
 
        for (i=old_disks-(backup_file?1:0); i<cnt; i++) {
                struct mdinfo dinfo;
-               struct mdp_backup_super bsb;
                char buf[4096];
                int fd;
 
index 08e0e6f7f5ed3d1bd2847ca37669fc4867de9bff..8c686f7e8e6bde1d426d8731f3bdeca8a9d5c1d3 100644 (file)
@@ -2,7 +2,7 @@
  * Incremental.c - support --incremental.  Part of:
  * mdadm - manage Linux "md" devices aka RAID arrays.
  *
- * Copyright (C) 2006 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2006-2009 Neil Brown <neilb@suse.de>
  *
  *
  *    This program is free software; you can redistribute it and/or modify
@@ -37,7 +37,8 @@ static void find_reject(int mdfd, struct supertype *st, struct mdinfo *sra,
                        char *array_name);
 
 int Incremental(char *devname, int verbose, int runstop,
-               struct supertype *st, char *homehost, int autof)
+               struct supertype *st, char *homehost, int require_homehost,
+               int autof)
 {
        /* Add this device to an array, creating the array if necessary
         * and starting the array if sensible or - if runstop>0 - if possible.
@@ -48,7 +49,8 @@ int Incremental(char *devname, int verbose, int runstop,
         * 2/ Find metadata, reject if none appropriate (check
         *       version/name from args)
         * 3/ Check if there is a match in mdadm.conf
-        * 3a/ if not, check for homehost match.  If no match, reject.
+        * 3a/ if not, check for homehost match.  If no match, assemble as
+        *    a 'foreign' array.
         * 4/ Determine device number.
         * - If in mdadm.conf with std name, use that
         * - UUID in /var/run/mdadm.map  use that
@@ -56,6 +58,7 @@ int Incremental(char *devname, int verbose, int runstop,
         * - Choose a free, high number.
         * - Use a partitioned device unless strong suggestion not to.
         *         e.g. auto=md
+        *   Don't choose partitioned for containers.
         * 5/ Find out if array already exists
         * 5a/ if it does not
         * - choose a name, from mdadm.conf or 'name' field in array.
@@ -67,6 +70,7 @@ int Incremental(char *devname, int verbose, int runstop,
         * - add the device
         * 6/ Make sure /var/run/mdadm.map contains this array.
         * 7/ Is there enough devices to possibly start the array?
+        *     For a container, this means running Incremental_container.
         * 7a/ if not, finish with success.
         * 7b/ if yes,
         * - read all metadata and arrange devices like -A does
@@ -74,20 +78,22 @@ int Incremental(char *devname, int verbose, int runstop,
         *   start the array (auto-readonly).
         */
        struct stat stb;
-       struct mdinfo info, info2;
+       struct mdinfo info;
        struct mddev_ident_s *array_list, *match;
        char chosen_name[1024];
        int rv;
-       int devnum;
        struct map_ent *mp, *map = NULL;
        int dfd, mdfd;
        char *avail;
        int active_disks;
+       int trustworthy = FOREIGN;
+       char *name_to_use;
+       mdu_array_info_t ainf;
+
        struct createinfo *ci = conf_get_create_info();
-       char *name;
 
 
-       /* 1/ Check if devices is permitted by mdadm.conf */
+       /* 1/ Check if device is permitted by mdadm.conf */
 
        if (!conf_test_dev(devname)) {
                if (verbose >= 0)
@@ -137,9 +143,10 @@ int Incremental(char *devname, int verbose, int runstop,
                close(dfd);
                return 1;
        }
-       st->ss->getinfo_super(st, &info);
        close (dfd);
 
+       memset(&info, 0, sizeof(info));
+       st->ss->getinfo_super(st, &info);
        /* 3/ Check if there is a match in mdadm.conf */
 
        array_list = conf_get_ident(NULL);
@@ -148,7 +155,7 @@ int Incremental(char *devname, int verbose, int runstop,
                if (array_list->uuid_set &&
                    same_uuid(array_list->uuid, info.uuid, st->ss->swapuuid)
                    == 0) {
-                       if (verbose >= 2)
+                       if (verbose >= 2 && array_list->devname)
                                fprintf(stderr, Name
                                        ": UUID differs from %s.\n",
                                        array_list->devname);
@@ -156,7 +163,7 @@ int Incremental(char *devname, int verbose, int runstop,
                }
                if (array_list->name[0] &&
                    strcasecmp(array_list->name, info.name) != 0) {
-                       if (verbose >= 2)
+                       if (verbose >= 2 && array_list->devname)
                                fprintf(stderr, Name
                                        ": Name differs from %s.\n",
                                        array_list->devname);
@@ -164,7 +171,7 @@ int Incremental(char *devname, int verbose, int runstop,
                }
                if (array_list->devices &&
                    !match_oneof(array_list->devices, devname)) {
-                       if (verbose >= 2)
+                       if (verbose >= 2 && array_list->devname)
                                fprintf(stderr, Name
                                        ": Not a listed device for %s.\n",
                                        array_list->devname);
@@ -172,7 +179,7 @@ int Incremental(char *devname, int verbose, int runstop,
                }
                if (array_list->super_minor != UnSet &&
                    array_list->super_minor != info.array.md_minor) {
-                       if (verbose >= 2)
+                       if (verbose >= 2 && array_list->devname)
                                fprintf(stderr, Name
                                        ": Different super-minor to %s.\n",
                                        array_list->devname);
@@ -182,7 +189,7 @@ int Incremental(char *devname, int verbose, int runstop,
                    !array_list->name[0] &&
                    !array_list->devices &&
                    array_list->super_minor == UnSet) {
-                       if (verbose  >= 2)
+                       if (verbose >= 2 && array_list->devname)
                                fprintf(stderr, Name
                             ": %s doesn't have any identifying information.\n",
                                        array_list->devname);
@@ -191,37 +198,50 @@ int Incremental(char *devname, int verbose, int runstop,
                /* FIXME, should I check raid_disks and level too?? */
 
                if (match) {
-                       if (verbose >= 0)
-                               fprintf(stderr, Name
+                       if (verbose >= 0) {
+                               if (match->devname && array_list->devname)
+                                       fprintf(stderr, Name
                   ": we match both %s and %s - cannot decide which to use.\n",
-                                       match->devname, array_list->devname);
+                                               match->devname, array_list->devname);
+                               else
+                                       fprintf(stderr, Name
+                                               ": multiple lines in mdadm.conf match\n");
+                       }
                        return 2;
                }
                match = array_list;
        }
 
+       if (match && match->devname
+           && strcasecmp(match->devname, "<ignore>") == 0) {
+               if (verbose >= 0)
+                       fprintf(stderr, Name ": array containing %s is explicitly"
+                               " ignored by mdadm.conf\n",
+                               devname);
+               return 1;
+       }
+
+       if (!match && !conf_test_metadata(st->ss->name)) {
+               if (verbose >= 1)
+                       fprintf(stderr, Name
+                               ": %s has metadata type %s for which "
+                               "auto-assembly is disabled\n",
+                               devname, st->ss->name);
+               return 1;
+       }
+
        /* 3a/ if not, check for homehost match.  If no match, continue
         * but don't trust the 'name' in the array. Thus a 'random' minor
         * number will be assigned, and the device name will be based
         * on that. */
-       name = info.name;
-       if (!match) {
-               if (homehost == NULL ||
-                   st->ss->match_home(st, homehost) == 0) {
-                       if (verbose >= 0)
-                               fprintf(stderr, Name
-             ": not found in mdadm.conf and not identified by homehost.\n");
-                       name = NULL;
-               }
-       }
-       /* 4/ Determine device number. */
-       /* - If in mdadm.conf with std name, get number from name. */
-       /* - UUID in /var/run/mdadm.map  get number from mapping */
-       /* - If name is suggestive, use that. unless in use with */
-       /*           different uuid. */
-       /* - Choose a free, high number. */
-       /* - Use a partitioned device unless strong suggestion not to. */
-       /*         e.g. auto=md */
+       if (match)
+               trustworthy = LOCAL;
+       else if ((homehost == NULL ||
+                 st->ss->match_home(st, homehost) != 1) &&
+                st->ss->match_home(st, "any") != 1)
+               trustworthy = FOREIGN;
+       else
+               trustworthy = LOCAL;
 
        /* There are three possible sources for 'autof':  command line,
         * ARRAY line in mdadm.conf, or CREATE line in mdadm.conf.
@@ -233,86 +253,72 @@ int Incremental(char *devname, int verbose, int runstop,
        if (autof == 0)
                autof = ci->autof;
 
-       if (match && (rv = is_standard(match->devname, &devnum))) {
-               devnum = (rv > 0) ? (-1-devnum) : devnum;
-       } else if ((mp = map_by_uuid(&map, info.uuid)) != NULL)
-               devnum = mp->devnum;
-       else {
-               /* Have to guess a bit. */
-               int use_partitions = 1;
-               char *np, *ep;
-               if ((autof&7) == 3 || (autof&7) == 5)
-                       use_partitions = 0;
-               np = name ? strchr(name, ':') : ":NONAME";
-               if (np)
-                       np++;
-               else
-                       np = name;
-               devnum = strtoul(np, &ep, 10);
-               if (ep > np && *ep == 0) {
-                       /* This is a number.  Let check that it is unused. */
-                       if (mddev_busy(use_partitions ? (-1-devnum) : devnum))
-                               devnum = -1;
-               } else
-                       devnum = -1;
-
-               if (devnum < 0) {
-                       /* Haven't found anything yet, choose something free */
-                       devnum = find_free_devnum(use_partitions);
-
-                       if (devnum == NoMdDev) {
-                               fprintf(stderr, Name
-                                       ": No spare md devices!!\n");
-                               return 2;
-                       }
-               } else
-                       devnum = use_partitions ? (-1-devnum) : devnum;
+       if (st->ss->container_content && st->loaded_container) {
+               /* This is a pre-built container array, so we do something
+                * rather different.
+                */
+               return Incremental_container(st, devname, verbose, runstop,
+                                            autof, trustworthy);
        }
-       mdfd = open_mddev_devnum(match ? match->devname : NULL,
-                                devnum,
-                                name,
-                                chosen_name, autof >> 3);
-       if (mdfd < 0) {
-               fprintf(stderr, Name ": failed to open %s: %s.\n",
-                       chosen_name, strerror(errno));
-               return 2;
+
+       name_to_use = info.name;
+       if (name_to_use[0] == 0 &&
+           info.array.level == LEVEL_CONTAINER &&
+           trustworthy == LOCAL) {
+               name_to_use = info.text_version;
+               trustworthy = METADATA;
        }
-       /* 5/ Find out if array already exists */
-       if (! mddev_busy(devnum)) {
-       /* 5a/ if it does not */
-       /* - choose a name, from mdadm.conf or 'name' field in array. */
-       /* - create the array */
-       /* - add the device */
-               mdu_array_info_t ainf;
-               mdu_disk_info_t disk;
-               char md[20];
+       if (name_to_use[0] && trustworthy != LOCAL &&
+           ! require_homehost &&
+           conf_name_is_free(name_to_use))
+               trustworthy = LOCAL;
+
+       /* strip "hostname:" prefix from name if we have decided
+        * to treat it as LOCAL
+        */
+       if (trustworthy == LOCAL && strchr(name_to_use, ':') != NULL)
+               name_to_use = strchr(name_to_use, ':')+1;
+
+       /* 4/ Check if array exists.
+        */
+       map_lock(&map);
+       mp = map_by_uuid(&map, info.uuid);
+       if (mp)
+               mdfd = open_dev(mp->devnum);
+       else
+               mdfd = -1;
+
+       if (mdfd < 0) {
                struct mdinfo *sra;
+               struct mdinfo dinfo;
 
-               memset(&ainf, 0, sizeof(ainf));
-               ainf.major_version = st->ss->major;
-               ainf.minor_version = st->minor_version;
-               if (ioctl(mdfd, SET_ARRAY_INFO, &ainf) != 0) {
-                       fprintf(stderr, Name
-                               ": SET_ARRAY_INFO failed for %s: %s\b",
+               /* Couldn't find an existing array, maybe make a new one */
+               mdfd = create_mddev(match ? match->devname : NULL,
+                                   name_to_use, autof, trustworthy, chosen_name);
+
+               if (mdfd < 0)
+                       return 1;
+
+               sysfs_init(&info, mdfd, 0);
+
+               if (set_array_info(mdfd, st, &info) != 0) {
+                       fprintf(stderr, Name ": failed to set array info for %s: %s\n",
                                chosen_name, strerror(errno));
                        close(mdfd);
                        return 2;
                }
-               sprintf(md, "%d.%d\n", st->ss->major, st->minor_version);
-               sra = sysfs_read(mdfd, devnum, GET_VERSION);
-               sysfs_set_str(sra, NULL, "metadata_version", md);
-               memset(&disk, 0, sizeof(disk));
-               disk.major = major(stb.st_rdev);
-               disk.minor = minor(stb.st_rdev);
-               sysfs_free(sra);
-               if (ioctl(mdfd, ADD_NEW_DISK, &disk) != 0) {
+
+               dinfo = info;
+               dinfo.disk.major = major(stb.st_rdev);
+               dinfo.disk.minor = minor(stb.st_rdev);
+               if (add_disk(mdfd, st, &info, &dinfo) != 0) {
                        fprintf(stderr, Name ": failed to add %s to %s: %s.\n",
                                devname, chosen_name, strerror(errno));
                        ioctl(mdfd, STOP_ARRAY, 0);
                        close(mdfd);
                        return 2;
                }
-               sra = sysfs_read(mdfd, devnum, GET_DEVS);
+               sra = sysfs_read(mdfd, fd2devnum(mdfd), GET_DEVS);
                if (!sra || !sra->devs || sra->devs->disk.raid_disk >= 0) {
                        /* It really should be 'none' - must be old buggy
                         * kernel, and mdadm -I may not be able to complete.
@@ -326,6 +332,12 @@ int Incremental(char *devname, int verbose, int runstop,
                        sysfs_free(sra);
                        return 2;
                }
+               info.array.working_disks = 1;
+               sysfs_free(sra);
+               /* 6/ Make sure /var/run/mdadm.map contains this array. */
+               map_update(&map, fd2devnum(mdfd),
+                          info.text_version,
+                          info.uuid, chosen_name);
        } else {
        /* 5b/ if it does */
        /* - check one drive in array to make sure metadata is a reasonably */
@@ -333,60 +345,60 @@ int Incremental(char *devname, int verbose, int runstop,
        /* - add the device */
                char dn[20];
                int dfd2;
-               mdu_disk_info_t disk;
                int err;
                struct mdinfo *sra;
                struct supertype *st2;
-               sra = sysfs_read(mdfd, devnum, (GET_VERSION | GET_DEVS |
-                                               GET_STATE));
+               struct mdinfo info2, *d;
 
-               if (sra->array.major_version != st->ss->major ||
-                   sra->array.minor_version != st->minor_version) {
-                       if (verbose >= 0)
+               if (mp->path)
+                       strcpy(chosen_name, mp->path);
+               else
+                       strcpy(chosen_name, devnum2devname(mp->devnum));
+
+               sra = sysfs_read(mdfd, fd2devnum(mdfd), (GET_DEVS | GET_STATE));
+
+               if (sra->devs) {
+                       sprintf(dn, "%d:%d", sra->devs->disk.major,
+                               sra->devs->disk.minor);
+                       dfd2 = dev_open(dn, O_RDONLY);
+                       st2 = dup_super(st);
+                       if (st2->ss->load_super(st2, dfd2, NULL) ||
+                           st->ss->compare_super(st, st2) != 0) {
                                fprintf(stderr, Name
-             ": %s has different metadata to chosen array %s %d.%d %d.%d.\n",
-                                       devname, chosen_name,
-                                       sra->array.major_version,
-                                       sra->array.minor_version,
-                                       st->ss->major, st->minor_version);
-                       close(mdfd);
-                       return 1;
-               }
-               sprintf(dn, "%d:%d", sra->devs->disk.major,
-                       sra->devs->disk.minor);
-               dfd2 = dev_open(dn, O_RDONLY);
-               st2 = dup_super(st);
-               if (st2->ss->load_super(st2, dfd2, NULL)) {
-                       fprintf(stderr, Name
-                               ": Strange error loading metadata for %s.\n",
-                               chosen_name);
-                       close(mdfd);
+                                       ": metadata mismatch between %s and "
+                                       "chosen array %s\n",
+                                       devname, chosen_name);
+                               close(mdfd);
+                               close(dfd2);
+                               return 2;
+                       }
                        close(dfd2);
-                       return 2;
-               }
-               close(dfd2);
-               st2->ss->getinfo_super(st2, &info2);
-               st2->ss->free_super(st2);
-               if (info.array.level != info2.array.level ||
-                   memcmp(info.uuid, info2.uuid, 16) != 0 ||
-                   info.array.raid_disks != info2.array.raid_disks) {
-                       fprintf(stderr, Name
-                               ": unexpected difference between %s and %s.\n",
-                               chosen_name, devname);
-                       close(mdfd);
-                       return 2;
+                       memset(&info2, 0, sizeof(info2));
+                       st2->ss->getinfo_super(st2, &info2);
+                       st2->ss->free_super(st2);
+                       if (info.array.level != info2.array.level ||
+                           memcmp(info.uuid, info2.uuid, 16) != 0 ||
+                           info.array.raid_disks != info2.array.raid_disks) {
+                               fprintf(stderr, Name
+                                       ": unexpected difference between %s and %s.\n",
+                                       chosen_name, devname);
+                               close(mdfd);
+                               return 2;
+                       }
                }
-               memset(&disk, 0, sizeof(disk));
-               disk.major = major(stb.st_rdev);
-               disk.minor = minor(stb.st_rdev);
-               err = ioctl(mdfd, ADD_NEW_DISK, &disk);
+               info2.disk.major = major(stb.st_rdev);
+               info2.disk.minor = minor(stb.st_rdev);
+               /* add disk needs to know about containers */
+               if (st->ss->external)
+                       sra->array.level = LEVEL_CONTAINER;
+               err = add_disk(mdfd, st, sra, &info2);
                if (err < 0 && errno == EBUSY) {
                        /* could be another device present with the same
                         * disk.number. Find and reject any such
                         */
                        find_reject(mdfd, st, sra, info.disk.number,
                                    info.events, verbose, chosen_name);
-                       err = ioctl(mdfd, ADD_NEW_DISK, &disk);
+                       err = add_disk(mdfd, st, sra, &info2);
                }
                if (err < 0) {
                        fprintf(stderr, Name ": failed to add %s to %s: %s.\n",
@@ -394,25 +406,47 @@ int Incremental(char *devname, int verbose, int runstop,
                        close(mdfd);
                        return 2;
                }
+               info.array.working_disks = 0;
+               for (d = sra->devs; d; d=d->next)
+                       info.array.working_disks ++;
+                       
        }
-       /* 6/ Make sure /var/run/mdadm.map contains this array. */
-       map_update(&map, devnum,
-                  info.array.major_version,
-                  info.array.minor_version,
-                  info.uuid, chosen_name);
 
        /* 7/ Is there enough devices to possibly start the array? */
        /* 7a/ if not, finish with success. */
+       if (info.array.level == LEVEL_CONTAINER) {
+               /* Try to assemble within the container */
+               map_unlock(&map);
+               sysfs_uevent(&info, "change");
+               if (verbose >= 0)
+                       fprintf(stderr, Name
+                               ": container %s now has %d devices\n",
+                               chosen_name, info.array.working_disks);
+               wait_for(chosen_name, mdfd);
+               close(mdfd);
+               if (runstop < 0)
+                       return 0; /* don't try to assemble */
+               rv = Incremental(chosen_name, verbose, runstop,
+                                NULL, homehost, require_homehost, autof);
+               if (rv == 1)
+                       /* Don't fail the whole -I if a subarray didn't
+                        * have enough devices to start yet
+                        */
+                       rv = 0;
+               return rv;
+       }
        avail = NULL;
        active_disks = count_active(st, mdfd, &avail, &info);
        if (enough(info.array.level, info.array.raid_disks,
                   info.array.layout, info.array.state & 1,
-                  avail, active_disks) == 0) {
+                  avail, active_disks) == 0 ||
+           (runstop < 0 && active_disks < info.array.raid_disks)) {
                free(avail);
                if (verbose >= 0)
                        fprintf(stderr, Name
                             ": %s attached to %s, not enough to start (%d).\n",
                                devname, chosen_name, active_disks);
+               map_unlock(&map);
                close(mdfd);
                return 0;
        }
@@ -423,18 +457,18 @@ int Incremental(char *devname, int verbose, int runstop,
        /*             are enough, */
        /*   + add any bitmap file  */
        /*   + start the array (auto-readonly). */
-{
-       mdu_array_info_t ainf;
 
        if (ioctl(mdfd, GET_ARRAY_INFO, &ainf) == 0) {
                if (verbose >= 0)
                        fprintf(stderr, Name
                           ": %s attached to %s which is already active.\n",
                                devname, chosen_name);
-               close (mdfd);
+               close(mdfd);
+               map_unlock(&map);
                return 0;
        }
-}
+
+       map_unlock(&map);
        if (runstop > 0 || active_disks >= info.array.working_disks) {
                struct mdinfo *sra;
                /* Let's try to start it */
@@ -457,9 +491,9 @@ int Incremental(char *devname, int verbose, int runstop,
                        }
                        close(bmfd);
                }
-               sra = sysfs_read(mdfd, devnum, 0);
+               sra = sysfs_read(mdfd, fd2devnum(mdfd), 0);
                if ((sra == NULL || active_disks >= info.array.working_disks)
-                   && name != NULL)
+                   && trustworthy != FOREIGN)
                        rv = ioctl(mdfd, RUN_ARRAY, NULL);
                else
                        rv = sysfs_set_str(sra, NULL,
@@ -470,6 +504,7 @@ int Incremental(char *devname, int verbose, int runstop,
                           ": %s attached to %s, which has been started.\n",
                                        devname, chosen_name);
                        rv = 0;
+                       wait_for(chosen_name, mdfd);
                } else {
                        fprintf(stderr, Name
                              ": %s attached to %s, but failed to start: %s.\n",
@@ -620,12 +655,11 @@ int IncrementalScan(int verbose)
        devs = conf_get_ident(NULL);
 
        for (me = mapl ; me ; me = me->next) {
-               char path[1024];
                mdu_array_info_t array;
                mdu_bitmap_file_t bmf;
                struct mdinfo *sra;
-               int mdfd = open_mddev_devnum(me->path, me->devnum,
-                                            NULL, path, 0);
+               int mdfd = open_dev(me->devnum);
+
                if (mdfd < 0)
                        continue;
                if (ioctl(mdfd, GET_ARRAY_INFO, &array) == 0 ||
@@ -635,7 +669,8 @@ int IncrementalScan(int verbose)
                }
                /* Ok, we can try this one.   Maybe it needs a bitmap */
                for (mddev = devs ; mddev ; mddev = mddev->next)
-                       if (strcmp(mddev->devname, me->path) == 0)
+                       if (mddev->devname && me->path
+                           && devname_matches(mddev->devname, me->path))
                                break;
                if (mddev && mddev->bitmap_file) {
                        /*
@@ -669,14 +704,138 @@ int IncrementalScan(int verbose)
                                if (verbose >= 0)
                                        fprintf(stderr, Name
                                                ": started array %s\n",
-                                               me->path);
+                                               me->path ?: devnum2devname(me->devnum));
                        } else {
                                fprintf(stderr, Name
                                        ": failed to start array %s: %s\n",
-                                       me->path, strerror(errno));
+                                       me->path ?: devnum2devname(me->devnum),
+                                       strerror(errno));
                                rv = 1;
                        }
                }
        }
        return rv;
 }
+
+static char *container2devname(char *devname)
+{      /* Resolve a container named by path or by UUID string to an md device name, or NULL. */
+       char *mdname = NULL;
+
+       if (devname[0] == '/') {        /* a path: open it and convert the fd via fd2devnum() */
+               int fd = open(devname, O_RDONLY);
+               if (fd >= 0) {
+                       mdname = devnum2devname(fd2devnum(fd));
+                       close(fd);
+               }
+       } else {        /* anything else is tried as a UUID and looked up in the map */
+               int uuid[4];
+               struct map_ent *mp, *map = NULL;
+               /* an unparsable UUID leaves mdname NULL */
+               if (!parse_uuid(devname, uuid))
+                       return mdname;
+               mp = map_by_uuid(&map, uuid);
+               if (mp)
+                       mdname = devnum2devname(mp->devnum);
+               map_free(map);
+       }
+
+       return mdname;  /* the caller in Incremental_container free()s this */
+}
+
+int Incremental_container(struct supertype *st, char *devname, int verbose,
+                         int runstop, int autof, int trustworthy)
+{
+       /* Collect the contents of this container and for each
+        * array, choose a device name and assemble the array.
+        * Returns 0, or 2 if a member is <ignore>d or cannot be opened.
+        */
+       struct mdinfo *list = st->ss->container_content(st);
+       struct mdinfo *ra;
+       struct map_ent *map = NULL;
+
+       map_lock(&map);
+
+       for (ra = list ; ra ; ra = ra->next) {
+               int mdfd;
+               char chosen_name[1024];
+               struct map_ent *mp;
+               struct mddev_ident_s *match = NULL;
+
+               mp = map_by_uuid(&map, ra->uuid);
+
+               if (mp) {
+                       mdfd = open_dev(mp->devnum);
+                       if (mp->path)
+                               strcpy(chosen_name, mp->path);
+                       else
+                               strcpy(chosen_name, devnum2devname(mp->devnum));
+               } else {
+
+                       /* Check in mdadm.conf for container == devname and
+                        * member == ra->text_version after second slash.
+                        */
+                       char *sub = strchr(ra->text_version+1, '/');
+                       struct mddev_ident_s *array_list;
+                       if (sub) {
+                               sub++;
+                               array_list = conf_get_ident(NULL);
+                       } else
+                               array_list = NULL;
+                       for(; array_list ; array_list = array_list->next) {
+                               char *dn;
+                               if (array_list->member == NULL ||
+                                   array_list->container == NULL)
+                                       continue;
+                               if (strcmp(array_list->member, sub) != 0)
+                                       continue;
+                               if (array_list->uuid_set &&
+                                   !same_uuid(ra->uuid, array_list->uuid, st->ss->swapuuid))
+                                       continue;
+                               dn = container2devname(array_list->container);
+                               if (dn == NULL)
+                                       continue;
+                               if (strncmp(dn, ra->text_version+1,
+                                           strlen(dn)) != 0 ||
+                                   ra->text_version[strlen(dn)+1] != '/') {
+                                       free(dn);
+                                       continue;
+                               }
+                               free(dn);
+                               /* we have a match */
+                               match = array_list;
+                               if (verbose>0)
+                                       fprintf(stderr, Name ": match found for member %s\n",
+                                               array_list->member);
+                               break;
+                       }
+
+                       if (match && match->devname &&
+                           strcasecmp(match->devname, "<ignore>") == 0) {
+                               if (verbose > 0)
+                                       fprintf(stderr, Name ": array %s/%s is "
+                                               "explicitly ignored by mdadm.conf\n",
+                                               match->container, match->member);
+                               map_unlock(&map); /* don't leave with the map locked */
+                               return 2;
+                       }
+                       if (match)
+                               trustworthy = LOCAL;
+
+                       mdfd = create_mddev(match ? match->devname : NULL,
+                                           ra->name, autof, trustworthy,
+                                           chosen_name);
+               }
+
+               if (mdfd < 0) {
+                       fprintf(stderr, Name ": failed to open %s: %s.\n",
+                               chosen_name, strerror(errno));
+                       map_unlock(&map); /* after the message, so errno is not disturbed */
+                       return 2;
+               }
+
+               assemble_container_content(st, mdfd, ra, runstop,
+                                          chosen_name, verbose);
+       }
+       map_unlock(&map);
+       return 0;
+}
diff --git a/Kill.c b/Kill.c
index b1e19b56dd8bbdb57d7b4ac9268c8656fff5d0d5..f5c582180ac28d7a8593657f42b02d481aa352be 100644 (file)
--- a/Kill.c
+++ b/Kill.c
@@ -1,7 +1,7 @@
 /*
  * mdadm - manage Linux "md" devices aka RAID arrays.
  *
- * Copyright (C) 2001-2006 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
  *
  *
  *    This program is free software; you can redistribute it and/or modify
  *    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  *
  *    Author: Neil Brown
- *    Email: <neilb@cse.unsw.edu.au>
- *    Paper: Neil Brown
- *           School of Computer Science and Engineering
- *           The University of New South Wales
- *           Sydney, 2052
- *           Australia
+ *    Email: <neilb@suse.de>
  *
  *    Added by Dale Stephenson
  *    steph@snapserver.com
@@ -34,7 +29,7 @@
 #include       "md_u.h"
 #include       "md_p.h"
 
-int Kill(char *dev, int force, int quiet)
+int Kill(char *dev, int force, int quiet, int noexcl)
 {
        /*
         * Nothing fancy about Kill.  It just zeroes out a superblock
@@ -44,6 +39,8 @@ int Kill(char *dev, int force, int quiet)
        int fd, rv = 0;
        struct supertype *st;
 
+       if (force)
+               noexcl = 1;
        fd = open(dev, O_RDWR|(force ? 0 : O_EXCL));
        if (fd < 0) {
                if (!quiet)
@@ -63,10 +60,8 @@ int Kill(char *dev, int force, int quiet)
        if (force && rv >= 2)
                rv = 0; /* ignore bad data in superblock */
        if (rv== 0 || (force && rv >= 2)) {
-               mdu_array_info_t info;
-               info.major_version = -1; /* zero superblock */
                st->ss->free_super(st);
-               st->ss->init_super(st, &info, 0, "", NULL, NULL);
+               st->ss->init_super(st, NULL, 0, "", NULL, NULL);
                if (st->ss->store_super(st, fd)) {
                        if (!quiet)
                                fprintf(stderr, Name ": Could not zero superblock on %s\n",
index b89cd6f755da5d7b19c1f5a46ad3a7a2cb936d5b..56363926031af63254271289b3883fab604aafa4 100644 (file)
--- a/Makefile
+++ b/Makefile
 # e.g.  make CXFLAGS=-O to optimise
 TCC = tcc
 UCLIBC_GCC = $(shell for nm in i386-uclibc-linux-gcc i386-uclibc-gcc; do which $$nm > /dev/null && { echo $$nm ; exit; } ; done; echo false No uclibc found )
-DIET_GCC = diet gcc
+#DIET_GCC = diet gcc
+# sorry, but diet-libc doesn't know about posix_memalign, 
+# so we cannot use it any more.
+DIET_GCC = gcc -DHAVE_STDINT_H
 
 KLIBC=/home/src/klibc/klibc-0.77
 
@@ -40,6 +43,9 @@ KLIBC_GCC = gcc -nostdinc -iwithprefix include -I$(KLIBC)/klibc/include -I$(KLIB
 CC = $(CROSS_COMPILE)gcc
 CXFLAGS = -ggdb
 CWFLAGS = -Wall -Werror -Wstrict-prototypes
+ifdef WARN_UNUSED
+CWFLAGS += -Wp,-D_FORTIFY_SOURCE=2 -O
+endif
 
 ifdef DEBIAN
 CPPFLAGS= -DDEBIAN
@@ -69,27 +75,37 @@ MAN8DIR = $(MANDIR)/man8
 OBJS =  mdadm.o config.o mdstat.o  ReadMe.o util.o Manage.o Assemble.o Build.o \
        Create.o Detail.o Examine.o Grow.o Monitor.o dlink.o Kill.o Query.o \
        Incremental.o \
-       mdopen.o super0.o super1.o bitmap.o restripe.o sysfs.o sha1.o \
-       mapfile.o
+       mdopen.o super0.o super1.o super-ddf.o super-intel.o bitmap.o \
+       restripe.o sysfs.o sha1.o mapfile.o crc32.o sg_io.o msg.o \
+       platform-intel.o probe_roms.o
+
 SRCS =  mdadm.c config.c mdstat.c  ReadMe.c util.c Manage.c Assemble.c Build.c \
        Create.c Detail.c Examine.c Grow.c Monitor.c dlink.c Kill.c Query.c \
        Incremental.c \
-       mdopen.c super0.c super1.c bitmap.c restripe.c sysfs.c sha1.c \
-       mapfile.c
+       mdopen.c super0.c super1.c super-ddf.c super-intel.c bitmap.c \
+       restripe.c sysfs.c sha1.c mapfile.c crc32.c sg_io.c msg.c \
+       platform-intel.c probe_roms.c
+
+MON_OBJS = mdmon.o monitor.o managemon.o util.o mdstat.o sysfs.o config.o \
+       Kill.o sg_io.o dlink.o ReadMe.o super0.o super1.o super-intel.o \
+       super-ddf.o sha1.o crc32.o msg.o bitmap.o \
+       platform-intel.o probe_roms.o
+
 
 STATICSRC = pwgr.c
 STATICOBJS = pwgr.o
 
 ASSEMBLE_SRCS := mdassemble.c Assemble.c Manage.c config.c dlink.c util.c \
-       super0.c super1.c sha1.c sysfs.c
-ASSEMBLE_AUTO_SRCS := mdopen.c mdstat.c
+       super0.c super1.c super-ddf.c super-intel.c sha1.c crc32.c sg_io.c mdstat.c \
+       platform-intel.c probe_roms.c sysfs.c
+ASSEMBLE_AUTO_SRCS := mdopen.c
 ASSEMBLE_FLAGS:= $(CFLAGS) -DMDASSEMBLE
 ifdef MDASSEMBLE_AUTO
 ASSEMBLE_SRCS += $(ASSEMBLE_AUTO_SRCS)
 ASSEMBLE_FLAGS += -DMDASSEMBLE_AUTO
 endif
 
-all : mdadm mdadm.man md.man mdadm.conf.man
+all : mdadm mdmon mdadm.man md.man mdadm.conf.man mdmon.man
 
 everything: all mdadm.static swap_super test_stripe \
        mdassemble mdassemble.auto mdassemble.static mdassemble.man \
@@ -119,6 +135,10 @@ mdadm.Os : $(SRCS) mdadm.h
 mdadm.O2 : $(SRCS) mdadm.h
        $(CC) -o mdadm.O2 $(CFLAGS)  -DHAVE_STDINT_H -O2 $(SRCS)
 
+mdmon : $(MON_OBJS)
+       $(CC) $(LDFLAGS) -o mdmon $(MON_OBJS) $(LDLIBS)
+msg.o: msg.c msg.h
+
 test_stripe : restripe.c mdadm.h
        $(CC) $(CXFLAGS) $(LDFLAGS) -o test_stripe -DMAIN restripe.c
 
@@ -147,6 +167,9 @@ mdassemble.klibc : $(ASSEMBLE_SRCS) mdadm.h
 mdadm.man : mdadm.8
        nroff -man mdadm.8 > mdadm.man
 
+mdmon.man : mdmon.8
+       nroff -man mdmon.8 > mdmon.man
+
 md.man : md.4
        nroff -man md.4 > md.man
 
@@ -156,13 +179,15 @@ mdadm.conf.man : mdadm.conf.5
 mdassemble.man : mdassemble.8
        nroff -man mdassemble.8 > mdassemble.man
 
-$(OBJS) : mdadm.h bitmap.h
+$(OBJS) : mdadm.h mdmon.h bitmap.h
+$(MON_OBJS) : mdadm.h mdmon.h bitmap.h
 
 sha1.o : sha1.c sha1.h md5.h
        $(CC) $(CFLAGS) -DHAVE_STDINT_H -o sha1.o -c sha1.c
 
-install : mdadm install-man
+install : mdadm mdmon install-man install-udev
        $(INSTALL) -D $(STRIP) -m 755 mdadm $(DESTDIR)$(BINDIR)/mdadm
+       $(INSTALL) -D $(STRIP) -m 755 mdmon $(DESTDIR)$(BINDIR)/mdmon
 
 install-static : mdadm.static install-man
        $(INSTALL) -D $(STRIP) -m 755 mdadm.static $(DESTDIR)$(BINDIR)/mdadm
@@ -176,19 +201,24 @@ install-uclibc : mdadm.uclibc install-man
 install-klibc : mdadm.klibc install-man
        $(INSTALL) -D $(STRIP) -m 755 mdadm.klibc $(DESTDIR)$(BINDIR)/mdadm
 
-install-man: mdadm.8 md.4 mdadm.conf.5
+install-man: mdadm.8 md.4 mdadm.conf.5 mdmon.8
        $(INSTALL) -D -m 644 mdadm.8 $(DESTDIR)$(MAN8DIR)/mdadm.8
+       $(INSTALL) -D -m 644 mdmon.8 $(DESTDIR)$(MAN8DIR)/mdmon.8
        $(INSTALL) -D -m 644 md.4 $(DESTDIR)$(MAN4DIR)/md.4
        $(INSTALL) -D -m 644 mdadm.conf.5 $(DESTDIR)$(MAN5DIR)/mdadm.conf.5
 
+install-udev: udev-md-raid.rules
+       $(INSTALL) -D -m 644 udev-md-raid.rules $(DESTDIR)/lib/udev/rules.d/64-md-raid.rules
+
 uninstall:
-       rm -f $(DESTDIR)$(MAN8DIR)/mdadm.8 md.4 $(DESTDIR)$(MAN4DIR)/md.4 $(DESTDIR)$(MAN5DIR)/mdadm.conf.5 $(DESTDIR)$(BINDIR)/mdadm
+       rm -f $(DESTDIR)$(MAN8DIR)/mdadm.8 $(DESTDIR)$(MAN8DIR)/mdmon.8 $(DESTDIR)$(MAN4DIR)/md.4 $(DESTDIR)$(MAN5DIR)/mdadm.conf.5 $(DESTDIR)$(BINDIR)/mdadm
 
-test: mdadm test_stripe swap_super
+test: mdadm mdmon test_stripe swap_super
        @echo "Please run 'sh ./test' as root"
 
 clean : 
-       rm -f mdadm $(OBJS) $(STATICOBJS) core *.man mdadm.tcc mdadm.uclibc mdadm.static *.orig *.porig *.rej *.alt \
+       rm -f mdadm mdmon $(OBJS) $(MON_OBJS) $(STATICOBJS) core *.man \
+       mdadm.tcc mdadm.uclibc mdadm.static *.orig *.porig *.rej *.alt \
        mdadm.Os mdadm.O2 \
        mdassemble mdassemble.static mdassemble.auto mdassemble.uclibc \
        mdassemble.klibc swap_super \
index fa4bb60fc4c1df662f75b8799f5abf3cd0418423..3aa09bcba241caba52b989abd6804de9e65a056e 100644 (file)
--- a/Manage.c
+++ b/Manage.c
@@ -1,7 +1,7 @@
 /*
  * mdadm - manage Linux "md" devices aka RAID arrays.
  *
- * Copyright (C) 2001-2006 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
  *
  *
  *    This program is free software; you can redistribute it and/or modify
  *    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  *
  *    Author: Neil Brown
- *    Email: <neilb@cse.unsw.edu.au>
- *    Paper: Neil Brown
- *           School of Computer Science and Engineering
- *           The University of New South Wales
- *           Sydney, 2052
- *           Australia
+ *    Email: <neilb@suse.de>
  */
 
 #include "mdadm.h"
 #include "md_u.h"
 #include "md_p.h"
+#include <ctype.h>
 
 #define REGISTER_DEV           _IO (MD_MAJOR, 1)
 #define START_MD               _IO (MD_MAJOR, 2)
@@ -45,11 +41,57 @@ int Manage_ro(char *devname, int fd, int readonly)
         *
         */
        mdu_array_info_t array;
+#ifndef MDASSEMBLE
+       struct mdinfo *mdi;
+#endif
 
        if (md_get_version(fd) < 9000) {
                fprintf(stderr, Name ": need md driver version 0.90.0 or later\n");
                return 1;
        }
+#ifndef MDASSEMBLE
+       /* If this is an externally-managed array, we need to modify the
+        * metadata_version so that mdmon doesn't undo our change.
+        */
+       mdi = sysfs_read(fd, -1, GET_LEVEL|GET_VERSION);
+       if (mdi &&
+           mdi->array.major_version == -1 &&
+           mdi->array.level > 0 &&
+           is_subarray(mdi->text_version)) {
+               char vers[64];
+               strcpy(vers, "external:");
+               strcat(vers, mdi->text_version);
+               if (readonly > 0) {
+                       int rv;
+                       /* We set readonly ourselves. */
+                       vers[9] = '-';
+                       sysfs_set_str(mdi, NULL, "metadata_version", vers);
+
+                       close(fd);
+                       rv = sysfs_set_str(mdi, NULL, "array_state", "readonly");
+
+                       if (rv < 0) {
+                               fprintf(stderr, Name ": failed to set readonly for %s: %s\n",
+                                       devname, strerror(errno));
+
+                               vers[9] = mdi->text_version[0];
+                               sysfs_set_str(mdi, NULL, "metadata_version", vers);
+                               return 1;
+                       }
+               } else {
+                       char *cp;
+                       /* We cannot set read/write - must signal mdmon */
+                       vers[9] = '/';
+                       sysfs_set_str(mdi, NULL, "metadata_version", vers);
+
+                       cp = strchr(vers+10, '/');
+                       if (cp) /* strchr() gives NULL when there is no second '/' */
+                               *cp = 0;
+                       ping_monitor(vers+10);
+               }
+               return 0;
+       }
+#endif
        if (ioctl(fd, GET_ARRAY_INFO, &array)) {
                fprintf(stderr, Name ": %s does not appear to be active.\n",
                        devname);
@@ -74,17 +116,70 @@ int Manage_ro(char *devname, int fd, int readonly)
 
 #ifndef MDASSEMBLE
 
+static void remove_devices(int devnum, char *path)
+{
+       /* Remove all 'standard' devices for 'devnum', including
+        * partitions.  Also remove names at 'path' - possibly with
+        * partition suffixes - which link to those names.
+        */
+       char base[40];
+       char *path2 = NULL;     /* scratch copy of 'path' with room for a suffix */
+       char link[1024];
+       int n, part;
+       char *be;
+       char *pe = NULL;        /* end of the copied path, where the suffix goes */
+
+       if (devnum >= 0)
+               sprintf(base, "/dev/md%d", devnum);
+       else
+               sprintf(base, "/dev/md_d%d", -1-devnum);
+       be = base + strlen(base);
+       if (path)
+               path2 = malloc(strlen(path)+20);
+       if (path2) {    /* no path, or no memory: only remove the standard names */
+               strcpy(path2, path);
+               pe = path2 + strlen(path2);
+       }
+
+       for (part = 0; part < 16; part++) {
+               if (part) {
+                       sprintf(be, "p%d", part);
+                       if (path2) {
+                               if (pe > path2 && isdigit((unsigned char)pe[-1]))
+                                       sprintf(pe, "p%d", part);
+                               else
+                                       sprintf(pe, "%d", part);
+                       }
+               }
+               /* FIXME test if really is md device ?? */
+               unlink(base);
+               if (path2) {
+                       n = readlink(path2, link, sizeof(link));
+                       if (n > 0 && (size_t)n == strlen(base) &&
+                           strncmp(link, base, n) == 0)
+                               unlink(path2);
+               }
+       }
+       free(path2);
+}
+       
+
 int Manage_runstop(char *devname, int fd, int runstop, int quiet)
 {
        /* Run or stop the array. array must already be configured
         * required >= 0.90.0
+        * Only print failure messages if quiet == 0;
+        * quiet > 0 means really be quiet
+        * quiet < 0 means we will try again if it fails.
         */
        mdu_param_t param; /* unused */
 
        if (runstop == -1 && md_get_version(fd) < 9000) {
                if (ioctl(fd, STOP_MD, 0)) {
-                       if (!quiet) fprintf(stderr, Name ": stopping device %s failed: %s\n",
-                                           devname, strerror(errno));
+                       if (quiet == 0) fprintf(stderr,
+                                               Name ": stopping device %s "
+                                               "failed: %s\n",
+                                               devname, strerror(errno));
                        return 1;
                }
        }
@@ -111,25 +206,77 @@ int Manage_runstop(char *devname, int fd, int runstop, int quiet)
        } else if (runstop < 0){
                struct map_ent *map = NULL;
                struct stat stb;
-               if (ioctl(fd, STOP_ARRAY, NULL)) {
-                       if (quiet==0) {
-                               fprintf(stderr, Name ": fail to stop array %s: %s\n",
+               struct mdinfo *mdi;
+               int devnum;
+               /* If this is an mdmon managed array, just write 'inactive'
+                * to the array state and let mdmon clear up.
+                */
+               devnum = fd2devnum(fd);
+               mdi = sysfs_read(fd, -1, GET_LEVEL|GET_VERSION);
+               if (mdi &&
+                   mdi->array.level > 0 &&
+                   is_subarray(mdi->text_version)) {
+                       /* This is mdmon managed. */
+                       close(fd);
+                       if (sysfs_set_str(mdi, NULL,
+                                         "array_state", "inactive") < 0) {
+                               if (quiet == 0)
+                                       fprintf(stderr, Name
+                                               ": failed to stop array %s: %s\n",
+                                               devname, strerror(errno));
+                               return 1;
+                       }
+
+                       /* Give monitor a chance to act */
+                       ping_monitor(mdi->text_version);
+
+                       fd = open(devname, O_RDONLY);
+               } else if (mdi &&
+                          mdi->array.major_version == -1 &&
+                          mdi->array.minor_version == -2 &&
+                          !is_subarray(mdi->text_version)) {
+                       /* container, possibly mdmon-managed.
+                        * Make sure mdmon isn't opening it, which
+                        * would interfere with the 'stop'
+                        */
+                       ping_monitor(mdi->sys_name);
+               }
+
+               if (fd >= 0 && ioctl(fd, STOP_ARRAY, NULL)) {
+                       if (quiet == 0) {
+                               fprintf(stderr, Name
+                                       ": failed to stop array %s: %s\n",
                                        devname, strerror(errno));
                                if (errno == EBUSY)
                                        fprintf(stderr, "Perhaps a running "
                                                "process, mounted filesystem "
                                                "or active volume group?\n");
                        }
+                       if (mdi)
+                               sysfs_free(mdi);
                        return 1;
                }
+               /* prior to 2.6.28, KOBJ_CHANGE was not sent when an md array
+                * was stopped, so we'll do it here just to be sure.  Drop any
+                * partitions as well...
+                */
+               if (fd >= 0)
+                       ioctl(fd, BLKRRPART, 0);
+               if (mdi)
+                       sysfs_uevent(mdi, "change");
+
+               
+               if (devnum != NoMdDev &&
+                   (stat("/dev/.udev", &stb) != 0 ||
+                    check_env("MDADM_NO_UDEV"))) {
+                       struct map_ent *mp = map_by_devnum(&map, devnum);
+                       remove_devices(devnum, mp ? mp->path : NULL);
+               }
+
+
                if (quiet <= 0)
                        fprintf(stderr, Name ": stopped %s\n", devname);
-               if (fstat(fd, &stb) == 0) {
-                       int devnum;
-                       if (major(stb.st_rdev) == MD_MAJOR)
-                               devnum = minor(stb.st_rdev);
-                       else
-                               devnum = -1-(minor(stb.st_rdev)>>6);
+               if (devnum != NoMdDev) {
                        map_delete(&map, devnum);
                        map_write(map);
                        map_free(map);
@@ -201,6 +348,7 @@ int Manage_subdevs(char *devname, int fd,
        struct supertype *st, *tst;
        int duuid[4];
        int ouuid[4];
+       int lfd = -1;
 
        if (ioctl(fd, GET_ARRAY_INFO, &array)) {
                fprintf(stderr, Name ": cannot get array info for %s\n",
@@ -227,6 +375,7 @@ int Manage_subdevs(char *devname, int fd,
                unsigned long long ldsize;
                char dvname[20];
                char *dnprintable = dv->devname;
+               int err;
 
                next = dv->next;
                jnext = 0;
@@ -315,9 +464,14 @@ int Manage_subdevs(char *devname, int fd,
                        return 1;
                case 'a':
                        /* add the device */
-
+                       if (tst->subarray[0]) {
+                               fprintf(stderr, Name ": Cannot add disks to a"
+                                       " \'member\' array, perform this"
+                                       " operation on the parent container\n");
+                               return 1;
+                       }
                        /* Make sure it isn't in use (in 2.6 or later) */
-                       tfd = dev_open(dv->devname, O_RDONLY|O_EXCL);
+                       tfd = dev_open(dv->devname, O_RDONLY|O_EXCL|O_DIRECT);
                        if (tfd < 0) {
                                fprintf(stderr, Name ": Cannot open %s: %s\n",
                                        dv->devname, strerror(errno));
@@ -336,7 +490,9 @@ int Manage_subdevs(char *devname, int fd,
                        }
                        close(tfd);
 
-                       if (array.major_version == 0 &&
+
+                       if (!tst->ss->external &&
+                           array.major_version == 0 &&
                            md_get_version(fd)%100 < 2) {
                                if (ioctl(fd, HOT_ADD_DISK,
                                          (unsigned long)stb.st_rdev)==0) {
@@ -351,12 +507,16 @@ int Manage_subdevs(char *devname, int fd,
                                return 1;
                        }
 
-                       if (array.not_persistent == 0) {
+                       if (array.not_persistent == 0 || tst->ss->external) {
 
                                /* need to find a sample superblock to copy, and
-                                * a spare slot to use
+                                * a spare slot to use.
+                                * For 'external' array (well, container based),
+                                * We can just load the metadata for the array.
                                 */
-                               for (j = 0; j < tst->max_devs; j++) {
+                               if (tst->ss->external) {
+                                       tst->ss->load_super(tst, fd, NULL);
+                               } else for (j = 0; j < tst->max_devs; j++) {
                                        char *dev;
                                        int dfd;
                                        disc.number = j;
@@ -378,6 +538,7 @@ int Manage_subdevs(char *devname, int fd,
                                        close(dfd);
                                        break;
                                }
+                               /* FIXME this is a bad test to be using */
                                if (!tst->sb) {
                                        fprintf(stderr, Name ": cannot find valid superblock in this array - HELP\n");
                                        return 1;
@@ -462,12 +623,21 @@ int Manage_subdevs(char *devname, int fd,
                        disc.minor = minor(stb.st_rdev);
                        disc.number =j;
                        disc.state = 0;
-                       if (array.not_persistent==0) {
+                       if (array.not_persistent==0 || tst->ss->external) {
+                               int dfd;
                                if (dv->writemostly == 1)
                                        disc.state |= 1 << MD_DISK_WRITEMOSTLY;
-                               tst->ss->add_to_super(tst, &disc);
-                               if (tst->ss->write_init_super(tst, &disc,
-                                                             dv->devname))
+                               dfd = dev_open(dv->devname, O_RDWR | O_EXCL|O_DIRECT);
+                               if (tst->ss->add_to_super(tst, &disc, dfd,
+                                                         dv->devname)) {
+                                       close(dfd);
+                                       return 1;
+                               }
+                               /* write_init_super will close 'dfd' */
+                               if (tst->ss->external)
+                                       /* mdmon will write the metadata */
+                                       close(dfd);
+                               else if (tst->ss->write_init_super(tst))
                                        return 1;
                        } else if (dv->re_add) {
                                /*  this had better be raid1.
@@ -500,7 +670,52 @@ int Manage_subdevs(char *devname, int fd,
                        }
                        if (dv->writemostly == 1)
                                disc.state |= (1 << MD_DISK_WRITEMOSTLY);
-                       if (ioctl(fd,ADD_NEW_DISK, &disc)) {
+                       if (tst->ss->external) {
+                               /* add a disk to an external metadata container
+                                * only if mdmon is around to see it
+                                */
+                               struct mdinfo new_mdi;
+                               struct mdinfo *sra;
+                               int container_fd;
+                               int devnum = fd2devnum(fd);
+
+                               container_fd = open_dev_excl(devnum);
+                               if (container_fd < 0) {
+                                       fprintf(stderr, Name ": add failed for %s:"
+                                               " could not get exclusive access to container\n",
+                                               dv->devname);
+                                       return 1;
+                               }
+
+                               if (!mdmon_running(devnum)) {
+                                       fprintf(stderr, Name ": add failed for %s: mdmon not running\n",
+                                               dv->devname);
+                                       close(container_fd);
+                                       return 1;
+                               }
+
+                               sra = sysfs_read(container_fd, -1, 0);
+                               if (!sra) {
+                                       fprintf(stderr, Name ": add failed for %s: sysfs_read failed\n",
+                                               dv->devname);
+                                       close(container_fd);
+                                       return 1;
+                               }
+                               sra->array.level = LEVEL_CONTAINER;
+                               /* Need to set data_offset and component_size */
+                               tst->ss->getinfo_super(tst, &new_mdi);
+                               new_mdi.disk.major = disc.major;
+                               new_mdi.disk.minor = disc.minor;
+                               if (sysfs_add_disk(sra, &new_mdi, 0) != 0) {
+                                       fprintf(stderr, Name ": add new device to external metadata"
+                                               " failed for %s\n", dv->devname);
+                                       close(container_fd);
+                                       return 1;
+                               }
+                               ping_monitor(devnum2devname(devnum));
+                               sysfs_free(sra);
+                               close(container_fd);
+                       } else if (ioctl(fd, ADD_NEW_DISK, &disc)) {
                                fprintf(stderr, Name ": add new device failed for %s as %d: %s\n",
                                        dv->devname, j, strerror(errno));
                                return 1;
@@ -511,13 +726,94 @@ int Manage_subdevs(char *devname, int fd,
 
                case 'r':
                        /* hot remove */
+                       if (tst->subarray[0]) {
+                               fprintf(stderr, Name ": Cannot remove disks from a"
+                                       " \'member\' array, perform this"
+                                       " operation on the parent container\n");
+                               return 1;
+                       }
+                       if (tst->ss->external) {
+                               /* To remove a device from a container, we must
+                                * check that it isn't in use in an array.
+                                * This involves looking in the 'holders'
+                                * directory - there must be just one entry,
+                                * the container.
+                                * To ensure that it doesn't get used as a
+                                * hot spare while we are checking, we
+                                * get an O_EXCL open on the container
+                                */
+                               int dnum = fd2devnum(fd);
+                               lfd = open_dev_excl(dnum);
+                               if (lfd < 0) {
+                                       fprintf(stderr, Name
+                                               ": Cannot get exclusive access "
+                                               " to container - odd\n");
+                                       return 1;
+                               }
+                               /* in the detached case it is not possible to
+                                * check if we are the unique holder, so just
+                                * rely on the 'detached' checks
+                                */
+                               if (strcmp(dv->devname, "detached") == 0 ||
+                                   sysfs_unique_holder(dnum, stb.st_rdev))
+                                       /* pass */;
+                               else {
+                                       fprintf(stderr, Name
+                                               ": %s is %s, cannot remove.\n",
+                                               dnprintable,
+                                               errno == EEXIST ? "still in use":
+                                               "not a member");
+                                       close(lfd);
+                                       return 1;
+                               }
+                       }
                        /* FIXME check that it is a current member */
-                       if (ioctl(fd, HOT_REMOVE_DISK, (unsigned long)stb.st_rdev)) {
+                       err = ioctl(fd, HOT_REMOVE_DISK, (unsigned long)stb.st_rdev);
+                       if (err && errno == ENODEV) {
+                               /* Old kernels rejected this if no personality
+                                * registered */
+                               struct mdinfo *sra = sysfs_read(fd, 0, GET_DEVS);
+                               struct mdinfo *dv = NULL;
+                               if (sra)
+                                       dv = sra->devs;
+                               for ( ; dv ; dv=dv->next)
+                                       if (dv->disk.major == major(stb.st_rdev) &&
+                                           dv->disk.minor == minor(stb.st_rdev))
+                                               break;
+                               if (dv)
+                                       err = sysfs_set_str(sra, dv,
+                                                           "state", "remove");
+                               else
+                                       err = -1;
+                               if (sra)
+                                       sysfs_free(sra);
+                       }
+                       if (err) {
                                fprintf(stderr, Name ": hot remove failed "
                                        "for %s: %s\n", dnprintable,
                                        strerror(errno));
+                               if (lfd >= 0)
+                                       close(lfd);
                                return 1;
                        }
+                       if (tst->ss->external) {
+                               /*
+                                * Before dropping our exclusive open we make an
+                                * attempt at preventing mdmon from seeing an
+                                * 'add' event before reconciling this 'remove'
+                                * event.
+                                */
+                               char *name = devnum2devname(fd2devnum(fd));
+
+                               if (!name) {
+                                       fprintf(stderr, Name ": unable to get container name\n");
+                                       return 1;
+                               }
+
+                               ping_manager(name);
+                               free(name);
+                       }
+                       close(lfd);
                        if (verbose >= 0)
                                fprintf(stderr, Name ": hot removed %s\n",
                                        dnprintable);
index e0a9d2a614a464816cfea126f21e3cd71982f954..f6fd95cdfa9401c9bef67bdefd3403f8a3a5e852 100644 (file)
--- a/Monitor.c
+++ b/Monitor.c
@@ -1,7 +1,7 @@
 /*
  * mdadm - manage Linux "md" devices aka RAID arrays.
  *
- * Copyright (C) 2001-2006 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
  *
  *
  *    This program is free software; you can redistribute it and/or modify
  *    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  *
  *    Author: Neil Brown
- *    Email: <neilb@cse.unsw.edu.au>
- *    Paper: Neil Brown
- *           School of Computer Science and Engineering
- *           The University of New South Wales
- *           Sydney, 2052
- *           Australia
+ *    Email: <neilb@suse.de>
  */
 
 #include       "mdadm.h"
@@ -165,10 +160,21 @@ int Monitor(mddev_dev_t devlist,
        if (devlist == NULL) {
                mddev_ident_t mdlist = conf_get_ident(NULL);
                for (; mdlist; mdlist=mdlist->next) {
-                       struct state *st = malloc(sizeof *st);
+                       struct state *st;
+                       if (mdlist->devname == NULL)
+                               continue;
+                       if (strcasecmp(mdlist->devname, "<ignore>") == 0)
+                               continue;
+                       st = malloc(sizeof *st);
                        if (st == NULL)
                                continue;
-                       st->devname = strdup(mdlist->devname);
+                       if (mdlist->devname[0] == '/')
+                               st->devname = strdup(mdlist->devname);
+                       else {
+                               st->devname = malloc(8+strlen(mdlist->devname)+1);
+                               strcpy(strcpy(st->devname, "/dev/md/"),
+                                      mdlist->devname);
+                       }
                        st->utime = 0;
                        st->next = statelist;
                        st->err = 0;
@@ -273,6 +279,10 @@ int Monitor(mddev_dev_t devlist,
                                        mse = mse2;
                                }
 
+                       if (array.utime == 0)
+                               /* external arrays don't update utime */
+                               array.utime = time(0);
+
                        if (st->utime == array.utime &&
                            st->failed == array.failed_disks &&
                            st->working == array.working_disks &&
@@ -613,10 +623,7 @@ int Wait(char *dev)
                        strerror(errno));
                return 2;
        }
-       if (major(stb.st_rdev) == MD_MAJOR)
-               devnum = minor(stb.st_rdev);
-       else
-               devnum = -1-(minor(stb.st_rdev)/64);
+       devnum = stat2devnum(&stb);
 
        while(1) {
                struct mdstat_ent *ms = mdstat_read(1, 0);
@@ -627,6 +634,13 @@ int Wait(char *dev)
                                break;
 
                if (!e || e->percent < 0) {
+                       if (e && e->metadata_version &&
+                           strncmp(e->metadata_version, "external:", 9) == 0) {
+                               if (is_subarray(&e->metadata_version[9]))
+                                       ping_monitor(&e->metadata_version[9]);
+                               else
+                                       ping_monitor(devnum2devname(devnum));
+                       }
                        free_mdstat(ms);
                        return rv;
                }
diff --git a/Query.c b/Query.c
index 190ee298834e70d9640e29fbc551bba0a5742934..8847be7ec0b6a1e0880865d672b38f9f0374d2e2 100644 (file)
--- a/Query.c
+++ b/Query.c
@@ -1,7 +1,7 @@
 /*
  * mdadm - manage Linux "md" devices aka RAID arrays.
  *
- * Copyright (C) 2002-2006 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2002-2009 Neil Brown <neilb@suse.de>
  *
  *
  *    This program is free software; you can redistribute it and/or modify
  *    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  *
  *    Author: Neil Brown
- *    Email: <neilb@cse.unsw.edu.au>
- *    Paper: Neil Brown
- *           School of Computer Science and Engineering
- *           The University of New South Wales
- *           Sydney, 2052
- *           Australia
+ *    Email: <neilb@suse.de>
  */
 
 #include       "mdadm.h"
@@ -96,7 +91,7 @@ int Query(char *dev)
        if (superror == 0) {
                /* array might be active... */
                st->ss->getinfo_super(st, &info);
-               if (st->ss->major == 0) {
+               if (st->ss == &super0) {
                        mddev = get_md_name(info.array.md_minor);
                        disc.number = info.disk.number;
                        activity = "undetected";
@@ -121,7 +116,7 @@ int Query(char *dev)
                       activity,
                       map_num(pers, info.array.level),
                       mddev);
-               if (st->ss->major == 0)
+               if (st->ss == &super0)
                        put_md_name(mddev);
        }
        return 0;
index 818be0adf9bc9dd64d041d8bbedd1b3d41b69032..b8734d106cb9c54e317e9363bda808861806879a 100644 (file)
--- a/ReadMe.c
+++ b/ReadMe.c
@@ -1,7 +1,7 @@
 /*
  * mdadm - manage Linux "md" devices aka RAID arrays.
  *
- * Copyright (C) 2001-2007 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
  *
  *
  *    This program is free software; you can redistribute it and/or modify
@@ -24,7 +24,7 @@
 
 #include "mdadm.h"
 
-char Version[] = Name " - v2.6.9 - 10th March 2009\n";
+char Version[] = Name " - v3.0-rc1 - 11th May 2009\n";
 
 /*
  * File: ReadMe.c
@@ -107,6 +107,7 @@ struct option long_options[] = {
     {"query",    0, 0, 'Q'},
     {"examine-bitmap", 0, 0, 'X'},
     {"auto-detect", 0, 0, AutoDetect},
+    {"detail-platform", 0, 0, DetailPlatform},
 
     /* synonyms */
     {"monitor",   0, 0, 'F'},
@@ -138,7 +139,9 @@ struct option long_options[] = {
     {"write-mostly",0, 0, 'W'},
     {"re-add",    0, 0,  ReAdd},
     {"homehost",  1, 0,  HomeHost},
+#if 0
     {"auto-update-homehost", 0, 0, AutoHomeHost},
+#endif
     {"symlinks",  1, 0,  Symlinks},
 
     /* For assemble */
@@ -161,6 +164,7 @@ struct option long_options[] = {
     {"readwrite", 0, 0, 'w'},
     {"no-degraded",0,0,  NoDegraded },
     {"wait",     0, 0, 'W'},
+    {"wait-clean", 0, 0, Waitclean },
 
     /* For Detail/Examine */
     {"brief",    0, 0, 'b'},
@@ -269,7 +273,6 @@ char OptionHelp[] =
 "  --size=       -z   : Size (in K) of each drive in RAID1/4/5/6/10 - optional\n"
 "  --force       -f   : Honour devices as listed on command line.  Don't\n"
 "                     : insert a missing drive for RAID5.\n"
-"  --auto(=p)    -a   : Automatically allocate new (partitioned) md array if needed.\n"
 "  --assume-clean     : Assume the array is already in-sync. This is dangerous.\n"
 "  --bitmap-chunk=    : chunksize of bitmap in bitmap file (Kilobytes)\n"
 "  --delay=      -d   : seconds between bitmap updates\n"
@@ -287,7 +290,6 @@ char OptionHelp[] =
 "  --scan        -s   : scan config file for missing information\n"
 "  --force       -f   : Assemble the array even if some superblocks appear out-of-date\n"
 "  --update=     -U   : Update superblock: try '-A --update=?' for list of options.\n"
-"  --auto(=p)    -a   : Automatically allocate new (partitioned) md array if needed.\n"
 "  --no-degraded      : Do not start any degraded arrays - default unless --scan.\n"
 "\n"
 " For detail or examine:\n"
@@ -465,6 +467,7 @@ char Help_misc[] =
 "  --query       -Q   : Display general information about how a\n"
 "                       device relates to the md driver\n"
 "  --detail      -D   : Display details of an array\n"
+"  --detail-platform  : Display hardware/firmware details\n"
 "  --examine     -E   : Examine superblock on an array component\n"
 "  --examine-bitmap -X: Display contents of a bitmap file\n"
 "  --zero-superblock  : erase the MD superblock from a device.\n"
@@ -581,16 +584,49 @@ char Help_config[] =
 /* name/number mappings */
 
 mapping_t r5layout[] = {
-       { "left-asymmetric", 0},
-       { "right-asymmetric", 1},
-       { "left-symmetric", 2},
-       { "right-symmetric", 3},
-
-       { "default", 2},
-       { "la", 0},
-       { "ra", 1},
-       { "ls", 2},
-       { "rs", 3},
+       { "left-asymmetric", ALGORITHM_LEFT_ASYMMETRIC},
+       { "right-asymmetric", ALGORITHM_RIGHT_ASYMMETRIC},
+       { "left-symmetric", ALGORITHM_LEFT_SYMMETRIC},
+       { "right-symmetric", ALGORITHM_RIGHT_SYMMETRIC},
+
+       { "default", ALGORITHM_LEFT_SYMMETRIC},
+       { "la", ALGORITHM_LEFT_ASYMMETRIC},
+       { "ra", ALGORITHM_RIGHT_ASYMMETRIC},
+       { "ls", ALGORITHM_LEFT_SYMMETRIC},
+       { "rs", ALGORITHM_RIGHT_SYMMETRIC},
+
+       { "parity-first", ALGORITHM_PARITY_0},
+       { "parity-last", ALGORITHM_PARITY_N},
+       { "ddf-zero-restart", ALGORITHM_RIGHT_ASYMMETRIC},
+       { "ddf-N-restart", ALGORITHM_LEFT_ASYMMETRIC},
+       { "ddf-N-continue", ALGORITHM_LEFT_SYMMETRIC},
+
+       { NULL, 0}
+};
+mapping_t r6layout[] = {       /* layout name -> ALGORITHM_* value; presumably the RAID6 counterpart of r5layout */
+       { "left-asymmetric", ALGORITHM_LEFT_ASYMMETRIC},        /* full names */
+       { "right-asymmetric", ALGORITHM_RIGHT_ASYMMETRIC},
+       { "left-symmetric", ALGORITHM_LEFT_SYMMETRIC},
+       { "right-symmetric", ALGORITHM_RIGHT_SYMMETRIC},
+
+       { "default", ALGORITHM_LEFT_SYMMETRIC}, /* same value as "left-symmetric" */
+       { "la", ALGORITHM_LEFT_ASYMMETRIC},     /* two-letter aliases of the full names */
+       { "ra", ALGORITHM_RIGHT_ASYMMETRIC},
+       { "ls", ALGORITHM_LEFT_SYMMETRIC},
+       { "rs", ALGORITHM_RIGHT_SYMMETRIC},
+
+       { "parity-first", ALGORITHM_PARITY_0},
+       { "parity-last", ALGORITHM_PARITY_N},
+       { "ddf-zero-restart", ALGORITHM_ROTATING_ZERO_RESTART}, /* unlike r5layout, ddf-* names have their own values here */
+       { "ddf-N-restart", ALGORITHM_ROTATING_N_RESTART},
+       { "ddf-N-continue", ALGORITHM_ROTATING_N_CONTINUE},
+
+       { "left-asymmetric-6", ALGORITHM_LEFT_ASYMMETRIC_6},    /* "-6" names map to separate *_6 values */
+       { "right-asymmetric-6", ALGORITHM_RIGHT_ASYMMETRIC_6},
+       { "left-symmetric-6", ALGORITHM_LEFT_SYMMETRIC_6},
+       { "right-symmetric-6", ALGORITHM_RIGHT_SYMMETRIC_6},
+       { "parity-first-6", ALGORITHM_PARITY_0_6},
+
        { NULL, 0}      /* terminating entry, as in the other mapping_t tables */
 };
 
@@ -613,6 +649,7 @@ mapping_t pers[] = {
        { "raid10", 10},
        { "10", 10},
        { "faulty", LEVEL_FAULTY},
+       { "container", LEVEL_CONTAINER},
        { NULL, 0}
 };
 
diff --git a/TODO b/TODO
index f79163b88ca434065232034381af3a27aff23c25..279d20db99892c8e79b969a961ae38a7be5fd77c 100644 (file)
--- a/TODO
+++ b/TODO
@@ -1,3 +1,38 @@
+ - add 'name' field to metadata type and use it.
+ - use validate_geometry more
+ - metadata should be able to check/reject bitmap stuff.
+
+DDF:
+  Three new metadata types:
+    ddf - used only to create a container.
+    ddf-bvd - used to create an array in a container
+    ddf-svd - used to create a secondary array from bvds.
+
+  Usage:
+    mdadm -C /dev/ddf1 /dev/sd[abcdef]
+    mdadm -C /dev/md1 -e ddf /dev/sd[a-f]
+    mdadm -C /dev/md1 -l container /dev/sd[a-f]
+
+        Each of these creates a new ddf container using all those
+       devices.  The name 'ddf*' signals that ddf metadata should be used.
+       '-e ddf' only supports one level - 'container'.  'container' is only
+       supported by ddf.
+
+    mdadm -C /dev/md1 -l0 -n4 /dev/ddf1 # or maybe not ???
+    mdadm -C /dev/md1 -l1 -n2 /dev/sda /dev/sdb
+       If exactly one device is given, and it is a container, we select
+       devices from that container.
+       If devices are given that are already in use, they must be in use by
+       a container, and the array is created in the container.
+       If devices given are bvds, we slip under the hood to make
+         the svd arrays.
+
+    mdadm -A /dev/ddf ......
+       base drives make a container.  Anything in that container is started
+        auto-read-only.
+        if /dev/ddf is already assembled, we assemble bvds and svds inside it.
+
+
 2005-dec-20
   Want an incremental assembly mode to work nicely with udev.
   Core usage would be something like
index 5618087719cfa199de5e6078eb9f348ac1a313fc..850b0cedbf5fa8c9f4a1294e154377b4779581b0 100644 (file)
--- a/bitmap.c
+++ b/bitmap.c
@@ -131,11 +131,13 @@ bitmap_info_t *bitmap_fd_read(int fd, int brief)
         */
        unsigned long long total_bits = 0, read_bits = 0, dirty_bits = 0;
        bitmap_info_t *info;
-       char *buf, *unaligned;
+       void *buf;
        int n, skip;
 
-       unaligned = malloc(8192*2);
-       buf = (char*) ((unsigned long)unaligned | 8191)+1;
+       if (posix_memalign(&buf, 512, 8192) != 0) {
+               fprintf(stderr, Name ": failed to allocate 8192 bytes\n");
+               return NULL;
+       }
        n = read(fd, buf, 8192);
 
        info = malloc(sizeof(*info));
@@ -154,7 +156,6 @@ bitmap_info_t *bitmap_fd_read(int fd, int brief)
                fprintf(stderr, Name ": failed to read superblock of bitmap "
                        "file: %s\n", strerror(errno));
                free(info);
-               free(unaligned);
                return NULL;
        }
        memcpy(&info->sb, buf, sizeof(info->sb));
index 78bbb9d2d7cf8128eb4b669dee568bf6fd401ccc..c962afdaaed829d0122550166ce0f072d9ccea97 100644 (file)
--- a/config.c
+++ b/config.c
@@ -1,7 +1,7 @@
 /*
  * mdadm - manage Linux "md" devices aka RAID arrays.
  *
- * Copyright (C) 2001-2006 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
  *
  *
  *    This program is free software; you can redistribute it and/or modify
  *    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  *
  *    Author: Neil Brown
- *    Email: <neilb@cse.unsw.edu.au>
- *    Paper: Neil Brown
- *           School of Computer Science and Engineering
- *           The University of New South Wales
- *           Sydney, 2052
- *           Australia
+ *    Email: <neilb@suse.de>
  */
 
 #include       "mdadm.h"
@@ -56,7 +51,7 @@
  *  with a key word, and not be indented, or must start with a
  *  non-key-word and must be indented.
  *
- * Keywords are DEVICE and ARRAY
+ * Keywords are DEVICE and ARRAY ... and several others.
  * DEV{ICE} introduces some devices that might contain raid components.
  * e.g.
  *   DEV style=0 /dev/sda* /dev/hd*
@@ -79,7 +74,8 @@
 char DefaultConfFile[] = CONFFILE;
 char DefaultAltConfFile[] = CONFFILE2;
 
-enum linetype { Devices, Array, Mailaddr, Mailfrom, Program, CreateDev, Homehost, LTEnd };
+enum linetype { Devices, Array, Mailaddr, Mailfrom, Program, CreateDev,
+               Homehost, AutoMode, LTEnd };
 char *keywords[] = {
        [Devices]  = "devices",
        [Array]    = "array",
@@ -88,6 +84,7 @@ char *keywords[] = {
        [Program]  = "program",
        [CreateDev]= "create",
        [Homehost] = "homehost",
+       [AutoMode] = "auto",
        [LTEnd]    = NULL
 };
 
@@ -261,12 +258,44 @@ mddev_dev_t load_partitions(void)
                d->devname = strdup(name);
                d->next = rv;
                d->used = 0;
+               d->content = NULL;
                rv = d;
        }
        fclose(f);
        return rv;
 }
 
+mddev_dev_t load_containers(void)
+{
+       /* List "/dev/<name>" for every mdstat entry that is a container */
+       struct mdstat_ent *all = mdstat_read(1, 0);
+       struct mdstat_ent *e;
+       mddev_dev_t head = NULL;
+       if (all == NULL)
+               return NULL;
+       for (e = all; e != NULL; e = e->next) {
+               mddev_dev_t node;
+               /* want "external:" metadata that is not a subarray */
+               if (e->metadata_version == NULL ||
+                   strncmp(e->metadata_version, "external:", 9) != 0 ||
+                   is_subarray(&e->metadata_version[9]))
+                       continue;
+               node = malloc(sizeof(*node));
+               if (node == NULL)
+                       continue;       /* allocation failed: skip this entry */
+               if (asprintf(&node->devname, "/dev/%s", e->dev) < 0) {
+                       free(node);
+                       continue;
+               }
+               node->used = 0;
+               node->content = NULL;
+               node->next = head;      /* prepend to the result list */
+               head = node;
+       }
+       free_mdstat(all);
+       return head;
+}
+
 struct createinfo createinfo = {
        .autof = 2, /* by default, create devices with standard names */
        .symlinks = 1,
@@ -398,7 +427,8 @@ void devline(char *line)
        struct conf_dev *cd;
 
        for (w=dl_next(line); w != line; w=dl_next(w)) {
-               if (w[0] == '/' || strcasecmp(w, "partitions") == 0) {
+               if (w[0] == '/' || strcasecmp(w, "partitions") == 0 ||
+                   strcasecmp(w, "containers") == 0) {
                        cd = malloc(sizeof(*cd));
                        cd->name = strdup(w);
                        cd->next = cdevlist;
@@ -413,6 +443,17 @@ void devline(char *line)
 mddev_ident_t mddevlist = NULL;
 mddev_ident_t *mddevlp = &mddevlist;
 
+static int is_number(char *w)
+{
+       /* Return non-zero iff 'w' is one or more decimal digits and nothing
+        * else.  isdigit() needs an unsigned char value, so cast: a negative
+        * plain char (e.g. a high-bit byte in the config) would be undefined. */
+       int digits = 0;
+       for (; isdigit((unsigned char)*w); w++)
+               digits++;
+       return (digits && ! *w);
+}
+
 void arrayline(char *line)
 {
        char *w;
@@ -434,13 +475,39 @@ void arrayline(char *line)
        mis.bitmap_fd = -1;
        mis.bitmap_file = NULL;
        mis.name[0] = 0;
+       mis.container = NULL;
+       mis.member = NULL;
 
        for (w=dl_next(line); w!=line; w=dl_next(w)) {
-               if (w[0] == '/') {
-                       if (mis.devname)
-                               fprintf(stderr, Name ": only give one device per ARRAY line: %s and %s\n",
-                                       mis.devname, w);
-                       else mis.devname = w;
+               if (w[0] == '/' || strchr(w, '=') == NULL) {
+                       /* This names the device, or is '<ignore>'.
+                        * The rules match those in create_mddev.
+                        * 'w' must be:
+                        *  /dev/md/{anything}
+                        *  /dev/mdNN
+                        *  /dev/md_dNN
+                        *  <ignore>
+                        *  or anything that doesn't start '/' or '<'
+                        */
+                       if (strcasecmp(w, "<ignore>") == 0 ||
+                           strncmp(w, "/dev/md/", 8) == 0 ||
+                           (w[0] != '/' && w[0] != '<') ||
+                           (strncmp(w, "/dev/md", 7) == 0 && 
+                            is_number(w+7)) ||
+                           (strncmp(w, "/dev/md_d", 9) == 0 &&
+                            is_number(w+9))
+                               ) {
+                               /* This is acceptable */;
+                               if (mis.devname)
+                                       fprintf(stderr, Name ": only give one "
+                                               "device per ARRAY line: %s and %s\n",
+                                               mis.devname, w);
+                               else
+                                       mis.devname = w;
+                       }else {
+                               fprintf(stderr, Name ": %s is an invalid name for "
+                                       "an md device - ignored.\n", w);
+                       }
                } else if (strncasecmp(w, "uuid=", 5)==0 ) {
                        if (mis.uuid_set)
                                fprintf(stderr, Name ": only specify uuid once, %s ignored.\n",
@@ -516,19 +583,26 @@ void arrayline(char *line)
                } else if (strncasecmp(w, "auto=", 5) == 0 ) {
                        /* whether to create device special files as needed */
                        mis.autof = parse_auto(w+5, "auto type", 0);
+               } else if (strncasecmp(w, "member=", 7) == 0) {
+                       /* subarray within a container */
+                       mis.member = strdup(w+7);
+               } else if (strncasecmp(w, "container=", 10) == 0) {
+                       /* the container holding this subarray.  Either a device name
+                        * or a uuid */
+                       mis.container = strdup(w+10);
                } else {
                        fprintf(stderr, Name ": unrecognised word on ARRAY line: %s\n",
                                w);
                }
        }
-       if (mis.devname == NULL)
-               fprintf(stderr, Name ": ARRAY line with no device\n");
-       else if (mis.uuid_set == 0 && mis.devices == NULL && mis.super_minor == UnSet && mis.name[0] == 0)
+       if (mis.uuid_set == 0 && mis.devices == NULL &&
+           mis.super_minor == UnSet && mis.name[0] == 0 &&
+           (mis.container == NULL || mis.member == NULL))
                fprintf(stderr, Name ": ARRAY line %s has no identity information.\n", mis.devname);
        else {
                mi = malloc(sizeof(*mi));
                *mi = mis;
-               mi->devname = strdup(mis.devname);
+               mi->devname = mis.devname ? strdup(mis.devname) : NULL;
                mi->next = NULL;
                *mddevlp = mi;
                mddevlp = &mi->next;
@@ -558,10 +632,12 @@ void mailfromline(char *line)
                if (alert_mail_from == NULL)
                        alert_mail_from = strdup(w);
                else {
-                       char *t= NULL;
-                       xasprintf(&t, "%s %s", alert_mail_from, w);
-                       free(alert_mail_from);
-                       alert_mail_from = t;
+                       char *t = NULL;
+
+                       if (xasprintf(&t, "%s %s", alert_mail_from, w) > 0) {
+                               free(alert_mail_from);
+                               alert_mail_from = t;
+                       }
                }
        }
 }
@@ -582,12 +658,15 @@ void programline(char *line)
 }
 
 static char *home_host = NULL;
+static int require_homehost = 1;
 void homehostline(char *line)
 {
        char *w;
 
        for (w=dl_next(line); w != line ; w=dl_next(w)) {
-               if (home_host == NULL)
+               if (strcasecmp(w, "<ignore>")==0)
+                       require_homehost = 0;
+               else if (home_host == NULL)
                        home_host = strdup(w);
                else
                        fprintf(stderr, Name ": excess host name on HOMEHOST line: %s - ignored\n",
@@ -595,6 +674,16 @@ void homehostline(char *line)
        }
 }
 
+/* Word list of the first AUTO line seen in the config file, or NULL if none. */
+static char *auto_options = NULL;
+
+/* Record the AUTO line from the config file.
+ * Only the first AUTO line is kept; any later one is reported on stderr
+ * and ignored.  'line' is stored by reference, not copied, so it must
+ * stay valid for as long as conf_test_metadata() may consult it.
+ */
+void autoline(char *line)
+{
+       if (auto_options) {
+               fprintf(stderr, Name ": AUTO line may only be given once."
+                       "  Subsequent lines ignored\n");
+               return;
+       }
+       auto_options = line;
+}
 
 int loaded = 0;
 
@@ -665,6 +754,9 @@ void load_conffile(void)
                case Homehost:
                        homehostline(line);
                        break;
+               case AutoMode:
+                       autoline(line);
+                       break;
                default:
                        fprintf(stderr, Name ": Unknown keyword %s\n", line);
                }
@@ -694,9 +786,11 @@ char *conf_get_program(void)
        return alert_program;
 }
 
-char *conf_get_homehost(void)
+/* Return the host name given on the HOMEHOST config line, or NULL if none
+ * was given.  If require_homehostp is not NULL, *require_homehostp is set
+ * to 0 when "<ignore>" appeared on a HOMEHOST line and to 1 otherwise.
+ */
+char *conf_get_homehost(int *require_homehostp)
 {
        load_conffile();
+       if (require_homehostp)
+               *require_homehostp = require_homehost;
        return home_host;
 }
 
@@ -711,11 +805,19 @@ mddev_ident_t conf_get_ident(char *dev)
        mddev_ident_t rv;
        load_conffile();
        rv = mddevlist;
-       while (dev && rv && strcmp(dev, rv->devname)!=0)
+       while (dev && rv && (rv->devname == NULL
+                            || !devname_matches(dev, rv->devname)))
                rv = rv->next;
        return rv;
 }
 
+/* Attach 'list' to the end of the device list whose head pointer is *dlp. */
+static void append_dlist(mddev_dev_t *dlp, mddev_dev_t list)
+{
+       mddev_dev_t *tail;
+
+       /* advance to the NULL link that terminates the existing list */
+       for (tail = dlp; *tail != NULL; tail = &(*tail)->next)
+               ;
+       *tail = list;
+}
+
 mddev_dev_t conf_get_devs()
 {
        glob_t globbuf;
@@ -733,13 +835,17 @@ mddev_dev_t conf_get_devs()
 
        load_conffile();
 
-       if (cdevlist == NULL)
-               /* default to 'partitions */
+       if (cdevlist == NULL) {
+               /* default to 'partitions' and 'containers' */
                dlist = load_partitions();
+               append_dlist(&dlist, load_containers());
+       }
 
        for (cd=cdevlist; cd; cd=cd->next) {
-               if (strcasecmp(cd->name, "partitions")==0 && dlist == NULL)
-                       dlist = load_partitions();
+               if (strcasecmp(cd->name, "partitions")==0)
+                       append_dlist(&dlist, load_partitions());
+               else if (strcasecmp(cd->name, "containers")==0)
+                       append_dlist(&dlist, load_containers());
                else {
                        glob(cd->name, flags, NULL, &globbuf);
                        flags |= GLOB_APPEND;
@@ -751,6 +857,7 @@ mddev_dev_t conf_get_devs()
                        t->devname = strdup(globbuf.gl_pathv[i]);
                        t->next = dlist;
                        t->used = 0;
+                       t->content = NULL;
                        dlist = t;
 /*     printf("one dev is %s\n", t->devname);*/
                }
@@ -775,6 +882,52 @@ int conf_test_dev(char *devname)
        return 0;
 }
 
+int conf_test_metadata(const char *version)
+{
+       /* Check if the given metadata version is allowed
+        * to be auto-assembled.
+        * The default is 'yes' but the 'auto' line might over-ride that.
+        * Words in auto_options are processed in order with the first
+        * match winning.
+        * word can be:
+        *   +version   - that version can be assembled
+        *   -version   - that version cannot be auto-assembled
+        *   yes or +all - any other version can be assembled
+        *   no or -all  - no other version can be assembled.
+        * Any other word that does not start with '+' or '-' is skipped.
+        *
+        * Returns 1 if auto-assembly is allowed, 0 if it is not.
+        */
+       char *w;
+       int dotted;
+
+       load_conffile();
+       if (!auto_options)
+               return 1;
+
+       /* Only index version[1] and beyond when the string is long
+        * enough: an empty version must not be read past its NUL.
+        */
+       dotted = version[0] != '\0' && version[1] == '.';
+
+       for (w = dl_next(auto_options); w != auto_options; w = dl_next(w)) {
+               int rv;
+               if (strcasecmp(w, "yes") == 0)
+                       return 1;
+               if (strcasecmp(w, "no") == 0)
+                       return 0;
+               if (w[0] == '+')
+                       rv = 1;
+               else if (w[0] == '-')
+                       rv = 0;
+               else continue;
+
+               if (strcasecmp(w+1, "all") == 0)
+                       return rv;
+               if (strcasecmp(w+1, version) == 0)
+                       return rv;
+               /* allow  '0' to match version '0.90'
+                * and 1 or 1.whatever to match version '1.x'
+                */
+               if (dotted &&
+                   strlen(w+1) == 1 &&
+                   w[1] == version[0])
+                       return rv;
+               if (dotted && version[2] == 'x' &&
+                   strncmp(w+1, version, 2) == 0)
+                       return rv;
+       }
+       return 1;
+}
 
 int match_oneof(char *devices, char *devname)
 {
@@ -800,3 +953,128 @@ int match_oneof(char *devices, char *devname)
     }
     return 0;
 }
+
+int devname_matches(char *name, char *match)
+{
+       /* See if the given array name matches the
+        * given match from config file.
+        *
+        * First strip any /dev/md/ or /dev/ prefix from both, then
+        * drop a leading "md" that is followed by a digit so that
+        * mdNN compares equal to NN, then just strcmp.
+        *
+        * Returns 1 on a match, 0 otherwise.
+        */
+       if (strncmp(name, "/dev/md/", 8) == 0)
+               name += 8;
+       else if (strncmp(name, "/dev/", 5) == 0)
+               name += 5;
+
+       if (strncmp(match, "/dev/md/", 8) == 0)
+               match += 8;
+       else if (strncmp(match, "/dev/", 5) == 0)
+               match += 5;
+
+       /* isdigit() is only defined for unsigned char values and EOF,
+        * so convert explicitly in case plain char is signed.
+        */
+       if (strncmp(name, "md", 2) == 0 &&
+           isdigit((unsigned char)name[2]))
+               name += 2;
+       if (strncmp(match, "md", 2) == 0 &&
+           isdigit((unsigned char)match[2]))
+               match += 2;
+
+       return (strcmp(name, match) == 0);
+}
+
+int conf_name_is_free(char *name)
+{
+       /* Check if this name is already taken by an ARRAY entry in
+        * the config file.
+        * It can be taken either by a match on devname, name, or
+        * even super-minor.
+        *
+        * Returns 1 if no ARRAY entry claims the name, 0 otherwise.
+        */
+       mddev_ident_t dev;
+
+       load_conffile();
+       for (dev = mddevlist; dev; dev = dev->next) {
+               if (dev->devname && devname_matches(name, dev->devname))
+                       return 0;
+               if (dev->name[0] && devname_matches(name, dev->name))
+                       return 0;
+               if (dev->super_minor != UnSet) {
+                       /* compare against the minor number as text;
+                        * only format it when one was actually given
+                        */
+                       char nbuf[100];
+
+                       snprintf(nbuf, sizeof(nbuf), "%d", dev->super_minor);
+                       if (devname_matches(name, nbuf))
+                               return 0;
+               }
+       }
+       return 1;
+}
+
+/* Find the ARRAY entry from the config file that describes the array
+ * whose details are in 'info'.
+ * An entry is skipped if its uuid, name or super-minor is set and does
+ * not agree with 'info', or if it sets none of uuid, name, devices and
+ * super-minor.
+ * Returns the single remaining entry, or NULL if no entry remains or if
+ * more than one does (in which case a message is written to stderr).
+ */
+struct mddev_ident_s *conf_match(struct mdinfo *info, struct supertype *st)
+{
+       struct mddev_ident_s *array_list, *match;
+       /* verbose and devname are never changed in this function, so the
+        * "verbose >= 2" messages and the devices= test below are
+        * currently inactive.
+        */
+       int verbose = 0;
+       char *devname = NULL;
+       /* a NULL argument makes conf_get_ident() return the head of the list */
+       array_list = conf_get_ident(NULL);
+       match = NULL;
+       for (; array_list; array_list = array_list->next) {
+               /* same_uuid() returning 0 is treated as a mismatch */
+               if (array_list->uuid_set &&
+                   same_uuid(array_list->uuid, info->uuid, st->ss->swapuuid)
+                   == 0) {
+                       if (verbose >= 2 && array_list->devname)
+                               fprintf(stderr, Name
+                                       ": UUID differs from %s.\n",
+                                       array_list->devname);
+                       continue;
+               }
+               /* array names are compared without regard to case */
+               if (array_list->name[0] &&
+                   strcasecmp(array_list->name, info->name) != 0) {
+                       if (verbose >= 2 && array_list->devname)
+                               fprintf(stderr, Name
+                                       ": Name differs from %s.\n",
+                                       array_list->devname);
+                       continue;
+               }
+               if (array_list->devices && devname &&
+                   !match_oneof(array_list->devices, devname)) {
+                       if (verbose >= 2 && array_list->devname)
+                               fprintf(stderr, Name
+                                       ": Not a listed device for %s.\n",
+                                       array_list->devname);
+                       continue;
+               }
+               if (array_list->super_minor != UnSet &&
+                   array_list->super_minor != info->array.md_minor) {
+                       if (verbose >= 2 && array_list->devname)
+                               fprintf(stderr, Name
+                                       ": Different super-minor to %s.\n",
+                                       array_list->devname);
+                       continue;
+               }
+               /* an entry with nothing to identify it by cannot match */
+               if (!array_list->uuid_set &&
+                   !array_list->name[0] &&
+                   !array_list->devices &&
+                   array_list->super_minor == UnSet) {
+                       if (verbose >= 2 && array_list->devname)
+                               fprintf(stderr, Name
+                            ": %s doesn't have any identifying information.\n",
+                                       array_list->devname);
+                       continue;
+               }
+               /* FIXME, should I check raid_disks and level too?? */
+
+               /* a second acceptable entry makes the result ambiguous */
+               if (match) {
+                       if (verbose >= 0) {
+                               if (match->devname && array_list->devname)
+                                       fprintf(stderr, Name
+                  ": we match both %s and %s - cannot decide which to use.\n",
+                                               match->devname, array_list->devname);
+                               else
+                                       fprintf(stderr, Name
+                                               ": multiple lines in mdadm.conf match\n");
+                       }
+                       return NULL;
+               }
+               match = array_list;
+       }
+       return match;
+}
diff --git a/crc32.c b/crc32.c
new file mode 100644 (file)
index 0000000..12d08e5
--- /dev/null
+++ b/crc32.c
@@ -0,0 +1,340 @@
+/* crc32.c -- compute the CRC-32 of a data stream
+ * Copyright (C) 1995-2003 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ *
+ * Thanks to Rodney Brown <rbrown64@csc.com.au> for his contribution of faster
+ * CRC methods: exclusive-oring 32 bits of data at a time, and pre-computing
+ * tables for updating the shift register in one step with three exclusive-ors
+ * instead of four steps with four exclusive-ors.  This results in about a factor
+ * of two increase in speed on a Power PC G4 (PPC7455) using gcc -O3.
+ */
+
+/* @(#) $Id$ */
+
+/*
+  Note on the use of DYNAMIC_CRC_TABLE: there is no mutex or semaphore
+  protection on the static variables used to control the first-use generation
+  of the crc tables.  Therefore, if you #define DYNAMIC_CRC_TABLE, you should
+  first call get_crc_table() to initialize the tables before allowing more than
+  one thread to use crc32().
+ */
+
+#ifdef MAKECRCH
+#  include <stdio.h>
+#  ifndef DYNAMIC_CRC_TABLE
+#    define DYNAMIC_CRC_TABLE
+#  endif /* !DYNAMIC_CRC_TABLE */
+#endif /* MAKECRCH */
+
+/* #include "zutil.h"      / * for STDC and FAR definitions */
+/* zutil.h is not included; the few definitions this file needs from it
+ * are supplied directly below.
+ */
+#define STDC
+#define FAR
+#define Z_NULL ((void*)0)
+#define OF(X) X
+#define ZEXPORT
+typedef long ptrdiff_t;
+/* NOBYFOUR keeps BYFOUR undefined below, so only the byte-at-a-time
+ * code is compiled and TBLS is 1.
+ */
+#define NOBYFOUR
+
+#define local static
+
+/* Find a four-byte integer type for crc32_little() and crc32_big(). */
+#ifndef NOBYFOUR
+#  ifdef STDC           /* need ANSI C limits.h to determine sizes */
+#    include <limits.h>
+#    define BYFOUR
+#    if (UINT_MAX == 0xffffffffUL)
+       typedef unsigned int u4;
+#    else
+#      if (ULONG_MAX == 0xffffffffUL)
+         typedef unsigned long u4;
+#      else
+#        if (USHRT_MAX == 0xffffffffUL)
+           typedef unsigned short u4;
+#        else
+#          undef BYFOUR     /* can't find a four-byte integer type! */
+#        endif
+#      endif
+#    endif
+#  endif /* STDC */
+#endif /* !NOBYFOUR */
+
+/* Definitions for doing the crc four data bytes at a time. */
+#ifdef BYFOUR
+#  define REV(w) (((w)>>24)+(((w)>>8)&0xff00)+ \
+                (((w)&0xff00)<<8)+(((w)&0xff)<<24))
+   local unsigned long crc32_little OF((unsigned long,
+                        const unsigned char FAR *, unsigned));
+   local unsigned long crc32_big OF((unsigned long,
+                        const unsigned char FAR *, unsigned));
+#  define TBLS 8
+#else
+#  define TBLS 1
+#endif /* BYFOUR */
+
+#ifdef DYNAMIC_CRC_TABLE
+
+local volatile int crc_table_empty = 1;
+local unsigned long FAR crc_table[TBLS][256];
+local void make_crc_table OF((void));
+#ifdef MAKECRCH
+   local void write_table OF((FILE *, const unsigned long FAR *));
+#endif /* MAKECRCH */
+
+/*
+  Generate tables for a byte-wise 32-bit CRC calculation on the polynomial:
+  x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1.
+
+  Polynomials over GF(2) are represented in binary, one bit per coefficient,
+  with the lowest powers in the most significant bit.  Then adding polynomials
+  is just exclusive-or, and multiplying a polynomial by x is a right shift by
+  one.  If we call the above polynomial p, and represent a byte as the
+  polynomial q, also with the lowest power in the most significant bit (so the
+  byte 0xb1 is the polynomial x^7+x^3+x+1), then the CRC is (q*x^32) mod p,
+  where a mod b means the remainder after dividing a by b.
+
+  This calculation is done using the shift-register method of multiplying and
+  taking the remainder.  The register is initialized to zero, and for each
+  incoming bit, x^32 is added mod p to the register if the bit is a one (where
+  x^32 mod p is p+x^32 = x^26+...+1), and the register is multiplied mod p by
+  x (which is shifting right by one and adding x^32 mod p if the bit shifted
+  out is a one).  We start with the highest power (least significant bit) of
+  q and repeat for all eight bits of q.
+
+  The first table is simply the CRC of all possible eight bit values.  This is
+  all the information needed to generate CRCs on data a byte at a time for all
+  combinations of CRC register values and incoming bytes.  The remaining tables
+  allow for word-at-a-time CRC calculation for both big-endian and little-
+  endian machines, where a word is four bytes.
+*/
+local void make_crc_table()
+{
+    /* Fill crc_table on the first call and clear crc_table_empty when done.
+       When MAKECRCH is defined, also write the tables out to crc32.h. */
+    unsigned long c;
+    int n, k;
+    unsigned long poly;                 /* polynomial exclusive-or pattern */
+    /* terms of polynomial defining this crc (except x^32): */
+    static volatile int first = 1;      /* flag to limit concurrent making */
+    static const unsigned char p[] = {0,1,2,4,5,7,8,10,11,12,16,22,23,26};
+
+    /* See if another task is already doing this (not thread-safe, but better
+       than nothing -- significantly reduces duration of vulnerability in
+       case the advice about DYNAMIC_CRC_TABLE is ignored) */
+    if (first) {
+        first = 0;
+
+        /* make exclusive-or pattern from polynomial (0xedb88320UL) */
+        /* the term x^k is stored in bit 31-k */
+        poly = 0UL;
+        for (n = 0; n < sizeof(p)/sizeof(unsigned char); n++)
+            poly |= 1UL << (31 - p[n]);
+
+        /* generate a crc for every 8-bit value */
+        for (n = 0; n < 256; n++) {
+            c = (unsigned long)n;
+            for (k = 0; k < 8; k++)
+                c = c & 1 ? poly ^ (c >> 1) : c >> 1;
+            crc_table[0][n] = c;
+        }
+
+#ifdef BYFOUR
+        /* generate crc for each value followed by one, two, and three zeros,
+           and then the byte reversal of those as well as the first table */
+        for (n = 0; n < 256; n++) {
+            c = crc_table[0][n];
+            crc_table[4][n] = REV(c);
+            for (k = 1; k < 4; k++) {
+                c = crc_table[0][c & 0xff] ^ (c >> 8);
+                crc_table[k][n] = c;
+                crc_table[k + 4][n] = REV(c);
+            }
+        }
+#endif /* BYFOUR */
+
+        crc_table_empty = 0;
+    }
+    else {      /* not first */
+        /* wait for the other guy to finish (not efficient, but rare) */
+        while (crc_table_empty)
+            ;
+    }
+
+#ifdef MAKECRCH
+    /* write out CRC tables to crc32.h */
+    /* NOTE: this section sits outside the "if (first)" test above, so it
+       runs on every call of this function, not only the first one */
+    {
+        FILE *out;
+
+        out = fopen("crc32.h", "w");
+        if (out == NULL) return;        /* give up silently if not writable */
+        fprintf(out, "/* crc32.h -- tables for rapid CRC calculation\n");
+        fprintf(out, " * Generated automatically by crc32.c\n */\n\n");
+        fprintf(out, "local const unsigned long FAR ");
+        fprintf(out, "crc_table[TBLS][256] =\n{\n  {\n");
+        write_table(out, crc_table[0]);
+#  ifdef BYFOUR
+        fprintf(out, "#ifdef BYFOUR\n");
+        for (k = 1; k < 8; k++) {
+            fprintf(out, "  },\n  {\n");
+            write_table(out, crc_table[k]);
+        }
+        fprintf(out, "#endif\n");
+#  endif /* BYFOUR */
+        fprintf(out, "  }\n};\n");
+        fclose(out);
+    }
+#endif /* MAKECRCH */
+}
+
+#ifdef MAKECRCH
+/* Write the 256 entries of 'table' to 'out' as hexadecimal C initializers,
+   five per line, with no comma after the last entry. */
+local void write_table(out, table)
+    FILE *out;
+    const unsigned long FAR *table;
+{
+    int n;
+
+    /* indent the first value of each row of five; end each row with ",\n"
+       and the final value with a bare newline */
+    for (n = 0; n < 256; n++)
+        fprintf(out, "%s0x%08lxUL%s", n % 5 ? "" : "    ", table[n],
+                n == 255 ? "\n" : (n % 5 == 4 ? ",\n" : ", "));
+}
+#endif /* MAKECRCH */
+
+#else /* !DYNAMIC_CRC_TABLE */
+/* ========================================================================
+ * Tables of CRC-32s of all single-byte values, made by make_crc_table().
+ */
+#include "crc32.h"
+#endif /* DYNAMIC_CRC_TABLE */
+
+/* =========================================================================
+ * This function can be used by asm versions of crc32()
+ */
+/* Return crc_table as a flat pointer to its first entry.
+ * With DYNAMIC_CRC_TABLE the table is built here if it is still empty;
+ * otherwise it is the constant table included from crc32.h.
+ */
+const unsigned long FAR * ZEXPORT get_crc_table(void)
+{
+#ifdef DYNAMIC_CRC_TABLE
+    if (crc_table_empty)
+        make_crc_table();
+#endif /* DYNAMIC_CRC_TABLE */
+    return (const unsigned long FAR *)crc_table;
+}
+
+/* ========================================================================= */
+#define DO1 crc = crc_table[0][((int)crc ^ (*buf++)) & 0xff] ^ (crc >> 8)
+#define DO8 DO1; DO1; DO1; DO1; DO1; DO1; DO1; DO1
+
+/* ========================================================================= */
+/* Update the running CRC value 'crc' with 'len' bytes from 'buf' and
+ * return the new value.  Returns 0 if buf is NULL.
+ *
+ * In the byte-at-a-time path below the inversion of crc on entry and on
+ * exit is commented out, so any such conditioning is left to the caller.
+ * NOTE(review): the BYFOUR helpers crc32_little()/crc32_big() still invert
+ * the value themselves, so the two paths would disagree if NOBYFOUR were
+ * ever removed - confirm before enabling BYFOUR.
+ */
+unsigned long ZEXPORT crc32(
+       unsigned long crc,
+       const unsigned char FAR *buf,
+       unsigned len)
+{
+    if (buf == Z_NULL) return 0UL;
+
+#ifdef DYNAMIC_CRC_TABLE
+    if (crc_table_empty)
+        make_crc_table();
+#endif /* DYNAMIC_CRC_TABLE */
+
+#ifdef BYFOUR
+    if (sizeof(void *) == sizeof(ptrdiff_t)) {
+        u4 endian;
+
+        /* pick the helper by looking at the first byte of the value 1 */
+        endian = 1;
+        if (*((unsigned char *)(&endian)))
+            return crc32_little(crc, buf, len);
+        else
+            return crc32_big(crc, buf, len);
+    }
+#endif /* BYFOUR */
+/*    crc = crc ^ 0xffffffffUL;*/
+    /* eight bytes per iteration, then the remaining 0-7 bytes one by one */
+    while (len >= 8) {
+        DO8;
+        len -= 8;
+    }
+    if (len) do {
+        DO1;
+    } while (--len);
+    return crc /* ^ 0xffffffffUL*/;
+}
+
+#ifdef BYFOUR
+
+/* ========================================================================= */
+#define DOLIT4 c ^= *buf4++; \
+        c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
+            crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24]
+#define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4
+
+/* ========================================================================= */
+/* Word-at-a-time CRC update, called by crc32() when the first byte of a
+   u4 holding 1 is non-zero.  Only compiled when BYFOUR is defined.
+   The value is inverted on entry and again before it is returned. */
+local unsigned long crc32_little(crc, buf, len)
+    unsigned long crc;
+    const unsigned char FAR *buf;
+    unsigned len;
+{
+    register u4 c;
+    register const u4 FAR *buf4;
+
+    c = (u4)crc;
+    c = ~c;
+    /* consume single bytes until buf is a multiple of four */
+    while (len && ((ptrdiff_t)buf & 3)) {
+        c = crc_table[0][(c ^ *buf++) & 0xff] ^ (c >> 8);
+        len--;
+    }
+
+    /* then 32 bytes, and after that 4 bytes, per iteration */
+    buf4 = (const u4 FAR *)buf;
+    while (len >= 32) {
+        DOLIT32;
+        len -= 32;
+    }
+    while (len >= 4) {
+        DOLIT4;
+        len -= 4;
+    }
+    buf = (const unsigned char FAR *)buf4;
+
+    /* finish the remaining 0-3 bytes one at a time */
+    if (len) do {
+        c = crc_table[0][(c ^ *buf++) & 0xff] ^ (c >> 8);
+    } while (--len);
+    c = ~c;
+    return (unsigned long)c;
+}
+
+/* ========================================================================= */
+#define DOBIG4 c ^= *++buf4; \
+        c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
+            crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
+#define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4
+
+/* ========================================================================= */
+/* Word-at-a-time CRC update, called by crc32() when the first byte of a
+   u4 holding 1 is zero.  Only compiled when BYFOUR is defined.
+   Works on the byte-reversed register (REV) using tables 4..7, and
+   inverts the value on entry and again before it is returned. */
+local unsigned long crc32_big(crc, buf, len)
+    unsigned long crc;
+    const unsigned char FAR *buf;
+    unsigned len;
+{
+    register u4 c;
+    register const u4 FAR *buf4;
+
+    c = REV((u4)crc);
+    c = ~c;
+    /* consume single bytes until buf is a multiple of four */
+    while (len && ((ptrdiff_t)buf & 3)) {
+        c = crc_table[4][(c >> 24) ^ *buf++] ^ (c << 8);
+        len--;
+    }
+
+    buf4 = (const u4 FAR *)buf;
+    buf4--;     /* DOBIG4 pre-increments buf4, so start one word early */
+    while (len >= 32) {
+        DOBIG32;
+        len -= 32;
+    }
+    while (len >= 4) {
+        DOBIG4;
+        len -= 4;
+    }
+    buf4++;     /* undo the initial decrement: point at the next unread word */
+    buf = (const unsigned char FAR *)buf4;
+
+    /* finish the remaining 0-3 bytes one at a time */
+    if (len) do {
+        c = crc_table[4][(c >> 24) ^ *buf++] ^ (c << 8);
+    } while (--len);
+    c = ~c;
+    return (unsigned long)(REV(c));
+}
+
+#endif /* BYFOUR */
diff --git a/crc32.h b/crc32.h
new file mode 100644 (file)
index 0000000..8053b61
--- /dev/null
+++ b/crc32.h
@@ -0,0 +1,441 @@
+/* crc32.h -- tables for rapid CRC calculation
+ * Generated automatically by crc32.c
+ */
+
+local const unsigned long FAR crc_table[TBLS][256] =
+{
+  {
+    0x00000000UL, 0x77073096UL, 0xee0e612cUL, 0x990951baUL, 0x076dc419UL,
+    0x706af48fUL, 0xe963a535UL, 0x9e6495a3UL, 0x0edb8832UL, 0x79dcb8a4UL,
+    0xe0d5e91eUL, 0x97d2d988UL, 0x09b64c2bUL, 0x7eb17cbdUL, 0xe7b82d07UL,
+    0x90bf1d91UL, 0x1db71064UL, 0x6ab020f2UL, 0xf3b97148UL, 0x84be41deUL,
+    0x1adad47dUL, 0x6ddde4ebUL, 0xf4d4b551UL, 0x83d385c7UL, 0x136c9856UL,
+    0x646ba8c0UL, 0xfd62f97aUL, 0x8a65c9ecUL, 0x14015c4fUL, 0x63066cd9UL,
+    0xfa0f3d63UL, 0x8d080df5UL, 0x3b6e20c8UL, 0x4c69105eUL, 0xd56041e4UL,
+    0xa2677172UL, 0x3c03e4d1UL, 0x4b04d447UL, 0xd20d85fdUL, 0xa50ab56bUL,
+    0x35b5a8faUL, 0x42b2986cUL, 0xdbbbc9d6UL, 0xacbcf940UL, 0x32d86ce3UL,
+    0x45df5c75UL, 0xdcd60dcfUL, 0xabd13d59UL, 0x26d930acUL, 0x51de003aUL,
+    0xc8d75180UL, 0xbfd06116UL, 0x21b4f4b5UL, 0x56b3c423UL, 0xcfba9599UL,
+    0xb8bda50fUL, 0x2802b89eUL, 0x5f058808UL, 0xc60cd9b2UL, 0xb10be924UL,
+    0x2f6f7c87UL, 0x58684c11UL, 0xc1611dabUL, 0xb6662d3dUL, 0x76dc4190UL,
+    0x01db7106UL, 0x98d220bcUL, 0xefd5102aUL, 0x71b18589UL, 0x06b6b51fUL,
+    0x9fbfe4a5UL, 0xe8b8d433UL, 0x7807c9a2UL, 0x0f00f934UL, 0x9609a88eUL,
+    0xe10e9818UL, 0x7f6a0dbbUL, 0x086d3d2dUL, 0x91646c97UL, 0xe6635c01UL,
+    0x6b6b51f4UL, 0x1c6c6162UL, 0x856530d8UL, 0xf262004eUL, 0x6c0695edUL,
+    0x1b01a57bUL, 0x8208f4c1UL, 0xf50fc457UL, 0x65b0d9c6UL, 0x12b7e950UL,
+    0x8bbeb8eaUL, 0xfcb9887cUL, 0x62dd1ddfUL, 0x15da2d49UL, 0x8cd37cf3UL,
+    0xfbd44c65UL, 0x4db26158UL, 0x3ab551ceUL, 0xa3bc0074UL, 0xd4bb30e2UL,
+    0x4adfa541UL, 0x3dd895d7UL, 0xa4d1c46dUL, 0xd3d6f4fbUL, 0x4369e96aUL,
+    0x346ed9fcUL, 0xad678846UL, 0xda60b8d0UL, 0x44042d73UL, 0x33031de5UL,
+    0xaa0a4c5fUL, 0xdd0d7cc9UL, 0x5005713cUL, 0x270241aaUL, 0xbe0b1010UL,
+    0xc90c2086UL, 0x5768b525UL, 0x206f85b3UL, 0xb966d409UL, 0xce61e49fUL,
+    0x5edef90eUL, 0x29d9c998UL, 0xb0d09822UL, 0xc7d7a8b4UL, 0x59b33d17UL,
+    0x2eb40d81UL, 0xb7bd5c3bUL, 0xc0ba6cadUL, 0xedb88320UL, 0x9abfb3b6UL,
+    0x03b6e20cUL, 0x74b1d29aUL, 0xead54739UL, 0x9dd277afUL, 0x04db2615UL,
+    0x73dc1683UL, 0xe3630b12UL, 0x94643b84UL, 0x0d6d6a3eUL, 0x7a6a5aa8UL,
+    0xe40ecf0bUL, 0x9309ff9dUL, 0x0a00ae27UL, 0x7d079eb1UL, 0xf00f9344UL,
+    0x8708a3d2UL, 0x1e01f268UL, 0x6906c2feUL, 0xf762575dUL, 0x806567cbUL,
+    0x196c3671UL, 0x6e6b06e7UL, 0xfed41b76UL, 0x89d32be0UL, 0x10da7a5aUL,
+    0x67dd4accUL, 0xf9b9df6fUL, 0x8ebeeff9UL, 0x17b7be43UL, 0x60b08ed5UL,
+    0xd6d6a3e8UL, 0xa1d1937eUL, 0x38d8c2c4UL, 0x4fdff252UL, 0xd1bb67f1UL,
+    0xa6bc5767UL, 0x3fb506ddUL, 0x48b2364bUL, 0xd80d2bdaUL, 0xaf0a1b4cUL,
+    0x36034af6UL, 0x41047a60UL, 0xdf60efc3UL, 0xa867df55UL, 0x316e8eefUL,
+    0x4669be79UL, 0xcb61b38cUL, 0xbc66831aUL, 0x256fd2a0UL, 0x5268e236UL,
+    0xcc0c7795UL, 0xbb0b4703UL, 0x220216b9UL, 0x5505262fUL, 0xc5ba3bbeUL,
+    0xb2bd0b28UL, 0x2bb45a92UL, 0x5cb36a04UL, 0xc2d7ffa7UL, 0xb5d0cf31UL,
+    0x2cd99e8bUL, 0x5bdeae1dUL, 0x9b64c2b0UL, 0xec63f226UL, 0x756aa39cUL,
+    0x026d930aUL, 0x9c0906a9UL, 0xeb0e363fUL, 0x72076785UL, 0x05005713UL,
+    0x95bf4a82UL, 0xe2b87a14UL, 0x7bb12baeUL, 0x0cb61b38UL, 0x92d28e9bUL,
+    0xe5d5be0dUL, 0x7cdcefb7UL, 0x0bdbdf21UL, 0x86d3d2d4UL, 0xf1d4e242UL,
+    0x68ddb3f8UL, 0x1fda836eUL, 0x81be16cdUL, 0xf6b9265bUL, 0x6fb077e1UL,
+    0x18b74777UL, 0x88085ae6UL, 0xff0f6a70UL, 0x66063bcaUL, 0x11010b5cUL,
+    0x8f659effUL, 0xf862ae69UL, 0x616bffd3UL, 0x166ccf45UL, 0xa00ae278UL,
+    0xd70dd2eeUL, 0x4e048354UL, 0x3903b3c2UL, 0xa7672661UL, 0xd06016f7UL,
+    0x4969474dUL, 0x3e6e77dbUL, 0xaed16a4aUL, 0xd9d65adcUL, 0x40df0b66UL,
+    0x37d83bf0UL, 0xa9bcae53UL, 0xdebb9ec5UL, 0x47b2cf7fUL, 0x30b5ffe9UL,
+    0xbdbdf21cUL, 0xcabac28aUL, 0x53b39330UL, 0x24b4a3a6UL, 0xbad03605UL,
+    0xcdd70693UL, 0x54de5729UL, 0x23d967bfUL, 0xb3667a2eUL, 0xc4614ab8UL,
+    0x5d681b02UL, 0x2a6f2b94UL, 0xb40bbe37UL, 0xc30c8ea1UL, 0x5a05df1bUL,
+    0x2d02ef8dUL
+#ifdef BYFOUR
+  },
+  {
+    0x00000000UL, 0x191b3141UL, 0x32366282UL, 0x2b2d53c3UL, 0x646cc504UL,
+    0x7d77f445UL, 0x565aa786UL, 0x4f4196c7UL, 0xc8d98a08UL, 0xd1c2bb49UL,
+    0xfaefe88aUL, 0xe3f4d9cbUL, 0xacb54f0cUL, 0xb5ae7e4dUL, 0x9e832d8eUL,
+    0x87981ccfUL, 0x4ac21251UL, 0x53d92310UL, 0x78f470d3UL, 0x61ef4192UL,
+    0x2eaed755UL, 0x37b5e614UL, 0x1c98b5d7UL, 0x05838496UL, 0x821b9859UL,
+    0x9b00a918UL, 0xb02dfadbUL, 0xa936cb9aUL, 0xe6775d5dUL, 0xff6c6c1cUL,
+    0xd4413fdfUL, 0xcd5a0e9eUL, 0x958424a2UL, 0x8c9f15e3UL, 0xa7b24620UL,
+    0xbea97761UL, 0xf1e8e1a6UL, 0xe8f3d0e7UL, 0xc3de8324UL, 0xdac5b265UL,
+    0x5d5daeaaUL, 0x44469febUL, 0x6f6bcc28UL, 0x7670fd69UL, 0x39316baeUL,
+    0x202a5aefUL, 0x0b07092cUL, 0x121c386dUL, 0xdf4636f3UL, 0xc65d07b2UL,
+    0xed705471UL, 0xf46b6530UL, 0xbb2af3f7UL, 0xa231c2b6UL, 0x891c9175UL,
+    0x9007a034UL, 0x179fbcfbUL, 0x0e848dbaUL, 0x25a9de79UL, 0x3cb2ef38UL,
+    0x73f379ffUL, 0x6ae848beUL, 0x41c51b7dUL, 0x58de2a3cUL, 0xf0794f05UL,
+    0xe9627e44UL, 0xc24f2d87UL, 0xdb541cc6UL, 0x94158a01UL, 0x8d0ebb40UL,
+    0xa623e883UL, 0xbf38d9c2UL, 0x38a0c50dUL, 0x21bbf44cUL, 0x0a96a78fUL,
+    0x138d96ceUL, 0x5ccc0009UL, 0x45d73148UL, 0x6efa628bUL, 0x77e153caUL,
+    0xbabb5d54UL, 0xa3a06c15UL, 0x888d3fd6UL, 0x91960e97UL, 0xded79850UL,
+    0xc7cca911UL, 0xece1fad2UL, 0xf5facb93UL, 0x7262d75cUL, 0x6b79e61dUL,
+    0x4054b5deUL, 0x594f849fUL, 0x160e1258UL, 0x0f152319UL, 0x243870daUL,
+    0x3d23419bUL, 0x65fd6ba7UL, 0x7ce65ae6UL, 0x57cb0925UL, 0x4ed03864UL,
+    0x0191aea3UL, 0x188a9fe2UL, 0x33a7cc21UL, 0x2abcfd60UL, 0xad24e1afUL,
+    0xb43fd0eeUL, 0x9f12832dUL, 0x8609b26cUL, 0xc94824abUL, 0xd05315eaUL,
+    0xfb7e4629UL, 0xe2657768UL, 0x2f3f79f6UL, 0x362448b7UL, 0x1d091b74UL,
+    0x04122a35UL, 0x4b53bcf2UL, 0x52488db3UL, 0x7965de70UL, 0x607eef31UL,
+    0xe7e6f3feUL, 0xfefdc2bfUL, 0xd5d0917cUL, 0xcccba03dUL, 0x838a36faUL,
+    0x9a9107bbUL, 0xb1bc5478UL, 0xa8a76539UL, 0x3b83984bUL, 0x2298a90aUL,
+    0x09b5fac9UL, 0x10aecb88UL, 0x5fef5d4fUL, 0x46f46c0eUL, 0x6dd93fcdUL,
+    0x74c20e8cUL, 0xf35a1243UL, 0xea412302UL, 0xc16c70c1UL, 0xd8774180UL,
+    0x9736d747UL, 0x8e2de606UL, 0xa500b5c5UL, 0xbc1b8484UL, 0x71418a1aUL,
+    0x685abb5bUL, 0x4377e898UL, 0x5a6cd9d9UL, 0x152d4f1eUL, 0x0c367e5fUL,
+    0x271b2d9cUL, 0x3e001cddUL, 0xb9980012UL, 0xa0833153UL, 0x8bae6290UL,
+    0x92b553d1UL, 0xddf4c516UL, 0xc4eff457UL, 0xefc2a794UL, 0xf6d996d5UL,
+    0xae07bce9UL, 0xb71c8da8UL, 0x9c31de6bUL, 0x852aef2aUL, 0xca6b79edUL,
+    0xd37048acUL, 0xf85d1b6fUL, 0xe1462a2eUL, 0x66de36e1UL, 0x7fc507a0UL,
+    0x54e85463UL, 0x4df36522UL, 0x02b2f3e5UL, 0x1ba9c2a4UL, 0x30849167UL,
+    0x299fa026UL, 0xe4c5aeb8UL, 0xfdde9ff9UL, 0xd6f3cc3aUL, 0xcfe8fd7bUL,
+    0x80a96bbcUL, 0x99b25afdUL, 0xb29f093eUL, 0xab84387fUL, 0x2c1c24b0UL,
+    0x350715f1UL, 0x1e2a4632UL, 0x07317773UL, 0x4870e1b4UL, 0x516bd0f5UL,
+    0x7a468336UL, 0x635db277UL, 0xcbfad74eUL, 0xd2e1e60fUL, 0xf9ccb5ccUL,
+    0xe0d7848dUL, 0xaf96124aUL, 0xb68d230bUL, 0x9da070c8UL, 0x84bb4189UL,
+    0x03235d46UL, 0x1a386c07UL, 0x31153fc4UL, 0x280e0e85UL, 0x674f9842UL,
+    0x7e54a903UL, 0x5579fac0UL, 0x4c62cb81UL, 0x8138c51fUL, 0x9823f45eUL,
+    0xb30ea79dUL, 0xaa1596dcUL, 0xe554001bUL, 0xfc4f315aUL, 0xd7626299UL,
+    0xce7953d8UL, 0x49e14f17UL, 0x50fa7e56UL, 0x7bd72d95UL, 0x62cc1cd4UL,
+    0x2d8d8a13UL, 0x3496bb52UL, 0x1fbbe891UL, 0x06a0d9d0UL, 0x5e7ef3ecUL,
+    0x4765c2adUL, 0x6c48916eUL, 0x7553a02fUL, 0x3a1236e8UL, 0x230907a9UL,
+    0x0824546aUL, 0x113f652bUL, 0x96a779e4UL, 0x8fbc48a5UL, 0xa4911b66UL,
+    0xbd8a2a27UL, 0xf2cbbce0UL, 0xebd08da1UL, 0xc0fdde62UL, 0xd9e6ef23UL,
+    0x14bce1bdUL, 0x0da7d0fcUL, 0x268a833fUL, 0x3f91b27eUL, 0x70d024b9UL,
+    0x69cb15f8UL, 0x42e6463bUL, 0x5bfd777aUL, 0xdc656bb5UL, 0xc57e5af4UL,
+    0xee530937UL, 0xf7483876UL, 0xb809aeb1UL, 0xa1129ff0UL, 0x8a3fcc33UL,
+    0x9324fd72UL
+  },
+  {
+    0x00000000UL, 0x01c26a37UL, 0x0384d46eUL, 0x0246be59UL, 0x0709a8dcUL,
+    0x06cbc2ebUL, 0x048d7cb2UL, 0x054f1685UL, 0x0e1351b8UL, 0x0fd13b8fUL,
+    0x0d9785d6UL, 0x0c55efe1UL, 0x091af964UL, 0x08d89353UL, 0x0a9e2d0aUL,
+    0x0b5c473dUL, 0x1c26a370UL, 0x1de4c947UL, 0x1fa2771eUL, 0x1e601d29UL,
+    0x1b2f0bacUL, 0x1aed619bUL, 0x18abdfc2UL, 0x1969b5f5UL, 0x1235f2c8UL,
+    0x13f798ffUL, 0x11b126a6UL, 0x10734c91UL, 0x153c5a14UL, 0x14fe3023UL,
+    0x16b88e7aUL, 0x177ae44dUL, 0x384d46e0UL, 0x398f2cd7UL, 0x3bc9928eUL,
+    0x3a0bf8b9UL, 0x3f44ee3cUL, 0x3e86840bUL, 0x3cc03a52UL, 0x3d025065UL,
+    0x365e1758UL, 0x379c7d6fUL, 0x35dac336UL, 0x3418a901UL, 0x3157bf84UL,
+    0x3095d5b3UL, 0x32d36beaUL, 0x331101ddUL, 0x246be590UL, 0x25a98fa7UL,
+    0x27ef31feUL, 0x262d5bc9UL, 0x23624d4cUL, 0x22a0277bUL, 0x20e69922UL,
+    0x2124f315UL, 0x2a78b428UL, 0x2bbade1fUL, 0x29fc6046UL, 0x283e0a71UL,
+    0x2d711cf4UL, 0x2cb376c3UL, 0x2ef5c89aUL, 0x2f37a2adUL, 0x709a8dc0UL,
+    0x7158e7f7UL, 0x731e59aeUL, 0x72dc3399UL, 0x7793251cUL, 0x76514f2bUL,
+    0x7417f172UL, 0x75d59b45UL, 0x7e89dc78UL, 0x7f4bb64fUL, 0x7d0d0816UL,
+    0x7ccf6221UL, 0x798074a4UL, 0x78421e93UL, 0x7a04a0caUL, 0x7bc6cafdUL,
+    0x6cbc2eb0UL, 0x6d7e4487UL, 0x6f38fadeUL, 0x6efa90e9UL, 0x6bb5866cUL,
+    0x6a77ec5bUL, 0x68315202UL, 0x69f33835UL, 0x62af7f08UL, 0x636d153fUL,
+    0x612bab66UL, 0x60e9c151UL, 0x65a6d7d4UL, 0x6464bde3UL, 0x662203baUL,
+    0x67e0698dUL, 0x48d7cb20UL, 0x4915a117UL, 0x4b531f4eUL, 0x4a917579UL,
+    0x4fde63fcUL, 0x4e1c09cbUL, 0x4c5ab792UL, 0x4d98dda5UL, 0x46c49a98UL,
+    0x4706f0afUL, 0x45404ef6UL, 0x448224c1UL, 0x41cd3244UL, 0x400f5873UL,
+    0x4249e62aUL, 0x438b8c1dUL, 0x54f16850UL, 0x55330267UL, 0x5775bc3eUL,
+    0x56b7d609UL, 0x53f8c08cUL, 0x523aaabbUL, 0x507c14e2UL, 0x51be7ed5UL,
+    0x5ae239e8UL, 0x5b2053dfUL, 0x5966ed86UL, 0x58a487b1UL, 0x5deb9134UL,
+    0x5c29fb03UL, 0x5e6f455aUL, 0x5fad2f6dUL, 0xe1351b80UL, 0xe0f771b7UL,
+    0xe2b1cfeeUL, 0xe373a5d9UL, 0xe63cb35cUL, 0xe7fed96bUL, 0xe5b86732UL,
+    0xe47a0d05UL, 0xef264a38UL, 0xeee4200fUL, 0xeca29e56UL, 0xed60f461UL,
+    0xe82fe2e4UL, 0xe9ed88d3UL, 0xebab368aUL, 0xea695cbdUL, 0xfd13b8f0UL,
+    0xfcd1d2c7UL, 0xfe976c9eUL, 0xff5506a9UL, 0xfa1a102cUL, 0xfbd87a1bUL,
+    0xf99ec442UL, 0xf85cae75UL, 0xf300e948UL, 0xf2c2837fUL, 0xf0843d26UL,
+    0xf1465711UL, 0xf4094194UL, 0xf5cb2ba3UL, 0xf78d95faUL, 0xf64fffcdUL,
+    0xd9785d60UL, 0xd8ba3757UL, 0xdafc890eUL, 0xdb3ee339UL, 0xde71f5bcUL,
+    0xdfb39f8bUL, 0xddf521d2UL, 0xdc374be5UL, 0xd76b0cd8UL, 0xd6a966efUL,
+    0xd4efd8b6UL, 0xd52db281UL, 0xd062a404UL, 0xd1a0ce33UL, 0xd3e6706aUL,
+    0xd2241a5dUL, 0xc55efe10UL, 0xc49c9427UL, 0xc6da2a7eUL, 0xc7184049UL,
+    0xc25756ccUL, 0xc3953cfbUL, 0xc1d382a2UL, 0xc011e895UL, 0xcb4dafa8UL,
+    0xca8fc59fUL, 0xc8c97bc6UL, 0xc90b11f1UL, 0xcc440774UL, 0xcd866d43UL,
+    0xcfc0d31aUL, 0xce02b92dUL, 0x91af9640UL, 0x906dfc77UL, 0x922b422eUL,
+    0x93e92819UL, 0x96a63e9cUL, 0x976454abUL, 0x9522eaf2UL, 0x94e080c5UL,
+    0x9fbcc7f8UL, 0x9e7eadcfUL, 0x9c381396UL, 0x9dfa79a1UL, 0x98b56f24UL,
+    0x99770513UL, 0x9b31bb4aUL, 0x9af3d17dUL, 0x8d893530UL, 0x8c4b5f07UL,
+    0x8e0de15eUL, 0x8fcf8b69UL, 0x8a809decUL, 0x8b42f7dbUL, 0x89044982UL,
+    0x88c623b5UL, 0x839a6488UL, 0x82580ebfUL, 0x801eb0e6UL, 0x81dcdad1UL,
+    0x8493cc54UL, 0x8551a663UL, 0x8717183aUL, 0x86d5720dUL, 0xa9e2d0a0UL,
+    0xa820ba97UL, 0xaa6604ceUL, 0xaba46ef9UL, 0xaeeb787cUL, 0xaf29124bUL,
+    0xad6fac12UL, 0xacadc625UL, 0xa7f18118UL, 0xa633eb2fUL, 0xa4755576UL,
+    0xa5b73f41UL, 0xa0f829c4UL, 0xa13a43f3UL, 0xa37cfdaaUL, 0xa2be979dUL,
+    0xb5c473d0UL, 0xb40619e7UL, 0xb640a7beUL, 0xb782cd89UL, 0xb2cddb0cUL,
+    0xb30fb13bUL, 0xb1490f62UL, 0xb08b6555UL, 0xbbd72268UL, 0xba15485fUL,
+    0xb853f606UL, 0xb9919c31UL, 0xbcde8ab4UL, 0xbd1ce083UL, 0xbf5a5edaUL,
+    0xbe9834edUL
+  },
+  {
+    0x00000000UL, 0xb8bc6765UL, 0xaa09c88bUL, 0x12b5afeeUL, 0x8f629757UL,
+    0x37def032UL, 0x256b5fdcUL, 0x9dd738b9UL, 0xc5b428efUL, 0x7d084f8aUL,
+    0x6fbde064UL, 0xd7018701UL, 0x4ad6bfb8UL, 0xf26ad8ddUL, 0xe0df7733UL,
+    0x58631056UL, 0x5019579fUL, 0xe8a530faUL, 0xfa109f14UL, 0x42acf871UL,
+    0xdf7bc0c8UL, 0x67c7a7adUL, 0x75720843UL, 0xcdce6f26UL, 0x95ad7f70UL,
+    0x2d111815UL, 0x3fa4b7fbUL, 0x8718d09eUL, 0x1acfe827UL, 0xa2738f42UL,
+    0xb0c620acUL, 0x087a47c9UL, 0xa032af3eUL, 0x188ec85bUL, 0x0a3b67b5UL,
+    0xb28700d0UL, 0x2f503869UL, 0x97ec5f0cUL, 0x8559f0e2UL, 0x3de59787UL,
+    0x658687d1UL, 0xdd3ae0b4UL, 0xcf8f4f5aUL, 0x7733283fUL, 0xeae41086UL,
+    0x525877e3UL, 0x40edd80dUL, 0xf851bf68UL, 0xf02bf8a1UL, 0x48979fc4UL,
+    0x5a22302aUL, 0xe29e574fUL, 0x7f496ff6UL, 0xc7f50893UL, 0xd540a77dUL,
+    0x6dfcc018UL, 0x359fd04eUL, 0x8d23b72bUL, 0x9f9618c5UL, 0x272a7fa0UL,
+    0xbafd4719UL, 0x0241207cUL, 0x10f48f92UL, 0xa848e8f7UL, 0x9b14583dUL,
+    0x23a83f58UL, 0x311d90b6UL, 0x89a1f7d3UL, 0x1476cf6aUL, 0xaccaa80fUL,
+    0xbe7f07e1UL, 0x06c36084UL, 0x5ea070d2UL, 0xe61c17b7UL, 0xf4a9b859UL,
+    0x4c15df3cUL, 0xd1c2e785UL, 0x697e80e0UL, 0x7bcb2f0eUL, 0xc377486bUL,
+    0xcb0d0fa2UL, 0x73b168c7UL, 0x6104c729UL, 0xd9b8a04cUL, 0x446f98f5UL,
+    0xfcd3ff90UL, 0xee66507eUL, 0x56da371bUL, 0x0eb9274dUL, 0xb6054028UL,
+    0xa4b0efc6UL, 0x1c0c88a3UL, 0x81dbb01aUL, 0x3967d77fUL, 0x2bd27891UL,
+    0x936e1ff4UL, 0x3b26f703UL, 0x839a9066UL, 0x912f3f88UL, 0x299358edUL,
+    0xb4446054UL, 0x0cf80731UL, 0x1e4da8dfUL, 0xa6f1cfbaUL, 0xfe92dfecUL,
+    0x462eb889UL, 0x549b1767UL, 0xec277002UL, 0x71f048bbUL, 0xc94c2fdeUL,
+    0xdbf98030UL, 0x6345e755UL, 0x6b3fa09cUL, 0xd383c7f9UL, 0xc1366817UL,
+    0x798a0f72UL, 0xe45d37cbUL, 0x5ce150aeUL, 0x4e54ff40UL, 0xf6e89825UL,
+    0xae8b8873UL, 0x1637ef16UL, 0x048240f8UL, 0xbc3e279dUL, 0x21e91f24UL,
+    0x99557841UL, 0x8be0d7afUL, 0x335cb0caUL, 0xed59b63bUL, 0x55e5d15eUL,
+    0x47507eb0UL, 0xffec19d5UL, 0x623b216cUL, 0xda874609UL, 0xc832e9e7UL,
+    0x708e8e82UL, 0x28ed9ed4UL, 0x9051f9b1UL, 0x82e4565fUL, 0x3a58313aUL,
+    0xa78f0983UL, 0x1f336ee6UL, 0x0d86c108UL, 0xb53aa66dUL, 0xbd40e1a4UL,
+    0x05fc86c1UL, 0x1749292fUL, 0xaff54e4aUL, 0x322276f3UL, 0x8a9e1196UL,
+    0x982bbe78UL, 0x2097d91dUL, 0x78f4c94bUL, 0xc048ae2eUL, 0xd2fd01c0UL,
+    0x6a4166a5UL, 0xf7965e1cUL, 0x4f2a3979UL, 0x5d9f9697UL, 0xe523f1f2UL,
+    0x4d6b1905UL, 0xf5d77e60UL, 0xe762d18eUL, 0x5fdeb6ebUL, 0xc2098e52UL,
+    0x7ab5e937UL, 0x680046d9UL, 0xd0bc21bcUL, 0x88df31eaUL, 0x3063568fUL,
+    0x22d6f961UL, 0x9a6a9e04UL, 0x07bda6bdUL, 0xbf01c1d8UL, 0xadb46e36UL,
+    0x15080953UL, 0x1d724e9aUL, 0xa5ce29ffUL, 0xb77b8611UL, 0x0fc7e174UL,
+    0x9210d9cdUL, 0x2aacbea8UL, 0x38191146UL, 0x80a57623UL, 0xd8c66675UL,
+    0x607a0110UL, 0x72cfaefeUL, 0xca73c99bUL, 0x57a4f122UL, 0xef189647UL,
+    0xfdad39a9UL, 0x45115eccUL, 0x764dee06UL, 0xcef18963UL, 0xdc44268dUL,
+    0x64f841e8UL, 0xf92f7951UL, 0x41931e34UL, 0x5326b1daUL, 0xeb9ad6bfUL,
+    0xb3f9c6e9UL, 0x0b45a18cUL, 0x19f00e62UL, 0xa14c6907UL, 0x3c9b51beUL,
+    0x842736dbUL, 0x96929935UL, 0x2e2efe50UL, 0x2654b999UL, 0x9ee8defcUL,
+    0x8c5d7112UL, 0x34e11677UL, 0xa9362eceUL, 0x118a49abUL, 0x033fe645UL,
+    0xbb838120UL, 0xe3e09176UL, 0x5b5cf613UL, 0x49e959fdUL, 0xf1553e98UL,
+    0x6c820621UL, 0xd43e6144UL, 0xc68bceaaUL, 0x7e37a9cfUL, 0xd67f4138UL,
+    0x6ec3265dUL, 0x7c7689b3UL, 0xc4caeed6UL, 0x591dd66fUL, 0xe1a1b10aUL,
+    0xf3141ee4UL, 0x4ba87981UL, 0x13cb69d7UL, 0xab770eb2UL, 0xb9c2a15cUL,
+    0x017ec639UL, 0x9ca9fe80UL, 0x241599e5UL, 0x36a0360bUL, 0x8e1c516eUL,
+    0x866616a7UL, 0x3eda71c2UL, 0x2c6fde2cUL, 0x94d3b949UL, 0x090481f0UL,
+    0xb1b8e695UL, 0xa30d497bUL, 0x1bb12e1eUL, 0x43d23e48UL, 0xfb6e592dUL,
+    0xe9dbf6c3UL, 0x516791a6UL, 0xccb0a91fUL, 0x740cce7aUL, 0x66b96194UL,
+    0xde0506f1UL
+  },
+  {
+    0x00000000UL, 0x96300777UL, 0x2c610eeeUL, 0xba510999UL, 0x19c46d07UL,
+    0x8ff46a70UL, 0x35a563e9UL, 0xa395649eUL, 0x3288db0eUL, 0xa4b8dc79UL,
+    0x1ee9d5e0UL, 0x88d9d297UL, 0x2b4cb609UL, 0xbd7cb17eUL, 0x072db8e7UL,
+    0x911dbf90UL, 0x6410b71dUL, 0xf220b06aUL, 0x4871b9f3UL, 0xde41be84UL,
+    0x7dd4da1aUL, 0xebe4dd6dUL, 0x51b5d4f4UL, 0xc785d383UL, 0x56986c13UL,
+    0xc0a86b64UL, 0x7af962fdUL, 0xecc9658aUL, 0x4f5c0114UL, 0xd96c0663UL,
+    0x633d0ffaUL, 0xf50d088dUL, 0xc8206e3bUL, 0x5e10694cUL, 0xe44160d5UL,
+    0x727167a2UL, 0xd1e4033cUL, 0x47d4044bUL, 0xfd850dd2UL, 0x6bb50aa5UL,
+    0xfaa8b535UL, 0x6c98b242UL, 0xd6c9bbdbUL, 0x40f9bcacUL, 0xe36cd832UL,
+    0x755cdf45UL, 0xcf0dd6dcUL, 0x593dd1abUL, 0xac30d926UL, 0x3a00de51UL,
+    0x8051d7c8UL, 0x1661d0bfUL, 0xb5f4b421UL, 0x23c4b356UL, 0x9995bacfUL,
+    0x0fa5bdb8UL, 0x9eb80228UL, 0x0888055fUL, 0xb2d90cc6UL, 0x24e90bb1UL,
+    0x877c6f2fUL, 0x114c6858UL, 0xab1d61c1UL, 0x3d2d66b6UL, 0x9041dc76UL,
+    0x0671db01UL, 0xbc20d298UL, 0x2a10d5efUL, 0x8985b171UL, 0x1fb5b606UL,
+    0xa5e4bf9fUL, 0x33d4b8e8UL, 0xa2c90778UL, 0x34f9000fUL, 0x8ea80996UL,
+    0x18980ee1UL, 0xbb0d6a7fUL, 0x2d3d6d08UL, 0x976c6491UL, 0x015c63e6UL,
+    0xf4516b6bUL, 0x62616c1cUL, 0xd8306585UL, 0x4e0062f2UL, 0xed95066cUL,
+    0x7ba5011bUL, 0xc1f40882UL, 0x57c40ff5UL, 0xc6d9b065UL, 0x50e9b712UL,
+    0xeab8be8bUL, 0x7c88b9fcUL, 0xdf1ddd62UL, 0x492dda15UL, 0xf37cd38cUL,
+    0x654cd4fbUL, 0x5861b24dUL, 0xce51b53aUL, 0x7400bca3UL, 0xe230bbd4UL,
+    0x41a5df4aUL, 0xd795d83dUL, 0x6dc4d1a4UL, 0xfbf4d6d3UL, 0x6ae96943UL,
+    0xfcd96e34UL, 0x468867adUL, 0xd0b860daUL, 0x732d0444UL, 0xe51d0333UL,
+    0x5f4c0aaaUL, 0xc97c0dddUL, 0x3c710550UL, 0xaa410227UL, 0x10100bbeUL,
+    0x86200cc9UL, 0x25b56857UL, 0xb3856f20UL, 0x09d466b9UL, 0x9fe461ceUL,
+    0x0ef9de5eUL, 0x98c9d929UL, 0x2298d0b0UL, 0xb4a8d7c7UL, 0x173db359UL,
+    0x810db42eUL, 0x3b5cbdb7UL, 0xad6cbac0UL, 0x2083b8edUL, 0xb6b3bf9aUL,
+    0x0ce2b603UL, 0x9ad2b174UL, 0x3947d5eaUL, 0xaf77d29dUL, 0x1526db04UL,
+    0x8316dc73UL, 0x120b63e3UL, 0x843b6494UL, 0x3e6a6d0dUL, 0xa85a6a7aUL,
+    0x0bcf0ee4UL, 0x9dff0993UL, 0x27ae000aUL, 0xb19e077dUL, 0x44930ff0UL,
+    0xd2a30887UL, 0x68f2011eUL, 0xfec20669UL, 0x5d5762f7UL, 0xcb676580UL,
+    0x71366c19UL, 0xe7066b6eUL, 0x761bd4feUL, 0xe02bd389UL, 0x5a7ada10UL,
+    0xcc4add67UL, 0x6fdfb9f9UL, 0xf9efbe8eUL, 0x43beb717UL, 0xd58eb060UL,
+    0xe8a3d6d6UL, 0x7e93d1a1UL, 0xc4c2d838UL, 0x52f2df4fUL, 0xf167bbd1UL,
+    0x6757bca6UL, 0xdd06b53fUL, 0x4b36b248UL, 0xda2b0dd8UL, 0x4c1b0aafUL,
+    0xf64a0336UL, 0x607a0441UL, 0xc3ef60dfUL, 0x55df67a8UL, 0xef8e6e31UL,
+    0x79be6946UL, 0x8cb361cbUL, 0x1a8366bcUL, 0xa0d26f25UL, 0x36e26852UL,
+    0x95770cccUL, 0x03470bbbUL, 0xb9160222UL, 0x2f260555UL, 0xbe3bbac5UL,
+    0x280bbdb2UL, 0x925ab42bUL, 0x046ab35cUL, 0xa7ffd7c2UL, 0x31cfd0b5UL,
+    0x8b9ed92cUL, 0x1daede5bUL, 0xb0c2649bUL, 0x26f263ecUL, 0x9ca36a75UL,
+    0x0a936d02UL, 0xa906099cUL, 0x3f360eebUL, 0x85670772UL, 0x13570005UL,
+    0x824abf95UL, 0x147ab8e2UL, 0xae2bb17bUL, 0x381bb60cUL, 0x9b8ed292UL,
+    0x0dbed5e5UL, 0xb7efdc7cUL, 0x21dfdb0bUL, 0xd4d2d386UL, 0x42e2d4f1UL,
+    0xf8b3dd68UL, 0x6e83da1fUL, 0xcd16be81UL, 0x5b26b9f6UL, 0xe177b06fUL,
+    0x7747b718UL, 0xe65a0888UL, 0x706a0fffUL, 0xca3b0666UL, 0x5c0b0111UL,
+    0xff9e658fUL, 0x69ae62f8UL, 0xd3ff6b61UL, 0x45cf6c16UL, 0x78e20aa0UL,
+    0xeed20dd7UL, 0x5483044eUL, 0xc2b30339UL, 0x612667a7UL, 0xf71660d0UL,
+    0x4d476949UL, 0xdb776e3eUL, 0x4a6ad1aeUL, 0xdc5ad6d9UL, 0x660bdf40UL,
+    0xf03bd837UL, 0x53aebca9UL, 0xc59ebbdeUL, 0x7fcfb247UL, 0xe9ffb530UL,
+    0x1cf2bdbdUL, 0x8ac2bacaUL, 0x3093b353UL, 0xa6a3b424UL, 0x0536d0baUL,
+    0x9306d7cdUL, 0x2957de54UL, 0xbf67d923UL, 0x2e7a66b3UL, 0xb84a61c4UL,
+    0x021b685dUL, 0x942b6f2aUL, 0x37be0bb4UL, 0xa18e0cc3UL, 0x1bdf055aUL,
+    0x8def022dUL
+  },
+  {
+    0x00000000UL, 0x41311b19UL, 0x82623632UL, 0xc3532d2bUL, 0x04c56c64UL,
+    0x45f4777dUL, 0x86a75a56UL, 0xc796414fUL, 0x088ad9c8UL, 0x49bbc2d1UL,
+    0x8ae8effaUL, 0xcbd9f4e3UL, 0x0c4fb5acUL, 0x4d7eaeb5UL, 0x8e2d839eUL,
+    0xcf1c9887UL, 0x5112c24aUL, 0x1023d953UL, 0xd370f478UL, 0x9241ef61UL,
+    0x55d7ae2eUL, 0x14e6b537UL, 0xd7b5981cUL, 0x96848305UL, 0x59981b82UL,
+    0x18a9009bUL, 0xdbfa2db0UL, 0x9acb36a9UL, 0x5d5d77e6UL, 0x1c6c6cffUL,
+    0xdf3f41d4UL, 0x9e0e5acdUL, 0xa2248495UL, 0xe3159f8cUL, 0x2046b2a7UL,
+    0x6177a9beUL, 0xa6e1e8f1UL, 0xe7d0f3e8UL, 0x2483dec3UL, 0x65b2c5daUL,
+    0xaaae5d5dUL, 0xeb9f4644UL, 0x28cc6b6fUL, 0x69fd7076UL, 0xae6b3139UL,
+    0xef5a2a20UL, 0x2c09070bUL, 0x6d381c12UL, 0xf33646dfUL, 0xb2075dc6UL,
+    0x715470edUL, 0x30656bf4UL, 0xf7f32abbUL, 0xb6c231a2UL, 0x75911c89UL,
+    0x34a00790UL, 0xfbbc9f17UL, 0xba8d840eUL, 0x79dea925UL, 0x38efb23cUL,
+    0xff79f373UL, 0xbe48e86aUL, 0x7d1bc541UL, 0x3c2ade58UL, 0x054f79f0UL,
+    0x447e62e9UL, 0x872d4fc2UL, 0xc61c54dbUL, 0x018a1594UL, 0x40bb0e8dUL,
+    0x83e823a6UL, 0xc2d938bfUL, 0x0dc5a038UL, 0x4cf4bb21UL, 0x8fa7960aUL,
+    0xce968d13UL, 0x0900cc5cUL, 0x4831d745UL, 0x8b62fa6eUL, 0xca53e177UL,
+    0x545dbbbaUL, 0x156ca0a3UL, 0xd63f8d88UL, 0x970e9691UL, 0x5098d7deUL,
+    0x11a9ccc7UL, 0xd2fae1ecUL, 0x93cbfaf5UL, 0x5cd76272UL, 0x1de6796bUL,
+    0xdeb55440UL, 0x9f844f59UL, 0x58120e16UL, 0x1923150fUL, 0xda703824UL,
+    0x9b41233dUL, 0xa76bfd65UL, 0xe65ae67cUL, 0x2509cb57UL, 0x6438d04eUL,
+    0xa3ae9101UL, 0xe29f8a18UL, 0x21cca733UL, 0x60fdbc2aUL, 0xafe124adUL,
+    0xeed03fb4UL, 0x2d83129fUL, 0x6cb20986UL, 0xab2448c9UL, 0xea1553d0UL,
+    0x29467efbUL, 0x687765e2UL, 0xf6793f2fUL, 0xb7482436UL, 0x741b091dUL,
+    0x352a1204UL, 0xf2bc534bUL, 0xb38d4852UL, 0x70de6579UL, 0x31ef7e60UL,
+    0xfef3e6e7UL, 0xbfc2fdfeUL, 0x7c91d0d5UL, 0x3da0cbccUL, 0xfa368a83UL,
+    0xbb07919aUL, 0x7854bcb1UL, 0x3965a7a8UL, 0x4b98833bUL, 0x0aa99822UL,
+    0xc9fab509UL, 0x88cbae10UL, 0x4f5def5fUL, 0x0e6cf446UL, 0xcd3fd96dUL,
+    0x8c0ec274UL, 0x43125af3UL, 0x022341eaUL, 0xc1706cc1UL, 0x804177d8UL,
+    0x47d73697UL, 0x06e62d8eUL, 0xc5b500a5UL, 0x84841bbcUL, 0x1a8a4171UL,
+    0x5bbb5a68UL, 0x98e87743UL, 0xd9d96c5aUL, 0x1e4f2d15UL, 0x5f7e360cUL,
+    0x9c2d1b27UL, 0xdd1c003eUL, 0x120098b9UL, 0x533183a0UL, 0x9062ae8bUL,
+    0xd153b592UL, 0x16c5f4ddUL, 0x57f4efc4UL, 0x94a7c2efUL, 0xd596d9f6UL,
+    0xe9bc07aeUL, 0xa88d1cb7UL, 0x6bde319cUL, 0x2aef2a85UL, 0xed796bcaUL,
+    0xac4870d3UL, 0x6f1b5df8UL, 0x2e2a46e1UL, 0xe136de66UL, 0xa007c57fUL,
+    0x6354e854UL, 0x2265f34dUL, 0xe5f3b202UL, 0xa4c2a91bUL, 0x67918430UL,
+    0x26a09f29UL, 0xb8aec5e4UL, 0xf99fdefdUL, 0x3accf3d6UL, 0x7bfde8cfUL,
+    0xbc6ba980UL, 0xfd5ab299UL, 0x3e099fb2UL, 0x7f3884abUL, 0xb0241c2cUL,
+    0xf1150735UL, 0x32462a1eUL, 0x73773107UL, 0xb4e17048UL, 0xf5d06b51UL,
+    0x3683467aUL, 0x77b25d63UL, 0x4ed7facbUL, 0x0fe6e1d2UL, 0xccb5ccf9UL,
+    0x8d84d7e0UL, 0x4a1296afUL, 0x0b238db6UL, 0xc870a09dUL, 0x8941bb84UL,
+    0x465d2303UL, 0x076c381aUL, 0xc43f1531UL, 0x850e0e28UL, 0x42984f67UL,
+    0x03a9547eUL, 0xc0fa7955UL, 0x81cb624cUL, 0x1fc53881UL, 0x5ef42398UL,
+    0x9da70eb3UL, 0xdc9615aaUL, 0x1b0054e5UL, 0x5a314ffcUL, 0x996262d7UL,
+    0xd85379ceUL, 0x174fe149UL, 0x567efa50UL, 0x952dd77bUL, 0xd41ccc62UL,
+    0x138a8d2dUL, 0x52bb9634UL, 0x91e8bb1fUL, 0xd0d9a006UL, 0xecf37e5eUL,
+    0xadc26547UL, 0x6e91486cUL, 0x2fa05375UL, 0xe836123aUL, 0xa9070923UL,
+    0x6a542408UL, 0x2b653f11UL, 0xe479a796UL, 0xa548bc8fUL, 0x661b91a4UL,
+    0x272a8abdUL, 0xe0bccbf2UL, 0xa18dd0ebUL, 0x62defdc0UL, 0x23efe6d9UL,
+    0xbde1bc14UL, 0xfcd0a70dUL, 0x3f838a26UL, 0x7eb2913fUL, 0xb924d070UL,
+    0xf815cb69UL, 0x3b46e642UL, 0x7a77fd5bUL, 0xb56b65dcUL, 0xf45a7ec5UL,
+    0x370953eeUL, 0x763848f7UL, 0xb1ae09b8UL, 0xf09f12a1UL, 0x33cc3f8aUL,
+    0x72fd2493UL
+  },
+  {
+    0x00000000UL, 0x376ac201UL, 0x6ed48403UL, 0x59be4602UL, 0xdca80907UL,
+    0xebc2cb06UL, 0xb27c8d04UL, 0x85164f05UL, 0xb851130eUL, 0x8f3bd10fUL,
+    0xd685970dUL, 0xe1ef550cUL, 0x64f91a09UL, 0x5393d808UL, 0x0a2d9e0aUL,
+    0x3d475c0bUL, 0x70a3261cUL, 0x47c9e41dUL, 0x1e77a21fUL, 0x291d601eUL,
+    0xac0b2f1bUL, 0x9b61ed1aUL, 0xc2dfab18UL, 0xf5b56919UL, 0xc8f23512UL,
+    0xff98f713UL, 0xa626b111UL, 0x914c7310UL, 0x145a3c15UL, 0x2330fe14UL,
+    0x7a8eb816UL, 0x4de47a17UL, 0xe0464d38UL, 0xd72c8f39UL, 0x8e92c93bUL,
+    0xb9f80b3aUL, 0x3cee443fUL, 0x0b84863eUL, 0x523ac03cUL, 0x6550023dUL,
+    0x58175e36UL, 0x6f7d9c37UL, 0x36c3da35UL, 0x01a91834UL, 0x84bf5731UL,
+    0xb3d59530UL, 0xea6bd332UL, 0xdd011133UL, 0x90e56b24UL, 0xa78fa925UL,
+    0xfe31ef27UL, 0xc95b2d26UL, 0x4c4d6223UL, 0x7b27a022UL, 0x2299e620UL,
+    0x15f32421UL, 0x28b4782aUL, 0x1fdeba2bUL, 0x4660fc29UL, 0x710a3e28UL,
+    0xf41c712dUL, 0xc376b32cUL, 0x9ac8f52eUL, 0xada2372fUL, 0xc08d9a70UL,
+    0xf7e75871UL, 0xae591e73UL, 0x9933dc72UL, 0x1c259377UL, 0x2b4f5176UL,
+    0x72f11774UL, 0x459bd575UL, 0x78dc897eUL, 0x4fb64b7fUL, 0x16080d7dUL,
+    0x2162cf7cUL, 0xa4748079UL, 0x931e4278UL, 0xcaa0047aUL, 0xfdcac67bUL,
+    0xb02ebc6cUL, 0x87447e6dUL, 0xdefa386fUL, 0xe990fa6eUL, 0x6c86b56bUL,
+    0x5bec776aUL, 0x02523168UL, 0x3538f369UL, 0x087faf62UL, 0x3f156d63UL,
+    0x66ab2b61UL, 0x51c1e960UL, 0xd4d7a665UL, 0xe3bd6464UL, 0xba032266UL,
+    0x8d69e067UL, 0x20cbd748UL, 0x17a11549UL, 0x4e1f534bUL, 0x7975914aUL,
+    0xfc63de4fUL, 0xcb091c4eUL, 0x92b75a4cUL, 0xa5dd984dUL, 0x989ac446UL,
+    0xaff00647UL, 0xf64e4045UL, 0xc1248244UL, 0x4432cd41UL, 0x73580f40UL,
+    0x2ae64942UL, 0x1d8c8b43UL, 0x5068f154UL, 0x67023355UL, 0x3ebc7557UL,
+    0x09d6b756UL, 0x8cc0f853UL, 0xbbaa3a52UL, 0xe2147c50UL, 0xd57ebe51UL,
+    0xe839e25aUL, 0xdf53205bUL, 0x86ed6659UL, 0xb187a458UL, 0x3491eb5dUL,
+    0x03fb295cUL, 0x5a456f5eUL, 0x6d2fad5fUL, 0x801b35e1UL, 0xb771f7e0UL,
+    0xeecfb1e2UL, 0xd9a573e3UL, 0x5cb33ce6UL, 0x6bd9fee7UL, 0x3267b8e5UL,
+    0x050d7ae4UL, 0x384a26efUL, 0x0f20e4eeUL, 0x569ea2ecUL, 0x61f460edUL,
+    0xe4e22fe8UL, 0xd388ede9UL, 0x8a36abebUL, 0xbd5c69eaUL, 0xf0b813fdUL,
+    0xc7d2d1fcUL, 0x9e6c97feUL, 0xa90655ffUL, 0x2c101afaUL, 0x1b7ad8fbUL,
+    0x42c49ef9UL, 0x75ae5cf8UL, 0x48e900f3UL, 0x7f83c2f2UL, 0x263d84f0UL,
+    0x115746f1UL, 0x944109f4UL, 0xa32bcbf5UL, 0xfa958df7UL, 0xcdff4ff6UL,
+    0x605d78d9UL, 0x5737bad8UL, 0x0e89fcdaUL, 0x39e33edbUL, 0xbcf571deUL,
+    0x8b9fb3dfUL, 0xd221f5ddUL, 0xe54b37dcUL, 0xd80c6bd7UL, 0xef66a9d6UL,
+    0xb6d8efd4UL, 0x81b22dd5UL, 0x04a462d0UL, 0x33cea0d1UL, 0x6a70e6d3UL,
+    0x5d1a24d2UL, 0x10fe5ec5UL, 0x27949cc4UL, 0x7e2adac6UL, 0x494018c7UL,
+    0xcc5657c2UL, 0xfb3c95c3UL, 0xa282d3c1UL, 0x95e811c0UL, 0xa8af4dcbUL,
+    0x9fc58fcaUL, 0xc67bc9c8UL, 0xf1110bc9UL, 0x740744ccUL, 0x436d86cdUL,
+    0x1ad3c0cfUL, 0x2db902ceUL, 0x4096af91UL, 0x77fc6d90UL, 0x2e422b92UL,
+    0x1928e993UL, 0x9c3ea696UL, 0xab546497UL, 0xf2ea2295UL, 0xc580e094UL,
+    0xf8c7bc9fUL, 0xcfad7e9eUL, 0x9613389cUL, 0xa179fa9dUL, 0x246fb598UL,
+    0x13057799UL, 0x4abb319bUL, 0x7dd1f39aUL, 0x3035898dUL, 0x075f4b8cUL,
+    0x5ee10d8eUL, 0x698bcf8fUL, 0xec9d808aUL, 0xdbf7428bUL, 0x82490489UL,
+    0xb523c688UL, 0x88649a83UL, 0xbf0e5882UL, 0xe6b01e80UL, 0xd1dadc81UL,
+    0x54cc9384UL, 0x63a65185UL, 0x3a181787UL, 0x0d72d586UL, 0xa0d0e2a9UL,
+    0x97ba20a8UL, 0xce0466aaUL, 0xf96ea4abUL, 0x7c78ebaeUL, 0x4b1229afUL,
+    0x12ac6fadUL, 0x25c6adacUL, 0x1881f1a7UL, 0x2feb33a6UL, 0x765575a4UL,
+    0x413fb7a5UL, 0xc429f8a0UL, 0xf3433aa1UL, 0xaafd7ca3UL, 0x9d97bea2UL,
+    0xd073c4b5UL, 0xe71906b4UL, 0xbea740b6UL, 0x89cd82b7UL, 0x0cdbcdb2UL,
+    0x3bb10fb3UL, 0x620f49b1UL, 0x55658bb0UL, 0x6822d7bbUL, 0x5f4815baUL,
+    0x06f653b8UL, 0x319c91b9UL, 0xb48adebcUL, 0x83e01cbdUL, 0xda5e5abfUL,
+    0xed3498beUL
+  },
+  {
+    0x00000000UL, 0x6567bcb8UL, 0x8bc809aaUL, 0xeeafb512UL, 0x5797628fUL,
+    0x32f0de37UL, 0xdc5f6b25UL, 0xb938d79dUL, 0xef28b4c5UL, 0x8a4f087dUL,
+    0x64e0bd6fUL, 0x018701d7UL, 0xb8bfd64aUL, 0xddd86af2UL, 0x3377dfe0UL,
+    0x56106358UL, 0x9f571950UL, 0xfa30a5e8UL, 0x149f10faUL, 0x71f8ac42UL,
+    0xc8c07bdfUL, 0xada7c767UL, 0x43087275UL, 0x266fcecdUL, 0x707fad95UL,
+    0x1518112dUL, 0xfbb7a43fUL, 0x9ed01887UL, 0x27e8cf1aUL, 0x428f73a2UL,
+    0xac20c6b0UL, 0xc9477a08UL, 0x3eaf32a0UL, 0x5bc88e18UL, 0xb5673b0aUL,
+    0xd00087b2UL, 0x6938502fUL, 0x0c5fec97UL, 0xe2f05985UL, 0x8797e53dUL,
+    0xd1878665UL, 0xb4e03addUL, 0x5a4f8fcfUL, 0x3f283377UL, 0x8610e4eaUL,
+    0xe3775852UL, 0x0dd8ed40UL, 0x68bf51f8UL, 0xa1f82bf0UL, 0xc49f9748UL,
+    0x2a30225aUL, 0x4f579ee2UL, 0xf66f497fUL, 0x9308f5c7UL, 0x7da740d5UL,
+    0x18c0fc6dUL, 0x4ed09f35UL, 0x2bb7238dUL, 0xc518969fUL, 0xa07f2a27UL,
+    0x1947fdbaUL, 0x7c204102UL, 0x928ff410UL, 0xf7e848a8UL, 0x3d58149bUL,
+    0x583fa823UL, 0xb6901d31UL, 0xd3f7a189UL, 0x6acf7614UL, 0x0fa8caacUL,
+    0xe1077fbeUL, 0x8460c306UL, 0xd270a05eUL, 0xb7171ce6UL, 0x59b8a9f4UL,
+    0x3cdf154cUL, 0x85e7c2d1UL, 0xe0807e69UL, 0x0e2fcb7bUL, 0x6b4877c3UL,
+    0xa20f0dcbUL, 0xc768b173UL, 0x29c70461UL, 0x4ca0b8d9UL, 0xf5986f44UL,
+    0x90ffd3fcUL, 0x7e5066eeUL, 0x1b37da56UL, 0x4d27b90eUL, 0x284005b6UL,
+    0xc6efb0a4UL, 0xa3880c1cUL, 0x1ab0db81UL, 0x7fd76739UL, 0x9178d22bUL,
+    0xf41f6e93UL, 0x03f7263bUL, 0x66909a83UL, 0x883f2f91UL, 0xed589329UL,
+    0x546044b4UL, 0x3107f80cUL, 0xdfa84d1eUL, 0xbacff1a6UL, 0xecdf92feUL,
+    0x89b82e46UL, 0x67179b54UL, 0x027027ecUL, 0xbb48f071UL, 0xde2f4cc9UL,
+    0x3080f9dbUL, 0x55e74563UL, 0x9ca03f6bUL, 0xf9c783d3UL, 0x176836c1UL,
+    0x720f8a79UL, 0xcb375de4UL, 0xae50e15cUL, 0x40ff544eUL, 0x2598e8f6UL,
+    0x73888baeUL, 0x16ef3716UL, 0xf8408204UL, 0x9d273ebcUL, 0x241fe921UL,
+    0x41785599UL, 0xafd7e08bUL, 0xcab05c33UL, 0x3bb659edUL, 0x5ed1e555UL,
+    0xb07e5047UL, 0xd519ecffUL, 0x6c213b62UL, 0x094687daUL, 0xe7e932c8UL,
+    0x828e8e70UL, 0xd49eed28UL, 0xb1f95190UL, 0x5f56e482UL, 0x3a31583aUL,
+    0x83098fa7UL, 0xe66e331fUL, 0x08c1860dUL, 0x6da63ab5UL, 0xa4e140bdUL,
+    0xc186fc05UL, 0x2f294917UL, 0x4a4ef5afUL, 0xf3762232UL, 0x96119e8aUL,
+    0x78be2b98UL, 0x1dd99720UL, 0x4bc9f478UL, 0x2eae48c0UL, 0xc001fdd2UL,
+    0xa566416aUL, 0x1c5e96f7UL, 0x79392a4fUL, 0x97969f5dUL, 0xf2f123e5UL,
+    0x05196b4dUL, 0x607ed7f5UL, 0x8ed162e7UL, 0xebb6de5fUL, 0x528e09c2UL,
+    0x37e9b57aUL, 0xd9460068UL, 0xbc21bcd0UL, 0xea31df88UL, 0x8f566330UL,
+    0x61f9d622UL, 0x049e6a9aUL, 0xbda6bd07UL, 0xd8c101bfUL, 0x366eb4adUL,
+    0x53090815UL, 0x9a4e721dUL, 0xff29cea5UL, 0x11867bb7UL, 0x74e1c70fUL,
+    0xcdd91092UL, 0xa8beac2aUL, 0x46111938UL, 0x2376a580UL, 0x7566c6d8UL,
+    0x10017a60UL, 0xfeaecf72UL, 0x9bc973caUL, 0x22f1a457UL, 0x479618efUL,
+    0xa939adfdUL, 0xcc5e1145UL, 0x06ee4d76UL, 0x6389f1ceUL, 0x8d2644dcUL,
+    0xe841f864UL, 0x51792ff9UL, 0x341e9341UL, 0xdab12653UL, 0xbfd69aebUL,
+    0xe9c6f9b3UL, 0x8ca1450bUL, 0x620ef019UL, 0x07694ca1UL, 0xbe519b3cUL,
+    0xdb362784UL, 0x35999296UL, 0x50fe2e2eUL, 0x99b95426UL, 0xfcdee89eUL,
+    0x12715d8cUL, 0x7716e134UL, 0xce2e36a9UL, 0xab498a11UL, 0x45e63f03UL,
+    0x208183bbUL, 0x7691e0e3UL, 0x13f65c5bUL, 0xfd59e949UL, 0x983e55f1UL,
+    0x2106826cUL, 0x44613ed4UL, 0xaace8bc6UL, 0xcfa9377eUL, 0x38417fd6UL,
+    0x5d26c36eUL, 0xb389767cUL, 0xd6eecac4UL, 0x6fd61d59UL, 0x0ab1a1e1UL,
+    0xe41e14f3UL, 0x8179a84bUL, 0xd769cb13UL, 0xb20e77abUL, 0x5ca1c2b9UL,
+    0x39c67e01UL, 0x80fea99cUL, 0xe5991524UL, 0x0b36a036UL, 0x6e511c8eUL,
+    0xa7166686UL, 0xc271da3eUL, 0x2cde6f2cUL, 0x49b9d394UL, 0xf0810409UL,
+    0x95e6b8b1UL, 0x7b490da3UL, 0x1e2eb11bUL, 0x483ed243UL, 0x2d596efbUL,
+    0xc3f6dbe9UL, 0xa6916751UL, 0x1fa9b0ccUL, 0x7ace0c74UL, 0x9461b966UL,
+    0xf10605deUL
+#endif
+  }
+};
index 32e082758cdaadc44f6f5dbb11005e6269b0afe1..5e1aa75658836d1cdd63c421ea14a30bfdc6f1d2 100755 (executable)
--- a/inventory
+++ b/inventory
@@ -24,6 +24,10 @@ ANNOUNCE-2.6.6
 ANNOUNCE-2.6.7
 ANNOUNCE-2.6.8
 ANNOUNCE-2.6.9
+ANNOUNCE-3.0-devel1
+ANNOUNCE-3.0-devel2
+ANNOUNCE-3.0-devel3
+ANNOUNCE-3.0-rc1
 Assemble.c
 bitmap.c
 bitmap.h
@@ -31,6 +35,8 @@ Build.c
 ChangeLog
 config.c
 COPYING
+crc32.c
+crc32.h
 Create.c
 Detail.c
 dlink.c
@@ -44,10 +50,13 @@ inventory
 kernel-patch-2.6.18
 kernel-patch-2.6.18.6
 kernel-patch-2.6.19
+kernel-patch-2.6.25
+kernel-patch-2.6.27
 Kill.c
 makedist
 Makefile
 Manage.c
+managemon.c
 mapfile.c
 md.4
 md5.h
@@ -59,6 +68,9 @@ mdadm.h
 mdadm.spec
 mdassemble.8
 mdassemble.c
+mdmon.8
+mdmon.c
+mdmon.h
 mdopen.c
 md_p.h
 mdstat.c
@@ -66,17 +78,27 @@ md_u.h
 misc/
 misc/syslog-events
 mkinitramfs
+monitor.c
 Monitor.c
+msg.c
+msg.h
+platform-intel.c
+platform-intel.h
+probe_roms.c
+probe_roms.h
 pwgr.c
 Query.c
 raid5extend.c
 ReadMe.c
 README.initramfs
 restripe.c
+sg_io.c
 sha1.c
 sha1.h
 super0.c
 super1.c
+super-ddf.c
+super-intel.c
 swap_super.c
 sysfs.c
 test
@@ -91,6 +113,8 @@ tests/00raid5
 tests/00raid6
 tests/01r1fail
 tests/01r5fail
+tests/01r5integ
+tests/01raid6integ
 tests/02lineargrow
 tests/02r1add
 tests/02r1grow
@@ -124,8 +148,14 @@ tests/07autoassemble
 tests/07autodetect
 tests/07reshape5intr
 tests/07testreshape5
+tests/08imsm-overlap
+tests/09imsm-create-fail-rebuild
+tests/10ddf-create
 tests/check
+tests/env-08imsm-overlap
+tests/env-09imsm-create-fail-rebuild
 tests/testdev
 tests/ToTest
 TODO
+udev-md-raid.rules
 util.c
diff --git a/kernel-patch-2.6.25 b/kernel-patch-2.6.25
new file mode 100644 (file)
index 0000000..2329007
--- /dev/null
@@ -0,0 +1,199 @@
+Status: ok
+
+Support adding a spare to a live md array with external metadata.
+
+i.e. extend the 'md/dev-XXX/slot' attribute so that you can
+tell a device to fill a vacant slot in an md array.
+
+
+Signed-off-by: Neil Brown <neilb@suse.de>
+
+### Diffstat output
+ ./drivers/md/md.c        |   44 ++++++++++++++++++++++++++++++++++++++++----
+ ./drivers/md/multipath.c |    7 ++++++-
+ ./drivers/md/raid1.c     |    7 ++++++-
+ ./drivers/md/raid10.c    |   10 ++++++++--
+ ./drivers/md/raid5.c     |   10 ++++++++--
+ 5 files changed, 68 insertions(+), 10 deletions(-)
+
+diff .prev/drivers/md/md.c ./drivers/md/md.c
+--- .prev/drivers/md/md.c      2008-06-05 09:19:56.000000000 +1000
++++ ./drivers/md/md.c  2008-06-10 10:41:21.000000000 +1000
+@@ -1932,7 +1932,7 @@ slot_store(mdk_rdev_t *rdev, const char 
+               slot = -1;
+       else if (e==buf || (*e && *e!= '\n'))
+               return -EINVAL;
+-      if (rdev->mddev->pers) {
++      if (rdev->mddev->pers && slot == -1) {
+               /* Setting 'slot' on an active array requires also
+                * updating the 'rd%d' link, and communicating
+                * with the personality with ->hot_*_disk.
+@@ -1940,8 +1940,6 @@ slot_store(mdk_rdev_t *rdev, const char 
+                * failed/spare devices.  This normally happens automatically,
+                * but not when the metadata is externally managed.
+                */
+-              if (slot != -1)
+-                      return -EBUSY;
+               if (rdev->raid_disk == -1)
+                       return -EEXIST;
+               /* personality does all needed checks */
+@@ -1955,6 +1953,44 @@ slot_store(mdk_rdev_t *rdev, const char 
+               sysfs_remove_link(&rdev->mddev->kobj, nm);
+               set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
+               md_wakeup_thread(rdev->mddev->thread);
++      } else if (rdev->mddev->pers) {
++              mdk_rdev_t *rdev2;
++              struct list_head *tmp;
++              /* Activating a spare .. or possibly reactivating
++               * if we ever get bitmaps working here.
++               */
++
++              if (rdev->raid_disk != -1)
++                      return -EBUSY;
++
++              if (rdev->mddev->pers->hot_add_disk == NULL)
++                      return -EINVAL;
++
++              rdev_for_each(rdev2, tmp, rdev->mddev)
++                      if (rdev2->raid_disk == slot)
++                              return -EEXIST;
++
++              rdev->raid_disk = slot;
++              if (test_bit(In_sync, &rdev->flags))
++                      rdev->saved_raid_disk = slot;
++              else
++                      rdev->saved_raid_disk = -1;
++              err = rdev->mddev->pers->
++                      hot_add_disk(rdev->mddev, rdev);
++              if (err != 1) {
++                      rdev->raid_disk = -1;
++                      if (err == 0)
++                              return -EEXIST;
++                      return err;
++              }
++              sprintf(nm, "rd%d", rdev->raid_disk);
++              if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm))
++                      printk(KERN_WARNING
++                             "md: cannot register "
++                             "%s for %s\n",
++                             nm, mdname(rdev->mddev));
++
++              /* don't wakeup anyone, leave that to userspace. */
+       } else {
+               if (slot >= rdev->mddev->raid_disks)
+                       return -ENOSPC;
+@@ -4205,7 +4241,7 @@ static int add_new_disk(mddev_t * mddev,
+                       super_types[mddev->major_version].
+                               validate_super(mddev, rdev);
+                       err = mddev->pers->hot_add_disk(mddev, rdev);
+-                      if (err)
++                      if (err < 0)
+                               unbind_rdev_from_array(rdev);
+               }
+               if (err)
+
+diff .prev/drivers/md/multipath.c ./drivers/md/multipath.c
+--- .prev/drivers/md/multipath.c       2008-05-30 14:49:31.000000000 +1000
++++ ./drivers/md/multipath.c   2008-06-10 10:35:03.000000000 +1000
+@@ -284,10 +284,15 @@ static int multipath_add_disk(mddev_t *m
+       int found = 0;
+       int path;
+       struct multipath_info *p;
++      int first = 0;
++      int last = mddev->raid_disks - 1;
++
++      if (rdev->raid_disk >= 0)
++              first = last = rdev->raid_disk;
+       print_multipath_conf(conf);
+-      for (path=0; path<mddev->raid_disks; path++) 
++      for (path = first; path <= last; path++)
+               if ((p=conf->multipaths+path)->rdev == NULL) {
+                       q = rdev->bdev->bd_disk->queue;
+                       blk_queue_stack_limits(mddev->queue, q);
+
+diff .prev/drivers/md/raid10.c ./drivers/md/raid10.c
+--- .prev/drivers/md/raid10.c  2008-05-30 14:49:31.000000000 +1000
++++ ./drivers/md/raid10.c      2008-06-10 10:28:53.000000000 +1000
+@@ -1116,6 +1116,8 @@ static int raid10_add_disk(mddev_t *mdde
+       int found = 0;
+       int mirror;
+       mirror_info_t *p;
++      int first = 0;
++      int last = mddev->raid_disks - 1;
+       if (mddev->recovery_cp < MaxSector)
+               /* only hot-add to in-sync arrays, as recovery is
+@@ -1125,12 +1127,16 @@ static int raid10_add_disk(mddev_t *mdde
+       if (!enough(conf))
+               return 0;
++      if (rdev->raid_disk >= 0)
++              first = last = rdev->raid_disk;
++
+       if (rdev->saved_raid_disk >= 0 &&
++          rdev->saved_raid_disk >= first &&
+           conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
+               mirror = rdev->saved_raid_disk;
+       else
+-              mirror = 0;
+-      for ( ; mirror < mddev->raid_disks; mirror++)
++              mirror = first;
++      for ( ; mirror <= last ; mirror++)
+               if ( !(p=conf->mirrors+mirror)->rdev) {
+                       blk_queue_stack_limits(mddev->queue,
+
+diff .prev/drivers/md/raid1.c ./drivers/md/raid1.c
+--- .prev/drivers/md/raid1.c   2008-05-30 14:49:31.000000000 +1000
++++ ./drivers/md/raid1.c       2008-06-10 10:41:00.000000000 +1000
+@@ -1103,8 +1103,13 @@ static int raid1_add_disk(mddev_t *mddev
+       int found = 0;
+       int mirror = 0;
+       mirror_info_t *p;
++      int first = 0;
++      int last = mddev->raid_disks - 1;
+-      for (mirror=0; mirror < mddev->raid_disks; mirror++)
++      if (rdev->raid_disk >= 0)
++              first = last = rdev->raid_disk;
++
++      for (mirror = first; mirror <= last; mirror++)
+               if ( !(p=conf->mirrors+mirror)->rdev) {
+                       blk_queue_stack_limits(mddev->queue,
+
+diff .prev/drivers/md/raid5.c ./drivers/md/raid5.c
+--- .prev/drivers/md/raid5.c   2008-05-30 14:49:35.000000000 +1000
++++ ./drivers/md/raid5.c       2008-06-10 10:27:51.000000000 +1000
+@@ -4399,21 +4399,27 @@ static int raid5_add_disk(mddev_t *mddev
+       int found = 0;
+       int disk;
+       struct disk_info *p;
++      int first = 0;
++      int last = conf->raid_disks - 1;
+       if (mddev->degraded > conf->max_degraded)
+               /* no point adding a device */
+               return 0;
++      if (rdev->raid_disk >= 0)
++              first = last = rdev->raid_disk;
++
+       /*
+        * find the disk ... but prefer rdev->saved_raid_disk
+        * if possible.
+        */
+       if (rdev->saved_raid_disk >= 0 &&
++          rdev->saved_raid_disk >= first &&
+           conf->disks[rdev->saved_raid_disk].rdev == NULL)
+               disk = rdev->saved_raid_disk;
+       else
+-              disk = 0;
+-      for ( ; disk < conf->raid_disks; disk++)
++              disk = first;
++      for ( ; disk <= last ; disk++)
+               if ((p=conf->disks + disk)->rdev == NULL) {
+                       clear_bit(In_sync, &rdev->flags);
+                       rdev->raid_disk = disk;
diff --git a/kernel-patch-2.6.27 b/kernel-patch-2.6.27
new file mode 100644 (file)
index 0000000..8d0785d
--- /dev/null
@@ -0,0 +1,36 @@
+touch_mnt_namespace when the mount flags change
+
+From: Dan Williams <dan.j.williams@intel.com>
+
+Daemons that need to be launched while the rootfs is read-only can now
+poll /proc/mounts to be notified when their O_RDWR requests may no
+longer end in EROFS.
+
+Cc: Kay Sievers <kay.sievers@vrfy.org>
+Cc: Neil Brown <neilb@suse.de>
+Signed-off-by: Dan Williams <dan.j.williams@intel.com>
+---
+
+ fs/namespace.c |    7 ++++++-
+ 1 files changed, 6 insertions(+), 1 deletions(-)
+
+
+diff --git a/fs/namespace.c b/fs/namespace.c
+index 6e283c9..1bd5ba2 100644
+--- a/fs/namespace.c
++++ b/fs/namespace.c
+@@ -1553,8 +1553,13 @@ static noinline int do_remount(struct nameidata *nd, int flags, int mnt_flags,
+       if (!err)
+               nd->path.mnt->mnt_flags = mnt_flags;
+       up_write(&sb->s_umount);
+-      if (!err)
++      if (!err) {
+               security_sb_post_remount(nd->path.mnt, flags, data);
++
++              spin_lock(&vfsmount_lock);
++              touch_mnt_namespace(nd->path.mnt->mnt_ns);
++              spin_unlock(&vfsmount_lock);
++      }
+       return err;
+ }
index 7adbd7f2cdcae491c9795f654fb90098cc690f92..03ec5fc9721c61e8b5f0daad90a6c3edd6c0ac01 100755 (executable)
--- a/makedist
+++ b/makedist
@@ -16,7 +16,12 @@ set `grep '^char Version' ReadMe.c `
 version=`echo $7 | sed 's/v//'`
 grep "^.TH MDADM 8 .. v$version" mdadm.8 > /dev/null 2>&1 ||
  {
-   echo mdadm.8 does not mention verion $version.
+   echo mdadm.8 does not mention version $version.
+   exit 1
+ }
+grep "^.TH MDMON 8 .. v$version" mdmon.8 > /dev/null 2>&1 ||
+ {
+   echo mdmon.8 does not mention version $version.
    exit 1
  }
 rpmv=`echo $version | tr - _`
diff --git a/managemon.c b/managemon.c
new file mode 100644 (file)
index 0000000..f9d545d
--- /dev/null
@@ -0,0 +1,711 @@
+/*
+ * mdmon - monitor external metadata arrays
+ *
+ * Copyright (C) 2007-2009 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2007-2009 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+/*
+ * The management thread for monitoring active md arrays.
+ * This thread does things which might block such as memory
+ * allocation.
+ * In particular:
+ *
+ * - Find out about new arrays in this container.
+ *   Allocate the data structures and open the files.
+ *
+ *   For this we watch /proc/mdstat and find new arrays with
+ *   metadata type that confirms sharing. e.g. "md4"
+ *   When we find a new array we slip it into the list of
+ *   arrays and signal 'monitor' by writing to a pipe.
+ *
+ * - Respond to reshape requests by allocating new data structures
+ *   and opening new files.
+ *
+ *   These come as a change to raid_disks.  We allocate a new
+ *   version of the data structures and slip it into the list.
+ *   'monitor' will notice and release the old version.
+ *   Changes to level, chunksize, layout.. do not need re-allocation.
+ *   Reductions in raid_disks don't really either, but we handle
+ *   them the same way for consistency.
+ *
+ * - When a device is added to the container, we add it to the metadata
+ *   as a spare.
+ *
+ * - Deal with degraded array
+ *    We only do this when first noticing the array is degraded.
+ *    This can be when we first see the array, when sync completes or
+ *    when recovery completes.
+ *
+ *    Check if number of failed devices suggests recovery is needed, and
+ *    skip if not.
+ *    Ask metadata to allocate a spare device
+ *    Add device as not in_sync and give a role
+ *    Update metadata.
+ *    Open sysfs files and pass to monitor.
+ *    Make sure that monitor Starts recovery....
+ *
+ * - Pass on metadata updates from external programs such as
+ *   mdadm creating a new array.
+ *
+ *   This is most-messy.
+ *   It might involve adding a new array or changing the status of
+ *   a spare, or any reconfig that the kernel doesn't get involved in.
+ *
+ *   The required updates are received via a named pipe.  There will
+ *   be one named pipe for each container. Each message contains a
+ *   sync marker: 0x5a5aa5a5, A byte count, and the message.  This is
+ *   passed to the metadata handler which will interpret and process it.
+ *   For 'DDF' messages are internal data blocks with the leading
+ *   'magic number' signifying what sort of data it is.
+ *
+ */
+
+/*
+ * We select on /proc/mdstat and the named pipe.
+ * We create new arrays or updated version of arrays and slip
+ * them into the head of the list, then signal 'monitor' via a pipe write.
+ * 'monitor' will notice and place the old array on a return list.
+ * Metadata updates are placed on a queue just like they arrive
+ * from the named pipe.
+ *
+ * When new arrays are found based on correct metadata string, we
+ * need to identify them with an entry in the metadata.  Maybe we require
+ * the metadata to be mdX/NN  when NN is the index into an appropriate table.
+ *
+ */
+
+/*
+ * List of tasks:
+ * - Watch for spares to be added to the container, and write updated
+ *   metadata to them.
+ * - Watch for new arrays using this container, confirm they match metadata
+ *   and if so, start monitoring them
+ * - Watch for spares being added to monitored arrays.  This shouldn't
+ *   happen, as we should do all the adding.  Just remove them.
+ * - Watch for change in raid-disks, chunk-size, etc.  Update metadata and
+ *   start a reshape.
+ */
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include       "mdadm.h"
+#include       "mdmon.h"
+#include       <sys/syscall.h>
+#include       <sys/socket.h>
+#include       <signal.h>
+
+static void close_aa(struct active_array *aa)
+{
+       /* Close every sysfs file descriptor held by this array: the
+        * per-device 'state' files and the array-level files.
+        * metadata_fd is opened in manage_new() together with the
+        * other array-level files, so it must be released here as
+        * well or it leaks each time an array is discarded.
+        */
+       struct mdinfo *d;
+
+       for (d = aa->info.devs; d; d = d->next)
+               close(d->state_fd);
+
+       close(aa->action_fd);
+       close(aa->info.state_fd);
+       close(aa->resync_start_fd);
+       close(aa->metadata_fd);
+}
+
+static void free_aa(struct active_array *aa)
+{
+       /* Release an active_array together with its device list.
+        * The fds are only closed when ->container is NULL; a clone
+        * has ->container set and its fds are still in use.
+        */
+       struct mdinfo *d, *next;
+
+       dprintf("%s: devnum: %d\n", __func__, aa->devnum);
+       if (aa->container == NULL)
+               close_aa(aa);
+       for (d = aa->info.devs; d; d = next) {
+               next = d->next;
+               free(d);
+       }
+       free(aa);
+}
+
+static struct active_array *duplicate_aa(struct active_array *aa)
+{
+       /* Make a copy of 'aa' with its own device list, leaving out
+        * devices that have no open state file (state_fd < 0).
+        * File descriptor values are copied, not duplicated.
+        * Returns NULL, with nothing left allocated, if memory
+        * runs out.
+        */
+       struct active_array *newa = malloc(sizeof(*newa));
+       struct mdinfo **dp1, **dp2;
+
+       if (!newa)
+               return NULL;
+
+       *newa = *aa;
+       newa->next = NULL;
+       newa->replaces = NULL;
+       newa->info.next = NULL;
+       newa->info.devs = NULL;
+
+       dp2 = &newa->info.devs;
+
+       for (dp1 = &aa->info.devs; *dp1; dp1 = &(*dp1)->next) {
+               struct mdinfo *d;
+               if ((*dp1)->state_fd < 0)
+                       continue;
+
+               d = malloc(sizeof(*d));
+               if (!d) {
+                       /* unwind the partial copy; no fd is closed
+                        * because they are all still used by 'aa' */
+                       while (newa->info.devs) {
+                               d = newa->info.devs;
+                               newa->info.devs = d->next;
+                               free(d);
+                       }
+                       free(newa);
+                       return NULL;
+               }
+               *d = **dp1;
+               /* keep the partial list terminated at every step */
+               d->next = NULL;
+               *dp2 = d;
+               dp2 = & d->next;
+       }
+       *dp2 = NULL;
+
+       return newa;
+}
+
+static void wakeup_monitor(void)
+{
+       /* Send SIGUSR1 to the thread identified by mon_tid, i.e.
+        *   tgkill(getpid(), mon_tid, SIGUSR1);
+        * issued through syscall() directly.
+        */
+       syscall(SYS_tgkill, getpid(), mon_tid, SIGUSR1);
+}
+
+static void remove_old(void)
+{
+       /* If an array has been handed back on 'discard_this', free it,
+        * forget it as the pending discard if it was that one, and
+        * wake the monitor.
+        */
+       if (discard_this == NULL)
+               return;
+
+       discard_this->next = NULL;
+       free_aa(discard_this);
+       if (discard_this == pending_discard)
+               pending_discard = NULL;
+       discard_this = NULL;
+       wakeup_monitor();
+}
+
+static void replace_array(struct supertype *container,
+                         struct active_array *old,
+                         struct active_array *new)
+{
+       /* To replace an array, we add it to the top of the list
+        * marked with ->replaces to point to the original.
+        * 'monitor' will take the original out of the list
+        * and put it on 'discard_this'.  We take it from there
+        * and discard it.
+        *
+        * Only one replacement is outstanding at a time: while an
+        * earlier one is still recorded in 'pending_discard' we
+        * poll, sleeping a second at a time, until something shows
+        * up on 'discard_this' and remove_old() has freed it.
+        * 'old' may be NULL (manage() hands NULL to manage_new() for
+        * an array it was not tracking); ->replaces is then NULL.
+        */
+       remove_old();
+       while (pending_discard) {
+               while (discard_this == NULL)
+                       sleep(1);
+               remove_old();
+       }
+       pending_discard = old;
+       new->replaces = old;
+       new->next = container->arrays;
+       container->arrays = new;
+       wakeup_monitor();
+}
+
+/* Metadata updates move through three lists:
+ *  update_queue_pending - appended to by queue_metadata_update()
+ *  update_queue         - check_update_queue() moves the pending list
+ *                         here once this one is empty
+ *  update_queue_handled - entries found here are freed by
+ *                         check_update_queue()
+ * NOTE(review): nothing in this file consumes update_queue or fills
+ * update_queue_handled; presumably the monitor does - confirm.
+ */
+struct metadata_update *update_queue = NULL;
+struct metadata_update *update_queue_handled = NULL;
+struct metadata_update *update_queue_pending = NULL;
+
+void check_update_queue(struct supertype *container)
+{
+       /* Free every update on the 'handled' list, then, if the active
+        * queue is empty and updates are pending, make the pending
+        * list the active queue and wake the monitor.
+        * 'container' is not used here.
+        */
+       struct metadata_update *done;
+
+       while ((done = update_queue_handled) != NULL) {
+               update_queue_handled = done->next;
+               free(done->buf);
+               /* free(NULL) is a no-op, so ->space needs no guard */
+               free(done->space);
+               free(done);
+       }
+
+       if (update_queue != NULL || update_queue_pending == NULL)
+               return;
+
+       update_queue = update_queue_pending;
+       update_queue_pending = NULL;
+       wakeup_monitor();
+}
+
+static void queue_metadata_update(struct metadata_update *mu)
+{
+       /* Append 'mu' to the tail of the pending update list.
+        * mu->next is left untouched, so a chain is appended whole.
+        */
+       struct metadata_update **tail = &update_queue_pending;
+
+       for (; *tail != NULL; tail = &(*tail)->next)
+               ;
+       *tail = mu;
+}
+
+static void add_disk_to_container(struct supertype *st, struct mdinfo *sd)
+{
+       /* Record 'sd' in the container's device list and, unless the
+        * device already carries metadata matching this container,
+        * add it to the metadata and queue the resulting update.
+        *
+        * 'sd' is linked into st->devs unconditionally, even when the
+        * device cannot be opened afterwards.
+        */
+       int dfd;
+       char nm[20];
+       struct supertype *st2;
+       struct metadata_update *update = NULL;
+       struct mdinfo info;
+       mdu_disk_info_t dk = {
+               .number = -1,
+               .major = sd->disk.major,
+               .minor = sd->disk.minor,
+               .raid_disk = -1,
+               .state = 0,
+       };
+
+       dprintf("%s: add %d:%d to container\n",
+               __func__, sd->disk.major, sd->disk.minor);
+
+       sd->next = st->devs;
+       st->devs = sd;
+
+       /* bounded: two full-width ints would not fit in nm[] */
+       snprintf(nm, sizeof(nm), "%d:%d", sd->disk.major, sd->disk.minor);
+       dfd = dev_open(nm, O_RDWR);
+       if (dfd < 0)
+               return;
+
+       /* Check the metadata and see if it is already part of this
+        * array
+        */
+       st2 = dup_super(st);
+       if (st2->ss->load_super(st2, dfd, NULL) == 0) {
+               /* NOTE(review): this queries 'st', not the freshly
+                * loaded 'st2', so 'info' does not describe the new
+                * device - looks unintended, confirm before changing.
+                */
+               st2->ss->getinfo_super(st, &info);
+               if (st->ss->compare_super(st, st2) == 0 &&
+                   info.disk.raid_disk >= 0) {
+                       /* Looks like a good member of array.
+                        * Just accept it.
+                        * mdadm will incorporate any parts into
+                        * active arrays.
+                        */
+                       st2->ss->free_super(st2);
+                       /* nothing on this path keeps the descriptor,
+                        * so close it rather than leak it */
+                       close(dfd);
+                       return;
+               }
+       }
+       st2->ss->free_super(st2);
+
+       /* dfd is handed to add_to_super() and is not closed here.
+        * NOTE(review): presumably the metadata handler keeps it -
+        * confirm.
+        */
+       st->update_tail = &update;
+       st->ss->add_to_super(st, &dk, dfd, NULL);
+       st->ss->write_init_super(st);
+       queue_metadata_update(update);
+       st->update_tail = NULL;
+}
+
+static void manage_container(struct mdstat_ent *mdstat,
+                            struct supertype *container)
+{
+       /* The only thing of interest here is if a new device
+        * has been added to the container.  We add it to the
+        * array ignoring any metadata on it.
+        * FIXME should we look for compatible metadata and take hints
+        * about spare assignment.... probably not.
+        *
+        * Nothing is done unless the device count reported by mdstat
+        * differs from the one we recorded.
+        */
+       if (mdstat->devcnt != container->devcnt) {
+               struct mdinfo **cdp, *cd, *di, *mdi;
+               int found;
+               int retry = 0;
+
+               /* read /sys/block/NAME/md/dev-??/block/dev to find out
+                * what is there, and compare with container->info.devs
+                * To see what is removed and what is added.
+                * These need to be removed from, or added to, the array
+                */
+               mdi = sysfs_read(-1, mdstat->devnum, GET_DEVS|SKIP_GONE_DEVS);
+               if (!mdi) {
+                       /* invalidate the current count so we can try again */
+                       container->devcnt = -1;
+                       return;
+               }
+
+               /* check for removals */
+               for (cdp = &container->devs; *cdp; ) {
+                       found = 0;
+                       for (di = mdi->devs; di; di = di->next)
+                               if (di->disk.major == (*cdp)->disk.major &&
+                                   di->disk.minor == (*cdp)->disk.minor) {
+                                       found = 1;
+                                       break;
+                               }
+                       if (!found) {
+                               cd = *cdp;
+                               *cdp = (*cdp)->next;
+                               free(cd);
+                       } else
+                               cdp = &(*cdp)->next;
+               }
+
+               /* check for additions */
+               for (di = mdi->devs; di; di = di->next) {
+                       for (cd = container->devs; cd; cd = cd->next)
+                               if (di->disk.major == cd->disk.major &&
+                                   di->disk.minor == cd->disk.minor)
+                                       break;
+                       if (!cd) {
+                               struct mdinfo *newd = malloc(sizeof(*newd));
+
+                               if (!newd) {
+                                       /* remember the failure; the count is
+                                        * settled once, after the loop */
+                                       retry = 1;
+                                       continue;
+                               }
+                               *newd = *di;
+                               add_disk_to_container(container, newd);
+                       }
+               }
+               sysfs_free(mdi);
+               /* If a device could not be recorded, leave the count
+                * invalid so the next pass tries again.
+                */
+               container->devcnt = retry ? -1 : mdstat->devcnt;
+       }
+}
+
+static void manage_member(struct mdstat_ent *mdstat,
+                         struct active_array *a)
+{
+       /* Compare mdstat info with known state of member array.
+        * We do not need to look for device state changes here, that
+        * is dealt with by the monitor.
+        *
+        * We just look for changes which suggest that a reshape is
+        * being requested.
+        * Unfortunately decreases in raid_disks don't show up in
+        * mdstat until the reshape completes FIXME.
+        *
+        * Actually, we also want to handle degraded arrays here by
+        * trying to find and assign a spare.
+        * We do that whenever the monitor tells us to.
+        */
+       // FIXME
+       a->info.array.raid_disks = mdstat->raid_disks;
+       a->info.array.chunk_size = mdstat->chunk_size;
+       // MORE
+
+       if (a->check_degraded) {
+               struct metadata_update *updates = NULL;
+               struct mdinfo *newdev;
+               struct active_array *newa;
+
+               a->check_degraded = 0;
+
+               /* The array may not be degraded, this is just a good time
+                * to check.
+                */
+               newdev = a->container->ss->activate_spare(a, &updates);
+               if (newdev) {
+                       struct mdinfo *d;
+                       /* Cool, we can add a device or several. */
+                       newa = duplicate_aa(a);
+                       if (!newa)
+                               /* No replacement array could be built;
+                                * keep monitoring the current one.
+                                * NOTE(review): 'newdev' and 'updates'
+                                * are dropped here - confirm ownership.
+                                */
+                               return;
+                       /* suspend recovery - maybe not needed */
+
+                       /* Add device to array and set offset/size/slot.
+                        * and open files for each newdev */
+                       for (d = newdev; d ; d = d->next) {
+                               struct mdinfo *newd;
+
+                               /* allocate first, so a disk is never
+                                * added without being tracked */
+                               newd = malloc(sizeof(*newd));
+                               if (!newd)
+                                       continue;
+                               if (sysfs_add_disk(&newa->info, d, 0) < 0) {
+                                       free(newd);
+                                       continue;
+                               }
+                               /* copy after sysfs_add_disk(), as before */
+                               *newd = *d;
+                               newd->next = newa->info.devs;
+                               newa->info.devs = newd;
+
+                               newd->state_fd = sysfs_open(a->devnum,
+                                                           newd->sys_name,
+                                                           "state");
+                               newd->prev_state
+                                       = read_dev_state(newd->state_fd);
+                               newd->curr_state = newd->prev_state;
+                       }
+                       queue_metadata_update(updates);
+                       replace_array(a->container, a, newa);
+                       sysfs_set_str(&a->info, NULL, "sync_action", "recover");
+               }
+       }
+}
+
+static int aa_ready(struct active_array *aa)
+{
+       /* An array can be monitored only when every device has an open
+        * state file, the array state file is open, arrays with
+        * level > 0 also have the action and resync_start files open,
+        * and the array is attached to a container.
+        * Returns 1 when all of that holds, 0 otherwise.
+        */
+       struct mdinfo *d = aa->info.devs;
+
+       while (d) {
+               if (d->state_fd < 0)
+                       return 0;
+               d = d->next;
+       }
+
+       if (aa->info.state_fd < 0)
+               return 0;
+
+       if (aa->info.array.level > 0 &&
+           (aa->action_fd < 0 || aa->resync_start_fd < 0))
+               return 0;
+
+       return aa->container ? 1 : 0;
+}
+
+static void manage_new(struct mdstat_ent *mdstat,
+                      struct supertype *container,
+                      struct active_array *victim)
+{
+       /* A new array has appeared in this container.
+        * Hopefully it is already recorded in the metadata.
+        * Check, then create the new array to report it to
+        * the monitor.
+        *
+        * 'victim' is the array entry that the new one replaces;
+        * manage() passes NULL when the array was not tracked at all.
+        * Nothing is done for an array that mdstat reports inactive,
+        * or when sysfs cannot be read or memory runs out.
+        */
+
+       struct active_array *new;
+       struct mdinfo *mdi, *di;
+       char *inst;
+       int i;
+       int failed = 0;
+
+       /* check if array is ready to be monitored */
+       if (!mdstat->active)
+               return;
+
+       mdi = sysfs_read(-1, mdstat->devnum,
+                        GET_LEVEL|GET_CHUNK|GET_DISKS|GET_COMPONENT|
+                        GET_DEGRADED|GET_DEVS|GET_OFFSET|GET_SIZE|GET_STATE);
+
+       new = malloc(sizeof(*new));
+
+       if (!new || !mdi) {
+               if (mdi)
+                       sysfs_free(mdi);
+               if (new)
+                       free(new);
+               return;
+       }
+       memset(new, 0, sizeof(*new));
+
+       new->devnum = mdstat->devnum;
+       /* NOTE(review): the result of devnum2devname() is not checked
+        * or released here - confirm whether it can be NULL or is
+        * heap allocated.
+        */
+       strcpy(new->info.sys_name, devnum2devname(new->devnum));
+
+       new->prev_state = new->curr_state = new->next_state = inactive;
+       new->prev_action= new->curr_action= new->next_action= idle;
+
+       new->container = container;
+
+       /* 'inst' is the tail of metadata_version that follows the
+        * container name and one separator character.
+        * NOTE(review): the constant 10 presumably skips a fixed
+        * "external:/"-style prefix - confirm against the format.
+        */
+       inst = &mdstat->metadata_version[10+strlen(container->devname)+1];
+
+       new->info.array = mdi->array;
+       new->info.component_size = mdi->component_size;
+
+       /* Build one device entry per slot.  Entries are pushed on the
+        * front of the list, so it ends up in descending slot order.
+        */
+       for (i = 0; i < new->info.array.raid_disks; i++) {
+               struct mdinfo *newd = malloc(sizeof(*newd));
+
+               for (di = mdi->devs; di; di = di->next)
+                       if (i == di->disk.raid_disk)
+                               break;
+
+               if (di && newd) {
+                       memcpy(newd, di, sizeof(*newd));
+
+                       newd->state_fd = sysfs_open(new->devnum,
+                                                   newd->sys_name,
+                                                   "state");
+
+                       newd->prev_state = read_dev_state(newd->state_fd);
+                       newd->curr_state = newd->prev_state;
+               } else {
+                       /* no device in this slot, or no memory for it */
+                       if (newd)
+                               free(newd);
+
+                       failed++;
+                       if (failed > new->info.array.failed_disks) {
+                               /* we cannot properly monitor without all working disks */
+                               new->container = NULL;
+                               break;
+                       }
+                       continue;
+               }
+               /* the state file was opened under the copied name;
+                * from here on the entry is named after its slot */
+               sprintf(newd->sys_name, "rd%d", i);
+               newd->next = new->info.devs;
+               new->info.devs = newd;
+       }
+
+       new->action_fd = sysfs_open(new->devnum, NULL, "sync_action");
+       new->info.state_fd = sysfs_open(new->devnum, NULL, "array_state");
+       new->resync_start_fd = sysfs_open(new->devnum, NULL, "resync_start");
+       new->metadata_fd = sysfs_open(new->devnum, NULL, "metadata_version");
+       get_resync_start(new);
+       dprintf("%s: inst: %d action: %d state: %d\n", __func__, atoi(inst),
+               new->action_fd, new->info.state_fd);
+
+       sysfs_free(mdi);
+
+       /* if everything checks out tell the metadata handler we want to
+        * manage this instance
+        */
+       if (!aa_ready(new) || container->ss->open_new(container, new, inst) < 0) {
+               fprintf(stderr, "mdmon: failed to monitor %s\n",
+                       mdstat->metadata_version);
+               /* clearing ->container makes free_aa() close the fds */
+               new->container = NULL;
+               free_aa(new);
+       } else {
+               replace_array(container, victim, new);
+               if (failed) {
+                       /* some slots are empty: look for a spare now */
+                       new->check_degraded = 1;
+                       manage_member(mdstat, new);
+               }
+       }
+}
+
+void manage(struct mdstat_ent *mdstat, struct supertype *container)
+{
+       /* Walk a freshly read mdstat list and dispatch each entry:
+        * the container itself, a member array we already track,
+        * or a member array that needs a new entry.  Entries that
+        * do not belong to this container are skipped.
+        */
+       struct active_array *a;
+
+       while (mdstat) {
+               if (mdstat->devnum == container->devnum) {
+                       manage_container(mdstat, container);
+               } else if (is_container_member(mdstat, container->devname)) {
+                       /* Looks like a member of this container;
+                        * find the entry we hold for it, if any */
+                       a = container->arrays;
+                       while (a && mdstat->devnum != a->devnum)
+                               a = a->next;
+
+                       if (a && a->container)
+                               manage_member(mdstat, a);
+                       /* ->container is deliberately read again */
+                       if (a == NULL || !a->container)
+                               manage_new(mdstat, container, a);
+               }
+               mdstat = mdstat->next;
+       }
+}
+
+static void handle_message(struct supertype *container, struct metadata_update *msg)
+{
+       /* queue this metadata update through to the monitor
+        *
+        * msg->len selects what is done:
+        *    0  ping_monitor: wait for the monitor loop to go round
+        *   -1  ping_manager: run manage() on a fresh mdstat
+        *   >0  a metadata update; msg->buf is taken over and queued,
+        *       unless sigterm is set
+        * For any len <= 0 we first wait until both update queues
+        * have drained.
+        */
+
+       struct metadata_update *mu;
+
+       if (msg->len <= 0)
+               while (update_queue_pending || update_queue) {
+                       check_update_queue(container);
+                       usleep(15*1000);
+               }
+
+       if (msg->len == 0) { /* ping_monitor */
+               int cnt;
+
+               cnt = monitor_loop_cnt;
+               if (cnt & 1)
+                       cnt += 2; /* wait until next pselect */
+               else
+                       cnt += 3; /* wait for 2 pselects */
+               wakeup_monitor();
+
+               while (monitor_loop_cnt - cnt < 0)
+                       usleep(10 * 1000);
+       } else if (msg->len == -1) { /* ping_manager */
+               struct mdstat_ent *mdstat = mdstat_read(1, 0);
+
+               manage(mdstat, container);
+               free_mdstat(mdstat);
+       } else if (!sigterm) {
+               mu = malloc(sizeof(*mu));
+               if (!mu) {
+                       /* report and drop the update rather than
+                        * dereference NULL; msg->buf stays with msg */
+                       fprintf(stderr,
+                               "mdmon: out of memory, metadata update dropped\n");
+                       return;
+               }
+               mu->len = msg->len;
+               mu->buf = msg->buf;
+               msg->buf = NULL;
+               mu->space = NULL;
+               mu->next = NULL;
+               if (container->ss->prepare_update)
+                       container->ss->prepare_update(container, mu);
+               queue_metadata_update(mu);
+       }
+}
+
+void read_sock(struct supertype *container)
+{
+       /* Accept one connection on the control socket and service it:
+        * each message received is handled and then acknowledged.
+        * The connection is closed as soon as receive_message()
+        * reports anything but success or the ack fails.
+        */
+       struct metadata_update msg;
+       int tmo = 3; /* 3 second timeout before hanging up the socket */
+       long fl;
+       int fd = accept(container->sock, NULL, NULL);
+
+       if (fd < 0)
+               return;
+
+       /* switch the new connection to non-blocking mode */
+       fl = fcntl(fd, F_GETFL, 0);
+       fcntl(fd, F_SETFL, fl | O_NONBLOCK);
+
+       for (;;) {
+               msg.buf = NULL;
+
+               /* read and validate the message */
+               if (receive_message(fd, &msg, tmo) != 0)
+                       break;
+               handle_message(container, &msg);
+               if (ack(fd, tmo) < 0)
+                       break;
+       }
+
+       close(fd);
+}
+
+/* exit_now: when non-zero, do_manager() calls exit(0) at the top of
+ * its next iteration.
+ * manager_ready: set to 1 by do_manager() at the end of each pass.
+ */
+int exit_now = 0;
+int manager_ready = 0;
+void do_manager(struct supertype *container)
+{
+       /* Main loop of the manager; it only returns through exit().
+        * Each pass: when no update is in flight, re-read mdstat, run
+        * manage() and service the control socket; then reap any
+        * discarded array, advance the update queues, and wait.
+        */
+       struct mdstat_ent *mdstat;
+       sigset_t set;
+       int proc_fd;
+
+       /* With a NULL new set this only fetches the current signal
+        * mask.  The four signals below are then removed from the
+        * copy, which is the mask used while waiting at the bottom
+        * of the loop.
+        */
+       sigprocmask(SIG_UNBLOCK, NULL, &set);
+       sigdelset(&set, SIGUSR1);
+       sigdelset(&set, SIGHUP);
+       sigdelset(&set, SIGALRM);
+       sigdelset(&set, SIGTERM);
+       /* waited on instead of the socket when there is no socket;
+        * the result of open() is not checked */
+       proc_fd = open("/proc/mounts", O_RDONLY);
+
+       do {
+
+               if (exit_now)
+                       exit(0);
+
+               /* Can only 'manage' things if 'monitor' is not making
+                * structural changes to metadata, so need to check
+                * update_queue
+                */
+               if (update_queue == NULL) {
+                       mdstat = mdstat_read(1, 0);
+
+                       manage(mdstat, container);
+
+                       read_sock(container);
+
+                       /* (re)create the control socket and pid file
+                        * when there is none or a re-open was asked for */
+                       if (container->sock < 0 || socket_hup_requested) {
+                               close(container->sock);
+                               container->sock = make_control_sock(container->devname);
+                               make_pidfile(container->devname, 0);
+                               socket_hup_requested = 0;
+                       }
+                       /* NOTE(review): presumably SIGALRM ends the
+                        * wait below so creation is retried - confirm */
+                       if (container->sock < 0)
+                               alarm(30);
+
+                       free_mdstat(mdstat);
+               }
+               remove_old();
+
+               check_update_queue(container);
+
+               manager_ready = 1;
+
+               if (sigterm)
+                       wakeup_monitor();
+
+               if (update_queue == NULL) {
+                       if (container->sock < 0)
+                               mdstat_wait_fd(proc_fd, &set);
+                       else
+                               mdstat_wait_fd(container->sock, &set);
+               } else
+                       /* If an update is happening, just wait for signal */
+                       pselect(0, NULL, NULL, NULL, NULL, &set);
+       } while(1);
+}
index 59cc6c6d355ae9369df8582b966479c84228c836..601c4ccf378b53ae185e326b45aa6f884be167e3 100644 (file)
--- a/mapfile.c
+++ b/mapfile.c
@@ -2,7 +2,7 @@
  * mapfile - manage /var/run/mdadm.map. Part of:
  * mdadm - manage Linux "md" devices aka RAID arrays.
  *
- * Copyright (C) 2006 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2006-2009 Neil Brown <neilb@suse.de>
  *
  *
  *    This program is free software; you can redistribute it and/or modify
  * also allows the array device name to be easily found.
  *
  * The map file is line based with space separated fields.  The fields are:
- *  Device id  -  mdX or mdpX  where is a number.
- *  metadata   -  0.90 1.0 1.1 1.2
+ *  Device id  -  mdX or mdpX  where X is a number.
+ *  metadata   -  0.90 1.0 1.1 1.2 ddf ...
  *  UUID       -  uuid of the array
  *  path       -  path where device created: /dev/md/home
  *
+ * The preferred location for the map file is /var/run/mdadm.map.
+ * However /var/run may not exist or be writable in early boot.  And if
+ * no-one has created /var/run/mdadm, we still want to survive.
+ * So possible locations are:
+ *   /var/run/mdadm/map  /var/run/mdadm.map  /dev/.mdadm.map
+ * the last, because udev requires a writable /dev very early.
+ * We read from the first one that exists and write to the first
+ * one that we can.
  */
+#include       "mdadm.h"
+#include       <ctype.h>
 
+#define mapnames(base) { #base, #base ".new", #base ".lock"}
+char *mapname[3][3] = {
+       mapnames(/var/run/mdadm/map),
+       mapnames(/var/run/mdadm.map),
+       mapnames(/dev/.mdadm.map)
+};
 
-#include "mdadm.h"
+int mapmode[3] = { O_RDONLY, O_RDWR|O_CREAT, O_RDWR|O_CREAT | O_TRUNC };
+char *mapsmode[3] = { "r", "w", "w"};
 
+/* Open one of the map files, trying each candidate location in turn.
+ * 'modenum' selects both the file name (0: map, 1: map.new, 2: map.lock)
+ * and the open mode taken from mapmode[] / mapsmode[].
+ * On success the index of the location that was opened is stored in
+ * '*choice' and a stdio stream for the file is returned.
+ * Returns NULL if no location could be opened, or if a stream could
+ * not be created for the file that was opened.
+ */
+FILE *open_map(int modenum, int *choice)
+{
+       int i;
+       for (i = 0 ; i < 3 ; i++) {
+               FILE *f;
+               int fd = open(mapname[i][modenum], mapmode[modenum], 0600);
+               if (fd < 0)
+                       continue;
+               *choice = i;
+               f = fdopen(fd, mapsmode[modenum]);
+               if (f == NULL)
+                       /* don't leak the descriptor if no stream
+                        * could be created for it.
+                        */
+                       close(fd);
+               return f;
+       }
+       return NULL;
+}
 
 int map_write(struct map_ent *mel)
 {
        FILE *f;
        int err;
-       int subdir = 1;
+       int which;
+
+       f = open_map(1, &which);
 
-       f = fopen("/var/run/mdadm/map.new", "w");
-       if (!f) {
-               f = fopen("/var/run/mdadm.map.new", "w");
-               subdir = 0;
-       }
        if (!f)
                return 0;
-       while (mel) {
+       for (; mel; mel = mel->next) {
+               if (mel->bad)
+                       continue;
                if (mel->devnum < 0)
                        fprintf(f, "mdp%d ", -1-mel->devnum);
                else
                        fprintf(f, "md%d ", mel->devnum);
-               fprintf(f, "%d.%d ", mel->major, mel->minor);
+               fprintf(f, "%s ", mel->metadata);
                fprintf(f, "%08x:%08x:%08x:%08x ", mel->uuid[0],
                        mel->uuid[1], mel->uuid[2], mel->uuid[3]);
-               fprintf(f, "%s\n", mel->path);
-               mel = mel->next;
+               fprintf(f, "%s\n", mel->path?:"");
        }
        fflush(f);
        err = ferror(f);
        fclose(f);
        if (err) {
-               if (subdir)
-                       unlink("/var/run/mdadm/map.new");
-               else
-                       unlink("/var/run/mdadm.map.new");
+               unlink(mapname[which][1]);
                return 0;
        }
-       if (subdir)
-               return rename("/var/run/mdadm/map.new",
-                             "/var/run/mdadm/map") == 0;
-       else
-               return rename("/var/run/mdadm.map.new",
-                             "/var/run/mdadm.map") == 0;
+       return rename(mapname[which][1],
+                     mapname[which][0]) == 0;
+}
+
+
+static FILE *lf = NULL;
+static int lwhich = 0;
+/* Take the map lock and load a fresh copy of the map.
+ * If the lock is not already held, the lock file is opened via
+ * open_map() and locked with lockf(); the stream is remembered in 'lf'.
+ * Any list already in '*melp' is freed and replaced by map_read().
+ * Returns 0 on success, -1 if the lock file could not be opened
+ * or locked.
+ */
+int map_lock(struct map_ent **melp)
+{
+       if (!lf) {
+               FILE *lockfile = open_map(2, &lwhich);
+
+               if (!lockfile)
+                       return -1;
+               if (lockf(fileno(lockfile), F_LOCK, 0) != 0) {
+                       fclose(lockfile);
+                       return -1;
+               }
+               lf = lockfile;
+       }
+
+       if (*melp != NULL)
+               map_free(*melp);
+       map_read(melp);
+       return 0;
+}
+
+/* Release the lock taken by map_lock().
+ * The lock file is removed only if this process actually holds the
+ * lock, and it is removed before the stream is closed.
+ * 'melp' is not used.
+ */
+void map_unlock(struct map_ent **melp)
+{
+       if (lf) {
+               /* must unlink before closing the file: closing it
+                * releases the lock, and once that has happened the
+                * lock file may already belong to another process.
+                */
+               unlink(mapname[lwhich][2]);
+               fclose(lf);
+       }
+       lf = NULL;
+}
 
 void map_add(struct map_ent **melp,
-           int devnum, int major, int minor, int uuid[4], char *path)
+           int devnum, char *metadata, int uuid[4], char *path)
 {
        struct map_ent *me = malloc(sizeof(*me));
 
        me->devnum = devnum;
-       me->major = major;
-       me->minor = minor;
+       strcpy(me->metadata, metadata);
        memcpy(me->uuid, uuid, 16);
-       me->path = strdup(path);
+       me->path = path ? strdup(path) : NULL;
        me->next = *melp;
+       me->bad = 0;
        *melp = me;
 }
 
@@ -105,30 +154,31 @@ void map_read(struct map_ent **melp)
        FILE *f;
        char buf[8192];
        char path[200];
-       int devnum, major, minor, uuid[4];
+       int devnum, uuid[4];
+       char metadata[30];
        char nam[4];
+       int which;
 
        *melp = NULL;
 
-       f = fopen("/var/run/mdadm/map", "r");
-       if (!f)
-               f = fopen("/var/run/mdadm.map", "r");
+       f = open_map(0, &which);
        if (!f) {
                RebuildMap();
-               f = fopen("/var/run/mdadm/map", "r");
+               f = open_map(0, &which);
        }
-       if (!f)
-               f = fopen("/var/run/mdadm.map", "r");
        if (!f)
                return;
 
        while (fgets(buf, sizeof(buf), f)) {
-               if (sscanf(buf, " md%1[p]%d %d.%d %x:%x:%x:%x %200s",
-                          nam, &devnum, &major, &minor, uuid, uuid+1,
-                          uuid+2, uuid+3, path) == 9) {
-                       if (nam[0] == 'p')
+               path[0] = 0;
+               if (sscanf(buf, " %3[mdp]%d %s %x:%x:%x:%x %200s",
+                          nam, &devnum, metadata, uuid, uuid+1,
+                          uuid+2, uuid+3, path) >= 7) {
+                       if (strncmp(nam, "md", 2) != 0)
+                               continue;
+                       if (nam[2] == 'p')
                                devnum = -1 - devnum;
-                       map_add(melp, devnum, major, minor, uuid, path);
+                       map_add(melp, devnum, metadata, uuid, path);
                }
        }
        fclose(f);
@@ -144,7 +194,7 @@ void map_free(struct map_ent *map)
        }
 }
 
-int map_update(struct map_ent **mpp, int devnum, int major, int minor,
+int map_update(struct map_ent **mpp, int devnum, char *metadata,
               int *uuid, char *path)
 {
        struct map_ent *map, *mp;
@@ -157,16 +207,16 @@ int map_update(struct map_ent **mpp, int devnum, int major, int minor,
 
        for (mp = map ; mp ; mp=mp->next)
                if (mp->devnum == devnum) {
-                       mp->major = major;
-                       mp->minor = minor;
+                       strcpy(mp->metadata, metadata);
                        memcpy(mp->uuid, uuid, 16);
                        free(mp->path);
-                       mp->path = strdup(path);
+                       mp->path = path ? strdup(path) : NULL;
                        break;
                }
        if (!mp)
-               map_add(&map, devnum, major, minor, uuid, path);
-       *mpp = NULL;
+               map_add(&map, devnum, metadata, uuid, path);
+       if (mpp)
+               *mpp = NULL;
        rv = map_write(map);
        map_free(map);
        return rv;
@@ -195,11 +245,56 @@ struct map_ent *map_by_uuid(struct map_ent **map, int uuid[4])
        if (!*map)
                map_read(map);
 
-       for (mp = *map ; mp ; mp = mp->next)
-               if (memcmp(uuid, mp->uuid, 16) == 0)
-                       return mp;
+       for (mp = *map ; mp ; mp = mp->next) {
+               if (memcmp(uuid, mp->uuid, 16) != 0)
+                       continue;
+               if (!mddev_busy(mp->devnum)) {
+                       mp->bad = 1;
+                       continue;
+               }
+               return mp;
+       }
+       return NULL;
+}
+
+/* Find the map entry for device number 'devnum'.
+ * The map is read into '*map' first if it has not been loaded yet.
+ * A matching entry for which mddev_busy() reports false is flagged
+ * as 'bad' and passed over.
+ * Returns the matching entry, or NULL if there is none.
+ */
+struct map_ent *map_by_devnum(struct map_ent **map, int devnum)
+{
+       struct map_ent *ent;
+
+       if (*map == NULL)
+               map_read(map);
+
+       for (ent = *map ; ent != NULL ; ent = ent->next) {
+               if (ent->devnum == devnum) {
+                       if (mddev_busy(ent->devnum))
+                               return ent;
+                       ent->bad = 1;
+               }
+       }
+       return NULL;
+}
+
+struct map_ent *map_by_name(struct map_ent **map, char *name)
+{
+       struct map_ent *mp;
+       if (!*map)
+               map_read(map);
 
+       for (mp = *map ; mp ; mp = mp->next) {
+               if (!mp->path)
+                       continue;
+               if (strncmp(mp->path, "/dev/md/", 8) != 0)
+                       continue;
+               if (strcmp(mp->path+8, name) != 0)
+                       continue;
+               if (!mddev_busy(mp->devnum)) {
+                       mp->bad = 1;
+                       continue;
+               }
+               return mp;
+       }
+       return NULL;
 }
 
 void RebuildMap(void)
@@ -208,12 +303,26 @@ void RebuildMap(void)
        struct mdstat_ent *md;
        struct map_ent *map = NULL;
        int mdp = get_mdp_major();
+       int require_homehost;
+       char sys_hostname[256];
+       char *homehost = conf_get_homehost(&require_homehost);
+
+       if (homehost == NULL || strcmp(homehost, "<system>")==0) {
+               if (gethostname(sys_hostname, sizeof(sys_hostname)) == 0) {
+                       sys_hostname[sizeof(sys_hostname)-1] = 0;
+                       homehost = sys_hostname;
+               }
+       }
 
        for (md = mdstat ; md ; md = md->next) {
-               struct mdinfo *sra = sysfs_read(-1, md->devnum, GET_DEVS);
+               struct mdinfo *sra = sysfs_read(-1, md->devnum, GET_DEVS|SKIP_GONE_DEVS);
                struct mdinfo *sd;
 
+               if (!sra)
+                       continue;
+
                for (sd = sra->devs ; sd ; sd = sd->next) {
+                       char namebuf[100];
                        char dn[30];
                        int dfd;
                        int ok;
@@ -234,18 +343,95 @@ void RebuildMap(void)
                        if (ok != 0)
                                continue;
                        st->ss->getinfo_super(st, &info);
-                       if (md->devnum > 0)
+                       if (md->devnum >= 0)
                                path = map_dev(MD_MAJOR, md->devnum, 0);
                        else
                                path = map_dev(mdp, (-1-md->devnum)<< 6, 0);
-                       map_add(&map, md->devnum, st->ss->major,
-                               st->minor_version,
-                               info.uuid, path ? : "/unknown");
+                       if (path == NULL ||
+                           strncmp(path, "/dev/md/", 8) != 0) {
+                               /* We would really like a name that provides
+                                * an MD_DEVNAME for udev.
+                                * The name needs to be unique both in /dev/md/
+                                * and in this mapfile.
+                                * It needs to match what -I or -As would come
+                                * up with.
+                                * That means:
+                                *   Check if array is in mdadm.conf 
+                                *        - if so use that.
+                                *   determine trustworthy from homehost etc
+                                *   find a unique name based on metadata name.
+                                *   
+                                */
+                               struct mddev_ident_s *match = conf_match(&info, st);
+                               struct stat stb;
+                               if (match && match->devname && match->devname[0] == '/') {
+                                       path = match->devname;
+                                       if (path[0] != '/') {
+                                               strcpy(namebuf, "/dev/md/");
+                                               strcat(namebuf, path);
+                                               path = namebuf;
+                                       }
+                               } else {
+                                       int unum = 0;
+                                       char *sep = "_";
+                                       const char *name;
+                                       int conflict = 1;
+                                       if ((homehost == NULL ||
+                                            st->ss->match_home(st, homehost) != 1) &&
+                                           st->ss->match_home(st, "any") != 1 &&
+                                           (require_homehost
+                                            || ! conf_name_is_free(info.name)))
+                                               /* require a numeric suffix */
+                                               unum = 0;
+                                       else
+                                               /* allow name to be used as-is if no conflict */
+                                               unum = -1;
+                                       name = info.name;
+                                       if (!*name) {
+                                               name = st->ss->name;
+                                               if (!isdigit(name[strlen(name)-1]) &&
+                                                   unum == -1) {
+                                                       unum = 0;
+                                                       sep = "";
+                                               }
+                                       }
+                                       if (strchr(name, ':'))
+                                               /* probably a uniquifying
+                                                * hostname prefix.  Allow
+                                                * without a suffix
+                                                */
+                                               unum = -1;
+
+                                       while (conflict) {
+                                               if (unum >= 0)
+                                                       sprintf(namebuf, "/dev/md/%s%s%d",
+                                                               name, sep, unum);
+                                               else
+                                                       sprintf(namebuf, "/dev/md/%s",
+                                                               name);
+                                               unum++;
+                                               if (lstat(namebuf, &stb) != 0 &&
+                                                   (map == NULL ||
+                                                    !map_by_name(&map, namebuf+8)))
+                                                       conflict = 0;
+                                       }
+                                       path = namebuf;
+                               }
+                       }
+                       map_add(&map, md->devnum,
+                               info.text_version,
+                               info.uuid, path);
                        st->ss->free_super(st);
                        break;
                }
+               sysfs_free(sra);
        }
-       free_mdstat(mdstat);
        map_write(map);
        map_free(map);
+       for (md = mdstat ; md ; md = md->next) {
+               struct mdinfo *sra = sysfs_read(-1, md->devnum, GET_VERSION);
+               sysfs_uevent(sra, "change");
+               sysfs_free(sra);
+       }
+       free_mdstat(mdstat);
 }
diff --git a/md.4 b/md.4
index dfd287f1f156db44c03715fa26303c0b6f6c9011..04b5308c11076374dd2f4f0896cfbce79bb304a5 100644 (file)
--- a/md.4
+++ b/md.4
@@ -11,6 +11,8 @@ md \- Multiple Device driver aka Linux Software RAID
 .BI /dev/md n
 .br
 .BI /dev/md/ n
+.br
+.BR /dev/md/ name
 .SH DESCRIPTION
 The
 .B md
@@ -37,15 +39,17 @@ including RAID0 (striped array), LINEAR (catenated array),
 MULTIPATH (a set of different interfaces to the same device),
 and FAULTY (a layer over a single device into which errors can be injected).
 
-.SS MD SUPER BLOCK
-Each device in an array may have a
-.I superblock
-which records information about the structure and state of the array.
+.SS MD METADATA
+Each device in an array may have some 
+.I metadata
+stored in the device.  This metadata is sometimes called a
+.BR superblock .
+The metadata records information about the structure and state of the array.
 This allows the array to be reliably re-assembled after a shutdown.
 
 From Linux kernel version 2.6.10,
 .B md
-provides support for two different formats of this superblock, and
+provides support for two different formats of metadata, and
 other formats can be added.  Prior to this release, only one format is
 supported.
 
@@ -66,11 +70,11 @@ normally 1K long, but can be longer.  It is normally stored between 8K
 and 12K from the end of the device, on a 4K boundary, though
 variations can be stored at the start of the device (version 1.1) or 4K from
 the start of the device (version 1.2).
-This superblock format stores multibyte data in a
+This metadata format stores multibyte data in a
 processor-independent format and supports up to hundreds of
 component devices (version 0.90 only supports 28).
 
-The superblock contains, among other things:
+The metadata contains, among other things:
 .TP
 LEVEL
 The manner in which the devices are arranged into the array
@@ -80,6 +84,7 @@ UUID
 a 128 bit Universally Unique Identifier that identifies the array that
 contains this device.
 
+.PP
 When a version 0.90 array is being reshaped (e.g. adding extra devices
 to a RAID5), the version number is temporarily set to 0.91.  This
 ensures that if the reshape process is stopped in the middle (e.g. by
@@ -88,7 +93,7 @@ not support reshaping, then the array will not be assembled (which
 would cause data corruption) but will be left untouched until a kernel
 that can complete the reshape processes is used.
 
-.SS ARRAYS WITHOUT SUPERBLOCKS
+.SS ARRAYS WITHOUT METADATA
 While it is usually best to create arrays with superblocks so that
 they can be assembled reliably, there are some circumstances when an
 array without superblocks is preferred.  These include:
@@ -118,6 +123,40 @@ configuration that does not use a superblock, and to maintain the state of
 the array elsewhere.  While not encouraged for general us, it does
 have special-purpose uses and is supported.
 
+.SS ARRAYS WITH EXTERNAL METADATA
+
+From release 2.6.28, the
+.I md
+driver supports arrays with externally managed metadata.  That is,
+the metadata is not managed by the kernel but rather by a user-space
+program which is external to the kernel.  This allows support for a
+variety of metadata formats without cluttering the kernel with lots of
+details.
+.PP
+.I md
+is able to communicate with the user-space program through various
+sysfs attributes so that it can make appropriate changes to the
+metadata \- for example to mark a device as faulty.  When necessary,
+.I md
+will wait for the program to acknowledge the event by writing to a
+sysfs attribute.
+The manual page for
+.IR mdmon (8)
+contains more detail about this interaction.
+
+.SS CONTAINERS
+Many metadata formats use a single block of metadata to describe a
+number of different arrays which all use the same set of devices.
+In this case it is helpful for the kernel to know about the full set
+of devices as a whole.  This set is known to md as a
+.IR container .
+A container is an
+.I md
+array with externally managed metadata and with device offset and size
+so that it just covers the metadata part of the devices.  The
+remainder of each device is available to be incorporated into various
+arrays.
+
 .SS LINEAR
 
 A linear array simply catenates the available space on each
@@ -138,12 +177,12 @@ A RAID0 array (which has zero redundancy) is also known as a
 striped array.
 A RAID0 array is configured at creation with a
 .B "Chunk Size" 
-which must be a power of two, and at least 4 kibibytes.
+which must be a power of two (prior to Linux 2.6.31), and at least 4
+kibibytes.
 
 The RAID0 driver assigns the first chunk of the array to the first
 device, the second chunk to the second device, and so on until all
-drives have been assigned one chunk.  This collection of chunks forms
-a
+drives have been assigned one chunk.  This collection of chunks forms a
 .BR stripe .
 Further chunks are gathered into stripes in the same way, and are
 assigned to the remaining space in the drives.
@@ -175,6 +214,11 @@ multiple sequential streams or a random workload will use more than one
 spindle. In theory, having an N-disk RAID1 will allow N sequential
 threads to read from all disks.
 
+Individual devices in a RAID1 can be marked as "write-mostly".
+These drives are excluded from the normal read balancing and will only
+be read from when there is no other option.  This can be useful for
+devices connected over a slow link.
+
 .SS RAID4
 
 A RAID4 array is like a RAID0 array with an extra device for storing
@@ -274,7 +318,11 @@ A MULTIPATH array is composed of a number of logically different
 devices, often fibre channel interfaces, that all refer the the same
 real device. If one of these interfaces fails (e.g. due to cable
 problems), the multipath driver will attempt to redirect requests to
-another interface. 
+another interface.
+
+The MULTIPATH driver is not receiving any ongoing development and
+should be considered a legacy driver.  The device-mapper based
+multipath drivers should be preferred for new installations.
 
 .SS FAULTY
 The FAULTY md module is provided for testing purposes.  A faulty array
@@ -526,10 +574,22 @@ Finally, "idle" can be written to stop the check/repair process.
 .B md/stripe_cache_size
 This is only available on RAID5 and RAID6.  It records the size (in
 pages per device) of the  stripe cache which is used for synchronising
-all read and write operations to the array.  The default is 128.
+all write operations to the array and all read operations if the array
+is degraded.  The default is 256.  Valid values are 17 to 32768.
 Increasing this number can increase performance in some situations, at
-some cost in system memory.
+some cost in system memory.  Note, setting this value too high can
+result in an "out of memory" condition for the system.
 
+memory_consumed = system_page_size * nr_disks * stripe_cache_size
+
+.TP
+.B md/preread_bypass_threshold
+This is only available on RAID5 and RAID6.  This variable sets the
+number of times MD will service a full-stripe-write before servicing a
+stripe that requires some "prereading".  For fairness this defaults to
+1.  Valid values are 0 to stripe_cache_size.  Setting this to 0
+maximizes sequential-write throughput at the cost of fairness to threads
+doing small or random writes.  
 
 .SS KERNEL PARAMETERS
 
@@ -557,6 +617,8 @@ in
 
 .TP
 .B md_mod.start_ro=1
+.TP
+.B /sys/module/md_mod/parameters/start_ro
 This tells md to start all arrays in read-only mode.  This is a soft
 read-only that will automatically switch to read-write on the first
 write request.  However until that write request, nothing is written
@@ -565,6 +627,8 @@ operation is started.
 
 .TP
 .B md_mod.start_dirty_degraded=1
+.TP
+.B /sys/module/md_mod/parameters/start_dirty_degraded
 As mentioned above, md will not normally start a RAID4, RAID5, or
 RAID6 that is both dirty and degraded as this situation can imply
 hidden data loss.  This can be awkward if the root filesystem is
@@ -614,13 +678,13 @@ A readable and writable file that reflects the current "goal" rebuild
 speed for times when non-rebuild activity is current on an array.
 The speed is in Kibibytes per second, and is a per-device rate, not a
 per-array rate (which means that an array with more disks will shuffle
-more data for a given speed).   The default is 100.
+more data for a given speed).   The default is 1000.
 
 .TP
 .B /proc/sys/dev/raid/speed_limit_max
 A readable and writable file that reflects the current "goal" rebuild
 speed for times when no non-rebuild activity is current on an array.
-The default is 100,000.
+The default is 200,000.
 
 .SH SEE ALSO
 .BR mdadm (8),
diff --git a/mdadm.8 b/mdadm.8
index 9f9b8ae1e078b777500e28257430e4f8385219ea..2607c8f47fdb70bc92553bb4c5ac2b3a8505d63d 100644 (file)
--- a/mdadm.8
+++ b/mdadm.8
@@ -5,7 +5,7 @@
 .\"   the Free Software Foundation; either version 2 of the License, or
 .\"   (at your option) any later version.
 .\" See file COPYING in distribution for details.
-.TH MDADM 8 "" v2.6.9
+.TH MDADM 8 "" v3.0-rc1
 .SH NAME
 mdadm \- manage MD devices
 .I aka
@@ -17,7 +17,7 @@ Linux Software RAID
 
 .SH DESCRIPTION
 RAID devices are virtual devices created from two or more
-real block devices. This allows multiple devices (typically disk
+real block devices.  This allows multiple devices (typically disk
 drives or partitions thereof) to be combined into a single device to
 hold (for example) a single filesystem.
 Some RAID levels include redundancy and so can survive some degree of
@@ -38,60 +38,50 @@ md devices,
 .BR RAID6 ,
 .BR RAID10 ,
 .BR MULTIPATH ,
+.BR FAULTY ,
 and
-.BR FAULTY .
+.BR CONTAINER .
 
 .B MULTIPATH
 is not a Software RAID mechanism, but does involve
 multiple devices:
 each device is a path to one common physical storage device.
+New installations should not use md/multipath as it is not well
+supported and has no ongoing development.  Use the Device Mapper based
+multipath-tools instead.
 
 .B FAULTY
 is also not true RAID, and it only involves one device.  It
 provides a layer over a true device that can be used to inject faults.
 
-.\".I mdadm
-.\"is a program that can be used to create, manage, and monitor
-.\"MD devices.  As
-.\"such it provides a similar set of functionality to the
-.\".B raidtools
-.\"packages.
-.\"The key differences between
-.\".I mdadm
-.\"and
-.\".B raidtools
-.\"are:
-.\".IP \(bu 4
-.\".I mdadm
-.\"is a single program and not a collection of programs.
-.\".IP \(bu 4
-.\".I mdadm
-.\"can perform (almost) all of its functions without having a
-.\"configuration file and does not use one by default.  Also
-.\".I mdadm
-.\"helps with management of the configuration
-.\"file.
-.\".IP \(bu 4
-.\".I mdadm
-.\"can provide information about your arrays (through Query, Detail, and Examine)
-.\"that
-.\".B  raidtools
-.\"cannot.
-.\".P
-.\".I mdadm
-.\"does not use
-.\".IR /etc/raidtab ,
-.\"the
-.\".B raidtools
-.\"configuration file, at all.  It has a different configuration file
-.\"with a different format and a different purpose.
+.B CONTAINER
+is different again.  A
+.B CONTAINER
+is a collection of devices that are
+managed as a set.  This is similar to the set of devices connected to
+a hardware RAID controller.  The set of devices may contain a number
+of different RAID arrays each utilising some (or all) of the blocks from a
+number of the devices in the set.  For example, two devices in a 5-device set
+might form a RAID1 using the whole devices.  The remaining three might
+have a RAID5 over the first half of each device, and a RAID0 over the
+second half.
+
+With a
+.BR CONTAINER ,
+there is one set of metadata that describes all of
+the arrays in the container.  So when
+.I mdadm
+creates a
+.B CONTAINER
+device, the device just represents the metadata.  Other normal arrays (RAID1
+etc) can be created inside the container.
 
 .SH MODES
 mdadm has several major modes of operation:
 .TP
 .B Assemble
 Assemble the components of a previously created
-array into an active array. Components can be explicitly given
+array into an active array.  Components can be explicitly given
 or can be searched for.
 .I mdadm
 checks that the components
@@ -100,7 +90,7 @@ information so as to assemble a faulty array.
 
 .TP
 .B Build
-Build an array that doesn't have per-device superblocks.  For these
+Build an array that doesn't have per-device metadata (superblocks).  For these
 sorts of arrays,
 .I mdadm
 cannot differentiate between initial creation and subsequent assembly
@@ -112,15 +102,20 @@ what you are doing.
 
 .TP
 .B Create
-Create a new array with per-device superblocks.
-.\"It can progress
-.\"in several step create-add-add-run or it can all happen with one command.
+Create a new array with per-device metadata (superblocks).
+Appropriate metadata is written to each device, and then the array
+comprising those devices is activated.  A 'resync' process is started
+to make sure that the array is consistent (e.g. both sides of a mirror
+contain the same data) but the content of the device is left otherwise
+untouched.
+The array can be used as soon as it has been created.  There is no
+need to wait for the initial resync to finish.
 
 .TP
 .B "Follow or Monitor"
 Monitor one or more md devices and act on any state changes.  This is
-only meaningful for raid1, 4, 5, 6, 10 or multipath arrays, as
-only these have interesting state.  raid0 or linear never have
+only meaningful for RAID1, 4, 5, 6, 10 or multipath arrays, as
+only these have interesting state.  RAID0 or Linear never have
 missing, spare, or failed drives, so there is nothing to monitor.
 
 .TP
@@ -140,6 +135,13 @@ system.  As each device is detected,
 .I mdadm
 has a chance to include it in some array as appropriate.
 
+If a
+.B CONTAINER
+is passed to
+.I mdadm
+in this mode, then any arrays within that container will be assembled
+and started.
+
 .TP
 .B Manage
 This is for doing things to specific components of an array such as
@@ -195,7 +197,8 @@ work if
 is compiled into the kernel \(em not if it is a module.
 Arrays can be auto-detected by the kernel if all the components are in
 primary MS-DOS partitions with partition type
-.BR FD .
+.BR FD ,
+and all use v0.90 metadata.
 In-kernel autodetect is not recommended for new installations.  Using
 .I mdadm
 to detect and assemble arrays \(em possibly in an
@@ -208,7 +211,7 @@ If a device is given before any options, or if the first option is
 .BR \-\-fail ,
 or
 .BR \-\-remove ,
-then the MANAGE mode is assume.
+then the MANAGE mode is assumed.
 Anything other than these will cause the
 .B Misc
 mode to be assumed.
@@ -272,10 +275,12 @@ If the config file given is
 then nothing will be read, but
 .I mdadm
 will act as though the config file contained exactly
-.B "DEVICE partitions"
+.B "DEVICE partitions containers"
 and will read
 .B /proc/partitions
-to find a list of devices to scan.
+to find a list of devices to scan, and
+.B /proc/mdstat
+to find a list of containers to examine.
 If the word
 .B "none"
 is given for the config file, then
@@ -303,7 +308,7 @@ says to get a list of array devices from
 
 .TP
 .B \-e ", " \-\-metadata=
-Declare the style of superblock (raid metadata) to be used.  The
+Declare the style of RAID metadata (superblock) to be used.  The
 default is 0.90 for
 .BR \-\-create ,
 and to guess for other operations.
@@ -325,6 +330,20 @@ Use the new version-1 format superblock.  This has few restrictions.
 The different sub-versions store the superblock at different locations
 on the device, either at the end (for 1.0), at the start (for 1.1) or
 4K from the start (for 1.2).
+.IP ddf
+Use the "Industry Standard" DDF (Disk Data Format) format defined by
+SNIA.
+When creating a DDF array a
+.B CONTAINER
+will be created, and normal arrays can be created in that container.
+.IP imsm
+Use the Intel(R) Matrix Storage Manager metadata format.  This creates a
+.B CONTAINER
+which is managed in a similar manner to DDF, and is supported by an
+option-rom on some platforms:
+.IP
+.B http://www.intel.com/design/chipsets/matrixstorage_sb.htm
+.PP
 .RE
 
 .TP
@@ -336,7 +355,7 @@ should be considered the home for any arrays.
 
 When creating an array, the
 .B homehost
-will be recorded in the superblock.  For version-1 superblocks, it will
+will be recorded in the metadata.  For version-1 superblocks, it will
 be prefixed to the array name.  For version-0.90 superblocks, part of
 the SHA1 hash of the hostname will be stored in the later half of the
 UUID.
@@ -345,7 +364,9 @@ When reporting information about an array, any array which is tagged
 for the given homehost will be reported as such.
 
 When using Auto-Assemble, only arrays tagged for the given homehost
-will be assembled.
+will be allowed to use 'local' names (i.e. not ending in '_' followed
+by a digit string).  See below under
+.BR "Auto Assembly" .
 
 .SH For create, build, or grow:
 
@@ -356,30 +377,29 @@ number of spare devices (see below) must equal the number of
 .I component-devices
 (including "\fBmissing\fP" devices)
 that are listed on the command line for
-.BR  \-\-create .
+.BR \-\-create .
 Setting a value of 1 is probably
 a mistake and so requires that
 .B \-\-force
 be specified first.  A value of 1 will then be allowed for linear,
-multipath, raid0 and raid1.  It is never allowed for raid4 or raid5.
+multipath, RAID0 and RAID1.  It is never allowed for RAID4, RAID5 or RAID6.
 .br
 This number can only be changed using
 .B \-\-grow
-for RAID1, RAID5 and RAID6 arrays, and only on kernels which provide
-necessary support.
+for RAID1, RAID4, RAID5 and RAID6 arrays, and only on kernels which provide
+the necessary support.
 
 .TP
 .BR \-x ", " \-\-spare\-devices=
 Specify the number of spare (eXtra) devices in the initial array.
 Spares can also be added
 and removed later.  The number of component devices listed
-on the command line must equal the number of raid devices plus the
+on the command line must equal the number of RAID devices plus the
 number of spare devices.
 
-
 .TP
 .BR \-z ", " \-\-size=
-Amount (in Kibibytes) of space to use from each drive in RAID level 1/4/5/6.
+Amount (in Kibibytes) of space to use from each drive in RAID levels 1/4/5/6.
 This must be a multiple of the chunk size, and must leave about 128Kb
 of space at the end of the drive for the RAID superblock.
 If this is not specified
@@ -389,7 +409,7 @@ issued.
 
 This value can be set with
 .B \-\-grow
-for RAID level 1/4/5/6. If the array was created with a size smaller
+for RAID level 1/4/5/6.  If the array was created with a size smaller
 than the currently active drives, the extra space can be accessed
 using
 .BR \-\-grow .
@@ -397,20 +417,37 @@ The size can be given as
 .B max
 which means to choose the largest size that fits on all current drives.
 
+This value can not be used with
+.B CONTAINER
+metadata such as DDF and IMSM.
+
 .TP
 .BR \-c ", " \-\-chunk=
 Specify chunk size of kibibytes.  The default is 64.
+This is only meaningful for RAID0, RAID4, RAID5, RAID6, and RAID10.
 
 .TP
 .BR \-\-rounding=
-Specify rounding factor for linear array (==chunk size)
+Specify rounding factor for a Linear array.  The size of each
+component will be rounded down to a multiple of this size.
+This is a synonym for
+.B \-\-chunk
+but highlights the different meaning for Linear as compared to other
+RAID levels.
 
 .TP
 .BR \-l ", " \-\-level=
-Set raid level.  When used with
+Set RAID level.  When used with
 .BR \-\-create ,
 options are: linear, raid0, 0, stripe, raid1, 1, mirror, raid4, 4,
-raid5, 5, raid6, 6, raid10, 10, multipath, mp, faulty.  Obviously some of these are synonymous.
+raid5, 5, raid6, 6, raid10, 10, multipath, mp, faulty, container.
+Obviously some of these are synonymous.
+
+When a
+.B CONTAINER
+metadata type is requested, only the
+.B container
+level is permitted, and it does not need to be explicitly given.
 
 When used with
 .BR \-\-build ,
@@ -421,11 +458,11 @@ Not yet supported with
 
 .TP
 .BR \-p ", " \-\-layout=
-This option configures the fine details of data layout for raid5,
-and raid10 arrays, and controls the failure modes for
+This option configures the fine details of data layout for RAID5, RAID6,
+and RAID10 arrays, and controls the failure modes for
 .IR faulty .
 
-The layout of the raid5 parity block can be one of
+The layout of the RAID5 parity block can be one of
 .BR left\-asymmetric ,
 .BR left\-symmetric ,
 .BR right\-asymmetric ,
@@ -434,6 +471,31 @@ The layout of the raid5 parity block can be one of
 The default is
 .BR left\-symmetric .
 
+It is also possible to cause RAID5 to use a RAID4-like layout by
+choosing
+.BR parity\-first ,
+or
+.BR parity\-last .
+
+Finally for RAID5 there are DDF\-compatible layouts,
+.BR ddf\-zero\-restart ,
+.BR ddf\-N\-restart ,
+and
+.BR ddf\-N\-continue .
+
+These same layouts are available for RAID6.  There are also 4 layouts
+that will provide an intermediate stage for converting between RAID5
+and RAID6.  These provide a layout which is identical to the
+corresponding RAID5 layout on the first N\-1 devices, and has the 'Q'
+syndrome (the second 'parity' block used by RAID6) on the last device.
+These layouts are:
+.BR left\-symmetric\-6 ,
+.BR right\-symmetric\-6 ,
+.BR left\-asymmetric\-6 ,
+.BR right\-asymmetric\-6 ,
+and
+.BR parity\-first\-6 .
+
 When setting the failure mode for level
 .I faulty,
 the options are:
@@ -467,7 +529,7 @@ Finally, the layout options for RAID10 are one of 'n', 'o' or 'f' followed
 by a small number.  The default is 'n2'.  The supported options are:
 
 .I 'n'
-signals 'near' copies. Multiple copies of one data block are at
+signals 'near' copies.  Multiple copies of one data block are at
 similar offsets in different devices.
 
 .I 'o'
@@ -480,7 +542,7 @@ down.
 .I 'f'
 signals 'far' copies
 (multiple copies have very different offsets).
-See md(4) for more detail about 'near' and 'far'.
+See md(4) for more detail about 'near', 'offset', and 'far'.
 
 The number is the number of copies of each datablock.  2 is normal, 3
 can be useful.  This number can be at most equal to the number of
@@ -518,7 +580,7 @@ Storing bitmap files on other filesystems may result in serious problems.
 
 .TP
 .BR \-\-bitmap\-chunk=
-Set the chunksize of the bitmap. Each bit corresponds to that many
+Set the chunksize of the bitmap.  Each bit corresponds to that many
 Kilobytes of storage.
 When using a file based bitmap, the default is to use the smallest
 size that is at-least 4 and requires no more than 2^21 chunks.
@@ -527,10 +589,9 @@ When using an
 bitmap, the chunksize is automatically determined to make best use of
 available space.
 
-
 .TP
 .BR \-W ", " \-\-write\-mostly
-subsequent devices lists in a
+subsequent devices listed in a
 .BR \-\-build ,
 .BR \-\-create ,
 or
@@ -543,8 +604,8 @@ slow link.
 .TP
 .BR \-\-write\-behind=
 Specify that write-behind mode should be enabled (valid for RAID1
-only). If an argument is specified, it will set the maximum number
-of outstanding writes allowed. The default value is 256.
+only).  If an argument is specified, it will set the maximum number
+of outstanding writes allowed.  The default value is 256.
 A write-intent bitmap is required in order to use write-behind
 mode, and write-behind is only attempted on drives marked as
 .IR write-mostly .
@@ -558,24 +619,33 @@ when trying to recover from a major failure as you can be sure that no
 data will be affected unless you actually write to the array.  It can
 also be used when creating a RAID1 or RAID10 if you want to avoid the
 initial resync, however this practice \(em while normally safe \(em is not
-recommended.   Use this only if you really know what you are doing.
+recommended.  Use this only if you really know what you are doing.
 
 .TP
 .BR \-\-backup\-file=
 This is needed when
 .B \-\-grow
 is used to increase the number of
-raid-devices in a RAID5 if there  are no spare devices available.
-See the section below on RAID_DEVICE CHANGES.  The file should be
-stored on a separate device, not on the raid array being reshaped.
+raid-devices in a RAID5 if there are no spare devices available.
+See the GROW MODE section below on RAID\-DEVICES CHANGES.  The file
+should be stored on a separate device, not on the RAID array being
+reshaped.
 
 .TP
 .BR \-N ", " \-\-name=
 Set a
 .B name
 for the array.  This is currently only effective when creating an
-array with a version-1 superblock.  The name is a simple textual
-string that can be used to identify array components when assembling.
+array with a version-1 superblock, or an array in a DDF container.
+The name is a simple textual string that can be used to identify array
+components when assembling.  If name is needed but not specified, it
+is taken from the basename of the device that is being created.
+e.g. when creating
+.I /dev/md/home
+the
+.B name
+will default to
+.IR home .
 
 .TP
 .BR \-R ", " \-\-run
@@ -594,20 +664,24 @@ Insist that
 accept the geometry and layout specified without question.  Normally
 .I mdadm
 will not allow creation of an array with only one device, and will try
-to create a raid5 array with one missing drive (as this makes the
+to create a RAID5 array with one missing drive (as this makes the
 initial resync work faster).  With
 .BR \-\-force ,
 .I mdadm
 will not try to be so clever.
 
 .TP
-.BR \-a ", " "\-\-auto{=no,yes,md,mdp,part,p}{NN}"
-Instruct mdadm to create the device file if needed, possibly allocating
+.BR \-a ", " "\-\-auto{=yes,md,mdp,part,p}{NN}"
+Instruct mdadm how to create the device file if needed, possibly allocating
 an unused minor number.  "md" causes a non-partitionable array
-to be used.  "mdp", "part" or "p" causes a partitionable array (2.6 and
+to be used (though since Linux 2.6.28, these array devices are in fact
+partitionable).  "mdp", "part" or "p" causes a partitionable array (2.6 and
 later) to be used.  "yes" requires the named md device to have
 a 'standard' format, and the type and minor number will be determined
-from this.  See DEVICE NAMES below.
+from this.  With mdadm 3.0, device creation is normally left up to
+.I udev
+so this option is unlikely to be needed.
+See DEVICE NAMES below.
 
 The argument can also come immediately after
 "\-a".  e.g. "\-ap".
@@ -633,45 +707,48 @@ partitions.  A different number of partitions can be specified at the
 end of this option (e.g.
 .BR \-\-auto=p7 ).
 If the device name ends with a digit, the partition names add a 'p',
-and a number, e.g. "/dev/home1p3".  If there is no
-trailing digit, then the partition names just have a number added,
-e.g. "/dev/scratch3".
+and a number, e.g.
+.IR /dev/md/home1p3 .
+If there is no trailing digit, then the partition names just have a
+number added, e.g.
+.IR /dev/md/scratch3 .
 
 If the md device name is in a 'standard' format as described in DEVICE
 NAMES, then it will be created, if necessary, with the appropriate
-number based on that name.  If the device name is not in one of these
-formats, then a unused minor number will be allocated.  The minor
+device number based on that name.  If the device name is not in one of these
+formats, then an unused device number will be allocated.  The device
 number will be considered unused if there is no active array for that
 number, and there is no entry in /dev for that number and with a
-non-standard name.
-
-.TP
-.BR \-\-symlink = no
-Normally when
-.B \-\-auto
-causes
-.I mdadm
-to create devices in
-.B /dev/md/
-it will also create symlinks from
-.B /dev/
-with names starting with
-.B md
-or
-.BR md_ .
-Use
-.B \-\-symlink=no
-to suppress this, or
-.B \-\-symlink=yes
-to enforce this even if it is suppressing
-.IR mdadm.conf .
-
+non-standard name.  Names that are not in 'standard' format are only
+allowed in "/dev/md/".
+
+.\".TP
+.\".BR \-\-symlink = no
+.\"Normally when
+.\".B \-\-auto
+.\"causes
+.\".I mdadm
+.\"to create devices in
+.\".B /dev/md/
+.\"it will also create symlinks from
+.\".B /dev/
+.\"with names starting with
+.\".B md
+.\"or
+.\".BR md_ .
+.\"Use
+.\".B \-\-symlink=no
+.\"to suppress this, or
+.\".B \-\-symlink=yes
+.\"to enforce this even if it is suppressing
+.\".IR mdadm.conf .
+.\"
 
 .SH For assemble:
 
 .TP
 .BR \-u ", " \-\-uuid=
-uuid of array to assemble. Devices which don't have this uuid are
+uuid of array to assemble.  Devices which don't have this uuid are
 excluded
 
 .TP
@@ -691,6 +768,12 @@ e.g. when assembling
 .B \-\-super\-minor=dev
 will look for super blocks with a minor number of 0.
 
+.B \-\-super\-minor
+is only relevant for v0.90 metadata, and should not normally be used.
+Using
+.B \-\-uuid
+is much safer.
+
 .TP
 .BR \-N ", " \-\-name=
 Specify the name of the array to assemble.  This must be the name
@@ -702,7 +785,15 @@ prefixed to the start of the given name.
 
 .TP
 .BR \-f ", " \-\-force
-Assemble the array even if some superblocks appear out-of-date
+Assemble the array even if the metadata on some devices appears to be
+out-of-date.  If
+.I mdadm
+cannot find enough working devices to start the array, but can find
+some devices that are recorded as having failed, then it will mark
+those devices as working so that the array can be started.
+An array which requires
+.B \-\-force
+to be started may contain data corruption.  Use it carefully.
 
 .TP
 .BR \-R ", " \-\-run
@@ -783,7 +874,7 @@ This can be useful if
 reports a different "Preferred Minor" to
 .BR \-\-detail .
 In some cases this update will be performed automatically
-by the kernel driver. In particular the update happens automatically
+by the kernel driver.  In particular the update happens automatically
 at the first write to an array with redundancy (RAID level 1 or
 greater) on a 2.6 (or later) kernel.
 
@@ -817,8 +908,8 @@ The
 .B resync
 option will cause the array to be marked
 .I dirty
-meaning that any redundancy in the array (e.g. parity for raid5,
-copies for raid1) may be incorrect.  This will cause the raid system
+meaning that any redundancy in the array (e.g. parity for RAID5,
+copies for RAID1) may be incorrect.  This will cause the RAID system
 to perform a "resync" pass to make sure that all redundant information
 is correct.
 
@@ -836,7 +927,7 @@ with original (Version 0.90) superblocks.
 
 The
 .B summaries
-option will correct the summaries in the superblock. That is the
+option will correct the summaries in the superblock.  That is the
 counts of total, working, active, failed, and spare devices.
 
 The
@@ -855,6 +946,7 @@ This will cause
 to determine the maximum usable amount of space on each device and
 update the relevant field in the metadata.
 
+.ig XX
 .TP
 .B \-\-auto\-update\-homehost
 This flag is only meaningful with auto-assembly (see discussion below).
@@ -862,16 +954,27 @@ In that situation, if no suitable arrays are found for this homehost,
 .I mdadm
 will rescan for any arrays at all and will assemble them and update the
 homehost to match the current host.
+.XX
 
 .SH For Manage mode:
 
 .TP
 .BR \-a ", " \-\-add
-hot-add listed devices.
+hot-add listed devices.  For arrays with redundancy, the listed
+devices become available as spares.  If the array is degraded, it will
+immediately start recovering data on to one of these spares.
 
 .TP
 .BR \-\-re\-add
-re-add a device that was recently removed from an array.
+re-add a device that was recently removed from an array.  This is only
+needed for arrays that have been built (i.e. with
+.BR \-\-build ).
+For created arrays, devices are always re-added if that is possible.
+When re-adding a device, if nothing has changed on the array since the
+device was removed, no recovery is performed.  Also, if the array has
+a write-intent bitmap, then the recovery performed after a re-add will
+be limited to those blocks which, according to the bitmap, might have
+changed since the device was removed.
 
 .TP
 .BR \-r ", " \-\-remove
@@ -908,18 +1011,17 @@ same as
 .TP
 .BR \-\-write\-mostly
 Subsequent devices that are added or re-added will have the 'write-mostly'
-flag set.  This is only valid for RAID! and means that the 'md' driver
+flag set.  This is only valid for RAID1 and means that the 'md' driver
 will avoid reading from these devices if possible.
 .TP
 .BR \-\-readwrite
 Subsequent devices that are added or re-added will have the 'write-mostly'
 flag cleared.
 
-
 .P
-Each of these options require that the first device listed is the array
+Each of these options requires that the first device listed is the array
 to be acted upon, and the remainder are component devices to be added,
-removed, or marked as faulty.  Several different operations can be
+removed, marked as faulty, etc.  Several different operations can be
 specified for different devices, e.g.
 .in +5
 mdadm /dev/md0 \-\-add /dev/sda1 \-\-fail /dev/sdb1 \-\-remove /dev/sdb1
@@ -952,7 +1054,12 @@ Information about what is discovered is presented.
 
 .TP
 .BR \-D ", " \-\-detail
-Print detail of one or more md devices.
+Print details of one or more md devices.
+
+.TP
+.BR \-\-detail\-platform
+Print details of the platform's RAID capabilities (firmware / hardware
+topology) for a given metadata format.
 
 .TP
 .BR \-Y ", " \-\-export
@@ -966,12 +1073,21 @@ pairs for easy import into the environment.
 
 .TP
 .BR \-E ", " \-\-examine
-Print content of md superblock on device(s).
+Print contents of the metadata stored on the named device(s).
+Note the contrast between
+.B \-\-examine
+and
+.BR \-\-detail .
+.B \-\-examine
+applies to devices which are components of an array, while
+.B \-\-detail
+applies to a whole array which is currently active.
 .TP
 .B \-\-sparc2.2
-If an array was created on a 2.2 Linux kernel patched with RAID
-support, the superblock will have been created incorrectly, or at
-least incompatibly with 2.4 and later kernels.  Using the
+If an array was created on a SPARC machine with a 2.2 Linux kernel
+patched with RAID support, the superblock will have been created
+incorrectly, or at least incompatibly with 2.4 and later kernels.
+Using the
 .B \-\-sparc2.2
 flag with
 .B \-\-examine
@@ -983,11 +1099,19 @@ the right thing, then the array can be successfully assembled using
 .BR \-X ", " \-\-examine\-bitmap
 Report information about a bitmap file.
 The argument is either an external bitmap file or an array component
-in case of an internal bitmap.
+in case of an internal bitmap.  Note that running this on an array
+device (e.g.
+.BR /dev/md0 )
+does not report the bitmap for that array.
 
 .TP
 .BR \-R ", " \-\-run
-start a partially built array.
+start a partially assembled array.  If
+.B \-\-assemble
+did not find enough devices to fully start the array, it might leave
+it partially assembled.  If you wish, you can then use
+.B \-\-run
+to start the array in degraded mode.
 
 .TP
 .BR \-S ", " \-\-stop
@@ -1015,7 +1139,9 @@ When used with
 .BR \-\-detail ,
 the exit status of
 .I mdadm
-is set to reflect the status of the device.
+is set to reflect the status of the device.  See below in
+.B MISC MODE
+for details.
 
 .TP
 .BR \-W ", " \-\-wait
@@ -1025,6 +1151,20 @@ activity to finish before returning.
 will return with success if it actually waited for every device
 listed, otherwise it will return failure.
 
+.TP
+.BR \-\-wait\-clean
+For each md device given, or each device in /proc/mdstat if
+.B \-\-scan
+is given, arrange for the array to be marked clean as soon as possible.
+Also, quiesce resync so that the monitor for external metadata arrays
+(mdmon) has an opportunity to checkpoint the resync position.
+.I mdadm
+will return with success if the array uses external metadata and we
+successfully waited.  For native arrays this returns immediately as the
+kernel handles both dirty-clean transitions and resync checkpointing in
+the kernel at shutdown.  No action is taken if safe-mode handling is
+disabled.
+
 .SH For Incremental Assembly mode:
 .TP
 .BR \-\-rebuild\-map ", " \-r
@@ -1039,6 +1179,11 @@ uses to help track which arrays are currently being assembled.
 Run any array assembled as soon as a minimal number of devices are
 available, rather than waiting until all expected devices are present.
 
+.TP
+.B \-\-no\-degraded
+This allows the hot-plug system to prevent arrays from running when it knows
+that more disks may arrive later in the discovery process.
+
 .TP
 .BR \-\-scan ", " \-s
 Only meaningful with
@@ -1070,14 +1215,17 @@ facility of 'daemon' and varying priorities.
 Give a delay in seconds.
 .I mdadm
 polls the md arrays and then waits this many seconds before polling
-again.  The default is 60 seconds.
+again.  The default is 60 seconds.  Since 2.6.16, there is no need to
+reduce this as the kernel alerts
+.I mdadm
+immediately when there is any change.
 
 .TP
 .BR \-f ", " \-\-daemonise
 Tell
 .I mdadm
 to run as a background daemon if it decides to monitor anything.  This
-causes it to fork and run in the child, and to disconnect form the
+causes it to fork and run in the child, and to disconnect from the
 terminal.  The process id of the child is written to stdout.
 This is useful with
 .B \-\-scan
@@ -1122,16 +1270,16 @@ Usage:
 .HP 12
 Usage:
 .B mdadm \-\-assemble \-\-scan
-.I  md-devices-and-options...
+.I md-devices-and-options...
 .HP 12
 Usage:
 .B mdadm \-\-assemble \-\-scan
-.I  options...
+.I options...
 
 .PP
-This usage assembles one or more raid arrays from pre-existing components.
+This usage assembles one or more RAID arrays from pre-existing components.
 For each array, mdadm needs to know the md device, the identity of the
-array, and a number of component-devices. These can be found in a number of ways.
+array, and a number of component-devices.  These can be found in a number of ways.
 
 In the first usage example (without the
 .BR \-\-scan )
@@ -1139,7 +1287,9 @@ the first device given is the md device.
 In the second usage example, all devices listed are treated as md
 devices and assembly is attempted.
 In the third (where no devices are listed) all md devices that are
-listed in the configuration file are assembled.
+listed in the configuration file are assembled.  If no arrays are
+described by the configuration file, then any arrays that
+can be found on unused devices will be assembled.
 
 If precisely one device is listed, but
 .B \-\-scan
@@ -1151,7 +1301,9 @@ was given and identity information is extracted from the configuration file.
 
 The identity can be given with the
 .B \-\-uuid
-option, with the
+option, the
+.B \-\-name
+option, or the
 .B \-\-super\-minor
 option, will be taken from the md-device record in the config file, or
 will be taken from the super block of the first component-device
@@ -1159,7 +1311,7 @@ listed on the command line.
 
 Devices can be given on the
 .B \-\-assemble
-command line or in the config file. Only devices which have an md
+command line or in the config file.  Only devices which have an md
 superblock which contains the right identity will be considered for
 any array.
 
@@ -1169,6 +1321,8 @@ or requested with (a possibly implicit)
 .BR \-\-scan .
 In the later case,
 .B /etc/mdadm.conf
+or
+.B /etc/mdadm/mdadm.conf
 is used.
 
 If
@@ -1178,50 +1332,59 @@ identity of md arrays.
 
 Normally the array will be started after it is assembled.  However if
 .B \-\-scan
-is not given and insufficient drives were listed to start a complete
-(non-degraded) array, then the array is not started (to guard against
-usage errors).  To insist that the array be started in this case (as
-may work for RAID1, 4, 5, 6, or 10), give the
+is not given and not all expected drives were listed, then the array
+is not started (to guard against usage errors).  To insist that the
+array be started in this case (as may work for RAID1, 4, 5, 6, or 10),
+give the
 .B \-\-run
 flag.
 
-If the md device does not exist, then it will be created providing the
-intent is clear. i.e. the name must be in a standard form, or the
-.B \-\-auto
-option must be given to clarify how and whether the device should be
-created.
-This can be useful for handling partitioned devices (which don't have
-a stable device number \(em it can change after a reboot) and when using
-"udev" to manage your
+If
+.I udev
+is active,
+.I mdadm
+does not create any entries in
 .B /dev
-tree (udev cannot handle md devices because of the unusual device
-initialisation conventions).
+but leaves that to
+.IR udev .
+It does record information in
+.B /var/run/mdadm/map
+which will allow
+.I udev
+to choose the correct name.
 
-If the option to "auto" is "mdp" or "part" or (on the command line
-only) "p", then mdadm will create a partitionable array, using the
-first free one that is not in use and does not already have an entry
-in /dev (apart from numeric /dev/md* entries).
+If
+.I mdadm
+detects that udev is not configured, it will create the devices in
+.B /dev
+itself.
 
-If the option to "auto" is "yes" or "md" or (on the command line)
-nothing, then mdadm will create a traditional, non-partitionable md
-array.
+In Linux kernels prior to version 2.6.28 there were two distinctly
+different types of md devices that could be created: one that could be
+partitioned using standard partitioning tools and one that could not.
+Since 2.6.28 that distinction is no longer relevant as both types of
+devices can be partitioned.
+.I mdadm
+will normally create the type that originally could not be partitioned
+as it has a well defined major number (9).
 
-It is expected that the "auto" functionality will be used to create
-device entries with meaningful names such as "/dev/md/home" or
-"/dev/md/root", rather than names based on the numerical array number.
+Prior to 2.6.28, it is important that mdadm chooses the correct type
+of array device to use.  This can be controlled with the
+.B \-\-auto
+option.  In particular, a value of "mdp" or "part" or "p" tells mdadm
+to use a partitionable device rather than the default.
 
-When using option "auto" to create a partitionable array, the device
-files for the first 4 partitions are also created. If a different
-number is required it can be simply appended to the auto option.
-e.g. "auto=part8".  Partition names are created by appending a digit
-string to the device name, with an intervening "p" if the device name
-ends with a digit.
+In the no-udev case, the value given to
+.B \-\-auto
+can be suffixed by a number.  This tells
+.I mdadm
+to create that number of partition devices rather than the default of 4.
 
-The
+The value given to
 .B \-\-auto
-option is also available in Build and Create modes.  As those modes do
-not use a config file, the "auto=" config option does not apply to
-these modes.
+can also be given in the configuration file as a word starting
+.B auto=
+on the ARRAY line for the relevant array.
 
 .SS Auto Assembly
 When
@@ -1233,16 +1396,15 @@ and no devices are listed,
 will first attempt to assemble all the arrays listed in the config
 file.
 
-If a
-.B homehost
-has been specified (either in the config file or on the command line),
-.I mdadm
-will look further for possible arrays and will try to assemble
-anything that it finds which is tagged as belonging to the given
-homehost.  This is the only situation where
-.I mdadm
-will assemble arrays without being given specific device name or
-identity information for the array.
+If no array is listed in the config (other than those marked
+.BR <ignore> )
+it will look through the available devices for possible arrays and
+will try to assemble anything that it finds.  Arrays which are tagged
+as belonging to the given homehost will be assembled and started
+normally.  Arrays which do not obviously belong to this host are given
+names that are expected not to conflict with anything local, and are
+started "read-auto" so that nothing is written to any device until the
+array is written to. i.e.  automatic resync etc is delayed.
 
 If
 .I mdadm
@@ -1258,9 +1420,10 @@ so for example
 If the array uses version-1 metadata, then the
 .B name
 from the superblock is used to similarly create a name in
-.BR /dev/md
+.B /dev/md/
 (the name will have any 'host' prefix stripped first).
 
+.ig XX
 If
 .I mdadm
 cannot find any array for the given host at all, and if
@@ -1278,6 +1441,7 @@ homehost tagging.
 The reason for requiring arrays to be tagged with the homehost for
 auto assembly is to guard against problems that can arise when moving
 devices from one host to another.
+.XX
 
 .SH BUILD MODE
 
@@ -1293,14 +1457,16 @@ Usage:
 .PP
 This usage is similar to
 .BR \-\-create .
-The difference is that it creates an array without a superblock. With
+The difference is that it creates an array without a superblock.  With
 these arrays there is no difference between initially creating the array and
 subsequently assembling the array, except that hopefully there is useful
 data there in the second case.
 
+The level may be raid0, linear, raid1, raid10, multipath, or faulty, or
-synonyms. All devices must be listed and the array will be started
-once complete.
+The level may raid0, linear, raid1, raid10, multipath, or faulty, or
+one of their synonyms.  All devices must be listed and the array will
+be started once complete.  It will often be appropriate to use
+.B \-\-assume\-clean
+with levels raid1 or raid10.
 
 .SH CREATE MODE
 
@@ -1312,20 +1478,20 @@ Usage:
 .BI \-\-level= Y
 .br
 .BI \-\-raid\-devices= Z
-.I  devices
+.I devices
 
 .PP
 This usage will initialise a new md array, associate some devices with
 it, and activate the array.
 
-If the
-.B \-\-auto
-option is given (as described in more detail in the section on
-Assemble mode), then the md device will be created with a suitable
-device number if necessary.
+The named device will normally not exist when
+.I "mdadm \-\-create"
+is run, but will be created by
+.I udev
+once the array becomes active.
 
-As devices are added, they are checked to see if they contain raid
-superblocks or filesystems. They are also checked to see if the variance in
+As devices are added, they are checked to see if they contain RAID
+superblocks or filesystems.  They are also checked to see if the variance in
 device size exceeds 1%.
 
 If any discrepancy is found, the array will not automatically be run, though
@@ -1347,9 +1513,9 @@ others can be
 When creating a RAID5 array,
 .I mdadm
 will automatically create a degraded array with an extra spare drive.
-This is because building the spare into a degraded array is in general faster than resyncing
-the parity on a non-degraded, but not clean, array.  This feature can
-be overridden with the
+This is because building the spare into a degraded array is in general
+faster than resyncing the parity on a non-degraded, but not clean,
+array.  This feature can be overridden with the
 .B \-\-force
 option.
 
@@ -1371,11 +1537,11 @@ is being created, then the name
 .B home
 will be used.
 
-When creating a partition based array, using 
-.I mdadm 
-with version-1.x metadata, the partition type should be set to 
+When creating a partition based array, using
+.I mdadm
+with version-1.x metadata, the partition type should be set to
 .B 0xDA
-(non fs-data). This type selection allows for greater precision since
+(non fs-data).  This type selection allows for greater precision since
 using any other [RAID auto-detect (0xFD) or a GNU/Linux partition (0x83)],
 might create problems in the event of array recovery through a live cdrom.
 
@@ -1398,6 +1564,16 @@ setting.
 .\".B \-\-size
 .\"is given, the apparent size of the smallest drive given is used.
 
+When creating an array within a
+.B CONTAINER
+.I mdadm
+can be given either the list of devices to use, or simply the name of
+the container.  The former case gives control over which devices in
+the container will be used for the array.  The latter case allows
+.I mdadm
+to automatically choose which devices to use based on how much spare
+space is available.
+
 The General Management options that are valid with
 .B \-\-create
 are:
@@ -1410,7 +1586,6 @@ be in use.
 .B \-\-readonly
 start the array readonly \(em not supported yet.
 
-
 .SH MANAGE MODE
 .HP 12
 Usage:
@@ -1421,7 +1596,7 @@ Usage:
 
 This usage will allow individual devices in an array to be failed,
 removed or added.  It is possible to perform multiple operations with
-on command. For example:
+one command.  For example:
 .br
 .B "  mdadm /dev/md0 \-f /dev/hda1 \-r /dev/hda1 \-a /dev/hda1"
 .br
@@ -1433,12 +1608,20 @@ and will then remove it from the array and finally add it back
 in as a spare.  However only one md array can be affected by a single
 command.
 
+When a device is added to an active array, mdadm checks to see if it
+has metadata on it which suggests that it was recently a member of the
+array.  If it does, it tries to "re-add" the device.  If there have
+been no changes since the device was removed, or if the array has a
+write-intent bitmap which has recorded whatever changes there were,
+then the device will immediately become a full member of the array and
+those differences recorded in the bitmap will be resolved.
+
 .SH MISC MODE
 .HP 12
 Usage:
 .B mdadm
 .I options ...
-.I devices  ...
+.I devices ...
 .PP
 
 MISC mode includes a number of distinct operations that
@@ -1453,7 +1636,7 @@ The information discovered is reported.
 .TP
 .B \-\-detail
 The device should be an active md device.
-.B   mdadm
+.B mdadm
 will display a detailed description of the array.
 .B \-\-brief
 or
@@ -1483,6 +1666,26 @@ The array has multiple failed devices such that it is unusable.
 There was an error while trying to get information about the device.
 .RE
 
+.TP
+.B \-\-detail\-platform
+Print detail of the platform's RAID capabilities (firmware / hardware
+topology).  If the metadata is specified with
+.B \-e
+or
+.B \-\-metadata=
+then the return status will be:
+.RS
+.TP
+0
+metadata successfully enumerated its platform components on this system
+.TP
+1
+metadata is platform independent
+.TP
+2
+metadata failed to find its platform components on this system
+.RE
+
 .TP
 .B \-\-examine
 The device should be a component of an md array.
@@ -1534,7 +1737,6 @@ For
 .B \-\-scan
 causes all devices listed in the config file to be examined.
 
-
 .SH MONITOR MODE
 
 .HP 12
@@ -1560,7 +1762,7 @@ and if the destination array has a failed drive but no spares.
 
 If any devices are listed on the command line,
 .I mdadm
-will only monitor those devices. Otherwise all arrays listed in the
+will only monitor those devices.  Otherwise all arrays listed in the
 configuration file will be monitored.  Further, if
 .B \-\-scan
 is given, then any other md devices that appear in
@@ -1644,7 +1846,7 @@ device has been successfully rebuilt and has been made active.
 .B NewArray
 A new md array has been detected in the
 .B /proc/mdstat
-file.   (syslog priority: Info)
+file.  (syslog priority: Info)
 
 .TP
 .B DegradedArray
@@ -1733,7 +1935,7 @@ The GROW mode is used for changing the size or shape of an active
 array.
 For this to work, the kernel must support the necessary change.
 Various types of growth are being added during 2.6 development,
-including restructuring a raid5 array to have more active devices.
+including restructuring a RAID5 array to have more active devices.
 
 Currently the only support available is to
 .IP \(bu 4
@@ -1746,6 +1948,10 @@ add a write-intent bitmap to any array which supports these bitmaps, or
 remove a write-intent bitmap from such an array.
 .PP
 
+GROW mode is not currently supported for
+.B CONTAINERS
+or arrays inside containers.
+
 .SS SIZE CHANGES
 Normally when an array is built the "size" it taken from the smallest
 of the drives.  If all the small drives in an arrays are, one at a
@@ -1760,7 +1966,11 @@ Note that when an array changes size, any filesystem that may be
 stored in the array will not automatically grow to use the space.  The
 filesystem will need to be explicitly told to use the extra space.
 
-.SS RAID-DEVICES CHANGES
+Also the size of an array cannot be changed while it has an active
+bitmap.  If an array has a bitmap, it must be removed before the size
+can be changed.  Once the change is complete, a new bitmap can be created.
+
+.SS RAID\-DEVICES CHANGES
 
 A RAID1 array can work with any number of devices from 1 upwards
 (though 1 is not very useful).  There may be times which you want to
@@ -1778,9 +1988,9 @@ present will be activated immediately.
 Increasing the number of active devices in a RAID5 is much more
 effort.  Every block in the array will need to be read and written
 back to a new location.  From 2.6.17, the Linux Kernel is able to do
-this safely, including restart and interrupted "reshape".
+this safely, including restarting an interrupted "reshape".
 
-When relocating the first few stripes on a raid5, it is not possible
+When relocating the first few stripes on a RAID5, it is not possible
 to keep the data on disk completely consistent and crash-proof.  To
 provide the required safety, mdadm disables writes to the array while
 this "critical section" is reshaped, and takes a backup of the data
@@ -1798,7 +2008,7 @@ to restore the backup and reassemble the array.
 A write-intent bitmap can be added to, or removed from, an active
 array.  Either internal bitmaps, or bitmaps stored in a separate file,
 can be added.  Note that if you add a bitmap stored in a file which is
-in a filesystem that is on the raid array being affected, the system
+in a filesystem that is on the RAID array being affected, the system
 will deadlock.  The bitmap must be on a separate filesystem.
 
 .SH INCREMENTAL MODE
@@ -1816,7 +2026,6 @@ Usage:
 Usage:
 .B mdadm \-\-incremental \-\-run \-\-scan
 
-
 .PP
 This mode is designed to be used in conjunction with a device
 discovery system.  As devices are found in a system, they can be
@@ -1824,6 +2033,13 @@ passed to
 .B "mdadm \-\-incremental"
 to be conditionally added to an appropriate array.
 
+If the device passed is a
+.B CONTAINER
+device created by a previous call to
+.IR mdadm ,
+then rather than trying to add that device to an array, all the arrays
+described by the metadata of the container will be started.
+
 .I mdadm
 performs a number of tests to determine if the device is part of an
 array, and which array it should be part of.  If an appropriate array
@@ -1837,14 +2053,6 @@ will only add devices to an array which were previously working
 (active or spare) parts of that array.  It does not currently support
 automatic inclusion of a new drive as a spare in some array.
 
-.B "mdadm \-\-incremental"
-requires a bug-fix in all kernels through 2.6.19.
-Hopefully, this will be fixed in 2.6.20; alternately, apply the patch
-which is included with the mdadm source distribution.  If
-.I mdadm
-detects that this bug is present, it will abort any attempt to use
-.BR \-\-incremental .
-
 The tests that
 .I mdadm
 makes are as follow:
@@ -1877,6 +2085,7 @@ finds any known version of metadata.  If no
 .I md
 metadata is found, the device is rejected.
 
+.ig XX
 .IP +
 Does the metadata match an expected array?
 The metadata can match in two ways.  Either there is an array listed
@@ -1894,14 +2103,16 @@ If
 .I mdadm
 is not able to positively identify the array as belonging to the
 current host, the device will be rejected.
+.XX
 
-.IP +
 .I mdadm
 keeps a list of arrays that it has partially assembled in
 .B /var/run/mdadm/map
 (or
 .B /var/run/mdadm.map
-if the directory doesn't exist).  If no array exists which matches
+if the directory doesn't exist.  Or maybe even
+.BR /dev/.mdadm.map ).
+If no array exists which matches
 the metadata on the new device,
 .I mdadm
 must choose a device name and unit number.  It does this based on any
@@ -1918,7 +2129,13 @@ line in
 suggests that a non-partitionable array is preferred, that will be
 honoured.
 
-.IP +
+If the array is not found in the config file and its metadata does not
+identify it as belonging to the "homehost", then
+.I mdadm
+will choose a name for the array which is certain not to conflict with
+any array which does belong to this host.  It does this by adding an
+underscore and a small number to the name preferred by the metadata.
+
 Once an appropriate array is found or created and the device is added,
 .I mdadm
 must decide if the array is ready to be started.  It will
@@ -1932,8 +2149,8 @@ As an alternative,
 may be passed to
 .I mdadm
 in which case the array will be run as soon as there are enough
-devices present for the data to be accessible.  For a raid1, that
-means one device will start the array.  For a clean raid5, the array
+devices present for the data to be accessible.  For a RAID1, that
+means one device will start the array.  For a clean RAID5, the array
 will be started as soon as all but one drive is present.
 
 Note that neither of these approaches is really ideal.  If it can
@@ -1948,11 +2165,33 @@ that no metadata updates are made and no attempt at resync or recovery
 happens.  Further devices that are found before the first write can
 still be added safely.
 
+.SH ENVIRONMENT
+This section describes environment variables that affect how mdadm
+operates.
+
+.TP
+.B MDADM_NO_MDMON
+Setting this value to 1 will prevent mdadm from automatically launching
+mdmon.  This variable is intended primarily for debugging mdadm/mdmon.
+
+.TP
+.B MDADM_NO_UDEV
+Normally,
+.I mdadm
+does not create any device nodes in /dev, but leaves that task to
+.IR udev .
+If
+.I udev
+appears not to be configured, or if this environment variable is set
+to '1', then
+.I mdadm
+will create any devices that are needed.
+
 .SH EXAMPLES
 
 .B "  mdadm \-\-query /dev/name-of-device"
 .br
-This will find out if a given device is a raid array, or is part of
+This will find out if a given device is a RAID array, or is part of
 one, and will provide brief information about the device.
 
 .B "  mdadm \-\-assemble \-\-scan"
@@ -2038,6 +2277,24 @@ can be started.
 Any devices which are components of /dev/md4 will be marked as faulty
 and then remove from the array.
 
+.B "  mdadm --create /dev/md/ddf --metadata=ddf --raid-disks 6 /dev/sd[a-f]"
+.br
+Create a DDF array over 6 devices.
+
+.B "  mdadm --create /dev/md/home -n3 -l5 -z 30000000 /dev/md/ddf"
+.br
+Create a RAID5 array over any 3 devices in the given DDF set.  Use
+only 30 gigabytes of each device.
+
+.B "  mdadm -A /dev/md/ddf1 /dev/sd[a-f]"
+.br
+Assemble a pre-existing DDF array.
+
+.B "  mdadm -I /dev/md/ddf1"
+.br
+Assemble all arrays contained in the ddf array, assigning names as
+appropriate.
+
 .B "  mdadm \-\-create \-\-help"
 .br
 Provide help about the Create mode.
@@ -2050,7 +2307,6 @@ Provide help about the format of the config file.
 .br
 Provide general help.
 
-
 .SH FILES
 
 .SS /proc/mdstat
@@ -2066,7 +2322,6 @@ uses this to find arrays when
 is given in Misc mode, and to monitor array reconstruction
 on Monitor mode.
 
-
 .SS /etc/mdadm.conf
 
 The config file lists which devices may be scanned to see if
@@ -2083,33 +2338,66 @@ If
 .B /var/run/mdadm
 does not exist as a directory, then
 .B /var/run/mdadm.map
-is used instead.
+is used instead.  If
+.B /var/run
+is not available (as may be the case during early boot),
+.B /dev/.mdadm.map
+is used on the basis that
+.B /dev
+is usually available very early in boot.
 
 .SH DEVICE NAMES
 
-While entries in the /dev directory can have any format you like,
 .I mdadm
-has an understanding of 'standard' formats which it uses to guide its
-behaviour when creating device files via the
-.B \-\-auto
-option.
+understands two sorts of names for array devices.
+
+The first is the so-called 'standard' format name, which matches the
+names used by the kernel and which appear in
+.IR /proc/mdstat .
+
+The second sort can be freely chosen, but must reside in
+.IR /dev/md/ .
+When giving a device name to
+.I mdadm
+to create or assemble an array, either a full path name such as
+.I /dev/md0
+or
+.I /dev/md/home
+can be given, or just the suffix of the second sort of name, such as
+.I home
+can be given.
+
+When
+.I mdadm
+chooses device names during auto-assembly or incremental assembly, it
+will sometimes add a small sequence number to the end of the name to
+avoid conflicts between multiple arrays that have the same name.  If
+.I mdadm
+can reasonably determine that the array really is meant for this host,
+either by a hostname in the metadata, or by the presence of the array
+in /etc/mdadm.conf, then it will leave off the suffix if possible.
+Also if the homehost is specified as
+.B <ignore>
+.I mdadm
+will only use a suffix if a different array of the same name already
+exists or is listed in the config file.
 
 The standard names for non-partitioned arrays (the only sort of md
-array available in 2.4 and earlier) are either of
+array available in 2.4 and earlier) are of the form
 .IP
 /dev/mdNN
-.br
-/dev/md/NN
 .PP
 where NN is a number.
 The standard names for partitionable arrays (as available from 2.6
-onwards) are either of
+onwards) are of the form
 .IP
-/dev/md/dNN
-.br
 /dev/md_dNN
 .PP
 Partition numbers should be indicated by added "pMM" to these, thus "/dev/md/d1p2".
+.PP
+From kernel version 2.6.28, the "non-partitioned array" can actually
+be partitioned.  So the "md_dNN" names are no longer needed, and
+partitions such as "/dev/mdNNpXX" are possible.
 
 .SH NOTE
 .I mdadm
@@ -2134,7 +2422,7 @@ RAID, see:
 .\"for new releases of the RAID driver check out:
 .\"
 .\".IP
-.\".UR  ftp://ftp.kernel.org/pub/linux/kernel/people/mingo/raid-patches
+.\".UR ftp://ftp.kernel.org/pub/linux/kernel/people/mingo/raid-patches
 .\"ftp://ftp.kernel.org/pub/linux/kernel/people/mingo/raid-patches
 .\".UE
 .\".PP
@@ -2152,6 +2440,7 @@ should always be available from
 .PP
 Related man pages:
 .PP
+.IR mdmon (8),
 .IR mdadm.conf (5),
 .IR md (4).
 .PP
diff --git a/mdadm.c b/mdadm.c
index e889b9c2714fcbf17b0409dd3bd7183e130ea2e4..bb3e5bb1821827d246ef88f542b2e873cd3c91b4 100644 (file)
--- a/mdadm.c
+++ b/mdadm.c
@@ -1,7 +1,7 @@
 /*
  * mdadm - manage Linux "md" devices aka RAID arrays.
  *
- * Copyright (C) 2001-2006 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
  *
  *
  *    This program is free software; you can redistribute it and/or modify
  *    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  *
  *    Author: Neil Brown
- *    Email: <neilb@cse.unsw.edu.au>
- *    Paper: Neil Brown
- *           School of Computer Science and Engineering
- *           The University of New South Wales
- *           Sydney, 2052
- *           Australia
+ *    Email: <neilb@suse.de>
  *
  *    Additions for bitmap and write-behind RAID options, Copyright (C) 2003-2004,
  *    Paul Clements, SteelEye Technology, Inc.
@@ -91,6 +86,7 @@ int main(int argc, char *argv[])
 
        char *homehost = NULL;
        char sys_hostname[256];
+       int require_homehost = 1;
        char *mailaddr = NULL;
        char *program = NULL;
        int delay = 0;
@@ -124,13 +120,15 @@ int main(int argc, char *argv[])
        ident.bitmap_fd = -1;
        ident.bitmap_file = NULL;
        ident.name[0] = 0;
+       ident.container = NULL;
+       ident.member = NULL;
 
        while ((option_index = -1) ,
               (opt=getopt_long(argc, argv,
                                shortopt, long_options,
                                &option_index)) != -1) {
                int newmode = mode;
-               /* firstly, some mode-independant options */
+               /* firstly, some mode-independent options */
                switch(opt) {
                case 'h':
                        if (option_index > 0 &&
@@ -164,7 +162,10 @@ int main(int argc, char *argv[])
                        continue;
 
                case HomeHost:
-                       homehost = optarg;
+                       if (strcasecmp(optarg, "<ignore>") == 0)
+                               require_homehost = 0;
+                       else
+                               homehost = optarg;
                        continue;
 
                case ':':
@@ -214,6 +215,8 @@ int main(int argc, char *argv[])
                case 'o':
                case 'w':
                case 'W':
+               case Waitclean:
+               case DetailPlatform:
                case 'K': if (!mode) newmode = MISC; break;
                }
                if (mode && newmode == mode) {
@@ -253,6 +256,7 @@ int main(int argc, char *argv[])
                                        dv->writemostly = writemostly;
                                        dv->re_add = re_add;
                                        dv->used = 0;
+                                       dv->content = NULL;
                                        dv->next = NULL;
                                        *devlistend = dv;
                                        devlistend = &dv->next;
@@ -305,6 +309,8 @@ int main(int argc, char *argv[])
                        dv->disposition = devmode;
                        dv->writemostly = writemostly;
                        dv->re_add = re_add;
+                       dv->used = 0;
+                       dv->content = NULL;
                        dv->next = NULL;
                        *devlistend = dv;
                        devlistend = &dv->next;
@@ -332,9 +338,11 @@ int main(int argc, char *argv[])
                        }
                        continue;
 
+#if 0
                case O(ASSEMBLE,AutoHomeHost):
                        auto_update_home = 1;
                        continue;
+#endif
                case O(INCREMENTAL, 'e'):
                case O(CREATE,'e'):
                case O(ASSEMBLE,'e'):
@@ -400,7 +408,10 @@ int main(int argc, char *argv[])
                                        optarg);
                                exit(2);
                        }
-                       if (level != 0 && level != -1 && level != 1 && level != -4 && level != -5 && mode == BUILD) {
+                       if (level != 0 && level != LEVEL_LINEAR && level != 1 &&
+                           level != LEVEL_MULTIPATH && level != LEVEL_FAULTY &&
+                           level != 10 &&
+                           mode == BUILD) {
                                fprintf(stderr, Name ": Raid level %s not permitted with --build.\n",
                                        optarg);
                                exit(2);
@@ -431,7 +442,6 @@ int main(int argc, char *argv[])
                                exit(2);
 
                        case 5:
-                       case 6:
                                layout = map_name(r5layout, optarg);
                                if (layout==UnSet) {
                                        fprintf(stderr, Name ": layout %s not understood for raid5.\n",
@@ -439,6 +449,14 @@ int main(int argc, char *argv[])
                                        exit(2);
                                }
                                break;
+                       case 6:
+                               layout = map_name(r6layout, optarg);
+                               if (layout==UnSet) {
+                                       fprintf(stderr, Name ": layout %s not understood for raid6.\n",
+                                               optarg);
+                                       exit(2);
+                               }
+                               break;
 
                        case 10:
                                /* 'f', 'o' or 'n' followed by a number <= raid_disks */
@@ -638,6 +656,7 @@ int main(int argc, char *argv[])
                "     'summaries', 'homehost', 'byteorder', 'devicesize'.\n");
                        exit(outf == stdout ? 0 : 2);
 
+               case O(INCREMENTAL,NoDegraded):
                case O(ASSEMBLE,NoDegraded): /* --no-degraded */
                        runstop = -1; /* --stop isn't allowed for --assemble,
                                       * so we overload slightly */
@@ -763,6 +782,8 @@ int main(int argc, char *argv[])
                case O(MISC,'o'):
                case O(MISC,'w'):
                case O(MISC,'W'):
+               case O(MISC, Waitclean):
+               case O(MISC, DetailPlatform):
                        if (devmode && devmode != opt &&
                            (devmode == 'E' || (opt == 'E' && devmode != 'Q'))) {
                                fprintf(stderr, Name ": --examine/-E cannot be given with -%c\n",
@@ -939,16 +960,36 @@ int main(int argc, char *argv[])
                        fprintf(stderr, Name ": --super-minor=dev is incompatible with --auto\n");
                        exit(2);
                }
-               if (mode == MANAGE || mode == GROW)
-                       autof=1; /* Don't create */
-               mdfd = open_mddev(devlist->devname, autof);
-               if (mdfd < 0)
+               if (mode == MANAGE || mode == GROW) {
+                       mdfd = open_mddev(devlist->devname, 1);
+                       if (mdfd < 0)
+                               exit(1);
+               } else
+                       /* non-existent device is OK */
+                       mdfd = open_mddev(devlist->devname, 0);
+               if (mdfd == -2) {
+                       fprintf(stderr, Name ": device %s exists but is not an "
+                               "md array.\n", devlist->devname);
                        exit(1);
+               }
                if ((int)ident.super_minor == -2) {
                        struct stat stb;
+                       if (mdfd < 0) {
+                               fprintf(stderr, Name ": --super-minor=dev given, and "
+                                       "listed device %s doesn't exist.\n",
+                                       devlist->devname);
+                               exit(1);
+                       }
                        fstat(mdfd, &stb);
                        ident.super_minor = minor(stb.st_rdev);
                }
+               if (mdfd >= 0 && mode != MANAGE && mode != GROW) {
+                       /* We don't really want this open yet, we just might
+                        * have wanted to check some things
+                        */
+                       close(mdfd);
+                       mdfd = -1;
+               }
        }
 
        if (raiddisks) {
@@ -973,14 +1014,16 @@ int main(int argc, char *argv[])
        }
 
        if (homehost == NULL)
-               homehost = conf_get_homehost();
-       if (homehost && strcmp(homehost, "<system>")==0) {
+               homehost = conf_get_homehost(&require_homehost);
+       if (homehost == NULL || strcmp(homehost, "<system>")==0) {
                if (gethostname(sys_hostname, sizeof(sys_hostname)) == 0) {
                        sys_hostname[sizeof(sys_hostname)-1] = 0;
                        homehost = sys_hostname;
                }
        }
 
+       ident.autof = autof;
+
        rv = 0;
        switch(mode) {
        case MANAGE:
@@ -1004,22 +1047,23 @@ int main(int argc, char *argv[])
                                fprintf(stderr, Name ": %s not identified in config file.\n",
                                        devlist->devname);
                                rv |= 1;
-                       } else {
-                               mdfd = open_mddev(devlist->devname,
-                                                 array_ident->autof ? array_ident->autof : autof);
-                               if (mdfd < 0)
-                                       rv |= 1;
-                               else {
-                                       rv |= Assemble(ss, devlist->devname, mdfd, array_ident,
-                                                      NULL, backup_file,
-                                                      readonly, runstop, update, homehost, verbose-quiet, force);
+                               if (mdfd >= 0)
                                        close(mdfd);
-                               }
+                       } else {
+                               if (array_ident->autof == 0)
+                                       array_ident->autof = autof;
+                               rv |= Assemble(ss, devlist->devname, array_ident,
+                                              NULL, backup_file,
+                                              readonly, runstop, update,
+                                              homehost, require_homehost,
+                                              verbose-quiet, force);
                        }
                } else if (!scan)
-                       rv = Assemble(ss, devlist->devname, mdfd, &ident,
+                       rv = Assemble(ss, devlist->devname, &ident,
                                      devlist->next, backup_file,
-                                     readonly, runstop, update, homehost, verbose-quiet, force);
+                                     readonly, runstop, update,
+                                     homehost, require_homehost,
+                                     verbose-quiet, force);
                else if (devs_found>0) {
                        if (update && devs_found > 1) {
                                fprintf(stderr, Name ": can only update a single array at a time\n");
@@ -1037,16 +1081,13 @@ int main(int argc, char *argv[])
                                        rv |= 1;
                                        continue;
                                }
-                               mdfd = open_mddev(dv->devname,
-                                                 array_ident->autof ?array_ident->autof : autof);
-                               if (mdfd < 0) {
-                                       rv |= 1;
-                                       continue;
-                               }
-                               rv |= Assemble(ss, dv->devname, mdfd, array_ident,
+                               if (array_ident->autof == 0)
+                                       array_ident->autof = autof;
+                               rv |= Assemble(ss, dv->devname, array_ident,
                                               NULL, backup_file,
-                                              readonly, runstop, update, homehost, verbose-quiet, force);
-                               close(mdfd);
+                                              readonly, runstop, update,
+                                              homehost, require_homehost,
+                                              verbose-quiet, force);
                        }
                } else {
                        mddev_ident_t array_list =  conf_get_ident(NULL);
@@ -1065,28 +1106,23 @@ int main(int argc, char *argv[])
                                exit(1);
                        }
                        for (; array_list; array_list = array_list->next) {
-                               mdu_array_info_t array;
-                               mdfd = open_mddev(array_list->devname,
-                                                 array_list->autof ? array_list->autof : autof);
-                               if (mdfd < 0) {
-                                       rv |= 1;
+                               if (array_list->devname &&
+                                   strcasecmp(array_list->devname, "<ignore>") == 0)
                                        continue;
-                               }
-                               if (ioctl(mdfd, GET_ARRAY_INFO, &array)>=0)
-                                       /* already assembled, skip */
-                                       cnt++;
-                               else {
-                                       rv |= Assemble(ss, array_list->devname, mdfd,
-                                                      array_list,
-                                                      NULL, NULL,
-                                                      readonly, runstop, NULL, homehost, verbose-quiet, force);
-                                       if (rv == 0) cnt++;
-                               }
-                               close(mdfd);
-                       }
-                       if (homehost) {
+                               if (array_list->autof == 0)
+                                       array_list->autof = autof;
+                               
+                               rv |= Assemble(ss, array_list->devname,
+                                              array_list,
+                                              NULL, NULL,
+                                              readonly, runstop, NULL,
+                                              homehost, require_homehost,
+                                              verbose-quiet, force);
+                               cnt++;
+                       }
+                       if (homehost && cnt == 0) {
                                /* Maybe we can auto-assemble something.
-                                * Repeatedly call Assemble in auto-assmble mode
+                                * Repeatedly call Assemble in auto-assemble mode
                                 * until it fails
                                 */
                                int rv2;
@@ -1096,10 +1132,12 @@ int main(int argc, char *argv[])
                                        mddev_dev_t devlist = conf_get_devs();
                                        acnt = 0;
                                        do {
-                                               rv2 = Assemble(ss, NULL, -1,
+                                               rv2 = Assemble(ss, NULL,
                                                               &ident,
                                                               devlist, NULL,
-                                                              readonly, runstop, NULL, homehost, verbose-quiet, force);
+                                                              readonly, runstop, NULL,
+                                                              homehost, require_homehost,
+                                                              verbose-quiet, force);
                                                if (rv2==0) {
                                                        cnt++;
                                                        acnt++;
@@ -1112,15 +1150,18 @@ int main(int argc, char *argv[])
                                        } while (rv2!=2);
                                        /* Incase there are stacked devices, we need to go around again */
                                } while (acnt);
+#if 0
                                if (cnt == 0 && auto_update_home && homehost) {
                                        /* Nothing found, maybe we need to bootstrap homehost info */
                                        do {
                                                acnt = 0;
                                                do {
-                                                       rv2 = Assemble(ss, NULL, -1,
+                                                       rv2 = Assemble(ss, NULL,
                                                                       &ident,
                                                                       NULL, NULL,
-                                                                      readonly, runstop, "homehost", homehost, verbose-quiet, force);
+                                                                      readonly, runstop, "homehost",
+                                                                      homehost, require_homehost,
+                                                                      verbose-quiet, force);
                                                        if (rv2==0) {
                                                                cnt++;
                                                                acnt++;
@@ -1129,6 +1170,7 @@ int main(int argc, char *argv[])
                                                /* Incase there are stacked devices, we need to go around again */
                                        } while (acnt);
                                }
+#endif
                                if (cnt == 0 && rv == 0) {
                                        fprintf(stderr, Name ": No arrays found in config file or automatically\n");
                                        rv = 1;
@@ -1160,10 +1202,10 @@ int main(int argc, char *argv[])
                                break;
                        }
                }
-               rv = Build(devlist->devname, mdfd, chunk, level, layout,
+               rv = Build(devlist->devname, chunk, level, layout,
                           raiddisks, devlist->next, assume_clean,
-                          bitmap_file, bitmap_chunk, write_behind, delay,
-                          verbose-quiet, size);
+                          bitmap_file, bitmap_chunk, write_behind,
+                          delay, verbose-quiet, autof, size);
                break;
        case CREATE:
                if (delay == 0) delay = DEFAULT_BITMAP_DELAY;
@@ -1178,11 +1220,11 @@ int main(int argc, char *argv[])
                        break;
                }
 
-               rv = Create(ss, devlist->devname, mdfd, chunk, level, layout, size<0 ? 0 : size,
+               rv = Create(ss, devlist->devname, chunk, level, layout, size<0 ? 0 : size,
                            raiddisks, sparedisks, ident.name, homehost,
                            ident.uuid_set ? ident.uuid : NULL,
                            devs_found-1, devlist->next, runstop, verbose-quiet, force, assume_clean,
-                           bitmap_file, bitmap_chunk, write_behind, delay);
+                           bitmap_file, bitmap_chunk, write_behind, delay, autof);
                break;
        case MISC:
                if (devmode == 'E') {
@@ -1201,22 +1243,40 @@ int main(int argc, char *argv[])
                        rv = Examine(devlist, scan?(verbose>1?0:verbose+1):brief,
                                     export, scan,
                                     SparcAdjust, ss, homehost);
+               } else if (devmode == DetailPlatform) {
+                       rv = Detail_Platform(ss ? ss->ss : NULL, ss ? scan : 1, verbose);
                } else {
                        if (devlist == NULL) {
-                               if (devmode=='D' && scan) {
-                                       /* apply --detail to all devices in /proc/mdstat */
+                               if ((devmode=='D' || devmode == Waitclean) && scan) {
+                                       /* apply --detail or --wait-clean to
+                                        * all devices in /proc/mdstat
+                                        */
                                        struct mdstat_ent *ms = mdstat_read(0, 1);
                                        struct mdstat_ent *e;
+                                       struct map_ent *map = NULL;
+                                       int v = verbose>1?0:verbose+1;
+
                                        for (e=ms ; e ; e=e->next) {
-                                               char *name = get_md_name(e->devnum);
+                                               char *name;
+                                               struct map_ent *me;
+                                               me = map_by_devnum(&map, e->devnum);
+                                               if (me && me->path
+                                                   && strcmp(me->path, "/unknown") != 0)
+                                                       name = me->path;
+                                               else
+                                                       name = get_md_name(e->devnum);
 
                                                if (!name) {
                                                        fprintf(stderr, Name ": cannot find device file for %s\n",
                                                                e->dev);
                                                        continue;
                                                }
-                                               rv |= Detail(name, verbose>1?0:verbose+1,
-                                                            export, test, homehost);
+                                               if (devmode == 'D')
+                                                       rv |= Detail(name, v,
+                                                                    export, test,
+                                                                    homehost);
+                                               else
+                                                       rv |= WaitClean(name, v);
                                                put_md_name(name);
                                        }
                                        free_mdstat(ms);
@@ -1268,13 +1328,16 @@ int main(int argc, char *argv[])
                                                     export, test, homehost);
                                        continue;
                                case 'K': /* Zero superblock */
-                                       rv |= Kill(dv->devname, force, quiet); continue;
+                                       rv |= Kill(dv->devname, force, quiet,0);
+                                       continue;
                                case 'Q':
                                        rv |= Query(dv->devname); continue;
                                case 'X':
                                        rv |= ExamineBitmap(dv->devname, brief, ss); continue;
                                case 'W':
                                        rv |= Wait(dv->devname); continue;
+                               case Waitclean:
+                                       rv |= WaitClean(dv->devname, verbose-quiet); continue;
                                }
                                mdfd = open_mddev(dv->devname, 1);
                                if (mdfd>=0) {
@@ -1305,6 +1368,13 @@ int main(int argc, char *argv[])
                        rv = 1;
                        break;
                }
+               if (delay == 0) {
+                       if (get_linux_version() > 20616)
+                               /* mdstat responds to poll */
+                               delay = 1000;
+                       else
+                               delay = 60;
+               }
                rv= Monitor(devlist, mailaddr, program,
                            delay?delay:60, daemonise, scan, oneshot,
                            dosyslog, test, pidfile);
@@ -1367,7 +1437,7 @@ int main(int argc, char *argv[])
                        break;
                }
                rv = Incremental(devlist->devname, verbose-quiet, runstop,
-                                ss, homehost, autof);
+                                ss, homehost, require_homehost, autof);
                break;
        case AUTODETECT:
                autodetect();
index 376b8388ff22014f91ab74a7a14d2873740f59bd..002e2b37f85050c5c295c854b11c71b1624c011d 100644 (file)
@@ -53,9 +53,20 @@ Also, there may be several device lines present in the file.
 
 Alternatively, a
 .B device
-line can contain the word
+line can contain either or both of the words
+.B containers
+and
 .BR partitions .
-This will cause
+The word
+.B containers
+will cause
+.I mdadm
+to look for assembled CONTAINER arrays and include them as a source
+for assembling further arrays.
+
+The word
+.I partitions
+will cause
 .I mdadm
 to read
 .I /proc/partitions
@@ -67,7 +78,7 @@ but only the major and minor device numbers.  It scans
 .I /dev
 to find the name that matches the numbers.
 
-If no DEVICE line is present, then "DEVICE partitions" is assumed.
+If no DEVICE line is present, then "DEVICE partitions containers" is assumed.
 
 For example:
 .IP
@@ -75,22 +86,35 @@ DEVICE /dev/hda* /dev/hdc*
 .br
 DEV    /dev/sd*
 .br
-DEVICE /dev/discs/disc*/disc
+DEVICE /dev/disk/by-path/pci*
 .br
 DEVICE partitions
 
 .TP
 .B ARRAY
 The ARRAY lines identify actual arrays.  The second word on the line
-should be the name of the device where the array is normally
+may be the name of the device where the array is normally
 assembled, such as
-.BR  /dev/md1 .
+.B /dev/md1
+or
+.BR /dev/md/backup .
+If the name does not start with a slash
+.RB (' / '),
+it is treated as being in
+.BR /dev/md/ .
+Alternately the word
+.B <ignore>
+(complete with angle brackets) can be given in which case any array
+which matches the rest of the line will never be automatically assembled.
+If no device name is given,
+.I mdadm
+will use various heuristics to determine an appropriate name.
+
 Subsequent words identify the array, or identify the array as a member
 of a group. If multiple identities are given,
 then a component device must match ALL identities to be considered a
 match.  Each identity word has a tag, and equals sign, and some value.
 The tags are:
-
 .RS 4
 .TP
 .B uuid=
@@ -135,6 +159,7 @@ this is mainly for compatibility with the output of
 .TP
 .B spares=
 The value is a number of spare devices to expect the array to have.
+The sole use of this keyword and value is as follows:
 .B mdadm \-\-monitor
 will report an array if it is found to have fewer than this number of
 spares when
@@ -157,10 +182,15 @@ or missing drive but no spare.
 
 .TP
 .B auto=
-This option declares to
+This option is rarely needed with mdadm-3.0, particularly if used with
+the Linux kernel v2.6.28 or later.
+It tells
 .I mdadm
-that it should try to create the device file of the array if it
-doesn't already exist, or exists but with the wrong device number.
+whether to use partitionable arrays or non-partitionable arrays and,
+in the absence of
+.IR udev ,
+how many partition devices to create.  From 2.6.28 all md array
+devices are partitionable, hence this option is not needed.
 
 The value of this option can be "yes" or "md" to indicate that a
 traditional, non-partitionable md array should be created, or "mdp",
@@ -189,6 +219,18 @@ Specify the metadata format that the array has.  This is mainly
 recognised for comparability with the output of
 .BR "mdadm \-Es" .
 
+.TP
+.B container=
+Specify that this array is a member array of some container.  The
+value given can be either a path name in /dev, or a UUID of the
+container array.
+
+.TP
+.B member=
+Specify that this array is a member array of some container.  Each
+type of container has some way to enumerate member arrays, often a
+simple sequence number.  The value identifies which member of a
+container the array is.  It will usually accompany a "container=" word.
 .RE
 
 .TP
@@ -295,18 +337,72 @@ The
 .B homehost
 line gives a default value for the
 .B --homehost=
-option to mdadm.  There should be exactly one other word on the line.
-It should either exactly
+option to mdadm.  There should normally be only one other word on the line.
+It should either be a host name, or one of the special words
 .B <system>
-or a host name.
+and
+.BR <ignore> .
 If
 .B <system>
 is given, then the
 .BR gethostname ( 2 )
 systemcall is used to get the host name.
+
+If
+.B <ignore>
+is given, then a flag is set so that when arrays are being
+auto-assembled the checking of the recorded
+.I homehost
+is disabled.
+If
+.B <ignore>
+is given it is also possible to give an explicit name which will be
+used when creating arrays.  This is the only case when there can be
+more than one other word on the
+.B HOMEHOST
+line.
+
 When arrays are created, this host name will be stored in the
-metadata.  When arrays are assembled using auto-assembly, only arrays
-with this host name stored in the metadata will be considered.
+metadata.  When arrays are assembled using auto-assembly, arrays which
+do not record the correct homehost name in their metadata will be
+assembled using a "foreign" name.  A "foreign" name always ends with a
+digit string preceded by an underscore to differentiate it
+from any possible local name. e.g.
+.B /dev/md/1_1
+or
+.BR /dev/md/home_0 .
+.TP
+.B AUTO
+A list of names of metadata formats can be given, each preceded by a
+plus or minus sign.  Also the word
+.I all
+preceded by plus or minus is allowed and is usually last.
+
+When
+.I mdadm
+is auto-assembling an array, either via
+.I --assemble
+or
+.I --incremental
+and it finds metadata of a given type, it checks that metadata type
+against those listed in this line.  The first match wins, where
+.I all
+matches anything.
+If a match is found that was preceded by a plus sign, the auto
+assembly is allowed.  If the match was preceded by a minus sign, the
+auto assembly is disallowed.  If no match is found, the auto assembly
+is allowed.
+
+This can be used to disable all auto-assembly (so that only arrays
+explicitly listed in mdadm.conf or on the command line are assembled),
+or to disable assembly of certain metadata types which might be
+handled by other software.
+
+The known metadata types are
+.BR 0.90 ,
+.BR 1.x ,
+.BR ddf ,
+.BR imsm .
 
 .SH EXAMPLE
 DEVICE /dev/sd[bcdjkl]1
@@ -354,6 +450,8 @@ PROGRAM /usr/sbin/handle\-mdadm\-events
 CREATE group=system mode=0640 auto=part\-8
 .br
 HOMEHOST <system>
+.br
+AUTO +1.x -all
 
 .SH SEE ALSO
 .BR mdadm (8),
diff --git a/mdadm.h b/mdadm.h
index 0eb8f399a3ed738e271d3a0ae2879fe84e305218..e564deeaa8f2eb97dddf676a5f6287da61b24058 100644 (file)
--- a/mdadm.h
+++ b/mdadm.h
@@ -1,7 +1,7 @@
 /*
  * mdadm - manage Linux "md" devices aka RAID arrays.
  *
- * Copyright (C) 2001-2006 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
  *
  *
  *    This program is free software; you can redistribute it and/or modify
  *    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  *
  *    Author: Neil Brown
- *    Email: <neilb@cse.unsw.edu.au>
- *    Paper: Neil Brown
- *           School of Computer Science and Engineering
- *           The University of New South Wales
- *           Sydney, 2052
- *           Australia
+ *    Email: <neilb@suse.de>
  */
 
 #define        _GNU_SOURCE
@@ -76,6 +71,7 @@ extern __off64_t lseek64 __P ((int __fd, __off64_t __offset, int __whence));
 #include       "md_u.h"
 #include       "md_p.h"
 #include       "bitmap.h"
+#include       "msg.h"
 
 #include <endian.h>
 /* Redhat don't like to #include <asm/byteorder.h>, and
@@ -106,6 +102,13 @@ extern __off64_t lseek64 __P ((int __fd, __off64_t __offset, int __whence));
 #define        __le16_to_cpu(_x) (_x)
 #define __le32_to_cpu(_x) (_x)
 #define __le64_to_cpu(_x) (_x)
+
+#define        __cpu_to_be16(_x) bswap_16(_x)
+#define __cpu_to_be32(_x) bswap_32(_x)
+#define __cpu_to_be64(_x) bswap_64(_x)
+#define        __be16_to_cpu(_x) bswap_16(_x)
+#define __be32_to_cpu(_x) bswap_32(_x)
+#define __be64_to_cpu(_x) bswap_64(_x)
 #elif BYTE_ORDER == BIG_ENDIAN
 #define        __cpu_to_le16(_x) bswap_16(_x)
 #define __cpu_to_le32(_x) bswap_32(_x)
@@ -113,6 +116,13 @@ extern __off64_t lseek64 __P ((int __fd, __off64_t __offset, int __whence));
 #define        __le16_to_cpu(_x) bswap_16(_x)
 #define __le32_to_cpu(_x) bswap_32(_x)
 #define __le64_to_cpu(_x) bswap_64(_x)
+
+#define        __cpu_to_be16(_x) (_x)
+#define __cpu_to_be32(_x) (_x)
+#define __cpu_to_be64(_x) (_x)
+#define        __be16_to_cpu(_x) (_x)
+#define __be32_to_cpu(_x) (_x)
+#define __be64_to_cpu(_x) (_x)
 #else
 #  error "unknown endianness."
 #endif
@@ -128,18 +138,41 @@ struct mdinfo {
        int                     uuid[4];
        char                    name[33];
        unsigned long long      data_offset;
-       unsigned long long      component_size;
+       unsigned long long      component_size; /* same as array.size, except in
+                                                * sectors and up to 64bits.
+                                                */
+       unsigned long long      custom_array_size; /* size for non-default sized
+                                                   * arrays (in sectors)
+                                                   */
        int                     reshape_active;
        unsigned long long      reshape_progress;
+       unsigned long long      resync_start;
+       unsigned long           safe_mode_delay; /* ms delay to mark clean */
        int                     new_level, delta_disks, new_layout, new_chunk;
        int                     errors;
        int                     cache_size; /* size of raid456 stripe cache*/
        int                     mismatch_cnt;
        char                    text_version[50];
 
+       int container_member; /* for assembling external-metadata arrays
+                              * This is to be used internally by metadata
+                              * handler only */
+
        char            sys_name[20];
        struct mdinfo *devs;
        struct mdinfo *next;
+
+       /* Device info for mdmon: */
+       int state_fd;
+       #define DS_FAULTY       1
+       #define DS_INSYNC       2
+       #define DS_WRITE_MOSTLY 4
+       #define DS_SPARE        8
+       #define DS_BLOCKED      16
+       #define DS_REMOVE       1024
+       #define DS_UNBLOCK      2048
+       int prev_state, curr_state, next_state;
+
 };
 
 struct createinfo {
@@ -189,6 +222,8 @@ enum special_options {
        AutoHomeHost,
        Symlinks,
        AutoDetect,
+       Waitclean,
+       DetailPlatform,
 };
 
 /* structures read from config file */
@@ -223,6 +258,13 @@ typedef struct mddev_ident_s {
        char    *bitmap_file;
        int     bitmap_fd;
 
+       char    *container;     /* /dev/whatever name of container, or
+                                * uuid of container.  You would expect
+                                * this to be the 'devname' or UUID
+                                * of some other entry.
+                                */
+       char    *member;        /* subarray within a container */
+
        struct mddev_ident_s *next;
 } *mddev_ident_t;
 
@@ -235,6 +277,8 @@ typedef struct mddev_dev_s {
        char writemostly;       /* 1 for 'set writemostly', 2 for 'clear writemostly' */
        char re_add;
        char used;              /* set when used */
+       struct mdinfo *content; /* If devname is a container, this might list
+                                * the remaining member arrays. */
        struct mddev_dev_s *next;
 } *mddev_dev_t;
 
@@ -252,57 +296,86 @@ struct mdstat_ent {
        char            *pattern; /* U or up, _ for down */
        int             percent; /* -1 if no resync */
        int             resync; /* 1 if resync, 0 if recovery */
+       int             devcnt;
+       int             raid_disks;
+       int             chunk_size;
+       char *          metadata_version;
        struct mdstat_ent *next;
 };
 
 extern struct mdstat_ent *mdstat_read(int hold, int start);
 extern void free_mdstat(struct mdstat_ent *ms);
 extern void mdstat_wait(int seconds);
+extern void mdstat_wait_fd(int fd, const sigset_t *sigmask);
 extern int mddev_busy(int devnum);
 
 struct map_ent {
        struct map_ent *next;
        int     devnum;
-       int     major,minor;
+       char    metadata[20];
        int     uuid[4];
+       int     bad;
        char    *path;
 };
-extern int map_update(struct map_ent **mpp, int devnum, int major, int minor,
+extern int map_update(struct map_ent **mpp, int devnum, char *metadata,
                      int uuid[4], char *path);
 extern struct map_ent *map_by_uuid(struct map_ent **map, int uuid[4]);
+extern struct map_ent *map_by_devnum(struct map_ent **map, int devnum);
+extern struct map_ent *map_by_name(struct map_ent **map, char *name);
 extern void map_read(struct map_ent **melp);
 extern int map_write(struct map_ent *mel);
 extern void map_delete(struct map_ent **mapp, int devnum);
 extern void map_free(struct map_ent *map);
 extern void map_add(struct map_ent **melp,
-                   int devnum, int major, int minor, int uuid[4], char *path);
+                   int devnum, char *metadata, int uuid[4], char *path);
+extern int map_lock(struct map_ent **melp);
+extern void map_unlock(struct map_ent **melp);
 
 /* various details can be requested */
-#define        GET_LEVEL       1
-#define        GET_LAYOUT      2
-#define        GET_COMPONENT   4
-#define        GET_CHUNK       8
-#define GET_CACHE      16
-#define        GET_MISMATCH    32
-#define        GET_VERSION     64
-
-#define        GET_DEVS        1024 /* gets role, major, minor */
-#define        GET_OFFSET      2048
-#define        GET_SIZE        4096
-#define        GET_STATE       8192
-#define        GET_ERROR       16384
+enum sysfs_read_flags {
+       GET_LEVEL       = (1 << 0),
+       GET_LAYOUT      = (1 << 1),
+       GET_COMPONENT   = (1 << 2),
+       GET_CHUNK       = (1 << 3),
+       GET_CACHE       = (1 << 4),
+       GET_MISMATCH    = (1 << 5),
+       GET_VERSION     = (1 << 6),
+       GET_DISKS       = (1 << 7),
+       GET_DEGRADED    = (1 << 8),
+       GET_SAFEMODE    = (1 << 9),
+       GET_DEVS        = (1 << 10), /* gets role, major, minor */
+       GET_OFFSET      = (1 << 11),
+       GET_SIZE        = (1 << 12),
+       GET_STATE       = (1 << 13),
+       GET_ERROR       = (1 << 14),
+       SKIP_GONE_DEVS  = (1 << 15),
+};
 
 /* If fd >= 0, get the array it is open on,
  * else use devnum. >=0 -> major9. <0.....
  */
+extern int sysfs_open(int devnum, char *devname, char *attr);
+extern void sysfs_init(struct mdinfo *mdi, int fd, int devnum);
 extern void sysfs_free(struct mdinfo *sra);
 extern struct mdinfo *sysfs_read(int fd, int devnum, unsigned long options);
+extern int sysfs_attr_match(const char *attr, const char *str);
+extern int sysfs_match_word(const char *word, char **list);
 extern int sysfs_set_str(struct mdinfo *sra, struct mdinfo *dev,
                         char *name, char *val);
 extern int sysfs_set_num(struct mdinfo *sra, struct mdinfo *dev,
                         char *name, unsigned long long val);
+extern int sysfs_uevent(struct mdinfo *sra, char *event);
 extern int sysfs_get_ll(struct mdinfo *sra, struct mdinfo *dev,
                        char *name, unsigned long long *val);
+extern int sysfs_get_str(struct mdinfo *sra, struct mdinfo *dev,
+                        char *name, char *val, int size);
+extern int sysfs_set_safemode(struct mdinfo *sra, unsigned long ms);
+extern int sysfs_set_array(struct mdinfo *info, int vers);
+extern int sysfs_add_disk(struct mdinfo *sra, struct mdinfo *sd,
+                         int in_sync);
+extern int sysfs_disk_to_scsi_id(int fd, __u32 *id);
+extern int sysfs_unique_holder(int devnum, long rdev);
+extern int load_sys(char *path, char *buf);
 
 
 extern int save_stripes(int *source, unsigned long long *offsets,
@@ -322,32 +395,132 @@ extern int restore_stripes(int *dest, unsigned long long *offsets,
 
 extern char *map_num(mapping_t *map, int num);
 extern int map_name(mapping_t *map, char *name);
-extern mapping_t r5layout[], pers[], modes[], faultylayout[];
+extern mapping_t r5layout[], r6layout[], pers[], modes[], faultylayout[];
 
 extern char *map_dev(int major, int minor, int create);
 
+struct active_array;
+struct metadata_update;
 
+/* A superswitch provides entry points to a metadata handler.
+ *
+ * The super_switch primarily operates on some "metadata" that
+ * is accessed via the 'supertype'.
+ * This metadata has one of three possible sources.
+ * 1/ It is read from a single device.  In this case it may not completely
+ *    describe the array or arrays as some information might be on other
+ *    devices.
+ * 2/ It is read from all devices in a container.  In this case all
+ *    information is present.
+ * 3/ It is created by ->init_super / ->add_to_super.  In this case it will
+ *    be complete once enough ->add_to_super calls have completed.
+ *
+ * When creating an array inside a container, the metadata will be
+ * formed by a combination of 2 and 3.  The metadata of the array is read,
+ * then new information is added.
+ *
+ * The metadata must sometimes have a concept of a 'current' array
+ * and a 'current' device.
+ * The 'current' array is set by init_super to be the newly created array,
+ * or is set by super_by_fd when it finds it is looking at an array inside
+ * a container.
+ *
+ * The 'current' device is either the device that the metadata was read from
+ * in case 1, or the last device added by add_to_super in case 3.
+ * Case 2 does not identify a 'current' device.
+ */
 extern struct superswitch {
+
+       /* Used to report details of metadata read from a component
+        * device. ->load_super has been called.
+        */
        void (*examine_super)(struct supertype *st, char *homehost);
-       void (*brief_examine_super)(struct supertype *st);
+       void (*brief_examine_super)(struct supertype *st, int verbose);
        void (*export_examine_super)(struct supertype *st);
+
+       /* Used to report details of an active array.
+        * ->load_super was possibly given a 'component' string.
+        */
        void (*detail_super)(struct supertype *st, char *homehost);
        void (*brief_detail_super)(struct supertype *st);
        void (*export_detail_super)(struct supertype *st);
+
+       /* Optional: platform hardware / firmware details */
+       int (*detail_platform)(int verbose, int enumerate_only);
+
+       /* Used:
+        *   to get uuid to store in bitmap metadata
+        *   and 'reshape' backup-data metadata
+        *   To see if a device is being re-added to an array it was part of.
+        */
        void (*uuid_from_super)(struct supertype *st, int uuid[4]);
+
+       /* Extract generic details from metadata.  This could be details about
+        * the container, or about an individual array within the container.
+        * The determination is made either by:
+        *   load_super being given a 'component' string.
+        *   validate_geometry determining what to create.
+        * The info includes both array information and device information.
+        * The particular device should be:
+        *   The last device added by add_to_super
+        *   The device the metadata was loaded from by load_super
+        */
        void (*getinfo_super)(struct supertype *st, struct mdinfo *info);
+
+       /* Check if the given metadata is flagged as belonging to "this"
+        * host.  0 for 'no', 1 for 'yes', -1 for "Don't record homehost"
+        */
        int (*match_home)(struct supertype *st, char *homehost);
+
+       /* Make one of several generic modifications to metadata
+        * prior to assembly (or other times).
+        *   sparc2.2  - first bug in early 0.90 metadata
+        *   super-minor - change name of 0.90 metadata
+        *   summaries - 'correct' any redundant data
+        *   resync - mark array as dirty to trigger a resync.
+        *   uuid - set new uuid - only 0.90 or 1.x
+        *   name - change the name of the array (where supported)
+        *   homehost - change which host this array is tied to.
+        *   devicesize - If metadata is at start of device, change recorded
+        *               device size to match actual device size
+        *   byteorder - swap bytes for 0.90 metadata
+        *
+        *   force-one  - mark that device as uptodate, not old or failed.
+        *   force-array - mark array as clean if it would not otherwise
+        *               assemble
+        *   assemble   - not sure how this is different from force-one...
+        *   linear-grow-new - add a new device to a linear array, but don't
+        *                   change the size: so superblock still matches
+        *   linear-grow-update - now change the size of the array.
+        */
        int (*update_super)(struct supertype *st, struct mdinfo *info,
                            char *update,
                            char *devname, int verbose,
                            int uuid_set, char *homehost);
+
+       /* Create new metadata for new array as described.  This could
+        * be a new container, or an array in a pre-existing container.
+        * Also used to zero metadata prior to writing it to invalidate old
+        * metadata.
+        */
        int (*init_super)(struct supertype *st, mdu_array_info_t *info,
                          unsigned long long size, char *name,
                          char *homehost, int *uuid);
-       void (*add_to_super)(struct supertype *st, mdu_disk_info_t *dinfo);
+
+       /* update the metadata to include new device, either at create or
+        * when hot-adding a spare.
+        */
+       int (*add_to_super)(struct supertype *st, mdu_disk_info_t *dinfo,
+                            int fd, char *devname);
+
+       /* Write metadata to one device when fixing problems or adding
+        * a new device.
+        */
        int (*store_super)(struct supertype *st, int fd);
-       int (*write_init_super)(struct supertype *st, mdu_disk_info_t *dinfo,
-                               char *devname);
+
+       /*  Write all metadata for this array.
+        */
+       int (*write_init_super)(struct supertype *st);
        int (*compare_super)(struct supertype *st, struct supertype *tst);
        int (*load_super)(struct supertype *st, int fd, char *devname);
        struct supertype * (*match_metadata_desc)(char *arg);
@@ -358,15 +531,123 @@ extern struct superswitch {
        void (*locate_bitmap)(struct supertype *st, int fd);
        int (*write_bitmap)(struct supertype *st, int fd);
        void (*free_super)(struct supertype *st);
-       int major;
+
+       /* validate_geometry is called with an st returned by
+        * match_metadata_desc.
+        * It should check that the geometry described is compatible with
+        * the metadata type.  It will be called repeatedly as devices are
+        * added to validate changing size and new devices.  If there are
+        * inter-device dependencies, it should record sufficient details
+        * so these can be validated.
+        * Both 'size' and '*freesize' are in sectors.  chunk is bytes.
+        */
+       int (*validate_geometry)(struct supertype *st, int level, int layout,
+                                int raiddisks,
+                                int chunk, unsigned long long size,
+                                char *subdev, unsigned long long *freesize,
+                                int verbose);
+
+       struct mdinfo *(*container_content)(struct supertype *st);
+       /* Allow a metadata handler to override mdadm's default layouts */
+       int (*default_layout)(int level); /* optional */
+
+/* for mdmon */
+       int (*open_new)(struct supertype *c, struct active_array *a,
+                       char *inst);
+
+       /* Tell the metadata handler the current state of the array.
+        * This covers whether it is known to be consistent (no pending writes)
+        * and how far along a resync is known to have progressed
+        * (in a->resync_start).
+        * resync status is really irrelevant if the array is not consistent,
+        * but some metadata (DDF!) have a place to record the distinction.
+        * If 'consistent' is '2', then the array can mark it dirty if a 
+        * resync/recovery/whatever is required, or leave it clean if not.
+        * Return value is 0 if dirty (not consistent) and 1 if clean.
+        * It is only really important if consistent is passed in as '2'.
+        */
+       int (*set_array_state)(struct active_array *a, int consistent);
+
+       /* When the state of a device might have changed, we call set_disk to
+        * tell the metadata what the current state is.
+        * Typically this happens on spare->in_sync and (spare|in_sync)->faulty
+        * transitions.
+        * set_disk might be called when the state of the particular disk has
+        * not in fact changed.
+        */
+       void (*set_disk)(struct active_array *a, int n, int state);
+       void (*sync_metadata)(struct supertype *st);
+       void (*process_update)(struct supertype *st,
+                              struct metadata_update *update);
+       void (*prepare_update)(struct supertype *st,
+                              struct metadata_update *update);
+
+       /* activate_spare will check if the array is degraded and, if it
+        * is, try to find some spare space in the container.
+        * On success, it adds appropriate updates (for process_update)
+        * to the 'updates' list and returns a list of 'mdinfo' identifying
+        * the device, or devices as there might be multiple missing
+        * devices and multiple spares available.
+        */
+       struct mdinfo *(*activate_spare)(struct active_array *a,
+                                        struct metadata_update **updates);
+
        int swapuuid; /* true if uuid is bigending rather than hostendian */
-} super0, super1, *superlist[];
+       int external;
+       const char *name; /* canonical metadata name */
+} super0, super1, super_ddf, *superlist[];
+
+extern struct superswitch super_imsm;
+
+/* One node in a singly-linked list of metadata updates; nodes are
+ * chained through 'next'.
+ * NOTE(review): 'buf'/'len' look like the update payload and its length
+ * in bytes - confirm against append_metadata_update().
+ */
+struct metadata_update {
+       int     len;
+       char    *buf;
+       void    *space; /* allocated space that monitor will use */
+       struct metadata_update *next;
+};
 
+/* A supertype holds a particular collection of metadata.
+ * It identifies the metadata type by the superswitch, and the particular
+ * sub-version of that metadata type.
+ * metadata read in or created is stored in 'sb' and 'info'.
+ * There are also fields used by mdmon to track containers.
+ *
+ * A supertype may refer to:
+ *   Just an array, possibly in a container
+ *   A container, not identifying any particular array
+ *   Info read from just one device, not yet fully describing the array/container.
+ *
+ *
+ * A supertype is created by:
+ *   super_by_fd
+ *   guess_super
+ *   dup_super
+ */
 struct supertype {
        struct superswitch *ss;
        int minor_version;
        int max_devs;
+       int container_dev;    /* devnum of container */
+       char subarray[32];      /* name of array inside container */
        void *sb;
+       void *info;
+       int loaded_container;   /* Set if load_super found a container,
+                                * not just one device */
+
+       /* List of queued metadata updates.
+        * NOTE(review): update_tail presumably points at the link where
+        * the next update is appended - confirm in the users of this list.
+        */
+       struct metadata_update *updates;
+       struct metadata_update **update_tail;
+
+       /* extra stuff used by mdmon */
+       struct active_array *arrays;
+       int sock; /* listen to external programs */
+       int devnum;
+       char *devname; /* e.g. md0.  This appears in metadata_version:
+                       *  external:/md0/12
+                       */
+       int devcnt;
+
+       struct mdinfo *devs;
+
 };
 
 extern struct supertype *super_by_fd(int fd);
@@ -375,6 +656,7 @@ extern struct supertype *dup_super(struct supertype *st);
 extern int get_dev_size(int fd, char *dname, unsigned long long *sizep);
 extern void get_one_disk(int mdfd, mdu_array_info_t *ainf,
                         mdu_disk_info_t *disk);
+void wait_for(char *dev, int fd);
 
 #if __GNUC__ < 3
 struct stat64;
@@ -433,28 +715,28 @@ extern int Grow_restart(struct supertype *st, struct mdinfo *info,
                        int *fdlist, int cnt, char *backup_file);
 
 
-extern int Assemble(struct supertype *st, char *mddev, int mdfd,
+extern int Assemble(struct supertype *st, char *mddev,
                    mddev_ident_t ident,
                    mddev_dev_t devlist, char *backup_file,
                    int readonly, int runstop,
-                   char *update, char *homehost,
+                   char *update, char *homehost, int require_homehost,
                    int verbose, int force);
 
-extern int Build(char *mddev, int mdfd, int chunk, int level, int layout,
-                int raiddisks,
-                mddev_dev_t devlist, int assume_clean,
+extern int Build(char *mddev, int chunk, int level, int layout,
+                int raiddisks, mddev_dev_t devlist, int assume_clean,
                 char *bitmap_file, int bitmap_chunk, int write_behind,
-                int delay, int verbose, unsigned long long size);
+                int delay, int verbose, int autof, unsigned long long size);
 
 
-extern int Create(struct supertype *st, char *mddev, int mdfd,
+extern int Create(struct supertype *st, char *mddev,
                  int chunk, int level, int layout, unsigned long long size, int raiddisks, int sparedisks,
                  char *name, char *homehost, int *uuid,
                  int subdevs, mddev_dev_t devlist,
                  int runstop, int verbose, int force, int assume_clean,
-                 char *bitmap_file, int bitmap_chunk, int write_behind, int delay);
+                 char *bitmap_file, int bitmap_chunk, int write_behind, int delay, int autof);
 
 extern int Detail(char *dev, int brief, int export, int test, char *homehost);
+extern int Detail_Platform(struct superswitch *ss, int scan, int verbose);
 extern int Query(char *dev);
 extern int Examine(mddev_dev_t devlist, int brief, int export, int scan,
                   int SparcAdjust, struct supertype *forcest, char *homehost);
@@ -463,11 +745,16 @@ extern int Monitor(mddev_dev_t devlist,
                   int period, int daemonise, int scan, int oneshot,
                   int dosyslog, int test, char *pidfile);
 
-extern int Kill(char *dev, int force, int quiet);
+extern int Kill(char *dev, int force, int quiet, int noexcl);
 extern int Wait(char *dev);
+extern int WaitClean(char *dev, int verbose);
 
 extern int Incremental(char *devname, int verbose, int runstop,
-                      struct supertype *st, char *homehost, int autof);
+                      struct supertype *st, char *homehost, int require_homehost,
+                      int autof);
+extern int Incremental_container(struct supertype *st, char *devname,
+                                int verbose, int runstop, int autof,
+                                int trustworthy);
 extern void RebuildMap(void);
 extern int IncrementalScan(int verbose);
 
@@ -489,32 +776,53 @@ extern int check_raid(int fd, char *name);
 
 extern int get_mdp_major(void);
 extern int dev_open(char *dev, int flags);
+extern int open_dev(int devnum);
+extern int open_dev_excl(int devnum);
 extern int is_standard(char *dev, int *nump);
+extern int same_dev(char *one, char *two);
 
 extern int parse_auto(char *str, char *msg, int config);
 extern mddev_ident_t conf_get_ident(char *dev);
 extern mddev_dev_t conf_get_devs(void);
 extern int conf_test_dev(char *devname);
+extern int conf_test_metadata(const char *version);
 extern struct createinfo *conf_get_create_info(void);
 extern void set_conffile(char *file);
 extern char *conf_get_mailaddr(void);
 extern char *conf_get_mailfrom(void);
 extern char *conf_get_program(void);
-extern char *conf_get_homehost(void);
+extern char *conf_get_homehost(int *require_homehostp);
 extern char *conf_line(FILE *file);
 extern char *conf_word(FILE *file, int allow_key);
+extern int conf_name_is_free(char *name);
+extern int devname_matches(char *name, char *match);
+extern struct mddev_ident_s *conf_match(struct mdinfo *info, struct supertype *st);
+
 extern void free_line(char *line);
 extern int match_oneof(char *devices, char *devname);
 extern void uuid_from_super(int uuid[4], mdp_super_t *super);
+extern const int uuid_match_any[4];
 extern int same_uuid(int a[4], int b[4], int swapuuid);
 extern void copy_uuid(void *a, int b[4], int swapuuid);
+extern char *fname_from_uuid(struct supertype *st,
+                            struct mdinfo *info, char *buf, char sep);
 extern unsigned long calc_csum(void *super, int bytes);
 extern int enough(int level, int raid_disks, int layout, int clean,
                   char *avail, int avail_disks);
 extern int ask(char *mesg);
 extern unsigned long long get_component_size(int fd);
 extern void remove_partitions(int fd);
-
+extern unsigned long long calc_array_size(int level, int raid_disks, int layout,
+                                  int chunksize, unsigned long long devsize);
+extern int flush_metadata_updates(struct supertype *st);
+extern void append_metadata_update(struct supertype *st, void *buf, int len);
+extern int assemble_container_content(struct supertype *st, int mdfd,
+                                     struct mdinfo *content, int runstop,
+                                     char *chosen_name, int verbose);
+
+extern int add_disk(int mdfd, struct supertype *st,
+                   struct mdinfo *sra, struct mdinfo *info);
+extern int set_array_info(int mdfd, struct supertype *st, struct mdinfo *info);
 
 extern char *human_size(long long bytes);
 extern char *human_size_brief(long long bytes);
@@ -528,10 +836,64 @@ extern char *get_md_name(int dev);
 
 extern char DefaultConfFile[];
 
-extern int open_mddev(char *dev, int autof);
-extern int open_mddev_devnum(char *devname, int devnum, char *name,
-                            char *chosen_name, int parts);
+extern int create_mddev(char *dev, char *name, int autof, int trustworthy,
+                       char *chosen);
+/* values for 'trustworthy' */
+#define        LOCAL   1
+#define        FOREIGN 2
+#define        METADATA 3
+extern int open_mddev(char *dev, int report_errors);
+extern int open_container(int fd);
+
+extern int mdmon_running(int devnum);
+extern int signal_mdmon(int devnum);
+extern int check_env(char *name);
+extern int start_mdmon(int devnum);
+
+extern char *devnum2devname(int num);
+extern int devname2devnum(char *name);
+extern int stat2devnum(struct stat *st);
+extern int fd2devnum(int fd);
+
+/* Major number for md device number 'd': MD_MAJOR when d is
+ * non-negative, otherwise whatever get_mdp_major() reports.
+ */
+static inline int dev2major(int d)
+{
+       return (d >= 0) ? MD_MAJOR : get_mdp_major();
+}
+
+/* Minor number for md device number 'd'.  A non-negative d is used
+ * as-is; a negative d is mapped to (-1-d) shifted left by
+ * MdpMinorShift bits.
+ */
+static inline int dev2minor(int d)
+{
+       return (d < 0) ? ((-1-d) << MdpMinorShift) : d;
+}
+
+/* Round 'a' up to the next multiple of 'base' (integer arithmetic). */
+static inline int ROUND_UP(int a, int base)
+{
+       int units = (a + base - 1) / base;
+
+       return units * base;
+}
+
+/* Tell whether a metadata version string names a 'subarray' (an array
+ * inside a container).  Such strings look like
+ *    /containername/componentname    for normal read-write arrays
+ *    -containername/componentname    for read-only arrays.
+ * containername is e.g. md0, md_d1
+ * componentname is dependent on the metadata. e.g. '1' 'S1' ...
+ * Only the first character is examined.
+ */
+static inline int is_subarray(char *vers)
+{
+       char lead = vers[0];
+
+       return (lead == '/' || lead == '-');
+}
 
+/* Debug trace helper.  With DEBUG defined it prints to stderr.
+ * Without DEBUG the fprintf sits behind 'if (0)', so the format and
+ * arguments are still seen by the compiler but nothing is printed, and
+ * the whole expression evaluates to 0.
+ */
+#ifdef DEBUG
+#define dprintf(fmt, arg...) \
+       fprintf(stderr, fmt, ##arg)
+#else
+#define dprintf(fmt, arg...) \
+        ({ if (0) fprintf(stderr, fmt, ##arg); 0; })
+#endif
 #include <assert.h>
 #include <stdarg.h>
 static inline int xasprintf(char **strp, const char *fmt, ...) {
@@ -548,6 +910,10 @@ static inline int xasprintf(char **strp, const char *fmt, ...) {
 #define        LEVEL_LINEAR            (-1)
 #define        LEVEL_FAULTY            (-5)
 
+/* kernel module doesn't know about these */
+#define LEVEL_CONTAINER                (-100)
+#define        LEVEL_UNSUPPORTED       (-200)
+
 
 /* faulty stuff */
 
@@ -578,8 +944,44 @@ static inline int xasprintf(char **strp, const char *fmt, ...) {
 #define makedev(M,m) (((M)<<8) | (m))
 #endif
 
-/* for raid5 */
+/* for raid4/5/6 */
 #define ALGORITHM_LEFT_ASYMMETRIC      0
 #define ALGORITHM_RIGHT_ASYMMETRIC     1
 #define ALGORITHM_LEFT_SYMMETRIC       2
 #define ALGORITHM_RIGHT_SYMMETRIC      3
+
+/* Define non-rotating (raid4) algorithms.  These allow
+ * conversion of raid4 to raid5.
+ */
+#define ALGORITHM_PARITY_0             4 /* P or P,Q are initial devices */
+#define ALGORITHM_PARITY_N             5 /* P or P,Q are final devices. */
+
+/* DDF RAID6 layouts differ from md/raid6 layouts in two ways.
+ * Firstly, the exact positioning of the parity block is slightly
+ * different between the 'LEFT_*' modes of md and the "_N_*" modes
+ * of DDF.
+ * Secondly, the order of data blocks over which the Q syndrome is computed
+ * is different.
+ * Consequently we have different layouts for DDF/raid6 than md/raid6.
+ * These layouts are from the DDFv1.2 spec.
+ * Interestingly DDFv1.2-Errata-A does not specify N_CONTINUE but
+ * leaves RLQ=3 as 'Vendor Specific'
+ */
+
+#define ALGORITHM_ROTATING_ZERO_RESTART        8 /* DDF PRL=6 RLQ=1 */
+#define ALGORITHM_ROTATING_N_RESTART   9 /* DDF PRL=6 RLQ=2 */
+#define ALGORITHM_ROTATING_N_CONTINUE  10 /*DDF PRL=6 RLQ=3 */
+
+
+/* For every RAID5 algorithm we define a RAID6 algorithm
+ * with exactly the same layout for data and parity, and
+ * with the Q block always on the last device (N-1).
+ * This allows trivial conversion from RAID5 to RAID6
+ */
+#define ALGORITHM_LEFT_ASYMMETRIC_6    16
+#define ALGORITHM_RIGHT_ASYMMETRIC_6   17
+#define ALGORITHM_LEFT_SYMMETRIC_6     18
+#define ALGORITHM_RIGHT_SYMMETRIC_6    19
+#define ALGORITHM_PARITY_0_6           20
+#define ALGORITHM_PARITY_N_6           ALGORITHM_PARITY_N
+
index f18dc30af0de1fd3d32da48c08cb92244bdbd9b1..c9618174e3bec64c89a0cfb67df9f94a89e119b0 100644 (file)
@@ -1,6 +1,6 @@
 Summary:     mdadm is used for controlling Linux md devices (aka RAID arrays)
 Name:        mdadm
-Version:     2.6.9
+Version:     3.0_rc1
 Release:     1
 Source:      http://www.kernel.org/pub/linux/utils/raid/mdadm/mdadm-%{version}.tgz
 URL:         http://neil.brown.name/blog/mdadm
index 77d877f1b22296b37af007d52f21e3280611a8fa..1baf2fe0b79099199d0592499335ef197c6e2859 100644 (file)
@@ -1,5 +1,5 @@
 .\" -*- nroff -*-
-.TH MDASSEMBLE 8 "" v2.6.9
+.TH MDASSEMBLE 8 "" v3.0-rc1
 .SH NAME
 mdassemble \- assemble MD devices
 .I aka
index b0d87b8362e7ceee98571d7ae9ceff12f71ba526..cf83795789158a28e41f441dac40dd060f067681 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * mdassemble - assemble Linux "md" devices aka RAID arrays.
  *
- * Copyright (C) 2001-2006 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
  * Copyright (C) 2003 Luca Berra <bluca@vodka.it>
  *
  *
  *    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  *
  *    Author: Neil Brown
- *    Email: <neilb@cse.unsw.edu.au>
- *    Paper: Neil Brown
- *           School of Computer Science and Engineering
- *           The University of New South Wales
- *           Sydney, 2052
- *           Australia
+ *    Email: <neilb@suse.de>
  */
 
 #include "mdadm.h"
@@ -55,7 +50,7 @@ mapping_t pers[] = {
 
 #ifndef MDASSEMBLE_AUTO
 /* from mdopen.c */
-int open_mddev(char *dev, int autof/*unused */)
+int open_mddev(char *dev, int report_errors/*unused*/)
 {
        int mdfd = open(dev, O_RDWR);
        if (mdfd < 0)
@@ -69,7 +64,21 @@ int open_mddev(char *dev, int autof/*unused */)
        }
        return mdfd;
 }
+/* mdassemble stand-in for create_mddev(): simply opens 'dev' through
+ * open_mddev() with report_errors set to 0 and hands back its result.
+ * 'name', 'autof', 'trustworthy' and 'chosen' are accepted but not used.
+ */
+int create_mddev(char *dev, char *name, int autof/*unused*/, int trustworthy,
+                char *chosen)
+{
+       int mdfd = open_mddev(dev, 0);
+
+       return mdfd;
+}
 #endif
+/* No-op stand-in for map_update(): ignores every argument and
+ * returns 0.
+ * NOTE(review): presumably here so shared code links into mdassemble
+ * without the real map-file implementation - confirm against the Makefile.
+ */
+int map_update(struct map_ent **mpp, int devnum, char *metadata,
+              int *uuid, char *path)
+{
+       return 0;
+}
+/* No-op stand-in for map_by_name(): ignores its arguments and always
+ * returns NULL, i.e. never reports a matching entry.
+ */
+struct map_ent *map_by_name(struct map_ent **mpp, char *name)
+{
+       return NULL;
+}
 
 int rv;
 int mdfd = -1;
@@ -86,19 +95,19 @@ int main(int argc, char *argv[]) {
        } else
                for (; array_list; array_list = array_list->next) {
                        mdu_array_info_t array;
-                       mdfd = open_mddev(array_list->devname, array_list->autof);
-                       if (mdfd < 0) {
-                               rv |= 1;
+                       if (strcasecmp(array_list->devname, "<ignore>") == 0)
                                continue;
-                       }
-                       if (ioctl(mdfd, GET_ARRAY_INFO, &array) < 0) {
-                               rv |= Assemble(array_list->st, array_list->devname, mdfd,
-                                          array_list, NULL, NULL,
-                                          readonly, runstop, NULL, NULL, verbose, force);
-                       } else {
+                       mdfd = open_mddev(array_list->devname, 0);
+                       if (mdfd >= 0 && ioctl(mdfd, GET_ARRAY_INFO, &array) == 0) {
                                rv |= Manage_ro(array_list->devname, mdfd, -1); /* make it readwrite */
+                               continue;
                        }
-                       close(mdfd);
+                       if (mdfd >= 0)
+                               close(mdfd);
+                       rv |= Assemble(array_list->st, array_list->devname,
+                                      array_list, NULL, NULL,
+                                      readonly, runstop, NULL, NULL, 0,
+                                      verbose, force);
                }
        return rv;
 }
diff --git a/mdmon.8 b/mdmon.8
new file mode 100644 (file)
index 0000000..c9cb5de
--- /dev/null
+++ b/mdmon.8
@@ -0,0 +1,164 @@
+.\" See file COPYING in distribution for details.
+.TH MDMON 8 "" v3.0-rc1
+.SH NAME
+mdmon \- monitor MD external metadata arrays
+
+.SH SYNOPSIS
+
+.BI mdmon " CONTAINER [NEWROOT]"
+
+.SH OVERVIEW
+The 2.6.27 kernel brings the ability to support external metadata arrays.
+External metadata implies that user space handles all updates to the metadata.
+The kernel's responsibility is to notify user space when a "metadata event"
+occurs, like disk failures and clean-to-dirty transitions.  The kernel, in
+important cases, waits for user space to take action on these notifications.
+
+.SH DESCRIPTION
+.SS Metadata updates:
+To service metadata update requests a daemon,
+.IR mdmon ,
+is introduced.
+.I Mdmon
+is tasked with polling the sysfs namespace looking for changes in
+.BR array_state , 
+.BR sync_action ,
+and per disk
+.BR state
+attributes.  When a change is detected it calls a per metadata type
+handler to make modifications to the metadata.  The following actions
+are taken:
+.RS
+.TP
+.B array_state \- inactive
+Clear the dirty bit for the volume and let the array be stopped
+.TP
+.B array_state \- write pending
+Set the dirty bit for the array and then set
+.B array_state
+to
+.BR active .
+Writes
+are blocked until userspace writes
+.BR active .
+.TP
+.B array_state \- active-idle
+The safe mode timer has expired so set array state to clean to block writes to the array
+.TP
+.B array_state \- clean
+Clear the dirty bit for the volume
+.TP
+.B array_state \- read-only
+This is the initial state that all arrays start at.
+.I mdmon
+takes one of the three actions:
+.RS
+.TP
+1/
+Transition the array to read-auto keeping the dirty bit clear if the metadata
+handler determines that the array does not need resyncing or other modification
+.TP
+2/
+Transition the array to active if the metadata handler determines a resync or
+some other manipulation is necessary
+.TP
+3/
+Leave the array read\-only if the volume is marked to not be monitored; for
+example, the metadata version has been set to "external:\-md127/0" instead of
+"external:/md127/0"
+.RE
+.TP
+.B sync_action \- resync\-to\-idle
+Notify the metadata handler that a resync may have completed.  If a resync
+process is idled before it completes this event allows the metadata handler to
+checkpoint resync.
+.TP
+.B sync_action \- recover\-to\-idle
+A spare may have completed rebuilding so tell the metadata handler about the
+state of each disk.  This is the metadata handler's opportunity to clear
+any "out-of-sync" bits and clear the volume's degraded status.  If a recovery
+process is idled before it completes this event allows the metadata handler to
+checkpoint recovery.
+.TP
+.B <disk>/state \- faulty
+A disk failure kicks off a series of events.  First, notify the metadata
+handler that a disk has failed, and then notify the kernel that it can unblock
+writes that were dependent on this disk.  After unblocking the kernel this disk
+is set to be removed+ from the member array.  Finally the disk is marked failed
+in all other member arrays in the container.
+.IP
++ Note: this behavior differs slightly from native MD arrays, where
+removal is reserved for a
+.B mdadm --remove
+event.  In the external metadata case the container holds the final
+reference on a block device and a
+.B mdadm --remove <container> <victim>
+call is still required.
+.RE
+
+.SS Containers:
+.P
+External metadata formats, like DDF, differ from the native MD metadata
+formats in that they define a set of disks and a series of sub-arrays
+within those disks.  MD metadata in comparison defines a 1:1
+relationship between a set of block devices and a raid array.  For
+example to create 2 arrays at different raid levels on a single
+set of disks, MD metadata requires the disks be partitioned and then
+each array can then be created with a subset of those partitions.  The
+supported external formats perform this disk carving internally.
+.P
+Container devices simply hold references to all member disks and allow
+tools like
+.I mdmon
+to determine which active arrays belong to which
+container.  Some array management commands like disk removal and disk
+add are now only valid at the container level.  Attempts to perform
+these actions on member arrays are blocked with error messages like:
+.IP
+"mdadm: Cannot remove disks from a \'member\' array, perform this
+operation on the parent container"
+.P
+Containers are identified in /proc/mdstat with a metadata version string
+"external:<metadata name>". Member devices are identified by
+"external:/<container device>/<member index>", or "external:-<container
+device>/<member index>" if the array is to remain readonly.
+
+.SH OPTIONS
+.TP
+CONTAINER
+The
+.B container
+device to monitor.  It can be a full path like /dev/md/container, a simple md
+device name like md127, or /proc/mdstat which tells
+.I mdmon
+to scan for containers and launch an
+.I mdmon
+instance for each one found.
+.TP
+[NEWROOT]
+In order to support an external metadata raid array as the rootfs
+.I mdmon
+needs to be started in the initramfs environment.  Once the initramfs
+environment mounts the final rootfs
+.I mdmon
+needs to be restarted in the new namespace.  When NEWROOT is specified
+.I mdmon
+will terminate any
+.I mdmon
+instances that are running in the current namespace,
+.IR chroot (2)
+to NEWROOT, and continue monitoring the container.
+.PP
+Note that
+.I mdmon
+is automatically started by
+.I mdadm
+when needed and so does not need to be considered when working with
+RAID arrays.  The only times it is run other than by
+.I  mdadm
+is when the boot scripts need to restart it after mounting the new
+root filesystem.
+
+.SH SEE ALSO
+.IR mdadm (8),
+.IR md (4).
diff --git a/mdmon.c b/mdmon.c
new file mode 100644 (file)
index 0000000..37f97af
--- /dev/null
+++ b/mdmon.c
@@ -0,0 +1,550 @@
+/*
+ * mdmon - monitor external metadata arrays
+ *
+ * Copyright (C) 2007-2009 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2007-2009 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+/*
+ * md array manager.
+ * When md arrays have user-space managed metadata, this is the program
+ * that does the managing.
+ *
+ * Given one argument: the name of the array (e.g. /dev/md0) that is
+ * the container.
+ * We fork off a helper that runs high priority and mlocked.  It responds to
+ * device failures and other events that might stop writeout, or that are
+ * trivial to deal with.
+ * The main thread then watches for new arrays being created in the container
+ * and starts monitoring them too ... along with a few other tasks.
+ *
+ * The main thread communicates with the priority thread by writing over
+ * a pipe.
+ * Separate programs can communicate with the main thread via Unix-domain
+ * socket.
+ * The two threads share address space and open file table.
+ *
+ */
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include       <unistd.h>
+#include       <stdlib.h>
+#include       <sys/types.h>
+#include       <sys/stat.h>
+#include       <sys/socket.h>
+#include       <sys/un.h>
+#include       <sys/mman.h>
+#include       <sys/syscall.h>
+#include       <sys/wait.h>
+#include       <stdio.h>
+#include       <errno.h>
+#include       <string.h>
+#include       <fcntl.h>
+#include       <signal.h>
+#include       <dirent.h>
+
+#include       <sched.h>
+
+#include       "mdadm.h"
+#include       "mdmon.h"
+
+struct active_array *discard_this;
+struct active_array *pending_discard;
+
+int mon_tid, mgr_tid;
+
+int sigterm;
+
+/* Entry point for the cloned monitor thread (see clone_monitor()).
+ * 'v' is the container's struct supertype; it is passed to do_monitor()
+ * and 0 is returned once do_monitor() comes back.
+ */
+int run_child(void *v)
+{
+       do_monitor((struct supertype *)v);
+       return 0;
+}
+
+#ifdef __ia64__
+int __clone2(int (*fn)(void *),
+           void *child_stack_base, size_t stack_size,
+           int flags, void *arg, ...
+        /* pid_t *pid, struct user_desc *tls, pid_t *ctid */ );
+#endif
+ int clone_monitor(struct supertype *container)
+{
+       /* Start run_child(container) via clone()/__clone2() on a small
+        * static stack, record the new thread id in mon_tid and our own
+        * in mgr_tid, and return mon_tid.
+        */
+       static char stack[4096];
+       const int clone_flags =
+               CLONE_FS|CLONE_FILES|CLONE_VM|CLONE_SIGHAND|CLONE_THREAD;
+
+#ifdef __ia64__
+       /* __clone2 takes the stack base and its size */
+       mon_tid = __clone2(run_child, stack, sizeof(stack),
+                          clone_flags, container);
+#else
+       /* clone takes a pointer near the top of the stack area */
+       mon_tid = clone(run_child, stack + sizeof(stack) - 64,
+                       clone_flags, container);
+#endif
+
+       mgr_tid = syscall(SYS_gettid);
+
+       return mon_tid;
+}
+
+/* Map a metadata name ("ddf" or "imsm") to its superswitch.
+ * Returns NULL for any other name.
+ */
+static struct superswitch *find_metadata_methods(char *vers)
+{
+       static const struct {
+               const char *name;
+               struct superswitch *ss;
+       } known[] = {
+               { "ddf",  &super_ddf },
+               { "imsm", &super_imsm },
+       };
+       unsigned int i;
+
+       for (i = 0; i < sizeof(known)/sizeof(known[0]); i++)
+               if (strcmp(vers, known[i].name) == 0)
+                       return known[i].ss;
+       return NULL;
+}
+
+
+/* Write our pid to /var/run/mdadm/<devname>.pid.
+ * 'o_excl' is OR'ed into the open(2) flags, so a caller can pass O_EXCL
+ * to refuse an existing file, or 0 to overwrite it.
+ * Returns 0 on success, -1 if a SIGTERM has already been seen, or a
+ * negative errno value on failure (-ENAMETOOLONG if the path does not
+ * fit in the local buffer).
+ */
+int make_pidfile(char *devname, int o_excl)
+{
+       char path[100];
+       char pid[16];
+       int fd;
+       int n;
+       int err;
+
+       if (sigterm)
+               return -1;
+
+       /* bounded: devname comes from outside and must not overrun path */
+       n = snprintf(path, sizeof(path), "/var/run/mdadm/%s.pid", devname);
+       if (n < 0 || n >= (int)sizeof(path))
+               return -ENAMETOOLONG;
+
+       fd = open(path, O_RDWR|O_CREAT|o_excl, 0600);
+       if (fd < 0)
+               return -errno;
+       snprintf(pid, sizeof(pid), "%d\n", (int)getpid());
+       n = write(fd, pid, strlen(pid));
+       /* remember the write error before close() can overwrite errno */
+       err = errno;
+       close(fd);
+       if (n < 0)
+               return -err;
+       return 0;
+}
+
+/* Return 1 if the mdstat entry describes a member array of 'container',
+ * i.e. its metadata version is "external:" followed by '/' or '-', then
+ * exactly the container name, then '/'.  Return 0 otherwise.
+ */
+int is_container_member(struct mdstat_ent *mdstat, char *container)
+{
+       char *vers = mdstat->metadata_version;
+       size_t clen;
+
+       if (vers == NULL || strncmp(vers, "external:", 9) != 0)
+               return 0;
+       if (!is_subarray(vers+9))
+               return 0;
+
+       /* the container name starts right after the '/' or '-' marker */
+       clen = strlen(container);
+       if (strncmp(vers+10, container, clen) != 0)
+               return 0;
+
+       return vers[10+clen] == '/';
+}
+
+void remove_pidfile(char *devname);
+/* Terminate the mdmon instance recorded in /var/run/mdadm/<devname>.pid,
+ * if there is one and it is not this process.
+ * The pid is only signalled when /proc/<pid>/cmdline contains "mdmon".
+ * After sending SIGTERM, WaitClean() is called on every member array of
+ * the container and the pid file and socket are then removed.
+ * All failures are silent: the function just returns.
+ */
+static void try_kill_monitor(char *devname)
+{
+       char buf[100];
+       int fd;
+       int n;
+       pid_t pid;
+       struct mdstat_ent *mdstat, *ent;
+
+       n = snprintf(buf, sizeof(buf), "/var/run/mdadm/%s.pid", devname);
+       if (n < 0 || n >= (int)sizeof(buf))
+               return;
+       fd = open(buf, O_RDONLY);
+       if (fd < 0)
+               return;
+
+       /* leave room for, and add, the terminator strtoul() relies on */
+       n = read(fd, buf, sizeof(buf)-1);
+       close(fd);
+       if (n <= 0)
+               return;
+       buf[n] = 0;
+       pid = strtoul(buf, NULL, 10);
+
+       /* first rule of survival... don't off yourself
+        * (and never signal pid 0 or a negative pid: that would hit a
+        * whole process group)
+        */
+       if (pid <= 0 || pid == getpid())
+               return;
+
+       /* kill this process if it is mdmon */
+       snprintf(buf, sizeof(buf), "/proc/%lu/cmdline", (unsigned long) pid);
+       fd = open(buf, O_RDONLY);
+       if (fd < 0)
+               return;
+
+       n = read(fd, buf, sizeof(buf)-1);
+       close(fd);      /* was leaked on the success path */
+       if (n < 0)
+               return;
+       buf[n] = 0;     /* strstr() needs a terminated string */
+
+       if (!strstr(buf, "mdmon"))
+               return;
+
+       kill(pid, SIGTERM);
+
+       /* walk with a separate cursor so the list head can be freed */
+       mdstat = mdstat_read(0, 0);
+       for (ent = mdstat; ent; ent = ent->next)
+               if (is_container_member(ent, devname)) {
+                       snprintf(buf, sizeof(buf), "/dev/%s", ent->dev);
+                       WaitClean(buf, 0);
+               }
+       free_mdstat(mdstat);
+       remove_pidfile(devname);
+}
+
+/* Remove /var/run/mdadm/<devname>.pid and /var/run/mdadm/<devname>.sock.
+ * Does nothing once sigterm is set.  A name too long for the local
+ * buffer is skipped rather than unlinking a truncated path.
+ */
+void remove_pidfile(char *devname)
+{
+       char buf[100];
+       int n;
+
+       if (sigterm)
+               return;
+
+       n = snprintf(buf, sizeof(buf), "/var/run/mdadm/%s.pid", devname);
+       if (n >= 0 && n < (int)sizeof(buf))
+               unlink(buf);
+       n = snprintf(buf, sizeof(buf), "/var/run/mdadm/%s.sock", devname);
+       if (n >= 0 && n < (int)sizeof(buf))
+               unlink(buf);
+}
+
+/* Create the non-blocking, listening Unix-domain stream socket
+ * /var/run/mdadm/<devname>.sock, removing any stale file of that name
+ * first.
+ * Returns the socket descriptor, or -1 on any failure (including when
+ * sigterm is already set or the path does not fit).
+ */
+int make_control_sock(char *devname)
+{
+       char path[100];
+       int sfd;
+       int n;
+       long fl;
+       struct sockaddr_un addr;
+
+       if (sigterm)
+               return -1;
+
+       n = snprintf(path, sizeof(path), "/var/run/mdadm/%s.sock", devname);
+       if (n < 0 || n >= (int)sizeof(path))
+               return -1;
+       unlink(path);
+       sfd = socket(PF_LOCAL, SOCK_STREAM, 0);
+       if (sfd < 0)
+               return -1;
+
+       /* sizeof(addr) is handed to bind(), so no byte may be left unset */
+       memset(&addr, 0, sizeof(addr));
+       addr.sun_family = PF_LOCAL;
+       snprintf(addr.sun_path, sizeof(addr.sun_path), "%s", path);
+       if (bind(sfd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
+           listen(sfd, 10) < 0) {
+               close(sfd);
+               return -1;
+       }
+
+       /* a listening socket left blocking could stall the caller */
+       fl = fcntl(sfd, F_GETFL, 0);
+       if (fl < 0 || fcntl(sfd, F_SETFL, fl | O_NONBLOCK) < 0) {
+               close(sfd);
+               return -1;
+       }
+       return sfd;
+}
+
+int socket_hup_requested;
+/* Signal handler: only sets socket_hup_requested to 1; 'sig' is not
+ * examined.
+ * NOTE(review): which signal this is installed for, and who consumes
+ * the flag, is not visible here.
+ */
+static void hup(int sig)
+{
+       socket_hup_requested = 1;
+}
+
+/* Signal handler (installed for SIGTERM by mdmon()): only sets 'sigterm'. */
+static void term(int sig)
+{
+       (void) sig;
+       sigterm = 1;
+}
+
+/* Signal handler with no work of its own (installed for SIGUSR1 and
+ * SIGALRM by mdmon()).
+ */
+static void wake_me(int sig)
+{
+       (void) sig;
+}
+
+/* Tell the caller whether mdmon should fork.
+ * Returns 1, except in a DEBUG build where check_env("MDADM_NO_MDMON")
+ * is true (mdmon is being started by hand for debugging): then 0.
+ */
+static int do_fork(void)
+{
+       int want_fork = 1;
+
+#ifdef DEBUG
+       if (check_env("MDADM_NO_MDMON"))
+               want_fork = 0;
+#endif
+
+       return want_fork;
+}
+
+/* Print the command-line synopsis on stderr and exit with status 2. */
+void usage(void)
+{
+       fputs("Usage: mdmon /device/name/for/container [target_dir]\n",
+             stderr);
+       exit(2);
+}
+
+int mdmon(char *devname, int devnum, int scan, char *switchroot);
+
+/* mdmon entry point.
+ *
+ * argv[1] names the container to monitor and may be:
+ *   - "/proc/mdstat": start one mdmon per external-metadata container
+ *     (not a subarray) listed by mdstat_read();
+ *   - a name starting with "md", which must round-trip through
+ *     devname2devnum()/devnum2devname() unchanged;
+ *   - anything else, which is stat()ed and mapped with stat2devnum().
+ * argv[2], if given, is passed to mdmon() as 'switchroot'.
+ *
+ * Returns the status from mdmon() (OR-ed together in the /proc/mdstat
+ * case); exits 1 for an invalid device name and 2 (via usage()) for a
+ * wrong argument count.
+ */
+int main(int argc, char *argv[])
+{
+       char *container_name = NULL;
+       char *switchroot = NULL;
+       int devnum;
+       char *devname;
+       int scan = 0;
+       int status = 0;
+
+       switch (argc) {
+       case 3:
+               switchroot = argv[2];
+               /* fall through */
+       case 2:
+               container_name = argv[1];
+               break;
+       default:
+               usage();
+       }
+
+       if (strcmp(container_name, "/proc/mdstat") == 0) {
+               struct mdstat_ent *mdstat, *e;
+
+               /* launch an mdmon instance for each container found */
+               scan = 1;
+               mdstat = mdstat_read(0, 0);
+               for (e = mdstat; e; e = e->next) {
+                       if (strncmp(e->metadata_version, "external:", 9) == 0 &&
+                           !is_subarray(&e->metadata_version[9])) {
+                               devname = devnum2devname(e->devnum);
+                               /* without a name we can neither relabel the
+                                * command line nor monitor; note the failure
+                                * and carry on with the next container
+                                */
+                               if (!devname) {
+                                       fprintf(stderr, "mdmon: failed to allocate container name string\n");
+                                       status |= 1;
+                                       continue;
+                               }
+                               /* update cmdline so this mdmon instance can be
+                                * distinguished from others in a call to ps(1)
+                                */
+                               if (strlen(devname) <= strlen(container_name)) {
+                                       memset(container_name, 0, strlen(container_name));
+                                       sprintf(container_name, "%s", devname);
+                               }
+                               status |= mdmon(devname, e->devnum, scan,
+                                               switchroot);
+                       }
+               }
+               free_mdstat(mdstat);
+
+               return status;
+       } else if (strncmp(container_name, "md", 2) == 0) {
+               devnum = devname2devnum(container_name);
+               devname = devnum2devname(devnum);
+               /* devname may be NULL here; only compare a real string */
+               if (devname && strcmp(container_name, devname) != 0)
+                       devname = NULL;
+       } else {
+               struct stat st;
+
+               devnum = NoMdDev;
+               if (stat(container_name, &st) == 0)
+                       devnum = stat2devnum(&st);
+               if (devnum == NoMdDev)
+                       devname = NULL;
+               else
+                       devname = devnum2devname(devnum);
+       }
+
+       if (!devname) {
+               fprintf(stderr, "mdmon: %s is not a valid md device name\n",
+                       container_name);
+               exit(1);
+       }
+       return mdmon(devname, devnum, scan, switchroot);
+}
+
+/* Start monitoring the container 'devname' (md device number 'devnum').
+ *
+ * Unless do_fork() says otherwise and 'scan' is 0, the process forks:
+ * the parent waits on a pipe for the child's status word (or, failing
+ * that, for the child's exit status) and returns it; the child carries
+ * on, sets up the container, pid file and control socket, detaches, and
+ * ends in do_manager() followed by exit(0).  The monitoring process
+ * therefore never returns from this function.
+ *
+ * If 'switchroot' is non-NULL, any existing monitor for the container is
+ * killed and the process chroot()s there before creating its pid file.
+ *
+ * Returns 1 if the device cannot be opened, is not an md device, or the
+ * pipe/fork fails; otherwise (in the parent) the child's status.
+ * The child exits with 3 on set-up errors, 4 if chroot() fails and 2 if
+ * the monitor process cannot be started.
+ */
+int mdmon(char *devname, int devnum, int scan, char *switchroot)
+{
+       int mdfd;
+       struct mdinfo *mdi, *di;
+       struct supertype *container;
+       sigset_t set;
+       struct sigaction act;
+       int pfd[2];
+       int status;
+       int ignore;
+
+       dprintf("starting mdmon for %s in %s\n",
+               devname, switchroot ? : "/");
+       mdfd = open_dev(devnum);
+       if (mdfd < 0) {
+               fprintf(stderr, "mdmon: %s: %s\n", devname,
+                       strerror(errno));
+               return 1;
+       }
+       if (md_get_version(mdfd) < 0) {
+               fprintf(stderr, "mdmon: %s: Not an md device\n",
+                       devname);
+               close(mdfd);
+               return 1;
+       }
+
+       /* Fork, and have the child tell us when they are ready */
+       if (do_fork() || scan) {
+               if (pipe(pfd) != 0) {
+                       fprintf(stderr, "mdmon: failed to create pipe\n");
+                       close(mdfd);
+                       return 1;
+               }
+               switch(fork()) {
+               case -1:
+                       fprintf(stderr, "mdmon: failed to fork: %s\n",
+                               strerror(errno));
+                       close(pfd[0]);
+                       close(pfd[1]);
+                       close(mdfd);
+                       return 1;
+               case 0: /* child */
+                       close(pfd[0]);
+                       break;
+               default: /* parent */
+                       close(pfd[1]);
+                       if (read(pfd[0], &status, sizeof(status)) != sizeof(status)) {
+                               wait(&status);
+                               status = WEXITSTATUS(status);
+                       }
+                       /* the parent may be called again (scan mode), so
+                        * do not let descriptors pile up
+                        */
+                       close(pfd[0]);
+                       close(mdfd);
+                       return status;
+               }
+       } else
+               pfd[0] = pfd[1] = -1;
+
+       container = malloc(sizeof(*container));
+       if (!container) {
+               fprintf(stderr, "mdmon: failed to allocate container\n");
+               exit(3);
+       }
+       container->devnum = devnum;
+       container->devname = devname;
+       container->arrays = NULL;
+       container->subarray[0] = 0;
+
+       if (!container->devname) {
+               fprintf(stderr, "mdmon: failed to allocate container name string\n");
+               exit(3);
+       }
+
+       mdi = sysfs_read(mdfd, container->devnum,
+                        GET_VERSION|GET_LEVEL|GET_DEVS|SKIP_GONE_DEVS);
+
+       if (!mdi) {
+               fprintf(stderr, "mdmon: failed to load sysfs info for %s\n",
+                       container->devname);
+               exit(3);
+       }
+       if (mdi->array.level != UnSet) {
+               fprintf(stderr, "mdmon: %s is not a container - cannot monitor\n",
+                       devname);
+               exit(3);
+       }
+       if (mdi->array.major_version != -1 ||
+           mdi->array.minor_version != -2) {
+               fprintf(stderr, "mdmon: %s does not use external metadata - cannot monitor\n",
+                       devname);
+               exit(3);
+       }
+
+       container->ss = find_metadata_methods(mdi->text_version);
+       if (container->ss == NULL) {
+               fprintf(stderr, "mdmon: %s uses unknown metadata: %s\n",
+                       devname, mdi->text_version);
+               exit(3);
+       }
+
+       /* keep a private copy of the device list; 'mdi' is freed below */
+       container->devs = NULL;
+       for (di = mdi->devs; di; di = di->next) {
+               struct mdinfo *cd = malloc(sizeof(*cd));
+
+               if (!cd) {
+                       fprintf(stderr, "mdmon: failed to allocate device info for %s\n",
+                               devname);
+                       exit(3);
+               }
+               *cd = *di;
+               cd->next = container->devs;
+               container->devs = cd;
+       }
+       sysfs_free(mdi);
+
+       /* SIGUSR is sent between parent and child.  So both block it
+        * and enable it only with pselect.
+        */
+       sigemptyset(&set);
+       sigaddset(&set, SIGUSR1);
+       sigaddset(&set, SIGHUP);
+       sigaddset(&set, SIGALRM);
+       sigaddset(&set, SIGTERM);
+       sigprocmask(SIG_BLOCK, &set, NULL);
+       /* 'act' lives on the stack: sa_mask must be initialised explicitly
+        * or sigaction() is handed an indeterminate mask
+        */
+       sigemptyset(&act.sa_mask);
+       act.sa_handler = wake_me;
+       act.sa_flags = 0;
+       sigaction(SIGUSR1, &act, NULL);
+       sigaction(SIGALRM, &act, NULL);
+       act.sa_handler = hup;
+       sigaction(SIGHUP, &act, NULL);
+       act.sa_handler = term;
+       sigaction(SIGTERM, &act, NULL);
+       act.sa_handler = SIG_IGN;
+       sigaction(SIGPIPE, &act, NULL);
+
+       if (switchroot) {
+               /* we assume that /sys /proc /dev are available in
+                * the new root (see nash:setuproot)
+                *
+                * kill any monitors in the current namespace and change
+                * to the new one
+                */
+               try_kill_monitor(container->devname);
+               if (chroot(switchroot) != 0) {
+                       fprintf(stderr, "mdmon: failed to chroot to '%s': %s\n",
+                               switchroot, strerror(errno));
+                       exit(4);
+               }
+       }
+
+       /* If this fails, we hope it already exists
+        * pid file lives in /var/run/mdadm/mdXX.pid
+        *
+        * NOTE(review): mode 0600 gives these directories no search
+        * (execute) permission - confirm that this is intended.
+        */
+       mkdir("/var", 0600);
+       mkdir("/var/run", 0600);
+       mkdir("/var/run/mdadm", 0600);
+       ignore = chdir("/");
+       if (make_pidfile(container->devname, O_EXCL) < 0) {
+               if (ping_monitor(container->devname) == 0) {
+                       fprintf(stderr, "mdmon: %s already managed\n",
+                               container->devname);
+                       exit(3);
+               } else {
+                       int err;
+
+                       /* cleanup the old monitor, this one is taking over */
+                       try_kill_monitor(container->devname);
+                       err = make_pidfile(container->devname, 0);
+                       if (err < 0) {
+                               fprintf(stderr, "mdmon: %s Cannot create pidfile\n",
+                                       container->devname);
+                               if (err == -EROFS) {
+                                       /* FIXME implement a mechanism to
+                                        * prevent duplicate monitor instances
+                                        */
+                                       fprintf(stderr,
+                                               "mdmon: continuing on read-only file system\n");
+                               } else
+                                       exit(3);
+                       }
+               }
+       }
+       container->sock = make_control_sock(container->devname);
+
+       if (container->ss->load_super(container, mdfd, devname)) {
+               fprintf(stderr, "mdmon: Cannot load metadata for %s\n",
+                       devname);
+               exit(3);
+       }
+       close(mdfd);
+
+       /* Ok, this is close enough.  We can say goodbye to our parent now.
+        * When we did not fork there is no pipe (pfd[1] is -1) and nobody
+        * to notify.
+        */
+       status = 0;
+       if (pfd[1] >= 0) {
+               if (write(pfd[1], &status, sizeof(status)) < 0)
+                       fprintf(stderr, "mdmon: failed to notify our parent: %d\n",
+                               getppid());
+               close(pfd[1]);
+       }
+
+       /* detach: new session, stdin/stdout (and stderr unless DEBUG)
+        * redirected to /dev/null
+        */
+       setsid();
+       close(0);
+       open("/dev/null", O_RDWR);
+       close(1);
+       ignore = dup(0);
+#ifndef DEBUG
+       close(2);
+       ignore = dup(0);
+#endif
+
+       mlockall(MCL_FUTURE);
+
+       if (clone_monitor(container) < 0) {
+               fprintf(stderr, "mdmon: failed to start monitor process: %s\n",
+                       strerror(errno));
+               exit(2);
+       }
+
+       do_manager(container);
+
+       exit(0);
+}
diff --git a/mdmon.h b/mdmon.h
new file mode 100644 (file)
index 0000000..7cfee35
--- /dev/null
+++ b/mdmon.h
@@ -0,0 +1,94 @@
+/*
+ * mdmon - monitor external metadata arrays
+ *
+ * Copyright (C) 2007-2009 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2007-2009 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+/* Array states; used for the prev/curr/next_state fields of
+ * struct active_array.  Values are assigned in declaration order
+ * starting at 0.
+ */
+enum array_state {
+       clear,
+       inactive,
+       suspended,
+       readonly,
+       read_auto,
+       clean,
+       active,
+       write_pending,
+       active_idle,
+       bad_word
+};
+
+/* Sync actions; used for the prev/curr/next_action fields of
+ * struct active_array.  Values are assigned in declaration order
+ * starting at 0.
+ */
+enum sync_action {
+       idle,
+       reshape,
+       resync,
+       recover,
+       check,
+       repair,
+       bad_action
+};
+
+
+/* Per-array record kept by mdmon; records are chained through 'next'. */
+struct active_array {
+       struct mdinfo info;
+       struct supertype *container;
+       struct active_array *next;
+       struct active_array *replaces;
+
+       /* file descriptors held for this array */
+       int action_fd;
+       int resync_start_fd;
+       int metadata_fd; /* for monitoring rw/ro status */
+
+       enum array_state prev_state;
+       enum array_state curr_state;
+       enum array_state next_state;
+       enum sync_action prev_action;
+       enum sync_action curr_action;
+       enum sync_action next_action;
+
+       int check_degraded; /* flag set by mon, read by manage */
+
+       int devnum;
+
+       unsigned long long resync_start;
+};
+
+/*
+ * Metadata updates are handled by the monitor thread,
+ * as it has exclusive access to the metadata.
+ * When the manager wants to update metadata, either
+ * for its own reason (e.g. committing a spare) or
+ * on behalf of mdadm, it creates a metadata_update
+ * structure and queues it to the monitor.
+ * Updates are created and processed by code under the
+ * superswitch.  All common code sees them as opaque
+ * blobs.
+ */
+extern struct metadata_update *update_queue, *update_queue_handled;
+
+#define MD_MAJOR 9
+
+extern struct active_array *container;
+extern struct active_array *discard_this;
+extern struct active_array *pending_discard;
+extern struct md_generic_cmd *active_cmd;
+
+
+void remove_pidfile(char *devname);
+void do_monitor(struct supertype *container);
+void do_manager(struct supertype *container);
+int make_control_sock(char *devname);
+int make_pidfile(char *devname, int o_excl);
+extern int socket_hup_requested;
+extern int sigterm;
+
+int read_dev_state(int fd);
+int get_resync_start(struct active_array *a);
+int is_container_member(struct mdstat_ent *mdstat, char *container);
+
+struct mdstat_ent *mdstat_read(int hold, int start);
+
+extern int exit_now, manager_ready;
+extern int mon_tid, mgr_tid;
+extern int monitor_loop_cnt;
+
+/* Helper to determine resync completion: returns 1 once
+ * a->resync_start has reached a->info.component_size, else 0.
+ * component_size is used because MaxSector is a moving target.
+ */
+static inline int is_resync_complete(struct active_array *a)
+{
+       return a->resync_start >= a->info.component_size;
+}
+
index 9250e4bacf3062b308157c83480de7561603ed1c..d322cf42ad515327800290b2c6c1787d243212c2 100644 (file)
--- a/mdopen.c
+++ b/mdopen.c
@@ -1,7 +1,7 @@
 /*
  * mdadm - manage Linux "md" devices aka RAID arrays.
  *
- * Copyright (C) 2001-2006 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
  *
  *
  *    This program is free software; you can redistribute it and/or modify
  *    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  *
  *    Author: Neil Brown
- *    Email: <neilb@cse.unsw.edu.au>
- *    Paper: Neil Brown
- *           School of Computer Science and Engineering
- *           The University of New South Wales
- *           Sydney, 2052
- *           Australia
+ *    Email: <neilb@suse.de>
  */
 
 #include "mdadm.h"
 #include <ctype.h>
 
 
-void make_dev_symlink(char *dev)
-{
-       char *new = strdup(dev);
-
-       if (!new) return;
-       /* /dev/md/0 -> /dev/md0
-        * /dev/md/d0 -> /dev/md_d0
-        */
-       if (isdigit(new[8]))
-               strcpy(new+7, new+8);
-       else
-               new[7] = '_';
-       if (symlink(dev+5, new))
-               perror(new);
-}
-
-
-void make_parts(char *dev, int cnt, int symlinks)
+void make_parts(char *dev, int cnt)
 {
        /* make 'cnt' partition devices for 'dev'
-        * We use the major/minor from dev and add 1..cnt
+        * If dev is a device name we use the
+        *  major/minor from dev and add 1..cnt
+        * If it is a symlink, we make similar symlinks.
         * If dev ends with a digit, we add "p%d" else "%d"
         * If the name exists, we use it's owner/mode,
         * else that of dev
         */
        struct stat stb;
-       int major_num, minor_num;
+       int major_num = major_num; /* quiet gcc -Os unitialized warning */
+       int minor_num = minor_num; /* quiet gcc -Os unitialized warning */
+       int odig = odig; /* quiet gcc -Os unitialized warning */
        int i;
        int nlen = strlen(dev) + 20;
        char *name = malloc(nlen);
        int dig = isdigit(dev[strlen(dev)-1]);
+       char orig[1024];
+       char sym[1024];
+       int err;
 
        if (cnt==0) cnt=4;
-       if (stat(dev, &stb)!= 0)
+       if (lstat(dev, &stb)!= 0)
                return;
-       if (!S_ISBLK(stb.st_mode))
-               return;
-       major_num = major(stb.st_rdev);
-       minor_num = minor(stb.st_rdev);
+       if (S_ISLNK(stb.st_mode)) {
+               int len = readlink(dev, orig, sizeof(orig));
+               if (len < 0 || len > 1000)
+                       return;
+               orig[len] = 0;
+               odig = isdigit(orig[len-1]);
+       } else if (S_ISBLK(stb.st_mode)) {
+               major_num = major(stb.st_rdev);
+               minor_num = minor(stb.st_rdev);
+       } else
+                  return;
        for (i=1; i <= cnt ; i++) {
                struct stat stb2;
                snprintf(name, nlen, "%s%s%d", dev, dig?"p":"", i);
@@ -83,39 +75,75 @@ void make_parts(char *dev, int cnt, int symlinks)
                } else {
                        stb2 = stb;
                }
-               if (mknod(name, S_IFBLK | 0600, makedev(major_num, minor_num+i)))
-                       perror("mknod");
-               if (chown(name, stb2.st_uid, stb2.st_gid))
-                       perror("chown");
-               if (chmod(name, stb2.st_mode & 07777))
-                       perror("chmod");
-               if (symlinks && strncmp(name, "/dev/md/", 8) == 0)
-                       make_dev_symlink(name);
-               stat(name, &stb2);
-               add_dev(name, &stb2, 0, NULL);
+               if (S_ISBLK(stb.st_mode)) {
+                       if (mknod(name, S_IFBLK | 0600,
+                                 makedev(major_num, minor_num+i)))
+                               perror("mknod");
+                       if (chown(name, stb2.st_uid, stb2.st_gid))
+                               perror("chown");
+                       if (chmod(name, stb2.st_mode & 07777))
+                               perror("chmod");
+                       err = 0;
+               } else {
+                       snprintf(sym, sizeof(sym), "%s%s%d", orig, odig?"p":"", i);
+                       err = symlink(sym, name);
+               }
+
+               if (err == 0 && stat(name, &stb2) == 0)
+                       add_dev(name, &stb2, 0, NULL);
        }
 }
 
 
 /*
- * Open a given md device, and check that it really is one.
- * If 'autof' is given, then we need to create, or recreate, the md device.
- * If the name already exists, and is not a block device, we fail.
- * If it exists and is not an md device, is not the right type (partitioned or not),
- * or is currently in-use, we remove the device, but remember the owner and mode.
- * If it now doesn't exist, we find a new md array and create the device.
- * Default ownership/mode comes from config file.
+ * We need a new md device to assemble/build/create an array.
+ * 'dev' is a name given us by the user (command line or mdadm.conf)
+ * It might start with /dev or /dev/md and might end with a digit
+ * string.
+ * If it starts with just /dev, it must be /dev/mdX or /dev/md_dX
+ * If it ends with a digit string, then it must be as above, or
+ * 'trustworthy' must be 'METADATA' and the 'dev' must be
+ *  /dev/md/'name'NN or 'name'NN
+ * If it doesn't end with a digit string, it must be /dev/md/'name'
+ * or 'name' or must be NULL.
+ * If the digit string is present, it gives the minor number to use
+ * If not, we choose a high, unused minor number.
+ * If the 'dev' is a standard name, it decides whether 'md' or 'mdp'.
+ * else if the name is 'd[0-9]+' then we use mdp
+ * else if trustworthy is 'METADATA' we use md
+ * else the choice depends on 'autof'.
+ * If name is NULL it is assumed to match whatever dev provides.
+ * If both name and dev are NULL, we choose a name 'mdXX' or 'mdpXX'
+ *
+ * If 'name' is given, and 'trustworthy' is 'foreign' and name is not
+ * supported by 'dev', we add a "_%d" suffix based on the minor number
+ * use that.
+ *
+ * If udev is configured, we create a temporary device, open it, and 
+ * unlink it.
+ * If not, we create the /dev/mdXX device, and if name is usable,
+ * /dev/md/name
+ * In any case we return /dev/md/name or (if that isn't available)
+ * /dev/mdXX in 'chosen'.
+ *
+ * When we create devices, we use uid/gid/umask from config file.
  */
-int open_mddev(char *dev, int autof)
+
+int create_mddev(char *dev, char *name, int autof, int trustworthy,
+                char *chosen)
 {
        int mdfd;
        struct stat stb;
-       int major_num = MD_MAJOR;
-       int minor_num = 0;
-       int must_remove = 0;
-       int num;
+       int num = -1;
+       int use_mdp = -1;
        struct createinfo *ci = conf_get_create_info();
        int parts;
+       char *cname;
+       char devname[20];
+       char cbuf[400];
+       if (chosen == NULL)
+               chosen = cbuf;
+
 
        if (autof == 0)
                autof = ci->autof;
@@ -123,226 +151,255 @@ int open_mddev(char *dev, int autof)
        parts = autof >> 3;
        autof &= 7;
 
-       if (autof && autof != 1) {
-               /* autof is set, so we need to check that the name is ok,
-                * and possibly create one if not
-                */
-               int std;
-               stb.st_mode = 0;
-               if (stat(dev, &stb)==0 && ! S_ISBLK(stb.st_mode)) {
-                       fprintf(stderr, Name ": %s is not a block device.\n",
-                               dev);
-                       return -1;
-               }
-               /* check major number is correct */
-               num = -1;
-               std = is_standard(dev, &num);
-               if (std>0) major_num = get_mdp_major();
-               switch(autof) {
-               case 2: /* only create is_standard names */
-                       if (!std && !stb.st_mode) {
-                               fprintf(stderr, Name
-                       ": %s does not exist and is not a 'standard' name "
-                       "so it cannot be created\n", dev);
-                               return -1;
-                       }
-                       break;
-               case 3: /* create md, reject std>0 */
-                       if (std > 0) {
-                               fprintf(stderr, Name ": that --auto option "
-                               "not compatable with device named %s\n", dev);
-                               return -1;
-                       }
-                       break;
-               case 4: /* create mdp, reject std<0 */
-                       if (std < 0) {
-                               fprintf(stderr, Name ": that --auto option "
-                               "not compatable with device named %s\n", dev);
+       strcpy(chosen, "/dev/md/");
+       cname = chosen + strlen(chosen);
+
+
+       if (dev) {
+               
+               if (strncmp(dev, "/dev/md/", 8) == 0) {
+                       strcpy(cname, dev+8);
+               } else if (strncmp(dev, "/dev/", 5) == 0) {
+                       char *e = dev + strlen(dev);
+                       while (e > dev && isdigit(e[-1]))
+                               e--;
+                       if (e[0])
+                               num = strtoul(e, NULL, 10);
+                       strcpy(cname, dev+5);
+                       cname[e-(dev+5)] = 0;
+                       /* name *must* be mdXX or md_dXX in this context */
+                       if (num < 0 ||
+                           (strcmp(cname, "md") != 0 && strcmp(cname, "md_d") != 0)) {
+                               fprintf(stderr, Name ": %s is an invalid name "
+                                       "for an md device.  Try /dev/md/%s\n",
+                                       dev, dev+5);
                                return -1;
                        }
-                       major_num = get_mdp_major();
-                       break;
-               case 5: /* default to md if not standard */
-                       break;
-               case 6: /* default to mdp if not standard */
-                       if (std == 0) major_num = get_mdp_major();
-                       break;
+                       if (strcmp(cname, "md") == 0)
+                               use_mdp = 0;
+                       else
+                               use_mdp = 1;
+                       /* recreate name: /dev/md/0 or /dev/md/d0 */
+                       sprintf(cname, "%s%d", use_mdp?"d":"", num);
+               } else
+                       strcpy(cname, dev);
+
+               /* 'cname' must not contain a slash, and may not be
+                * empty.
+                */
+               if (strchr(cname, '/') != NULL) {
+                       fprintf(stderr, Name ": %s is an invalid name "
+                               "for an md device.\n", dev);
+                       return -1;
                }
-               /* major is final. num is -1 if not standard */
-               if (stb.st_mode && major(stb.st_rdev) != major_num)
-                       must_remove = 1;
-               if (stb.st_mode && !must_remove) {
-                       /* looks ok, see if it is available */
-                       mdfd = open(dev, O_RDWR);
-                       if (mdfd < 0) {
-                               fprintf(stderr, Name ": error opening %s: %s\n",
-                                       dev, strerror(errno));
-                               return -1;
-                       } else if (md_get_version(mdfd) <= 0) {
-                               fprintf(stderr, Name ": %s does not appear to be an md device\n",
-                                       dev);
-                               close(mdfd);
-                               return -1;
-                       }
-                       if (major_num != MD_MAJOR && parts > 0)
-                               make_parts(dev, parts, ci->symlinks);
-                       return mdfd;
+               if (cname[0] == 0) {
+                       fprintf(stderr, Name ": %s is an invalid name "
+                               "for an md device (empty!).", dev);
+                       return -1;
                }
-               /* Ok, need to find a minor that is not in use.
-                * If the device name is in a 'standard' format,
-                * intuit the minor from that, else
-                * easiest to read /proc/mdstat, and hunt through for
-                * an unused number
-                */
                if (num < 0) {
-                       /* need to pick an unused number */
-                       int num = find_free_devnum(major_num != MD_MAJOR);
-
-                       if (major_num == MD_MAJOR)
-                               minor_num = num;
+                       /* If cname  is 'N' or 'dN', we get dev number
+                        * from there.
+                        */
+                       char *sp = cname;
+                       char *ep;
+                       if (cname[0] == 'd')
+                               sp++;
+                       num = strtoul(sp, &ep, 10);
+                       if (ep == sp || *ep || num < 0)
+                               num = -1;
+                       else if (cname[0] == 'd')
+                               use_mdp = 1;
                        else
-                               minor_num = (-1-num) << MdpMinorShift;
-               } else if (major_num == MD_MAJOR)
-                       minor_num = num;
-               else
-                       minor_num = num << MdpMinorShift;
-               /* major and minor have been chosen */
-
-               /* If it was a 'standard' name and it is in-use, then
-                * the device could already be correct
-                */
-               if (stb.st_mode && major(stb.st_rdev) == major_num &&
-                   minor(stb.st_rdev) == minor_num)
-                       ;
-               else {
-                       if (major(makedev(major_num,minor_num)) != major_num ||
-                           minor(makedev(major_num,minor_num)) != minor_num) {
-                               fprintf(stderr, Name ": Need newer C library to use more than 4 partitionable md devices, sorry\n");
-                               return -1;
-                       }
-                       if (must_remove)
-                               unlink(dev);
-
-                       if (strncmp(dev, "/dev/md/", 8) == 0) {
-                               if (mkdir("/dev/md",0700)==0) {
-                                       if (chown("/dev/md", ci->uid, ci->gid))
-                                               perror("chown /dev/md");
-                                       if (chmod("/dev/md", ci->mode| ((ci->mode>>2) & 0111)))
-                                               perror("chmod /dev/md");
-                               }
-                       }
-                       if (mknod(dev, S_IFBLK|0600, makedev(major_num, minor_num))!= 0) {
-                               fprintf(stderr, Name ": failed to create %s\n", dev);
-                               return -1;
-                       }
-                       if (must_remove) {
-                               if (chown(dev, stb.st_uid, stb.st_gid))
-                                       perror("chown");
-                               if (chmod(dev, stb.st_mode & 07777))
-                                       perror("chmod");
-                       } else {
-                               if (chown(dev, ci->uid, ci->gid))
-                                       perror("chown");
-                               if (chmod(dev, ci->mode))
-                                       perror("chmod");
-                       }
-                       stat(dev, &stb);
-                       add_dev(dev, &stb, 0, NULL);
-                       if (ci->symlinks && strncmp(dev, "/dev/md/", 8) == 0)
-                               make_dev_symlink(dev);
-                       if (major_num != MD_MAJOR)
-                               make_parts(dev,parts, ci->symlinks);
+                               use_mdp = 0;
                }
        }
-       mdfd = open(dev, O_RDWR);
-       if (mdfd < 0)
-               fprintf(stderr, Name ": error opening %s: %s\n",
-                       dev, strerror(errno));
-       else if (md_get_version(mdfd) <= 0) {
-               fprintf(stderr, Name ": %s does not appear to be an md device\n",
-                       dev);
-               close(mdfd);
-               mdfd = -1;
-       }
-       return mdfd;
-}
 
-
-int open_mddev_devnum(char *devname, int devnum, char *name,
-                     char *chosen_name, int parts)
-{
-       /* Open the md device with number 'devnum', possibly using 'devname',
-        * possibly constructing a name with 'name', but in any case, copying
-        * the name into 'chosen_name'
-        */
-       int major_num, minor_num;
-       struct stat stb;
-       int i;
-       struct createinfo *ci = conf_get_create_info();
-
-       if (devname)
-               strcpy(chosen_name, devname);
-       else if (name && *name && strchr(name,'/') == NULL) {
-               char *n = strchr(name, ':');
-               if (n) n++; else n = name;
-               if (isdigit(*n) && devnum < 0)
-                       sprintf(chosen_name, "/dev/md/d%s", n);
-               else
-                       sprintf(chosen_name, "/dev/md/%s", n);
-       } else {
-               if (devnum >= 0)
-                       sprintf(chosen_name, "/dev/md%d", devnum);
+       /* Now determine device number */
+       /* named 'METADATA' cannot use 'mdp'. */
+       if (name && name[0] == 0)
+               name = NULL;
+       if (name && trustworthy == METADATA && use_mdp == 1) {
+               fprintf(stderr, Name ": %s is not allowed for a %s container. "
+                       "Consider /dev/md%d.\n", dev, name, num);
+               return -1;
+       }
+       if (name && trustworthy == METADATA)
+               use_mdp = 0;
+       if (use_mdp == -1) {
+               if (autof == 4 || autof == 6)
+                       use_mdp = 1;
                else
-                       sprintf(chosen_name, "/dev/md/d%d", -1-devnum);
+                       use_mdp = 0;
        }
-       if (devnum >= 0) {
-               major_num = MD_MAJOR;
-               minor_num = devnum;
-       } else {
-               major_num = get_mdp_major();
-               minor_num = (-1-devnum) << 6;
+       if (num < 0 && trustworthy == LOCAL && name) {
+               /* if name is numeric, possibly prefixed by 
+                * 'md' or '/dev/md', use that for num
+                * if it is not already in use */
+               char *ep;
+               char *n2 = name;
+               if (strncmp(n2, "/dev/", 5) == 0)
+                       n2 += 5;
+               if (strncmp(n2, "md", 2) == 0)
+                       n2 += 2;
+               if (*n2 == '/')
+                       n2++;
+               num = strtoul(n2, &ep, 10);
+               if (ep == n2 || *ep)
+                       num = -1;
+               else if (mddev_busy(use_mdp ? (-1-num) : num))
+                       num = -1;
        }
-       if (stat(chosen_name, &stb) == 0) {
-               /* It already exists.  Check it is right. */
-               if ( ! S_ISBLK(stb.st_mode) ||
-                    stb.st_rdev != makedev(major_num, minor_num)) {
-                       errno = EEXIST;
+
+       if (num < 0) {
+               /* need to choose a free number. */
+               num = find_free_devnum(use_mdp);
+               if (num == NoMdDev) {
+                       fprintf(stderr, Name ": No avail md devices - aborting\n");
                        return -1;
                }
        } else {
-               /* special case: if --incremental is suggesting a name
-                * in /dev/md/, we make sure the directory exists.
+               num = use_mdp ? (-1-num) : num;
+               if (mddev_busy(num)) {
+                       fprintf(stderr, Name ": %s is already in use.\n",
+                               dev);
+                       return -1;
+               }
+       }
+
+       if (num < 0)
+               sprintf(devname, "/dev/md_d%d", -1-num);
+       else
+               sprintf(devname, "/dev/md%d", num);
+
+       if (cname[0] == 0 && name) {
+               /* Need to find a name if we can
+                * We don't completely trust 'name'.  Truncate to
+                * reasonable length and remove '/'
                 */
-               if (strncmp(chosen_name, "/dev/md/", 8) == 0) {
+               char *cp;
+               struct map_ent *map = NULL;
+               int conflict = 1;
+               int unum = 0;
+               int cnlen;
+               strncpy(cname, name, 200);
+               cname[200] = 0;
+               while ((cp = strchr(cname, '/')) != NULL)
+                       *cp = '-';
+               if (trustworthy == LOCAL ||
+                   (trustworthy == FOREIGN && strchr(cname, ':') != NULL)) {
+                       /* Only need suffix if there is a conflict */
+                       if (map_by_name(&map, cname) == NULL)
+                               conflict = 0;
+               }
+               cnlen = strlen(cname);
+               while (conflict) {
+                       if (trustworthy == METADATA && !isdigit(cname[cnlen-1]))
+                               sprintf(cname+cnlen, "%d", unum);
+                       else
+                               /* add _%d to FOREIGN arrays that don't have
+                                * a 'host:' prefix
+                                */
+                               sprintf(cname+cnlen, "_%d", unum);
+                       unum++;
+                       if (map_by_name(&map, cname) == NULL)
+                               conflict = 0;
+               }
+       }
+
+       if (dev)
+               strcpy(chosen, dev);
+       else if (cname[0] == 0)
+               strcpy(chosen, devname);
+
+       /* We have a device number and name.
+        * If we cannot detect udev, we need to make
+        * devices and links ourselves.
+        */
+       if (stat("/dev/.udev", &stb) != 0 ||
+           check_env("MDADM_NO_UDEV")) {
+               /* Make sure 'devname' exists and 'chosen' is a symlink to it */
+               if (lstat(devname, &stb) == 0) {
+                       /* Must be the correct device, else error */
+                       if ((stb.st_mode&S_IFMT) != S_IFBLK ||
+                           stb.st_rdev != makedev(dev2major(num),dev2minor(num))) {
+                               fprintf(stderr, Name ": %s exists but looks wrong, please fix\n",
+                                       devname);
+                               return -1;
+                       }
+               } else {
+                       if (mknod(devname, S_IFBLK|0600,
+                                 makedev(dev2major(num),dev2minor(num))) != 0) {
+                               fprintf(stderr, Name ": failed to create %s\n",
+                                       devname);
+                               return -1;
+                       }
+                       if (chown(devname, ci->uid, ci->gid))
+                               perror("chown");
+                       if (chmod(devname, ci->mode))
+                               perror("chmod");
+                       stat(devname, &stb);
+                       add_dev(devname, &stb, 0, NULL);
+               }
+               if (use_mdp == 1)
+                       make_parts(devname, parts);
+               if (strcmp(chosen, devname) != 0) {
+
                        if (mkdir("/dev/md",0700)==0) {
                                if (chown("/dev/md", ci->uid, ci->gid))
                                        perror("chown /dev/md");
-                               if (chmod("/dev/md", ci->mode|
-                                                 ((ci->mode>>2) & 0111)))
+                               if (chmod("/dev/md", ci->mode| ((ci->mode>>2) & 0111)))
                                        perror("chmod /dev/md");
                        }
-               }
 
-               if (mknod(chosen_name, S_IFBLK | 0600,
-                         makedev(major_num, minor_num)) != 0) {
-                       return -1;
+                       if (dev && strcmp(chosen, dev) == 0)
+                               /* We know we are allowed to use this name */
+                               unlink(chosen);
+
+                       if (lstat(chosen, &stb) == 0) {
+                               char buf[300];
+                               if ((stb.st_mode & S_IFMT) != S_IFLNK ||
+                                   readlink(chosen, buf, 300) <0 ||
+                                   strcmp(buf, devname) != 0) {
+                                       fprintf(stderr, Name ": %s exists - ignoring\n",
+                                               chosen);
+                                       strcpy(chosen, devname);
+                               }
+                       } else if (symlink(devname, chosen) != 0)
+                               fprintf(stderr, Name ": failed to create %s: %s\n",
+                                       chosen, strerror(errno));
+                       if (use_mdp && strcmp(chosen, devname) != 0)
+                               make_parts(chosen, parts);
                }
-               /* FIXME chown/chmod ?? */
        }
+       mdfd = open_dev_excl(num);
+       if (mdfd < 0)
+               fprintf(stderr, Name ": unexpected failure opening %s\n",
+                       devname);
+       return mdfd;
+}
 
-       /* Simple locking to avoid --incr being called for the same
-        * array multiple times in parallel.
-        */
-       for (i = 0; i < 25 ; i++) {
-               int fd;
 
-               fd = open(chosen_name, O_RDWR|O_EXCL);
-               if (fd >= 0 || errno != EBUSY) {
-                       if (devnum < 0)
-                               make_parts(chosen_name, parts, ci->symlinks);
-                       return fd;
-               }
-               usleep(200000);
+/* Open 'dev' and check that it is an md device.
+ * On success, return the file descriptor.
+ * On failure, return -1 if it cannot be opened,
+ * or -2 if it opens but is not an md device.
+ */
+int open_mddev(char *dev, int report_errors)
+{
+       int mdfd = open(dev, O_RDWR);
+       if (mdfd < 0) {
+               if (report_errors)
+                       fprintf(stderr, Name ": error opening %s: %s\n",
+                               dev, strerror(errno));
+               return -1;
        }
-       return -1;
+       if (md_get_version(mdfd) <= 0) {
+               close(mdfd);
+               if (report_errors)
+                       fprintf(stderr, Name ": %s does not appear to be "
+                               "an md device\n", dev);
+               return -2;
+       }
+       return mdfd;
 }
index a8f7ce7576eb1d017acfe962a1dea48084cde5d2..4d2f473eb09a5b4714dda91fad6c222609c866b2 100644 (file)
--- a/mdstat.c
+++ b/mdstat.c
@@ -2,7 +2,7 @@
  * mdstat - parse /proc/mdstat file. Part of:
  * mdadm - manage Linux "md" devices aka RAID arrays.
  *
- * Copyright (C) 2002-2006 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2002-2009 Neil Brown <neilb@suse.de>
  *
  *
  *    This program is free software; you can redistribute it and/or modify
  *    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  *
  *    Author: Neil Brown
- *    Email: <neilb@cse.unsw.edu.au>
- *    Paper: Neil Brown
- *           School of Computer Science and Engineering
- *           The University of New South Wales
- *           Sydney, 2052
- *           Australia
+ *    Email: <neilb@suse.de>
  */
 
 /*
@@ -86,6 +81,7 @@
 #include       "mdadm.h"
 #include       "dlink.h"
 #include       <sys/select.h>
+#include       <ctype.h>
 
 void free_mdstat(struct mdstat_ent *ms)
 {
@@ -94,6 +90,7 @@ void free_mdstat(struct mdstat_ent *ms)
                if (ms->dev) free(ms->dev);
                if (ms->level) free(ms->level);
                if (ms->pattern) free(ms->pattern);
+               if (ms->metadata_version) free(ms->metadata_version);
                t = ms;
                ms = ms->next;
                free(t);
@@ -158,6 +155,10 @@ struct mdstat_ent *mdstat_read(int hold, int start)
                ent->percent = -1;
                ent->active = -1;
                ent->resync = 0;
+               ent->metadata_version = NULL;
+               ent->raid_disks = 0;
+               ent->chunk_size = 0;
+               ent->devcnt = 0;
 
                ent->dev = strdup(line);
                ent->devnum = devnum;
@@ -176,22 +177,28 @@ struct mdstat_ent *mdstat_read(int hold, int start)
                                in_devs = 1;
                        } else if (in_devs && strcmp(w, "blocks")==0)
                                in_devs = 0;
-                       else if (in_devs && strncmp(w, "md", 2)==0) {
-                               /* This has an md device as a component.
-                                * If that device is already in the list,
-                                * make sure we insert before there.
-                                */
-                               struct mdstat_ent **ih;
-                               int dn2;
-                               if (strncmp(w, "md_d", 4)==0)
-                                       dn2 = -1-strtoul(w+4, &ep, 10);
-                               else
-                                       dn2 = strtoul(w+2, &ep, 10);
-                               ih = &all;
-                               while (ih != insert_here && *ih &&
-                                      (*ih)->devnum != dn2)
-                                       ih = & (*ih)->next;
-                               insert_here = ih;
+                       else if (in_devs) {
+                               ent->devcnt++;
+                               if (strncmp(w, "md", 2)==0) {
+                                       /* This has an md device as a component.
+                                        * If that device is already in the
+                                        * list, make sure we insert before
+                                        * there.
+                                        */
+                                       struct mdstat_ent **ih;
+                                       int dn2 = devname2devnum(w);
+                                       ih = &all;
+                                       while (ih != insert_here && *ih &&
+                                              (*ih)->devnum != dn2)
+                                               ih = & (*ih)->next;
+                                       insert_here = ih;
+                               }
+                       } else if (strcmp(w, "super") == 0 &&
+                                  dl_next(w) != line) {
+                               w = dl_next(w);
+                               ent->metadata_version = strdup(w);
+                       } else if (w[0] == '[' && isdigit(w[1])) {
+                               ent->raid_disks = atoi(w+1);
                        } else if (!ent->pattern &&
                                 w[0] == '[' &&
                                 (w[1] == 'U' || w[1] == '_')) {
@@ -248,12 +255,43 @@ void mdstat_wait(int seconds)
 {
        fd_set fds;
        struct timeval tm;
+       int maxfd = 0;
        FD_ZERO(&fds);
-       if (mdstat_fd >= 0)
+       if (mdstat_fd >= 0) {
                FD_SET(mdstat_fd, &fds);
+               maxfd = mdstat_fd;
+       }
        tm.tv_sec = seconds;
        tm.tv_usec = 0;
-       select(mdstat_fd >2 ? mdstat_fd+1:3, NULL, NULL, &fds, &tm);
+       select(maxfd + 1, NULL, NULL, &fds, &tm);
+}
+
+void mdstat_wait_fd(int fd, const sigset_t *sigmask)
+{
+       fd_set fds, rfds;
+       int maxfd = fd;
+
+       FD_ZERO(&fds);
+       FD_ZERO(&rfds);
+       if (mdstat_fd >= 0)
+               FD_SET(mdstat_fd, &fds);
+       if (fd >= 0) {
+               struct stat stb;
+               fstat(fd, &stb);
+               if ((stb.st_mode & S_IFMT) == S_IFREG)
+                       /* Must be a /proc or /sys fd, so expect
+                        * POLLPRI
+                        * i.e. an 'exceptional' event.
+                        */
+                       FD_SET(fd, &fds);
+               else
+                       FD_SET(fd, &rfds);
+       }
+       if (mdstat_fd > maxfd)
+               maxfd = mdstat_fd;
+
+       pselect(maxfd + 1, &rfds, NULL, &fds,
+               NULL, sigmask);
 }
 
 int mddev_busy(int devnum)
diff --git a/monitor.c b/monitor.c
new file mode 100644 (file)
index 0000000..0cafc3a
--- /dev/null
+++ b/monitor.c
@@ -0,0 +1,573 @@
+/*
+ * mdmon - monitor external metadata arrays
+ *
+ * Copyright (C) 2007-2009 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2007-2009 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "mdadm.h"
+#include "mdmon.h"
+#include <sys/syscall.h>
+#include <sys/select.h>
+#include <signal.h>
+
+static char *array_states[] = {
+       "clear", "inactive", "suspended", "readonly", "read-auto",
+       "clean", "active", "write-pending", "active-idle", NULL };
+static char *sync_actions[] = {
+       "idle", "reshape", "resync", "recover", "check", "repair", NULL
+};
+
+static int write_attr(char *attr, int fd)
+{
+       return write(fd, attr, strlen(attr));
+}
+
+static void add_fd(fd_set *fds, int *maxfd, int fd)
+{
+       if (fd < 0)
+               return;
+       if (fd > *maxfd)
+               *maxfd = fd;
+       FD_SET(fd, fds);
+}
+
+static int read_attr(char *buf, int len, int fd)
+{
+       int n;
+
+       if (fd < 0) {
+               buf[0] = 0;
+               return 0;
+       }
+       lseek(fd, 0, 0);
+       n = read(fd, buf, len - 1);
+
+       if (n <= 0) {
+               buf[0] = 0;
+               return 0;
+       }
+       buf[n] = 0;
+       if (buf[n-1] == '\n')
+               buf[n-1] = 0;
+       return n;
+}
+
+int get_resync_start(struct active_array *a)
+{
+       char buf[30];
+       int n;
+
+       n = read_attr(buf, 30, a->resync_start_fd);
+       if (n <= 0)
+               return n;
+       if (strncmp(buf, "none", 4) == 0)
+               a->resync_start = ~0ULL;
+       else
+               a->resync_start = strtoull(buf, NULL, 10);
+
+       return 1;
+}
+
+
+static enum array_state read_state(int fd)
+{
+       char buf[20];
+       int n = read_attr(buf, 20, fd);
+
+       if (n <= 0)
+               return bad_word;
+       return (enum array_state) sysfs_match_word(buf, array_states);
+}
+
+static enum sync_action read_action( int fd)
+{
+       char buf[20];
+       int n = read_attr(buf, 20, fd);
+
+       if (n <= 0)
+               return bad_action;
+       return (enum sync_action) sysfs_match_word(buf, sync_actions);
+}
+
+int read_dev_state(int fd)
+{
+       char buf[60];
+       int n = read_attr(buf, 60, fd);
+       char *cp;
+       int rv = 0;
+
+       if (n <= 0)
+               return 0;
+
+       cp = buf;
+       while (cp) {
+               if (sysfs_attr_match(cp, "faulty"))
+                       rv |= DS_FAULTY;
+               if (sysfs_attr_match(cp, "in_sync"))
+                       rv |= DS_INSYNC;
+               if (sysfs_attr_match(cp, "write_mostly"))
+                       rv |= DS_WRITE_MOSTLY;
+               if (sysfs_attr_match(cp, "spare"))
+                       rv |= DS_SPARE;
+               if (sysfs_attr_match(cp, "blocked"))
+                       rv |= DS_BLOCKED;
+               cp = strchr(cp, ',');
+               if (cp)
+                       cp++;
+       }
+       return rv;
+}
+
+static void signal_manager(void)
+{
+       /* tgkill(getpid(), mgr_tid, SIGUSR1); */
+       int pid = getpid();
+       syscall(SYS_tgkill, pid, mgr_tid, SIGUSR1);
+}
+
+/* Monitor a set of active md arrays - all of which share the
+ * same metadata - and respond to events that require
+ * metadata update.
+ *
+ * New arrays are detected by another thread which allocates
+ * required memory and attaches the data structure to our list.
+ *
+ * Events:
+ *  Array stops.
+ *    This is detected by array_state going to 'clear' or 'inactive'
+ *    while we thought it was active.
+ *    Response is to mark metadata as clean and 'clear' the array(??)
+ *  write-pending
+ *    array_state is 'write-pending'
+ *    We mark metadata as 'dirty' then set array to 'active'.
+ *  active_idle
+ *    Either ignore, or mark clean, then mark metadata as clean.
+ *
+ *  device fails
+ *    detected by rd-N/state reporting "faulty"
+ *    mark device as 'failed' in metadata, let the kernel release the
+ *    device by writing '-blocked' to rd/state, and finally write 'remove' to
+ *    rd/state.  Before a disk can be replaced it must be failed and removed
+ *    from all container members, this will be preemptive for the other
+ *    arrays... safe?
+ *
+ *  sync completes
+ *    sync_action was 'resync' and becomes 'idle' and resync_start becomes
+ *    MaxSector
+ *    Notify metadata that sync is complete.
+ *
+ *  recovery completes
+ *    sync_action changes from 'recover' to 'idle'
+ *    Check each device state and mark metadata if 'faulty' or 'in_sync'.
+ *
+ *  deal with resync
+ *    This only happens on finding a new array... mdadm will have set
+ *    'resync_start' to the correct value.  If 'resync_start' indicates that a
+ *    resync needs to occur, set the array to the 'active' state rather than the
+ *    initial read-auto state.
+ *
+ *
+ *
+ * We wait for a change (poll/select) on array_state, sync_action, and
+ * each rd-X/state file.
+ * When we get any change, we check everything.  So read each state file,
+ * then decide what to do.
+ *
+ * The core action is to write new metadata to all devices in the array.
+ * This is done at most once on any wakeup.
+ * After that we might:
+ *   - update the array_state
+ *   - set the role of some devices.
+ *   - request a sync_action
+ *
+ */
+
+static int read_and_act(struct active_array *a)
+{
+       int check_degraded = 0;
+       int deactivate = 0;
+       struct mdinfo *mdi;
+       int dirty = 0;
+
+       a->next_state = bad_word;
+       a->next_action = bad_action;
+
+       a->curr_state = read_state(a->info.state_fd);
+       a->curr_action = read_action(a->action_fd);
+       for (mdi = a->info.devs; mdi ; mdi = mdi->next) {
+               mdi->next_state = 0;
+               if (mdi->state_fd >= 0)
+                       mdi->curr_state = read_dev_state(mdi->state_fd);
+       }
+
+       if (a->curr_state <= inactive &&
+           a->prev_state > inactive) {
+               /* array has been stopped */
+               get_resync_start(a);
+               a->container->ss->set_array_state(a, 1);
+               a->next_state = clear;
+               deactivate = 1;
+       }
+       if (a->curr_state == write_pending) {
+               get_resync_start(a);
+               a->container->ss->set_array_state(a, 0);
+               a->next_state = active;
+               dirty = 1;
+       }
+       if (a->curr_state == active_idle) {
+               /* Set array to 'clean' FIRST, then mark clean
+                * in the metadata
+                */
+               a->next_state = clean;
+               dirty = 1;
+       }
+       if (a->curr_state == clean) {
+               get_resync_start(a);
+               a->container->ss->set_array_state(a, 1);
+       }
+       if (a->curr_state == active ||
+           a->curr_state == suspended ||
+           a->curr_state == bad_word)
+               dirty = 1;
+       if (a->curr_state == readonly) {
+               /* Well, I'm ready to handle things.  If readonly
+                * wasn't requested, transition to read-auto.
+                */
+               char buf[64];
+               read_attr(buf, sizeof(buf), a->metadata_fd);
+               if (strncmp(buf, "external:-", 10) == 0) {
+                       /* explicit request for readonly array.  Leave it alone */
+                       ;
+               } else {
+                       get_resync_start(a);
+                       if (a->container->ss->set_array_state(a, 2))
+                               a->next_state = read_auto; /* array is clean */
+                       else {
+                               a->next_state = active; /* Now active for recovery etc */
+                               dirty = 1;
+                       }
+               }
+       }
+
+       if (!deactivate &&
+           a->curr_action == idle &&
+           a->prev_action == resync) {
+               /* A resync has finished.  The endpoint is recorded in
+                * 'resync_start'.  We don't update the metadata
+                * until the array goes inactive or readonly though.
+                * Just check if we need to fiddle spares.
+                */
+               get_resync_start(a);
+               a->container->ss->set_array_state(a, a->curr_state <= clean);
+               check_degraded = 1;
+       }
+
+       if (!deactivate &&
+           a->curr_action == idle &&
+           a->prev_action == recover) {
+               /* A recovery has finished.  Some disks may be in sync now,
+                * and the array may no longer be degraded
+                */
+               for (mdi = a->info.devs ; mdi ; mdi = mdi->next) {
+                       a->container->ss->set_disk(a, mdi->disk.raid_disk,
+                                                  mdi->curr_state);
+                       if (! (mdi->curr_state & DS_INSYNC))
+                               check_degraded = 1;
+               }
+       }
+
+       /* Check for failures and if found:
+        * 1/ Record the failure in the metadata and unblock the device.
+        *    FIXME update the kernel to stop notifying on failed drives when
+        *    the array is readonly and we have cleared 'blocked'
+        * 2/ Try to remove the device if the array is writable, or can be
+        *    made writable.
+        */
+       for (mdi = a->info.devs ; mdi ; mdi = mdi->next) {
+               if (mdi->curr_state & DS_FAULTY) {
+                       a->container->ss->set_disk(a, mdi->disk.raid_disk,
+                                                  mdi->curr_state);
+                       check_degraded = 1;
+                       mdi->next_state |= DS_UNBLOCK;
+                       if (a->curr_state == read_auto) {
+                               a->container->ss->set_array_state(a, 0);
+                               a->next_state = active;
+                       }
+                       if (a->curr_state > readonly)
+                               mdi->next_state |= DS_REMOVE;
+               }
+       }
+
+       a->container->ss->sync_metadata(a->container);
+       dprintf("%s(%d): state:%s action:%s next(", __func__, a->info.container_member,
+               array_states[a->curr_state], sync_actions[a->curr_action]);
+
+       /* Effect state changes in the array */
+       if (a->next_state != bad_word) {
+               dprintf(" state:%s", array_states[a->next_state]);
+               write_attr(array_states[a->next_state], a->info.state_fd);
+       }
+       if (a->next_action != bad_action) {
+               write_attr(sync_actions[a->next_action], a->action_fd);
+               dprintf(" action:%s", sync_actions[a->next_action]);
+       }
+       for (mdi = a->info.devs; mdi ; mdi = mdi->next) {
+               if (mdi->next_state & DS_UNBLOCK) {
+                       dprintf(" %d:-blocked", mdi->disk.raid_disk);
+                       write_attr("-blocked", mdi->state_fd);
+               }
+
+               if ((mdi->next_state & DS_REMOVE) && mdi->state_fd >= 0) {
+                       int remove_result;
+
+                       /* the kernel may not be able to immediately remove the
+                        * disk, we can simply wait until the next event to try
+                        * again.
+                        */
+                       remove_result = write_attr("remove", mdi->state_fd);
+                       if (remove_result > 0) {
+                               dprintf(" %d:removed", mdi->disk.raid_disk);
+                               close(mdi->state_fd);
+                               mdi->state_fd = -1;
+                       }
+               }
+               if (mdi->next_state & DS_INSYNC) {
+                       write_attr("+in_sync", mdi->state_fd);
+                       dprintf(" %d:+in_sync", mdi->disk.raid_disk);
+               }
+       }
+       dprintf(" )\n");
+
+       /* move curr_ to prev_ */
+       a->prev_state = a->curr_state;
+
+       a->prev_action = a->curr_action;
+
+       for (mdi = a->info.devs; mdi ; mdi = mdi->next) {
+               mdi->prev_state = mdi->curr_state;
+               mdi->next_state = 0;
+       }
+
+       if (check_degraded) {
+               /* manager will do the actual check */
+               a->check_degraded = 1;
+               signal_manager();
+       }
+
+       if (deactivate)
+               a->container = NULL;
+
+       return dirty;
+}
+
+static struct mdinfo *
+find_device(struct active_array *a, int major, int minor)
+{
+       struct mdinfo *mdi;
+
+       for (mdi = a->info.devs ; mdi ; mdi = mdi->next)
+               if (mdi->disk.major == major && mdi->disk.minor == minor)
+                       return mdi;
+
+       return NULL;
+}
+
+static void reconcile_failed(struct active_array *aa, struct mdinfo *failed)
+{
+       struct active_array *a;
+       struct mdinfo *victim;
+
+       for (a = aa; a; a = a->next) {
+               if (!a->container)
+                       continue;
+               victim = find_device(a, failed->disk.major, failed->disk.minor);
+               if (!victim)
+                       continue;
+
+               if (!(victim->curr_state & DS_FAULTY))
+                       write_attr("faulty", victim->state_fd);
+       }
+}
+
+#ifdef DEBUG
+static void dprint_wake_reasons(fd_set *fds)
+{
+       int i;
+       char proc_path[256];
+       char link[256];
+       char *basename;
+       int rv;
+
+       fprintf(stderr, "monitor: wake ( ");
+       for (i = 0; i < FD_SETSIZE; i++) {
+               if (FD_ISSET(i, fds)) {
+                       sprintf(proc_path, "/proc/%d/fd/%d",
+                               (int) getpid(), i);
+
+                       rv = readlink(proc_path, link, sizeof(link) - 1);
+                       if (rv < 0) {
+                               fprintf(stderr, "%d:unknown ", i);
+                               continue;
+                       }
+                       link[rv] = '\0';
+                       basename = strrchr(link, '/');
+                       fprintf(stderr, "%d:%s ",
+                               i, basename ? ++basename : link);
+               }
+       }
+       fprintf(stderr, ")\n");
+}
+#endif
+
+int monitor_loop_cnt;
+
+static int wait_and_act(struct supertype *container, int nowait)
+{
+       fd_set rfds;
+       int maxfd = 0;
+       struct active_array **aap = &container->arrays;
+       struct active_array *a, **ap;
+       int rv;
+       struct mdinfo *mdi;
+       static unsigned int dirty_arrays = ~0; /* start at some non-zero value */
+
+       FD_ZERO(&rfds);
+
+       for (ap = aap ; *ap ;) {
+               a = *ap;
+               /* once an array has been deactivated we want to
+                * ask the manager to discard it.
+                */
+               if (!a->container) {
+                       if (discard_this) {
+                               ap = &(*ap)->next;
+                               continue;
+                       }
+                       *ap = a->next;
+                       a->next = NULL;
+                       discard_this = a;
+                       signal_manager();
+                       continue;
+               }
+
+               add_fd(&rfds, &maxfd, a->info.state_fd);
+               add_fd(&rfds, &maxfd, a->action_fd);
+               for (mdi = a->info.devs ; mdi ; mdi = mdi->next)
+                       add_fd(&rfds, &maxfd, mdi->state_fd);
+
+               ap = &(*ap)->next;
+       }
+
+       if (manager_ready && (*aap == NULL || (sigterm && !dirty_arrays))) {
+               /* No interesting arrays, or we have been told to
+                * terminate and everything is clean.  Let's see about
+                * exiting.  Note that blocking at this point is not a
+                * problem as there are no active arrays, there is
+                * nothing that we need to be ready to do.
+                */
+               int fd = open_dev_excl(container->devnum);
+               if (fd >= 0 || errno != EBUSY) {
+                       /* OK, we are safe to leave */
+                       if (sigterm && !dirty_arrays)
+                               dprintf("caught sigterm, all clean... exiting\n");
+                       else
+                               dprintf("no arrays to monitor... exiting\n");
+                       remove_pidfile(container->devname);
+                       exit_now = 1;
+                       signal_manager();
+                       exit(0);
+               }
+       }
+
+       if (!nowait) {
+               sigset_t set;
+               sigprocmask(SIG_UNBLOCK, NULL, &set);
+               sigdelset(&set, SIGUSR1);
+               monitor_loop_cnt |= 1;
+               rv = pselect(maxfd+1, NULL, NULL, &rfds, NULL, &set);
+               monitor_loop_cnt += 1;
+               if (rv == -1 && errno == EINTR)
+                       rv = 0;
+               #ifdef DEBUG
+               dprint_wake_reasons(&rfds);
+               #endif
+
+       }
+
+       if (update_queue) {
+               struct metadata_update *this;
+
+               for (this = update_queue; this ; this = this->next)
+                       container->ss->process_update(container, this);
+
+               update_queue_handled = update_queue;
+               update_queue = NULL;
+               signal_manager();
+               container->ss->sync_metadata(container);
+       }
+
+       rv = 0;
+       dirty_arrays = 0;
+       for (a = *aap; a ; a = a->next) {
+               int is_dirty;
+
+               if (a->replaces && !discard_this) {
+                       struct active_array **ap;
+                       for (ap = &a->next; *ap && *ap != a->replaces;
+                            ap = & (*ap)->next)
+                               ;
+                       if (*ap)
+                               *ap = (*ap)->next;
+                       discard_this = a->replaces;
+                       a->replaces = NULL;
+                       /* FIXME check if device->state_fd needs to be cleared? */
+                       signal_manager();
+               }
+               if (a->container) {
+                       is_dirty = read_and_act(a);
+                       rv |= 1;
+                       dirty_arrays += is_dirty;
+                       /* when terminating stop manipulating the array after it
+                        * is clean, but make sure read_and_act() is given a
+                        * chance to handle 'active_idle'
+                        */
+                       if (sigterm && !is_dirty)
+                               a->container = NULL; /* stop touching this array */
+               }
+       }
+
+       /* propagate failures across container members */
+       for (a = *aap; a ; a = a->next) {
+               if (!a->container)
+                       continue;
+               for (mdi = a->info.devs ; mdi ; mdi = mdi->next)
+                       if (mdi->curr_state & DS_FAULTY)
+                               reconcile_failed(*aap, mdi);
+       }
+
+       return rv;
+}
+
+void do_monitor(struct supertype *container)
+{
+       int rv;
+       int first = 1;
+       do {
+               rv = wait_and_act(container, first);
+               first = 0;
+       } while (rv >= 0);
+}
diff --git a/msg.c b/msg.c
new file mode 100644 (file)
index 0000000..5a4839f
--- /dev/null
+++ b/msg.c
@@ -0,0 +1,223 @@
+/*
+ * Copyright (C) 2008 Intel Corporation
+ *
+ *     mdmon socket / message handling
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include <unistd.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include "mdadm.h"
+#include "mdmon.h"
+
+static const __u32 start_magic = 0x5a5aa5a5;
+static const __u32 end_magic = 0xa5a55a5a;
+
+/* Write exactly 'len' bytes from 'buf' to 'fd', waiting with select()
+ * before every write().  'tmo' is in seconds; 0 means wait with no
+ * timeout.  Returns 0 once everything is written, -1 if select() or
+ * write() fails or times out.
+ */
+static int send_buf(int fd, const void* buf, int len, int tmo)
+{
+       const char *cursor = buf;
+       struct timeval tv = {tmo, 0};
+       struct timeval *wait = NULL;
+       fd_set wfds;
+       int n;
+
+       if (tmo)
+               wait = &tv;
+
+       for (; len != 0; len -= n, cursor += n) {
+               FD_ZERO(&wfds);
+               FD_SET(fd, &wfds);
+               if (select(fd+1, NULL, &wfds, NULL, wait) <= 0)
+                       return -1;
+               n = write(fd, cursor, len);
+               if (n <= 0)
+                       return -1;
+       }
+       return 0;
+}
+
+/* Read exactly 'len' bytes from 'fd' into 'buf', waiting with select()
+ * before every read().  'tmo' is in seconds; 0 means wait with no
+ * timeout.  Returns 0 once everything has arrived, -1 if select() or
+ * read() fails, times out, or the peer closes the connection.
+ */
+static int recv_buf(int fd, void* buf, int len, int tmo)
+{
+       char *cursor = buf;
+       struct timeval tv = {tmo, 0};
+       struct timeval *wait = NULL;
+       fd_set rfds;
+       int n;
+
+       if (tmo)
+               wait = &tv;
+
+       for (; len != 0; len -= n, cursor += n) {
+               FD_ZERO(&rfds);
+               FD_SET(fd, &rfds);
+               if (select(fd+1, &rfds, NULL, NULL, wait) <= 0)
+                       return -1;
+               n = read(fd, cursor, len);
+               if (n <= 0)
+                       return -1;
+       }
+       return 0;
+}
+
+
+/* Send one framed message on 'fd': start magic, 32-bit length, the
+ * payload (only when the length is positive) and the end magic.
+ * 'tmo' is passed to send_buf() for every piece.
+ * Returns 0 when every piece was sent, non-zero as soon as one fails.
+ */
+int send_message(int fd, struct metadata_update *msg, int tmo)
+{
+       __s32 len = msg->len;
+       int rv;
+
+       rv = send_buf(fd, &start_magic, 4, tmo);
+       rv = rv ?: send_buf(fd, &len, 4, tmo);
+       if (len > 0)
+               rv = rv ?: send_buf(fd, msg->buf, msg->len, tmo);
+       /* chain the trailer too, so that an earlier failure is not
+        * masked by a successful write of the end magic
+        */
+       rv = rv ?: send_buf(fd, &end_magic, 4, tmo);
+
+       return rv;
+}
+
+/* Receive one framed message from 'fd' (start magic, 32-bit length,
+ * optional payload, end magic).  Lengths above MSG_MAX_LEN are
+ * rejected.  On success msg->len holds the received length and
+ * msg->buf is either NULL (length <= 0) or a malloc()ed buffer that
+ * the caller must free.  Returns 0 on success, -1 on any failure; a
+ * payload buffer allocated before the failure is freed here.
+ */
+int receive_message(int fd, struct metadata_update *msg, int tmo)
+{
+       __u32 marker;
+       __s32 payload;
+
+       if (recv_buf(fd, &marker, 4, tmo) < 0 || marker != start_magic)
+               return -1;
+       if (recv_buf(fd, &payload, 4, tmo) < 0 || payload > MSG_MAX_LEN)
+               return -1;
+
+       msg->buf = NULL;
+       if (payload > 0) {
+               msg->buf = malloc(payload);
+               if (!msg->buf)
+                       return -1;
+               if (recv_buf(fd, msg->buf, payload, tmo) < 0)
+                       goto fail;
+       }
+       if (recv_buf(fd, &marker, 4, tmo) < 0 || marker != end_magic)
+               goto fail;
+
+       msg->len = payload;
+       return 0;
+
+fail:
+       free(msg->buf);
+       return -1;
+}
+
+/* Send a zero-length message on 'fd'; returns what send_message() returns. */
+int ack(int fd, int tmo)
+{
+       struct metadata_update empty_msg = { .len = 0 };
+       int rv = send_message(fd, &empty_msg, tmo);
+
+       return rv;
+}
+
+/* Wait for one complete message on 'fd' and discard its contents.
+ * Returns 0 if a well-formed message arrived, -1 otherwise.
+ */
+int wait_reply(int fd, int tmo)
+{
+       struct metadata_update msg;
+       int rv = receive_message(fd, &msg, tmo);
+
+       /* receive_message() only hands back a buffer (possibly NULL)
+        * when it succeeds; the payload is not wanted here, so release
+        * it rather than leaking it
+        */
+       if (rv == 0)
+               free(msg.buf);
+       return rv;
+}
+
+/* Connect to the unix socket "/var/run/mdadm/<name>.sock".
+ * When is_subarray(devname) is true, <name> is the text between the
+ * leading character and the next '/'; otherwise it is all of 'devname'.
+ * Returns a connected socket with O_NONBLOCK set, or -1 on failure
+ * (malformed or over-long name, socket() or connect() error).
+ */
+int connect_monitor(char *devname)
+{
+       int sfd;
+       long fl;
+       struct sockaddr_un addr;
+       int len;
+       int n;
+       char *c;
+
+       if (is_subarray(devname)) {
+               devname++;
+               c = strchr(devname, '/');
+               if (!c)
+                       return -1;
+               len = c - devname;
+       } else
+               len = strlen(devname);
+
+       /* build the path directly in the address, bounded by its size,
+        * and refuse names that would not fit instead of overflowing
+        */
+       memset(&addr, 0, sizeof(addr));
+       addr.sun_family = PF_LOCAL;
+       n = snprintf(addr.sun_path, sizeof(addr.sun_path),
+                    "/var/run/mdadm/%.*s.sock", len, devname);
+       if (n < 0 || n >= (int)sizeof(addr.sun_path))
+               return -1;
+
+       sfd = socket(PF_LOCAL, SOCK_STREAM, 0);
+       if (sfd < 0)
+               return -1;
+
+       if (connect(sfd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
+               close(sfd);
+               return -1;
+       }
+
+       /* only touch the flags if they could be read */
+       fl = fcntl(sfd, F_GETFL, 0);
+       if (fl >= 0)
+               fcntl(sfd, F_SETFL, fl | O_NONBLOCK);
+
+       return sfd;
+}
+
+/* give the monitor a chance to update the metadata:
+ * connect, send an empty message and wait for one message in return.
+ * Returns 0 on success, a negative value otherwise.
+ */
+int ping_monitor(char *devname)
+{
+       int sfd = connect_monitor(devname);
+       int err;
+
+       if (sfd < 0)
+               return sfd;
+
+       /* ping the existing socket, then check the reply; the reply is
+        * only waited for when the ping itself went out
+        */
+       if (ack(sfd, 20) == 0 && wait_reply(sfd, 20) == 0)
+               err = 0;
+       else
+               err = -1;
+
+       close(sfd);
+       return err;
+}
+
+/* give the manager a chance to view the updated container state.  This
+ * would naturally happen due to the manager noticing a change in
+ * /proc/mdstat; however, pinging encourages this detection to happen
+ * while an exclusive open() on the container is active.
+ * The ping is a message whose length field is -1.
+ */
+int ping_manager(char *devname)
+{
+       struct metadata_update msg = { .len = -1 };
+       int sfd = connect_monitor(devname);
+       int err;
+
+       if (sfd < 0)
+               return sfd;
+
+       err = send_message(sfd, &msg, 20);
+       if (err == 0) {
+               /* check the reply */
+               if (wait_reply(sfd, 20) != 0)
+                       err = -1;
+       }
+
+       close(sfd);
+       return err;
+}
diff --git a/msg.h b/msg.h
new file mode 100644 (file)
index 0000000..b9bd205
--- /dev/null
+++ b/msg.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (C) 2008 Intel Corporation
+ *
+ *     mdmon socket / message handling
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+
+struct mdinfo;
+struct metadata_update;
+
+/* Socket message helpers implemented in msg.c.  'tmo' is a timeout in
+ * seconds used for each select() call; 0 waits without a timeout.
+ */
+
+/* read one framed message; on success msg->buf is NULL or malloc()ed */
+extern int receive_message(int fd, struct metadata_update *msg, int tmo);
+/* write one framed message; the payload is only sent when msg->len > 0 */
+extern int send_message(int fd, struct metadata_update *msg, int tmo);
+/* send a zero-length message */
+extern int ack(int fd, int tmo);
+/* wait for one message and ignore its contents */
+extern int wait_reply(int fd, int tmo);
+/* connect to /var/run/mdadm/<name>.sock; returns the socket or -1 */
+extern int connect_monitor(char *devname);
+/* connect, send an empty message and wait for the reply */
+extern int ping_monitor(char *devname);
+/* connect, send a message of length -1 and wait for the reply */
+extern int ping_manager(char *devname);
+
+/* largest payload length receive_message() accepts */
+#define MSG_MAX_LEN (4*1024*1024)
diff --git a/platform-intel.c b/platform-intel.c
new file mode 100644 (file)
index 0000000..5160227
--- /dev/null
@@ -0,0 +1,257 @@
+/*
+ * Intel(R) Matrix Storage Manager hardware and firmware support routines
+ *
+ * Copyright (C) 2008 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+#include "mdadm.h"
+#include "platform-intel.h"
+#include "probe_roms.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <dirent.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+/* Free every node of '*list' together with its path string and leave
+ * '*list' set to NULL.
+ */
+void free_sys_dev(struct sys_dev **list)
+{
+       struct sys_dev *node;
+
+       while ((node = *list) != NULL) {
+               *list = node->next;
+               free(node->path); /* free(NULL) is a no-op */
+               free(node);
+       }
+}
+
+/* Search sysfs for devices on 'bus' that are bound to 'driver'.
+ * Returns a malloc()ed list (release it with free_sys_dev()) holding
+ * the canonical sysfs path of each device, or NULL when the driver
+ * directory cannot be opened, nothing matches, or memory runs out.
+ * A node's path may be NULL if canonicalize_file_name() failed.
+ */
+struct sys_dev *find_driver_devices(const char *bus, const char *driver)
+{
+       /* search sysfs for devices driven by 'driver' */
+       char path[256];
+       char link[256];
+       char *c;
+       DIR *driver_dir;
+       struct dirent *de;
+       struct sys_dev *head = NULL;
+       struct sys_dev *list = NULL;
+       struct sys_dev *node;
+       int n;
+
+       n = snprintf(path, sizeof(path), "/sys/bus/%s/drivers/%s", bus, driver);
+       if (n < 0 || n >= (int)sizeof(path))
+               return NULL;
+       driver_dir = opendir(path);
+       if (!driver_dir)
+               return NULL;
+       for (de = readdir(driver_dir); de; de = readdir(driver_dir)) {
+               /* is 'de' a device? check that the 'subsystem' link exists and
+                * that its target matches 'bus'
+                */
+               n = snprintf(path, sizeof(path),
+                            "/sys/bus/%s/drivers/%s/%s/subsystem",
+                            bus, driver, de->d_name);
+               if (n < 0 || n >= (int)sizeof(path))
+                       continue;
+               /* readlink() does not NUL-terminate: keep room for the
+                * terminator and add it before using string functions
+                */
+               n = readlink(path, link, sizeof(link) - 1);
+               if (n < 0)
+                       continue;
+               link[n] = '\0';
+               c = strrchr(link, '/');
+               if (!c)
+                       continue;
+               if (strncmp(bus, c+1, strlen(bus)) != 0)
+                       continue;
+
+               /* start / add list entry */
+               node = malloc(sizeof(*node));
+               if (!node) {
+                       free_sys_dev(&head);
+                       break;
+               }
+
+               /* generate canonical path name for the device; this is
+                * shorter than the '/subsystem' path that already fitted
+                */
+               snprintf(path, sizeof(path), "/sys/bus/%s/drivers/%s/%s",
+                        bus, driver, de->d_name);
+               node->path = canonicalize_file_name(path);
+               node->next = NULL;
+               if (list)
+                       list->next = node;
+               else
+                       head = node;
+               list = node;
+       }
+       closedir(driver_dir);
+
+       return head;
+}
+
+/* Read the 'vendor' attribute below 'dev_path' and return it parsed as
+ * a hexadecimal number.  Returns 0xffff when the file cannot be opened
+ * or when fewer than 7 bytes could be read from it (presumably the
+ * attribute looks like "0x8086\n").
+ */
+__u16 devpath_to_vendor(const char *dev_path)
+{
+       static const char suffix[] = "/vendor";
+       char attr[strlen(dev_path) + sizeof(suffix)];
+       char text[7];
+       __u16 vendor_id = 0xffff;
+       int fd;
+
+       strcpy(attr, dev_path);
+       strcat(attr, suffix);
+
+       fd = open(attr, O_RDONLY);
+       if (fd < 0)
+               return vendor_id;
+
+       if (read(fd, text, sizeof(text)) == sizeof(text)) {
+               /* the last byte read is replaced by the terminator */
+               text[sizeof(text) - 1] = '\0';
+               vendor_id = strtoul(text, NULL, 16);
+       }
+       close(fd);
+
+       return vendor_id;
+}
+
+/* Return 1 if any device bound to the pci 'ahci' driver reports vendor
+ * id 0x8086, 0 otherwise.
+ */
+static int platform_has_intel_ahci(void)
+{
+       struct sys_dev *devices = find_driver_devices("pci", "ahci");
+       struct sys_dev *dev = devices;
+       int found = 0;
+
+       while (dev && !found) {
+               found = (devpath_to_vendor(dev->path) == 0x8086);
+               dev = dev->next;
+       }
+
+       free_sys_dev(&devices);
+
+       return found;
+}
+
+
+static struct imsm_orom imsm_orom;
+/* scan_fn callback: look for the "$VER" signature at every 4-byte
+ * step between 'start' and 'end'.  On a match the table found there is
+ * copied into the file-level 'imsm_orom' and 1 is returned; otherwise 0.
+ */
+static int scan(const void *start, const void *end)
+{
+       const struct imsm_orom *candidate;
+       int span = (end - start);
+       int pos = 0;
+
+       while (pos < span) {
+               candidate = start + pos;
+               if (memcmp(candidate->signature, "$VER", 4) == 0) {
+                       imsm_orom = *candidate;
+                       return 1;
+               }
+               pos += 4;
+       }
+
+       return 0;
+}
+
+/* Return a pointer to the cached IMSM option-rom table, locating it on
+ * the first successful call.  Returns NULL when no Intel ahci device
+ * is present, the rom region cannot be probed, or no "$VER" table is
+ * found.  The returned pointer refers to static storage.
+ */
+const struct imsm_orom *find_imsm_orom(void)
+{
+       static int populated = 0;
+
+       /* it's static data so we only need to read it once */
+       if (populated)
+               return &imsm_orom;
+
+       /* when check_env("IMSM_TEST_OROM") is true a made-up table is
+        * used instead of looking at the hardware
+        */
+       if (check_env("IMSM_TEST_OROM")) {
+               memset(&imsm_orom, 0, sizeof(imsm_orom));
+               imsm_orom.rlc = IMSM_OROM_RLC_RAID0 | IMSM_OROM_RLC_RAID1 |
+                               IMSM_OROM_RLC_RAID10 | IMSM_OROM_RLC_RAID5;
+               imsm_orom.sss = IMSM_OROM_SSS_4kB | IMSM_OROM_SSS_8kB |
+                               IMSM_OROM_SSS_16kB | IMSM_OROM_SSS_32kB |
+                               IMSM_OROM_SSS_64kB | IMSM_OROM_SSS_128kB |
+                               IMSM_OROM_SSS_256kB | IMSM_OROM_SSS_512kB |
+                               IMSM_OROM_SSS_1MB | IMSM_OROM_SSS_2MB;
+               imsm_orom.dpa = 6;
+               imsm_orom.tds = 6;
+               imsm_orom.vpa = 2;
+               imsm_orom.vphba = 4;
+               imsm_orom.attr = imsm_orom.rlc | IMSM_OROM_ATTR_ChecksumVerify;
+               populated = 1;
+               return &imsm_orom;
+       }
+
+       if (!platform_has_intel_ahci())
+               return NULL;
+
+       /* scan option-rom memory looking for an imsm signature */
+       if (probe_roms_init() != 0)
+               return NULL;
+       probe_roms();
+       /* scan() fills imsm_orom and returns 1 when it finds the table */
+       populated = scan_adapter_roms(scan);
+       probe_roms_exit();
+
+       if (populated)
+               return &imsm_orom;
+       return NULL;
+}
+
+/* Return the canonical path of /sys/dev/block/<major>:<minor>/device
+ * for 'dev', as returned by canonicalize_file_name() (the caller frees
+ * it), or NULL if the path cannot be built or resolved.
+ */
+char *devt_to_devpath(dev_t dev)
+{
+       /* large enough for two full-width 32-bit numbers */
+       char device[64];
+       int n;
+
+       n = snprintf(device, sizeof(device), "/sys/dev/block/%u:%u/device",
+                    (unsigned int)major(dev), (unsigned int)minor(dev));
+       if (n < 0 || n >= (int)sizeof(device))
+               return NULL;
+       return canonicalize_file_name(device);
+}
+
+/* Return the device path for the block device open on 'fd' (see
+ * devt_to_devpath()).  Returns NULL when fstat() fails, when 'fd' is
+ * not a block device, or when the path lookup itself fails -
+ * presumably the latter covers partitions.
+ */
+static char *diskfd_to_devpath(int fd)
+{
+       struct stat sb;
+
+       if (fstat(fd, &sb) == 0 && S_ISBLK(sb.st_mode))
+               return devt_to_devpath(sb.st_rdev);
+
+       return NULL;
+}
+
+/* Return 1 when 'disk_path' begins with 'hba_path', 0 otherwise
+ * (including when either argument is NULL).
+ */
+int path_attached_to_hba(const char *disk_path, const char *hba_path)
+{
+       if (!disk_path || !hba_path)
+               return 0;
+
+       return strncmp(disk_path, hba_path, strlen(hba_path)) == 0;
+}
+
+/* Return 1 when the device path of 'dev' begins with 'hba_path'. */
+int devt_attached_to_hba(dev_t dev, const char *hba_path)
+{
+       char *disk_path = devt_to_devpath(dev);
+       int attached = path_attached_to_hba(disk_path, hba_path);
+
+       free(disk_path); /* NULL is fine here */
+
+       return attached;
+}
+
+/* Return 1 when the device path of the disk open on 'fd' begins with
+ * 'hba_path'.
+ */
+int disk_attached_to_hba(int fd, const char *hba_path)
+{
+       char *disk_path = diskfd_to_devpath(fd);
+       int attached = path_attached_to_hba(disk_path, hba_path);
+
+       free(disk_path); /* NULL is fine here */
+
+       return attached;
+}
+
diff --git a/platform-intel.h b/platform-intel.h
new file mode 100644 (file)
index 0000000..bbdc9f9
--- /dev/null
@@ -0,0 +1,129 @@
+/*
+ * Intel(R) Matrix Storage Manager hardware and firmware support routines
+ *
+ * Copyright (C) 2008 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+#include <asm/types.h>
+#include <strings.h>
+
+/* The IMSM OROM Version Table definition */
+struct imsm_orom {
+       __u8 signature[4];
+       __u8 table_ver_major; /* Currently 2 (can change with future revs) */
+       __u8 table_ver_minor; /* Currently 2 (can change with future revs) */
+       __u16 major_ver; /* Example: 8 as in 8.6.0.1020 */
+       __u16 minor_ver; /* Example: 6 as in 8.6.0.1020 */
+       __u16 hotfix_ver; /* Example: 0 as in 8.6.0.1020 */
+       __u16 build; /* Example: 1020 as in 8.6.0.1020 */
+       __u8 len; /* number of bytes in this entire table */
+       __u8 checksum; /* checksum of all the bytes in this table */
+       __u16 rlc; /* RAID Level Capability */
+       /* we assume the cpu is x86 as the orom should not be found
+        * anywhere else
+        */
+       #define IMSM_OROM_RLC_RAID0 (1 << 0)
+       #define IMSM_OROM_RLC_RAID1 (1 << 1)
+       #define IMSM_OROM_RLC_RAID10 (1 << 2)
+       #define IMSM_OROM_RLC_RAID1E (1 << 3)
+       #define IMSM_OROM_RLC_RAID5 (1 << 4)
+       #define IMSM_OROM_RLC_RAID_CNG (1 << 5)
+       __u16 sss; /* Strip Size Supported */
+       #define IMSM_OROM_SSS_2kB (1 << 0)
+       #define IMSM_OROM_SSS_4kB (1 << 1)
+       #define IMSM_OROM_SSS_8kB (1 << 2)
+       #define IMSM_OROM_SSS_16kB (1 << 3)
+       #define IMSM_OROM_SSS_32kB (1 << 4)
+       #define IMSM_OROM_SSS_64kB (1 << 5)
+       #define IMSM_OROM_SSS_128kB (1 << 6)
+       #define IMSM_OROM_SSS_256kB (1 << 7)
+       #define IMSM_OROM_SSS_512kB (1 << 8)
+       #define IMSM_OROM_SSS_1MB (1 << 9)
+       #define IMSM_OROM_SSS_2MB (1 << 10)
+       #define IMSM_OROM_SSS_4MB (1 << 11)
+       #define IMSM_OROM_SSS_8MB (1 << 12)
+       #define IMSM_OROM_SSS_16MB (1 << 13)
+       #define IMSM_OROM_SSS_32MB (1 << 14)
+       #define IMSM_OROM_SSS_64MB (1 << 15)
+       __u16 dpa; /* Disks Per Array supported */
+       __u16 tds; /* Total Disks Supported */
+       __u8 vpa; /* # Volumes Per Array supported */
+       __u8 vphba; /* # Volumes Per Host Bus Adapter supported */
+       /* Attributes supported. This should map to the
+        * attributes in the MPB. Also, lower 16 bits
+        * should match/duplicate RLC bits above.
+        */
+       __u32 attr;
+       #define IMSM_OROM_ATTR_RAID0 IMSM_OROM_RLC_RAID0
+       #define IMSM_OROM_ATTR_RAID1 IMSM_OROM_RLC_RAID1
+       #define IMSM_OROM_ATTR_RAID10 IMSM_OROM_RLC_RAID10
+       #define IMSM_OROM_ATTR_RAID1E IMSM_OROM_RLC_RAID1E
+       #define IMSM_OROM_ATTR_RAID5 IMSM_OROM_RLC_RAID5
+       #define IMSM_OROM_ATTR_RAID_CNG IMSM_OROM_RLC_RAID_CNG
+       #define IMSM_OROM_ATTR_2TB (1 << 29)
+       #define IMSM_OROM_ATTR_PM (1 << 30)
+       #define IMSM_OROM_ATTR_ChecksumVerify (1 << 31)
+       __u32 reserved1;
+       __u32 reserved2;
+} __attribute__((packed));
+
+/* 1 if the RAID0 bit is set in the orom's RAID level capability */
+static inline int imsm_orom_has_raid0(const struct imsm_orom *orom)
+{
+       return (orom->rlc & IMSM_OROM_RLC_RAID0) != 0;
+}
+/* 1 if the RAID1 bit is set in the orom's RAID level capability */
+static inline int imsm_orom_has_raid1(const struct imsm_orom *orom)
+{
+       return (orom->rlc & IMSM_OROM_RLC_RAID1) != 0;
+}
+/* 1 if the RAID1E bit is set in the orom's RAID level capability */
+static inline int imsm_orom_has_raid1e(const struct imsm_orom *orom)
+{
+       return (orom->rlc & IMSM_OROM_RLC_RAID1E) != 0;
+}
+/* 1 if the RAID10 bit is set in the orom's RAID level capability */
+static inline int imsm_orom_has_raid10(const struct imsm_orom *orom)
+{
+       return (orom->rlc & IMSM_OROM_RLC_RAID10) != 0;
+}
+/* 1 if the RAID5 bit is set in the orom's RAID level capability */
+static inline int imsm_orom_has_raid5(const struct imsm_orom *orom)
+{
+       return (orom->rlc & IMSM_OROM_RLC_RAID5) != 0;
+}
+
+/**
+ * imsm_orom_has_chunk - check if the orom supports the given chunk size
+ * @orom: orom pointer from find_imsm_orom
+ * @chunk: chunk size in kibibytes
+ *
+ * Returns 1 if the matching bit is set in @orom->sss, 0 otherwise.
+ * Bit 0 of 'sss' stands for 2kB, so sizes below 2 (including 0 and
+ * negative values) and sizes that are not a power of 2 are never
+ * supported.
+ */
+static inline int imsm_orom_has_chunk(const struct imsm_orom *orom, int chunk)
+{
+       int fs;
+
+       /* rejecting chunk < 2 also keeps the shift count below from
+        * going negative, which would be undefined
+        */
+       if (chunk < 2 || (chunk & (chunk - 1)))
+               return 0;
+       fs = ffs(chunk) - 1; /* bit num to bit index */
+       return !!(orom->sss & (1 << (fs - 1)));
+}
+
+struct sys_dev {
+       char *path;
+       struct sys_dev *next;
+};
+
+struct sys_dev *find_driver_devices(const char *bus, const char *driver);
+__u16 devpath_to_vendor(const char *dev_path);
+void free_sys_dev(struct sys_dev **list);
+const struct imsm_orom *find_imsm_orom(void);
+int disk_attached_to_hba(int fd, const char *hba_path);
+char *devt_to_devpath(dev_t dev);
+int path_attached_to_hba(const char *disk_path, const char *hba_path);
diff --git a/probe_roms.c b/probe_roms.c
new file mode 100644 (file)
index 0000000..06ec3f5
--- /dev/null
@@ -0,0 +1,279 @@
+/*
+ * probe_roms - scan for Adapter ROMS
+ *
+ * (based on linux-2.6:arch/x86/kernel/probe_roms_32.c)
+ *
+ * Copyright (C) 2008 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "probe_roms.h"
+#include <unistd.h>
+#include <signal.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <asm/types.h>
+
+static void *rom_mem = MAP_FAILED;
+static int rom_fd = -1;
+const static int rom_len = 0xf0000 - 0xc0000; /* option-rom memory region */
+static int _sigbus;
+#define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0]))
+
+/* SIGBUS handler: just record that the signal was delivered */
+static void sigbus(int sig)
+{
+       (void)sig;
+       _sigbus = 1;
+}
+
+/* Copy the byte at 'ptr' into '*val'.  Returns -1 if the SIGBUS flag
+ * is set after the read, 0 otherwise; the flag is cleared either way.
+ */
+static int probe_address8(const __u8 *ptr, __u8 *val)
+{
+       int faulted;
+
+       *val = *ptr;
+       faulted = _sigbus;
+       _sigbus = 0;
+
+       return faulted ? -1 : 0;
+}
+
+/* Copy the 16-bit value at 'ptr' into '*val'.  Returns -1 if the
+ * SIGBUS flag is set after the read, 0 otherwise; the flag is cleared
+ * either way.
+ */
+static int probe_address16(const __u16 *ptr, __u16 *val)
+{
+       int faulted;
+
+       *val = *ptr;
+       faulted = _sigbus;
+       _sigbus = 0;
+
+       return faulted ? -1 : 0;
+}
+
+/* Undo probe_roms_init(): restore the default SIGBUS disposition,
+ * close the descriptor and unmap the rom region if they are in use.
+ */
+void probe_roms_exit(void)
+{
+       signal(SIGBUS, SIG_DFL);
+
+       if (rom_fd >= 0)
+               close(rom_fd);
+       rom_fd = -1;
+
+       if (rom_mem != MAP_FAILED)
+               munmap(rom_mem, rom_len);
+       rom_mem = MAP_FAILED;
+}
+
+/* Install the SIGBUS handler, open /dev/mem and map the option-rom
+ * region (rom_len bytes at physical 0xc0000) read-only.
+ * Returns 0 on success.  On failure -1 is returned and everything
+ * acquired so far is released through probe_roms_exit().
+ */
+int probe_roms_init(void)
+{
+       int rc = 0;
+
+       if (signal(SIGBUS, sigbus) == SIG_ERR)
+               rc = -1;
+       if (rc == 0) {
+               /* store the descriptor in rom_fd straight away so that
+                * probe_roms_exit() closes it if mmap() fails below
+                */
+               rom_fd = open("/dev/mem", O_RDONLY);
+               if (rom_fd < 0)
+                       rc = -1;
+       }
+       if (rc == 0) {
+               rom_mem = mmap(NULL, rom_len, PROT_READ, MAP_PRIVATE, rom_fd, 0xc0000);
+               if (rom_mem == MAP_FAILED)
+                       rc = -1;
+       }
+
+       if (rc != 0)
+               probe_roms_exit();
+
+       return rc;
+}
+
+/**
+ * isa_bus_to_virt - convert physical address to mmap'd region
+ * @addr - address to convert
+ *
+ * The result is 'addr' rebased from 0xc0000 onto rom_mem.  Only valid
+ * between a successful call to probe_roms_init and the corresponding
+ * probe_roms_exit
+ */
+static void *isa_bus_to_virt(unsigned long addr)
+{
+       unsigned long offset = addr - 0xc0000;
+
+       return rom_mem + offset;
+}
+
+struct resource {
+       unsigned long start;
+       unsigned long end;
+       const char *name;
+};
+
+static struct resource system_rom_resource = {
+       .name   = "System ROM",
+       .start  = 0xf0000,
+       .end    = 0xfffff,
+};
+
+static struct resource extension_rom_resource = {
+       .name   = "Extension ROM",
+       .start  = 0xe0000,
+       .end    = 0xeffff,
+};
+
+static struct resource adapter_rom_resources[] = { {
+       .name   = "Adapter ROM",
+       .start  = 0xc8000,
+       .end    = 0,
+}, {
+       .name   = "Adapter ROM",
+       .start  = 0,
+       .end    = 0,
+}, {
+       .name   = "Adapter ROM",
+       .start  = 0,
+       .end    = 0,
+}, {
+       .name   = "Adapter ROM",
+       .start  = 0,
+       .end    = 0,
+}, {
+       .name   = "Adapter ROM",
+       .start  = 0,
+       .end    = 0,
+}, {
+       .name   = "Adapter ROM",
+       .start  = 0,
+       .end    = 0,
+} };
+
+static struct resource video_rom_resource = {
+       .name   = "Video ROM",
+       .start  = 0xc0000,
+       .end    = 0xc7fff,
+};
+
+#define ROMSIGNATURE 0xaa55
+
+/* Return 1 if the first 16 bits at 'rom' could be read and equal
+ * ROMSIGNATURE, 0 otherwise.
+ */
+static int romsignature(const unsigned char *rom)
+{
+       unsigned short sig = 0;
+
+       if (probe_address16((const unsigned short *)rom, &sig) != 0)
+               return 0;
+       return sig == ROMSIGNATURE;
+}
+
+/* Return 1 if all 'length' bytes at 'rom' could be read and their
+ * 8-bit sum is zero, 0 otherwise.
+ */
+static int romchecksum(const unsigned char *rom, unsigned long length)
+{
+       unsigned char sum = 0;
+       unsigned char c;
+
+       while (length) {
+               if (probe_address8(rom++, &c) != 0)
+                       break; /* unreadable byte: 'length' stays non-zero */
+               sum += c;
+               length--;
+       }
+       return !length && !sum;
+}
+
+/* Call 'fn' on each adapter rom recorded in adapter_rom_resources, in
+ * order, stopping at the first slot whose start is 0.  Returns the
+ * first non-zero value 'fn' returns, or 0 if there is none or the rom
+ * region is not open.
+ */
+int scan_adapter_roms(scan_fn fn)
+{
+       /* let scan_fn examine each of the adapter roms found by probe_roms */
+       int i;
+
+       if (rom_fd < 0)
+               return 0;
+
+       for (i = 0; i < ARRAY_SIZE(adapter_rom_resources); i++) {
+               struct resource *res = &adapter_rom_resources[i];
+               int found;
+
+               if (!res->start)
+                       break;
+
+               found = fn(isa_bus_to_virt(res->start),
+                          isa_bus_to_virt(res->end));
+               if (found)
+                       return found;
+       }
+
+       return 0;
+}
+
+/* Walk the mapped option-rom region and record what is found in the
+ * file-level resource tables: first the video rom, then the adapter
+ * roms that lie between it and the extension/system rom.  Does nothing
+ * unless probe_roms_init() succeeded (rom_fd is valid).
+ */
+void probe_roms(void)
+{
+       const void *rom;
+       unsigned long start, length, upper;
+       unsigned char c;
+       int i;
+
+       if (rom_fd < 0)
+               return;
+
+       /* video rom */
+       upper = adapter_rom_resources[0].start;
+       for (start = video_rom_resource.start; start < upper; start += 2048) {
+               rom = isa_bus_to_virt(start);
+               if (!romsignature(rom))
+                       continue;
+
+               video_rom_resource.start = start;
+
+               /* the byte after the signature gives the length in
+                * units of 512 bytes
+                */
+               if (probe_address8(rom + 2, &c) != 0)
+                       continue;
+
+               /* 0 < length <= 0x7f * 512, historically */
+               length = c * 512;
+
+               /* if checksum okay, trust length byte */
+               if (length && romchecksum(rom, length))
+                       video_rom_resource.end = start + length - 1;
+               break;
+       }
+
+       /* continue at the first 2k boundary after the video rom, but
+        * not below the first adapter rom slot
+        */
+       start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
+       if (start < upper)
+               start = upper;
+
+       /* system rom */
+       upper = system_rom_resource.start;
+
+       /* check for extension rom (ignore length byte!) */
+       rom = isa_bus_to_virt(extension_rom_resource.start);
+       if (romsignature(rom)) {
+               length = extension_rom_resource.end - extension_rom_resource.start + 1;
+               if (romchecksum(rom, length))
+                       upper = extension_rom_resource.start;
+       }
+
+       /* check for adapter roms on 2k boundaries */
+       for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
+               rom = isa_bus_to_virt(start);
+               if (!romsignature(rom))
+                       continue;
+
+               if (probe_address8(rom + 2, &c) != 0)
+                       continue;
+
+               /* 0 < length <= 0x7f * 512, historically */
+               length = c * 512;
+
+               /* but accept any length that fits if checksum okay */
+               if (!length || start + length > upper || !romchecksum(rom, length))
+                       continue;
+
+               adapter_rom_resources[i].start = start;
+               adapter_rom_resources[i].end = start + length - 1;
+
+               /* round down to the 2k block holding the rom's last
+                * byte; the loop increment then moves past it
+                */
+               start = adapter_rom_resources[i++].end & ~2047UL;
+       }
+}
+
diff --git a/probe_roms.h b/probe_roms.h
new file mode 100644 (file)
index 0000000..557e933
--- /dev/null
@@ -0,0 +1,24 @@
+/*
+ * probe_roms - scan for Adapter ROMS
+ *
+ * Copyright (C) 2008 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+void probe_roms_exit(void);
+int probe_roms_init(void);
+typedef int (*scan_fn)(const void *start, const void *end);
+int scan_adapter_roms(scan_fn fn);
+void probe_roms(void);
index afde8363529c5bc48cebc1a108bf2505d074e8d0..29c7336192a12bf9d4664758c53e8bb7ee1059ec 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * mdadm - manage Linux "md" devices aka RAID arrays.
  *
- * Copyright (C) 2006 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2006-2009 Neil Brown <neilb@suse.de>
  *
  *
  *    This program is free software; you can redistribute it and/or modify
@@ -152,7 +152,8 @@ int save_stripes(int *source, unsigned long long *offsets,
                 int nwrites, int *dest,
                 unsigned long long start, unsigned long long length)
 {
-       char buf[8192];
+       char abuf[8192+512];
+       char *buf = (char*)(((unsigned long)abuf+511)&~511UL);
        int cpos = start % chunk_size; /* where in chunk we are up to */
        int len;
        int data_disks = raid_disks - (level == 0 ? 0 : level <=5 ? 1 : 2);
@@ -162,7 +163,7 @@ int save_stripes(int *source, unsigned long long *offsets,
                unsigned long long offset;
                int i;
                len = chunk_size - cpos;
-               if (len > sizeof(buf)) len = sizeof(buf);
+               if (len > 8192) len = 8192;
                if (len > length) len = length;
                /* len bytes to be moved from one device */
 
diff --git a/sg_io.c b/sg_io.c
new file mode 100644 (file)
index 0000000..f9682be
--- /dev/null
+++ b/sg_io.c
@@ -0,0 +1,42 @@
+/*
+ * Copyright (C) 2007-2008 Intel Corporation
+ *
+ *     Retrieve drive serial numbers for scsi disks
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+#include <string.h>
+#include <scsi/scsi.h>
+#include <scsi/sg.h>
+#include <sys/ioctl.h>
+
+/* Issue an INQUIRY for vital product data page 0x80 through SG_IO and
+ * place the response in 'buf'.  At most 255 bytes are requested, as
+ * the allocation length of this command is a single byte.
+ * Returns the result of the SG_IO ioctl().
+ */
+int scsi_get_serial(int fd, void *buf, size_t buf_len)
+{
+       /* clamp rather than letting a large buf_len wrap around in the
+        * one-byte allocation length field of the command
+        */
+       unsigned char alloc_len = buf_len > 255 ? 255 : buf_len;
+       unsigned char inq_cmd[] = {INQUIRY, 1, 0x80, 0, alloc_len, 0};
+       unsigned char sense[32];
+       struct sg_io_hdr io_hdr;
+
+       memset(&io_hdr, 0, sizeof(io_hdr));
+       io_hdr.interface_id = 'S';
+       io_hdr.cmdp = inq_cmd;
+       io_hdr.cmd_len = sizeof(inq_cmd);
+       io_hdr.dxferp = buf;
+       /* keep the transfer length in step with the command */
+       io_hdr.dxfer_len = alloc_len;
+       io_hdr.dxfer_direction = SG_DXFER_FROM_DEV;
+       io_hdr.sbp = sense;
+       io_hdr.mx_sb_len = sizeof(sense);
+       io_hdr.timeout = 5000;
+
+       return ioctl(fd, SG_IO, &io_hdr);
+}
diff --git a/super-ddf.c b/super-ddf.c
new file mode 100644 (file)
index 0000000..bcd44d1
--- /dev/null
@@ -0,0 +1,3627 @@
+/*
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2006-2009 Neil Brown <neilb@suse.de>
+ *
+ *
+ *    This program is free software; you can redistribute it and/or modify
+ *    it under the terms of the GNU General Public License as published by
+ *    the Free Software Foundation; either version 2 of the License, or
+ *    (at your option) any later version.
+ *
+ *    This program is distributed in the hope that it will be useful,
+ *    but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *    GNU General Public License for more details.
+ *
+ *    You should have received a copy of the GNU General Public License
+ *    along with this program; if not, write to the Free Software
+ *    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ *    Author: Neil Brown
+ *    Email: <neil@brown.name>
+ *
+ * Specifications for DDF taken from Common RAID DDF Specification Revision 1.2
+ * (July 28 2006).  Reused by permission of SNIA.
+ */
+
+#define HAVE_STDINT_H 1
+#include "mdadm.h"
+#include "mdmon.h"
+#include "sha1.h"
+#include <values.h>
+
+/* a non-official T10 name for creation GUIDs */
+static char T10[] = "Linux-MD";
+
+/* DDF timestamps are 1980 based, so we need to add
+ * seconds-in-decade-of-seventies to convert to linux timestamps.
+ * 10 years with 2 leap years.
+ */
+#define DECADE (3600*24*(365*10+2))
+unsigned long crc32(
+       unsigned long crc,
+       const unsigned char *buf,
+       unsigned len);
+
+/* The DDF metadata handling.
+ * DDF metadata lives at the end of the device.
+ * The last 512 byte block provides an 'anchor' which is used to locate
+ * the rest of the metadata which usually lives immediately behind the anchor.
+ *
+ * Note:
+ *  - all multibyte numeric fields are bigendian.
+ *  - all strings are space padded.
+ *
+ */
+
+/* Primary Raid Level (PRL) */
+#define        DDF_RAID0       0x00
+#define        DDF_RAID1       0x01
+#define        DDF_RAID3       0x03
+#define        DDF_RAID4       0x04
+#define        DDF_RAID5       0x05
+#define        DDF_RAID1E      0x11
+#define        DDF_JBOD        0x0f
+#define        DDF_CONCAT      0x1f
+#define        DDF_RAID5E      0x15
+#define        DDF_RAID5EE     0x25
+#define        DDF_RAID6       0x06
+
+/* Raid Level Qualifier (RLQ) */
+#define        DDF_RAID0_SIMPLE        0x00
+#define        DDF_RAID1_SIMPLE        0x00 /* just 2 devices in this plex */
+#define        DDF_RAID1_MULTI         0x01 /* exactly 3 devices in this plex */
+#define        DDF_RAID3_0             0x00 /* parity in first extent */
+#define        DDF_RAID3_N             0x01 /* parity in last extent */
+#define        DDF_RAID4_0             0x00 /* parity in first extent */
+#define        DDF_RAID4_N             0x01 /* parity in last extent */
+/* these apply to raid5e and raid5ee as well */
+#define        DDF_RAID5_0_RESTART     0x00 /* same as 'right asymmetric' - layout 1 */
+#define        DDF_RAID6_0_RESTART     0x01 /* raid6 different from raid5 here!!! */
+#define        DDF_RAID5_N_RESTART     0x02 /* same as 'left asymmetric' - layout 0 */
+#define        DDF_RAID5_N_CONTINUE    0x03 /* same as 'left symmetric' - layout 2 */
+
+#define        DDF_RAID1E_ADJACENT     0x00 /* raid10 nearcopies==2 */
+#define        DDF_RAID1E_OFFSET       0x01 /* raid10 offsetcopies==2 */
+
+/* Secondary RAID Level (SRL) */
+#define        DDF_2STRIPED    0x00    /* This is weirder than RAID0 !! */
+#define        DDF_2MIRRORED   0x01
+#define        DDF_2CONCAT     0x02
+#define        DDF_2SPANNED    0x03    /* This is also weird - be careful */
+
+/* Magic numbers */
+#define        DDF_HEADER_MAGIC        __cpu_to_be32(0xDE11DE11)
+#define        DDF_CONTROLLER_MAGIC    __cpu_to_be32(0xAD111111)
+#define        DDF_PHYS_RECORDS_MAGIC  __cpu_to_be32(0x22222222)
+#define        DDF_PHYS_DATA_MAGIC     __cpu_to_be32(0x33333333)
+#define        DDF_VIRT_RECORDS_MAGIC  __cpu_to_be32(0xDDDDDDDD)
+#define        DDF_VD_CONF_MAGIC       __cpu_to_be32(0xEEEEEEEE)
+#define        DDF_SPARE_ASSIGN_MAGIC  __cpu_to_be32(0x55555555)
+#define        DDF_VU_CONF_MAGIC       __cpu_to_be32(0x88888888)
+#define        DDF_VENDOR_LOG_MAGIC    __cpu_to_be32(0x01dBEEF0)
+#define        DDF_BBM_LOG_MAGIC       __cpu_to_be32(0xABADB10C)
+
+#define        DDF_GUID_LEN    24
+#define DDF_REVISION_0 "01.00.00"
+#define DDF_REVISION_2 "01.02.00"
+
+struct ddf_header {    /* One 512-byte header block.  The anchor, primary
+                        * and secondary headers kept in struct ddf_super
+                        * all use this layout.  Multibyte fields are
+                        * big-endian on disk. */
+       __u32   magic;          /* DDF_HEADER_MAGIC */
+       __u32   crc;            /* verified with calc_crc() over the 512 bytes */
+       char    guid[DDF_GUID_LEN];
+       char    revision[8];    /* 01.02.00 */
+       __u32   seq;            /* starts at '1'; compared between primary and
+                                * secondary when choosing the active header */
+       __u32   timestamp;
+       __u8    openflag;
+       __u8    foreignflag;
+       __u8    enforcegroups;
+       __u8    pad0;           /* 0xff */
+       __u8    pad1[12];       /* 12 * 0xff */
+       /* 64 bytes so far */
+       __u8    header_ext[32]; /* reserved: fill with 0xff */
+       __u64   primary_lba;    /* sector address; section offsets below are
+                                * added to this (or secondary_lba) when read */
+       __u64   secondary_lba;
+       __u8    type;           /* one of the DDF_HEADER_* values */
+       __u8    pad2[3];        /* 0xff */
+       __u32   workspace_len;  /* sectors for vendor space -
+                                * at least 32768(sectors) */
+       __u64   workspace_lba;
+       __u16   max_pd_entries; /* one of 15, 63, 255, 1023, 4095 */
+       __u16   max_vd_entries; /* 2^(4,6,8,10,12)-1 : i.e. as above */
+       __u16   max_partitions; /* i.e. max num of configuration
+                                  record entries per disk */
+       __u16   config_record_len; /* 1 +ROUNDUP(max_primary_element_entries
+                                                *12/512) */
+       __u16   max_primary_element_entries; /* 16, 64, 256, 1024, or 4096 */
+       __u8    pad3[54];       /* 0xff */
+       /* 192 bytes so far */
+       /* Each offset/length pair below is in 512-byte sectors. */
+       __u32   controller_section_offset;
+       __u32   controller_section_length;
+       __u32   phys_section_offset;
+       __u32   phys_section_length;
+       __u32   virt_section_offset;
+       __u32   virt_section_length;
+       __u32   config_section_offset;
+       __u32   config_section_length;
+       __u32   data_section_offset;
+       __u32   data_section_length;
+       __u32   bbm_section_offset;
+       __u32   bbm_section_length;
+       __u32   diag_space_offset;
+       __u32   diag_space_length;
+       __u32   vendor_offset;
+       __u32   vendor_length;
+       /* 256 bytes so far */
+       __u8    pad4[256];      /* 0xff */
+};
+
+/* type field */
+#define        DDF_HEADER_ANCHOR       0x00
+#define        DDF_HEADER_PRIMARY      0x01
+#define        DDF_HEADER_SECONDARY    0x02
+
+/* The content of the 'controller section' - global scope */
+struct ddf_controller_data {   /* 512 bytes in total; held in
+                                * ddf_super.controller */
+       __u32   magic;                  /* DDF_CONTROLLER_MAGIC */
+       __u32   crc;
+       char    guid[DDF_GUID_LEN];
+       struct controller_type {        /* four 16-bit identifiers */
+               __u16 vendor_id;
+               __u16 device_id;
+               __u16 sub_vendor_id;
+               __u16 sub_device_id;
+       } type;
+       char    product_id[16];
+       __u8    pad[8]; /* 0xff */
+       __u8    vendor_data[448];
+};
+
+/* The content of phys_section - global scope */
+struct phys_disk {     /* 64-byte section header followed by a table of
+                        * 64-byte entries, one per physical disk */
+       __u32   magic;          /* DDF_PHYS_RECORDS_MAGIC */
+       __u32   crc;
+       __u16   used_pdes;
+       __u16   max_pdes;
+       __u8    pad[52];
+       struct phys_disk_entry {
+               char    guid[DDF_GUID_LEN];     /* matched against
+                                                * disk_data.guid to find a
+                                                * device's slot */
+               __u32   refnum;
+               __u16   type;   /* bitmap: DDF_Forced_PD_GUID ... DDF_Interface_* */
+               __u16   state;  /* bitmap: DDF_Online ... DDF_Missing */
+               __u64   config_size; /* DDF structures must be after here */
+               char    path[18];       /* another horrible structure really */
+               __u8    pad[6];
+       } entries[0];           /* sized by the section length read from disk */
+};
+
+/* phys_disk_entry.type is a bitmap - bigendian remember */
+#define        DDF_Forced_PD_GUID              1
+#define        DDF_Active_in_VD                2
+#define        DDF_Global_Spare                4 /* VD_CONF records are ignored */
+#define        DDF_Spare                       8 /* overrides Global_spare */
+#define        DDF_Foreign                     16
+#define        DDF_Legacy                      32 /* no DDF on this device */
+
+#define        DDF_Interface_mask              0xf00
+#define        DDF_Interface_SCSI              0x100
+#define        DDF_Interface_SAS               0x200
+#define        DDF_Interface_SATA              0x300
+#define        DDF_Interface_FC                0x400
+
+/* phys_disk_entry.state is a bigendian bitmap */
+#define        DDF_Online                      1
+#define        DDF_Failed                      2 /* overrides  1,4,8 */
+#define        DDF_Rebuilding                  4
+#define        DDF_Transition                  8
+#define        DDF_SMART                       16
+#define        DDF_ReadErrors                  32
+#define        DDF_Missing                     64
+
+/* The content of the virt_section global scope */
+struct virtual_disk {  /* 64-byte section header followed by a table of
+                        * 64-byte entries, one per virtual disk */
+       __u32   magic;          /* DDF_VIRT_RECORDS_MAGIC */
+       __u32   crc;
+       __u16   populated_vdes;
+       __u16   max_vdes;
+       __u8    pad[52];
+       struct virtual_entry {
+               char    guid[DDF_GUID_LEN];     /* matched against
+                                                * vd_config.guid */
+               __u16   unit;
+               __u16   pad0;   /* 0xffff */
+               __u16   guid_crc;
+               __u16   type;           /* bitmap: DDF_Shared ... DDF_Owner_Valid */
+               __u8    state;          /* DDF_state_* values and flags */
+               __u8    init_state;     /* DDF_init_* and DDF_access_* bits */
+               __u8    pad1[14];
+               char    name[16];
+       } entries[0];           /* sized by the section length read from disk */
+};
+
+/* virtual_entry.type is a bitmap - bigendian */
+#define        DDF_Shared              1
+#define        DDF_Enforce_Groups      2
+#define        DDF_Unicode             4
+#define        DDF_Owner_Valid         8
+
+/* virtual_entry.state is a bigendian bitmap */
+#define        DDF_state_mask          0x7
+#define        DDF_state_optimal       0x0
+#define        DDF_state_degraded      0x1
+#define        DDF_state_deleted       0x2
+#define        DDF_state_missing       0x3
+#define        DDF_state_failed        0x4
+#define        DDF_state_part_optimal  0x5
+
+#define        DDF_state_morphing      0x8
+#define        DDF_state_inconsistent  0x10
+
+/* virtual_entry.init_state is a bigendian bitmap */
+#define        DDF_initstate_mask      0x03
+#define        DDF_init_not            0x00
+#define        DDF_init_quick          0x01 /* initialisation in progress,
+                                     * i.e. 'state_inconsistent' */
+#define        DDF_init_full           0x02
+
+#define        DDF_access_mask         0xc0
+#define        DDF_access_rw           0x00
+#define        DDF_access_ro           0x80
+#define        DDF_access_blocked      0xc0
+
+/* The content of the config_section - local scope
+ * It has multiple records each config_record_len sectors
+ * They can be vd_config or spare_assign
+ */
+
+struct vd_config {     /* One virtual-disk configuration record.  The fixed
+                        * fields below occupy 512 bytes; the variable
+                        * tables follow. */
+       __u32   magic;          /* DDF_VD_CONF_MAGIC */
+       __u32   crc;
+       char    guid[DDF_GUID_LEN];
+       __u32   timestamp;
+       __u32   seqnum;         /* the copy with the higher seqnum replaces an
+                                * already loaded one */
+       __u8    pad0[24];
+       __u16   prim_elmnt_count;
+       __u8    chunk_shift;    /* 0 == 512, 1==1024 etc */
+       __u8    prl;            /* Primary Raid Level: DDF_RAID* etc */
+       __u8    rlq;            /* Raid Level Qualifier */
+       __u8    sec_elmnt_count;
+       __u8    sec_elmnt_seq;
+       __u8    srl;            /* Secondary RAID Level: DDF_2* */
+       __u64   blocks;         /* blocks per component could be different
+                                * on different component devices...(only
+                                * for concat I hope) */
+       __u64   array_blocks;   /* blocks in array */
+       __u8    pad1[8];
+       __u32   spare_refs[8];
+       __u8    cache_pol[8];   /* [7] is a bitmap of DDF_cache_* */
+       __u8    bg_rate;
+       __u8    pad2[3];
+       __u8    pad3[52];
+       __u8    pad4[192];
+       __u8    v0[32]; /* reserved- 0xff */
+       __u8    v1[32]; /* reserved- 0xff */
+       __u8    v2[16]; /* reserved- 0xff */
+       __u8    v3[16]; /* reserved- 0xff */
+       __u8    vendor[32];
+       __u32   phys_refnum[0]; /* refnum of each disk in sequence */
+      /*__u64  lba_offset[0];  LBA offset in each phys.  Note extents in a
+                               bvd are always the same size.
+                               This table is located at
+                               &phys_refnum[max_primary_element_entries]. */
+};
+
+/* vd_config.cache_pol[7] is a bitmap */
+#define        DDF_cache_writeback     1       /* else writethrough */
+#define        DDF_cache_wadaptive     2       /* only applies if writeback */
+#define        DDF_cache_readahead     4
+#define        DDF_cache_radaptive     8       /* only if doing read-ahead */
+#define        DDF_cache_ifnobatt      16      /* even to write cache if battery is poor */
+#define        DDF_cache_wallowed      32      /* enable write caching */
+#define        DDF_cache_rallowed      64      /* enable read caching */
+
+struct spare_assign {  /* Spare assignment record: a 32-byte header followed
+                        * by 32-byte entries.  It shares the config section
+                        * with vd_config records and is told apart by its
+                        * magic. */
+       __u32   magic;          /* DDF_SPARE_ASSIGN_MAGIC */
+       __u32   crc;
+       __u32   timestamp;
+       __u8    reserved[7];
+       __u8    type;           /* bitmap of DDF_spare_* */
+       __u16   populated;      /* SAEs used */
+       __u16   max;            /* max SAEs */
+       __u8    pad[8];
+       struct spare_assign_entry {
+               char    guid[DDF_GUID_LEN];
+               __u16   secondary_element;
+               __u8    pad[6];
+       } spare_ents[0];
+};
+/* spare_assign.type is a bitmap */
+#define        DDF_spare_dedicated     0x1     /* else global */
+#define        DDF_spare_revertible    0x2     /* else committable */
+#define        DDF_spare_active        0x4     /* else not active */
+#define        DDF_spare_affinity      0x8     /* enclosure affinity */
+
+/* The data_section contents - local scope */
+struct disk_data {     /* 512 bytes in total; one per device, loaded into
+                        * the 'disk' member of struct dl */
+       __u32   magic;          /* DDF_PHYS_DATA_MAGIC */
+       __u32   crc;
+       char    guid[DDF_GUID_LEN];     /* looked up in the phys_disk table
+                                        * to find this device's entry */
+       __u32   refnum;         /* crc of some magic drive data ... */
+       __u8    forced_ref;     /* set when above was not result of magic */
+       __u8    forced_guid;    /* set if guid was forced rather than magic */
+       __u8    vendor[32];
+       __u8    pad[442];
+};
+
+/* bbm_section content */
+struct bad_block_log { /* Bad block management log: header followed by a
+                        * table of remapped ranges. */
+       __u32   magic;
+       __u32   crc;
+       __u16   entry_count;
+       __u32   spare_count;    /* NOTE(review): a __u32 directly after a
+                                * __u16 - unless this struct is packed the
+                                * compiler will probably insert 2 bytes of
+                                * padding before it; confirm the layout
+                                * against the spec before relying on it */
+       __u8    pad[10];
+       __u64   first_spare;
+       struct mapped_block {
+               __u64   defective_start;
+               __u32   replacement_start;
+               __u16   remap_count;
+               __u8    pad[2];
+       } entries[0];
+};
+
+/* Struct for internally holding ddf structures */
+/* The DDF structure stored on each device is potentially
+ * quite different, as some data is global and some is local.
+ * The global data is:
+ *   - ddf header
+ *   - controller_data
+ *   - Physical disk records
+ *   - Virtual disk records
+ * The local data is:
+ *   - Configuration records
+ *   - Physical Disk data section
+ *  (  and Bad block and vendor which I don't care about yet).
+ *
+ * The local data is parsed into separate lists as it is read
+ * and reconstructed for writing.  This means that we only need
+ * to make config changes once and they are automatically
+ * propagated to all devices.
+ * Note that the ddf_super has space for the conf and disk data
+ * for this disk and also for a list of all such data.
+ * The list is only used for the superblock that is being
+ * built in Create or Assemble to describe the whole array.
+ */
+struct ddf_super {     /* In-memory image of the DDF metadata: the three
+                        * headers, the global sections and lists of the
+                        * per-device (local) data. */
+       struct ddf_header anchor, primary, secondary;
+       struct ddf_controller_data controller;
+       struct ddf_header *active;      /* points at 'primary' or 'secondary' */
+       struct phys_disk        *phys;  /* allocated copy of the phys section */
+       struct virtual_disk     *virt;  /* allocated copy of the virt section */
+       int pdsize, vdsize;             /* byte sizes of *phys and *virt */
+       int max_part, mppe, conf_rec_len; /* cpu-endian copies of the header's
+                                          * max_partitions,
+                                          * max_primary_element_entries and
+                                          * config_record_len */
+       int currentdev;
+       int updates_pending;
+       struct vcl {            /* one per distinct virtual-disk config */
+               union {
+                       char space[512]; /* pads the bookkeeping fields to 512
+                                         * bytes - presumably so 'conf' stays
+                                         * 512-byte aligned; confirm */
+                       struct {
+                               struct vcl      *next;
+                               __u64           *lba_offset; /* location in 'conf' of
+                                                             * the lba table */
+                               int     vcnum; /* index into ->virt */
+                               __u64           *block_sizes; /* NULL if all the same */
+                       };
+               };
+               struct vd_config conf;
+       } *conflist, *currentconf;
+       struct dl {             /* one per member device */
+               union {
+                       char space[512]; /* same padding scheme as in vcl */
+                       struct {
+                               struct dl       *next;
+                               int major, minor;
+                               char *devname;
+                               int fd;         /* -1 when not kept open */
+                               unsigned long long size; /* sectors */
+                               int pdnum;      /* index in ->phys */
+                               struct spare_assign *spare;
+                               void *mdupdate; /* hold metadata update */
+
+                               /* These fields used by auto-layout */
+                               int raiddisk; /* slot to fill in autolayout */
+                               __u64 esize;
+                       };
+               };
+               struct disk_data disk;
+               struct vcl *vlist[0]; /* max_part in size */
+       } *dlist, *add_list;
+};
+
+#ifndef offsetof
+#define offsetof(t,f) ((size_t)&(((t*)0)->f))
+#endif
+
+
+/* Compute the checksum of the 'len'-byte DDF structure at 'buf'.
+ *
+ * Every checksummed structure keeps its crc field where struct ddf_header
+ * does, so the buffer is viewed through that type.  The stored crc is
+ * replaced by 0xffffffff while the sum is taken and put back afterwards.
+ * The result is already converted to big-endian so that it can be compared
+ * directly with the on-disk field.
+ */
+static int calc_crc(void *buf, int len)
+{
+       struct ddf_header *hdr = buf;
+       __u32 saved = hdr->crc;
+       __u32 sum;
+
+       hdr->crc = 0xffffffff;
+       sum = crc32(0, buf, len);
+       hdr->crc = saved;
+
+       return __cpu_to_be32(sum);
+}
+
+/* Read the primary or secondary DDF header found at sector 'lba' of 'fd'
+ * into 'hdr' and validate it against 'anchor'.
+ *
+ * 'size' is the device size in sectors and 'type' is the header type
+ * expected in hdr->type.  The header is accepted only if its magic and crc
+ * are good, its guid, revision and both header LBAs match the anchor, and
+ * everything from 'pad2' to the end of the 512-byte block is identical to
+ * the anchor.
+ *
+ * Returns 1 if the header is usable, 0 otherwise.
+ */
+static int load_ddf_header(int fd, unsigned long long lba,
+                          unsigned long long size,
+                          int type,
+                          struct ddf_header *hdr, struct ddf_header *anchor)
+{
+       size_t tail_len = 512 - offsetof(struct ddf_header, pad2);
+
+       /* The header must lie before the last sector of the device */
+       if (lba >= size-1)
+               return 0;
+
+       if (lseek64(fd, lba<<9, 0) < 0 ||
+           read(fd, hdr, 512) != 512)
+               return 0;
+
+       if (hdr->magic != DDF_HEADER_MAGIC ||
+           calc_crc(hdr, 512) != hdr->crc)
+               return 0;
+
+       if (memcmp(anchor->guid, hdr->guid, DDF_GUID_LEN) != 0)
+               return 0;
+       if (memcmp(anchor->revision, hdr->revision, 8) != 0)
+               return 0;
+       if (anchor->primary_lba != hdr->primary_lba)
+               return 0;
+       if (anchor->secondary_lba != hdr->secondary_lba)
+               return 0;
+       if (hdr->type != type)
+               return 0;
+
+       /* Whatever follows the type byte has to agree with the anchor */
+       return memcmp(anchor->pad2, hdr->pad2, tail_len) == 0;
+}
+
+/* Read one metadata section from 'fd'.
+ *
+ * 'offset_be' and 'len_be' are the big-endian offset and length (in
+ * sectors) taken from the active header; the offset is relative to the
+ * LBA of whichever header (primary or secondary) is active.
+ *
+ * If 'buf' is non-NULL the section is read into it and must be exactly one
+ * sector long.  Otherwise a 512-byte aligned buffer is allocated, which the
+ * caller must free().  When 'check' is set the length must be one of
+ * 2, 8, 32, 128 or 512 sectors; in every case lengths above 1024 sectors
+ * are refused.
+ *
+ * Returns the buffer holding the section, or NULL on any failure (a buffer
+ * allocated here is released first).
+ */
+static void *load_section(int fd, struct ddf_super *super, void *buf,
+                         __u32 offset_be, __u32 len_be, int check)
+{
+       unsigned long long start = __be32_to_cpu(offset_be);
+       unsigned long long sectors = __be32_to_cpu(len_be);
+       int allocated = (buf == NULL);
+
+       if (check) {
+               switch (sectors) {
+               case 2:
+               case 8:
+               case 32:
+               case 128:
+               case 512:
+                       break;
+               default:
+                       return NULL;
+               }
+       }
+
+       if (sectors > 1024)
+               return NULL;
+
+       if (!allocated) {
+               /* All pre-allocated sections are a single block */
+               if (sectors != 1)
+                       return NULL;
+       } else {
+               if (posix_memalign(&buf, 512, sectors<<9) != 0)
+                       buf = NULL;
+               if (buf == NULL)
+                       return NULL;
+       }
+
+       /* Section offsets are relative to the header they came from */
+       start += (super->active->type == 1)
+               ? __be64_to_cpu(super->active->primary_lba)
+               : __be64_to_cpu(super->active->secondary_lba);
+
+       if (lseek64(fd, start<<9, 0) == (start<<9) &&
+           read(fd, buf, sectors<<9) == (sectors<<9))
+               return buf;
+
+       if (allocated)
+               free(buf);
+       return NULL;
+}
+
+/* Load and validate the DDF anchor, primary and (if usable) secondary
+ * headers from 'fd' into 'super', and point super->active at the header
+ * to be trusted.
+ *
+ * The anchor is read from the last 512 bytes of the device.  The primary
+ * header must load; the secondary is optional and is preferred only when
+ * it is closed and has a higher sequence number, or has the same sequence
+ * number while the primary is still marked open.
+ *
+ * Messages are printed only when 'devname' is non-NULL.
+ *
+ * Returns 0 on success, 1 if the device could not be sized, seeked or
+ * read, and 2 if no acceptable DDF metadata was found.
+ */
+static int load_ddf_headers(int fd, struct ddf_super *super, char *devname)
+{
+       unsigned long long dsize;
+
+       /* get_dev_size() returns 0 on failure and then leaves dsize unset;
+        * the result used to be ignored, so a failure led to seeking to an
+        * indeterminate offset.
+        */
+       if (get_dev_size(fd, NULL, &dsize) == 0) {
+               if (devname)
+                       fprintf(stderr,
+                               Name ": Cannot determine size of %s\n",
+                               devname);
+               return 1;
+       }
+
+       if (lseek64(fd, dsize-512, 0) < 0) {
+               if (devname)
+                       fprintf(stderr,
+                               Name": Cannot seek to anchor block on %s: %s\n",
+                               devname, strerror(errno));
+               return 1;
+       }
+       if (read(fd, &super->anchor, 512) != 512) {
+               if (devname)
+                       fprintf(stderr,
+                               Name ": Cannot read anchor block on %s: %s\n",
+                               devname, strerror(errno));
+               return 1;
+       }
+       if (super->anchor.magic != DDF_HEADER_MAGIC) {
+               if (devname)
+                       fprintf(stderr, Name ": no DDF anchor found on %s\n",
+                               devname);
+               return 2;
+       }
+       if (calc_crc(&super->anchor, 512) != super->anchor.crc) {
+               if (devname)
+                       fprintf(stderr, Name ": bad CRC on anchor on %s\n",
+                               devname);
+               return 2;
+       }
+       /* Only the two known revision strings are accepted */
+       if (memcmp(super->anchor.revision, DDF_REVISION_0, 8) != 0 &&
+           memcmp(super->anchor.revision, DDF_REVISION_2, 8) != 0) {
+               if (devname)
+                       fprintf(stderr, Name ": can only support super revision"
+                               " %.8s and earlier, not %.8s on %s\n",
+                               DDF_REVISION_2, super->anchor.revision,devname);
+               return 2;
+       }
+       if (load_ddf_header(fd, __be64_to_cpu(super->anchor.primary_lba),
+                           dsize >> 9,  1,
+                           &super->primary, &super->anchor) == 0) {
+               if (devname)
+                       fprintf(stderr,
+                               Name ": Failed to load primary DDF header "
+                               "on %s\n", devname);
+               return 2;
+       }
+       super->active = &super->primary;
+       if (load_ddf_header(fd, __be64_to_cpu(super->anchor.secondary_lba),
+                           dsize >> 9,  2,
+                           &super->secondary, &super->anchor)) {
+               /* Switch to the secondary only if it is the newer closed
+                * copy, or equally new while the primary is still open.
+                */
+               if ((__be32_to_cpu(super->primary.seq)
+                    < __be32_to_cpu(super->secondary.seq) &&
+                    !super->secondary.openflag)
+                   || (__be32_to_cpu(super->primary.seq)
+                       == __be32_to_cpu(super->secondary.seq) &&
+                       super->primary.openflag && !super->secondary.openflag)
+                       )
+                       super->active = &super->secondary;
+       }
+       return 0;
+}
+
+/* Load the global sections named by the active header: the controller
+ * data (into super->controller) and freshly allocated copies of the
+ * physical-disk and virtual-disk record sections.
+ *
+ * On success the config and disk lists are reset to empty, the limits
+ * max_part, mppe and conf_rec_len are cached from the header, and 0 is
+ * returned.  If any section fails to load, the phys and virt buffers are
+ * freed and cleared and 2 is returned.  'devname' is not used.
+ */
+static int load_ddf_global(int fd, struct ddf_super *super, char *devname)
+{
+       struct ddf_header *hdr = super->active;
+       void *ctrl;
+
+       ctrl = load_section(fd, super, &super->controller,
+                           hdr->controller_section_offset,
+                           hdr->controller_section_length,
+                           0);
+
+       super->phys = load_section(fd, super, NULL,
+                                  hdr->phys_section_offset,
+                                  hdr->phys_section_length,
+                                  1);
+       super->pdsize = __be32_to_cpu(hdr->phys_section_length) * 512;
+
+       super->virt = load_section(fd, super, NULL,
+                                  hdr->virt_section_offset,
+                                  hdr->virt_section_length,
+                                  1);
+       super->vdsize = __be32_to_cpu(hdr->virt_section_length) * 512;
+
+       if (ctrl && super->phys && super->virt) {
+               super->conflist = NULL;
+               super->dlist = NULL;
+
+               super->max_part = __be16_to_cpu(hdr->max_partitions);
+               super->mppe = __be16_to_cpu(hdr->max_primary_element_entries);
+               super->conf_rec_len = __be16_to_cpu(hdr->config_record_len);
+               return 0;
+       }
+
+       /* Something is missing: drop whatever did get loaded */
+       free(super->phys);
+       free(super->virt);
+       super->phys = NULL;
+       super->virt = NULL;
+       return 2;
+}
+
+/* Load the per-device ("local") DDF data from 'fd' and merge it into
+ * 'super'.
+ *
+ * A new 'struct dl' describing the device is pushed onto super->dlist,
+ * holding the disk data section, the device numbers, the size in sectors
+ * and the index of the matching entry in super->phys (-1 if none).  The
+ * descriptor is remembered in it only when 'keep' is non-zero.
+ *
+ * The config section is then scanned record by record.  The first spare
+ * assignment record is copied into dl->spare.  Each virtual-disk config
+ * record is merged into super->conflist by guid, keeping the copy with the
+ * higher sequence number, and is also listed in dl->vlist.
+ *
+ * Returns 0 on success and 1 on failure.  After a failure the new 'dl'
+ * may already be linked into super->dlist, so the caller is responsible
+ * for releasing the lists.
+ */
+static int load_ddf_local(int fd, struct ddf_super *super,
+                         char *devname, int keep)
+{
+       struct dl *dl;
+       struct stat stb;
+       char *conf;
+       int i;
+       unsigned int confsec;
+       unsigned int conf_sectors =
+               __be32_to_cpu(super->active->config_section_length);
+       unsigned int rec_len;
+       int vnum;
+       int max_virt_disks = __be16_to_cpu(super->active->max_vd_entries);
+       size_t dl_size = sizeof(*dl) +
+               (super->max_part) * sizeof(dl->vlist[0]);
+       unsigned long long dsize;
+
+       /* First the local disk info */
+       if (posix_memalign((void**)&dl, 512, dl_size) != 0) {
+               fprintf(stderr, Name ": %s could not allocate disk info buffer\n",
+                       __func__);
+               return 1;
+       }
+       /* posix_memalign() does not clear memory; start from zeroes so that
+        * no field (mdupdate, raiddisk, esize, disk, ...) is ever read
+        * uninitialised.
+        */
+       memset(dl, 0, dl_size);
+
+       /* Best effort, as before: if the data section cannot be read the
+        * disk data simply stays zeroed.
+        */
+       load_section(fd, super, &dl->disk,
+                    super->active->data_section_offset,
+                    super->active->data_section_length,
+                    0);
+       dl->devname = devname ? strdup(devname) : NULL;
+
+       /* major/minor stay 0 if the descriptor cannot be stat'ed */
+       if (fstat(fd, &stb) == 0) {
+               dl->major = major(stb.st_rdev);
+               dl->minor = minor(stb.st_rdev);
+       }
+       dl->next = super->dlist;
+       dl->fd = keep ? fd : -1;
+
+       dl->size = 0;
+       if (get_dev_size(fd, devname, &dsize))
+               dl->size = dsize >> 9;
+       dl->spare = NULL;
+       for (i=0 ; i < super->max_part ; i++)
+               dl->vlist[i] = NULL;
+       super->dlist = dl;
+       dl->pdnum = -1;
+       for (i=0; i < __be16_to_cpu(super->active->max_pd_entries); i++)
+               if (memcmp(super->phys->entries[i].guid,
+                          dl->disk.guid, DDF_GUID_LEN) == 0)
+                       dl->pdnum = i;
+
+       /* Now the config list. */
+       /* 'conf' is an array of config entries, some of which are
+        * probably invalid.  Those which are good need to be copied into
+        * the conflist
+        */
+
+       conf = load_section(fd, super, NULL,
+                           super->active->config_section_offset,
+                           super->active->config_section_length,
+                           0);
+
+       if (conf_sectors == 0) {
+               /* Empty config section: nothing to parse */
+               free(conf);
+               return 0;
+       }
+       if (conf == NULL || super->conf_rec_len <= 0) {
+               /* An unreadable section used to be dereferenced as NULL,
+                * and a zero record length made the loop below spin
+                * forever.
+                */
+               free(conf);
+               return 1;
+       }
+       rec_len = super->conf_rec_len;
+
+       vnum = 0;
+       /* Only whole records are examined; a truncated trailing record
+        * would otherwise be copied from beyond the end of 'conf'.
+        */
+       for (confsec = 0;
+            confsec + rec_len <= conf_sectors;
+            confsec += rec_len) {
+               struct vd_config *vd =
+                       (struct vd_config *)((char*)conf + confsec*512);
+               struct vcl *vcl;
+
+               if (vd->magic == DDF_SPARE_ASSIGN_MAGIC) {
+                       /* only the first spare record is kept */
+                       if (dl->spare)
+                               continue;
+                       if (posix_memalign((void**)&dl->spare, 512,
+                                      super->conf_rec_len*512) != 0) {
+                               fprintf(stderr, Name
+                                       ": %s could not allocate spare info buf\n",
+                                       __func__);
+                               free(conf);
+                               return 1;
+                       }
+
+                       memcpy(dl->spare, vd, super->conf_rec_len*512);
+                       continue;
+               }
+               if (vd->magic != DDF_VD_CONF_MAGIC)
+                       continue;
+               for (vcl = super->conflist; vcl; vcl = vcl->next) {
+                       if (memcmp(vcl->conf.guid,
+                                  vd->guid, DDF_GUID_LEN) == 0)
+                               break;
+               }
+
+               if (vcl) {
+                       /* vlist has only max_part slots */
+                       if (vnum < super->max_part)
+                               dl->vlist[vnum++] = vcl;
+                       /* keep the copy already loaded unless this one
+                        * is newer */
+                       if (__be32_to_cpu(vd->seqnum) <=
+                           __be32_to_cpu(vcl->conf.seqnum))
+                               continue;
+               } else {
+                       if (posix_memalign((void**)&vcl, 512,
+                                      (super->conf_rec_len*512 +
+                                       offsetof(struct vcl, conf))) != 0) {
+                               fprintf(stderr, Name
+                                       ": %s could not allocate vcl buf\n",
+                                       __func__);
+                               free(conf);
+                               return 1;
+                       }
+                       vcl->next = super->conflist;
+                       vcl->block_sizes = NULL; /* FIXME not for CONCAT */
+                       super->conflist = vcl;
+                       if (vnum < super->max_part)
+                               dl->vlist[vnum++] = vcl;
+               }
+               memcpy(&vcl->conf, vd, super->conf_rec_len*512);
+               /* the lba table follows the mppe phys_refnum slots */
+               vcl->lba_offset = (__u64*)
+                       &vcl->conf.phys_refnum[super->mppe];
+
+               /* NOTE(review): vcnum is left untouched when no virtual
+                * disk entry has this guid - confirm that users cope.
+                */
+               for (i=0; i < max_virt_disks ; i++)
+                       if (memcmp(super->virt->entries[i].guid,
+                                  vcl->conf.guid, DDF_GUID_LEN)==0)
+                               break;
+               if (i < max_virt_disks)
+                       vcl->vcnum = i;
+       }
+       free(conf);
+
+       return 0;
+}
+
+#ifndef MDASSEMBLE
+static int load_super_ddf_all(struct supertype *st, int fd,
+                             void **sbp, char *devname, int keep_fd);
+#endif
+/* Free a partially loaded 'super' together with everything the loaders
+ * have attached to it: the phys and virt sections, the config list and
+ * the device list.  The descriptors in the device list are not closed;
+ * the only caller loads with keep == 0, so none are stored there.
+ */
+static void discard_partial_ddf(struct ddf_super *super)
+{
+       free(super->phys);
+       free(super->virt);
+       while (super->conflist) {
+               struct vcl *v = super->conflist;
+               super->conflist = v->next;
+               free(v->block_sizes);
+               free(v);
+       }
+       while (super->dlist) {
+               struct dl *d = super->dlist;
+               super->dlist = d->next;
+               free(d->devname);
+               free(d->spare);
+               free(d);
+       }
+       free(super);
+}
+
+/* Load the DDF metadata found on 'fd' into st->sb.
+ *
+ * Unless built for mdassemble, 'fd' is first tried as a container.
+ * Otherwise the device must be larger than 32M and a whole number of
+ * sectors.  The headers, the global sections and the local sections are
+ * then read in turn.  Messages are printed only when 'devname' is
+ * non-NULL, apart from the allocation failure message.
+ *
+ * Returns 0 on success; otherwise a non-zero code from the size checks
+ * or from the loader that failed.
+ */
+static int load_super_ddf(struct supertype *st, int fd,
+                         char *devname)
+{
+       unsigned long long dsize;
+       struct ddf_super *super;
+       int rv;
+
+#ifndef MDASSEMBLE
+       /* if 'fd' is a container, load metadata from all the devices */
+       if (load_super_ddf_all(st, fd, &st->sb, devname, 1) == 0)
+               return 0;
+#endif
+       if (st->subarray[0])
+               return 1; /* FIXME Is this correct */
+
+       if (get_dev_size(fd, devname, &dsize) == 0)
+               return 1;
+
+       /* 32M is a lower bound */
+       if (dsize <= 32*1024*1024) {
+               if (devname)
+                       fprintf(stderr,
+                               Name ": %s is too small for ddf: "
+                               "size is %llu sectors.\n",
+                               devname, dsize>>9);
+               return 1;
+       }
+       if (dsize & 511) {
+               if (devname)
+                       fprintf(stderr,
+                               Name ": %s is an odd size for ddf: "
+                               "size is %llu bytes.\n",
+                               devname, dsize);
+               return 1;
+       }
+
+       if (posix_memalign((void**)&super, 512, sizeof(*super))!= 0) {
+               fprintf(stderr, Name ": malloc of %zu failed.\n",
+                       sizeof(*super));
+               return 1;
+       }
+       memset(super, 0, sizeof(*super));
+
+       rv = load_ddf_headers(fd, super, devname);
+       if (rv) {
+               free(super);
+               return rv;
+       }
+
+       /* Have valid headers and have chosen the best. Let's read in the rest*/
+
+       rv = load_ddf_global(fd, super, devname);
+
+       if (rv) {
+               /* load_ddf_global() releases its own buffers on failure */
+               if (devname)
+                       fprintf(stderr,
+                               Name ": Failed to load all information "
+                               "sections on %s\n", devname);
+               free(super);
+               return rv;
+       }
+
+       rv = load_ddf_local(fd, super, devname, 0);
+
+       if (rv) {
+               if (devname)
+                       fprintf(stderr,
+                               Name ": Failed to load all information "
+                               "sections on %s\n", devname);
+               /* The global sections and possibly some list entries are
+                * allocated by now; a bare free(super) leaked them all.
+                */
+               discard_partial_ddf(super);
+               return rv;
+       }
+
+       /* Should possibly check the sections .... */
+
+       st->sb = super;
+       if (st->ss == NULL) {
+               st->ss = &super_ddf;
+               st->minor_version = 0;
+               st->max_devs = 512;
+       }
+       st->loaded_container = 0;
+       return 0;
+
+}
+
+/* Release everything hanging off st->sb: the physical and virtual
+ * disk tables, every entry on the config-record list and on the
+ * device list (closing any device fd that is >= 0), and finally the
+ * superblock itself.  st->sb is cleared afterwards.
+ * Does nothing when no superblock is attached.
+ */
+static void free_super_ddf(struct supertype *st)
+{
+       struct ddf_super *sb = st->sb;
+       struct vcl *conf, *next_conf;
+       struct dl *disk, *next_disk;
+
+       if (!sb)
+               return;
+
+       free(sb->phys);
+       free(sb->virt);
+
+       for (conf = sb->conflist; conf; conf = next_conf) {
+               next_conf = conf->next;
+               /* free(NULL) is a no-op, so no guard is required */
+               free(conf->block_sizes);
+               free(conf);
+       }
+
+       for (disk = sb->dlist; disk; disk = next_disk) {
+               next_disk = disk->next;
+               if (disk->fd >= 0)
+                       close(disk->fd);
+               free(disk->spare);
+               free(disk);
+       }
+
+       free(sb);
+       st->sb = NULL;
+}
+
+/* Return a freshly allocated supertype bound to super_ddf when 'arg'
+ * names this format ("ddf" or "default").
+ * Returns NULL when 'arg' names anything else, or when the
+ * allocation fails.  The caller owns the returned structure.
+ */
+static struct supertype *match_metadata_desc_ddf(char *arg)
+{
+       /* 'ddf' only supports containers */
+       struct supertype *st;
+
+       if (strcmp(arg, "ddf") != 0 &&
+           strcmp(arg, "default") != 0)
+               return NULL;
+
+       /* calloc replaces malloc+memset and lets us detect failure
+        * before the structure is touched.
+        */
+       st = calloc(1, sizeof(*st));
+       if (st == NULL)
+               return NULL;
+
+       st->ss = &super_ddf;
+       st->max_devs = 512;
+       st->minor_version = 0;
+       st->sb = NULL;
+       return st;
+}
+
+
+#ifndef MDASSEMBLE
+
+/* Name tables used with map_num() when printing DDF metadata.
+ * Each table ends with a { NULL, 0 } entry.
+ */
+
+/* Looked up by examine_vds() with (ve->state & 7) */
+static mapping_t ddf_state[] = {
+       { "Optimal", 0},
+       { "Degraded", 1},
+       { "Deleted", 2},
+       { "Missing", 3},
+       { "Failed", 4},
+       { "Partially Optimal", 5},
+       { "-reserved-", 6},
+       { "-reserved-", 7},
+       { NULL, 0}
+};
+
+/* Looked up by examine_vds() with (ve->init_state & 3) */
+static mapping_t ddf_init_state[] = {
+       { "Not Initialised", 0},
+       { "QuickInit in Progress", 1},
+       { "Fully Initialised", 2},
+       { "*UNKNOWN*", 3},
+       { NULL, 0}
+};
+/* Looked up by examine_vds() with ((ve->init_state>>6) & 3) */
+static mapping_t ddf_access[] = {
+       { "Read/Write", 0},
+       { "Reserved", 1},
+       { "Read Only", 2},
+       { "Blocked (no access)", 3},
+       { NULL ,0}
+};
+
+/* Looked up by examine_vd() with vc->prl */
+static mapping_t ddf_level[] = {
+       { "RAID0", DDF_RAID0},
+       { "RAID1", DDF_RAID1},
+       { "RAID3", DDF_RAID3},
+       { "RAID4", DDF_RAID4},
+       { "RAID5", DDF_RAID5},
+       { "RAID1E",DDF_RAID1E},
+       { "JBOD",  DDF_JBOD},
+       { "CONCAT",DDF_CONCAT},
+       { "RAID5E",DDF_RAID5E},
+       { "RAID5EE",DDF_RAID5EE},
+       { "RAID6", DDF_RAID6},
+       { NULL, 0}
+};
+/* Looked up by examine_vd() with vc->srl */
+static mapping_t ddf_sec_level[] = {
+       { "Striped", DDF_2STRIPED},
+       { "Mirrored", DDF_2MIRRORED},
+       { "Concat", DDF_2CONCAT},
+       { "Spanned", DDF_2SPANNED},
+       { NULL, 0}
+};
+#endif
+
+/* A pair of integers for number-to-number translation tables.
+ * map_num1() searches on num1 and returns the paired num2.
+ */
+struct num_mapping {
+       int num1, num2;
+};
+/* DDF primary RAID level (num1) paired with the md level number or
+ * LEVEL_* constant (num2).  The { MAXINT, MAXINT } entry terminates
+ * the table and is also what map_num1() returns for an unlisted level.
+ */
+static struct num_mapping ddf_level_num[] = {
+       { DDF_RAID0, 0 },
+       { DDF_RAID1, 1 },
+       { DDF_RAID3, LEVEL_UNSUPPORTED },
+       { DDF_RAID4, 4 },
+       { DDF_RAID5, 5 },
+       { DDF_RAID1E, LEVEL_UNSUPPORTED },
+       { DDF_JBOD, LEVEL_UNSUPPORTED },
+       { DDF_CONCAT, LEVEL_LINEAR },
+       { DDF_RAID5E, LEVEL_UNSUPPORTED },
+       { DDF_RAID5EE, LEVEL_UNSUPPORTED },
+       { DDF_RAID6, 6},
+       { MAXINT, MAXINT }
+};
+
+/* Translate 'num' through a MAXINT-terminated table: return the num2
+ * paired with the first entry whose num1 equals 'num'.  When nothing
+ * matches, the terminator's num2 is returned.
+ */
+static int map_num1(struct num_mapping *map, int num)
+{
+       struct num_mapping *ent = map;
+
+       while (ent->num1 != MAXINT && ent->num1 != num)
+               ent++;
+       return ent->num2;
+}
+
+/* Return 1 when every one of the DDF_GUID_LEN bytes at 'guid' is
+ * 0xff, otherwise 0.
+ */
+static int all_ff(char *guid)
+{
+       char *p = guid;
+       char *end = guid + DDF_GUID_LEN;
+
+       while (p < end) {
+               if (*p != (char)0xff)
+                       return 0;
+               p++;
+       }
+       return 1;
+}
+
+#ifndef MDASSEMBLE
+/* Print a DDF GUID to stdout.
+ *  guid:   DDF_GUID_LEN bytes
+ *  tstamp: when non-zero, bytes 16-19 are decoded as a big-endian
+ *          timestamp (offset by DECADE) and printed as well
+ */
+static void print_guid(char *guid, int tstamp)
+{
+       /* GUIDs are part (or all) ASCII and part binary.
+        * They tend to be space padded.
+        * We print the GUID in HEX, then in parentheses add
+        * any initial ASCII sequence, and a possible
+        * time stamp from bytes 16-19
+        */
+       int l = DDF_GUID_LEN;
+       int i;
+
+       for (i=0 ; i<DDF_GUID_LEN ; i++) {
+               if ((i&3)==0 && i != 0)
+                       printf(":");
+               printf("%02X", guid[i]&255);
+       }
+
+       printf("\n                  (");
+       /* ignore trailing space padding */
+       while (l && guid[l-1] == ' ')
+               l--;
+       /* print only the leading run of printable ASCII */
+       for (i=0 ; i<l ; i++) {
+               if (guid[i] < 0x20 || guid[i] >= 0x7f)
+                       break;
+               fputc(guid[i], stdout);
+       }
+       if (tstamp) {
+               __u32 stamp;
+               time_t then;
+               struct tm *tm;
+               char tbuf[100];
+
+               /* Copy the bytes out rather than dereferencing a cast
+                * pointer: 'guid' is only a char buffer, so nothing
+                * guarantees guid+16 is suitably aligned for a __u32.
+                */
+               memcpy(&stamp, guid+16, sizeof(stamp));
+               then = __be32_to_cpu(stamp) + DECADE;
+               tm = localtime(&then);
+               /* localtime() may fail and strftime() may produce
+                * nothing; print the time only when both succeeded.
+                */
+               if (tm && strftime(tbuf, sizeof(tbuf), " %D %T", tm) != 0)
+                       fputs(tbuf, stdout);
+       }
+       printf(")");
+}
+
+/* Print the details of virtual disk number 'n' from every config
+ * record on sb->conflist whose CRC verifies and whose GUID equals
+ * 'guid'.  Records that fail either check are skipped silently.
+ */
+static void examine_vd(int n, struct ddf_super *sb, char *guid)
+{
+       int rec_sectors = sb->conf_rec_len;
+       struct vcl *ent;
+
+       for (ent = sb->conflist ; ent ; ent = ent->next) {
+               struct vd_config *vc = &ent->conf;
+               int used = __be16_to_cpu(sb->phys->used_pdes);
+               int slot;
+
+               if (calc_crc(vc, rec_sectors*512) != vc->crc ||
+                   memcmp(vc->guid, guid, DDF_GUID_LEN) != 0)
+                       continue;
+
+               /* Ok, we know about this VD, let's give more details */
+               printf(" Raid Devices[%d] : %d (", n,
+                      __be16_to_cpu(vc->prim_elmnt_count));
+               for (slot = 0; slot < __be16_to_cpu(vc->prim_elmnt_count); slot++) {
+                       /* index of this member in the physical disk
+                        * table, or 'used' when it is not listed
+                        */
+                       int pd = 0;
+
+                       while (pd < used &&
+                              vc->phys_refnum[slot] != sb->phys->entries[pd].refnum)
+                               pd++;
+                       if (slot)
+                               printf(" ");
+                       if (pd < used)
+                               printf("%d", pd);
+                       else
+                               printf("--");
+               }
+               printf(")\n");
+
+               if (vc->chunk_shift != 255)
+                       printf("   Chunk Size[%d] : %d sectors\n", n,
+                              1 << vc->chunk_shift);
+               printf("   Raid Level[%d] : %s\n", n,
+                      map_num(ddf_level, vc->prl)?:"-unknown-");
+               if (vc->sec_elmnt_count != 1) {
+                       printf("  Secondary Position[%d] : %d of %d\n", n,
+                              vc->sec_elmnt_seq, vc->sec_elmnt_count);
+                       printf("  Secondary Level[%d] : %s\n", n,
+                              map_num(ddf_sec_level, vc->srl) ?: "-unknown-");
+               }
+               printf("  Device Size[%d] : %llu\n", n,
+                      (unsigned long long)__be64_to_cpu(vc->blocks)/2);
+               printf("   Array Size[%d] : %llu\n", n,
+                      (unsigned long long)__be64_to_cpu(vc->array_blocks)/2);
+       }
+}
+
+/* Print the virtual disk table: a count line, then for each of the
+ * first 'populated_vdes' entries its GUID, unit, state, init state,
+ * access mode and name, followed by the matching config records.
+ */
+static void examine_vds(struct ddf_super *sb)
+{
+       int populated = __be16_to_cpu(sb->virt->populated_vdes);
+       int vd;
+
+       printf("  Virtual Disks : %d\n", populated);
+
+       for (vd = 0; vd < populated; vd++) {
+               struct virtual_entry *ve = sb->virt->entries + vd;
+
+               printf("\n");
+               printf("      VD GUID[%d] : ", vd);
+               print_guid(ve->guid, 1);
+               printf("\n");
+               printf("         unit[%d] : %d\n", vd, __be16_to_cpu(ve->unit));
+               printf("        state[%d] : %s, %s%s\n", vd,
+                      map_num(ddf_state, ve->state & 7),
+                      (ve->state & 8) ? "Morphing, ": "",
+                      (ve->state & 16)? "Not Consistent" : "Consistent");
+               printf("   init state[%d] : %s\n", vd,
+                      map_num(ddf_init_state, ve->init_state&3));
+               printf("       access[%d] : %s\n", vd,
+                      map_num(ddf_access, (ve->init_state>>6) & 3));
+               printf("         Name[%d] : %.16s\n", vd, ve->name);
+               examine_vd(vd, sb, ve->guid);
+       }
+       if (populated != 0)
+               printf("\n");
+}
+
+/* Print the physical disk table: one row per used entry showing its
+ * index, reference number, config_size/2 with a 'K' suffix, the name
+ * of a matching loaded device (blank when none resolves), and the
+ * decoded type and state flag bits.
+ */
+static void examine_pds(struct ddf_super *sb)
+{
+       int used = __be16_to_cpu(sb->phys->used_pdes);
+       int idx;
+
+       printf(" Physical Disks : %d\n", used);
+       printf("      Number    RefNo      Size       Device      Type/State\n");
+
+       for (idx = 0; idx < used; idx++) {
+               struct phys_disk_entry *pd = &sb->phys->entries[idx];
+               int type = __be16_to_cpu(pd->type);
+               int state = __be16_to_cpu(pd->state);
+               struct dl *dl;
+               char *dv = NULL;
+
+               printf("       %3d    %08x  ", idx,
+                      __be32_to_cpu(pd->refnum));
+               printf("%8lluK ",
+                      (unsigned long long)__be64_to_cpu(pd->config_size)>>1);
+
+               /* Use the first loaded device carrying this refnum
+                * for which map_dev() yields a name.
+                */
+               for (dl = sb->dlist; dl ; dl = dl->next) {
+                       if (dl->disk.refnum != pd->refnum)
+                               continue;
+                       dv = map_dev(dl->major, dl->minor, 0);
+                       if (dv)
+                               break;
+               }
+               if (dv)
+                       printf("%-15s", dv);
+               else
+                       printf("%15s","");
+
+               printf(" %s%s%s%s%s",
+                      (type&2) ? "active":"",
+                      (type&4) ? "Global-Spare":"",
+                      (type&8) ? "spare" : "",
+                      (type&16)? ", foreign" : "",
+                      (type&32)? "pass-through" : "");
+               printf("/%s%s%s%s%s%s%s",
+                      (state&1)? "Online": "Offline",
+                      (state&2)? ", Failed": "",
+                      (state&4)? ", Rebuilding": "",
+                      (state&8)? ", in-transition": "",
+                      (state&16)? ", SMART-errors": "",
+                      (state&32)? ", Unrecovered-Read-Errors": "",
+                      (state&64)? ", Missing" : "");
+               printf("\n");
+       }
+}
+
+/* Print the loaded DDF metadata: anchor magic and revision, the
+ * controller and container GUIDs, the active header's sequence
+ * number, whether the secondary header carries the DDF header magic,
+ * and then the virtual and physical disk tables.
+ * 'homehost' is not used.
+ */
+static void examine_super_ddf(struct supertype *st, char *homehost)
+{
+       struct ddf_super *ddf = st->sb;
+       char *redundant;
+
+       if (ddf->secondary.magic == DDF_HEADER_MAGIC)
+               redundant = "yes";
+       else
+               redundant = "no";
+
+       printf("          Magic : %08x\n", __be32_to_cpu(ddf->anchor.magic));
+       printf("        Version : %.8s\n", ddf->anchor.revision);
+
+       printf("Controller GUID : ");
+       print_guid(ddf->controller.guid, 0);
+       printf("\n");
+
+       printf(" Container GUID : ");
+       print_guid(ddf->anchor.guid, 1);
+       printf("\n");
+
+       printf("            Seq : %08x\n", __be32_to_cpu(ddf->active->seq));
+       printf("  Redundant hdr : %s\n", redundant);
+
+       examine_vds(ddf);
+       examine_pds(ddf);
+}
+
+static void getinfo_super_ddf(struct supertype *st, struct mdinfo *info);
+
+static void uuid_from_super_ddf(struct supertype *st, int uuid[4]);
+
+/* Print mdadm.conf style ARRAY lines: one generic line for the DDF
+ * container, then one line per virtual disk entry whose GUID is not
+ * all 0xff, giving the container UUID, member index and member UUID.
+ * 'verbose' is not used.
+ */
+static void brief_examine_super_ddf(struct supertype *st, int verbose)
+{
+       /* We just write a generic DDF ARRAY entry
+        */
+       struct ddf_super *ddf = st->sb;
+       struct vcl *saved_conf = ddf->currentconf;
+       struct mdinfo info;
+       int i;
+       char nbuf[64];
+       getinfo_super_ddf(st, &info);
+       fname_from_uuid(st, &info, nbuf, ':');
+       printf("ARRAY metadata=ddf UUID=%s\n", nbuf + 5);
+
+       for (i=0; i<__be16_to_cpu(ddf->virt->max_vdes); i++) {
+               struct virtual_entry *ve = &ddf->virt->entries[i];
+               struct vcl vcl;
+               char nbuf1[64];
+               if (all_ff(ve->guid))
+                       continue;
+               /* uuid_from_super_ddf() hashes currentconf->conf.guid,
+                * so point currentconf at a scratch record holding
+                * just this entry's GUID.
+                */
+               memcpy(vcl.conf.guid, ve->guid, DDF_GUID_LEN);
+               ddf->currentconf =&vcl;
+               uuid_from_super_ddf(st, info.uuid);
+               fname_from_uuid(st, &info, nbuf1, ':');
+               printf("ARRAY container=%s member=%d UUID=%s\n",
+                      nbuf+5, i, nbuf1+5);
+       }
+       /* 'vcl' lives on this function's stack: put back the value
+        * found on entry so the superblock is not left holding a
+        * pointer to storage that is about to go out of scope.
+        */
+       ddf->currentconf = saved_conf;
+}
+
+/* Print the metadata type, level and UUID as MD_* KEY=value lines.
+ * The UUID text is taken from fname_from_uuid(), skipping its first
+ * five characters.
+ */
+static void export_examine_super_ddf(struct supertype *st)
+{
+       struct mdinfo inf;
+       char uuid_text[64];
+
+       getinfo_super_ddf(st, &inf);
+       fname_from_uuid(st, &inf, uuid_text, ':');
+
+       printf("MD_METADATA=ddf\n"
+              "MD_LEVEL=container\n"
+              "MD_UUID=%s\n", uuid_text+5);
+}
+       
+
+/* Placeholder: currently prints nothing and uses neither argument. */
+static void detail_super_ddf(struct supertype *st, char *homehost)
+{
+       /* FIXME later
+        * Could print DDF GUID
+        * Need to find which array
+        *  If whole, briefly list all arrays
+        *  If one, give name
+        */
+}
+
+/* Print " UUID=<uuid>" (no trailing newline) for whatever
+ * getinfo_super_ddf() currently reports for 'st'.
+ */
+static void brief_detail_super_ddf(struct supertype *st)
+{
+       /* FIXME I really need to know which array we are detailing.
+        * Can that be stored in ddf_super??
+        */
+       struct mdinfo inf;
+       char uuid_text[64];
+
+       getinfo_super_ddf(st, &inf);
+       fname_from_uuid(st, &inf, uuid_text, ':');
+       printf(" UUID=%s", uuid_text + 5);
+}
+#endif
+
+/* Return non-zero when the loaded metadata belongs to 'homehost'.
+ * Returns 0 when it does not, or when 'homehost' is NULL.
+ */
+static int match_home_ddf(struct supertype *st, char *homehost)
+{
+       /* It matches 'this' host if the controller is a
+        * Linux-MD controller with vendor_data matching
+        * the hostname
+        */
+       struct ddf_super *ddf = st->sb;
+       size_t len;
+
+       /* init_super_ddf() treats homehost as optional, so a NULL
+        * here means "no home host" and can never match.
+        */
+       if (homehost == NULL)
+               return 0;
+       len = strlen(homehost);
+
+       return (memcmp(ddf->controller.guid, T10, 8) == 0 &&
+               len < sizeof(ddf->controller.vendor_data) &&
+               memcmp(ddf->controller.vendor_data, homehost,len) == 0 &&
+               ddf->controller.vendor_data[len] == 0);
+}
+
+#ifndef MDASSEMBLE
+/* Return the config record of the first conflist entry whose vcnum
+ * equals 'inst', or NULL when there is none.
+ */
+static struct vd_config *find_vdcr(struct ddf_super *ddf, int inst)
+{
+       struct vcl *ent = ddf->conflist;
+
+       while (ent) {
+               if (ent->vcnum == inst)
+                       return &ent->conf;
+               ent = ent->next;
+       }
+       return NULL;
+}
+#endif
+
+/* Search the first max_pdes entries of the physical disk table for
+ * one whose refnum equals 'phys_refnum'.
+ * Returns its index, or -1 when no entry matches.
+ */
+static int find_phys(struct ddf_super *ddf, __u32 phys_refnum)
+{
+       int limit = __be16_to_cpu(ddf->phys->max_pdes);
+       int idx = 0;
+
+       while (idx < limit) {
+               if (ddf->phys->entries[idx].refnum == phys_refnum)
+                       return idx;
+               idx++;
+       }
+       return -1;
+}
+
+/* Fill uuid[0..3] with the first 16 bytes of the SHA1 of a DDF GUID.
+ * The GUID hashed is that of the current config record when
+ * ddf->currentconf is set, otherwise the anchor (container) GUID.
+ */
+static void uuid_from_super_ddf(struct supertype *st, int uuid[4])
+{
+       /* The uuid returned here is used for:
+        *  uuid to put into bitmap file (Create, Grow)
+        *  uuid for backup header when saving critical section (Grow)
+        *  comparing uuids when re-adding a device into an array
+        *    In these cases the uuid required is that of the data-array,
+        *    not the device-set.
+        *  uuid to recognise same set when adding a missing device back
+        *    to an array.   This is a uuid for the device-set.
+        *
+        * A truncated or hashed uuid is good enough for all of these,
+        * as long as everyone agrees on it.
+        * Only the GUID is fed to the hash here.
+        */
+       struct ddf_super *ddf = st->sb;
+       struct sha1_ctx sha;
+       char digest[20];
+       char *src;
+
+       src = ddf->currentconf ? ddf->currentconf->conf.guid
+                              : ddf->anchor.guid;
+
+       sha1_init_ctx(&sha);
+       sha1_process_bytes(src, DDF_GUID_LEN, &sha);
+       sha1_finish_ctx(&sha, digest);
+
+       /* keep the first 16 of the 20 digest bytes */
+       memcpy(uuid, digest, 4*4);
+}
+
+static void getinfo_super_ddf_bvd(struct supertype *st, struct mdinfo *info);
+
+/* Fill 'info' from the loaded DDF metadata.
+ * When a config record is selected (ddf->currentconf is set) the
+ * work is handed to getinfo_super_ddf_bvd(); otherwise 'info'
+ * describes the container as a whole, with the disk fields taken
+ * from the first entry on ddf->dlist when there is one.
+ */
+static void getinfo_super_ddf(struct supertype *st, struct mdinfo *info)
+{
+       struct ddf_super *ddf = st->sb;
+
+       if (ddf->currentconf) {
+               getinfo_super_ddf_bvd(st, info);
+               return;
+       }
+
+       info->array.raid_disks    = __be16_to_cpu(ddf->phys->used_pdes);
+       info->array.level         = LEVEL_CONTAINER;
+       info->array.layout        = 0;
+       info->array.md_minor      = -1;
+       /* creation time is the stamp kept in bytes 16-19 of the GUID */
+       info->array.ctime         = DECADE + __be32_to_cpu(*(__u32*)
+                                                        (ddf->anchor.guid+16));
+       info->array.utime         = 0;
+       info->array.chunk_size    = 0;
+
+
+       info->disk.major = 0;
+       info->disk.minor = 0;
+       if (ddf->dlist) {
+               int slot = find_phys(ddf, ddf->dlist->disk.refnum);
+
+               info->disk.number = __be32_to_cpu(ddf->dlist->disk.refnum);
+               info->disk.raid_disk = slot;
+
+               if (slot >= 0) {
+                       info->data_offset = __be64_to_cpu(ddf->phys->
+                                                 entries[slot].
+                                                 config_size);
+                       info->component_size = ddf->dlist->size -
+                               info->data_offset;
+               } else {
+                       /* find_phys() returns -1 when the refnum is
+                        * not in the table; never index entries[]
+                        * with that.  Report zero sizes instead.
+                        */
+                       info->data_offset = 0;
+                       info->component_size = 0;
+               }
+       } else {
+               info->disk.number = -1;
+               info->disk.raid_disk = -1;
+       }
+       info->disk.state = (1 << MD_DISK_SYNC);
+
+
+       info->reshape_active = 0;
+       info->name[0] = 0;
+
+       info->array.major_version = -1;
+       info->array.minor_version = -2;
+       strcpy(info->text_version, "ddf");
+       info->safe_mode_delay = 0;
+
+       uuid_from_super_ddf(st, info->uuid);
+
+}
+
+static int rlq_to_layout(int rlq, int prl, int raiddisks);
+
+/* Fill 'info' for the config record selected by ddf->currentconf.
+ * Array geometry comes from the config record, the per-device offset
+ * and size from slot ddf->currentdev, the resync state and name from
+ * the virtual disk table entry, and text_version is built from the
+ * container device name and st->subarray.
+ * getinfo_super_ddf() calls this only when currentconf is non-NULL.
+ */
+static void getinfo_super_ddf_bvd(struct supertype *st, struct mdinfo *info)
+{
+       struct ddf_super *ddf = st->sb;
+       struct vcl *vc = ddf->currentconf;
+       int cd = ddf->currentdev;
+       int j;
+       struct dl *dl;
+
+       /* FIXME this returns BVD info - what if we want SVD ?? */
+
+       info->array.raid_disks    = __be16_to_cpu(vc->conf.prim_elmnt_count);
+       info->array.level         = map_num1(ddf_level_num, vc->conf.prl);
+       info->array.layout        = rlq_to_layout(vc->conf.rlq, vc->conf.prl,
+                                                 info->array.raid_disks);
+       info->array.md_minor      = -1;
+       /* creation time is the stamp kept in bytes 16-19 of the GUID */
+       info->array.ctime         = DECADE +
+               __be32_to_cpu(*(__u32*)(vc->conf.guid+16));
+       info->array.utime         = DECADE + __be32_to_cpu(vc->conf.timestamp);
+       /* NOTE(review): examine_vd() treats chunk_shift == 255 as
+        * "no chunk size"; that value is not special-cased here.
+        */
+       info->array.chunk_size    = 512 << vc->conf.chunk_shift;
+       info->custom_array_size   = 0;
+
+       /* data_offset and component_size are only written when the
+        * current device index is within [0, mppe)
+        */
+       if (cd >= 0 && cd < ddf->mppe) {
+               info->data_offset         = __be64_to_cpu(vc->lba_offset[cd]);
+               if (vc->block_sizes)
+                       info->component_size = vc->block_sizes[cd];
+               else
+                       info->component_size = __be64_to_cpu(vc->conf.blocks);
+       }
+
+       /* NOTE(review): info->disk.raid_disk is not assigned anywhere
+        * in this function before it is compared here - confirm the
+        * caller is expected to have set it.
+        */
+       for (dl = ddf->dlist; dl ; dl = dl->next)
+               if (dl->raiddisk == info->disk.raid_disk)
+                       break;
+       info->disk.major = 0;
+       info->disk.minor = 0;
+       if (dl) {
+               info->disk.major = dl->major;
+               info->disk.minor = dl->minor;
+       }
+//     info->disk.number = __be32_to_cpu(ddf->disk.refnum);
+//     info->disk.raid_disk = find refnum in the table and use index;
+//     info->disk.state = ???;
+
+       info->container_member = ddf->currentconf->vcnum;
+
+       /* resync_start stays 0 unless the virtual disk is both
+        * consistent and fully initialised
+        */
+       info->resync_start = 0;
+       if (!(ddf->virt->entries[info->container_member].state
+             & DDF_state_inconsistent)  &&
+           (ddf->virt->entries[info->container_member].init_state
+            & DDF_initstate_mask)
+           == DDF_init_full)
+               info->resync_start = ~0ULL;
+
+       uuid_from_super_ddf(st, info->uuid);
+
+       /* NOTE(review): container_member was set from vcnum above and
+        * is overwritten here from st->subarray; the name lookup below
+        * uses this second value - confirm the two always agree.
+        */
+       info->container_member = atoi(st->subarray);
+       info->array.major_version = -1;
+       info->array.minor_version = -2;
+       /* NOTE(review): unbounded sprintf into text_version, and the
+        * devnum2devname() result is not kept - if that helper returns
+        * allocated memory it is leaked here; confirm.
+        */
+       sprintf(info->text_version, "/%s/%s",
+               devnum2devname(st->container_dev),
+               st->subarray);
+       info->safe_mode_delay = 200;
+
+       /* copy the 16 byte name, then turn every space into a NUL */
+       memcpy(info->name, ddf->virt->entries[info->container_member].name, 16);
+       info->name[16]=0;
+       for(j=0; j<16; j++)
+               if (info->name[j] == ' ')
+                       info->name[j] = 0;
+}
+
+
+/* Metadata update hook.  As written this is a stub: it compares
+ * 'update' against the known update names but every branch is empty
+ * or commented out, so nothing is modified and 0 is always returned.
+ * The remaining arguments are not used.
+ */
+static int update_super_ddf(struct supertype *st, struct mdinfo *info,
+                           char *update,
+                           char *devname, int verbose,
+                           int uuid_set, char *homehost)
+{
+       /* For 'assemble' and 'force' we need to return non-zero if any
+        * change was made.  For others, the return value is ignored.
+        * Update options are:
+        *  force-one : This device looks a bit old but needs to be included,
+        *        update age info appropriately.
+        *  assemble: clear any 'faulty' flag to allow this device to
+        *              be assembled.
+        *  force-array: Array is degraded but being forced, mark it clean
+        *         if that will be needed to assemble it.
+        *
+        *  newdev:  not used ????
+        *  grow:  Array has gained a new device - this is currently for
+        *              linear only
+        *  resync: mark as dirty so a resync will happen.
+        *  uuid:  Change the uuid of the array to match what is given
+        *  homehost:  update the recorded homehost
+        *  name:  update the name - preserving the homehost
+        *  _reshape_progress: record new reshape_progress position.
+        *
+        * Following are not relevant for this version:
+        *  sparc2.2 : update from old dodgy metadata
+        *  super-minor: change the preferred_minor number
+        *  summaries:  update redundant counters.
+        */
+       int rv = 0;
+//     struct ddf_super *ddf = st->sb;
+//     struct vd_config *vd = find_vdcr(ddf, info->container_member);
+//     struct virtual_entry *ve = find_ve(ddf);
+
+       /* we don't need to handle "force-*" or "assemble" as
+        * there is no need to 'trick' the kernel.  When the metadata is
+        * first updated to activate the array, all the implied modifications
+        * will just happen.
+        */
+
+       if (strcmp(update, "grow") == 0) {
+               /* FIXME */
+       }
+       if (strcmp(update, "resync") == 0) {
+//             info->resync_checkpoint = 0;
+       }
+       /* We ignore UUID updates as they make even less sense
+        * with DDF
+        */
+       if (strcmp(update, "homehost") == 0) {
+               /* homehost is stored in controller->vendor_data,
+                * or it is when we are the vendor
+                */
+//             if (info->vendor_is_local)
+//                     strcpy(ddf->controller.vendor_data, homehost);
+       }
+       if (strcmp(update, "name") == 0) {
+               /* name is stored in virtual_entry->name */
+//             memset(ve->name, ' ', 16);
+//             strncpy(ve->name, info->name, 16);
+       }
+       if (strcmp(update, "_reshape_progress") == 0) {
+               /* We don't support reshape yet */
+       }
+
+//     update_all_csum(ddf);
+
+       /* rv is never changed from 0 above */
+       return rv;
+}
+
+/* Return 32 random bits: four bytes read from /dev/urandom, or the
+ * value of random() when the device cannot be opened or the read
+ * does not deliver exactly four bytes.
+ */
+__u32 random32(void)
+{
+       __u32 val;
+       int fd = open("/dev/urandom", O_RDONLY);
+
+       if (fd < 0)
+               return random();
+
+       if (read(fd, &val, 4) != 4)
+               val = random();
+       close(fd);
+       return val;
+}
+
+/* Fill the 24 bytes at 'guid' with a new DDF header or virtual disk
+ * GUID.
+ */
+static void make_header_guid(char *guid)
+{
+       /* 24 bytes of fiction required.
+        *  bytes  0-7 : 'vendor-id'  - "Linux-MD"
+        *  bytes  8-15: controller type - 0xDEADBEEF 0x00000000
+        *  bytes 16-19: creation time, seconds relative to DECADE
+        *  bytes 20-23: random number
+        */
+       __u32 tail[4];
+
+       tail[0] = __cpu_to_be32(0xdeadbeef);
+       tail[1] = __cpu_to_be32(0);
+       tail[2] = __cpu_to_be32(time(0) - DECADE);
+       tail[3] = random32();
+
+       /* T10 first, so that anything it writes past byte 7 is
+        * overwritten by the words that follow
+        */
+       memcpy(guid, T10, sizeof(T10));
+       memcpy(guid+8, tail, 16);
+}
+
+static int init_super_ddf_bvd(struct supertype *st,
+                             mdu_array_info_t *info,
+                             unsigned long long size,
+                             char *name, char *homehost,
+                             int *uuid);
+
+/* Build a fresh in-memory DDF superblock for a new container.
+ *  st:       receives the new superblock in st->sb
+ *  info:     when NULL, st->sb is cleared and 0 is returned
+ *  size, name, uuid: only forwarded to init_super_ddf_bvd()
+ *  homehost: copied into the controller vendor data when non-NULL
+ *            and shorter than 440 characters
+ * When st->sb is already set the call is handed to
+ * init_super_ddf_bvd() instead.
+ * Returns 1 on success, 0 when an allocation fails; nothing is left
+ * allocated on failure.
+ */
+static int init_super_ddf(struct supertype *st,
+                         mdu_array_info_t *info,
+                         unsigned long long size, char *name, char *homehost,
+                         int *uuid)
+{
+       /* This is primarily called by Create when creating a new array.
+        * We will then get add_to_super called for each component, and then
+        * write_init_super called to write it out to each device.
+        * For DDF, Create can create on fresh devices or on a pre-existing
+        * array.
+        * To create on a pre-existing array a different method will be called.
+        * This one is just for fresh drives.
+        *
+        * We need to create the entire 'ddf' structure which includes:
+        *  DDF headers - these are easy.
+        *  Controller data - a Sector describing this controller .. not that
+        *                  this is a controller exactly.
+        *  Physical Disk Record - one entry per device, so
+        *                      leave plenty of space.
+        *  Virtual Disk Records - again, just leave plenty of space.
+        *                   This just lists VDs, doesn't give details
+        *  Config records - describes the VDs that use this disk
+        *  DiskData  - describes 'this' device.
+        *  BadBlockManagement - empty
+        *  Diag Space - empty
+        *  Vendor Logs - Could we put bitmaps here?
+        *
+        */
+       struct ddf_super *ddf;
+       char hostname[17];
+       int hostlen;
+       int max_phys_disks, max_virt_disks;
+       unsigned long long sector;
+       int clen;
+       int i;
+       int pdsize, vdsize;
+       struct phys_disk *pd;
+       struct virtual_disk *vd;
+
+       if (!info) {
+               st->sb = NULL;
+               return 0;
+       }
+       if (st->sb)
+               return init_super_ddf_bvd(st, info, size, name, homehost,
+                                         uuid);
+
+       if (posix_memalign((void**)&ddf, 512, sizeof(*ddf)) != 0) {
+               fprintf(stderr, Name ": %s could not allocate superblock\n", __func__);
+               return 0;
+       }
+       memset(ddf, 0, sizeof(*ddf));
+       ddf->dlist = NULL; /* no physical disks yet */
+       ddf->conflist = NULL; /* No virtual disks yet */
+
+       /* At least 32MB *must* be reserved for the ddf.  So let's just
+        * start 32MB from the end, and put the primary header there.
+        * Don't do secondary for now.
+        * We don't know exactly where that will be yet as it could be
+        * different on each device.  So just set up the lengths.
+        *
+        */
+
+       ddf->anchor.magic = DDF_HEADER_MAGIC;
+       make_header_guid(ddf->anchor.guid);
+
+       memcpy(ddf->anchor.revision, DDF_REVISION_2, 8);
+       ddf->anchor.seq = __cpu_to_be32(1);
+       ddf->anchor.timestamp = __cpu_to_be32(time(0) - DECADE);
+       ddf->anchor.openflag = 0xFF;
+       ddf->anchor.foreignflag = 0;
+       ddf->anchor.enforcegroups = 0; /* Is this best?? */
+       ddf->anchor.pad0 = 0xff;
+       memset(ddf->anchor.pad1, 0xff, 12);
+       memset(ddf->anchor.header_ext, 0xff, 32);
+       ddf->anchor.primary_lba = ~(__u64)0;
+       ddf->anchor.secondary_lba = ~(__u64)0;
+       ddf->anchor.type = DDF_HEADER_ANCHOR;
+       memset(ddf->anchor.pad2, 0xff, 3);
+       ddf->anchor.workspace_len = __cpu_to_be32(32768); /* Must be reserved */
+       ddf->anchor.workspace_lba = ~(__u64)0; /* Put this at bottom
+                                                 of 32M reserved.. */
+       max_phys_disks = 1023;   /* Should be enough */
+       ddf->anchor.max_pd_entries = __cpu_to_be16(max_phys_disks);
+       max_virt_disks = 255;
+       ddf->anchor.max_vd_entries = __cpu_to_be16(max_virt_disks); /* ?? */
+       ddf->anchor.max_partitions = __cpu_to_be16(64); /* ?? */
+       ddf->max_part = 64;
+       ddf->mppe = 256;
+       ddf->conf_rec_len = 1 + ROUND_UP(ddf->mppe * (4+8), 512)/512;
+       ddf->anchor.config_record_len = __cpu_to_be16(ddf->conf_rec_len);
+       ddf->anchor.max_primary_element_entries = __cpu_to_be16(ddf->mppe);
+       memset(ddf->anchor.pad3, 0xff, 54);
+       /* controller sections is one sector long immediately
+        * after the ddf header */
+       sector = 1;
+       ddf->anchor.controller_section_offset = __cpu_to_be32(sector);
+       ddf->anchor.controller_section_length = __cpu_to_be32(1);
+       sector += 1;
+
+       /* phys is 8 sectors after that */
+       pdsize = ROUND_UP(sizeof(struct phys_disk) +
+                         sizeof(struct phys_disk_entry)*max_phys_disks,
+                         512);
+       /* only these section sizes (in sectors) are accepted */
+       switch(pdsize/512) {
+       case 2: case 8: case 32: case 128: case 512: break;
+       default: abort();
+       }
+       ddf->anchor.phys_section_offset = __cpu_to_be32(sector);
+       ddf->anchor.phys_section_length =
+               __cpu_to_be32(pdsize/512); /* max_primary_element_entries/8 */
+       sector += pdsize/512;
+
+       /* virt is another 32 sectors */
+       vdsize = ROUND_UP(sizeof(struct virtual_disk) +
+                         sizeof(struct virtual_entry) * max_virt_disks,
+                         512);
+       switch(vdsize/512) {
+       case 2: case 8: case 32: case 128: case 512: break;
+       default: abort();
+       }
+       ddf->anchor.virt_section_offset = __cpu_to_be32(sector);
+       ddf->anchor.virt_section_length =
+               __cpu_to_be32(vdsize/512); /* max_vd_entries/8 */
+       sector += vdsize/512;
+
+       clen = ddf->conf_rec_len * (ddf->max_part+1);
+       ddf->anchor.config_section_offset = __cpu_to_be32(sector);
+       ddf->anchor.config_section_length = __cpu_to_be32(clen);
+       sector += clen;
+
+       ddf->anchor.data_section_offset = __cpu_to_be32(sector);
+       ddf->anchor.data_section_length = __cpu_to_be32(1);
+       sector += 1;
+
+       ddf->anchor.bbm_section_length = __cpu_to_be32(0);
+       ddf->anchor.bbm_section_offset = __cpu_to_be32(0xFFFFFFFF);
+       ddf->anchor.diag_space_length = __cpu_to_be32(0);
+       ddf->anchor.diag_space_offset = __cpu_to_be32(0xFFFFFFFF);
+       ddf->anchor.vendor_length = __cpu_to_be32(0);
+       ddf->anchor.vendor_offset = __cpu_to_be32(0xFFFFFFFF);
+
+       memset(ddf->anchor.pad4, 0xff, 256);
+
+       /* primary and secondary start out as copies of the anchor */
+       memcpy(&ddf->primary, &ddf->anchor, 512);
+       memcpy(&ddf->secondary, &ddf->anchor, 512);
+
+       ddf->primary.openflag = 1; /* I guess.. */
+       ddf->primary.type = DDF_HEADER_PRIMARY;
+
+       ddf->secondary.openflag = 1; /* I guess.. */
+       ddf->secondary.type = DDF_HEADER_SECONDARY;
+
+       ddf->active = &ddf->primary;
+
+       ddf->controller.magic = DDF_CONTROLLER_MAGIC;
+
+       /* 24 more bytes of fiction required.
+        * first 8 are a 'vendor-id'  - "Linux-MD"
+        * Remaining 16 are serial number.... maybe a hostname would do?
+        */
+       memcpy(ddf->controller.guid, T10, sizeof(T10));
+       gethostname(hostname, sizeof(hostname));
+       hostname[sizeof(hostname) - 1] = 0;
+       hostlen = strlen(hostname);
+       /* hostname is right-aligned in the GUID, space padded on the left */
+       memcpy(ddf->controller.guid + 24 - hostlen, hostname, hostlen);
+       for (i = strlen(T10) ; i+hostlen < 24; i++)
+               ddf->controller.guid[i] = ' ';
+
+       ddf->controller.type.vendor_id = __cpu_to_be16(0xDEAD);
+       ddf->controller.type.device_id = __cpu_to_be16(0xBEEF);
+       ddf->controller.type.sub_vendor_id = 0;
+       ddf->controller.type.sub_device_id = 0;
+       memcpy(ddf->controller.product_id, "What Is My PID??", 16);
+       memset(ddf->controller.pad, 0xff, 8);
+       memset(ddf->controller.vendor_data, 0xff, 448);
+       if (homehost && strlen(homehost) < 440)
+               strcpy((char*)ddf->controller.vendor_data, homehost);
+
+       if (posix_memalign((void**)&pd, 512, pdsize) != 0) {
+               fprintf(stderr, Name ": %s could not allocate pd\n", __func__);
+               /* st->sb has not been set yet, so nobody else will
+                * ever free the superblock
+                */
+               free(ddf);
+               return 0;
+       }
+       ddf->phys = pd;
+       ddf->pdsize = pdsize;
+
+       /* entries are all 0xff, the table header is zeroed */
+       memset(pd, 0xff, pdsize);
+       memset(pd, 0, sizeof(*pd));
+       pd->magic = DDF_PHYS_DATA_MAGIC;
+       pd->used_pdes = __cpu_to_be16(0);
+       pd->max_pdes = __cpu_to_be16(max_phys_disks);
+       memset(pd->pad, 0xff, 52);
+
+       if (posix_memalign((void**)&vd, 512, vdsize) != 0) {
+               fprintf(stderr, Name ": %s could not allocate vd\n", __func__);
+               /* release what was built so far */
+               free(pd);
+               free(ddf);
+               return 0;
+       }
+       ddf->virt = vd;
+       ddf->vdsize = vdsize;
+       memset(vd, 0, vdsize);
+       vd->magic = DDF_VIRT_RECORDS_MAGIC;
+       vd->populated_vdes = __cpu_to_be16(0);
+       vd->max_vdes = __cpu_to_be16(max_virt_disks);
+       memset(vd->pad, 0xff, 52);
+
+       /* an all-0xff entry is what all_ff() recognises as unused */
+       for (i=0; i<max_virt_disks; i++)
+               memset(&vd->entries[i], 0xff, sizeof(struct virtual_entry));
+
+       st->sb = ddf;
+       ddf->updates_pending = 1;
+       return 1;
+}
+
+/* Convert a chunk size to a shift count.  chunksize/512 is treated as
+ * a count of 512-byte units (presumably chunksize is in bytes), and
+ * ffs() reports the 1-based position of its lowest set bit, so
+ * subtracting one gives the shift.
+ */
+static int chunk_to_shift(int chunksize)
+{
+       int units = chunksize / 512;
+
+       return ffs(units) - 1;
+}
+
+/* Translate an md RAID level number into the matching DDF primary
+ * RAID level constant.  Levels with no mapping here yield -1.
+ */
+static int level_to_prl(int level)
+{
+       int prl = -1;
+
+       switch (level) {
+       case LEVEL_LINEAR:
+               prl = DDF_CONCAT;
+               break;
+       case 0:
+               prl = DDF_RAID0;
+               break;
+       case 1:
+               prl = DDF_RAID1;
+               break;
+       case 4:
+               prl = DDF_RAID4;
+               break;
+       case 5:
+               prl = DDF_RAID5;
+               break;
+       case 6:
+               prl = DDF_RAID6;
+               break;
+       default:
+               break;
+       }
+       return prl;
+}
+/* Map an md (level, layout, raiddisks) combination onto a DDF RAID
+ * level qualifier.  Returns -1 when the combination has no mapping
+ * here.
+ */
+static int layout_to_rlq(int level, int layout, int raiddisks)
+{
+       switch(level) {
+       case 0:
+               return DDF_RAID0_SIMPLE;
+       case 1:
+               switch(raiddisks) {
+               case 2: return DDF_RAID1_SIMPLE;
+               case 3: return DDF_RAID1_MULTI;
+               default: return -1;
+               }
+       case 4:
+               switch(layout) {
+               case 0: return DDF_RAID4_N;
+               }
+               break;
+       case 5:
+               switch(layout) {
+               case ALGORITHM_LEFT_ASYMMETRIC:
+                       return DDF_RAID5_N_RESTART;
+               case ALGORITHM_RIGHT_ASYMMETRIC:
+                       return DDF_RAID5_0_RESTART;
+               case ALGORITHM_LEFT_SYMMETRIC:
+                       return DDF_RAID5_N_CONTINUE;
+               case ALGORITHM_RIGHT_SYMMETRIC:
+                       return -1; /* not mentioned in standard */
+               }
+               /* Stop here: without this break an unrecognised RAID5
+                * layout would be looked up in the RAID6 table below.
+                */
+               break;
+       case 6:
+               switch(layout) {
+               case ALGORITHM_ROTATING_N_RESTART:
+                       /* rlq_to_layout() decodes RAID6 with the same
+                        * DDF_RAID5_N_* names, so they are kept here.
+                        */
+                       return DDF_RAID5_N_RESTART;
+               case ALGORITHM_ROTATING_ZERO_RESTART:
+                       return DDF_RAID6_0_RESTART;
+               case ALGORITHM_ROTATING_N_CONTINUE:
+                       return DDF_RAID5_N_CONTINUE;
+               }
+               break;
+       }
+       return -1;
+}
+
+/* Translate a DDF RAID level qualifier, interpreted according to the
+ * primary RAID level 'prl', into an md layout number.  Returns -1 for
+ * combinations that are not handled.  'raiddisks' is not consulted.
+ */
+static int rlq_to_layout(int rlq, int prl, int raiddisks)
+{
+       int layout = -1;
+
+       if (prl == DDF_RAID0) {
+               /* hopefully rlq == DDF_RAID0_SIMPLE */
+               return 0;
+       }
+       if (prl == DDF_RAID1) {
+               /* hopefully rlq == SIMPLE or MULTI depending
+                * on raiddisks
+                */
+               return 0;
+       }
+
+       if (prl == DDF_RAID4) {
+               /* Only DDF_RAID4_N is supported.
+                * FIXME the -1 for anything else isn't checked
+                */
+               if (rlq == DDF_RAID4_N)
+                       layout = 0;
+       } else if (prl == DDF_RAID5) {
+               switch (rlq) {
+               case DDF_RAID5_N_RESTART:
+                       layout = ALGORITHM_LEFT_ASYMMETRIC;
+                       break;
+               case DDF_RAID5_0_RESTART:
+                       layout = ALGORITHM_RIGHT_ASYMMETRIC;
+                       break;
+               case DDF_RAID5_N_CONTINUE:
+                       layout = ALGORITHM_LEFT_SYMMETRIC;
+                       break;
+               default:
+                       break;
+               }
+       } else if (prl == DDF_RAID6) {
+               switch (rlq) {
+               case DDF_RAID5_N_RESTART:
+                       layout = ALGORITHM_ROTATING_N_RESTART;
+                       break;
+               case DDF_RAID6_0_RESTART:
+                       layout = ALGORITHM_ROTATING_ZERO_RESTART;
+                       break;
+               case DDF_RAID5_N_CONTINUE:
+                       layout = ALGORITHM_ROTATING_N_CONTINUE;
+                       break;
+               default:
+                       break;
+               }
+       }
+       return layout;
+}
+
+#ifndef MDASSEMBLE
+/* One used region of a physical device as reported by get_extents():
+ * where it begins and how long it is.  An entry with size == 0 ends
+ * the array.
+ */
+struct extent {
+       unsigned long long start;
+       unsigned long long size;
+};
+/* qsort() comparator: order extents by ascending 'start'. */
+static int cmp_extent(const void *av, const void *bv)
+{
+       const struct extent *lhs = av;
+       const struct extent *rhs = bv;
+
+       if (lhs->start == rhs->start)
+               return 0;
+       return (lhs->start < rhs->start) ? -1 : 1;
+}
+
+static struct extent *get_extents(struct ddf_super *ddf, struct dl *dl)
+{
+       /* Find the list of used extents on the given physical device
+        * (dl) of the given ddf.
+        * Return a malloced array of 'struct extent', sorted by start.
+        * The array ends with an entry of size 0 whose start is the
+        * config_size recorded for the device.  The caller frees it.
+        * Returns NULL if the array cannot be allocated.
+        *
+        * FIXME ignore DDF_Legacy devices?
+        */
+       struct extent *rv;
+       int n = 0;
+       int i, j;
+
+       /* At most one extent per vlist slot, plus the terminator. */
+       rv = malloc(sizeof(struct extent) * (ddf->max_part + 2));
+       if (!rv)
+               return NULL;
+
+       for (i = 0; i < ddf->max_part; i++) {
+               struct vcl *v = dl->vlist[i];
+               int elmnts;
+
+               if (v == NULL)
+                       continue;
+               /* prim_elmnt_count is kept big-endian in the config
+                * record, so convert it before using it as a bound.
+                */
+               elmnts = __be16_to_cpu(v->conf.prim_elmnt_count);
+               for (j = 0; j < elmnts; j++)
+                       if (v->conf.phys_refnum[j] == dl->disk.refnum) {
+                               /* This device plays role 'j' in  'v'. */
+                               rv[n].start = __be64_to_cpu(v->lba_offset[j]);
+                               rv[n].size = __be64_to_cpu(v->conf.blocks);
+                               n++;
+                               break;
+                       }
+       }
+       qsort(rv, n, sizeof(*rv), cmp_extent);
+
+       rv[n].start = __be64_to_cpu(ddf->phys->entries[dl->pdnum].config_size);
+       rv[n].size = 0;
+       return rv;
+}
+#endif
+
+static int init_super_ddf_bvd(struct supertype *st,
+                             mdu_array_info_t *info,
+                             unsigned long long size,
+                             char *name, char *homehost,
+                             int *uuid)
+{
+       /* We are creating a BVD inside a pre-existing container.
+        * so st->sb is already set.
+        * We need to create a new vd_config and a new virtual_entry
+        *
+        * On success the new config is linked at the head of
+        * ddf->conflist, becomes ddf->currentconf, and 1 is returned.
+        * On failure 0 is returned and the container's virtual disk
+        * table is left untouched.
+        */
+       struct ddf_super *ddf = st->sb;
+       int venum;
+       int i;
+       struct virtual_entry *ve;
+       struct vcl *vcl;
+       struct vd_config *vc;
+
+       if (__be16_to_cpu(ddf->virt->populated_vdes)
+           >= __be16_to_cpu(ddf->virt->max_vdes)) {
+               fprintf(stderr, Name": This ddf already has the "
+                       "maximum of %d virtual devices\n",
+                       __be16_to_cpu(ddf->virt->max_vdes));
+               return 0;
+       }
+
+       /* An unused entry has a GUID of all 0xff bytes. */
+       for (venum = 0; venum < __be16_to_cpu(ddf->virt->max_vdes); venum++)
+               if (all_ff(ddf->virt->entries[venum].guid))
+                       break;
+       if (venum == __be16_to_cpu(ddf->virt->max_vdes)) {
+               fprintf(stderr, Name ": Cannot find spare slot for "
+                       "virtual disk - DDF is corrupt\n");
+               return 0;
+       }
+
+       /* Allocate the vd_config before claiming the virtual entry, so
+        * that an allocation failure does not leave a populated entry
+        * (and a bumped populated_vdes) with no config behind it.
+        */
+       if (posix_memalign((void**)&vcl, 512,
+                          (offsetof(struct vcl, conf) + ddf->conf_rec_len * 512)) != 0) {
+               fprintf(stderr, Name ": %s could not allocate vd_config\n", __func__);
+               return 0;
+       }
+
+       ve = &ddf->virt->entries[venum];
+
+       /* A Virtual Disk GUID contains the T10 Vendor ID, controller type,
+        * timestamp, random number
+        */
+       make_header_guid(ve->guid);
+       ve->unit = __cpu_to_be16(info->md_minor);
+       ve->pad0 = 0xFFFF;
+       ve->guid_crc = crc32(0, (unsigned char*)ddf->anchor.guid, DDF_GUID_LEN);
+       ve->type = 0;
+       ve->state = DDF_state_degraded; /* Will be modified as devices are added */
+       if (info->state & 1) /* clean */
+               ve->init_state = DDF_init_full;
+       else
+               ve->init_state = DDF_init_not;
+
+       memset(ve->pad1, 0xff, 14);
+       memset(ve->name, ' ', 16);
+       if (name)
+               strncpy(ve->name, name, 16);
+       ddf->virt->populated_vdes =
+               __cpu_to_be16(__be16_to_cpu(ddf->virt->populated_vdes)+1);
+
+       /* Now fill in the new vd_config.  The lba_offset table sits
+        * directly after the mppe phys_refnum slots.
+        */
+       vcl->lba_offset = (__u64*) &vcl->conf.phys_refnum[ddf->mppe];
+       vcl->vcnum = venum;
+       sprintf(st->subarray, "%d", venum);
+       vcl->block_sizes = NULL; /* FIXME not for CONCAT */
+
+       vc = &vcl->conf;
+
+       vc->magic = DDF_VD_CONF_MAGIC;
+       memcpy(vc->guid, ve->guid, DDF_GUID_LEN);
+       vc->timestamp = __cpu_to_be32(time(0)-DECADE);
+       vc->seqnum = __cpu_to_be32(1);
+       memset(vc->pad0, 0xff, 24);
+       vc->prim_elmnt_count = __cpu_to_be16(info->raid_disks);
+       vc->chunk_shift = chunk_to_shift(info->chunk_size);
+       vc->prl = level_to_prl(info->level);
+       vc->rlq = layout_to_rlq(info->level, info->layout, info->raid_disks);
+       vc->sec_elmnt_count = 1;
+       vc->sec_elmnt_seq = 0;
+       vc->srl = 0;
+       vc->blocks = __cpu_to_be64(info->size * 2);
+       vc->array_blocks = __cpu_to_be64(
+               calc_array_size(info->level, info->raid_disks, info->layout,
+                               info->chunk_size, info->size*2));
+       memset(vc->pad1, 0xff, 8);
+       /* No dedicated spares: mark all eight references unused. */
+       for (i = 0; i < 8; i++)
+               vc->spare_refs[i] = 0xffffffff;
+       memset(vc->cache_pol, 0, 8);
+       vc->bg_rate = 0x80;
+       memset(vc->pad2, 0xff, 3);
+       memset(vc->pad3, 0xff, 52);
+       memset(vc->pad4, 0xff, 192);
+       memset(vc->v0, 0xff, 32);
+       memset(vc->v1, 0xff, 32);
+       memset(vc->v2, 0xff, 16);
+       memset(vc->v3, 0xff, 16);
+       memset(vc->vendor, 0xff, 32);
+
+       /* phys_refnum slots start out unused (0xff); the lba offsets
+        * that follow them start at zero.
+        */
+       memset(vc->phys_refnum, 0xff, 4*ddf->mppe);
+       memset(vc->phys_refnum+ddf->mppe, 0x00, 8*ddf->mppe);
+
+       vcl->next = ddf->conflist;
+       ddf->conflist = vcl;
+       ddf->currentconf = vcl;
+       ddf->updates_pending = 1;
+       return 1;
+}
+
+#ifndef MDASSEMBLE
+static void add_to_super_ddf_bvd(struct supertype *st,
+                                mdu_disk_info_t *dk, int fd, char *devname)
+{
+       /* fd and devname identify a device with-in the ddf container (st).
+        * dk identifies a location in the new BVD.
+        * We need to find suitable free space in that device and update
+        * the phys_refnum and lba_offset for the newly created vd_config.
+        * We might also want to update the type in the phys_disk
+        * section.
+        *
+        * Alternately: fd == -1 and we have already chosen which device to
+        * use and recorded in dlist->raid_disk;
+        *
+        * Nothing is changed if the device is not found, is not marked
+        * in-sync, has no large enough free extent, or has no free
+        * vlist slot left.
+        */
+       struct dl *dl;
+       struct ddf_super *ddf = st->sb;
+       struct vd_config *vc;
+       __u64 *lba_offset;
+       int working;
+       int i;
+       int slot;
+       unsigned long long blocks, pos, esize;
+       struct extent *ex;
+
+       if (fd == -1) {
+               for (dl = ddf->dlist; dl ; dl = dl->next)
+                       if (dl->raiddisk == dk->raid_disk)
+                               break;
+       } else {
+               for (dl = ddf->dlist; dl ; dl = dl->next)
+                       if (dl->major == dk->major &&
+                           dl->minor == dk->minor)
+                               break;
+       }
+       if (!dl || ! (dk->state & (1<<MD_DISK_SYNC)))
+               return;
+
+       vc = &ddf->currentconf->conf;
+       lba_offset = ddf->currentconf->lba_offset;
+
+       ex = get_extents(ddf, dl);
+       if (!ex)
+               return;
+
+       i = 0; pos = 0;
+       blocks = __be64_to_cpu(vc->blocks);
+       if (ddf->currentconf->block_sizes)
+               blocks = ddf->currentconf->block_sizes[dk->raid_disk];
+
+       /* First fit: walk the gaps between the sorted used extents
+        * until one is big enough.  The final entry (size 0) marks the
+        * end of the usable space.
+        */
+       do {
+               esize = ex[i].start - pos;
+               if (esize >= blocks)
+                       break;
+               pos = ex[i].start + ex[i].size;
+               i++;
+       } while (ex[i-1].size);
+
+       free(ex);
+       if (esize < blocks)
+               return;
+
+       /* Make sure the device can record one more config before the
+        * config is made to refer to the device, so that giving up
+        * here leaves both sides unchanged.
+        */
+       for (slot = 0; slot < ddf->max_part ; slot++)
+               if (dl->vlist[slot] == NULL)
+                       break;
+       if (slot == ddf->max_part)
+               return;
+
+       ddf->currentdev = dk->raid_disk;
+       vc->phys_refnum[dk->raid_disk] = dl->disk.refnum;
+       lba_offset[dk->raid_disk] = __cpu_to_be64(pos);
+       dl->vlist[slot] = ddf->currentconf;
+
+       if (fd >= 0)
+               dl->fd = fd;
+       if (devname)
+               dl->devname = devname;
+
+       /* Check how many working raid_disks, and if we can mark
+        * array as optimal yet
+        */
+       working = 0;
+
+       for (i=0; i < __be16_to_cpu(vc->prim_elmnt_count); i++)
+               if (vc->phys_refnum[i] != 0xffffffff)
+                       working++;
+
+       /* Find which virtual_entry */
+       i = ddf->currentconf->vcnum;
+       if (working == __be16_to_cpu(vc->prim_elmnt_count))
+               ddf->virt->entries[i].state =
+                       (ddf->virt->entries[i].state & ~DDF_state_mask)
+                       | DDF_state_optimal;
+
+       if (vc->prl == DDF_RAID6 &&
+           working+1 == __be16_to_cpu(vc->prim_elmnt_count))
+               ddf->virt->entries[i].state =
+                       (ddf->virt->entries[i].state & ~DDF_state_mask)
+                       | DDF_state_part_optimal;
+
+       ddf->phys->entries[dl->pdnum].type &= ~__cpu_to_be16(DDF_Global_Spare);
+       ddf->phys->entries[dl->pdnum].type |= __cpu_to_be16(DDF_Active_in_VD);
+       ddf->updates_pending = 1;
+}
+
+/* add a device to a container, either while creating it or while
+ * expanding a pre-existing container
+ *
+ * If a BVD is being built (ddf->currentconf is set) the request is
+ * passed to add_to_super_ddf_bvd() and 0 is returned.  Otherwise a
+ * 'struct dl' is allocated for the device and a phys_disk_entry is
+ * filled in for it: with st->update_tail set the entry lives in a
+ * separately allocated record (dd->mdupdate) and the device is put on
+ * ddf->add_list, otherwise the entry is written into ddf->phys and the
+ * device is put on ddf->dlist.
+ * Returns 0 on success and 1 on failure.
+ */
+static int add_to_super_ddf(struct supertype *st,
+                            mdu_disk_info_t *dk, int fd, char *devname)
+{
+       struct ddf_super *ddf = st->sb;
+       struct dl *dd;
+       time_t now;
+       struct tm *tm;
+       unsigned long long size;
+       struct phys_disk_entry *pde;
+       int n, i;
+       struct stat stb;
+
+       if (ddf->currentconf) {
+               add_to_super_ddf_bvd(st, dk, fd, devname);
+               return 0;
+       }
+
+       /* This is device numbered dk->number.  We need to create
+        * a phys_disk entry and a more detailed disk_data entry.
+        */
+       if (fstat(fd, &stb) != 0) {
+               /* Without this, major/minor would come from garbage. */
+               fprintf(stderr, Name ": %s could not stat device: %s\n",
+                       __func__, strerror(errno));
+               return 1;
+       }
+       if (posix_memalign((void**)&dd, 512,
+                          sizeof(*dd) + sizeof(dd->vlist[0]) * ddf->max_part) != 0) {
+               fprintf(stderr, Name
+                       ": %s could not allocate buffer for new disk, aborting\n",
+                       __func__);
+               return 1;
+       }
+       dd->major = major(stb.st_rdev);
+       dd->minor = minor(stb.st_rdev);
+       dd->devname = devname;
+       dd->fd = fd;
+       dd->spare = NULL;
+
+       /* GUID: vendor id, creation date, then 8 random bytes. */
+       dd->disk.magic = DDF_PHYS_DATA_MAGIC;
+       now = time(0);
+       tm = localtime(&now);
+       sprintf(dd->disk.guid, "%8s%04d%02d%02d",
+               T10, tm->tm_year+1900, tm->tm_mon+1, tm->tm_mday);
+       *(__u32*)(dd->disk.guid + 16) = random32();
+       *(__u32*)(dd->disk.guid + 20) = random32();
+
+       /* Pick a reference number not used by any existing entry. */
+       do {
+               /* Cannot be bothered finding a CRC of some irrelevant details*/
+               dd->disk.refnum = random32();
+               for (i = __be16_to_cpu(ddf->active->max_pd_entries) - 1;
+                    i >= 0; i--)
+                       if (ddf->phys->entries[i].refnum == dd->disk.refnum)
+                               break;
+       } while (i >= 0);
+
+       dd->disk.forced_ref = 1;
+       dd->disk.forced_guid = 1;
+       memset(dd->disk.vendor, ' ', 32);
+       memcpy(dd->disk.vendor, "Linux", 5);
+       memset(dd->disk.pad, 0xff, 442);
+       for (i = 0; i < ddf->max_part ; i++)
+               dd->vlist[i] = NULL;
+
+       n = __be16_to_cpu(ddf->phys->used_pdes);
+       pde = &ddf->phys->entries[n];
+       dd->pdnum = n;
+
+       if (st->update_tail) {
+               /* Build a one-entry record to be queued as an update
+                * instead of touching ddf->phys directly.
+                */
+               int len = (sizeof(struct phys_disk) +
+                          sizeof(struct phys_disk_entry));
+               struct phys_disk *pd;
+
+               pd = malloc(len);
+               if (!pd) {
+                       fprintf(stderr, Name
+                               ": %s could not allocate update record\n",
+                               __func__);
+                       free(dd);
+                       return 1;
+               }
+               pd->magic = DDF_PHYS_RECORDS_MAGIC;
+               pd->used_pdes = __cpu_to_be16(n);
+               pde = &pd->entries[0];
+               dd->mdupdate = pd;
+       } else {
+               n++;
+               ddf->phys->used_pdes = __cpu_to_be16(n);
+       }
+
+       memcpy(pde->guid, dd->disk.guid, DDF_GUID_LEN);
+       pde->refnum = dd->disk.refnum;
+       pde->type = __cpu_to_be16(DDF_Forced_PD_GUID | DDF_Global_Spare);
+       pde->state = __cpu_to_be16(DDF_Online);
+       /* NOTE(review): the result of get_dev_size() is not checked and
+        * a device smaller than 32Meg would make the subtraction below
+        * wrap - confirm callers rule both out.
+        */
+       get_dev_size(fd, NULL, &size);
+       /* We are required to reserve 32Meg, and record the size in sectors */
+       pde->config_size = __cpu_to_be64( (size - 32*1024*1024) / 512);
+       sprintf(pde->path, "%17.17s","Information: nil") ;
+       memset(pde->pad, 0xff, 6);
+
+       dd->size = size >> 9;
+       if (st->update_tail) {
+               dd->next = ddf->add_list;
+               ddf->add_list = dd;
+       } else {
+               dd->next = ddf->dlist;
+               ddf->dlist = dd;
+               ddf->updates_pending = 1;
+       }
+
+       return 0;
+}
+
+/*
+ * This is the write_init_super method for a ddf container.  It is
+ * called when creating a container or adding another device to a
+ * container.
+ */
+
+static unsigned char null_conf[4096+512];
+
+/* Write the in-memory DDF metadata to every device on ddf->dlist that
+ * has an open fd: primary header, controller data, physical and
+ * virtual disk records, one config record slot per partition plus the
+ * spare slot, the disk data, and finally the anchor in the last
+ * sector.  A device on which any write fails is abandoned and the next
+ * one is tried.  With do_close set, every device's fd is closed
+ * afterwards and set to -1.
+ * Returns 0 if every attempted device was written completely,
+ * non-zero otherwise.
+ */
+static int __write_init_super_ddf(struct supertype *st, int do_close)
+{
+
+       struct ddf_super *ddf = st->sb;
+       int i;
+       struct dl *d;
+       int n_config;
+       int conf_size;
+       int attempts = 0;
+       int successes = 0;
+       unsigned long long size, sector;
+
+       /* try to write updated metadata,
+        * if we catch a failure move on to the next disk
+        */
+       for (d = ddf->dlist; d; d=d->next) {
+               int fd = d->fd;
+
+               if (fd < 0)
+                       continue;
+
+               attempts++;
+               /* We need to fill in the primary, (secondary) and workspace
+                * lba's in the headers, set their checksums,
+                * Also checksum phys, virt....
+                *
+                * Then write everything out, finally the anchor is written.
+                */
+               get_dev_size(fd, NULL, &size);
+               size /= 512;
+               ddf->anchor.workspace_lba = __cpu_to_be64(size - 32*1024*2);
+               ddf->anchor.primary_lba = __cpu_to_be64(size - 16*1024*2);
+               ddf->anchor.seq = __cpu_to_be32(1);
+               memcpy(&ddf->primary, &ddf->anchor, 512);
+               memcpy(&ddf->secondary, &ddf->anchor, 512);
+
+               ddf->anchor.openflag = 0xFF; /* 'open' means nothing */
+               ddf->anchor.seq = 0xFFFFFFFF; /* no sequencing in anchor */
+               ddf->anchor.crc = calc_crc(&ddf->anchor, 512);
+
+               ddf->primary.openflag = 0;
+               ddf->primary.type = DDF_HEADER_PRIMARY;
+
+               ddf->secondary.openflag = 0;
+               ddf->secondary.type = DDF_HEADER_SECONDARY;
+
+               ddf->primary.crc = calc_crc(&ddf->primary, 512);
+               ddf->secondary.crc = calc_crc(&ddf->secondary, 512);
+
+               /* The sections below are written back to back starting
+                * at the primary header.
+                */
+               sector = size - 16*1024*2;
+               lseek64(fd, sector<<9, 0);
+               if (write(fd, &ddf->primary, 512) < 0)
+                       continue;
+
+               ddf->controller.crc = calc_crc(&ddf->controller, 512);
+               if (write(fd, &ddf->controller, 512) < 0)
+                       continue;
+
+               ddf->phys->crc = calc_crc(ddf->phys, ddf->pdsize);
+
+               if (write(fd, ddf->phys, ddf->pdsize) < 0)
+                       continue;
+
+               ddf->virt->crc = calc_crc(ddf->virt, ddf->vdsize);
+               if (write(fd, ddf->virt, ddf->vdsize) < 0)
+                       continue;
+
+               /* Now write lots of config records. */
+               n_config = ddf->max_part;
+               conf_size = ddf->conf_rec_len * 512;
+               for (i = 0 ; i <= n_config ; i++) {
+                       struct vcl *c;
+
+                       /* The extra, final slot holds the spare record.
+                        * vlist only has max_part entries, so it must
+                        * not be indexed for that slot.
+                        */
+                       if (i == n_config)
+                               c = (struct vcl*)d->spare;
+                       else
+                               c = d->vlist[i];
+
+                       if (c) {
+                               c->conf.crc = calc_crc(&c->conf, conf_size);
+                               if (write(fd, &c->conf, conf_size) < 0)
+                                       break;
+                       } else {
+                               /* Empty slot: fill it with 0xff from a
+                                * 512-byte aligned window of null_conf.
+                                */
+                               char *null_aligned = (char*)((((unsigned long)null_conf)+511)&~511UL);
+                               if (null_conf[0] != 0xff)
+                                       memset(null_conf, 0xff, sizeof(null_conf));
+                               int togo = conf_size;
+                               while (togo > sizeof(null_conf)-512) {
+                                       if (write(fd, null_aligned, sizeof(null_conf)-512) < 0)
+                                               break;
+                                       togo -= sizeof(null_conf)-512;
+                               }
+                               if (write(fd, null_aligned, togo) < 0)
+                                       break;
+                       }
+               }
+               /* Leaving the loop early means a write failed. */
+               if (i <= n_config)
+                       continue;
+               d->disk.crc = calc_crc(&d->disk, 512);
+               if (write(fd, &d->disk, 512) < 0)
+                       continue;
+
+               /* Maybe do the same for secondary */
+
+               lseek64(fd, (size-1)*512, SEEK_SET);
+               if (write(fd, &ddf->anchor, 512) < 0)
+                       continue;
+               successes++;
+       }
+
+       if (do_close)
+               for (d = ddf->dlist; d; d=d->next) {
+                       close(d->fd);
+                       d->fd = -1;
+               }
+
+       return attempts != successes;
+}
+
+/* write_init_super method.  Without st->update_tail the metadata is
+ * written directly via __write_init_super_ddf().  With it, the change
+ * is queued through append_metadata_update() instead: either the
+ * record prepared for a newly added disk, or the virtual disk entry
+ * and vd_config of a newly created VD.
+ * Returns 0 on success, non-zero on failure.
+ */
+static int write_init_super_ddf(struct supertype *st)
+{
+       struct ddf_super *ddf = st->sb;
+       struct virtual_disk *vd;
+       struct vd_config *vc;
+       int vd_len, vc_len;
+
+       if (!st->update_tail)
+               return __write_init_super_ddf(st, 1);
+
+       /* queue the virtual_disk and vd_config as metadata updates */
+
+       if (!ddf->currentconf) {
+               int len = (sizeof(struct phys_disk) +
+                          sizeof(struct phys_disk_entry));
+
+               /* adding a disk to the container. */
+               if (!ddf->add_list)
+                       return 0;
+
+               append_metadata_update(st, ddf->add_list->mdupdate, len);
+               ddf->add_list->mdupdate = NULL;
+               return 0;
+       }
+
+       /* Newly created VD */
+
+       /* Get both buffers up front so that a failed allocation cannot
+        * leave only half of the update queued.
+        */
+       vd_len = sizeof(struct virtual_disk) + sizeof(struct virtual_entry);
+       vc_len = ddf->conf_rec_len * 512;
+       vd = malloc(vd_len);
+       vc = malloc(vc_len);
+       if (!vd || !vc) {
+               fprintf(stderr, Name ": %s could not allocate metadata update\n",
+                       __func__);
+               free(vd);
+               free(vc);
+               return 1;
+       }
+
+       /* First the virtual disk.  We have a slightly fake header:
+        * a single entry, with populated_vdes carrying the index of
+        * that entry.
+        */
+       *vd = *ddf->virt;
+       vd->entries[0] = ddf->virt->entries[ddf->currentconf->vcnum];
+       vd->populated_vdes = __cpu_to_be16(ddf->currentconf->vcnum);
+       append_metadata_update(st, vd, vd_len);
+
+       /* Then the vd_config */
+       memcpy(vc, &ddf->currentconf->conf, vc_len);
+       append_metadata_update(st, vc, vc_len);
+
+       /* FIXME I need to close the fds! */
+       return 0;
+}
+
+#endif
+
+/* Space left for data on a device of 'devsize' once the trailing
+ * reservation is taken off; 0 if nothing is left.  'st' is not used.
+ */
+static __u64 avail_size_ddf(struct supertype *st, __u64 devsize)
+{
+       /* We must reserve the last 32Meg */
+       const __u64 reserved = 32*1024*2;
+
+       return (devsize > reserved) ? devsize - reserved : 0;
+}
+
+#ifndef MDASSEMBLE
+
+static int reserve_space(struct supertype *st, int raiddisks,
+                        unsigned long long size, int chunk,
+                        unsigned long long *freesize)
+{
+       /* Find 'raiddisks' spare extents at least 'size' big (but
+        * only caring about multiples of 'chunk') and remember
+        * them.
+        * If they cannot be found, fail.
+        *
+        * With size == 0 the largest size available on at least
+        * 'raiddisks' devices is chosen, rounded down to a multiple
+        * of 'chunk', and stored in *freesize.
+        * Chosen devices get dl->raiddisk set to their slot number;
+        * all others are left at -1.
+        * Returns 1 on success, 0 on failure.
+        */
+       struct dl *dl;
+       struct ddf_super *ddf = st->sb;
+       int cnt = 0;
+
+       for (dl = ddf->dlist; dl ; dl=dl->next) {
+               dl->raiddisk = -1;
+               dl->esize = 0;
+       }
+       /* Now find largest extent on each device */
+       for (dl = ddf->dlist ; dl ; dl=dl->next) {
+               struct extent *e = get_extents(ddf, dl);
+               unsigned long long pos = 0;
+               int i = 0;
+               int found = 0;
+               unsigned long long minsize = size;
+
+               if (size == 0)
+                       minsize = chunk;
+
+               if (!e)
+                       continue;
+               /* Walk the gaps between used extents; minsize grows to
+                * the biggest acceptable gap seen so far.
+                */
+               do {
+                       unsigned long long esize;
+                       esize = e[i].start - pos;
+                       if (esize >= minsize) {
+                               found = 1;
+                               minsize = esize;
+                       }
+                       pos = e[i].start + e[i].size;
+                       i++;
+               } while (e[i-1].size);
+               if (found) {
+                       cnt++;
+                       dl->esize = minsize;
+               }
+               free(e);
+       }
+       if (cnt < raiddisks) {
+               fprintf(stderr, Name ": not enough devices with space to create array.\n");
+               return 0; /* No enough free spaces large enough */
+       }
+       if (size == 0) {
+               /* choose the largest size of which there are at least 'raiddisk' */
+               for (dl = ddf->dlist ; dl ; dl=dl->next) {
+                       struct dl *dl2;
+                       if (dl->esize <= size)
+                               continue;
+                       /* This is bigger than 'size', see if there are enough.
+                        * Every device counts, including those earlier
+                        * in the list than 'dl': the first-fit pass
+                        * below may use any of them.
+                        */
+                       cnt = 0;
+                       for (dl2 = ddf->dlist; dl2 ; dl2=dl2->next)
+                               if (dl2->esize >= dl->esize)
+                                       cnt++;
+                       if (cnt >= raiddisks)
+                               size = dl->esize;
+               }
+               if (chunk) {
+                       size = size / chunk;
+                       size *= chunk;
+               }
+               *freesize = size;
+               if (size < 32) {
+                       fprintf(stderr, Name ": not enough spare devices to create array.\n");
+                       return 0;
+               }
+       }
+       /* We have a 'size' of which there are enough spaces.
+        * We simply do a first-fit */
+       cnt = 0;
+       for (dl = ddf->dlist ; dl && cnt < raiddisks ; dl=dl->next) {
+               if (dl->esize < size)
+                       continue;
+
+               dl->raiddisk = cnt;
+               cnt++;
+       }
+       return 1;
+}
+
+
+
+static int
+validate_geometry_ddf_container(struct supertype *st,
+                               int level, int layout, int raiddisks,
+                               int chunk, unsigned long long size,
+                               char *dev, unsigned long long *freesize,
+                               int verbose);
+
+static int validate_geometry_ddf_bvd(struct supertype *st,
+                                    int level, int layout, int raiddisks,
+                                    int chunk, unsigned long long size,
+                                    char *dev, unsigned long long *freesize,
+                                    int verbose);
+
+static int validate_geometry_ddf(struct supertype *st,
+                                int level, int layout, int raiddisks,
+                                int chunk, unsigned long long size,
+                                char *dev, unsigned long long *freesize,
+                                int verbose)
+{
+       int fd;
+       struct mdinfo *sra;
+       int cfd;
+
+       /* ddf potentially supports lots of things, but it depends on
+        * what devices are offered (and maybe kernel version?)
+        * If given unused devices, we will make a container.
+        * If given devices in a container, we will make a BVD.
+        * If given BVDs, we make an SVD, changing all the GUIDs in the process.
+        */
+
+       if (level == LEVEL_CONTAINER) {
+               /* Must be a fresh device to add to a container */
+               return validate_geometry_ddf_container(st, level, layout,
+                                                      raiddisks, chunk,
+                                                      size, dev, freesize,
+                                                      verbose);
+       }
+
+       if (!dev) {
+               /* Initial sanity check.  Exclude illegal levels. */
+               int i;
+               for (i=0; ddf_level_num[i].num1 != MAXINT; i++)
+                       if (ddf_level_num[i].num2 == level)
+                               break;
+               if (ddf_level_num[i].num1 == MAXINT)
+                       return 0;
+               /* Should check layout? etc */
+
+               if (st->sb && freesize) {
+                       /* --create was given a container to create in.
+                        * So we need to check that there are enough
+                        * free spaces and return the amount of space.
+                        * We may as well remember which drives were
+                        * chosen so that add_to_super/getinfo_super
+                        * can return them.
+                        */
+                       return reserve_space(st, raiddisks, size, chunk, freesize);
+               }
+               return 1;
+       }
+
+       if (st->sb) {
+               /* A container has already been opened, so we are
+                * creating in there.  Maybe a BVD, maybe an SVD.
+                * Should make a distinction one day.
+                */
+               return validate_geometry_ddf_bvd(st, level, layout, raiddisks,
+                                                chunk, size, dev, freesize,
+                                                verbose);
+       }
+       /* This is the first device for the array.
+        * If it is a container, we read it in and do automagic allocations,
+        * no other devices should be given.
+        * Otherwise it must be a member device of a container, and we
+        * do manual allocation.
+        * Later we should check for a BVD and make an SVD.
+        */
+       fd = open(dev, O_RDONLY|O_EXCL, 0);
+       if (fd >= 0) {
+               sra = sysfs_read(fd, 0, GET_VERSION);
+               close(fd);
+               if (sra && sra->array.major_version == -1 &&
+                   strcmp(sra->text_version, "ddf") == 0) {
+
+                       /* load super */
+                       /* find space for 'n' devices. */
+                       /* remember the devices */
+                       /* Somehow return the fact that we have enough */
+               }
+
+               if (verbose)
+                       fprintf(stderr,
+                               Name ": ddf: Cannot create this array "
+                               "on device %s\n",
+                               dev);
+               return 0;
+       }
+       if (errno != EBUSY || (fd = open(dev, O_RDONLY, 0)) < 0) {
+               if (verbose)
+                       fprintf(stderr, Name ": ddf: Cannot open %s: %s\n",
+                               dev, strerror(errno));
+               return 0;
+       }
+       /* Well, it is in use by someone, maybe a 'ddf' container. */
+       cfd = open_container(fd);
+       if (cfd < 0) {
+               close(fd);
+               if (verbose)
+                       fprintf(stderr, Name ": ddf: Cannot use %s: %s\n",
+                               dev, strerror(EBUSY));
+               return 0;
+       }
+       sra = sysfs_read(cfd, 0, GET_VERSION);
+       close(fd);
+       if (sra && sra->array.major_version == -1 &&
+           strcmp(sra->text_version, "ddf") == 0) {
+               /* This is a member of a ddf container.  Load the container
+                * and try to create a bvd
+                */
+               struct ddf_super *ddf;
+               if (load_super_ddf_all(st, cfd, (void **)&ddf, NULL, 1) == 0) {
+                       st->sb = ddf;
+                       st->container_dev = fd2devnum(cfd);
+                       close(cfd);
+                       return validate_geometry_ddf_bvd(st, level, layout,
+                                                        raiddisks, chunk, size,
+                                                        dev, freesize,
+                                                        verbose);
+               }
+               close(cfd);
+       } else /* device may belong to a different container */
+               return 0;
+
+       return 1;
+}
+
+/*
+ * Check whether 'dev' is acceptable as a member of a new DDF container.
+ *
+ * Returns 0 when 'level' is not LEVEL_CONTAINER, when the device cannot
+ * be opened exclusively or its size cannot be read, or when
+ * avail_size_ddf() reports no usable space.  Returns 1 otherwise.
+ * With no 'dev' only the level is checked.  When a device is given,
+ * *freesize receives the value computed by avail_size_ddf().
+ */
+static int
+validate_geometry_ddf_container(struct supertype *st,
+                               int level, int layout, int raiddisks,
+                               int chunk, unsigned long long size,
+                               char *dev, unsigned long long *freesize,
+                               int verbose)
+{
+       unsigned long long bytes;
+       int devfd;
+       int size_failed;
+
+       /* Only containers are handled here. */
+       if (level != LEVEL_CONTAINER)
+               return 0;
+       /* No device to inspect: the level check is all we can do. */
+       if (dev == NULL)
+               return 1;
+
+       devfd = open(dev, O_RDONLY|O_EXCL, 0);
+       if (devfd < 0) {
+               if (verbose)
+                       fprintf(stderr, Name ": ddf: Cannot open %s: %s\n",
+                               dev, strerror(errno));
+               return 0;
+       }
+       /* The descriptor is only needed to read the size. */
+       size_failed = !get_dev_size(devfd, dev, &bytes);
+       close(devfd);
+       if (size_failed)
+               return 0;
+
+       /* avail_size_ddf() is given the size in 512-byte units. */
+       *freesize = avail_size_ddf(st, bytes >> 9);
+       return *freesize != 0;
+}
+
+/*
+ * Validate the geometry of an array that is to be created inside the
+ * DDF container already loaded in st->sb.
+ *
+ * With dev == NULL this is a general test: count the container members
+ * that have a free gap of at least 'size' (8 when 'size' is 0) and
+ * require at least 'raiddisks' such members.
+ * With dev != NULL the device must be a block device that appears in the
+ * container's device list; *freesize is set to the largest free gap
+ * found on it (0 if no extent list could be obtained).
+ *
+ * Returns 1 if the request is acceptable, 0 otherwise.
+ */
+static int validate_geometry_ddf_bvd(struct supertype *st,
+                                    int level, int layout, int raiddisks,
+                                    int chunk, unsigned long long size,
+                                    char *dev, unsigned long long *freesize,
+                                    int verbose)
+{
+       struct stat stb;
+       struct ddf_super *ddf = st->sb;
+       struct dl *dl;
+       unsigned long long pos = 0;
+       unsigned long long maxsize;
+       struct extent *e;
+       int i;
+       /* ddf/bvd supports lots of things, but not containers */
+       if (level == LEVEL_CONTAINER)
+               return 0;
+       /* We must have the container info already read in. */
+       if (!ddf)
+               return 0;
+
+       if (!dev) {
+               /* General test:  make sure there is space for
+                * 'raiddisks' device extents of size 'size'.
+                */
+               unsigned long long minsize = size;
+               int dcnt = 0;
+               if (minsize == 0)
+                       minsize = 8;
+               for (dl = ddf->dlist; dl ; dl = dl->next)
+               {
+                       int found = 0;
+                       pos = 0;
+
+                       i = 0;
+                       e = get_extents(ddf, dl);
+                       if (!e) continue;
+                       /* The list ends with an entry whose size is 0;
+                        * the gap in front of that entry is examined too.
+                        */
+                       do {
+                               unsigned long long esize;
+                               esize = e[i].start - pos;
+                               if (esize >= minsize)
+                                       found = 1;
+                               pos = e[i].start + e[i].size;
+                               i++;
+                       } while (e[i-1].size);
+                       if (found)
+                               dcnt++;
+                       free(e);
+               }
+               if (dcnt < raiddisks) {
+                       if (verbose)
+                               fprintf(stderr,
+                                       Name ": ddf: Not enough devices with "
+                                       "space for this array (%d < %d)\n",
+                                       dcnt, raiddisks);
+                       return 0;
+               }
+               return 1;
+       }
+       /* This device must be a member of the set */
+       if (stat(dev, &stb) < 0)
+               return 0;
+       if ((S_IFMT & stb.st_mode) != S_IFBLK)
+               return 0;
+       for (dl = ddf->dlist ; dl ; dl = dl->next) {
+               if (dl->major == major(stb.st_rdev) &&
+                   dl->minor == minor(stb.st_rdev))
+                       break;
+       }
+       if (!dl) {
+               if (verbose)
+                       fprintf(stderr, Name ": ddf: %s is not in the "
+                               "same DDF set\n",
+                               dev);
+               return 0;
+       }
+       e = get_extents(ddf, dl);
+       maxsize = 0;
+       i = 0;
+       /* Find the largest gap between consecutive extents. */
+       if (e) do {
+               unsigned long long esize;
+               esize = e[i].start - pos;
+               if (esize >= maxsize)
+                       maxsize = esize;
+               pos = e[i].start + e[i].size;
+               i++;
+       } while (e[i-1].size);
+       /* Release the extent list, as the general test above does;
+        * free(NULL) is harmless when no list was returned.
+        */
+       free(e);
+       *freesize = maxsize;
+       // FIXME here I am
+
+       return 1;
+}
+
+/*
+ * Load the DDF metadata of the whole container open on 'fd'.
+ *
+ * Every member device listed in sysfs is probed; the one whose active
+ * header has the highest sequence number (reduced by one when the header
+ * is marked open) supplies the global data.  The device-local data is
+ * then loaded from every member.  When 'keep_fd' is set the members are
+ * opened O_RDWR and the descriptors are passed on to load_ddf_local()
+ * instead of being closed here.  'devname' is not used.
+ *
+ * On success *sbp receives the new superblock and 0 is returned.
+ * Returns 2 if a member device could not be opened and 1 on any other
+ * failure.  On failure the superblock structure allocated here is
+ * released; *sbp is left untouched.
+ * NOTE(review): buffers attached to the superblock by the load_ddf_*()
+ * helpers, and 'sra' itself, are not released on any path - confirm the
+ * proper release helpers and use them here.
+ */
+static int load_super_ddf_all(struct supertype *st, int fd,
+                             void **sbp, char *devname, int keep_fd)
+{
+       struct mdinfo *sra;
+       struct ddf_super *super;
+       struct mdinfo *sd, *best = NULL;
+       int bestseq = 0;
+       int seq;
+       char nm[20];
+       int dfd;
+       int devnum = fd2devnum(fd);
+       enum sysfs_read_flags flags;
+
+       flags = GET_LEVEL|GET_VERSION|GET_DEVS|GET_STATE;
+       if (mdmon_running(devnum))
+               flags |= SKIP_GONE_DEVS;
+
+       sra = sysfs_read(fd, 0, flags);
+       if (!sra)
+               return 1;
+       /* Only an external-metadata "ddf" container is acceptable */
+       if (sra->array.major_version != -1 ||
+           sra->array.minor_version != -2 ||
+           strcmp(sra->text_version, "ddf") != 0)
+               return 1;
+
+       if (posix_memalign((void**)&super, 512, sizeof(*super)) != 0)
+               return 1;
+       memset(super, 0, sizeof(*super));
+
+       /* first, try each device, and choose the best ddf */
+       for (sd = sra->devs ; sd ; sd = sd->next) {
+               int rv;
+               snprintf(nm, sizeof(nm), "%d:%d",
+                        sd->disk.major, sd->disk.minor);
+               dfd = dev_open(nm, O_RDONLY);
+               if (dfd < 0) {
+                       free(super);
+                       return 2;
+               }
+               rv = load_ddf_headers(dfd, super, NULL);
+               close(dfd);
+               if (rv == 0) {
+                       seq = __be32_to_cpu(super->active->seq);
+                       if (super->active->openflag)
+                               seq--;
+                       if (!best || seq > bestseq) {
+                               bestseq = seq;
+                               best = sd;
+                       }
+               }
+       }
+       if (!best) {
+               free(super);
+               return 1;
+       }
+       /* OK, load this ddf */
+       snprintf(nm, sizeof(nm), "%d:%d", best->disk.major, best->disk.minor);
+       dfd = dev_open(nm, O_RDONLY);
+       if (dfd < 0) {
+               free(super);
+               return 1;
+       }
+       load_ddf_headers(dfd, super, NULL);
+       load_ddf_global(dfd, super, NULL);
+       close(dfd);
+       /* Now we need the device-local bits */
+       for (sd = sra->devs ; sd ; sd = sd->next) {
+               int rv;
+
+               snprintf(nm, sizeof(nm), "%d:%d",
+                        sd->disk.major, sd->disk.minor);
+               dfd = dev_open(nm, keep_fd? O_RDWR : O_RDONLY);
+               if (dfd < 0) {
+                       free(super);
+                       return 2;
+               }
+               rv = load_ddf_headers(dfd, super, NULL);
+               if (rv) {
+                       /* The descriptor was never handed to
+                        * load_ddf_local(), so it must be closed here
+                        * even when keep_fd is set.
+                        */
+                       close(dfd);
+                       free(super);
+                       return 1;
+               }
+               rv = load_ddf_local(dfd, super, NULL, keep_fd);
+               if (!keep_fd) close(dfd);
+               if (rv) {
+                       free(super);
+                       return 1;
+               }
+       }
+       if (st->subarray[0]) {
+               /* A particular member array was asked for: select it */
+               struct vcl *v;
+               int wanted = atoi(st->subarray);
+
+               for (v = super->conflist; v; v = v->next)
+                       if (v->vcnum == wanted)
+                               super->currentconf = v;
+               if (!super->currentconf) {
+                       free(super);
+                       return 1;
+               }
+       }
+       *sbp = super;
+       if (st->ss == NULL) {
+               st->ss = &super_ddf;
+               st->minor_version = 0;
+               st->max_devs = 512;
+               st->container_dev = fd2devnum(fd);
+       }
+       st->loaded_container = 1;
+       return 0;
+}
+#endif /* MDASSEMBLE */
+
+static struct mdinfo *container_content_ddf(struct supertype *st)
+{
+       /* Describe every array found in the container loaded in st->sb.
+        *
+        * One mdinfo is built per entry of the configuration list and
+        * pushed on the front of the returned list.  For each
+        * phys_refnum slot in use, the matching entry of the device list
+        * (if one has been found so far) gets a device mdinfo which is
+        * pushed on the front of that array's 'devs' list.
+        */
+       struct ddf_super *sb = st->sb;
+       struct mdinfo *head = NULL;
+       struct vcl *cur;
+
+       for (cur = sb->conflist; cur != NULL; cur = cur->next) {
+               struct mdinfo *arr;
+               int vcnum;
+               int slot;
+               int k;
+               int dirty;
+
+               arr = malloc(sizeof(*arr));
+               memset(arr, 0, sizeof(*arr));
+               arr->next = head;
+               head = arr;
+
+               /* Geometry, translated from the DDF configuration record */
+               arr->array.level = map_num1(ddf_level_num, cur->conf.prl);
+               arr->array.raid_disks =
+                       __be16_to_cpu(cur->conf.prim_elmnt_count);
+               arr->array.layout = rlq_to_layout(cur->conf.rlq,
+                                                 cur->conf.prl,
+                                                 arr->array.raid_disks);
+               arr->array.md_minor      = -1;
+               arr->array.major_version = -1;
+               arr->array.minor_version = -2;
+               arr->array.ctime         = DECADE +
+                       __be32_to_cpu(*(__u32*)(cur->conf.guid+16));
+               arr->array.utime         = DECADE +
+                       __be32_to_cpu(cur->conf.timestamp);
+               arr->array.chunk_size    = 512 << cur->conf.chunk_shift;
+
+               /* The array counts as clean only when it is not flagged
+                * inconsistent and is fully initialised.
+                */
+               vcnum = cur->vcnum;
+               dirty = (sb->virt->entries[vcnum].state &
+                        DDF_state_inconsistent) ||
+                       (sb->virt->entries[vcnum].init_state &
+                        DDF_initstate_mask) != DDF_init_full;
+               if (!dirty) {
+                       arr->array.state = 1;
+                       arr->resync_start = ~0ULL;
+               } else {
+                       arr->array.state = 0;
+                       arr->resync_start = 0;
+               }
+
+               /* Copy the 16 byte name, turning spaces into NULs */
+               memcpy(arr->name, sb->virt->entries[vcnum].name, 16);
+               arr->name[16]=0;
+               k = 0;
+               while (k < 16) {
+                       if (arr->name[k] == ' ')
+                               arr->name[k] = 0;
+                       k++;
+               }
+
+               memset(arr->uuid, 0, sizeof(arr->uuid));
+               arr->component_size = __be64_to_cpu(cur->conf.blocks);
+               arr->array.size = arr->component_size / 2;
+               arr->container_member = vcnum;
+
+               /* uuid_from_super_ddf() is called with this
+                * configuration temporarily selected as current.
+                */
+               sb->currentconf = cur;
+               uuid_from_super_ddf(st, arr->uuid);
+               sb->currentconf = NULL;
+
+               sprintf(arr->text_version, "/%s/%d",
+                       devnum2devname(st->container_dev),
+                       arr->container_member);
+
+               for (slot = 0; slot < sb->mppe; slot++) {
+                       struct mdinfo *member;
+                       struct dl *disk;
+
+                       /* An all-ones refnum marks an unused slot */
+                       if (cur->conf.phys_refnum[slot] == 0xFFFFFFFF)
+                               continue;
+
+                       arr->array.working_disks++;
+
+                       disk = sb->dlist;
+                       while (disk != NULL &&
+                              disk->disk.refnum != cur->conf.phys_refnum[slot])
+                               disk = disk->next;
+                       if (disk == NULL)
+                               /* Haven't found that one yet, maybe there are others */
+                               continue;
+
+                       member = malloc(sizeof(*member));
+                       memset(member, 0, sizeof(*member));
+                       member->next = arr->devs;
+                       arr->devs = member;
+
+                       member->disk.number = __be32_to_cpu(disk->disk.refnum);
+                       member->disk.major = disk->major;
+                       member->disk.minor = disk->minor;
+                       member->disk.raid_disk = slot;
+                       member->disk.state =
+                               (1<<MD_DISK_SYNC)|(1<<MD_DISK_ACTIVE);
+
+                       member->events = __be32_to_cpu(sb->primary.seq);
+                       member->data_offset =
+                               __be64_to_cpu(cur->lba_offset[slot]);
+                       member->component_size =
+                               __be64_to_cpu(cur->conf.blocks);
+                       if (disk->devname)
+                               strcpy(member->name, disk->devname);
+               }
+       }
+       return head;
+}
+
+/*
+ * Overwrite the last 512 bytes of the device open on 'fd' with zeros.
+ *
+ * Returns 0 on success.  Returns 1 if the device size cannot be read,
+ * the device is smaller than 512 bytes, the buffer cannot be allocated,
+ * the seek fails, or fewer than 512 bytes are written.
+ */
+static int store_zero_ddf(struct supertype *st, int fd)
+{
+       unsigned long long dsize;
+       void *buf;
+       int rc;
+
+       if (!get_dev_size(fd, NULL, &dsize))
+               return 1;
+       /* dsize-512 below would wrap around on a tiny device */
+       if (dsize < 512)
+               return 1;
+
+       if (posix_memalign(&buf, 512, 512) != 0)
+               return 1;
+       memset(buf, 0, 512);
+
+       /* Never write at whatever offset the descriptor happens to be
+        * at if the seek did not succeed.
+        */
+       if (lseek64(fd, dsize-512, SEEK_SET) < 0) {
+               free(buf);
+               return 1;
+       }
+       rc = write(fd, buf, 512);
+       free(buf);
+       /* A short write leaves part of the old data in place */
+       if (rc != 512)
+               return 1;
+       return 0;
+}
+
+static int compare_super_ddf(struct supertype *st, struct supertype *tst)
+{
+       /*
+        * Compare the metadata in 'tst' against that in 'st'.
+        *
+        * return:
+        *  0 same, or 'st' was empty and took over the metadata of 'tst'
+        *  2 the anchor GUIDs differ
+        */
+       struct ddf_super *mine = st->sb;
+       struct ddf_super *theirs = tst->sb;
+
+       if (mine == NULL) {
+               /* Nothing loaded yet: adopt the candidate's metadata */
+               st->sb = tst->sb;
+               tst->sb = NULL;
+               return 0;
+       }
+
+       /* FIXME should I look at anything else? */
+       return memcmp(mine->anchor.guid, theirs->anchor.guid,
+                     DDF_GUID_LEN) != 0 ? 2 : 0;
+}
+
+#ifndef MDASSEMBLE
+/*
+ * Array 'a' has just been started and claims to be instance 'inst'
+ * of container 'c'.
+ * The array ought to be checked against the metadata in 'c' so that
+ * no metadata gets corrupted; for now only the instance number is
+ * recorded.  Always returns 0.
+ */
+static int ddf_open_new(struct supertype *c, struct active_array *a, char *inst)
+{
+       int member;
+
+       dprintf("ddf: open_new %s\n", inst);
+       member = atoi(inst);
+       a->info.container_member = member;
+       return 0;
+}
+
+/*
+ * Record in the metadata whether array 'a' is clean.
+ * If '->resync_start' is not ~(unsigned long long)0, the array is only
+ * clean up to that point (in sectors).  If that cannot be recorded in
+ * the metadata, the array is left dirty.
+ *
+ * For DDF this means updating the DDF_state_inconsistent bit and the
+ * init_state field in the !global! virtual_disk.virtual_entry structure.
+ * A 'consistent' value of 2 asks for the state to be derived from
+ * is_resync_complete().  The resulting 0/1 value is returned.
+ */
+static int ddf_set_array_state(struct active_array *a, int consistent)
+{
+       struct ddf_super *sb = a->container->sb;
+       int member = a->info.container_member;
+       int before = sb->virt->entries[member].state;
+
+       if (consistent == 2)
+               /* Should check if a recovery should be started FIXME */
+               consistent = is_resync_complete(a) ? 1 : 0;
+
+       if (!consistent)
+               sb->virt->entries[member].state |= DDF_state_inconsistent;
+       else
+               sb->virt->entries[member].state &= ~DDF_state_inconsistent;
+       if (sb->virt->entries[member].state != before)
+               sb->updates_pending = 1;
+
+       /* Now the initialisation state: full, not started, or partial */
+       before = sb->virt->entries[member].init_state;
+       sb->virt->entries[member].init_state &= ~DDF_initstate_mask;
+       if (!is_resync_complete(a)) {
+               if (a->resync_start != 0)
+                       sb->virt->entries[member].init_state |= DDF_init_quick;
+               else
+                       sb->virt->entries[member].init_state |= DDF_init_not;
+       } else
+               sb->virt->entries[member].init_state |= DDF_init_full;
+       if (sb->virt->entries[member].init_state != before)
+               sb->updates_pending = 1;
+
+       dprintf("ddf mark %d %s %llu\n", member, consistent?"clean":"dirty",
+               a->resync_start);
+       return consistent;
+}
+
+/*
+ * The state of each disk is stored in the global phys_disk structure
+ * in phys_disk.entries[n].state.
+ * This makes various combinations awkward.
+ * - When a device fails in any array, it must be failed in all arrays
+ *   that include a part of this device.
+ * - When a component is rebuilding, we cannot include it officially in the
+ *   array unless this is the only array that uses the device.
+ *
+ * So: when transitioning:
+ *   Online -> failed,  just set failed flag.  monitor will propagate
+ *   spare -> online,   the device might need to be added to the array.
+ *   spare -> failed,   just set failed.  Don't worry if in array or not.
+ *
+ * 'n' is the slot of the device within array 'a' and 'state' is a mask
+ * of DS_* flags.  After the physical disk entry has been updated, the
+ * state of the virtual disk (optimal, degraded, ...) is recomputed from
+ * the number of working members.  ddf->updates_pending is set whenever
+ * something in the metadata changed.
+ */
+static void ddf_set_disk(struct active_array *a, int n, int state)
+{
+       struct ddf_super *ddf = a->container->sb;
+       int inst = a->info.container_member;
+       struct vd_config *vc = find_vdcr(ddf, inst);
+       int pd;
+       int i, st, working;
+
+       if (vc == NULL) {
+               dprintf("ddf: cannot find instance %d!!\n", inst);
+               return;
+       }
+       /* vc->phys_refnum may only be looked at now that vc is known
+        * to be valid.
+        */
+       pd = find_phys(ddf, vc->phys_refnum[n]);
+       if (pd < 0) {
+               /* disk doesn't currently exist. If it is now in_sync,
+                * insert it. */
+               if ((state & DS_INSYNC) && ! (state & DS_FAULTY)) {
+                       /* Find dev 'n' in a->info->devs, determine the
+                        * ddf refnum, and set vc->phys_refnum and update
+                        * phys->entries[]
+                        */
+                       /* FIXME */
+               }
+       } else {
+               int old = ddf->phys->entries[pd].state;
+               if (state & DS_FAULTY)
+                       ddf->phys->entries[pd].state  |= __cpu_to_be16(DDF_Failed);
+               if (state & DS_INSYNC) {
+                       ddf->phys->entries[pd].state  |= __cpu_to_be16(DDF_Online);
+                       ddf->phys->entries[pd].state  &= __cpu_to_be16(~DDF_Rebuilding);
+               }
+               if (old != ddf->phys->entries[pd].state)
+                       ddf->updates_pending = 1;
+       }
+
+       dprintf("ddf: set_disk %d to %x\n", n, state);
+
+       /* Now we need to check the state of the array and update
+        * virtual_disk.entries[n].state.
+        * It needs to be one of "optimal", "degraded", "failed".
+        * I don't understand 'deleted' or 'missing'.
+        */
+       working = 0;
+       for (i=0; i < a->info.array.raid_disks; i++) {
+               pd = find_phys(ddf, vc->phys_refnum[i]);
+               if (pd < 0)
+                       continue;
+               st = __be16_to_cpu(ddf->phys->entries[pd].state);
+               /* A member is working when it is online and neither
+                * failed nor rebuilding.
+                */
+               if ((st & (DDF_Online|DDF_Failed|DDF_Rebuilding))
+                   == DDF_Online)
+                       working++;
+       }
+       /* From here on 'state' holds the new DDF_state_* value */
+       state = DDF_state_degraded;
+       if (working == a->info.array.raid_disks)
+               state = DDF_state_optimal;
+       else switch(vc->prl) {
+       case DDF_RAID0:
+       case DDF_CONCAT:
+       case DDF_JBOD:
+               state = DDF_state_failed;
+               break;
+       case DDF_RAID1:
+               if (working == 0)
+                       state = DDF_state_failed;
+               break;
+       case DDF_RAID4:
+       case DDF_RAID5:
+               if (working < a->info.array.raid_disks-1)
+                       state = DDF_state_failed;
+               break;
+       case DDF_RAID6:
+               if (working < a->info.array.raid_disks-2)
+                       state = DDF_state_failed;
+               else if (working == a->info.array.raid_disks-1)
+                       state = DDF_state_part_optimal;
+               break;
+       }
+
+       if (ddf->virt->entries[inst].state !=
+           ((ddf->virt->entries[inst].state & ~DDF_state_mask)
+            | state)) {
+
+               ddf->virt->entries[inst].state =
+                       (ddf->virt->entries[inst].state & ~DDF_state_mask)
+                       | state;
+               ddf->updates_pending = 1;
+       }
+
+}
+
+static void ddf_sync_metadata(struct supertype *st)
+{
+
+       /*
+        * Flush pending metadata changes by writing everything to
+        * every device; nothing is done when no update is pending.
+        * Tracking local versus global changes might allow smaller
+        * writes one day, but ddf is sufficiently weird that global
+        * data probably always changes anyway ....
+        */
+       struct ddf_super *sb = st->sb;
+
+       if (sb->updates_pending) {
+               sb->updates_pending = 0;
+               __write_init_super_ddf(st, 0);
+               dprintf("ddf: sync_metadata\n");
+       }
+}
+
+static void ddf_process_update(struct supertype *st,
+                              struct metadata_update *update)
+{
+       /* Apply this update to the metadata.
+        * The first 4 bytes are a DDF_*_MAGIC which guides
+        * our actions.
+        * Possible update are:
+        *  DDF_PHYS_RECORDS_MAGIC
+        *    Add a new physical device.  Changes to this record
+        *    only happen implicitly.
+        *    used_pdes is the device number.
+        *  DDF_VIRT_RECORDS_MAGIC
+        *    Add a new VD.  Possibly also change the 'access' bits.
+        *    populated_vdes is the entry number.
+        *  DDF_VD_CONF_MAGIC
+        *    New or updated VD.  the VIRT_RECORD must already
+        *    exist.  For an update, phys_refnum and lba_offset
+        *    (at least) are updated, and the VD_CONF must
+        *    be written to precisely those devices listed with
+        *    a phys_refnum.
+        *  DDF_SPARE_ASSIGN_MAGIC
+        *    replacement Spare Assignment Record... but for which device?
+        *
+        * So, e.g.:
+        *  - to create a new array, we send a VIRT_RECORD and
+        *    a VD_CONF.  Then assemble and start the array.
+        *  - to activate a spare we send a VD_CONF to add the phys_refnum
+        *    and offset.  This will also mark the spare as active with
+        *    a spare-assignment record.
+        *
+        * An update of unexpected length, or one that targets a slot
+        * that is out of range or already in use, is silently ignored.
+        */
+       struct ddf_super *ddf = st->sb;
+       __u32 *magic = (__u32*)update->buf;
+       struct phys_disk *pd;
+       struct virtual_disk *vd;
+       struct vd_config *vc;
+       struct vcl *vcl;
+       struct dl *dl;
+       int mppe;
+       int ent;
+
+       dprintf("Process update %x\n", *magic);
+
+       switch (*magic) {
+       case DDF_PHYS_RECORDS_MAGIC:
+
+               /* Exactly one header plus one entry is expected */
+               if (update->len != (sizeof(struct phys_disk) +
+                                   sizeof(struct phys_disk_entry)))
+                       return;
+               pd = (struct phys_disk*)update->buf;
+
+               ent = __be16_to_cpu(pd->used_pdes);
+               if (ent >= __be16_to_cpu(ddf->phys->max_pdes))
+                       return;
+               /* The target slot must still be unused (guid all 0xff) */
+               if (!all_ff(ddf->phys->entries[ent].guid))
+                       return;
+               ddf->phys->entries[ent] = pd->entries[0];
+               ddf->phys->used_pdes = __cpu_to_be16(1 +
+                                          __be16_to_cpu(ddf->phys->used_pdes));
+               ddf->updates_pending = 1;
+               if (ddf->add_list) {
+                       /* Move the first pending device from add_list
+                        * to the front of dlist.
+                        */
+                       struct active_array *a;
+                       struct dl *al = ddf->add_list;
+                       ddf->add_list = al->next;
+
+                       al->next = ddf->dlist;
+                       ddf->dlist = al;
+
+                       /* As a device has been added, we should check
+                        * for any degraded devices that might make
+                        * use of this spare */
+                       for (a = st->arrays ; a; a=a->next)
+                               a->check_degraded = 1;
+               }
+               break;
+
+       case DDF_VIRT_RECORDS_MAGIC:
+
+               /* Exactly one header plus one entry is expected */
+               if (update->len != (sizeof(struct virtual_disk) +
+                                   sizeof(struct virtual_entry)))
+                       return;
+               vd = (struct virtual_disk*)update->buf;
+
+               ent = __be16_to_cpu(vd->populated_vdes);
+               if (ent >= __be16_to_cpu(ddf->virt->max_vdes))
+                       return;
+               /* The target slot must still be unused (guid all 0xff) */
+               if (!all_ff(ddf->virt->entries[ent].guid))
+                       return;
+               ddf->virt->entries[ent] = vd->entries[0];
+               ddf->virt->populated_vdes = __cpu_to_be16(1 +
+                             __be16_to_cpu(ddf->virt->populated_vdes));
+               ddf->updates_pending = 1;
+               break;
+
+       case DDF_VD_CONF_MAGIC:
+               dprintf("len %d %d\n", update->len, ddf->conf_rec_len);
+
+               mppe = __be16_to_cpu(ddf->anchor.max_primary_element_entries);
+               if (update->len != ddf->conf_rec_len * 512)
+                       return;
+               vc = (struct vd_config*)update->buf;
+               /* Look for an existing configuration with the same guid */
+               for (vcl = ddf->conflist; vcl ; vcl = vcl->next)
+                       if (memcmp(vcl->conf.guid, vc->guid, DDF_GUID_LEN) == 0)
+                               break;
+               dprintf("vcl = %p\n", vcl);
+               if (vcl) {
+                       /* An update, just copy the phys_refnum and lba_offset
+                        * fields
+                        */
+                       memcpy(vcl->conf.phys_refnum, vc->phys_refnum,
+                              mppe * (sizeof(__u32) + sizeof(__u64)));
+               } else {
+                       /* A new VD_CONF */
+                       /* The memory for it has to be supplied in
+                        * update->space; without it nothing can be done.
+                        */
+                       if (!update->space)
+                               return;
+                       vcl = update->space;
+                       update->space = NULL;
+                       vcl->next = ddf->conflist;
+                       memcpy(&vcl->conf, vc, update->len);
+                       vcl->lba_offset = (__u64*)
+                               &vcl->conf.phys_refnum[mppe];
+                       ddf->conflist = vcl;
+               }
+               /* Now make sure vlist is correct for each dl. */
+               for (dl = ddf->dlist; dl; dl = dl->next) {
+                       int dn;
+                       int vn = 0;
+                       /* vlist is treated as holding max_part slots (see
+                        * the padding loop below), so stop collecting
+                        * once it is full rather than write past its end.
+                        */
+                       for (vcl = ddf->conflist;
+                            vcl && vn < ddf->max_part;
+                            vcl = vcl->next)
+                               for (dn=0; dn < ddf->mppe ; dn++)
+                                       if (vcl->conf.phys_refnum[dn] ==
+                                           dl->disk.refnum) {
+                                               dprintf("dev %d has %p at %d\n",
+                                                       dl->pdnum, vcl, vn);
+                                               dl->vlist[vn++] = vcl;
+                                               break;
+                                       }
+                       while (vn < ddf->max_part)
+                               dl->vlist[vn++] = NULL;
+                       /* Keep the type flags of the physical disk in
+                        * step with how the device is now used.
+                        */
+                       if (dl->vlist[0]) {
+                               ddf->phys->entries[dl->pdnum].type &=
+                                       ~__cpu_to_be16(DDF_Global_Spare);
+                               ddf->phys->entries[dl->pdnum].type |=
+                                       __cpu_to_be16(DDF_Active_in_VD);
+                       }
+                       if (dl->spare) {
+                               ddf->phys->entries[dl->pdnum].type &=
+                                       ~__cpu_to_be16(DDF_Global_Spare);
+                               ddf->phys->entries[dl->pdnum].type |=
+                                       __cpu_to_be16(DDF_Spare);
+                       }
+                       if (!dl->vlist[0] && !dl->spare) {
+                               ddf->phys->entries[dl->pdnum].type |=
+                                       __cpu_to_be16(DDF_Global_Spare);
+                               ddf->phys->entries[dl->pdnum].type &=
+                                       ~__cpu_to_be16(DDF_Spare |
+                                                      DDF_Active_in_VD);
+                       }
+               }
+               ddf->updates_pending = 1;
+               break;
+       case DDF_SPARE_ASSIGN_MAGIC:
+       default: break;
+       }
+}
+
+static void ddf_prepare_update(struct supertype *st,
+                              struct metadata_update *update)
+{
+       /* Called in managemon for an update that is about to be
+        * handed to monitor; any allocation the update will need
+        * is done here.
+        * Only a DDF_VD_CONF_MAGIC update needs memory: room for a
+        * 'struct vcl' whose 'conf' member holds a full configuration
+        * record.  update->space is set to NULL if that allocation fails.
+        */
+       struct ddf_super *sb = st->sb;
+       __u32 kind = *(__u32*)update->buf;
+       size_t need;
+
+       if (kind != DDF_VD_CONF_MAGIC)
+               return;
+
+       need = offsetof(struct vcl, conf) + sb->conf_rec_len * 512;
+       if (posix_memalign(&update->space, 512, need) == 0)
+               return;
+       update->space = NULL;
+}
+
+/*
+ * Check if the array 'a' is degraded but not failed.
+ * If it is, find as many spares as are available and needed and
+ * arrange for their inclusion.
+ * We only choose devices which are not already in the array (nor
+ * already picked for another slot during this call), and prefer those
+ * with a spare-assignment to this array.
+ * Otherwise we choose global spares - assuming always that
+ * there is enough room.
+ * For each spare that we assign, we return an 'mdinfo' which
+ * describes the position for the device in the array.
+ * We also add to 'updates' a DDF_VD_CONF_MAGIC update with
+ * the new phys_refnum and lba_offset values.
+ *
+ * Returns NULL when there is nothing to do, when no usable spare was
+ * found, or when memory for the update could not be allocated.
+ *
+ * Only worry about BVDs at the moment.
+ */
+static struct mdinfo *ddf_activate_spare(struct active_array *a,
+                                        struct metadata_update **updates)
+{
+       int working = 0;
+       struct mdinfo *d;
+       struct ddf_super *ddf = a->container->sb;
+       int global_ok = 0;
+       struct mdinfo *rv = NULL;
+       struct mdinfo *di;
+       struct metadata_update *mu;
+       struct dl *dl;
+       int i;
+       struct vd_config *vc;
+       __u64 *lba;
+
+       for (d = a->info.devs ; d ; d = d->next) {
+               if ((d->curr_state & DS_FAULTY) &&
+                       d->state_fd >= 0)
+                       /* wait for Removal to happen */
+                       return NULL;
+               if (d->state_fd >= 0)
+                       working ++;
+       }
+
+       dprintf("ddf_activate: working=%d (%d) level=%d\n", working, a->info.array.raid_disks,
+               a->info.array.level);
+       if (working == a->info.array.raid_disks)
+               return NULL; /* array not degraded */
+       switch (a->info.array.level) {
+       case 1:
+               if (working == 0)
+                       return NULL; /* failed */
+               break;
+       case 4:
+       case 5:
+               if (working < a->info.array.raid_disks - 1)
+                       return NULL; /* failed */
+               break;
+       case 6:
+               if (working < a->info.array.raid_disks - 2)
+                       return NULL; /* failed */
+               break;
+       default: /* concat or stripe */
+               return NULL; /* failed */
+       }
+
+       /* For each slot, if it is not working, find a spare */
+       dl = ddf->dlist;
+       for (i = 0; i < a->info.array.raid_disks; i++) {
+               for (d = a->info.devs ; d ; d = d->next)
+                       if (d->disk.raid_disk == i)
+                               break;
+               dprintf("found %d: %p %x\n", i, d, d?d->curr_state:0);
+               if (d && (d->state_fd >= 0))
+                       continue;
+
+               /* OK, this device needs recovery.  Find a spare */
+       again:
+               for ( ; dl ; dl = dl->next) {
+                       unsigned long long esize;
+                       unsigned long long pos;
+                       struct mdinfo *d2;
+                       int is_global = 0;
+                       int is_dedicated = 0;
+                       struct extent *ex;
+                       int j;
+                       /* If in this array, skip */
+                       for (d2 = a->info.devs ; d2 ; d2 = d2->next)
+                               if (d2->disk.major == dl->major &&
+                                   d2->disk.minor == dl->minor) {
+                                       dprintf("%x:%x already in array\n", dl->major, dl->minor);
+                                       break;
+                               }
+                       if (d2)
+                               continue;
+                       /* If already chosen for an earlier slot in this
+                        * call, skip: 'dl' is not advanced after a
+                        * successful pick, so without this the same
+                        * spare would be handed to two slots.
+                        */
+                       for (d2 = rv ; d2 ; d2 = d2->next)
+                               if (d2->disk.major == dl->major &&
+                                   d2->disk.minor == dl->minor)
+                                       break;
+                       if (d2)
+                               continue;
+                       if (ddf->phys->entries[dl->pdnum].type &
+                           __cpu_to_be16(DDF_Spare)) {
+                               /* Check spare assign record */
+                               if (dl->spare) {
+                                       if (dl->spare->type & DDF_spare_dedicated) {
+                                               /* check spare_ents for guid */
+                                               for (j = 0 ;
+                                                    j < __be16_to_cpu(dl->spare->populated);
+                                                    j++) {
+                                                       if (memcmp(dl->spare->spare_ents[j].guid,
+                                                                  ddf->virt->entries[a->info.container_member].guid,
+                                                                  DDF_GUID_LEN) == 0)
+                                                               is_dedicated = 1;
+                                               }
+                                       } else
+                                               is_global = 1;
+                               }
+                       } else if (ddf->phys->entries[dl->pdnum].type &
+                                  __cpu_to_be16(DDF_Global_Spare)) {
+                               is_global = 1;
+                       }
+                       if ( ! (is_dedicated ||
+                               (is_global && global_ok))) {
+                               dprintf("%x:%x not suitable: %d %d\n", dl->major, dl->minor,
+                                      is_dedicated, is_global);
+                               continue;
+                       }
+
+                       /* We are allowed to use this device - is there space?
+                        * We need a->info.component_size sectors */
+                       ex = get_extents(ddf, dl);
+                       if (!ex) {
+                               dprintf("cannot get extents\n");
+                               continue;
+                       }
+                       j = 0; pos = 0;
+                       esize = 0;
+
+                       /* Walk the used extents looking for a gap that is
+                        * large enough.  The walk must use its own index
+                        * 'j': 'i' is the array slot being filled and is
+                        * needed unchanged below.  The entry with size 0
+                        * terminates the list.
+                        */
+                       do {
+                               esize = ex[j].start - pos;
+                               if (esize >= a->info.component_size)
+                                       break;
+                               pos = ex[j].start + ex[j].size;
+                               j++;
+                       } while (ex[j-1].size);
+
+                       free(ex);
+                       if (esize < a->info.component_size) {
+                               dprintf("%x:%x has no room: %llu %llu\n", dl->major, dl->minor,
+                                       esize, a->info.component_size);
+                               /* No room */
+                               continue;
+                       }
+
+                       /* Cool, we have a device with some space at pos */
+                       di = malloc(sizeof(*di));
+                       if (!di)
+                               continue;
+                       memset(di, 0, sizeof(*di));
+                       di->disk.number = i;
+                       di->disk.raid_disk = i;
+                       di->disk.major = dl->major;
+                       di->disk.minor = dl->minor;
+                       di->disk.state = 0;
+                       di->data_offset = pos;
+                       di->component_size = a->info.component_size;
+                       /* remember which physical disk entry this is */
+                       di->container_member = dl->pdnum;
+                       di->next = rv;
+                       rv = di;
+                       dprintf("%x:%x to be %d at %llu\n", dl->major, dl->minor,
+                               i, pos);
+
+                       break;
+               }
+               if (!dl && ! global_ok) {
+                       /* not enough dedicated spares, try global */
+                       global_ok = 1;
+                       dl = ddf->dlist;
+                       goto again;
+               }
+       }
+
+       if (!rv)
+               /* No spares found */
+               return rv;
+       /* Now 'rv' has a list of devices to return.
+        * Create a metadata_update record to update the
+        * phys_refnum and lba_offset values
+        */
+       mu = malloc(sizeof(*mu));
+       if (mu && posix_memalign(&mu->space, 512, sizeof(struct vcl)) != 0) {
+               free(mu);
+               mu = NULL;
+       }
+       if (mu) {
+               mu->buf = malloc(ddf->conf_rec_len * 512);
+               if (!mu->buf) {
+                       free(mu->space);
+                       free(mu);
+                       mu = NULL;
+               }
+       }
+       if (!mu) {
+               /* cannot record the change, so hand back no spares */
+               while (rv) {
+                       struct mdinfo *n = rv->next;
+
+                       free(rv);
+                       rv = n;
+               }
+               return NULL;
+       }
+
+       /* NOTE(review): 'buf' holds conf_rec_len * 512 bytes but 'len' is
+        * set to conf_rec_len - confirm which unit the consumer of this
+        * update expects.
+        */
+       mu->len = ddf->conf_rec_len;
+       mu->next = *updates;
+       vc = find_vdcr(ddf, a->info.container_member);
+       memcpy(mu->buf, vc, ddf->conf_rec_len * 512);
+
+       vc = (struct vd_config*)mu->buf;
+       lba = (__u64*)&vc->phys_refnum[ddf->mppe];
+       for (di = rv ; di ; di = di->next) {
+               /* 'dl' is only a leftover search cursor here (possibly
+                * NULL), so use the pdnum saved for each chosen spare.
+                */
+               vc->phys_refnum[di->disk.raid_disk] =
+                       ddf->phys->entries[di->container_member].refnum;
+               lba[di->disk.raid_disk] = di->data_offset;
+       }
+       *updates = mu;
+       return rv;
+}
+#endif /* MDASSEMBLE */
+
+/* Pick the default layout for a given raid level.
+ * Levels with no default here yield UnSet.
+ */
+static int ddf_level_to_layout(int level)
+{
+       if (level == 0 || level == 1)
+               return 0;
+       if (level == 5)
+               return ALGORITHM_LEFT_SYMMETRIC;
+       if (level == 6)
+               return ALGORITHM_ROTATING_N_CONTINUE;
+       if (level == 10)
+               return 0x102;
+       return UnSet;
+}
+
+/* Method table that plugs the DDF handlers of this file into the
+ * generic 'superswitch' interface.
+ */
+struct superswitch super_ddf = {
+#ifndef        MDASSEMBLE
+       /* handlers left out of MDASSEMBLE builds */
+       .examine_super  = examine_super_ddf,
+       .brief_examine_super = brief_examine_super_ddf,
+       .export_examine_super = export_examine_super_ddf,
+       .detail_super   = detail_super_ddf,
+       .brief_detail_super = brief_detail_super_ddf,
+       .validate_geometry = validate_geometry_ddf,
+       .write_init_super = write_init_super_ddf,
+       .add_to_super   = add_to_super_ddf,
+#endif
+       .match_home     = match_home_ddf,
+       .uuid_from_super= uuid_from_super_ddf,
+       .getinfo_super  = getinfo_super_ddf,
+       .update_super   = update_super_ddf,
+
+       .avail_size     = avail_size_ddf,
+
+       .compare_super  = compare_super_ddf,
+
+       .load_super     = load_super_ddf,
+       .init_super     = init_super_ddf,
+       .store_super    = store_zero_ddf,
+       .free_super     = free_super_ddf,
+       .match_metadata_desc = match_metadata_desc_ddf,
+       .container_content = container_content_ddf,
+       .default_layout = ddf_level_to_layout,
+
+       .external       = 1,
+
+#ifndef MDASSEMBLE
+/* for mdmon */
+       .open_new       = ddf_open_new,
+       .set_array_state= ddf_set_array_state,
+       .set_disk       = ddf_set_disk,
+       .sync_metadata  = ddf_sync_metadata,
+       .process_update = ddf_process_update,
+       .prepare_update = ddf_prepare_update,
+       .activate_spare = ddf_activate_spare,
+#endif
+       .name = "ddf",
+};
diff --git a/super-intel.c b/super-intel.c
new file mode 100644 (file)
index 0000000..d7383fb
--- /dev/null
@@ -0,0 +1,4544 @@
+/*
+ * mdadm - Intel(R) Matrix Storage Manager Support
+ *
+ * Copyright (C) 2002-2008 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#define HAVE_STDINT_H 1
+#include "mdadm.h"
+#include "mdmon.h"
+#include "sha1.h"
+#include "platform-intel.h"
+#include <values.h>
+#include <scsi/sg.h>
+#include <ctype.h>
+#include <dirent.h>
+
+/* MPB == Metadata Parameter Block */
+/* Signature text; get_imsm_version() reads the version string from the
+ * 'sig' field directly after these MPB_SIG_LEN bytes.
+ */
+#define MPB_SIGNATURE "Intel Raid ISM Cfg Sig. "
+#define MPB_SIG_LEN (strlen(MPB_SIGNATURE))
+/* version strings, named after the feature each one introduced */
+#define MPB_VERSION_RAID0 "1.0.00"
+#define MPB_VERSION_RAID1 "1.1.00"
+#define MPB_VERSION_MANY_VOLUMES_PER_ARRAY "1.2.00"
+#define MPB_VERSION_3OR4_DISK_ARRAY "1.2.01"
+#define MPB_VERSION_RAID5 "1.2.02"
+#define MPB_VERSION_5OR6_DISK_ARRAY "1.2.04"
+#define MPB_VERSION_CNG "1.2.06"
+#define MPB_VERSION_ATTRIBS "1.3.00"
+/* sizes of the 'sig' and 'serial'/'volume' byte arrays used below */
+#define MAX_SIGNATURE_LENGTH  32
+#define MAX_RAID_SERIAL_LEN   16
+
+/* bit values, stored little-endian; presumably for the 'attributes'
+ * field of struct imsm_super - TODO confirm against users
+ */
+#define MPB_ATTRIB_CHECKSUM_VERIFY __cpu_to_le32(0x80000000)
+#define MPB_ATTRIB_PM      __cpu_to_le32(0x40000000)
+#define MPB_ATTRIB_2TB     __cpu_to_le32(0x20000000)
+#define MPB_ATTRIB_RAID0   __cpu_to_le32(0x00000001)
+#define MPB_ATTRIB_RAID1   __cpu_to_le32(0x00000002)
+#define MPB_ATTRIB_RAID10  __cpu_to_le32(0x00000004)
+#define MPB_ATTRIB_RAID1E  __cpu_to_le32(0x00000008)
+#define MPB_ATTRIB_RAID5   __cpu_to_le32(0x00000010)
+#define MPB_ATTRIB_RAIDCNG __cpu_to_le32(0x00000020)
+
+/* MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS is the default number of
+ * sectors get_extents() reserves at the end of a disk
+ */
+#define MPB_SECTOR_CNT 418
+#define IMSM_RESERVED_SECTORS 4096
+/* 1 << 11 sectors of 512 bytes make one megabyte */
+#define SECT_PER_MB_SHIFT 11
+
+/* Disk configuration info.
+ * One entry of the disk table embedded in struct imsm_super; the hex
+ * ranges give each field's offset when the entry is the first one of
+ * that table (which starts at 0xD8).
+ */
+#define IMSM_MAX_DEVICES 255
+struct imsm_disk {
+       __u8 serial[MAX_RAID_SERIAL_LEN];/* 0xD8 - 0xE7 ascii serial number */
+       __u32 total_blocks;              /* 0xE8 - 0xEB total blocks */
+       __u32 scsi_id;                   /* 0xEC - 0xEF scsi ID */
+#define SPARE_DISK      __cpu_to_le32(0x01)  /* Spare */
+#define CONFIGURED_DISK __cpu_to_le32(0x02)  /* Member of some RaidDev */
+#define FAILED_DISK     __cpu_to_le32(0x04)  /* Permanent failure */
+#define USABLE_DISK     __cpu_to_le32(0x08)  /* Fully usable unless FAILED_DISK is set */
+       __u32 status;                    /* 0xF0 - 0xF3 bitmask of the flags above */
+       __u32 owner_cfg_num;             /* 0xF4 - 0xF7 which config 0,1,2... owns this disk */
+#define        IMSM_DISK_FILLERS       4
+       __u32 filler[IMSM_DISK_FILLERS]; /* 0xF8 - 0x107 MPB_DISK_FILLERS for future expansion */
+};
+
+/* RAID map configuration infos.
+ * Variable length: disk_ord_tbl[] really holds num_members entries, see
+ * sizeof_imsm_map().  Multi-byte fields are read with __le*_to_cpu().
+ */
+struct imsm_map {
+       __u32 pba_of_lba0;      /* start address of partition */
+       __u32 blocks_per_member;/* blocks per member */
+       __u32 num_data_stripes; /* number of data stripes */
+       __u16 blocks_per_strip; /* printed as chunk size in KiB after halving */
+       __u8  map_state;        /* Normal, Uninitialized, Degraded, Failed */
+#define IMSM_T_STATE_NORMAL 0
+#define IMSM_T_STATE_UNINITIALIZED 1
+#define IMSM_T_STATE_DEGRADED 2
+#define IMSM_T_STATE_FAILED 3
+       __u8  raid_level;       /* one of IMSM_T_RAID*; see get_imsm_raid_level() */
+#define IMSM_T_RAID0 0
+#define IMSM_T_RAID1 1
+#define IMSM_T_RAID5 5         /* since metadata version 1.2.02 ? */
+       __u8  num_members;      /* number of member disks */
+       __u8  num_domains;      /* number of parity domains */
+       __u8  failed_disk_num;  /* valid only when state is degraded */
+       __u8  ddf;
+       __u32 filler[7];        /* expansion area */
+#define IMSM_ORD_REBUILD (1 << 24)
+       __u32 disk_ord_tbl[1];  /* disk_ord_tbl[num_members],
+                                * top byte contains some flags
+                                * (e.g. IMSM_ORD_REBUILD), the low
+                                * three bytes are the disk index
+                                */
+} __attribute__ ((packed));
+
+/* Volume state: migration progress plus one map, or two maps while
+ * migr_state is set (get_imsm_map() locates the second one).
+ */
+struct imsm_vol {
+       __u32 curr_migr_unit;
+       __u32 checkpoint_id;    /* id to access curr_migr_unit */
+       __u8  migr_state;       /* Normal or Migrating */
+/* the MIGR_* values below are values of 'migr_type', not 'migr_state' */
+#define MIGR_INIT 0
+#define MIGR_REBUILD 1
+#define MIGR_VERIFY 2 /* analogous to echo check > sync_action */
+#define MIGR_GEN_MIGR 3
+#define MIGR_STATE_CHANGE 4
+#define MIGR_REPAIR 5
+       __u8  migr_type;        /* Initializing, Rebuilding, ... */
+       __u8  dirty;
+       __u8  fs_state;         /* fast-sync state for CnG (0xff == disabled) */
+       __u16 verify_errors;    /* number of mismatches */
+       __u16 bad_blocks;       /* number of bad blocks during verify */
+       __u32 filler[4];
+       struct imsm_map map[1];
+       /* here comes another one if migr_state */
+} __attribute__ ((packed));
+
+/* One raid device (volume) record.  Variable length because 'vol' ends
+ * in one or two variable-length maps; see sizeof_imsm_dev().
+ */
+struct imsm_dev {
+       __u8  volume[MAX_RAID_SERIAL_LEN]; /* volume name, printed with %.16s */
+       __u32 size_low;         /* low 32 bits of the array size in sectors */
+       __u32 size_high;        /* high 32 bits of the array size in sectors */
+#define DEV_BOOTABLE           __cpu_to_le32(0x01)
+#define DEV_BOOT_DEVICE                __cpu_to_le32(0x02)
+#define DEV_READ_COALESCING    __cpu_to_le32(0x04)
+#define DEV_WRITE_COALESCING   __cpu_to_le32(0x08)
+#define DEV_LAST_SHUTDOWN_DIRTY        __cpu_to_le32(0x10)
+#define DEV_HIDDEN_AT_BOOT     __cpu_to_le32(0x20)
+#define DEV_CURRENTLY_HIDDEN   __cpu_to_le32(0x40)
+#define DEV_VERIFY_AND_FIX     __cpu_to_le32(0x80)
+#define DEV_MAP_STATE_UNINIT   __cpu_to_le32(0x100)
+#define DEV_NO_AUTO_RECOVERY   __cpu_to_le32(0x200)
+#define DEV_CLONE_N_GO         __cpu_to_le32(0x400)
+#define DEV_CLONE_MAN_SYNC     __cpu_to_le32(0x800)
+#define DEV_CNG_MASTER_DISK_NUM        __cpu_to_le32(0x1000)
+       __u32 status;   /* Persistent RaidDev status */
+       __u32 reserved_blocks; /* Reserved blocks at beginning of volume */
+       __u8  migr_priority;
+       __u8  num_sub_vols;
+       __u8  tid;
+       __u8  cng_master_disk;
+       __u16 cache_policy;
+       __u8  cng_state;
+       __u8  cng_sub_state;
+#define IMSM_DEV_FILLERS 10
+       __u32 filler[IMSM_DEV_FILLERS];
+       struct imsm_vol vol;
+} __attribute__ ((packed));
+
+/* The anchor of the metadata: fixed header, then the disk table, then
+ * the raid device records.  Hex ranges are byte offsets from the start.
+ */
+struct imsm_super {
+       __u8 sig[MAX_SIGNATURE_LENGTH]; /* 0x00 - 0x1F */
+       __u32 check_sum;                /* 0x20 - 0x23 MPB Checksum */
+       __u32 mpb_size;                 /* 0x24 - 0x27 Size of MPB */
+       __u32 family_num;               /* 0x28 - 0x2B Checksum from first time this config was written */
+       __u32 generation_num;           /* 0x2C - 0x2F Incremented each time this array's MPB is written */
+       __u32 error_log_size;           /* 0x30 - 0x33 in bytes */
+       __u32 attributes;               /* 0x34 - 0x37 */
+       __u8 num_disks;                 /* 0x38 Number of configured disks */
+       __u8 num_raid_devs;             /* 0x39 Number of configured volumes */
+       __u8 error_log_pos;             /* 0x3A  */
+       __u8 fill[1];                   /* 0x3B */
+       __u32 cache_size;               /* 0x3C - 0x3F in mb */
+       __u32 orig_family_num;          /* 0x40 - 0x43 original family num */
+       __u32 pwr_cycle_count;          /* 0x44 - 0x47 simulated power cycle count for array */
+       __u32 bbm_log_size;             /* 0x48 - 0x4B - size of bad Block Mgmt Log in bytes */
+#define IMSM_FILLERS 35
+       __u32 filler[IMSM_FILLERS];     /* 0x4C - 0xD7 RAID_MPB_FILLERS */
+       struct imsm_disk disk[1];       /* 0xD8 diskTbl[numDisks] */
+       /* here comes imsm_dev[num_raid_devs] */
+       /* here comes BBM logs */
+} __attribute__ ((packed));
+
+/* number of entries in the mapped_block_entries[] array of struct bbm_log */
+#define BBM_LOG_MAX_ENTRIES 254
+
+/* One entry of the bad block management (BBM) log.
+ * NOTE(review): field meanings are taken from their names only.
+ */
+struct bbm_log_entry {
+       __u64 defective_block_start;
+#define UNREADABLE 0xFFFFFFFF
+       __u32 spare_block_offset;
+       __u16 remapped_marked_count;
+       __u16 disk_ordinal;
+} __attribute__ ((__packed__));
+
+/* Bad block management log: a small header followed by a fixed-size
+ * array of BBM_LOG_MAX_ENTRIES entries.
+ */
+struct bbm_log {
+       __u32 signature; /* 0xABADB10C */
+       __u32 entry_count;
+       __u32 reserved_spare_block_count; /* 0 */
+       __u32 reserved; /* 0xFFFF */
+       __u64 first_spare_lba;
+       struct bbm_log_entry mapped_block_entries[BBM_LOG_MAX_ENTRIES];
+} __attribute__ ((__packed__));
+
+
+#ifndef MDASSEMBLE
+static char *map_state_str[] = { "normal", "uninitialized", "degraded", "failed" };
+#endif
+
+/* Effective migration type of 'dev': a verify that carries the
+ * DEV_VERIFY_AND_FIX status bit is reported as MIGR_REPAIR, anything
+ * else is reported as stored.
+ */
+static __u8 migr_type(struct imsm_dev *dev)
+{
+       __u8 type = dev->vol.migr_type;
+
+       if (type == MIGR_VERIFY && (dev->status & DEV_VERIFY_AND_FIX))
+               type = MIGR_REPAIR;
+
+       return type;
+}
+
+/* Store a migration type in 'dev'.
+ * Older oroms do not know MIGR_REPAIR, so for compatibility it is
+ * written as MIGR_VERIFY with the DEV_VERIFY_AND_FIX status bit set;
+ * every other type is stored as given with that bit cleared.
+ */
+static void set_migr_type(struct imsm_dev *dev, __u8 migr_type)
+{
+       int repair = (migr_type == MIGR_REPAIR);
+
+       dev->vol.migr_type = repair ? MIGR_VERIFY : migr_type;
+       if (repair)
+               dev->status |= DEV_VERIFY_AND_FIX;
+       else
+               dev->status &= ~DEV_VERIFY_AND_FIX;
+}
+
+/* Number of 512-byte sectors needed to hold 'bytes' bytes, rounding up.
+ * Written as divide-then-adjust so that values within 511 of the __u32
+ * maximum do not wrap around to a tiny count, as 'bytes + 511' would.
+ */
+static unsigned int sector_count(__u32 bytes)
+{
+       return bytes / 512 + (bytes % 512 ? 1 : 0);
+}
+
+/* Size of the metadata described by 'mpb', in 512-byte sectors */
+static unsigned int mpb_sectors(struct imsm_super *mpb)
+{
+       __u32 bytes = __le32_to_cpu(mpb->mpb_size);
+
+       return sector_count(bytes);
+}
+
+/* Node of the intel_super 'devlist': ties a raid device record to its
+ * index; get_imsm_dev() searches this list by index.
+ */
+struct intel_dev {
+       struct imsm_dev *dev;
+       struct intel_dev *next;
+       int index;
+};
+
+/* internal representation of IMSM metadata */
+struct intel_super {
+       /* two views of the same pointer */
+       union {
+               void *buf; /* O_DIRECT buffer for reading/writing metadata */
+               struct imsm_super *anchor; /* immovable parameters */
+       };
+       size_t len; /* size of the 'buf' allocation */
+       void *next_buf; /* for realloc'ing buf from the manager */
+       size_t next_len;
+       int updates_pending; /* count of pending updates for mdmon */
+       int creating_imsm; /* flag to indicate container creation */
+       int current_vol; /* index of raid device undergoing creation */
+       __u32 create_offset; /* common start for 'current_vol' */
+       struct intel_dev *devlist; /* raid device records, looked up by index */
+       /* one node per physical disk */
+       struct dl {
+               struct dl *next;
+               int index; /* -1 is treated as a spare by imsm_reserved_sectors() */
+               __u8 serial[MAX_RAID_SERIAL_LEN];
+               int major, minor;
+               char *devname;
+               struct imsm_disk disk;
+               int fd;
+               int extent_cnt;
+               struct extent *e; /* for determining freespace @ create */
+               int raiddisk; /* slot to fill in autolayout */
+       } *disks;
+       struct dl *add; /* list of disks to add while mdmon active */
+       struct dl *missing; /* disks removed while we weren't looking */
+       struct bbm_log *bbm_log;
+       const char *hba; /* device path of the raid controller for this metadata */
+       const struct imsm_orom *orom; /* platform firmware support */
+};
+
+/* A used region of a disk, in sectors.  get_extents() returns an array
+ * of these whose last element has size 0.
+ */
+struct extent {
+       unsigned long long start, size;
+};
+
+/* definition of messages passed to imsm_process_update;
+ * each imsm_update_* structure below starts with one of these values
+ */
+enum imsm_update_type {
+       update_activate_spare,
+       update_create_array,
+       update_add_disk,
+};
+
+/* Update message for activating a spare.
+ * NOTE(review): presumably 'dl' is the disk to use, 'slot' its position
+ * and 'array' the raid device index - confirm against the consumer.
+ */
+struct imsm_update_activate_spare {
+       enum imsm_update_type type;
+       struct dl *dl;
+       int slot;
+       int array;
+       struct imsm_update_activate_spare *next;
+};
+
+/* Serial number of one disk; a list of these trails a create-array
+ * update (see get_disk_info()).
+ */
+struct disk_info {
+       __u8 serial[MAX_RAID_SERIAL_LEN];
+};
+
+/* Update message for creating an array.  'dev' is variable length;
+ * get_disk_info() finds the disk_info list that follows it.
+ */
+struct imsm_update_create_array {
+       enum imsm_update_type type;
+       int dev_idx;
+       struct imsm_dev dev;
+};
+
+/* Update message for adding a disk; carries only the message type */
+struct imsm_update_add_disk {
+       enum imsm_update_type type;
+};
+
+/* Build a supertype for the imsm format when 'arg' names it.
+ * 'arg' must be "imsm" or "default"; anything else yields NULL.
+ * NULL is also returned when memory cannot be allocated.
+ * The returned structure is malloc'ed and owned by the caller.
+ */
+static struct supertype *match_metadata_desc_imsm(char *arg)
+{
+       struct supertype *st;
+
+       if (strcmp(arg, "imsm") != 0 &&
+           strcmp(arg, "default") != 0
+               )
+               return NULL;
+
+       st = malloc(sizeof(*st));
+       if (!st)
+               /* do not hand a NULL pointer to memset */
+               return NULL;
+       memset(st, 0, sizeof(*st));
+       st->ss = &super_imsm;
+       st->max_devs = IMSM_MAX_DEVICES;
+       st->minor_version = 0;
+       st->sb = NULL;
+       return st;
+}
+
+#ifndef MDASSEMBLE
+/* The version text sits in 'sig' right behind the signature string */
+static __u8 *get_imsm_version(struct imsm_super *mpb)
+{
+       __u8 *sig = mpb->sig;
+
+       return sig + MPB_SIG_LEN;
+}
+#endif 
+
+/* Look a disk up straight in the anchor's disk table.  Only valid while
+ * the anchor is known to be up-to-date, currently only at load time.
+ * Yields NULL when 'index' is not below num_disks.
+ */
+static struct imsm_disk *__get_imsm_disk(struct imsm_super *mpb, __u8 index)
+{
+       return index < mpb->num_disks ? &mpb->disk[index] : NULL;
+}
+
+#ifndef MDASSEMBLE
+/* Find the disk with the given index in the parsed disk list of
+ * 'super'; NULL when no list entry carries that index.
+ */
+static struct imsm_disk *get_imsm_disk(struct intel_super *super, __u8 index)
+{
+       struct dl *cur = super->disks;
+
+       while (cur && cur->index != index)
+               cur = cur->next;
+
+       return cur ? &cur->disk : NULL;
+}
+#endif
+
+/* generate a checksum directly from the anchor when the anchor is known to be
+ * up-to-date, currently only at load or write_super after coalescing
+ *
+ * The checksum is the sum of all 32-bit little-endian words of the mpb,
+ * leaving out the stored check_sum word itself.
+ */
+static __u32 __gen_imsm_checksum(struct imsm_super *mpb)
+{
+       /* mpb_size is stored little-endian like every other field, so
+        * convert it before use, as mpb_sectors() does
+        */
+       __u32 end = __le32_to_cpu(mpb->mpb_size) / sizeof(end);
+       __u32 *p = (__u32 *) mpb;
+       __u32 sum = 0;
+
+       while (end--) {
+               sum += __le32_to_cpu(*p);
+               p++;
+       }
+
+       return sum - __le32_to_cpu(mpb->check_sum);
+}
+
+/* Bytes occupied by 'map': the structure already holds one
+ * disk_ord_tbl entry, the remaining num_members - 1 follow it.
+ */
+static size_t sizeof_imsm_map(struct imsm_map *map)
+{
+       size_t extra_ords = sizeof(__u32) * (map->num_members - 1);
+
+       return sizeof(*map) + extra_ords;
+}
+
+/* Return the first map of 'dev', or - when 'second_map' is non-zero -
+ * the map stored directly behind it.  The second map only exists while
+ * migr_state is set; NULL is returned for it otherwise.
+ */
+struct imsm_map *get_imsm_map(struct imsm_dev *dev, int second_map)
+{
+       struct imsm_map *first = &dev->vol.map[0];
+
+       if (!second_map)
+               return first;
+       if (!dev->vol.migr_state)
+               return NULL;
+
+       return (void *) first + sizeof_imsm_map(first);
+}
+
+/* Size in bytes of the raid device record 'dev'.
+ * A record that is migrating carries a second map, which is counted.
+ * For a record that is not, a non-zero 'migr_state' asks for the size
+ * it would have if map[0] were duplicated.
+ */
+static size_t sizeof_imsm_dev(struct imsm_dev *dev, int migr_state)
+{
+       struct imsm_map *extra = NULL;
+       size_t size = sizeof(*dev) - sizeof(struct imsm_map);
+
+       size += sizeof_imsm_map(get_imsm_map(dev, 0));
+
+       /* migrating means an additional map */
+       if (dev->vol.migr_state)
+               extra = get_imsm_map(dev, 1);
+       else if (migr_state)
+               extra = get_imsm_map(dev, 0);
+
+       if (extra)
+               size += sizeof_imsm_map(extra);
+
+       return size;
+}
+
+#ifndef MDASSEMBLE
+/* Locate the disk serial number list of a create-array update: it
+ * starts right behind the variable-length 'dev' member.
+ */
+static struct disk_info *get_disk_info(struct imsm_update_create_array *update)
+{
+       size_t header = sizeof(*update) - sizeof(struct imsm_dev);
+       void *base = update;
+
+       return base + header + sizeof_imsm_dev(&update->dev, 0);
+}
+#endif
+
+/* Find raid device record number 'index' inside the anchor.
+ * The records are variable length, so every record before the wanted
+ * one has to be stepped over.  NULL when 'index' is not below
+ * num_raid_devs.
+ */
+static struct imsm_dev *__get_imsm_dev(struct imsm_super *mpb, __u8 index)
+{
+       void *base = mpb;
+       int off;
+       int n;
+
+       if (index >= mpb->num_raid_devs)
+               return NULL;
+
+       /* devices start after all disks */
+       off = ((void *) &mpb->disk[mpb->num_disks]) - base;
+
+       for (n = 0; n < index; n++)
+               off += sizeof_imsm_dev(base + off, 0);
+
+       return base + off;
+}
+
+/* Find raid device 'index' in the parsed device list of 'super'.
+ * NULL when 'index' is not below num_raid_devs or the list has no
+ * entry with that index.
+ */
+static struct imsm_dev *get_imsm_dev(struct intel_super *super, __u8 index)
+{
+       struct intel_dev *node;
+
+       if (index >= super->anchor->num_raid_devs)
+               return NULL;
+
+       node = super->devlist;
+       while (node && node->index != index)
+               node = node->next;
+
+       return node ? node->dev : NULL;
+}
+
+/* Read the disk_ord_tbl entry for 'slot' in cpu byte order.  The second
+ * map is consulted while the device is migrating, the first otherwise.
+ */
+static __u32 get_imsm_ord_tbl_ent(struct imsm_dev *dev, int slot)
+{
+       int use_second = dev->vol.migr_state ? 1 : 0;
+       struct imsm_map *map = get_imsm_map(dev, use_second);
+
+       /* top byte identifies disk under rebuild */
+       return __le32_to_cpu(map->disk_ord_tbl[slot]);
+}
+
+#define ord_to_idx(ord) (((ord) << 8) >> 8)
+/* Disk index held in 'slot' of the ord table, flag byte stripped */
+static __u32 get_imsm_disk_idx(struct imsm_dev *dev, int slot)
+{
+       __u32 entry = get_imsm_ord_tbl_ent(dev, slot);
+       __u32 idx = ord_to_idx(entry);
+
+       return idx;
+}
+
+/* Write 'ord' into 'slot' of the ord table of 'map', little-endian */
+static void set_imsm_ord_tbl_ent(struct imsm_map *map, int slot, __u32 ord)
+{
+       __u32 le_ord = __cpu_to_le32(ord);
+
+       map->disk_ord_tbl[slot] = le_ord;
+}
+
+/* Which slot of 'map' refers to disk index 'idx'?
+ * Returns the first matching slot, or -1 when the disk is not a member.
+ */
+static int get_imsm_disk_slot(struct imsm_map *map, int idx)
+{
+       int slot = 0;
+
+       while (slot < map->num_members) {
+               __u32 ord = __le32_to_cpu(map->disk_ord_tbl[slot]);
+
+               if (ord_to_idx(ord) == idx)
+                       return slot;
+               slot++;
+       }
+
+       return -1;
+}
+
+/* Raid level of 'map' as reported to the rest of mdadm: a stored level
+ * 1 with other than two members is reported as 10, everything else is
+ * passed through.
+ */
+static int get_imsm_raid_level(struct imsm_map *map)
+{
+       if (map->raid_level != 1)
+               return map->raid_level;
+
+       return map->num_members == 2 ? 1 : 10;
+}
+
+/* qsort() comparison: order extents by ascending start */
+static int cmp_extent(const void *av, const void *bv)
+{
+       const struct extent *lhs = av;
+       const struct extent *rhs = bv;
+
+       /* yields -1, 0 or 1 */
+       return (lhs->start > rhs->start) - (lhs->start < rhs->start);
+}
+
+/* Count the raid devices of 'super' whose first map has a slot for
+ * disk 'dl'.
+ */
+static int count_memberships(struct dl *dl, struct intel_super *super)
+{
+       int vol;
+       int found = 0;
+
+       for (vol = 0; vol < super->anchor->num_raid_devs; vol++) {
+               struct imsm_map *map = get_imsm_map(get_imsm_dev(super, vol), 0);
+
+               if (get_imsm_disk_slot(map, dl->index) < 0)
+                       continue;
+               found++;
+       }
+
+       return found;
+}
+
+/* Build the list of used extents on physical device 'dl'.
+ * One entry is produced per raid device the disk belongs to, sorted by
+ * start; a final entry with size 0 marks where the area kept for
+ * metadata begins.  The array is malloc'ed and owned by the caller;
+ * NULL means the allocation failed.
+ */
+static struct extent *get_extents(struct intel_super *super, struct dl *dl)
+{
+       struct extent *list, *cur;
+       int vol;
+       int nr_used = count_memberships(dl, super);
+       __u32 reservation = MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS;
+
+       /* one slot per membership plus the terminating entry */
+       list = malloc(sizeof(struct extent) * (nr_used + 1));
+       if (!list)
+               return NULL;
+       cur = list;
+
+       for (vol = 0; vol < super->anchor->num_raid_devs; vol++) {
+               struct imsm_dev *dev = get_imsm_dev(super, vol);
+               struct imsm_map *map = get_imsm_map(dev, 0);
+
+               if (get_imsm_disk_slot(map, dl->index) < 0)
+                       continue;
+               cur->start = __le32_to_cpu(map->pba_of_lba0);
+               cur->size = __le32_to_cpu(map->blocks_per_member);
+               cur++;
+       }
+       qsort(list, nr_used, sizeof(*list), cmp_extent);
+
+       /* determine the start of the metadata
+        * when no raid devices are defined use the default
+        * ...otherwise allow the metadata to truncate the value
+        * as is the case with older versions of imsm
+        */
+       if (nr_used) {
+               struct extent *last = &list[nr_used - 1];
+               __u32 remainder;
+
+               remainder = __le32_to_cpu(dl->disk.total_blocks) -
+                           (last->start + last->size);
+               /* round down to 1k block to satisfy precision of the kernel
+                * 'size' interface
+                */
+               remainder &= ~1UL;
+               /* make sure remainder is still sane */
+               if (remainder < ROUND_UP(super->len, 512) >> 9)
+                       remainder = ROUND_UP(super->len, 512) >> 9;
+               if (reservation > remainder)
+                       reservation = remainder;
+       }
+
+       /* terminating entry: start of the reserved area, size 0 */
+       cur->start = __le32_to_cpu(dl->disk.total_blocks) - reservation;
+       cur->size = 0;
+       return list;
+}
+
+/* How many sectors at the end of 'dl' are reserved for metadata?
+ * The answer is taken from the terminating get_extents() entry; when
+ * that list cannot be obtained the default reservation is returned.
+ */
+static __u32 imsm_reserved_sectors(struct intel_super *super, struct dl *dl)
+{
+       struct extent *list, *tail;
+       __u32 reserved;
+
+       /* for spares just return a minimal reservation which will grow
+        * once the spare is picked up by an array
+        */
+       if (dl->index == -1)
+               return MPB_SECTOR_CNT;
+
+       list = get_extents(super, dl);
+       if (!list)
+               return MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS;
+
+       /* the entry with size 0 is the last one */
+       for (tail = list; tail->size; tail++)
+               ;
+
+       reserved = __le32_to_cpu(dl->disk.total_blocks) - tail->start;
+
+       free(list);
+
+       return reserved;
+}
+
+#ifndef MDASSEMBLE
+/* Name an on-disk map_state value.  The value is a byte read from the
+ * metadata, so anything beyond the known states gets a placeholder
+ * instead of indexing past the end of map_state_str[].
+ */
+static const char *imsm_map_state_name(__u8 state)
+{
+       if (state >= sizeof(map_state_str) / sizeof(map_state_str[0]))
+               return "<unknown>";
+       return map_state_str[state];
+}
+
+/* Print a description of raid device 'dev' to stdout.
+ * 'uuid' is the text shown on the UUID line; 'disk_idx' is the index of
+ * the disk being examined, used to report which slot it occupies.
+ */
+static void print_imsm_dev(struct imsm_dev *dev, char *uuid, int disk_idx)
+{
+       __u64 sz;
+       int slot;
+       struct imsm_map *map = get_imsm_map(dev, 0);
+       __u32 ord;
+
+       printf("\n");
+       printf("[%.16s]:\n", dev->volume);
+       printf("           UUID : %s\n", uuid);
+       printf("     RAID Level : %d\n", get_imsm_raid_level(map));
+       printf("        Members : %d\n", map->num_members);
+       slot = get_imsm_disk_slot(map, disk_idx);
+       if (slot >= 0) {
+               ord = get_imsm_ord_tbl_ent(dev, slot);
+               printf("      This Slot : %d%s\n", slot,
+                      ord & IMSM_ORD_REBUILD ? " (out-of-sync)" : "");
+       } else
+               printf("      This Slot : ?\n");
+       /* the array size is stored as two 32-bit halves */
+       sz = __le32_to_cpu(dev->size_high);
+       sz <<= 32;
+       sz += __le32_to_cpu(dev->size_low);
+       printf("     Array Size : %llu%s\n", (unsigned long long)sz,
+              human_size(sz * 512));
+       sz = __le32_to_cpu(map->blocks_per_member);
+       printf("   Per Dev Size : %llu%s\n", (unsigned long long)sz,
+              human_size(sz * 512));
+       printf("  Sector Offset : %u\n",
+               __le32_to_cpu(map->pba_of_lba0));
+       printf("    Num Stripes : %u\n",
+               __le32_to_cpu(map->num_data_stripes));
+       printf("     Chunk Size : %u KiB\n",
+               __le16_to_cpu(map->blocks_per_strip) / 2);
+       printf("       Reserved : %d\n", __le32_to_cpu(dev->reserved_blocks));
+       /* "idle" ends the line here, "migrating" is completed below */
+       printf("  Migrate State : %s", dev->vol.migr_state ? "migrating" : "idle\n");
+       if (dev->vol.migr_state) {
+               switch (migr_type(dev)) {
+               case MIGR_INIT:
+                       printf(": initializing\n");
+                       break;
+               case MIGR_REBUILD:
+                       printf(": rebuilding\n");
+                       break;
+               case MIGR_VERIFY:
+                       printf(": check\n");
+                       break;
+               case MIGR_GEN_MIGR:
+                       printf(": general migration\n");
+                       break;
+               case MIGR_STATE_CHANGE:
+                       printf(": state change\n");
+                       break;
+               case MIGR_REPAIR:
+                       printf(": repair\n");
+                       break;
+               default:
+                       printf(": <unknown:%d>\n", migr_type(dev));
+                       break;
+               }
+       }
+       printf("      Map State : %s", imsm_map_state_name(map->map_state));
+       if (dev->vol.migr_state) {
+               /* while migrating also show the state of the second map */
+               struct imsm_map *second = get_imsm_map(dev, 1);
+
+               printf(" <-- %s", imsm_map_state_name(second->map_state));
+       }
+       printf("\n");
+       printf("    Dirty State : %s\n", dev->vol.dirty ? "dirty" : "clean");
+}
+
+/* Print the serial number, state flags, scsi id and usable size of the
+ * disk stored at position 'index' of the disk table in 'mpb'.
+ * 'reserved' is the number of sectors subtracted from the disk's
+ * total_blocks to obtain the usable size.  A negative 'index', or one
+ * for which no disk entry can be found, prints nothing.
+ */
+static void print_imsm_disk(struct imsm_super *mpb, int index, __u32 reserved)
+{
+       struct imsm_disk *disk;
+       char str[MAX_RAID_SERIAL_LEN + 1];
+       __u32 s;
+       __u64 sz;
+
+       if (index < 0)
+               return;
+
+       disk = __get_imsm_disk(mpb, index);
+       if (!disk)
+               return;
+
+       printf("\n");
+       /* serial[] is presumably a fixed MAX_RAID_SERIAL_LEN byte field
+        * that need not be NUL terminated, so bound the read with an
+        * explicit precision instead of relying on a terminator
+        */
+       snprintf(str, sizeof(str), "%.*s", (int) MAX_RAID_SERIAL_LEN,
+                (char *) disk->serial);
+       printf("  Disk%02d Serial : %s\n", index, str);
+       s = disk->status;
+       printf("          State :%s%s%s%s\n", s&SPARE_DISK ? " spare" : "",
+                                             s&CONFIGURED_DISK ? " active" : "",
+                                             s&FAILED_DISK ? " failed" : "",
+                                             s&USABLE_DISK ? " usable" : "");
+       printf("             Id : %08x\n", __le32_to_cpu(disk->scsi_id));
+       sz = __le32_to_cpu(disk->total_blocks) - reserved;
+       printf("    Usable Size : %llu%s\n", (unsigned long long)sz,
+              human_size(sz * 512));
+}
+
+static void getinfo_super_imsm(struct supertype *st, struct mdinfo *info);
+
+/* Print a detailed, human readable report of the imsm metadata loaded
+ * in 'st': the anchor header fields, the disk this metadata was read
+ * from, the bad block management log (when one is present), every raid
+ * device and finally the remaining disks.  'homehost' is not used.
+ *
+ * Note: super->current_vol is left pointing at the last raid device
+ * that was printed; it is not restored.
+ */
+static void examine_super_imsm(struct supertype *st, char *homehost)
+{
+       struct intel_super *super = st->sb;
+       struct imsm_super *mpb = super->anchor;
+       char str[MAX_SIGNATURE_LENGTH];
+       int i;
+       struct mdinfo info;
+       char nbuf[64];
+       __u32 sum;
+       __u32 reserved = imsm_reserved_sectors(super, super->disks);
+
+       snprintf(str, MPB_SIG_LEN, "%s", mpb->sig);
+       printf("          Magic : %s\n", str);
+       printf("        Version : %s\n", get_imsm_version(mpb));
+       printf("         Family : %08x\n", __le32_to_cpu(mpb->family_num));
+       printf("     Generation : %08x\n", __le32_to_cpu(mpb->generation_num));
+       getinfo_super_imsm(st, &info);
+       fname_from_uuid(st, &info, nbuf, ':');
+       printf("           UUID : %s\n", nbuf + 5);
+       sum = __le32_to_cpu(mpb->check_sum);
+       printf("       Checksum : %08x %s\n", sum,
+               __gen_imsm_checksum(mpb) == sum ? "correct" : "incorrect");
+       printf("    MPB Sectors : %d\n", mpb_sectors(mpb));
+       printf("          Disks : %d\n", mpb->num_disks);
+       printf("   RAID Devices : %d\n", mpb->num_raid_devs);
+       print_imsm_disk(mpb, super->disks->index, reserved);
+       if (super->bbm_log) {
+               struct bbm_log *log = super->bbm_log;
+
+               printf("\n");
+               printf("Bad Block Management Log:\n");
+               printf("       Log Size : %d\n", __le32_to_cpu(mpb->bbm_log_size));
+               printf("      Signature : %x\n", __le32_to_cpu(log->signature));
+               printf("    Entry Count : %d\n", __le32_to_cpu(log->entry_count));
+               printf("   Spare Blocks : %d\n",  __le32_to_cpu(log->reserved_spare_block_count));
+               printf("    First Spare : %llx\n",
+                      (unsigned long long) __le64_to_cpu(log->first_spare_lba));
+       }
+       for (i = 0; i < mpb->num_raid_devs; i++) {
+               struct imsm_dev *dev = __get_imsm_dev(mpb, i);
+
+               /* reuse 'info' from the call above rather than a fresh,
+                * uninitialised structure: the per-volume getinfo path
+                * reads info->disk.raid_disk before assigning it
+                */
+               super->current_vol = i;
+               getinfo_super_imsm(st, &info);
+               fname_from_uuid(st, &info, nbuf, ':');
+               print_imsm_dev(dev, nbuf + 5, super->disks->index);
+       }
+       for (i = 0; i < mpb->num_disks; i++) {
+               /* the disk we were loaded from was already printed above */
+               if (i == super->disks->index)
+                       continue;
+               print_imsm_disk(mpb, i, reserved);
+       }
+}
+
+/* Print ARRAY lines describing this imsm metadata: one generic line
+ * for the container followed by one entry per member volume.  Nothing
+ * is printed when the anchor lists no raid devices.  'verbose' is not
+ * consulted.  super->current_vol is advanced while walking the volumes
+ * and is not restored.
+ */
+static void brief_examine_super_imsm(struct supertype *st, int verbose)
+{
+       struct intel_super *super = st->sb;
+       struct mdinfo info;
+       char container_uuid[64];
+       char volume_uuid[64];
+       int vol;
+
+       if (super->anchor->num_raid_devs == 0)
+               return;
+
+       /* container entry */
+       getinfo_super_imsm(st, &info);
+       fname_from_uuid(st, &info, container_uuid, ':');
+       printf("ARRAY metadata=imsm auto=md UUID=%s\n", container_uuid + 5);
+
+       /* one entry per volume, each referring back to the container */
+       vol = 0;
+       while (vol < super->anchor->num_raid_devs) {
+               struct imsm_dev *dev = get_imsm_dev(super, vol);
+
+               super->current_vol = vol;
+               getinfo_super_imsm(st, &info);
+               fname_from_uuid(st, &info, volume_uuid, ':');
+               printf("ARRAY /dev/md/%.16s container=%s\n"
+                      "   member=%d auto=mdp UUID=%s\n",
+                      dev->volume, container_uuid + 5, vol, volume_uuid + 5);
+               vol++;
+       }
+}
+
+/* Print the metadata type, level, uuid and disk count of the loaded
+ * imsm metadata as KEY=value lines on stdout.
+ */
+static void export_examine_super_imsm(struct supertype *st)
+{
+       struct intel_super *super = st->sb;
+       struct mdinfo container;
+       char uuid_text[64];
+
+       getinfo_super_imsm(st, &container);
+       fname_from_uuid(st, &container, uuid_text, ':');
+
+       printf("MD_METADATA=imsm\n");
+       printf("MD_LEVEL=container\n");
+       printf("MD_UUID=%s\n", uuid_text + 5);
+       printf("MD_DEVICES=%u\n", super->anchor->num_disks);
+}
+
+/* Print the uuid reported by getinfo_super_imsm() for 'st' on a line
+ * of its own, preceded by a blank line.  'homehost' is not used.
+ */
+static void detail_super_imsm(struct supertype *st, char *homehost)
+{
+       struct mdinfo details;
+       char uuid_text[64];
+
+       getinfo_super_imsm(st, &details);
+       fname_from_uuid(st, &details, uuid_text, ':');
+
+       printf("\n           UUID : %s\n", uuid_text + 5);
+}
+
+/* Print " UUID=<uuid>" for 'st' without a trailing newline so that it
+ * can be appended to a line the caller is building.
+ */
+static void brief_detail_super_imsm(struct supertype *st)
+{
+       struct mdinfo details;
+       char uuid_text[64];
+
+       getinfo_super_imsm(st, &details);
+       fname_from_uuid(st, &details, uuid_text, ':');
+
+       printf(" UUID=%s", uuid_text + 5);
+}
+
+static int imsm_read_serial(int fd, char *devname, __u8 *serial);
+static void fd2devname(int fd, char *name);
+
+/* copy 'src' into 'dst' (capacity 'dst_len' bytes), always NUL
+ * terminate and strip trailing white space without ever stepping in
+ * front of the buffer, even when the string is empty or all blanks
+ */
+static void imsm_copy_trimmed(char *dst, size_t dst_len, const char *src)
+{
+       size_t len;
+
+       strncpy(dst, src, dst_len - 1);
+       dst[dst_len - 1] = '\0';
+       len = strlen(dst);
+       while (len > 0 && isspace((unsigned char) dst[len - 1]))
+               dst[--len] = '\0';
+}
+
+static int imsm_enumerate_ports(const char *hba_path, int port_count, int host_base, int verbose)
+{
+       /* dump an unsorted list of devices attached to ahci, as well as
+        * non-connected ports
+        *
+        * hba_path:   sysfs path of the controller whose devices are listed
+        * port_count: number of ports to report on
+        * host_base:  scsi host number that corresponds to port 0
+        * verbose:    when set, failures are reported on stderr
+        *
+        * returns 0 on success and 2 on any failure
+        */
+       int hba_len = strlen(hba_path) + 1;
+       const size_t mask_bits = sizeof(unsigned long) * 8;
+       struct dirent *ent;
+       DIR *dir;
+       char *path = NULL;
+       int err = 0;
+       unsigned long port_mask;
+       int i;
+
+       if (port_count < 0 || (size_t) port_count > mask_bits) {
+               if (verbose)
+                       fprintf(stderr, Name ": port_count %d out of range\n", port_count);
+               return 2;
+       }
+       /* build the mask only after validating port_count, and in
+        * unsigned long: shifting by the full width of the type (or an
+        * int by 31 or more) is undefined
+        */
+       if ((size_t) port_count == mask_bits)
+               port_mask = ~0UL;
+       else
+               port_mask = (1UL << port_count) - 1;
+
+       /* scroll through /sys/dev/block looking for devices attached to
+        * this hba
+        */
+       dir = opendir("/sys/dev/block");
+       for (ent = dir ? readdir(dir) : NULL; ent; ent = readdir(dir)) {
+               int fd;
+               char model[64];
+               char vendor[64];
+               char buf[1024];
+               int major, minor;
+               char *device;
+               char *c;
+               int port;
+               int type;
+
+               /* drop the path of the previous entry here so that no
+                * 'continue' below can leak it
+                */
+               free(path);
+               path = NULL;
+
+               if (sscanf(ent->d_name, "%d:%d", &major, &minor) != 2)
+                       continue;
+               path = devt_to_devpath(makedev(major, minor));
+               if (!path)
+                       continue;
+               if (!path_attached_to_hba(path, hba_path))
+                       continue;
+
+               /* retrieve the scsi device type; the 'xxxxxxx' placeholder
+                * reserves enough room for the longest attribute name
+                * written into 'device' below
+                */
+               if (asprintf(&device, "/sys/dev/block/%d:%d/device/xxxxxxx", major, minor) < 0) {
+                       if (verbose)
+                               fprintf(stderr, Name ": failed to allocate 'device'\n");
+                       err = 2;
+                       break;
+               }
+               sprintf(device, "/sys/dev/block/%d:%d/device/type", major, minor);
+               if (load_sys(device, buf) != 0) {
+                       if (verbose)
+                               fprintf(stderr, Name ": failed to read device type for %s\n",
+                                       path);
+                       err = 2;
+                       free(device);
+                       break;
+               }
+               type = strtoul(buf, NULL, 10);
+
+               /* if it's not a disk print the vendor and model */
+               if (!(type == 0 || type == 7 || type == 14)) {
+                       vendor[0] = '\0';
+                       model[0] = '\0';
+                       sprintf(device, "/sys/dev/block/%d:%d/device/vendor", major, minor);
+                       if (load_sys(device, buf) == 0)
+                               imsm_copy_trimmed(vendor, sizeof(vendor), buf);
+                       sprintf(device, "/sys/dev/block/%d:%d/device/model", major, minor);
+                       if (load_sys(device, buf) == 0)
+                               imsm_copy_trimmed(model, sizeof(model), buf);
+
+                       if (vendor[0] && model[0])
+                               sprintf(buf, "%.64s %.64s", vendor, model);
+                       else
+                               switch (type) { /* numbers from hald/linux/device.c */
+                               case 1: sprintf(buf, "tape"); break;
+                               case 2: sprintf(buf, "printer"); break;
+                               case 3: sprintf(buf, "processor"); break;
+                               case 4:
+                               case 5: sprintf(buf, "cdrom"); break;
+                               case 6: sprintf(buf, "scanner"); break;
+                               case 8: sprintf(buf, "media_changer"); break;
+                               case 9: sprintf(buf, "comm"); break;
+                               case 12: sprintf(buf, "raid"); break;
+                               default: sprintf(buf, "unknown");
+                               }
+               } else
+                       buf[0] = '\0';
+               free(device);
+
+               /* chop device path to 'host%d' and calculate the port number */
+               c = strchr(&path[hba_len], '/');
+               if (!c) {
+                       /* nothing follows the host component */
+                       if (verbose)
+                               fprintf(stderr, Name ": failed to determine port number for %s\n",
+                                       path);
+                       err = 2;
+                       break;
+               }
+               *c = '\0';
+               if (sscanf(&path[hba_len], "host%d", &port) == 1)
+                       port -= host_base;
+               else {
+                       if (verbose) {
+                               *c = '/'; /* repair the full string */
+                               fprintf(stderr, Name ": failed to determine port number for %s\n",
+                                       path);
+                       }
+                       err = 2;
+                       break;
+               }
+
+               /* mark this port as used; ports outside the expected
+                * range have no bit in the mask and must not be shifted
+                */
+               if (port >= 0 && port < port_count)
+                       port_mask &= ~(1UL << port);
+
+               /* print out the device information */
+               if (buf[0]) {
+                       printf("          Port%d : - non-disk device (%s) -\n", port, buf);
+                       continue;
+               }
+
+               fd = dev_open(ent->d_name, O_RDONLY);
+               if (fd < 0)
+                       printf("          Port%d : - disk info unavailable -\n", port);
+               else {
+                       fd2devname(fd, buf);
+                       printf("          Port%d : %s", port, buf);
+                       if (imsm_read_serial(fd, NULL, (__u8 *) buf) == 0)
+                               printf(" (%s)\n", buf);
+                       else
+                               printf("()\n");
+                       close(fd);
+               }
+       }
+       free(path);
+       if (dir)
+               closedir(dir);
+       if (err == 0) {
+               /* every bit still set belongs to a port nothing claimed */
+               for (i = 0; i < port_count; i++)
+                       if (port_mask & (1UL << i))
+                               printf("          Port%d : - no device attached -\n", i);
+       }
+
+       return err;
+}
+
+static int detail_platform_imsm(int verbose, int enumerate_only)
+{
+       /* There are two components to imsm platform support, the ahci SATA
+        * controller and the option-rom.  To find the SATA controller we
+        * simply look in /sys/bus/pci/drivers/ahci to see if an ahci
+        * controller with the Intel vendor id is present.  This approach
+        * allows mdadm to leverage the kernel's ahci detection logic, with the
+        * caveat that if ahci.ko is not loaded mdadm will not be able to
+        * detect platform raid capabilities.  The option-rom resides in a
+        * platform "Adapter ROM".  We scan for its signature to retrieve the
+        * platform capabilities.  If raid support is disabled in the BIOS the
+        * option-rom capability structure will not be available.
+        *
+        * With 'enumerate_only' set nothing is printed: 0 is returned when
+        * IMSM_NO_PLATFORM is set in the environment or an option-rom is
+        * found, 2 otherwise.  Without it the platform details and the
+        * attached ports are printed; 0 is returned on success and 2 on any
+        * failure ('verbose' enables diagnostics on stderr).
+        */
+       const struct imsm_orom *orom;
+       struct sys_dev *list, *hba;
+       DIR *dir;
+       struct dirent *ent;
+       const char *hba_path;
+       int host_base = 0;
+       int host_max = 0;
+       int found_host = 0;
+       int port_count = 0;
+       int rv = 2;
+
+       if (enumerate_only) {
+               if (check_env("IMSM_NO_PLATFORM") || find_imsm_orom())
+                       return 0;
+               return 2;
+       }
+
+       list = find_driver_devices("pci", "ahci");
+       for (hba = list; hba; hba = hba->next)
+               if (devpath_to_vendor(hba->path) == 0x8086)
+                       break;
+
+       if (!hba) {
+               if (verbose)
+                       fprintf(stderr, Name ": unable to find active ahci controller\n");
+               free_sys_dev(&list);
+               return 2;
+       } else if (verbose)
+               fprintf(stderr, Name ": found Intel SATA AHCI Controller\n");
+       /* detach the path from its list entry so that it outlives the
+        * list; from here on this function is responsible for it
+        */
+       hba_path = hba->path;
+       hba->path = NULL;
+       free_sys_dev(&list);
+
+       orom = find_imsm_orom();
+       if (!orom) {
+               if (verbose)
+                       fprintf(stderr, Name ": imsm option-rom not found\n");
+               goto out;
+       }
+
+       printf("       Platform : Intel(R) Matrix Storage Manager\n");
+       printf("        Version : %d.%d.%d.%d\n", orom->major_ver, orom->minor_ver,
+              orom->hotfix_ver, orom->build);
+       printf("    RAID Levels :%s%s%s%s%s\n",
+              imsm_orom_has_raid0(orom) ? " raid0" : "",
+              imsm_orom_has_raid1(orom) ? " raid1" : "",
+              imsm_orom_has_raid1e(orom) ? " raid1e" : "",
+              imsm_orom_has_raid10(orom) ? " raid10" : "",
+              imsm_orom_has_raid5(orom) ? " raid5" : "");
+       printf("    Chunk Sizes :%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n",
+              imsm_orom_has_chunk(orom, 2) ? " 2k" : "",
+              imsm_orom_has_chunk(orom, 4) ? " 4k" : "",
+              imsm_orom_has_chunk(orom, 8) ? " 8k" : "",
+              imsm_orom_has_chunk(orom, 16) ? " 16k" : "",
+              imsm_orom_has_chunk(orom, 32) ? " 32k" : "",
+              imsm_orom_has_chunk(orom, 64) ? " 64k" : "",
+              imsm_orom_has_chunk(orom, 128) ? " 128k" : "",
+              imsm_orom_has_chunk(orom, 256) ? " 256k" : "",
+              imsm_orom_has_chunk(orom, 512) ? " 512k" : "",
+              imsm_orom_has_chunk(orom, 1024*1) ? " 1M" : "",
+              imsm_orom_has_chunk(orom, 1024*2) ? " 2M" : "",
+              imsm_orom_has_chunk(orom, 1024*4) ? " 4M" : "",
+              imsm_orom_has_chunk(orom, 1024*8) ? " 8M" : "",
+              imsm_orom_has_chunk(orom, 1024*16) ? " 16M" : "",
+              imsm_orom_has_chunk(orom, 1024*32) ? " 32M" : "",
+              imsm_orom_has_chunk(orom, 1024*64) ? " 64M" : "");
+       printf("      Max Disks : %d\n", orom->tds);
+       printf("    Max Volumes : %d\n", orom->vpa);
+       printf(" I/O Controller : %s\n", hba_path);
+
+       /* find the smallest and largest scsi host number: the smallest is
+        * the port number base and the span is the port count.  Tracking
+        * both ends keeps the count right whatever order readdir() returns
+        * the entries in.
+        */
+       dir = opendir(hba_path);
+       for (ent = dir ? readdir(dir) : NULL; ent; ent = readdir(dir)) {
+               int host;
+
+               if (sscanf(ent->d_name, "host%d", &host) != 1)
+                       continue;
+               if (!found_host) {
+                       host_base = host;
+                       host_max = host;
+                       found_host = 1;
+               } else if (host < host_base)
+                       host_base = host;
+               else if (host > host_max)
+                       host_max = host;
+       }
+       if (dir)
+               closedir(dir);
+       if (found_host)
+               port_count = host_max - host_base + 1;
+
+       if (!port_count || imsm_enumerate_ports(hba_path, port_count,
+                                               host_base, verbose) != 0) {
+               if (verbose)
+                       fprintf(stderr, Name ": failed to enumerate ports\n");
+               goto out;
+       }
+
+       rv = 0;
+out:
+       /* NOTE(review): assumes sys_dev paths are heap allocated and would
+        * otherwise have been released by free_sys_dev() - confirm
+        */
+       free((void *) hba_path);
+       return rv;
+}
+#endif
+
+/* Report whether the metadata in 'st' belongs to 'homehost'.
+ *
+ * imsm metadata carries no host identification at all, so the answer
+ * is always -1: ownership can neither be confirmed nor denied.
+ * Excluding foreign member disks is left to compare_super and the
+ * 'family_num' field, and selecting the arrays to assemble is left to
+ * mdadm.conf.  Auto-assembly may therefore still pick up "foreign"
+ * arrays.  Neither argument is examined.
+ */
+static int match_home_imsm(struct supertype *st, char *homehost)
+{
+       return -1;
+}
+
+/* Synthesise a uuid for the loaded imsm metadata and store it in
+ * 'uuid'.
+ *
+ * The uuid returned here is used for:
+ *  - the uuid put into a bitmap file (Create, Grow)
+ *  - the uuid of the backup header when saving a critical section (Grow)
+ *  - comparing uuids when re-adding a device into an array
+ *  - recognising the same set when adding a missing device back
+ * A truncated or hashed uuid is good enough for all of these as long
+ * as everyone agrees on it.
+ *
+ * imsm does not track uuids, so one is derived with sha1 from
+ *  - the signature (constant for all imsm arrays, but no matter)
+ *  - the family_num of the container
+ * and, when a volume is selected through super->current_vol,
+ *  - the index number of that volume
+ *  - the 'volume' name field of that volume.
+ * Hopefully these are all constant.  The first 16 bytes of the digest
+ * become the uuid.
+ */
+static void uuid_from_super_imsm(struct supertype *st, int uuid[4])
+{
+       struct intel_super *super = st->sb;
+       struct imsm_dev *volume = NULL;
+       struct sha1_ctx sha;
+       char digest[20];
+
+       if (super->current_vol >= 0)
+               volume = get_imsm_dev(super, super->current_vol);
+
+       sha1_init_ctx(&sha);
+       sha1_process_bytes(super->anchor->sig, MPB_SIG_LEN, &sha);
+       sha1_process_bytes(&super->anchor->family_num, sizeof(__u32), &sha);
+       if (volume != NULL) {
+               __u32 index = super->current_vol;
+
+               sha1_process_bytes(&index, sizeof(index), &sha);
+               sha1_process_bytes(volume->volume, MAX_RAID_SERIAL_LEN, &sha);
+       }
+       sha1_finish_ctx(&sha, digest);
+
+       memcpy(uuid, digest, 4*4);
+}
+
+#if 0
+/* Split the dotted version string of 'mpb' into up to three fields of
+ * at most two characters each and return the second field in '*m' and
+ * the third in '*p'.  Currently compiled out.
+ */
+static void
+get_imsm_numerical_version(struct imsm_super *mpb, int *m, int *p)
+{
+       __u8 *v = get_imsm_version(mpb);
+       __u8 *end = mpb->sig + MAX_SIGNATURE_LENGTH;
+       char major[] = { 0, 0, 0 };
+       char minor[] = { 0 ,0, 0 };
+       char patch[] = { 0, 0, 0 };
+       char *ver_parse[] = { major, minor, patch };
+       int i, j;
+
+       i = j = 0;
+       /* test the bound before dereferencing, and stop once all three
+        * fields have been consumed so ver_parse[] is never overrun
+        */
+       while (v < end && *v != '\0' && i < 3) {
+               if (*v != '.' && j < 2)
+                       ver_parse[i][j++] = *v;
+               else {
+                       i++;
+                       j = 0;
+               }
+               v++;
+       }
+
+       *m = strtol(minor, NULL, 0);
+       *p = strtol(patch, NULL, 0);
+}
+#endif
+
+/* Map a raid level to the layout value reported for imsm volumes of
+ * that level.  Levels without a mapping yield UnSet.
+ */
+static int imsm_level_to_layout(int level)
+{
+       if (level == 0 || level == 1)
+               return 0;
+       if (level == 5 || level == 6)
+               return ALGORITHM_LEFT_ASYMMETRIC;
+       if (level == 10)
+               return 0x102;
+
+       return UnSet;
+}
+
+/* Fill 'info' with a description of the volume selected through
+ * super->current_vol: geometry, state, name, text_version and uuid.
+ *
+ * info->disk.raid_disk is read on entry to locate the matching member
+ * disk; when one is found its major/minor numbers are reported,
+ * otherwise both are 0.
+ */
+static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info)
+{
+       struct intel_super *super = st->sb;
+       struct imsm_dev *dev = get_imsm_dev(super, super->current_vol);
+       struct imsm_map *map = get_imsm_map(dev, 0);
+       struct dl *dl;
+       char *devname;
+
+       for (dl = super->disks; dl; dl = dl->next)
+               if (dl->raiddisk == info->disk.raid_disk)
+                       break;
+       info->container_member    = super->current_vol;
+       info->array.raid_disks    = map->num_members;
+       info->array.level         = get_imsm_raid_level(map);
+       info->array.layout        = imsm_level_to_layout(info->array.level);
+       info->array.md_minor      = -1;
+       info->array.ctime         = 0;
+       info->array.utime         = 0;
+       info->array.chunk_size    = __le16_to_cpu(map->blocks_per_strip) << 9;
+       info->array.state         = !dev->vol.dirty;
+       info->custom_array_size   = __le32_to_cpu(dev->size_high);
+       info->custom_array_size   <<= 32;
+       info->custom_array_size   |= __le32_to_cpu(dev->size_low);
+
+       info->disk.major = 0;
+       info->disk.minor = 0;
+       if (dl) {
+               info->disk.major = dl->major;
+               info->disk.minor = dl->minor;
+       }
+
+       info->data_offset         = __le32_to_cpu(map->pba_of_lba0);
+       info->component_size      = __le32_to_cpu(map->blocks_per_member);
+       memset(info->uuid, 0, sizeof(info->uuid));
+
+       if (map->map_state == IMSM_T_STATE_UNINITIALIZED || dev->vol.dirty)
+               info->resync_start = 0;
+       else if (dev->vol.migr_state)
+               /* FIXME add curr_migr_unit to resync_start conversion */
+               info->resync_start = 0;
+       else
+               info->resync_start = ~0ULL;
+
+       strncpy(info->name, (char *) dev->volume, MAX_RAID_SERIAL_LEN);
+       info->name[MAX_RAID_SERIAL_LEN] = 0;
+
+       info->array.major_version = -1;
+       info->array.minor_version = -2;
+       /* NOTE(review): assumes devnum2devname() returns a heap string
+        * owned by the caller (or NULL on failure) - confirm against its
+        * definition.  Never hand a NULL pointer to "%s".
+        */
+       devname = devnum2devname(st->container_dev);
+       info->text_version[0] = '\0';
+       if (devname)
+               sprintf(info->text_version, "/%s/%d",
+                       devname, info->container_member);
+       free(devname);
+       info->safe_mode_delay = 4000;  /* 4 secs like the Matrix driver */
+       uuid_from_super_imsm(st, info->uuid);
+}
+
+/* check the config file to see if we can return a real uuid for this spare */
+/* check the config file to see if we can return a real uuid for this spare
+ *
+ * Only acts when 'inf' describes a container whose uuid is still
+ * uuid_match_any.  The first configured array that has a uuid set and
+ * whose metadata handler accepts inf->text_version donates its uuid.
+ */
+static void fixup_container_spare_uuid(struct mdinfo *inf)
+{
+       struct mddev_ident_s *array_list;
+
+       if (inf->array.level != LEVEL_CONTAINER ||
+           memcmp(inf->uuid, uuid_match_any, sizeof(int[4])) != 0)
+               return;
+
+       array_list = conf_get_ident(NULL);
+
+       for (; array_list; array_list = array_list->next) {
+               struct supertype *_sst; /* spare supertype */
+               struct supertype *_cst; /* container supertype */
+
+               if (!array_list->uuid_set)
+                       continue;
+
+               /* skip entries that carry a uuid but no supertype
+                * instead of dereferencing a NULL pointer
+                */
+               _cst = array_list->st;
+               if (!_cst)
+                       continue;
+
+               _sst = _cst->ss->match_metadata_desc(inf->text_version);
+               if (_sst) {
+                       memcpy(inf->uuid, array_list->uuid, sizeof(int[4]));
+                       free(_sst);
+                       break;
+               }
+       }
+}
+
+/* Fill 'info' from the loaded imsm metadata.  When a volume is
+ * selected (super->current_vol >= 0) the volume specific helper does
+ * all the work; otherwise the container itself is described together
+ * with the state of the first disk on super->disks, if there is one.
+ */
+static void getinfo_super_imsm(struct supertype *st, struct mdinfo *info)
+{
+       struct intel_super *super = st->sb;
+       int populated;
+
+       if (super->current_vol >= 0) {
+               getinfo_super_imsm_volume(st, info);
+               return;
+       }
+
+       /* raid_disks stays at zero so that Assemble will always pull in
+        * valid spares
+        */
+       info->array.raid_disks    = 0;
+       info->array.level         = LEVEL_CONTAINER;
+       info->array.layout        = 0;
+       info->array.md_minor      = -1;
+       info->array.ctime         = 0; /* N/A for imsm */
+       info->array.utime         = 0;
+       info->array.chunk_size    = 0;
+       info->array.major_version = -1;
+       info->array.minor_version = -2;
+
+       info->disk.major = 0;
+       info->disk.minor = 0;
+       info->disk.raid_disk = -1;
+       info->disk.number = -1;
+       info->disk.state = 0;
+
+       info->reshape_active = 0;
+       info->safe_mode_delay = 0;
+       info->name[0] = 0;
+       strcpy(info->text_version, "imsm");
+
+       if (super->disks) {
+               __u32 reserved = imsm_reserved_sectors(super, super->disks);
+               struct imsm_disk *disk = &super->disks->disk;
+               __u32 status = disk->status;
+
+               info->data_offset = __le32_to_cpu(disk->total_blocks) - reserved;
+               info->component_size = reserved;
+
+               /* info->disk.raid_disk is deliberately left alone here:
+                * that state is finalized in mdmon after the 'most
+                * fresh' version of the metadata has been found
+                */
+               if (status & CONFIGURED_DISK)
+                       info->disk.state |= (1 << MD_DISK_ACTIVE);
+               if (status & FAILED_DISK)
+                       info->disk.state |= (1 << MD_DISK_FAULTY);
+               if (!(status & SPARE_DISK))
+                       info->disk.state |= (1 << MD_DISK_SYNC);
+       }
+
+       /* a uuid is only synthesised when this disk is part of a
+        * populated container; ->compare_super may have updated the
+        * 'num_raid_devs' field for spares
+        */
+       populated = (info->disk.state & (1 << MD_DISK_SYNC)) ||
+                   super->anchor->num_raid_devs;
+       if (!populated) {
+               memcpy(info->uuid, uuid_match_any, sizeof(int[4]));
+               fixup_container_spare_uuid(info);
+               return;
+       }
+       uuid_from_super_imsm(st, info->uuid);
+}
+
+/* Apply the metadata update named by 'update' (FIXME: still a stub,
+ * no update is implemented and 0 is always returned).
+ *
+ * For 'assemble' and 'force' a non-zero return is required if any
+ * change was made.  For the others the return value is ignored.
+ * Update options are:
+ *  force-one:   this device looks a bit old but needs to be included,
+ *               update age info appropriately.
+ *  assemble:    clear any 'faulty' flag to allow this device to be
+ *               assembled.
+ *  force-array: array is degraded but being forced, mark it clean if
+ *               that will be needed to assemble it.
+ *  newdev:      not used ????
+ *  grow:        array has gained a new device - this is currently for
+ *               linear only
+ *  resync:      mark as dirty so a resync will happen.
+ *  name:        update the name - preserving the homehost
+ *
+ * The following are not relevant for imsm, which has no concept of
+ * UUID or homehost:
+ *  sparc2.2:          update from old dodgy metadata
+ *  super-minor:       change the preferred_minor number
+ *  summaries:         update redundant counters.
+ *  uuid:              change the uuid of the array to match what is given
+ *  homehost:          update the recorded homehost
+ *  _reshape_progress: record new reshape_progress position.
+ */
+static int update_super_imsm(struct supertype *st, struct mdinfo *info,
+                            char *update, char *devname, int verbose,
+                            int uuid_set, char *homehost)
+{
+       int changed = 0;
+
+       if (strcmp(update, "grow") == 0) {
+               /* not implemented */
+       }
+       if (strcmp(update, "resync") == 0) {
+               /* not implemented; would set dev->vol.dirty = 1 */
+       }
+
+       return changed;
+}
+
+/* Return the number of bytes budgeted for an mpb describing 'disks'
+ * disks: the anchor with its disk table, two raid devices with two
+ * maps each and four disk_ord_tbl's.
+ */
+static size_t disks_to_mpb_size(int disks)
+{
+       int extra_disks = disks - 1;
+       /* the anchor structure already accounts for one disk entry */
+       size_t anchor = sizeof(struct imsm_super) +
+                       extra_disks * sizeof(struct imsm_disk);
+       /* up to 2 maps per raid device (-2 for the imsm_maps that are
+        * already part of each imsm_dev)
+        */
+       size_t devs = 2 * sizeof(struct imsm_dev) +
+                     (4 - 2) * sizeof(struct imsm_map);
+       /* 4 possible disk_ord_tbl's */
+       size_t ord_tbls = 4 * extra_disks * sizeof(__u32);
+
+       return anchor + devs + ord_tbls;
+}
+
+/* Return how many of the 'devsize' sectors remain once
+ * MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS have been set aside, or 0
+ * when the device is smaller than that.  'st' is not consulted.
+ */
+static __u64 avail_size_imsm(struct supertype *st, __u64 devsize)
+{
+       const __u64 overhead = MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS;
+
+       return devsize < overhead ? 0 : devsize - overhead;
+}
+
+/* Release every entry on super->devlist together with the raid device
+ * it points to, leaving the list empty.
+ */
+static void free_devlist(struct intel_super *super)
+{
+       struct intel_dev *entry;
+
+       for (entry = super->devlist; entry; entry = super->devlist) {
+               /* unlink first so the list head stays valid throughout */
+               super->devlist = entry->next;
+               free(entry->dev);
+               free(entry);
+       }
+}
+
+/* Copy raid device 'src' over 'dest'; the number of bytes copied is
+ * sizeof_imsm_dev(src, 0).
+ */
+static void imsm_copy_dev(struct imsm_dev *dest, struct imsm_dev *src)
+{
+       size_t len = sizeof_imsm_dev(src, 0);
+
+       memcpy(dest, src, len);
+}
+
+/* Decide whether the metadata in 'tst' belongs to the same container
+ * as the metadata in 'st'.
+ *
+ * return:
+ *  0 same, or first was empty, and second was copied
+ *  1 second had wrong number
+ *  2 wrong uuid
+ *  3 wrong other info
+ *
+ * When 'st' holds no metadata yet it takes ownership of tst's.  When
+ * 'st' is a spare (no raid devices) and 'tst' is populated, the spare
+ * is promoted: it receives copies of tst's raid devices and its family
+ * number.
+ */
+static int compare_super_imsm(struct supertype *st, struct supertype *tst)
+{
+       struct intel_super *first = st->sb;
+       struct intel_super *sec = tst->sb;
+
+       if (!first) {
+               st->sb = tst->sb;
+               tst->sb = NULL;
+               return 0;
+       }
+
+       if (memcmp(first->anchor->sig, sec->anchor->sig, MAX_SIGNATURE_LENGTH) != 0)
+               return 3;
+
+       /* if an anchor does not have num_raid_devs set then it is a free
+        * floating spare
+        */
+       if (first->anchor->num_raid_devs > 0 &&
+           sec->anchor->num_raid_devs > 0) {
+               if (first->anchor->family_num != sec->anchor->family_num)
+                       return 3;
+       }
+
+       /* if 'first' is a spare promote it to a populated mpb with sec's
+        * family number
+        */
+       if (first->anchor->num_raid_devs == 0 &&
+           sec->anchor->num_raid_devs > 0) {
+               int i;
+               struct intel_dev *dv;
+               struct imsm_dev *dev;
+
+               /* we need to copy raid device info from sec if an allocation
+                * fails here we don't associate the spare
+                */
+               for (i = 0; i < sec->anchor->num_raid_devs; i++) {
+                       dv = malloc(sizeof(*dv));
+                       if (!dv)
+                               break;
+                       dev = malloc(sizeof_imsm_dev(get_imsm_dev(sec, i), 1));
+                       if (!dev) {
+                               free(dv);
+                               break;
+                       }
+                       dv->dev = dev;
+                       dv->index = i;
+                       dv->next = first->devlist;
+                       first->devlist = dv;
+               }
+               /* the loop only stops short of num_raid_devs when an
+                * allocation failed; a completed loop leaves i equal to
+                * num_raid_devs, which must not be treated as an error
+                */
+               if (i < sec->anchor->num_raid_devs) {
+                       /* allocation failure */
+                       free_devlist(first);
+                       fprintf(stderr, "imsm: failed to associate spare\n"); 
+                       return 3;
+               }
+               for (i = 0; i < sec->anchor->num_raid_devs; i++)
+                       imsm_copy_dev(get_imsm_dev(first, i), get_imsm_dev(sec, i));
+
+               first->anchor->num_raid_devs = sec->anchor->num_raid_devs;
+               first->anchor->family_num = sec->anchor->family_num;
+       }
+
+       return 0;
+}
+
+/* fd2devname - derive a "/dev/<name>" string for an open block device
+ * @fd: open descriptor of the block device
+ * @name: output buffer, at most MAX_RAID_SERIAL_LEN bytes are written
+ *
+ * Resolves the /sys/dev/block/<major>:<minor> symlink and uses the last
+ * path component of its target as the device name.  On any failure
+ * @name is left as the empty string.
+ */
+static void fd2devname(int fd, char *name)
+{
+       struct stat st;
+       char path[256];
+       char dname[100];
+       char *nm;
+       int rv;
+
+       name[0] = '\0';
+       if (fstat(fd, &st) != 0)
+               return;
+       snprintf(path, sizeof(path), "/sys/dev/block/%d:%d",
+                major(st.st_rdev), minor(st.st_rdev));
+
+       /* readlink() does not NUL-terminate, so keep one byte in reserve
+        * for the terminator written below
+        */
+       rv = readlink(path, dname, sizeof(dname) - 1);
+       if (rv <= 0)
+               return;
+
+       dname[rv] = '\0';
+       nm = strrchr(dname, '/');
+       /* a link target without any '/' is used as-is */
+       nm = nm ? nm + 1 : dname;
+       snprintf(name, MAX_RAID_SERIAL_LEN, "/dev/%s", nm);
+}
+
+
+extern int scsi_get_serial(int fd, void *buf, size_t buf_len);
+
+/* imsm_read_serial - obtain the serial number used to identify a disk
+ * @fd: open descriptor of the disk
+ * @devname: name used in error messages, NULL to stay silent
+ * @serial: output, MAX_RAID_SERIAL_LEN bytes, zero padded
+ *
+ * When the query fails and IMSM_DEVNAME_AS_SERIAL is set in the
+ * environment, the device name is substituted for the serial number.
+ *
+ * Returns 0 on success, the scsi_get_serial() result when the query
+ * fails, or 2 when the response carries a zero length.
+ */
+static int imsm_read_serial(int fd, char *devname,
+                           __u8 serial[MAX_RAID_SERIAL_LEN])
+{
+       unsigned char scsi_serial[255];
+       int rv;
+       int rsp_len;
+       int len;
+       char *dest;
+       char *src;
+       char *rsp_buf;
+       int i;
+
+       memset(scsi_serial, 0, sizeof(scsi_serial));
+
+       rv = scsi_get_serial(fd, scsi_serial, sizeof(scsi_serial));
+
+       if (rv && check_env("IMSM_DEVNAME_AS_SERIAL")) {
+               memset(serial, 0, MAX_RAID_SERIAL_LEN);
+               fd2devname(fd, (char *) serial);
+               return 0;
+       }
+
+       if (rv != 0) {
+               if (devname)
+                       fprintf(stderr,
+                               Name ": Failed to retrieve serial for %s\n",
+                               devname);
+               return rv;
+       }
+
+       rsp_len = scsi_serial[3];
+       if (!rsp_len) {
+               if (devname)
+                       fprintf(stderr,
+                               Name ": Failed to retrieve serial for %s\n",
+                               devname);
+               return 2;
+       }
+       /* the length byte comes from the device and may claim more data
+        * than fits behind the 4 byte header of our buffer
+        */
+       if (rsp_len > (int) sizeof(scsi_serial) - 4)
+               rsp_len = sizeof(scsi_serial) - 4;
+       rsp_buf = (char *) &scsi_serial[4];
+
+       /* trim all whitespace and non-printable characters and convert
+        * ':' to ';'.  The filtered string is compacted in place.
+        */
+       for (i = 0, dest = rsp_buf; i < rsp_len; i++) {
+               src = &rsp_buf[i];
+               if (*src > 0x20) {
+                       /* ':' is reserved for use in placeholder serial
+                        * numbers for missing disks
+                        */
+                       if (*src == ':')
+                               *dest++ = ';';
+                       else
+                               *dest++ = *src;
+               }
+       }
+       len = dest - rsp_buf;
+       dest = rsp_buf;
+
+       /* truncate leading characters, keeping the tail of the serial */
+       if (len > MAX_RAID_SERIAL_LEN) {
+               dest += len - MAX_RAID_SERIAL_LEN;
+               len = MAX_RAID_SERIAL_LEN;
+       }
+
+       memset(serial, 0, MAX_RAID_SERIAL_LEN);
+       memcpy(serial, dest, len);
+
+       return 0;
+}
+
+/* compare two serial numbers over at most MAX_RAID_SERIAL_LEN bytes,
+ * returns the strncmp() result (0 when equal)
+ */
+static int serialcmp(__u8 *s1, __u8 *s2)
+{
+       char *lhs = (char *) s1;
+       char *rhs = (char *) s2;
+
+       return strncmp(lhs, rhs, MAX_RAID_SERIAL_LEN);
+}
+
+/* copy a serial number field; strncpy() zero-pads short serials and
+ * leaves a full-length one without a terminating NUL
+ */
+static void serialcpy(__u8 *dest, __u8 *src)
+{
+       char *to = (char *) dest;
+       char *from = (char *) src;
+
+       strncpy(to, from, MAX_RAID_SERIAL_LEN);
+}
+
+/* find the entry of super->disks whose serial matches @serial,
+ * returns NULL when no entry matches
+ */
+static struct dl *serial_to_dl(__u8 *serial, struct intel_super *super)
+{
+       struct dl *cur = super->disks;
+
+       while (cur && serialcmp(cur->serial, serial) != 0)
+               cur = cur->next;
+
+       return cur;
+}
+
+/* load_imsm_disk - record the disk behind @fd in super->disks
+ * @fd: open descriptor of the disk
+ * @super: container state, super->anchor must already be loaded
+ * @devname: name for error messages and for dl->devname, may be NULL
+ * @keep_fd: when set the list entry takes over @fd, otherwise the entry
+ *           stores -1 and the caller keeps ownership of @fd
+ *
+ * The disk is identified by serial number.  A disk seen before keeps
+ * its list entry and only has its index/disk data refreshed from the
+ * current anchor.
+ *
+ * Returns 0 on success, 2 when the serial cannot be read, the device
+ * cannot be stat'ed or memory is exhausted.
+ */
+static int
+load_imsm_disk(int fd, struct intel_super *super, char *devname, int keep_fd)
+{
+       struct dl *dl;
+       struct stat stb;
+       int rv;
+       int i;
+       int alloc = 1;
+       __u8 serial[MAX_RAID_SERIAL_LEN];
+
+       rv = imsm_read_serial(fd, devname, serial);
+
+       if (rv != 0)
+               return 2;
+
+       /* check if this is a disk we have seen before.  it may be a spare in
+        * super->disks while the current anchor believes it is a raid member,
+        * check if we need to update dl->index
+        */
+       dl = serial_to_dl(serial, super);
+       if (!dl)
+               /* zeroed so that dl->disk is well defined even when the
+                * serial is not listed in the anchor
+                */
+               dl = calloc(1, sizeof(*dl));
+       else
+               alloc = 0;
+
+       if (!dl) {
+               if (devname)
+                       fprintf(stderr,
+                               Name ": failed to allocate disk buffer for %s\n",
+                               devname);
+               return 2;
+       }
+
+       if (alloc) {
+               if (fstat(fd, &stb) != 0) {
+                       if (devname)
+                               fprintf(stderr,
+                                       Name ": failed to stat %s: %s\n",
+                                       devname, strerror(errno));
+                       free(dl);
+                       return 2;
+               }
+               dl->major = major(stb.st_rdev);
+               dl->minor = minor(stb.st_rdev);
+               dl->next = super->disks;
+               dl->fd = keep_fd ? fd : -1;
+               dl->devname = devname ? strdup(devname) : NULL;
+               serialcpy(dl->serial, serial);
+               dl->index = -2;
+               dl->e = NULL;
+       } else if (keep_fd) {
+               /* replace whatever descriptor the entry held before */
+               close(dl->fd);
+               dl->fd = fd;
+       }
+
+       /* look up this disk's index in the current anchor */
+       for (i = 0; i < super->anchor->num_disks; i++) {
+               struct imsm_disk *disk_iter;
+
+               disk_iter = __get_imsm_disk(super->anchor, i);
+
+               if (serialcmp(disk_iter->serial, dl->serial) == 0) {
+                       dl->disk = *disk_iter;
+                       /* only set index on disks that are a member of a
+                        * populated container, i.e. one with raid_devs
+                        */
+                       if (dl->disk.status & FAILED_DISK)
+                               dl->index = -2;
+                       else if (dl->disk.status & SPARE_DISK)
+                               dl->index = -1;
+                       else
+                               dl->index = i;
+
+                       break;
+               }
+       }
+
+       /* no match, maybe a stale failed drive.
+        * NOTE(review): dl->index stems from an earlier anchor; confirm it
+        * is always a valid slot in the current one.
+        */
+       if (i == super->anchor->num_disks && dl->index >= 0) {
+               dl->disk = *__get_imsm_disk(super->anchor, dl->index);
+               if (dl->disk.status & FAILED_DISK)
+                       dl->index = -2;
+       }
+
+       /* publish a new entry only once it is fully set up */
+       if (alloc)
+               super->disks = dl;
+
+       return 0;
+}
+
+#ifndef MDASSEMBLE
+/* When migrating map0 contains the 'destination' state while map1
+ * contains the current state.  When not migrating map0 contains the
+ * current state.  This routine assumes that map[0].map_state is set to
+ * the current array state before being called.
+ *
+ * Migration is indicated by one of the following states
+ * 1/ Idle (migr_state=0 map0state=normal||uninitialized||degraded||failed)
+ * 2/ Initialize (migr_state=1 migr_type=MIGR_INIT map0state=normal
+ *    map1state=uninitialized)
+ * 3/ Repair (Resync) (migr_state=1 migr_type=MIGR_REPAIR  map0state=normal
+ *    map1state=normal)
+ * 4/ Rebuild (migr_state=1 migr_type=MIGR_REBUILD map0state=normal
+ *    map1state=degraded)
+ *
+ * @dev: raid device to put into the migrating state
+ * @to_state: map state stored in map[0] as the migration target
+ * @migr_type: one of the MIGR_* types, recorded via set_migr_type()
+ */
+static void migrate(struct imsm_dev *dev, __u8 to_state, int migr_type)
+{
+       struct imsm_map *dest;
+       struct imsm_map *src = get_imsm_map(dev, 0);
+
+       dev->vol.migr_state = 1;
+       set_migr_type(dev, migr_type);
+       dev->vol.curr_migr_unit = 0;
+       /* map 1 is looked up only after migr_state has been set above */
+       dest = get_imsm_map(dev, 1);
+
+       /* duplicate and then set the target end state in map[0] */
+       memcpy(dest, src, sizeof_imsm_map(src));
+       if (migr_type == MIGR_REBUILD) {
+               __u32 ord;
+               int i;
+
+               /* map[0] entries are rewritten with the plain index from
+                * ord_to_idx(); the copy in map[1] keeps the original
+                * ordinal values
+                */
+               for (i = 0; i < src->num_members; i++) {
+                       ord = __le32_to_cpu(src->disk_ord_tbl[i]);
+                       set_imsm_ord_tbl_ent(src, i, ord_to_idx(ord));
+               }
+       }
+
+       src->map_state = to_state;
+}
+
+/* end_migration - leave the migrating state
+ * @dev: raid device whose migration is finished
+ * @map_state: state to record in map[0]
+ *
+ * Clears migr_state and curr_migr_unit after folding the ordinal table
+ * of the map selected by the current migr_state into map[0].
+ */
+static void end_migration(struct imsm_dev *dev, __u8 map_state)
+{
+       struct imsm_map *map = get_imsm_map(dev, 0);
+       /* must be fetched before migr_state is cleared below */
+       struct imsm_map *prev = get_imsm_map(dev, dev->vol.migr_state);
+       int i;
+
+       /* merge any IMSM_ORD_REBUILD bits that were not successfully
+        * completed in the last migration.
+        *
+        * FIXME add support for online capacity expansion and
+        * raid-level-migration
+        */
+       for (i = 0; i < prev->num_members; i++)
+               map->disk_ord_tbl[i] |= prev->disk_ord_tbl[i];
+
+       dev->vol.migr_state = 0;
+       dev->vol.curr_migr_unit = 0;
+       map->map_state = map_state;
+}
+#endif
+
+/* parse_raid_devices - copy every raid device of the anchor into
+ * super->devlist and make sure super->buf could hold all of them in
+ * their migrating (larger) form.
+ *
+ * Returns 0 on success, 1 when an allocation fails; entries linked
+ * into super->devlist before the failure stay on the list.
+ */
+static int parse_raid_devices(struct intel_super *super)
+{
+       struct imsm_super *mpb = super->anchor;
+       size_t extra = 0;
+       size_t total;
+       size_t new_len;
+       void *bigger;
+       int idx;
+
+       for (idx = 0; idx < super->anchor->num_raid_devs; idx++) {
+               struct imsm_dev *on_disk = __get_imsm_dev(super->anchor, idx);
+               size_t plain = sizeof_imsm_dev(on_disk, 0);
+               size_t migrating = sizeof_imsm_dev(on_disk, 1);
+               struct intel_dev *entry;
+               struct imsm_dev *copy;
+
+               /* account for the growth a migration would cause */
+               if (migrating > plain)
+                       extra += migrating - plain;
+
+               entry = malloc(sizeof(*entry));
+               if (!entry)
+                       return 1;
+               copy = malloc(migrating);
+               if (!copy) {
+                       free(entry);
+                       return 1;
+               }
+               imsm_copy_dev(copy, on_disk);
+
+               entry->index = idx;
+               entry->dev = copy;
+               entry->next = super->devlist;
+               super->devlist = entry;
+       }
+
+       /* nothing more to do when super->buf is already large enough for
+        * the case that all raid devices are migrating
+        */
+       total = __le32_to_cpu(mpb->mpb_size) + extra;
+       if (total <= super->len)
+               return 0;
+
+       new_len = ROUND_UP(total, 512);
+       if (posix_memalign(&bigger, 512, new_len) != 0)
+               return 1;
+
+       /* carry over the old contents and zero the added tail */
+       memcpy(bigger, super->buf, super->len);
+       memset((char *) bigger + super->len, 0, new_len - super->len);
+       free(super->buf);
+       super->buf = bigger;
+       super->len = new_len;
+
+       return 0;
+}
+
+/* retrieve a pointer to the bbm log which starts after all raid devices,
+ * i.e. it occupies the last bbm_log_size bytes of the mpb.
+ * Returns NULL when the mpb records a bbm_log_size of zero.
+ */
+struct bbm_log *__get_imsm_bbm_log(struct imsm_super *mpb)
+{
+       __u32 log_size = __le32_to_cpu(mpb->bbm_log_size);
+       char *base = (char *) mpb;
+
+       if (!log_size)
+               return NULL;
+
+       /* both sizes are converted from their little-endian on-disk form
+        * before doing arithmetic with them
+        */
+       return (struct bbm_log *) (base + __le32_to_cpu(mpb->mpb_size) - log_size);
+}
+
+static void __free_imsm(struct intel_super *super, int free_disks);
+
+/* load_imsm_mpb - read matrix metadata
+ * @fd: open descriptor of the disk to read from
+ * @super: container state to fill in
+ * @devname: name used in error messages, NULL to stay silent
+ *
+ * Reads the anchor sector located two sectors before the end of the
+ * device and, for an mpb spanning several sectors, the extended part
+ * stored in front of it.  Allocates super->buf, which is released by
+ * __free_imsm().  Everything previously hanging off @super except the
+ * disk list is dropped first.
+ *
+ * Returns 0 on success.  1 is returned for seek failures and for
+ * failures to allocate or read the anchor, 2 for a missing signature,
+ * an invalid mpb size, a failed buffer allocation, a failed read of the
+ * extended mpb or a bad checksum on a single sector mpb, and 3 for a bad
+ * checksum on a multi sector mpb.  Errors of load_imsm_disk() and
+ * parse_raid_devices() are passed through.
+ */
+static int load_imsm_mpb(int fd, struct intel_super *super, char *devname)
+{
+       unsigned long long dsize;
+       unsigned long long sectors;
+       struct imsm_super *anchor;
+       __u32 check_sum;
+       int rc;
+
+       get_dev_size(fd, NULL, &dsize);
+
+       if (lseek64(fd, dsize - (512 * 2), SEEK_SET) < 0) {
+               if (devname)
+                       fprintf(stderr,
+                               Name ": Cannot seek to anchor block on %s: %s\n",
+                               devname, strerror(errno));
+               return 1;
+       }
+
+       if (posix_memalign((void**)&anchor, 512, 512) != 0) {
+               if (devname)
+                       fprintf(stderr,
+                               Name ": Failed to allocate imsm anchor buffer"
+                               " on %s\n", devname);
+               return 1;
+       }
+       if (read(fd, anchor, 512) != 512) {
+               if (devname)
+                       fprintf(stderr,
+                               Name ": Cannot read anchor block on %s: %s\n",
+                               devname, strerror(errno));
+               free(anchor);
+               return 1;
+       }
+
+       if (strncmp((char *) anchor->sig, MPB_SIGNATURE, MPB_SIG_LEN) != 0) {
+               if (devname)
+                       fprintf(stderr,
+                               Name ": no IMSM anchor on %s\n", devname);
+               free(anchor);
+               return 2;
+       }
+
+       __free_imsm(super, 0);
+       /* mpb_size is stored little-endian on disk */
+       super->len = ROUND_UP(__le32_to_cpu(anchor->mpb_size), 512);
+       if (super->len < 512) {
+               /* the anchor sector alone is copied below, a smaller
+                * buffer would be overrun
+                */
+               if (devname)
+                       fprintf(stderr,
+                               Name ": invalid mpb size on %s\n", devname);
+               free(anchor);
+               return 2;
+       }
+       if (posix_memalign(&super->buf, 512, super->len) != 0) {
+               if (devname)
+                       fprintf(stderr,
+                               Name ": unable to allocate %zu byte mpb buffer\n",
+                               super->len);
+               free(anchor);
+               return 2;
+       }
+       memcpy(super->buf, anchor, 512);
+
+       /* number of sectors beyond the anchor sector */
+       sectors = mpb_sectors(anchor) - 1;
+       free(anchor);
+       if (!sectors) {
+               check_sum = __gen_imsm_checksum(super->anchor);
+               if (check_sum != __le32_to_cpu(super->anchor->check_sum)) {
+                       if (devname)
+                               fprintf(stderr,
+                                       Name ": IMSM checksum %x != %x on %s\n",
+                                       check_sum,
+                                       __le32_to_cpu(super->anchor->check_sum),
+                                       devname);
+                       return 2;
+               }
+
+               rc = load_imsm_disk(fd, super, devname, 0);
+               if (rc == 0)
+                       rc = parse_raid_devices(super);
+               return rc;
+       }
+
+       /* read the extended mpb */
+       if (lseek64(fd, dsize - (512 * (2 + sectors)), SEEK_SET) < 0) {
+               if (devname)
+                       fprintf(stderr,
+                               Name ": Cannot seek to extended mpb on %s: %s\n",
+                               devname, strerror(errno));
+               return 1;
+       }
+
+       if (read(fd, super->buf + 512, super->len - 512) != super->len - 512) {
+               if (devname)
+                       fprintf(stderr,
+                               Name ": Cannot read extended mpb on %s: %s\n",
+                               devname, strerror(errno));
+               return 2;
+       }
+
+       /* a mismatch here is reported as 3 so that load_super_imsm_all()
+        * can retry when it may have raced against mdmon
+        */
+       check_sum = __gen_imsm_checksum(super->anchor);
+       if (check_sum != __le32_to_cpu(super->anchor->check_sum)) {
+               if (devname)
+                       fprintf(stderr,
+                               Name ": IMSM checksum %x != %x on %s\n",
+                               check_sum, __le32_to_cpu(super->anchor->check_sum),
+                               devname);
+               return 3;
+       }
+
+       /* FIXME the BBM log is disk specific so we cannot use this global
+        * buffer for all disks.  Ok for now since we only look at the global
+        * bbm_log_size parameter to gate assembly
+        */
+       super->bbm_log = __get_imsm_bbm_log(super->anchor);
+
+       rc = load_imsm_disk(fd, super, devname, 0);
+       if (rc == 0)
+               rc = parse_raid_devices(super);
+
+       return rc;
+}
+
+/* release one disk list entry together with everything it owns: the
+ * descriptor (when one is held), the device name and d->e
+ */
+static void __free_imsm_disk(struct dl *d)
+{
+       if (d->fd >= 0)
+               close(d->fd);
+
+       /* free() accepts NULL, no need to test the pointers first */
+       free(d->devname);
+       free(d->e);
+       free(d);
+}
+/* empty both the super->disks and the super->missing list */
+static void free_imsm_disks(struct intel_super *super)
+{
+       struct dl *victim;
+
+       /* unlink the head before releasing it so the list stays valid */
+       while ((victim = super->disks) != NULL) {
+               super->disks = victim->next;
+               __free_imsm_disk(victim);
+       }
+
+       while ((victim = super->missing) != NULL) {
+               super->missing = victim->next;
+               __free_imsm_disk(victim);
+       }
+}
+
+/* free all the pieces hanging off of a super pointer; the disk lists
+ * are only released when @free_disks is set.  @super itself is kept.
+ */
+static void __free_imsm(struct intel_super *super, int free_disks)
+{
+       /* free(NULL) is a no-op, so the pointers need no guard */
+       free(super->buf);
+       super->buf = NULL;
+
+       if (free_disks)
+               free_imsm_disks(super);
+       free_devlist(super);
+
+       free((void *) super->hba);
+       super->hba = NULL;
+}
+
+/* release @super completely: everything hanging off of it, including
+ * the disk lists, and then the structure itself
+ */
+static void free_imsm(struct intel_super *super)
+{
+       __free_imsm(super, 1);
+       free(super);
+}
+
+/* drop the imsm metadata attached to @st, if any, and clear st->sb */
+static void free_super_imsm(struct supertype *st)
+{
+       struct intel_super *super = st->sb;
+
+       if (super) {
+               free_imsm(super);
+               st->sb = NULL;
+       }
+}
+
+/* alloc_super - allocate and initialise an empty intel_super
+ * @creating_imsm: stored in super->creating_imsm
+ *
+ * Unless IMSM_NO_PLATFORM is set the option-rom is looked up; when one
+ * is found and IMSM_TEST_OROM is not set, the path of the first ahci
+ * controller with vendor id 0x8086 is recorded in super->hba.
+ *
+ * Returns NULL when the allocation fails.
+ */
+static struct intel_super *alloc_super(int creating_imsm)
+{
+       struct sys_dev *list, *ent;
+       struct intel_super *super = malloc(sizeof(*super));
+
+       if (!super)
+               return NULL;
+
+       memset(super, 0, sizeof(*super));
+       super->creating_imsm = creating_imsm;
+       super->current_vol = -1;
+       super->create_offset = ~((__u32 ) 0);
+
+       if (!check_env("IMSM_NO_PLATFORM"))
+               super->orom = find_imsm_orom();
+
+       /* no hba lookup without an orom or in orom test mode */
+       if (!super->orom || check_env("IMSM_TEST_OROM"))
+               return super;
+
+       /* find the first intel ahci controller */
+       list = find_driver_devices("pci", "ahci");
+       ent = list;
+       while (ent && devpath_to_vendor(ent->path) != 0x8086)
+               ent = ent->next;
+
+       if (ent) {
+               /* take the path out of the list entry so that
+                * free_sys_dev() is not handed it as well
+                */
+               super->hba = ent->path;
+               ent->path = NULL;
+       }
+       free_sys_dev(&list);
+
+       return super;
+}
+
+#ifndef MDASSEMBLE
+/* find_missing - helper routine for load_super_imsm_all that identifies
+ * disks that have disappeared from the system.  This routine relies on
+ * the mpb being uptodate, which it is at load time.
+ *
+ * Every disk of the mpb without a matching serial in super->disks gets
+ * a placeholder entry on super->missing.
+ *
+ * Returns 0 on success, 1 when an allocation fails.
+ */
+static int find_missing(struct intel_super *super)
+{
+       struct imsm_super *mpb = super->anchor;
+       int slot = 0;
+
+       while (slot < mpb->num_disks) {
+               struct imsm_disk *disk = __get_imsm_disk(mpb, slot);
+
+               if (!serial_to_dl(disk->serial, super)) {
+                       struct dl *gone = malloc(sizeof(*gone));
+
+                       if (!gone)
+                               return 1;
+
+                       /* no device node and no descriptor behind it */
+                       gone->fd = -1;
+                       gone->major = 0;
+                       gone->minor = 0;
+                       gone->e = NULL;
+                       gone->devname = strdup("missing");
+
+                       gone->index = slot;
+                       gone->disk = *disk;
+                       serialcpy(gone->serial, disk->serial);
+
+                       gone->next = super->missing;
+                       super->missing = gone;
+               }
+               slot++;
+       }
+
+       return 0;
+}
+
+/* load_super_imsm_all - load the metadata of an assembled container
+ * @st: supertype to update; st->subarray optionally selects a volume
+ * @fd: descriptor of the container md device
+ * @sbp: on success receives the loaded intel_super
+ * @devname: not used by this routine
+ * @keep_fd: when set the member disks are opened read-write and their
+ *           descriptors are kept in the disk list
+ *
+ * Scans all member disks for the anchor with the highest generation
+ * number, loads it, and then rebuilds the disk list against it.
+ *
+ * Returns 0 on success and a non-zero value otherwise.
+ *
+ * NOTE(review): the mdinfo returned by sysfs_read() is not released on
+ * any path of this routine - confirm the matching free routine and add
+ * it.
+ */
+static int load_super_imsm_all(struct supertype *st, int fd, void **sbp,
+                              char *devname, int keep_fd)
+{
+       struct mdinfo *sra;
+       struct intel_super *super;
+       struct mdinfo *sd, *best = NULL;
+       __u32 bestgen = 0;
+       __u32 gen;
+       char nm[20];
+       int dfd;
+       int rv;
+       int devnum = fd2devnum(fd);
+       int retry;
+       enum sysfs_read_flags flags;
+
+       flags = GET_LEVEL|GET_VERSION|GET_DEVS|GET_STATE;
+       if (mdmon_running(devnum))
+               flags |= SKIP_GONE_DEVS;
+
+       /* check if 'fd' an opened container */
+       sra = sysfs_read(fd, 0, flags);
+       if (!sra)
+               return 1;
+
+       if (sra->array.major_version != -1 ||
+           sra->array.minor_version != -2 ||
+           strcmp(sra->text_version, "imsm") != 0)
+               return 1;
+
+       super = alloc_super(0);
+       if (!super)
+               return 1;
+
+       /* find the most up to date disk in this array, skipping spares */
+       for (sd = sra->devs; sd; sd = sd->next) {
+               snprintf(nm, sizeof(nm), "%d:%d",
+                        sd->disk.major, sd->disk.minor);
+               dfd = dev_open(nm, keep_fd ? O_RDWR : O_RDONLY);
+               if (dfd < 0) {
+                       free_imsm(super);
+                       return 2;
+               }
+               rv = load_imsm_mpb(dfd, super, NULL);
+
+               /* retry the load if we might have raced against mdmon */
+               if (rv == 3 && mdmon_running(devnum))
+                       for (retry = 0; retry < 3; retry++) {
+                               usleep(3000);
+                               rv = load_imsm_mpb(dfd, super, NULL);
+                               if (rv != 3)
+                                       break;
+                       }
+               /* load_imsm_mpb() never takes ownership of the descriptor,
+                * so it is closed here regardless of keep_fd; the disks
+                * are re-opened in the re-parse loop below
+                */
+               close(dfd);
+               if (rv == 0) {
+                       if (super->anchor->num_raid_devs == 0)
+                               gen = 0;
+                       else
+                               gen = __le32_to_cpu(super->anchor->generation_num);
+                       if (!best || gen > bestgen) {
+                               bestgen = gen;
+                               best = sd;
+                       }
+               } else {
+                       free_imsm(super);
+                       return rv;
+               }
+       }
+
+       if (!best) {
+               free_imsm(super);
+               return 1;
+       }
+
+       /* load the most up to date anchor */
+       snprintf(nm, sizeof(nm), "%d:%d", best->disk.major, best->disk.minor);
+       dfd = dev_open(nm, O_RDONLY);
+       if (dfd < 0) {
+               free_imsm(super);
+               return 1;
+       }
+       rv = load_imsm_mpb(dfd, super, NULL);
+       close(dfd);
+       if (rv != 0) {
+               free_imsm(super);
+               return 2;
+       }
+
+       /* re-parse the disk list with the current anchor */
+       for (sd = sra->devs ; sd ; sd = sd->next) {
+               snprintf(nm, sizeof(nm), "%d:%d",
+                        sd->disk.major, sd->disk.minor);
+               dfd = dev_open(nm, keep_fd? O_RDWR : O_RDONLY);
+               if (dfd < 0) {
+                       free_imsm(super);
+                       return 2;
+               }
+               load_imsm_disk(dfd, super, NULL, keep_fd);
+               if (!keep_fd)
+                       close(dfd);
+       }
+
+
+       if (find_missing(super) != 0) {
+               free_imsm(super);
+               return 2;
+       }
+
+       if (st->subarray[0]) {
+               int vol = atoi(st->subarray);
+
+               /* volumes are numbered 0 .. num_raid_devs - 1 */
+               if (vol < 0 || vol >= super->anchor->num_raid_devs) {
+                       free_imsm(super);
+                       return 1;
+               }
+               super->current_vol = vol;
+       }
+
+       *sbp = super;
+       st->container_dev = devnum;
+       if (st->ss == NULL) {
+               st->ss = &super_imsm;
+               st->minor_version = 0;
+               st->max_devs = IMSM_MAX_DEVICES;
+       }
+       st->loaded_container = 1;
+
+       return 0;
+}
+#endif
+
+/* load_super_imsm - load imsm metadata through @fd
+ *
+ * Unless built with MDASSEMBLE, @fd is first tried as an assembled
+ * container.  Otherwise, or when that fails, the metadata is read from
+ * @fd as a single member disk, which is not possible while a subarray
+ * is selected in @st.
+ *
+ * Returns 0 on success, otherwise 1 or the load_imsm_mpb() error.
+ */
+static int load_super_imsm(struct supertype *st, int fd, char *devname)
+{
+       struct intel_super *super;
+       int rv;
+
+#ifndef MDASSEMBLE
+       rv = load_super_imsm_all(st, fd, &st->sb, devname, 1);
+       if (rv == 0)
+               return 0;
+#endif
+       if (st->subarray[0])
+               return 1; /* FIXME */
+
+       super = alloc_super(0);
+       if (!super) {
+               fprintf(stderr,
+                       Name ": malloc of %zu failed.\n",
+                       sizeof(*super));
+               return 1;
+       }
+
+       rv = load_imsm_mpb(fd, super, devname);
+       if (rv == 0) {
+               st->sb = super;
+               if (st->ss == NULL) {
+                       st->ss = &super_imsm;
+                       st->minor_version = 0;
+                       st->max_devs = IMSM_MAX_DEVICES;
+               }
+               /* loaded from a single disk, not from a container */
+               st->loaded_container = 0;
+
+               return 0;
+       }
+
+       if (devname)
+               fprintf(stderr,
+                       Name ": Failed to load all information "
+                       "sections on %s\n", devname);
+       free_imsm(super);
+
+       return rv;
+}
+
+/* strip size in 512 byte blocks: fixed at 128 for level 1, otherwise
+ * derived from the chunk size given in bytes
+ */
+static __u16 info_to_blocks_per_strip(mdu_array_info_t *info)
+{
+       return info->level == 1 ? 128 : info->chunk_size >> 9;
+}
+
+/* number of strips that fit into twice info->size, divided by the
+ * number of domains
+ */
+static __u32 info_to_num_data_stripes(mdu_array_info_t *info, int num_domains)
+{
+       __u32 strips = (info->size * 2) / info_to_blocks_per_strip(info);
+
+       return strips / num_domains;
+}
+
+/* twice info->size; for every level but 1 the value is additionally
+ * masked with ~(blocks_per_strip - 1)
+ */
+static __u32 info_to_blocks_per_member(mdu_array_info_t *info)
+{
+       if (info->level != 1)
+               return (info->size * 2) & ~(info_to_blocks_per_strip(info) - 1);
+
+       return info->size * 2;
+}
+
+/* imsm_update_version_info - recompute the attribute flags and the
+ * version string of the mpb from its raid devices.
+ *
+ * The version string is written directly behind MPB_SIGNATURE in
+ * mpb->sig.  It is rewritten for every raid device, so the choice made
+ * for the last device is the one that remains.  Nothing is changed
+ * when the mpb has no raid devices.
+ */
+static void imsm_update_version_info(struct intel_super *super)
+{
+       /* update the version and attributes */
+       struct imsm_super *mpb = super->anchor;
+       char *version;
+       struct imsm_dev *dev;
+       struct imsm_map *map;
+       int i;
+
+       for (i = 0; i < mpb->num_raid_devs; i++) {
+               dev = get_imsm_dev(super, i);
+               map = get_imsm_map(dev, 0);
+               if (__le32_to_cpu(dev->size_high) > 0)
+                       mpb->attributes |= MPB_ATTRIB_2TB;
+
+               /* FIXME detect when an array spans a port multiplier */
+               #if 0
+               mpb->attributes |= MPB_ATTRIB_PM;
+               #endif
+
+               /* more than one volume, or any attribute besides checksum
+                * verification, selects the attribute based version; flags
+                * set here persist into the following iterations
+                */
+               if (mpb->num_raid_devs > 1 ||
+                   mpb->attributes != MPB_ATTRIB_CHECKSUM_VERIFY) {
+                       version = MPB_VERSION_ATTRIBS;
+                       switch (get_imsm_raid_level(map)) {
+                       case 0: mpb->attributes |= MPB_ATTRIB_RAID0; break;
+                       case 1: mpb->attributes |= MPB_ATTRIB_RAID1; break;
+                       case 10: mpb->attributes |= MPB_ATTRIB_RAID10; break;
+                       case 5: mpb->attributes |= MPB_ATTRIB_RAID5; break;
+                       }
+               } else {
+                       /* otherwise the first matching test decides */
+                       if (map->num_members >= 5)
+                               version = MPB_VERSION_5OR6_DISK_ARRAY;
+                       else if (dev->status == DEV_CLONE_N_GO)
+                               version = MPB_VERSION_CNG;
+                       else if (get_imsm_raid_level(map) == 5)
+                               version = MPB_VERSION_RAID5;
+                       else if (map->num_members >= 3)
+                               version = MPB_VERSION_3OR4_DISK_ARRAY;
+                       else if (get_imsm_raid_level(map) == 1)
+                               version = MPB_VERSION_RAID1;
+                       else
+                               version = MPB_VERSION_RAID0;
+               }
+               strcpy(((char *) mpb->sig) + strlen(MPB_SIGNATURE), version);
+       }
+}
+
+/* init_super_imsm_volume - add a new volume to an existing container
+ * @st: supertype whose st->sb holds the loaded container
+ * @info: geometry of the volume to create
+ * @size, @homehost, @uuid: not used by this routine
+ * @name: volume name, must not be in use by another volume
+ *
+ * The new raid device is linked into super->devlist and selected as
+ * super->current_vol; its member disks are filled in later by
+ * add_to_super_imsm_volume().
+ *
+ * Returns 1 on success and 0 on failure.
+ */
+static int init_super_imsm_volume(struct supertype *st, mdu_array_info_t *info,
+                                 unsigned long long size, char *name,
+                                 char *homehost, int *uuid)
+{
+       /* We are creating a volume inside a pre-existing container.
+        * so st->sb is already set.
+        */
+       struct intel_super *super = st->sb;
+       struct imsm_super *mpb = super->anchor;
+       struct intel_dev *dv;
+       struct imsm_dev *dev;
+       struct imsm_vol *vol;
+       struct imsm_map *map;
+       int idx = mpb->num_raid_devs;
+       int i;
+       unsigned long long array_blocks;
+       size_t size_old, size_new;
+       __u32 num_data_stripes;
+
+       if (super->orom && mpb->num_raid_devs >= super->orom->vpa) {
+               fprintf(stderr, Name": This imsm-container already has the "
+                       "maximum of %d volumes\n", super->orom->vpa);
+               return 0;
+       }
+
+       /* ensure the mpb is large enough for the new data */
+       size_old = __le32_to_cpu(mpb->mpb_size);
+       size_new = disks_to_mpb_size(info->nr_disks);
+       if (size_new > size_old) {
+               void *mpb_new;
+               size_t size_round = ROUND_UP(size_new, 512);
+
+               if (posix_memalign(&mpb_new, 512, size_round) != 0) {
+                       fprintf(stderr, Name": could not allocate new mpb\n");
+                       return 0;
+               }
+               memcpy(mpb_new, mpb, size_old);
+               free(mpb);
+               mpb = mpb_new;
+               super->anchor = mpb_new;
+               mpb->mpb_size = __cpu_to_le32(size_new);
+               memset(mpb_new + size_old, 0, size_round - size_old);
+       }
+       super->current_vol = idx;
+       /* when creating the first raid device in this container set num_disks
+        * to zero, i.e. delete this spare and add raid member devices in
+        * add_to_super_imsm_volume()
+        */
+       if (super->current_vol == 0)
+               mpb->num_disks = 0;
+
+       for (i = 0; i < super->current_vol; i++) {
+               dev = get_imsm_dev(super, i);
+               if (strncmp((char *) dev->volume, name,
+                            MAX_RAID_SERIAL_LEN) == 0) {
+                       fprintf(stderr, Name": '%s' is already defined for this container\n",
+                               name);
+                       return 0;
+               }
+       }
+
+       sprintf(st->subarray, "%d", idx);
+       dv = malloc(sizeof(*dv));
+       if (!dv) {
+               fprintf(stderr, Name ": failed to allocate device list entry\n");
+               return 0;
+       }
+       /* one ordinal table slot per raid disk */
+       dev = malloc(sizeof(*dev) + sizeof(__u32) * (info->raid_disks - 1));
+       if (!dev) {
+               free(dv);
+               fprintf(stderr, Name": could not allocate raid device\n");
+               return 0;
+       }
+       strncpy((char *) dev->volume, name, MAX_RAID_SERIAL_LEN);
+       if (info->level == 1)
+               array_blocks = info_to_blocks_per_member(info);
+       else
+               array_blocks = calc_array_size(info->level, info->raid_disks,
+                                              info->layout, info->chunk_size,
+                                              info->size*2);
+       /* round array size down to closest MB */
+       array_blocks = (array_blocks >> SECT_PER_MB_SHIFT) << SECT_PER_MB_SHIFT;
+
+       dev->size_low = __cpu_to_le32((__u32) array_blocks);
+       dev->size_high = __cpu_to_le32((__u32) (array_blocks >> 32));
+       dev->status = __cpu_to_le32(0);
+       dev->reserved_blocks = __cpu_to_le32(0);
+       vol = &dev->vol;
+       vol->migr_state = 0;
+       set_migr_type(dev, MIGR_INIT);
+       vol->dirty = 0;
+       vol->curr_migr_unit = 0;
+       map = get_imsm_map(dev, 0);
+       map->pba_of_lba0 = __cpu_to_le32(super->create_offset);
+       map->blocks_per_member = __cpu_to_le32(info_to_blocks_per_member(info));
+       map->blocks_per_strip = __cpu_to_le16(info_to_blocks_per_strip(info));
+       map->failed_disk_num = ~0;
+       map->map_state = info->level ? IMSM_T_STATE_UNINITIALIZED :
+                                      IMSM_T_STATE_NORMAL;
+       map->ddf = 1;
+
+       if (info->level == 1 && info->raid_disks > 2) {
+               fprintf(stderr, Name": imsm does not support more than 2 disks"
+                               " in a raid1 volume\n");
+               /* neither allocation has been linked anywhere yet */
+               free(dev);
+               free(dv);
+               return 0;
+       }
+       if (info->level == 10) {
+               /* level 10 is stored as level 1 with raid_disks / 2 domains */
+               map->raid_level = 1;
+               map->num_domains = info->raid_disks / 2;
+       } else {
+               map->raid_level = info->level;
+               map->num_domains = 1;
+       }
+       num_data_stripes = info_to_num_data_stripes(info, map->num_domains);
+       map->num_data_stripes = __cpu_to_le32(num_data_stripes);
+
+       map->num_members = info->raid_disks;
+       for (i = 0; i < map->num_members; i++) {
+               /* initialized in add_to_super */
+               set_imsm_ord_tbl_ent(map, i, 0);
+       }
+       mpb->num_raid_devs++;
+
+       dv->dev = dev;
+       dv->index = super->current_vol;
+       dv->next = super->devlist;
+       super->devlist = dv;
+
+       imsm_update_version_info(super);
+
+       return 1;
+}
+
+/* init_super_imsm - start a new container, or a new volume in one
+ * @st: supertype to initialise
+ * @info: array geometry; NULL just clears st->sb
+ * @size, @name, @homehost, @uuid: only forwarded to
+ *     init_super_imsm_volume()
+ *
+ * Returns 1 on success and 0 on failure or when @info is NULL.
+ */
+static int init_super_imsm(struct supertype *st, mdu_array_info_t *info,
+                          unsigned long long size, char *name,
+                          char *homehost, int *uuid)
+{
+       /* This is primarily called by Create when creating a new array.
+        * We will then get add_to_super called for each component, and then
+        * write_init_super called to write it out to each device.
+        * For IMSM, Create can create on fresh devices or on a pre-existing
+        * array.
+        * To create on a pre-existing array a different method will be called.
+        * This one is just for fresh drives.
+        */
+       struct intel_super *super;
+       struct imsm_super *mpb;
+       size_t mpb_size;
+       char *version;
+
+       if (!info) {
+               st->sb = NULL;
+               return 0;
+       }
+       if (st->sb)
+               return init_super_imsm_volume(st, info, size, name, homehost,
+                                             uuid);
+
+       super = alloc_super(1);
+       if (!super)
+               return 0;
+       mpb_size = disks_to_mpb_size(info->nr_disks);
+       if (posix_memalign(&super->buf, 512, mpb_size) != 0) {
+               /* alloc_super() may have attached an hba path, so release
+                * through free_imsm() rather than a bare free()
+                */
+               super->buf = NULL;
+               free_imsm(super);
+               return 0;
+       }
+       mpb = super->buf;
+       memset(mpb, 0, mpb_size);
+
+       mpb->attributes = MPB_ATTRIB_CHECKSUM_VERIFY;
+
+       /* signature followed by the default version string */
+       version = (char *) mpb->sig;
+       strcpy(version, MPB_SIGNATURE);
+       version += strlen(MPB_SIGNATURE);
+       strcpy(version, MPB_VERSION_RAID0);
+       /* stored little-endian like everywhere else mpb_size is accessed */
+       mpb->mpb_size = __cpu_to_le32(mpb_size);
+
+       st->sb = super;
+       return 1;
+}
+
+#ifndef MDASSEMBLE
+/* add_to_super_imsm_volume - bind a container disk to the volume being created
+ *
+ * Looks up the disk in super->disks (by pre-marked raid_disk when fd is -1,
+ * otherwise by major/minor), gives it a disk-table index if it does not
+ * have one yet, and records that index in the current volume's map at
+ * slot dk->number.
+ *
+ * Returns 0 on success, 1 if the disk is not in-sync or is not found in
+ * the container's disk list.
+ */
+static int add_to_super_imsm_volume(struct supertype *st, mdu_disk_info_t *dk,
+                                    int fd, char *devname)
+{
+       struct intel_super *super = st->sb;
+       struct imsm_super *mpb = super->anchor;
+       struct dl *dl;
+       struct imsm_dev *dev;
+       struct imsm_map *map;
+
+       dev = get_imsm_dev(super, super->current_vol);
+       map = get_imsm_map(dev, 0);
+
+       /* only disks flagged MD_DISK_SYNC may become volume members */
+       if (! (dk->state & (1<<MD_DISK_SYNC))) {
+               fprintf(stderr, Name ": %s: Cannot add spare devices to IMSM volume\n",
+                       devname);
+               return 1;
+       }
+
+       if (fd == -1) {
+               /* we're doing autolayout so grab the pre-marked (in
+                * validate_geometry) raid_disk
+                */
+               for (dl = super->disks; dl; dl = dl->next)
+                       if (dl->raiddisk == dk->raid_disk)
+                               break;
+       } else {
+               /* explicit device: match on device number */
+               for (dl = super->disks; dl ; dl = dl->next)
+                       if (dl->major == dk->major &&
+                           dl->minor == dk->minor)
+                               break;
+       }
+
+       if (!dl) {
+               fprintf(stderr, Name ": %s is not a member of the same container\n", devname);
+               return 1;
+       }
+
+       /* add a pristine spare to the metadata */
+       if (dl->index < 0) {
+               dl->index = super->anchor->num_disks;
+               super->anchor->num_disks++;
+       }
+       set_imsm_ord_tbl_ent(map, dk->number, dl->index);
+       dl->disk.status = CONFIGURED_DISK | USABLE_DISK;
+
+       /* if we are creating the first raid device update the family number */
+       if (super->current_vol == 0) {
+               __u32 sum;
+               struct imsm_dev *_dev = __get_imsm_dev(mpb, 0);
+               struct imsm_disk *_disk = __get_imsm_disk(mpb, dl->index);
+
+               /* copy the device and disk records into the mpb before
+                * computing the checksum used as the family number
+                */
+               *_dev = *dev;
+               *_disk = dl->disk;
+               sum = __gen_imsm_checksum(mpb);
+               mpb->family_num = __cpu_to_le32(sum);
+       }
+
+       return 0;
+}
+
+/* add_to_super_imsm - register a disk with the container (or current volume)
+ *
+ * When a volume is being created (super->current_vol >= 0) the request is
+ * forwarded to add_to_super_imsm_volume().  Otherwise a new 'struct dl' is
+ * built for @fd (device numbers, serial, size, scsi id), marked as a usable
+ * spare, and linked onto super->add when updates are being queued
+ * (st->update_tail) or onto super->disks otherwise.
+ *
+ * Returns 0 on success and 1 on failure.  Failure to read the serial
+ * number aborts the process, as before.
+ */
+static int add_to_super_imsm(struct supertype *st, mdu_disk_info_t *dk,
+                             int fd, char *devname)
+{
+       struct intel_super *super = st->sb;
+       struct dl *dd;
+       unsigned long long size;
+       __u32 id;
+       int rv;
+       struct stat stb;
+
+       /* if we are on an RAID enabled platform check that the disk is
+        * attached to the raid controller
+        */
+       if (super->hba && !disk_attached_to_hba(fd, super->hba)) {
+               fprintf(stderr,
+                       Name ": %s is not attached to the raid controller: %s\n",
+                       devname ? : "disk", super->hba);
+               return 1;
+       }
+
+       if (super->current_vol >= 0)
+               return add_to_super_imsm_volume(st, dk, fd, devname);
+
+       /* stb feeds major/minor below; do not read it if fstat failed */
+       if (fstat(fd, &stb) != 0) {
+               fprintf(stderr, Name ": cannot stat %s: %s\n",
+                       devname ? : "disk", strerror(errno));
+               return 1;
+       }
+       dd = malloc(sizeof(*dd));
+       if (!dd) {
+               fprintf(stderr,
+                       Name ": malloc failed %s:%d.\n", __func__, __LINE__);
+               return 1;
+       }
+       memset(dd, 0, sizeof(*dd));
+       dd->major = major(stb.st_rdev);
+       dd->minor = minor(stb.st_rdev);
+       dd->index = -1;
+       dd->devname = devname ? strdup(devname) : NULL;
+       dd->fd = fd;
+       dd->e = NULL;
+       rv = imsm_read_serial(fd, devname, dd->serial);
+       if (rv) {
+               fprintf(stderr,
+                       Name ": failed to retrieve scsi serial, aborting\n");
+               free(dd);
+               abort();
+       }
+
+       /* get_dev_size() returns 0 on failure and leaves size unset */
+       if (!get_dev_size(fd, NULL, &size)) {
+               fprintf(stderr, Name ": cannot determine size of %s\n",
+                       devname ? : "disk");
+               free(dd->devname);
+               free(dd);
+               return 1;
+       }
+       size /= 512; /* bytes -> 512-byte sectors */
+       serialcpy(dd->disk.serial, dd->serial);
+       dd->disk.total_blocks = __cpu_to_le32(size);
+       dd->disk.status = USABLE_DISK | SPARE_DISK;
+       if (sysfs_disk_to_scsi_id(fd, &id) == 0)
+               dd->disk.scsi_id = __cpu_to_le32(id);
+       else
+               dd->disk.scsi_id = __cpu_to_le32(0);
+
+       if (st->update_tail) {
+               dd->next = super->add;
+               super->add = dd;
+       } else {
+               dd->next = super->disks;
+               super->disks = dd;
+       }
+
+       return 0;
+}
+
+static int store_imsm_mpb(int fd, struct intel_super *super);
+
+/* write_super_imsm_spares - write a minimal mpb to every spare disk
+ *
+ * spare records have their own family number and do not have any defined
+ * raid devices.  The anchor is temporarily rewritten as a one-disk,
+ * zero-volume mpb, stored on each disk whose index is -1, and then restored
+ * from a saved copy.
+ *
+ * Returns 0 on success, 1 as soon as one store fails.  When @doclose is
+ * set each spare's fd is closed after it has been written.
+ */
+static int write_super_imsm_spares(struct intel_super *super, int doclose)
+{
+       struct imsm_super mpb_save;
+       struct imsm_super *mpb = super->anchor;
+       __u32 sum;
+       struct dl *d;
+
+       mpb_save = *mpb;
+       mpb->num_raid_devs = 0;
+       mpb->num_disks = 1;
+       /* mpb_size is kept little-endian, matching write_super_imsm() */
+       mpb->mpb_size = __cpu_to_le32(sizeof(struct imsm_super));
+       mpb->generation_num = __cpu_to_le32(1UL);
+
+       for (d = super->disks; d; d = d->next) {
+               if (d->index != -1)
+                       continue;
+
+               mpb->disk[0] = d->disk;
+               /* first checksum becomes the family number, the second
+                * one covers the mpb including that family number
+                */
+               sum = __gen_imsm_checksum(mpb);
+               mpb->family_num = __cpu_to_le32(sum);
+               sum = __gen_imsm_checksum(mpb);
+               mpb->check_sum = __cpu_to_le32(sum);
+
+               if (store_imsm_mpb(d->fd, super)) {
+                       fprintf(stderr, "%s: failed for device %d:%d %s\n",
+                               __func__, d->major, d->minor, strerror(errno));
+                       *mpb = mpb_save;
+                       return 1;
+               }
+               if (doclose) {
+                       close(d->fd);
+                       d->fd = -1;
+               }
+       }
+
+       *mpb = mpb_save;
+       return 0;
+}
+
+/* write_super_imsm - regenerate the anchor mpb and store it on member disks
+ *
+ * Bumps the generation number, copies the in-memory disk and device
+ * records into the mpb, recomputes mpb_size and the checksum, and stores
+ * the result on every disk with index >= 0.  A failed store is reported
+ * but does not stop the loop.  If any disk has index -1 the return value
+ * is that of write_super_imsm_spares(); otherwise 0 is returned.
+ * When @doclose is set each written disk's fd is closed.
+ */
+static int write_super_imsm(struct intel_super *super, int doclose)
+{
+       struct imsm_super *mpb = super->anchor;
+       struct dl *d;
+       __u32 generation;
+       __u32 sum;
+       int spares = 0;
+       int i;
+       /* start from the header size without its embedded disk entry */
+       __u32 mpb_size = sizeof(struct imsm_super) - sizeof(struct imsm_disk);
+
+       /* 'generation' is incremented everytime the metadata is written */
+       generation = __le32_to_cpu(mpb->generation_num);
+       generation++;
+       mpb->generation_num = __cpu_to_le32(generation);
+
+       mpb_size += sizeof(struct imsm_disk) * mpb->num_disks;
+       /* refresh the on-disk disk table; spares (index -1) are only counted */
+       for (d = super->disks; d; d = d->next) {
+               if (d->index == -1)
+                       spares++;
+               else
+                       mpb->disk[d->index] = d->disk;
+       }
+       for (d = super->missing; d; d = d->next)
+               mpb->disk[d->index] = d->disk;
+
+       /* copy each raid device into the mpb and account for its size */
+       for (i = 0; i < mpb->num_raid_devs; i++) {
+               struct imsm_dev *dev = __get_imsm_dev(mpb, i);
+
+               imsm_copy_dev(dev, get_imsm_dev(super, i));
+               mpb_size += sizeof_imsm_dev(dev, 0);
+       }
+       mpb_size += __le32_to_cpu(mpb->bbm_log_size);
+       mpb->mpb_size = __cpu_to_le32(mpb_size);
+
+       /* recalculate checksum */
+       sum = __gen_imsm_checksum(mpb);
+       mpb->check_sum = __cpu_to_le32(sum);
+
+       /* write the mpb for disks that compose raid devices */
+       for (d = super->disks; d ; d = d->next) {
+               if (d->index < 0)
+                       continue;
+               if (store_imsm_mpb(d->fd, super))
+                       fprintf(stderr, "%s: failed for device %d:%d %s\n",
+                               __func__, d->major, d->minor, strerror(errno));
+               if (doclose) {
+                       close(d->fd);
+                       d->fd = -1;
+               }
+       }
+
+       if (spares)
+               return write_super_imsm_spares(super, doclose);
+
+       return 0;
+}
+
+
+/* create_array - queue an update_create_array record for the current volume
+ *
+ * The record carries a copy of the volume's imsm_dev followed by one
+ * disk_info (serial number) per member.  Returns 0 once the update has been
+ * appended, 1 if the update buffer cannot be allocated.
+ */
+static int create_array(struct supertype *st)
+{
+       struct intel_super *super = st->sb;
+       struct imsm_dev *vol = get_imsm_dev(super, super->current_vol);
+       struct imsm_map *vol_map = get_imsm_map(vol, 0);
+       struct imsm_update_create_array *update;
+       struct disk_info *serials;
+       size_t update_len;
+       int slot;
+
+       /* the embedded imsm_dev is replaced by its real, variable size */
+       update_len = sizeof(*update) - sizeof(*vol) + sizeof_imsm_dev(vol, 0) +
+                    sizeof(*serials) * vol_map->num_members;
+       update = malloc(update_len);
+       if (update == NULL) {
+               fprintf(stderr, "%s: failed to allocate update buffer\n",
+                       __func__);
+               return 1;
+       }
+
+       update->type = update_create_array;
+       update->dev_idx = super->current_vol;
+       imsm_copy_dev(&update->dev, vol);
+
+       serials = get_disk_info(update);
+       for (slot = 0; slot < vol_map->num_members; slot++) {
+               int disk_idx = get_imsm_disk_idx(vol, slot);
+               struct imsm_disk *member = get_imsm_disk(super, disk_idx);
+
+               serialcpy(serials[slot].serial, member->serial);
+       }
+
+       append_metadata_update(st, update, update_len);
+       return 0;
+}
+
+/* _add_disk - queue an update_add_disk record when disks are pending
+ *
+ * Does nothing (returns 0) when super->add is empty.  Returns 1 if the
+ * update buffer cannot be allocated, 0 after the update has been appended.
+ */
+static int _add_disk(struct supertype *st)
+{
+       struct intel_super *super = st->sb;
+       struct imsm_update_add_disk *update;
+       size_t update_len = sizeof(*update);
+
+       if (super->add == NULL)
+               return 0;
+
+       update = malloc(update_len);
+       if (update == NULL) {
+               fprintf(stderr, "%s: failed to allocate update buffer\n",
+                       __func__);
+               return 1;
+       }
+
+       update->type = update_add_disk;
+       append_metadata_update(st, update, update_len);
+
+       return 0;
+}
+
+/* write_init_super_imsm - commit freshly initialised metadata
+ *
+ * Without an update queue (st->update_tail unset) the metadata is written
+ * directly and the disk fds are closed.  With an update queue, the new
+ * array or added disk is queued as a metadata update instead.
+ */
+static int write_init_super_imsm(struct supertype *st)
+{
+       struct intel_super *super = st->sb;
+       struct dl *disk;
+       int result;
+
+       if (!st->update_tail)
+               return write_super_imsm(st->sb, 1);
+
+       /* determine if we are creating a volume or adding a disk */
+       if (super->current_vol < 0) {
+               /* in the add disk case we are running in mdmon
+                * context, so don't close fd's
+                */
+               return _add_disk(st);
+       }
+
+       result = create_array(st);
+
+       for (disk = super->disks; disk != NULL; disk = disk->next) {
+               close(disk->fd);
+               disk->fd = -1;
+       }
+
+       return result;
+}
+#endif
+
+/* store_zero_imsm - wipe the imsm anchor sector on @fd
+ *
+ * Writes 512 zero bytes at the second to last sector of the device.
+ * Returns 0 on success, 1 if the device size cannot be read, or if the
+ * seek, the buffer allocation or the write fails.
+ */
+static int store_zero_imsm(struct supertype *st, int fd)
+{
+       unsigned long long dsize;
+       void *buf;
+       int rv;
+
+       /* get_dev_size() returns 0 on failure and leaves dsize unset */
+       if (!get_dev_size(fd, NULL, &dsize))
+               return 1;
+
+       /* first block is stored on second to last sector of the disk */
+       if (lseek64(fd, dsize - (512 * 2), SEEK_SET) < 0)
+               return 1;
+
+       if (posix_memalign(&buf, 512, 512) != 0)
+               return 1;
+
+       memset(buf, 0, 512);
+       rv = (write(fd, buf, 512) != 512) ? 1 : 0;
+       /* the scratch sector is no longer needed on either outcome */
+       free(buf);
+
+       return rv;
+}
+
+/* imsm_bbm_log_size - bbm_log_size field of @mpb converted to cpu order */
+static int imsm_bbm_log_size(struct imsm_super *mpb)
+{
+       int log_bytes = __le32_to_cpu(mpb->bbm_log_size);
+
+       return log_bytes;
+}
+
+#ifndef MDASSEMBLE
+/* validate_geometry_imsm_container - check that @dev can join a container
+ *
+ * Returns 0 for any level other than LEVEL_CONTAINER, 1 when no device is
+ * given.  Otherwise the disk count is checked against the option-rom limit
+ * (unless IMSM_NO_PLATFORM is set in the environment), the device is
+ * opened exclusively to read its size, and *freesize receives
+ * avail_size_imsm() of that size in sectors.
+ */
+static int validate_geometry_imsm_container(struct supertype *st, int level,
+                                           int layout, int raiddisks, int chunk,
+                                           unsigned long long size, char *dev,
+                                           unsigned long long *freesize,
+                                           int verbose)
+{
+       const struct imsm_orom *orom;
+       unsigned long long dev_bytes;
+       int dev_fd;
+
+       if (level != LEVEL_CONTAINER)
+               return 0;
+       if (dev == NULL)
+               return 1;
+
+       orom = check_env("IMSM_NO_PLATFORM") ? NULL : find_imsm_orom();
+       if (orom != NULL && raiddisks > orom->tds) {
+               if (verbose)
+                       fprintf(stderr, Name ": %d exceeds maximum number of"
+                               " platform supported disks: %d\n",
+                               raiddisks, orom->tds);
+               return 0;
+       }
+
+       dev_fd = open(dev, O_RDONLY|O_EXCL, 0);
+       if (dev_fd < 0) {
+               if (verbose)
+                       fprintf(stderr, Name ": imsm: Cannot open %s: %s\n",
+                               dev, strerror(errno));
+               return 0;
+       }
+
+       if (!get_dev_size(dev_fd, dev, &dev_bytes)) {
+               close(dev_fd);
+               return 0;
+       }
+       close(dev_fd);
+
+       /* bytes -> 512-byte sectors */
+       *freesize = avail_size_imsm(st, dev_bytes >> 9);
+
+       return 1;
+}
+
+/* find_size - length of the merged extent that begins at e[*idx]
+ *
+ * Starting from e[*idx], later entries whose start lies within
+ * [base_start, end] are folded in, growing 'end' as needed.  Returns 0 if
+ * the starting entry, or any entry folded in, has size 0.  *idx is advanced
+ * past the starting entry, and set to the first entry that starts beyond
+ * the merged region when such an entry is found before @num_extents.
+ * NOTE(review): the caller sorts @e with cmp_extent first; presumably that
+ * orders by start - confirm.
+ */
+static unsigned long long find_size(struct extent *e, int *idx, int num_extents)
+{
+       const unsigned long long base_start = e[*idx].start;
+       unsigned long long end = base_start + e[*idx].size;
+       int i;
+
+       /* a zero-sized starting extent yields no region */
+       if (base_start == end)
+               return 0;
+
+       *idx = *idx + 1;
+       for (i = *idx; i < num_extents; i++) {
+               /* extend overlapping extents */
+               if (e[i].start >= base_start &&
+                   e[i].start <= end) {
+                       if (e[i].size == 0)
+                               return 0;
+                       if (e[i].start + e[i].size > end)
+                               end = e[i].start + e[i].size;
+               } else if (e[i].start > end) {
+                       /* first extent past the merged region: resume here */
+                       *idx = i;
+                       break;
+               }
+       }
+
+       return end - base_start;
+}
+
+/* merge_extents - find the largest free region common to all recorded disks
+ *
+ * Collects the extents attached to every disk in super->disks (dl->e),
+ * sorts and merges them, then picks the largest gap between merged
+ * extents.  On success super->create_offset is set to the start of that
+ * gap (plus IMSM_RESERVED_SECTORS when the gap is not the first one) and
+ * the usable size of the gap is returned.
+ *
+ * @sum_extents must be the total of dl->extent_cnt over those disks.
+ * Returns ~0ULL on error: nothing to merge, allocation failure, a gap
+ * smaller than the reserve, or a start that does not fit in 32 bits.
+ */
+static unsigned long long merge_extents(struct intel_super *super, int sum_extents)
+{
+       /* build a composite disk with all known extents and generate a new
+        * 'maxsize' given the "all disks in an array must share a common start
+        * offset" constraint
+        */
+       struct extent *e;
+       struct dl *dl;
+       int i, j;
+       int start_extent;
+       unsigned long long pos;
+       unsigned long long start = 0;
+       unsigned long long maxsize;
+       unsigned long reserve;
+
+       /* with no extents the gap scan below would read e[0] of an
+        * empty allocation
+        */
+       if (sum_extents <= 0)
+               return ~0ULL; /* error */
+
+       e = calloc(sum_extents, sizeof(*e));
+       if (!e)
+               return ~0ULL; /* error */
+
+       /* coalesce and sort all extents. also, check to see if we need to
+        * reserve space between member arrays
+        */
+       j = 0;
+       for (dl = super->disks; dl; dl = dl->next) {
+               if (!dl->e)
+                       continue;
+               for (i = 0; i < dl->extent_cnt; i++)
+                       e[j++] = dl->e[i];
+       }
+       qsort(e, sum_extents, sizeof(*e), cmp_extent);
+
+       /* merge extents */
+       i = 0;
+       j = 0;
+       while (i < sum_extents) {
+               e[j].start = e[i].start;
+               e[j].size = find_size(e, &i, sum_extents);
+               j++;
+               if (e[j-1].size == 0)
+                       break;
+       }
+
+       /* scan the gaps between merged extents; a zero-sized extent
+        * terminates the list
+        */
+       pos = 0;
+       maxsize = 0;
+       start_extent = 0;
+       i = 0;
+       do {
+               unsigned long long esize;
+
+               esize = e[i].start - pos;
+               if (esize >= maxsize) {
+                       maxsize = esize;
+                       start = pos;
+                       start_extent = i;
+               }
+               pos = e[i].start + e[i].size;
+               i++;
+       } while (e[i-1].size);
+       free(e);
+
+       if (start_extent > 0)
+               reserve = IMSM_RESERVED_SECTORS; /* gap between raid regions */
+       else
+               reserve = 0;
+
+       if (maxsize < reserve)
+               return ~0ULL;
+
+       super->create_offset = ~((__u32) 0);
+       if (start + reserve > super->create_offset)
+               return ~0ULL; /* start overflows create_offset */
+       super->create_offset = start + reserve;
+
+       return maxsize - reserve;
+}
+
+/* is_raid_level_supported - can @level with @raiddisks disks be created?
+ *
+ * Negative levels, raid4 and raid6 are rejected regardless of platform.
+ * Without an option-rom every other level is accepted; with one, the level
+ * must be advertised by the orom and the disk count must fit that level.
+ * Returns non-zero when supported, 0 otherwise.
+ */
+static int is_raid_level_supported(const struct imsm_orom *orom, int level, int raiddisks)
+{
+       if (level < 0 || level == 4 || level == 6)
+               return 0;
+
+       /* not on an Intel RAID platform so anything goes */
+       if (!orom)
+               return 1;
+
+       /* if we have an orom prevent invalid raid levels */
+       if (level == 0)
+               return imsm_orom_has_raid0(orom);
+
+       if (level == 1) {
+               if (raiddisks > 2)
+                       return imsm_orom_has_raid1e(orom);
+               return imsm_orom_has_raid1(orom) && raiddisks == 2;
+       }
+
+       if (level == 10)
+               return imsm_orom_has_raid10(orom) && raiddisks == 4;
+
+       if (level == 5)
+               return imsm_orom_has_raid5(orom) && raiddisks > 2;
+
+       return 0;
+}
+
+/* pr_vrb - print a Name-prefixed message to stderr when 'verbose' is set */
+#define pr_vrb(fmt, arg...) (void) (verbose && fprintf(stderr, Name fmt, ##arg))
+/* validate_geometry_imsm_volume - lifted from validate_geometry_ddf_bvd
+ * FIX ME add ahci details
+ *
+ * Checks that a volume of the given level/layout/chunk/size can be created
+ * in the already loaded container (st->sb).
+ *
+ * With @dev == NULL only a general test is made: at least @raiddisks disks
+ * must offer a free region of the requested size at a common start offset.
+ * With @dev set, the device must be a block device belonging to the
+ * container; its extents are recorded in dl->e, merged with those of the
+ * other recorded disks, and the resulting size is stored in *freesize.
+ *
+ * Returns 1 if the geometry is acceptable, 0 otherwise.
+ */
+static int validate_geometry_imsm_volume(struct supertype *st, int level,
+                                        int layout, int raiddisks, int chunk,
+                                        unsigned long long size, char *dev,
+                                        unsigned long long *freesize,
+                                        int verbose)
+{
+       struct stat stb;
+       struct intel_super *super = st->sb;
+       struct imsm_super *mpb;
+       struct dl *dl;
+       unsigned long long pos = 0;
+       unsigned long long maxsize;
+       struct extent *e;
+       int i;
+
+       /* We must have the container info already read in. */
+       if (!super)
+               return 0;
+       /* only touch the anchor once super is known to be valid */
+       mpb = super->anchor;
+
+       if (!is_raid_level_supported(super->orom, level, raiddisks)) {
+               pr_vrb(": platform does not support raid%d with %d disk%s\n",
+                       level, raiddisks, raiddisks > 1 ? "s" : "");
+               return 0;
+       }
+       if (super->orom && level != 1 &&
+           !imsm_orom_has_chunk(super->orom, chunk)) {
+               pr_vrb(": platform does not support a chunk size of: %d\n", chunk);
+               return 0;
+       }
+       if (layout != imsm_level_to_layout(level)) {
+               if (level == 5)
+                       pr_vrb(": imsm raid 5 only supports the left-asymmetric layout\n");
+               else if (level == 10)
+                       pr_vrb(": imsm raid 10 only supports the n2 layout\n");
+               else
+                       pr_vrb(": imsm unknown layout %#x for this raid level %d\n",
+                               layout, level);
+               return 0;
+       }
+
+       if (!dev) {
+               /* General test:  make sure there is space for
+                * 'raiddisks' device extents of size 'size' at a given
+                * offset
+                */
+               unsigned long long minsize = size;
+               unsigned long long start_offset = ~0ULL;
+               int dcnt = 0;
+               if (minsize == 0)
+                       minsize = MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS;
+               for (dl = super->disks; dl ; dl = dl->next) {
+                       int found = 0;
+
+                       pos = 0;
+                       i = 0;
+                       e = get_extents(super, dl);
+                       if (!e) continue;
+                       /* the first disk with a large enough gap fixes
+                        * start_offset; later disks must match it
+                        */
+                       do {
+                               unsigned long long esize;
+                               esize = e[i].start - pos;
+                               if (esize >= minsize)
+                                       found = 1;
+                               if (found && start_offset == ~0ULL) {
+                                       start_offset = pos;
+                                       break;
+                               } else if (found && pos != start_offset) {
+                                       found = 0;
+                                       break;
+                               }
+                               pos = e[i].start + e[i].size;
+                               i++;
+                       } while (e[i-1].size);
+                       if (found)
+                               dcnt++;
+                       free(e);
+               }
+               if (dcnt < raiddisks) {
+                       if (verbose)
+                               fprintf(stderr, Name ": imsm: Not enough "
+                                       "devices with space for this array "
+                                       "(%d < %d)\n",
+                                       dcnt, raiddisks);
+                       return 0;
+               }
+               return 1;
+       }
+
+       /* This device must be a member of the set */
+       if (stat(dev, &stb) < 0)
+               return 0;
+       if ((S_IFMT & stb.st_mode) != S_IFBLK)
+               return 0;
+       for (dl = super->disks ; dl ; dl = dl->next) {
+               if (dl->major == major(stb.st_rdev) &&
+                   dl->minor == minor(stb.st_rdev))
+                       break;
+       }
+       if (!dl) {
+               if (verbose)
+                       fprintf(stderr, Name ": %s is not in the "
+                               "same imsm set\n", dev);
+               return 0;
+       } else if (super->orom && dl->index < 0 && mpb->num_raid_devs) {
+               /* If a volume is present then the current creation attempt
+                * cannot incorporate new spares because the orom may not
+                * understand this configuration (all member disks must be
+                * members of each array in the container).
+                */
+               fprintf(stderr, Name ": %s is a spare and a volume"
+                       " is already defined for this container\n", dev);
+               fprintf(stderr, Name ": The option-rom requires all member"
+                       " disks to be a member of all volumes\n");
+               return 0;
+       }
+
+       /* retrieve the largest free space block */
+       e = get_extents(super, dl);
+       maxsize = 0;
+       i = 0;
+       if (e) {
+               do {
+                       unsigned long long esize;
+
+                       esize = e[i].start - pos;
+                       if (esize >= maxsize)
+                               maxsize = esize;
+                       pos = e[i].start + e[i].size;
+                       i++;
+               } while (e[i-1].size);
+               /* keep the extents on the disk for merge_extents() */
+               dl->e = e;
+               dl->extent_cnt = i;
+       } else {
+               if (verbose)
+                       fprintf(stderr, Name ": unable to determine free space for: %s\n",
+                               dev);
+               return 0;
+       }
+       if (maxsize < size) {
+               if (verbose)
+                       fprintf(stderr, Name ": %s not enough space (%llu < %llu)\n",
+                               dev, maxsize, size);
+               return 0;
+       }
+
+       /* count total number of extents for merge */
+       i = 0;
+       for (dl = super->disks; dl; dl = dl->next)
+               if (dl->e)
+                       i += dl->extent_cnt;
+
+       maxsize = merge_extents(super, i);
+       if (maxsize < size) {
+               if (verbose)
+                       fprintf(stderr, Name ": not enough space after merge (%llu < %llu)\n",
+                               maxsize, size);
+               return 0;
+       } else if (maxsize == ~0ULL) {
+               if (verbose)
+                       fprintf(stderr, Name ": failed to merge %d extents\n", i);
+               return 0;
+       }
+
+       *freesize = maxsize;
+
+       return 1;
+}
+
+/* reserve_space - autolayout: pick disks and a size for a new volume
+ *
+ * Records the extents of every eligible disk (dl->e), merges them to find
+ * the largest free region with a common start, and on success numbers the
+ * chosen disks through dl->raiddisk and stores the volume size in
+ * *freesize.  A @size of 0 means "as large as possible", rounded down to a
+ * multiple of @chunk.
+ *
+ * Returns 1 on success, 0 when there are too few disks, the option-rom
+ * member constraint is violated, the merge fails, or the region is too
+ * small.
+ */
+static int reserve_space(struct supertype *st, int raiddisks,
+                        unsigned long long size, int chunk,
+                        unsigned long long *freesize)
+{
+       struct intel_super *super = st->sb;
+       struct imsm_super *mpb = super->anchor;
+       struct dl *dl;
+       int i;
+       int extent_cnt;
+       struct extent *e;
+       unsigned long long maxsize;
+       unsigned long long minsize;
+       int cnt;
+       int used;
+
+       /* find the largest common start free region of the possible disks */
+       used = 0;
+       extent_cnt = 0;
+       cnt = 0;
+       for (dl = super->disks; dl; dl = dl->next) {
+               dl->raiddisk = -1;
+
+               if (dl->index >= 0)
+                       used++;
+
+               /* don't activate new spares if we are orom constrained
+                * and there is already a volume active in the container
+                */
+               if (super->orom && dl->index < 0 && mpb->num_raid_devs)
+                       continue;
+
+               e = get_extents(super, dl);
+               if (!e)
+                       continue;
+               /* count entries up to and including the zero-sized one */
+               for (i = 1; e[i-1].size; i++)
+                       ;
+               dl->e = e;
+               dl->extent_cnt = i;
+               extent_cnt += i;
+               cnt++;
+       }
+
+       maxsize = merge_extents(super, extent_cnt);
+       minsize = size;
+       if (size == 0)
+               minsize = chunk;
+
+       /* ~0ULL is merge_extents' error value, not a usable size */
+       if (cnt < raiddisks ||
+           (super->orom && used && used != raiddisks) ||
+           maxsize == ~0ULL ||
+           maxsize < minsize) {
+               fprintf(stderr, Name ": not enough devices with space to create array.\n");
+               return 0; /* No enough free spaces large enough */
+       }
+
+       if (size == 0) {
+               size = maxsize;
+               if (chunk) {
+                       size /= chunk;
+                       size *= chunk;
+               }
+       }
+
+       cnt = 0;
+       for (dl = super->disks; dl; dl = dl->next)
+               if (dl->e)
+                       dl->raiddisk = cnt++;
+
+       *freesize = size;
+
+       return 1;
+}
+
+/* validate_geometry_imsm - top level geometry check for imsm metadata
+ *
+ * LEVEL_CONTAINER requests go to validate_geometry_imsm_container().
+ * Without a device, autolayout is attempted via reserve_space() when a
+ * container is loaded and @freesize is given; otherwise 1 is returned.
+ * With a device and a loaded container the volume checks are run.
+ * With a device but no container, the device must be busy (held by a
+ * container); that container is looked up, loaded into st->sb, and the
+ * volume checks are run against it.
+ *
+ * Returns 1 if the geometry is acceptable, 0 otherwise.
+ */
+static int validate_geometry_imsm(struct supertype *st, int level, int layout,
+                                 int raiddisks, int chunk, unsigned long long size,
+                                 char *dev, unsigned long long *freesize,
+                                 int verbose)
+{
+       int fd, cfd;
+       struct mdinfo *sra;
+
+       /* if given unused devices create a container
+        * if given devices in a container create a member volume
+        */
+       if (level == LEVEL_CONTAINER) {
+               /* Must be a fresh device to add to a container */
+               return validate_geometry_imsm_container(st, level, layout,
+                                                       raiddisks, chunk, size,
+                                                       dev, freesize,
+                                                       verbose);
+       }
+
+       if (!dev) {
+               if (st->sb && freesize) {
+                       /* we are being asked to automatically layout a
+                        * new volume based on the current contents of
+                        * the container.  If the parameters can be
+                        * satisfied reserve_space will record the disks,
+                        * start offset, and size of the volume to be
+                        * created.  add_to_super and getinfo_super
+                        * detect when autolayout is in progress.
+                        */
+                       return reserve_space(st, raiddisks, size, chunk, freesize);
+               }
+               return 1;
+       }
+       if (st->sb) {
+               /* creating in a given container */
+               return validate_geometry_imsm_volume(st, level, layout,
+                                                    raiddisks, chunk, size,
+                                                    dev, freesize, verbose);
+       }
+
+       /* dev is known to be non-NULL from here on */
+
+       /* This device needs to be a device in an 'imsm' container */
+       fd = open(dev, O_RDONLY|O_EXCL, 0);
+       if (fd >= 0) {
+               /* an exclusive open succeeding means nobody holds the device */
+               if (verbose)
+                       fprintf(stderr,
+                               Name ": Cannot create this array on device %s\n",
+                               dev);
+               close(fd);
+               return 0;
+       }
+       if (errno != EBUSY || (fd = open(dev, O_RDONLY, 0)) < 0) {
+               if (verbose)
+                       fprintf(stderr, Name ": Cannot open %s: %s\n",
+                               dev, strerror(errno));
+               return 0;
+       }
+       /* Well, it is in use by someone, maybe an 'imsm' container. */
+       cfd = open_container(fd);
+       if (cfd < 0) {
+               close(fd);
+               if (verbose)
+                       fprintf(stderr, Name ": Cannot use %s: It is busy\n",
+                               dev);
+               return 0;
+       }
+       /* NOTE(review): sra is never released in this function; presumably
+        * the sysfs helpers provide a matching free routine - confirm.
+        */
+       sra = sysfs_read(cfd, 0, GET_VERSION);
+       close(fd);
+       if (sra && sra->array.major_version == -1 &&
+           strcmp(sra->text_version, "imsm") == 0) {
+               /* This is a member of a imsm container.  Load the container
+                * and try to create a volume
+                */
+               struct intel_super *super;
+
+               if (load_super_imsm_all(st, cfd, (void **) &super, NULL, 1) == 0) {
+                       st->sb = super;
+                       st->container_dev = fd2devnum(cfd);
+                       close(cfd);
+                       return validate_geometry_imsm_volume(st, level, layout,
+                                                            raiddisks, chunk,
+                                                            size, dev,
+                                                            freesize, verbose);
+               }
+               close(cfd);
+       } else {
+               /* may belong to another container; do not leak cfd */
+               close(cfd);
+               return 0;
+       }
+
+       return 1;
+}
+#endif /* MDASSEMBLE */
+
+static struct mdinfo *container_content_imsm(struct supertype *st)
+{
+       /* Given a container loaded by load_super_imsm_all,
+        * extract information about all the arrays into
+        * an mdinfo tree.
+        *
+        * For each imsm_dev create an mdinfo, fill it in,
+        *  then look for matching devices in super->disks
+        *  and create appropriate device mdinfo.
+        *
+        * Returns the list of array mdinfo records chained through
+        * ->next, each with its member disks chained on ->devs, or
+        * NULL if nothing was published.  A volume whose records
+        * cannot be allocated is dropped; volumes already collected
+        * are still returned.
+        */
+       struct intel_super *super = st->sb;
+       struct imsm_super *mpb = super->anchor;
+       struct mdinfo *rest = NULL;
+       int i;
+
+       /* do not assemble arrays that might have bad blocks */
+       if (imsm_bbm_log_size(super->anchor)) {
+               fprintf(stderr, Name ": BBM log found in metadata. "
+                               "Cannot activate array(s).\n");
+               return NULL;
+       }
+
+       for (i = 0; i < mpb->num_raid_devs; i++) {
+               struct imsm_dev *dev = get_imsm_dev(super, i);
+               struct imsm_map *map = get_imsm_map(dev, 0);
+               struct mdinfo *this;
+               int slot;
+
+               /* do not publish arrays that are in the middle of an
+                * unsupported migration
+                */
+               if (dev->vol.migr_state &&
+                   (migr_type(dev) == MIGR_GEN_MIGR ||
+                    migr_type(dev) == MIGR_STATE_CHANGE)) {
+                       fprintf(stderr, Name ": cannot assemble volume '%.16s':"
+                               " unsupported migration in progress\n",
+                               dev->volume);
+                       continue;
+               }
+
+               this = malloc(sizeof(*this));
+               if (!this) {
+                       /* out of memory: hand back what we have so far */
+                       fprintf(stderr, Name ": failed to allocate"
+                               " volume %.16s\n", dev->volume);
+                       break;
+               }
+               memset(this, 0, sizeof(*this));
+               this->next = rest;
+
+               super->current_vol = i;
+               getinfo_super_imsm_volume(st, this);
+               for (slot = 0 ; slot <  map->num_members; slot++) {
+                       struct mdinfo *info_d;
+                       struct dl *d;
+                       int idx;
+                       int skip;
+                       __u32 s;
+                       __u32 ord;
+
+                       skip = 0;
+                       idx = get_imsm_disk_idx(dev, slot);
+                       ord = get_imsm_ord_tbl_ent(dev, slot);
+                       for (d = super->disks; d ; d = d->next)
+                               if (d->index == idx)
+                                       break;
+
+                       /* a member that is absent, failed, not usable or
+                        * still being rebuilt is left out of the array
+                        */
+                       if (d == NULL)
+                               skip = 1;
+
+                       s = d ? d->disk.status : 0;
+                       if (s & FAILED_DISK)
+                               skip = 1;
+                       if (!(s & USABLE_DISK))
+                               skip = 1;
+                       if (ord & IMSM_ORD_REBUILD)
+                               skip = 1;
+
+                       /*
+                        * if we skip some disks the array will be assembled degraded;
+                        * reset resync start to avoid a dirty-degraded situation
+                        *
+                        * FIXME handle dirty degraded
+                        */
+                       if (skip && !dev->vol.dirty)
+                               this->resync_start = ~0ULL;
+                       if (skip)
+                               continue;
+
+                       info_d = malloc(sizeof(*info_d));
+                       if (!info_d) {
+                               fprintf(stderr, Name ": failed to allocate disk"
+                                       " for volume %.16s\n", dev->volume);
+                               /* drop this volume, including the member
+                                * records already linked onto it
+                                */
+                               while (this->devs) {
+                                       info_d = this->devs;
+                                       this->devs = info_d->next;
+                                       free(info_d);
+                               }
+                               free(this);
+                               this = rest;
+                               break;
+                       }
+                       memset(info_d, 0, sizeof(*info_d));
+                       info_d->next = this->devs;
+                       this->devs = info_d;
+
+                       info_d->disk.number = d->index;
+                       info_d->disk.major = d->major;
+                       info_d->disk.minor = d->minor;
+                       info_d->disk.raid_disk = slot;
+
+                       this->array.working_disks++;
+
+                       info_d->events = __le32_to_cpu(mpb->generation_num);
+                       info_d->data_offset = __le32_to_cpu(map->pba_of_lba0);
+                       info_d->component_size = __le32_to_cpu(map->blocks_per_member);
+                       if (d->devname)
+                               strcpy(info_d->name, d->devname);
+               }
+               /* 'this' equals 'rest' here when the volume was dropped */
+               rest = this;
+       }
+
+       return rest;
+}
+
+
+#ifndef MDASSEMBLE
+static int imsm_open_new(struct supertype *c, struct active_array *a,
+                        char *inst)
+{
+       /* Bind active array 'a' to the container member named by
+        * 'inst', a decimal subarray index.  Returns 0 on success or
+        * -ENODEV when the index does not name one of the
+        * mpb->num_raid_devs volumes.
+        */
+       struct intel_super *super = c->sb;
+       struct imsm_super *mpb = super->anchor;
+       int member = atoi(inst);
+
+       /* a negative index is as invalid as one past the last volume */
+       if (member < 0 || member >= mpb->num_raid_devs) {
+               fprintf(stderr, "%s: subarry index %d, out of range\n",
+                       __func__, member);
+               return -ENODEV;
+       }
+
+       dprintf("imsm: open_new %s\n", inst);
+       a->info.container_member = member;
+       return 0;
+}
+
+static __u8 imsm_check_degraded(struct intel_super *super, struct imsm_dev *dev, int failed)
+{
+       /* Work out the map state implied by 'failed' failed members of
+        * 'dev'.  With no failures the result is UNINITIALIZED if map[0]
+        * is still uninitialized and NORMAL otherwise.  With failures
+        * the result depends on the raid level; for a level not handled
+        * below the state already recorded in map[0] is returned.
+        */
+       struct imsm_map *map = get_imsm_map(dev, 0);
+
+       if (!failed)
+               return map->map_state == IMSM_T_STATE_UNINITIALIZED ?
+                       IMSM_T_STATE_UNINITIALIZED : IMSM_T_STATE_NORMAL;
+
+       switch (get_imsm_raid_level(map)) {
+       case 0:
+               /* no redundancy: any failure is fatal */
+               return IMSM_T_STATE_FAILED;
+       case 1:
+               if (failed < map->num_members)
+                       return IMSM_T_STATE_DEGRADED;
+               return IMSM_T_STATE_FAILED;
+       case 10:
+       {
+               /**
+                * check to see if any mirrors have failed, otherwise we
+                * are degraded.  Even numbered slots are mirrored on
+                * slot+1
+                */
+               int i;
+               /* re-armed on every even slot (including slot 0) below;
+                * the initial value only keeps the variable defined
+                */
+               int insync = 2;
+
+               for (i = 0; i < map->num_members; i++) {
+                       __u32 ord = get_imsm_ord_tbl_ent(dev, i);
+                       int idx = ord_to_idx(ord);
+                       struct imsm_disk *disk;
+
+                       /* reset the potential in-sync count on even-numbered
+                        * slots.  num_copies is always 2 for imsm raid10
+                        */
+                       if ((i & 1) == 0)
+                               insync = 2;
+
+                       disk = get_imsm_disk(super, idx);
+                       if (!disk || disk->status & FAILED_DISK ||
+                           ord & IMSM_ORD_REBUILD)
+                               insync--;
+
+                       /* no in-sync disks left in this mirror the
+                        * array has failed
+                        */
+                       if (insync == 0)
+                               return IMSM_T_STATE_FAILED;
+               }
+
+               return IMSM_T_STATE_DEGRADED;
+       }
+       case 5:
+               /* a single failure is survivable, two are not */
+               if (failed < 2)
+                       return IMSM_T_STATE_DEGRADED;
+               return IMSM_T_STATE_FAILED;
+       default:
+               break;
+       }
+
+       return map->map_state;
+}
+
+static int imsm_count_failed(struct intel_super *super, struct imsm_dev *dev)
+{
+       /* Count the members of 'dev' that are missing, failed or still
+        * flagged for rebuild.
+        *
+        * at the beginning of migration we set IMSM_ORD_REBUILD on
+        * disks that are being rebuilt.  New failures are recorded to
+        * map[0].  So we look through all the disks we started with and
+        * see if any failures are still present, or if any new ones
+        * have arrived
+        *
+        * FIXME add support for online capacity expansion and
+        * raid-level-migration
+        */
+       struct imsm_map *cur = get_imsm_map(dev, 0);
+       struct imsm_map *orig = get_imsm_map(dev, dev->vol.migr_state);
+       int nfailed = 0;
+       int slot;
+
+       for (slot = 0; slot < orig->num_members; slot++) {
+               /* merge the flags recorded in both maps for this slot */
+               __u32 ord = __le32_to_cpu(orig->disk_ord_tbl[slot]) |
+                           __le32_to_cpu(cur->disk_ord_tbl[slot]);
+               int idx = ord_to_idx(ord);
+               struct imsm_disk *disk = get_imsm_disk(super, idx);
+
+               if (disk && !(disk->status & FAILED_DISK) &&
+                   !(ord & IMSM_ORD_REBUILD))
+                       continue;
+
+               nfailed++;
+       }
+
+       return nfailed;
+}
+
+static int is_resyncing(struct imsm_dev *dev)
+{
+       /* True when a migration is in progress and it is either an
+        * init or a repair, or the migration map (map[1]) is in the
+        * normal state.
+        */
+       if (!dev->vol.migr_state)
+               return 0;
+
+       if (migr_type(dev) != MIGR_INIT &&
+           migr_type(dev) != MIGR_REPAIR)
+               return get_imsm_map(dev, 1)->map_state == IMSM_T_STATE_NORMAL;
+
+       return 1;
+}
+
+static int is_rebuilding(struct imsm_dev *dev)
+{
+       /* True only when a rebuild migration is in progress and the
+        * migration map (map[1]) is in the degraded state.
+        */
+       if (!dev->vol.migr_state || migr_type(dev) != MIGR_REBUILD)
+               return 0;
+
+       return get_imsm_map(dev, 1)->map_state == IMSM_T_STATE_DEGRADED;
+}
+
+/* Record that the disk with index 'idx' has failed in 'dev': set
+ * FAILED_DISK on the disk and IMSM_ORD_REBUILD on its map[0] slot.
+ * return true if we recorded new information, false if the disk is not
+ * a member of map[0] or the failure was already recorded
+ */
+static int mark_failure(struct imsm_dev *dev, struct imsm_disk *disk, int idx)
+{
+       __u32 ord;
+       int slot;
+       struct imsm_map *map;
+
+       /* new failures are always set in map[0] */
+       map = get_imsm_map(dev, 0);
+
+       slot = get_imsm_disk_slot(map, idx);
+       if (slot < 0)
+               return 0;
+
+       ord = __le32_to_cpu(map->disk_ord_tbl[slot]);
+       if ((disk->status & FAILED_DISK) && (ord & IMSM_ORD_REBUILD))
+               return 0;
+
+       disk->status |= FAILED_DISK;
+       set_imsm_ord_tbl_ent(map, slot, idx | IMSM_ORD_REBUILD);
+       /* failed_disk_num is reset by storing ~0 into it, which is
+        * truncated to the width of the field.  Comparing the field
+        * against a plain int ~0 (-1) can never match if the field is
+        * narrower than int, so truncate the sentinel the same way
+        * before comparing
+        */
+       if (map->failed_disk_num == (__typeof__(map->failed_disk_num)) ~0)
+               map->failed_disk_num = slot;
+       return 1;
+}
+
+static void mark_missing(struct imsm_dev *dev, struct imsm_disk *disk, int idx)
+{
+       /* Mark 'disk' failed in 'dev' and, unless that was already
+        * done, set its scsi_id to all-ones and shift its serial left
+        * by one byte.
+        */
+       const __u32 no_scsi_id = __cpu_to_le32(~(__u32)0);
+
+       mark_failure(dev, disk, idx);
+
+       /* an all-ones scsi_id means the disk was already processed */
+       if (disk->scsi_id != no_scsi_id) {
+               disk->scsi_id = no_scsi_id;
+               memmove(&disk->serial[0], &disk->serial[1],
+                       MAX_RAID_SERIAL_LEN - 1);
+       }
+}
+
+/* Handle dirty -> clean transitions and resync.  Degraded and rebuild
+ * states are handled in imsm_set_disk() with one exception, when a
+ * resync is stopped due to a new failure this routine will set the
+ * 'degraded' state for the array.
+ *
+ * 'consistent' is the requested state.  The value 2 is special-cased
+ * below: disks on super->missing are marked first, and the request is
+ * demoted to 0 unless the resync is complete, the computed map state
+ * is normal and no migration is in progress.  Returns the (possibly
+ * demoted) value; super->updates_pending is incremented for every
+ * metadata change recorded here.
+ */
+static int imsm_set_array_state(struct active_array *a, int consistent)
+{
+       int inst = a->info.container_member;
+       struct intel_super *super = a->container->sb;
+       struct imsm_dev *dev = get_imsm_dev(super, inst);
+       struct imsm_map *map = get_imsm_map(dev, 0);
+       int failed = imsm_count_failed(super, dev);
+       __u8 map_state = imsm_check_degraded(super, dev, failed); /* computed before any disk is marked missing below */
+
+       /* before we activate this array handle any missing disks */
+       if (consistent == 2 && super->missing) {
+               struct dl *dl;
+
+               dprintf("imsm: mark missing\n");
+               end_migration(dev, map_state);
+               for (dl = super->missing; dl; dl = dl->next)
+                       mark_missing(dev, &dl->disk, dl->index);
+               super->updates_pending++;
+       }
+               
+       if (consistent == 2 &&
+           (!is_resync_complete(a) ||
+            map_state != IMSM_T_STATE_NORMAL ||
+            dev->vol.migr_state))
+               consistent = 0; /* demote the request to 'dirty' */
+
+       if (is_resync_complete(a)) {
+               /* complete initialization / resync,
+                * recovery and interrupted recovery is completed in
+                * ->set_disk
+                */
+               if (is_resyncing(dev)) {
+                       dprintf("imsm: mark resync done\n");
+                       end_migration(dev, map_state);
+                       super->updates_pending++;
+               }
+       } else if (!is_resyncing(dev) && !failed) {
+               /* mark the start of the init process if nothing is failed;
+                * an uninitialized map starts an init migration, any
+                * other state starts a repair
+                */
+               dprintf("imsm: mark resync start (%llu)\n", a->resync_start);
+               if (map->map_state == IMSM_T_STATE_UNINITIALIZED)
+                       migrate(dev, IMSM_T_STATE_NORMAL, MIGR_INIT);
+               else
+                       migrate(dev, IMSM_T_STATE_NORMAL, MIGR_REPAIR);
+               super->updates_pending++;
+       }
+
+        /* FIXME check if we can update curr_migr_unit from resync_start */
+
+       /* mark dirty / clean: only touch the metadata when the dirty
+        * flag actually changes
+        */
+       if (dev->vol.dirty != !consistent) {
+               dprintf("imsm: mark '%s' (%llu)\n",
+                       consistent ? "clean" : "dirty", a->resync_start);
+               if (consistent)
+                       dev->vol.dirty = 0;
+               else
+                       dev->vol.dirty = 1;
+               super->updates_pending++;
+       }
+       return consistent;
+}
+
+static void imsm_set_disk(struct active_array *a, int n, int state)
+{
+       /* Record the state reported for the member in slot 'n' of
+        * array 'a'.  'state' is a bit mask; DS_FAULTY records a new
+        * failure and DS_INSYNC completes the rebuild of a slot that
+        * was flagged IMSM_ORD_REBUILD.  The array's map state is then
+        * re-evaluated and super->updates_pending is incremented for
+        * every metadata change made.
+        */
+       int inst = a->info.container_member;
+       struct intel_super *super = a->container->sb;
+       struct imsm_dev *dev = get_imsm_dev(super, inst);
+       struct imsm_map *map = get_imsm_map(dev, 0);
+       struct imsm_disk *disk;
+       int failed;
+       __u32 ord;
+       __u8 map_state;
+
+       if (n < 0)
+               return;
+
+       /* valid slots are 0..num_members-1; do not index the ord
+        * table with anything else
+        */
+       if (n >= map->num_members) {
+               fprintf(stderr, "imsm: set_disk %d out of range 0..%d\n",
+                       n, map->num_members - 1);
+               return;
+       }
+
+       dprintf("imsm: set_disk %d:%x\n", n, state);
+
+       ord = get_imsm_ord_tbl_ent(dev, n);
+       disk = get_imsm_disk(super, ord_to_idx(ord));
+
+       /* check for new failures.  A slot whose disk record cannot be
+        * found has nothing to mark; imsm_count_failed() below already
+        * counts such a slot as failed
+        */
+       if ((state & DS_FAULTY) && disk) {
+               if (mark_failure(dev, disk, ord_to_idx(ord)))
+                       super->updates_pending++;
+       }
+
+       /* check if in_sync */
+       if (state & DS_INSYNC && ord & IMSM_ORD_REBUILD && is_rebuilding(dev)) {
+               struct imsm_map *migr_map = get_imsm_map(dev, 1);
+
+               /* storing the bare index clears IMSM_ORD_REBUILD */
+               set_imsm_ord_tbl_ent(migr_map, n, ord_to_idx(ord));
+               super->updates_pending++;
+       }
+
+       failed = imsm_count_failed(super, dev);
+       map_state = imsm_check_degraded(super, dev, failed);
+
+       /* check if recovery complete, newly degraded, or failed */
+       if (map_state == IMSM_T_STATE_NORMAL && is_rebuilding(dev)) {
+               end_migration(dev, map_state);
+               /* re-fetch map[0] after the migration has ended */
+               map = get_imsm_map(dev, 0);
+               map->failed_disk_num = ~0;
+               super->updates_pending++;
+       } else if (map_state == IMSM_T_STATE_DEGRADED &&
+                  map->map_state != map_state &&
+                  !dev->vol.migr_state) {
+               dprintf("imsm: mark degraded\n");
+               map->map_state = map_state;
+               super->updates_pending++;
+       } else if (map_state == IMSM_T_STATE_FAILED &&
+                  map->map_state != map_state) {
+               dprintf("imsm: mark failed\n");
+               end_migration(dev, map_state);
+               super->updates_pending++;
+       }
+}
+
+static int store_imsm_mpb(int fd, struct intel_super *super)
+{
+       /* Write the mpb held in super->buf to the device open on 'fd'.
+        * The first 512 bytes (the anchor) go to the second to last
+        * sector; anything beyond that is written to the sectors
+        * immediately preceding the anchor.  Returns 0 on success and
+        * 1 on any failure.
+        */
+       struct imsm_super *mpb = super->anchor;
+       __u32 mpb_size = __le32_to_cpu(mpb->mpb_size);
+       /* pre-set to 0 so that, should get_dev_size() fail without
+        * storing a size, the size check below rejects the device
+        */
+       unsigned long long dsize = 0;
+       unsigned long long sectors = 0;
+
+       get_dev_size(fd, NULL, &dsize);
+
+       if (mpb_size > 512)
+               /* -1 to account for anchor */
+               sectors = mpb_sectors(mpb) - 1;
+
+       /* the offsets below are computed by subtracting from dsize;
+        * refuse a device too small to hold the mpb rather than let
+        * the subtraction wrap
+        */
+       if (dsize < 512 * (2 + sectors))
+               return 1;
+
+       if (mpb_size > 512) {
+               /* write the extended mpb to the sectors preceding the anchor */
+               if (lseek64(fd, dsize - (512 * (2 + sectors)), SEEK_SET) < 0)
+                       return 1;
+
+               if (write(fd, super->buf + 512, 512 * sectors) != 512 * sectors)
+                       return 1;
+       }
+
+       /* first block is stored on second to last sector of the disk */
+       if (lseek64(fd, dsize - (512 * 2), SEEK_SET) < 0)
+               return 1;
+
+       if (write(fd, super->buf, 512) != 512)
+               return 1;
+
+       return 0;
+}
+
+static void imsm_sync_metadata(struct supertype *container)
+{
+       /* Write the container's metadata out if any update has been
+        * recorded since the last write, then clear the pending count.
+        */
+       struct intel_super *super = container->sb;
+
+       if (super->updates_pending) {
+               write_super_imsm(super, 0);
+               super->updates_pending = 0;
+       }
+}
+
+static struct dl *imsm_readd(struct intel_super *super, int idx, struct active_array *a)
+{
+       /* Look up the disk recorded for slot 'idx' of array 'a' in
+        * super->disks.  Returns it unless it is absent from the list
+        * or flagged FAILED_DISK, in which case NULL is returned.
+        */
+       struct imsm_dev *dev = get_imsm_dev(super, a->info.container_member);
+       int wanted = get_imsm_disk_idx(dev, idx);
+       struct dl *cand = super->disks;
+
+       while (cand && cand->index != wanted)
+               cand = cand->next;
+
+       if (!cand || (cand->disk.status & FAILED_DISK))
+               return NULL;
+
+       dprintf("%s: found %x:%x\n", __func__, cand->major, cand->minor);
+       return cand;
+}
+
+static struct dl *imsm_add_spare(struct intel_super *super, int slot,
+                                struct active_array *a, int activate_new)
+{
+       /* Pick a disk from super->disks that can take over 'slot' of
+        * array 'a'.  Disks already active in the array, failed disks,
+        * the slot's current occupant and disks with index -2 are
+        * skipped; pure spares (index -1) are only considered when
+        * 'activate_new' is set.  A candidate must have room at
+        * pba_of_lba0 for blocks_per_member in every volume it is not
+        * already a member of.  Returns the chosen disk or NULL.
+        */
+       struct imsm_dev *dev = get_imsm_dev(super, a->info.container_member);
+       int idx = get_imsm_disk_idx(dev, slot);
+       struct imsm_super *mpb = super->anchor;
+       struct imsm_map *map;
+       unsigned long long pos;
+       struct mdinfo *d;
+       struct extent *ex;
+       int i, j;
+       int found;
+       __u32 array_start;
+       __u32 blocks;
+       struct dl *dl;
+
+       for (dl = super->disks; dl; dl = dl->next) {
+               /* If in this array, skip */
+               for (d = a->info.devs ; d ; d = d->next)
+                       if (d->state_fd >= 0 &&
+                           d->disk.major == dl->major &&
+                           d->disk.minor == dl->minor) {
+                               dprintf("%x:%x already in array\n", dl->major, dl->minor);
+                               break;
+                       }
+               if (d)
+                       continue;
+
+               /* skip in use or failed drives */
+               if (dl->disk.status & FAILED_DISK || idx == dl->index ||
+                   dl->index == -2) {
+                       dprintf("%x:%x status (failed: %d index: %d)\n",
+                               dl->major, dl->minor,
+                               (dl->disk.status & FAILED_DISK) == FAILED_DISK, idx);
+                       continue;
+               }
+
+               /* skip pure spares when we are looking for partially
+                * assimilated drives
+                */
+               if (dl->index == -1 && !activate_new)
+                       continue;
+
+               /* Does this unused device have the requisite free space?
+                * It needs to be able to cover all member volumes
+                */
+               ex = get_extents(super, dl);
+               if (!ex) {
+                       dprintf("cannot get extents\n");
+                       continue;
+               }
+               for (i = 0; i < mpb->num_raid_devs; i++) {
+                       dev = get_imsm_dev(super, i);
+                       map = get_imsm_map(dev, 0);
+
+                       /* check if this disk is already a member of
+                        * this array
+                        */
+                       if (get_imsm_disk_slot(map, dl->index) >= 0)
+                               continue;
+
+                       found = 0;
+                       j = 0;
+                       pos = 0;
+                       array_start = __le32_to_cpu(map->pba_of_lba0);
+                       blocks = __le32_to_cpu(map->blocks_per_member);
+
+                       /* walk the gaps between used extents; the list
+                        * ends with an entry whose size is 0
+                        */
+                       do {
+                               /* check that we can start at pba_of_lba0 with
+                                * blocks_per_member of space.  The end of
+                                * the region is computed in 64 bits so that
+                                * the sum cannot wrap around in 32 bits
+                                */
+                               if (array_start >= pos &&
+                                   (unsigned long long) array_start + blocks <
+                                   ex[j].start) {
+                                       found = 1;
+                                       break;
+                               }
+                               pos = ex[j].start + ex[j].size;
+                               j++;
+                       } while (ex[j-1].size);
+
+                       if (!found)
+                               break;
+               }
+
+               free(ex);
+               /* leaving the loop early means some volume did not fit */
+               if (i < mpb->num_raid_devs) {
+                       dprintf("%x:%x does not have %u at %u\n",
+                               dl->major, dl->minor,
+                               blocks, array_start);
+                       /* No room */
+                       continue;
+               }
+               return dl;
+       }
+
+       /* the list was exhausted, so dl is NULL here */
+       return dl;
+}
+
+static struct mdinfo *imsm_activate_spare(struct active_array *a,
+                                         struct metadata_update **updates)
+{
+       /**
+        * Find a device with unused free space and use it to replace a
+        * failed/vacant region in an array.  We replace failed regions one
+        * array at a time.  The result is that a new spare disk will be added
+        * to the first failed array and after the monitor has finished
+        * propagating failures the remainder will be consumed.
+        *
+        * Returns a list of mdinfo records describing the device chosen
+        * (the slot loop below stops after the first one found) and
+        * pushes a matching update_activate_spare record onto the front
+        * of *updates.  Returns NULL, leaving *updates untouched, when
+        * a faulty member is still attached, when the array is not
+        * degraded, when no suitable device is found or when memory
+        * cannot be allocated.
+        *
+        * FIXME add a capability for mdmon to request spares from another
+        * container.
+        */
+
+       struct intel_super *super = a->container->sb;
+       int inst = a->info.container_member;
+       struct imsm_dev *dev = get_imsm_dev(super, inst);
+       struct imsm_map *map = get_imsm_map(dev, 0);
+       int failed = a->info.array.raid_disks; /* start from "all failed"; decremented per attached member below */
+       struct mdinfo *rv = NULL;
+       struct mdinfo *d;
+       struct mdinfo *di;
+       struct metadata_update *mu;
+       struct dl *dl;
+       struct imsm_update_activate_spare *u;
+       int num_spares = 0;
+       int i;
+
+       for (d = a->info.devs ; d ; d = d->next) {
+               if ((d->curr_state & DS_FAULTY) &&
+                       d->state_fd >= 0)
+                       /* wait for Removal to happen */
+                       return NULL;
+               if (d->state_fd >= 0)
+                       failed--;
+       }
+
+       dprintf("imsm: activate spare: inst=%d failed=%d (%d) level=%d\n",
+               inst, failed, a->info.array.raid_disks, a->info.array.level);
+       if (imsm_check_degraded(super, dev, failed) != IMSM_T_STATE_DEGRADED)
+               return NULL;
+
+       /* For each slot, if it is not working, find a spare */
+       for (i = 0; i < a->info.array.raid_disks; i++) {
+               for (d = a->info.devs ; d ; d = d->next)
+                       if (d->disk.raid_disk == i)
+                               break;
+               dprintf("found %d: %p %x\n", i, d, d?d->curr_state:0);
+               if (d && (d->state_fd >= 0))
+                       continue;
+
+               /*
+                * OK, this device needs recovery.  Try to re-add the
+                * previous occupant of this slot, if this fails see if
+                * we can continue the assimilation of a spare that was
+                * partially assimilated, finally try to activate a new
+                * spare.
+                */
+               dl = imsm_readd(super, i, a);
+               if (!dl)
+                       dl = imsm_add_spare(super, i, a, 0);
+               if (!dl)
+                       dl = imsm_add_spare(super, i, a, 1);
+               if (!dl)
+                       continue;
+               /* found a usable disk with enough space */
+               di = malloc(sizeof(*di));
+               if (!di)
+                       continue;
+               memset(di, 0, sizeof(*di));
+
+               /* dl->index will be -1 in the case we are activating a
+                * pristine spare.  imsm_process_update() will create a
+                * new index in this case.  Once a disk is found to be
+                * failed in all member arrays it is kicked from the
+                * metadata
+                */
+               di->disk.number = dl->index;
+
+               /* (ab)use di->devs to store a pointer to the device
+                * we chose; it is moved into the update record and
+                * reset to NULL before this function returns
+                */
+               di->devs = (struct mdinfo *) dl;
+
+               di->disk.raid_disk = i;
+               di->disk.major = dl->major;
+               di->disk.minor = dl->minor;
+               di->disk.state = 0;
+               di->data_offset = __le32_to_cpu(map->pba_of_lba0);
+               di->component_size = a->info.component_size;
+               di->container_member = inst;
+               di->next = rv;
+               rv = di;
+               num_spares++;
+               dprintf("%x:%x to be %d at %llu\n", dl->major, dl->minor,
+                       i, di->data_offset);
+
+               break; /* only one slot is filled per call */
+       }
+
+       if (!rv)
+               /* No spares found */
+               return rv;
+       /* Now 'rv' has a list of devices to return.
+        * Create a metadata_update record to update the
+        * disk_ord_tbl for the array
+        */
+       mu = malloc(sizeof(*mu));
+       if (mu) {
+               mu->buf = malloc(sizeof(struct imsm_update_activate_spare) * num_spares);
+               if (mu->buf == NULL) {
+                       free(mu);
+                       mu = NULL;
+               }
+       }
+       if (!mu) {
+               while (rv) { /* allocation failed: release the list built above */
+                       struct mdinfo *n = rv->next;
+
+                       free(rv);
+                       rv = n;
+               }
+               return NULL;
+       }
+                       
+       mu->space = NULL;
+       mu->len = sizeof(struct imsm_update_activate_spare) * num_spares;
+       mu->next = *updates;
+       u = (struct imsm_update_activate_spare *) mu->buf;
+
+       for (di = rv ; di ; di = di->next) {
+               u->type = update_activate_spare;
+               u->dl = (struct dl *) di->devs; /* recover the disk smuggled through di->devs */
+               di->devs = NULL;
+               u->slot = di->disk.raid_disk;
+               u->array = inst;
+               u->next = u + 1; /* records are laid out back to back in mu->buf */
+               u++;
+       }
+       (u-1)->next = NULL; /* terminate the chain at the last record */
+       *updates = mu;
+
+       return rv;
+}
+
+static int disks_overlap(struct intel_super *super, int idx, struct imsm_update_create_array *u)
+{
+       /* Return 1 if any member disk of the existing volume 'idx'
+        * has the same serial as one of the disks listed in the
+        * create-array update 'u', 0 otherwise.
+        */
+       struct imsm_dev *dev = get_imsm_dev(super, idx);
+       struct imsm_map *map = get_imsm_map(dev, 0);
+       struct imsm_map *new_map = get_imsm_map(&u->dev, 0);
+       struct disk_info *inf = get_disk_info(u);
+       struct imsm_disk *disk;
+       int i;
+       int j;
+
+       for (i = 0; i < map->num_members; i++) {
+               disk = get_imsm_disk(super, get_imsm_disk_idx(dev, i));
+               /* a member with no disk record has no serial to compare */
+               if (!disk)
+                       continue;
+               for (j = 0; j < new_map->num_members; j++)
+                       if (serialcmp(disk->serial, inf[j].serial) == 0)
+                               return 1;
+       }
+
+       return 0;
+}
+
+static void imsm_delete(struct intel_super *super, struct dl **dlp, int index);
+
+static void imsm_process_update(struct supertype *st,
+                               struct metadata_update *update)
+{
+       /**
+        * crack open the metadata_update envelope to find the update record
+        * update can be one of:
+        *      update_activate_spare - a spare device has replaced a failed
+        *      device in an array, update the disk_ord_tbl.  If this disk is
+        *      present in all member arrays then also clear the SPARE_DISK
+        *      flag
+        */
+       struct intel_super *super = st->sb;
+       struct imsm_super *mpb;
+       enum imsm_update_type type = *(enum imsm_update_type *) update->buf;
+
+       /* update requires a larger buf but the allocation failed */
+       if (super->next_len && !super->next_buf) {
+               super->next_len = 0;
+               return;
+       }
+
+       if (super->next_buf) {
+               memcpy(super->next_buf, super->buf, super->len);
+               free(super->buf);
+               super->len = super->next_len;
+               super->buf = super->next_buf;
+
+               super->next_len = 0;
+               super->next_buf = NULL;
+       }
+
+       mpb = super->anchor;
+
+       switch (type) {
+       case update_activate_spare: {
+               struct imsm_update_activate_spare *u = (void *) update->buf; 
+               struct imsm_dev *dev = get_imsm_dev(super, u->array);
+               struct imsm_map *map = get_imsm_map(dev, 0);
+               struct imsm_map *migr_map;
+               struct active_array *a;
+               struct imsm_disk *disk;
+               __u8 to_state;
+               struct dl *dl;
+               unsigned int found;
+               int failed;
+               int victim = get_imsm_disk_idx(dev, u->slot);
+               int i;
+
+               for (dl = super->disks; dl; dl = dl->next)
+                       if (dl == u->dl)
+                               break;
+
+               if (!dl) {
+                       fprintf(stderr, "error: imsm_activate_spare passed "
+                               "an unknown disk (index: %d)\n",
+                               u->dl->index);
+                       return;
+               }
+
+               super->updates_pending++;
+
+               /* count failures (excluding rebuilds and the victim)
+                * to determine map[0] state
+                */
+               failed = 0;
+               for (i = 0; i < map->num_members; i++) {
+                       if (i == u->slot)
+                               continue;
+                       disk = get_imsm_disk(super, get_imsm_disk_idx(dev, i));
+                       if (!disk || disk->status & FAILED_DISK)
+                               failed++;
+               }
+
+               /* adding a pristine spare, assign a new index */
+               if (dl->index < 0) {
+                       dl->index = super->anchor->num_disks;
+                       super->anchor->num_disks++;
+               }
+               disk = &dl->disk;
+               disk->status |= CONFIGURED_DISK;
+               disk->status &= ~SPARE_DISK;
+
+               /* mark rebuild */
+               to_state = imsm_check_degraded(super, dev, failed);
+               map->map_state = IMSM_T_STATE_DEGRADED;
+               migrate(dev, to_state, MIGR_REBUILD);
+               migr_map = get_imsm_map(dev, 1);
+               set_imsm_ord_tbl_ent(map, u->slot, dl->index);
+               set_imsm_ord_tbl_ent(migr_map, u->slot, dl->index | IMSM_ORD_REBUILD);
+
+               /* count arrays using the victim in the metadata */
+               found = 0;
+               for (a = st->arrays; a ; a = a->next) {
+                       dev = get_imsm_dev(super, a->info.container_member);
+                       map = get_imsm_map(dev, 0);
+
+                       if (get_imsm_disk_slot(map, victim) >= 0)
+                               found++;
+               }
+
+               /* delete the victim if it is no longer being
+                * utilized anywhere
+                */
+               if (!found) {
+                       struct dl **dlp;
+
+                       /* We know that 'manager' isn't touching anything,
+                        * so it is safe to delete
+                        */
+                       for (dlp = &super->disks; *dlp; dlp = &(*dlp)->next)
+                               if ((*dlp)->index == victim)
+                                       break;
+
+                       /* victim may be on the missing list */
+                       if (!*dlp)
+                               for (dlp = &super->missing; *dlp; dlp = &(*dlp)->next)
+                                       if ((*dlp)->index == victim)
+                                               break;
+                       imsm_delete(super, dlp, victim);
+               }
+               break;
+       }
+       case update_create_array: {
+               /* someone wants to create a new array, we need to be aware of
+                * a few races/collisions:
+                * 1/ 'Create' called by two separate instances of mdadm
+                * 2/ 'Create' versus 'activate_spare': mdadm has chosen
+                *     devices that have since been assimilated via
+                *     activate_spare.
+                * In the event this update can not be carried out mdadm will
+                * (FIX ME) notice that its update did not take hold.
+                */
+               struct imsm_update_create_array *u = (void *) update->buf;
+               struct intel_dev *dv;
+               struct imsm_dev *dev;
+               struct imsm_map *map, *new_map;
+               unsigned long long start, end;
+               unsigned long long new_start, new_end;
+               int i;
+               struct disk_info *inf;
+               struct dl *dl;
+
+               /* handle racing creates: first come first serve */
+               if (u->dev_idx < mpb->num_raid_devs) {
+                       dprintf("%s: subarray %d already defined\n",
+                               __func__, u->dev_idx);
+                       goto create_error;
+               }
+
+               /* check update is next in sequence */
+               if (u->dev_idx != mpb->num_raid_devs) {
+                       dprintf("%s: can not create array %d expected index %d\n",
+                               __func__, u->dev_idx, mpb->num_raid_devs);
+                       goto create_error;
+               }
+
+               new_map = get_imsm_map(&u->dev, 0);
+               new_start = __le32_to_cpu(new_map->pba_of_lba0);
+               new_end = new_start + __le32_to_cpu(new_map->blocks_per_member);
+               inf = get_disk_info(u);
+
+               /* handle activate_spare versus create race:
+                * check to make sure that overlapping arrays do not include
+                * overlapping disks
+                */
+               for (i = 0; i < mpb->num_raid_devs; i++) {
+                       dev = get_imsm_dev(super, i);
+                       map = get_imsm_map(dev, 0);
+                       start = __le32_to_cpu(map->pba_of_lba0);
+                       end = start + __le32_to_cpu(map->blocks_per_member);
+                       if ((new_start >= start && new_start <= end) ||
+                           (start >= new_start && start <= new_end))
+                               /* overlap */;
+                       else
+                               continue;
+
+                       if (disks_overlap(super, i, u)) {
+                               dprintf("%s: arrays overlap\n", __func__);
+                               goto create_error;
+                       }
+               }
+
+               /* check that prepare update was successful */
+               if (!update->space) {
+                       dprintf("%s: prepare update failed\n", __func__);
+                       goto create_error;
+               }
+
+               /* check that all disks are still active before committing
+                * changes.  FIXME: could we instead handle this by creating a
+                * degraded array?  That's probably not what the user expects,
+                * so better to drop this update on the floor.
+                */
+               for (i = 0; i < new_map->num_members; i++) {
+                       dl = serial_to_dl(inf[i].serial, super);
+                       if (!dl) {
+                               dprintf("%s: disk disappeared\n", __func__);
+                               goto create_error;
+                       }
+               }
+
+               super->updates_pending++;
+
+               /* convert spares to members and fixup ord_tbl */
+               for (i = 0; i < new_map->num_members; i++) {
+                       dl = serial_to_dl(inf[i].serial, super);
+                       if (dl->index == -1) {
+                               dl->index = mpb->num_disks;
+                               mpb->num_disks++;
+                               dl->disk.status |= CONFIGURED_DISK;
+                               dl->disk.status &= ~SPARE_DISK;
+                       }
+                       set_imsm_ord_tbl_ent(new_map, i, dl->index);
+               }
+
+               dv = update->space;
+               dev = dv->dev;
+               update->space = NULL;
+               imsm_copy_dev(dev, &u->dev);
+               dv->index = u->dev_idx;
+               dv->next = super->devlist;
+               super->devlist = dv;
+               mpb->num_raid_devs++;
+
+               imsm_update_version_info(super);
+               break;
+ create_error:
+               /* mdmon knows how to release update->space, but not
+                * ((struct intel_dev *) update->space)->dev
+                */
+               if (update->space) {
+                       dv = update->space;
+                       free(dv->dev);
+               }
+               break;
+       }
+       case update_add_disk:
+
+               /* we may be able to repair some arrays if disks are
+                * being added */
+               if (super->add) {
+                       struct active_array *a;
+
+                       super->updates_pending++;
+                       for (a = st->arrays; a; a = a->next)
+                               a->check_degraded = 1;
+               }
+               /* add some spares to the metadata */
+               while (super->add) {
+                       struct dl *al;
+
+                       al = super->add;
+                       super->add = al->next;
+                       al->next = super->disks;
+                       super->disks = al;
+                       dprintf("%s: added %x:%x\n",
+                               __func__, al->major, al->minor);
+               }
+
+               break;
+       }
+}
+
+static void imsm_prepare_update(struct supertype *st,
+                               struct metadata_update *update)
+{
+       /**
+        * Allocate space to hold new disk entries, raid-device entries or a new
+        * mpb if necessary.  The manager synchronously waits for updates to
+        * complete in the monitor, so new mpb buffers allocated here can be
+        * integrated by the monitor thread without worrying about live pointers
+        * in the manager thread.
+        *
+        * The update type is read from the first bytes of update->buf.  Only
+        * update_create_array asks for extra room here; every other type
+        * leaves 'len' at 0 and only the buffer-size check below applies.
+        *
+        * For update_create_array a struct intel_dev with an imsm_dev buffer
+        * of sizeof_imsm_dev(dev, 1) bytes is handed over in update->space.
+        * If the imsm_dev buffer cannot be allocated update->space is set to
+        * NULL; if the intel_dev itself cannot be allocated update->space is
+        * left as the caller provided it.
+        *
+        * Finally, if mpb_size + len does not fit in the current buffer
+        * (next_len when a next_buf is already pending, super->len otherwise)
+        * a zeroed, 512-byte aligned replacement is placed in super->next_buf
+        * and its size in super->next_len.
+        */
+       enum imsm_update_type type = *(enum imsm_update_type *) update->buf;
+       struct intel_super *super = st->sb;
+       struct imsm_super *mpb = super->anchor;
+       size_t buf_len;
+       size_t len = 0; /* extra metadata bytes this update may need */
+
+       switch (type) {
+       case update_create_array: {
+               struct imsm_update_create_array *u = (void *) update->buf;
+               struct intel_dev *dv;
+               struct imsm_dev *dev = &u->dev;
+               struct imsm_map *map = get_imsm_map(dev, 0);
+               struct dl *dl;
+               struct disk_info *inf;
+               int i;
+               int activate = 0; /* members that currently belong to no array */
+
+               inf = get_disk_info(u);
+               len = sizeof_imsm_dev(dev, 1); /* also counted in the buffer-size check below */
+               /* allocate a new super->devlist entry */
+               dv = malloc(sizeof(*dv));
+               if (dv) {
+                       dv->dev = malloc(len);
+                       if (dv->dev)
+                               update->space = dv;
+                       else {
+                               free(dv);
+                               update->space = NULL;
+                       }
+               }
+
+               /* count how many spares will be converted to members */
+               for (i = 0; i < map->num_members; i++) {
+                       dl = serial_to_dl(inf[i].serial, super);
+                       if (!dl) {
+                               /* hmm maybe it failed?, nothing we can do about
+                                * it here
+                                */
+                               continue;
+                       }
+                       if (count_memberships(dl, super) == 0)
+                               activate++;
+               }
+               len += activate * sizeof(struct imsm_disk); /* one imsm_disk entry per converted spare */
+               break;
+       default: /* NOTE: this label sits inside the braces opened by the case above */
+               break;
+       }
+       }
+
+       /* check if we need a larger metadata buffer */
+       if (super->next_buf)
+               buf_len = super->next_len;
+       else
+               buf_len = super->len;
+
+       if (__le32_to_cpu(mpb->mpb_size) + len > buf_len) {
+               /* ok we need a larger buf than what is currently allocated
+                * if this allocation fails process_update will notice that
+                * ->next_len is set and ->next_buf is NULL
+                */
+               buf_len = ROUND_UP(__le32_to_cpu(mpb->mpb_size) + len, 512);
+               if (super->next_buf)
+                       free(super->next_buf); /* drop a pending buffer that is now too small */
+
+               super->next_len = buf_len;
+               if (posix_memalign(&super->next_buf, 512, buf_len) == 0)
+                       memset(super->next_buf, 0, buf_len);
+               else
+                       super->next_buf = NULL;
+       }
+}
+
+/* must be called while manager is quiesced
+ *
+ * Remove disk number 'index' from the metadata and renumber everything
+ * above it:
+ *  - every entry on super->disks and super->missing whose index is greater
+ *    than 'index' is decremented by one;
+ *  - for every raid device, each ord table entry that refers to a disk
+ *    above 'index' is decremented by one in map 0 and, when a second map
+ *    exists, in map 1;
+ *  - mpb->num_disks is decremented and super->updates_pending is bumped;
+ *  - if *dlp is non-NULL that list element is unlinked and released with
+ *    __free_imsm_disk().
+ * 'dlp' is the address of the link that points at the victim (or at a NULL
+ * link when the victim is on neither list).
+ */
+static void imsm_delete(struct intel_super *super, struct dl **dlp, int index)
+{
+       struct imsm_super *mpb = super->anchor;
+       struct dl *iter;
+       struct imsm_dev *dev;
+       struct imsm_map *map;
+       int i, j, num_members;
+       __u32 ord;
+
+       dprintf("%s: deleting device[%d] from imsm_super\n",
+               __func__, index);
+
+       /* shift all indexes down one */
+       for (iter = super->disks; iter; iter = iter->next)
+               if (iter->index > index)
+                       iter->index--;
+       for (iter = super->missing; iter; iter = iter->next)
+               if (iter->index > index)
+                       iter->index--;
+
+       for (i = 0; i < mpb->num_raid_devs; i++) {
+               dev = get_imsm_dev(super, i);
+               map = get_imsm_map(dev, 0);
+               num_members = map->num_members; /* taken from map 0 before 'map' is reused below */
+               for (j = 0; j < num_members; j++) {
+                       /* update ord entries being careful not to propagate
+                        * ord-flags to the first map
+                        */
+                       ord = get_imsm_ord_tbl_ent(dev, j);
+
+                       if (ord_to_idx(ord) <= index)
+                               continue; /* entry refers to the victim or a lower disk: unchanged */
+
+                       map = get_imsm_map(dev, 0);
+                       set_imsm_ord_tbl_ent(map, j, ord_to_idx(ord - 1)); /* index only */
+                       map = get_imsm_map(dev, 1);
+                       if (map)
+                               set_imsm_ord_tbl_ent(map, j, ord - 1); /* presumably keeps any flag bits in ord - confirm */
+               }
+       }
+
+       mpb->num_disks--;
+       super->updates_pending++;
+       if (*dlp) {
+               struct dl *dl = *dlp;
+
+               *dlp = (*dlp)->next; /* unlink before freeing */
+               __free_imsm_disk(dl);
+       }
+}
+#endif /* MDASSEMBLE */
+
+/* Method table for the "imsm" external metadata format.
+ *
+ * The first group of handlers is only built when MDASSEMBLE is not
+ * defined, as is the trailing group used by mdmon.  Every member is
+ * initialised exactly once so the table stays clean under
+ * -Woverride-init.
+ */
+struct superswitch super_imsm = {
+#ifndef        MDASSEMBLE
+       .examine_super  = examine_super_imsm,
+       .brief_examine_super = brief_examine_super_imsm,
+       .export_examine_super = export_examine_super_imsm,
+       .detail_super   = detail_super_imsm,
+       .brief_detail_super = brief_detail_super_imsm,
+       .write_init_super = write_init_super_imsm,
+       .validate_geometry = validate_geometry_imsm,
+       .add_to_super   = add_to_super_imsm,
+       .detail_platform = detail_platform_imsm,
+#endif
+       .match_home     = match_home_imsm,
+       .uuid_from_super= uuid_from_super_imsm,
+       .getinfo_super  = getinfo_super_imsm,
+       .update_super   = update_super_imsm,
+
+       .avail_size     = avail_size_imsm,
+
+       .compare_super  = compare_super_imsm,
+
+       .load_super     = load_super_imsm,
+       .init_super     = init_super_imsm,
+       .store_super    = store_zero_imsm,
+       .free_super     = free_super_imsm,
+       .match_metadata_desc = match_metadata_desc_imsm,
+       .container_content = container_content_imsm,
+       .default_layout = imsm_level_to_layout,
+
+       .external       = 1,
+       .name = "imsm",
+
+#ifndef MDASSEMBLE
+/* for mdmon (.load_super is already set above and is shared) */
+       .open_new       = imsm_open_new,
+       .set_array_state= imsm_set_array_state,
+       .set_disk       = imsm_set_disk,
+       .sync_metadata  = imsm_sync_metadata,
+       .activate_spare = imsm_activate_spare,
+       .process_update = imsm_process_update,
+       .prepare_update = imsm_prepare_update,
+#endif /* MDASSEMBLE */
+};
index ebba5341d2e1f7e21a32b0b2761f2a6d4b882d9a..2b4942f4ecdfa28c40cdefdcec89567451773219 100644 (file)
--- a/super0.c
+++ b/super0.c
@@ -1,7 +1,7 @@
 /*
  * mdadm - manage Linux "md" devices aka RAID arrays.
  *
- * Copyright (C) 2001-2006 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
  *
  *
  *    This program is free software; you can redistribute it and/or modify
  *    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  *
  *    Author: Neil Brown
- *    Email: <neilb@cse.unsw.edu.au>
- *    Paper: Neil Brown
- *           School of Computer Science and Engineering
- *           The University of New South Wales
- *           Sydney, 2052
- *           Australia
+ *    Email: <neilb@suse.de>
  */
 
 #define HAVE_STDINT_H 1
@@ -53,7 +48,7 @@ static unsigned long calc_sb0_csum(mdp_super_t *super)
 }
 
 
-void super0_swap_endian(struct mdp_superblock_s *sb)
+static void super0_swap_endian(struct mdp_superblock_s *sb)
 {
        /* as super0 superblocks are host-endian, it is sometimes
         * useful to be able to swap the endianness
@@ -232,7 +227,7 @@ static void examine_super0(struct supertype *st, char *homehost)
        }
 }
 
-static void brief_examine_super0(struct supertype *st)
+static void brief_examine_super0(struct supertype *st, int verbose)
 {
        mdp_super_t *sb = st->sb;
        char *c=map_num(pers, sb->level);
@@ -240,14 +235,18 @@ static void brief_examine_super0(struct supertype *st)
 
        sprintf(devname, "/dev/md%d", sb->md_minor);
 
-       printf("ARRAY %s level=%s num-devices=%d UUID=",
-              devname,
-              c?c:"-unknown-", sb->raid_disks);
+       if (verbose) {
+               printf("ARRAY %s level=%s num-devices=%d",
+                      devname,
+                      c?c:"-unknown-", sb->raid_disks);
+       } else
+               printf("ARRAY %s", devname);
+
        if (sb->minor_version >= 90)
-               printf("%08x:%08x:%08x:%08x", sb->set_uuid0, sb->set_uuid1,
+               printf(" UUID=%08x:%08x:%08x:%08x", sb->set_uuid0, sb->set_uuid1,
                       sb->set_uuid2, sb->set_uuid3);
        else
-               printf("%08x", sb->set_uuid0);
+               printf(" UUID=%08x", sb->set_uuid0);
        printf("\n");
 }
 
@@ -300,18 +299,6 @@ static void brief_detail_super0(struct supertype *st)
        else
                printf("%08x", sb->set_uuid0);
 }
-
-static void export_detail_super0(struct supertype *st)
-{
-       mdp_super_t *sb = st->sb;
-       printf("MD_UUID=");
-       if (sb->minor_version >= 90)
-               printf("%08x:%08x:%08x:%08x", sb->set_uuid0, sb->set_uuid1,
-                      sb->set_uuid2, sb->set_uuid3);
-       else
-               printf("%08x", sb->set_uuid0);
-       printf("\n");
-}
 #endif
 
 static int match_home0(struct supertype *st, char *homehost)
@@ -368,6 +355,9 @@ static void getinfo_super0(struct supertype *st, struct mdinfo *info)
        info->events = md_event(sb);
        info->data_offset = 0;
 
+       sprintf(info->text_version, "0.%d", sb->minor_version);
+       info->safe_mode_delay = 200;
+
        uuid_from_super0(st, info->uuid);
 
        if (sb->minor_version > 90 && (sb->reshape_position+1) != 0) {
@@ -551,12 +541,18 @@ static int init_super0(struct supertype *st, mdu_array_info_t *info,
                       unsigned long long size, char *ignored_name, char *homehost,
                       int *uuid)
 {
-       mdp_super_t *sb = malloc(MD_SB_BYTES + sizeof(bitmap_super_t));
+       mdp_super_t *sb;
        int spares;
+
+       if (posix_memalign((void**)&sb, 4096,
+                          MD_SB_BYTES + ROUND_UP(sizeof(bitmap_super_t), 4096)) != 0) {
+               fprintf(stderr, Name ": %s could not allocate superblock\n", __func__);
+               return 0;
+       }
        memset(sb, 0, MD_SB_BYTES + sizeof(bitmap_super_t));
 
        st->sb = sb;
-       if (info->major_version == -1) {
+       if (info == NULL) {
                /* zeroing the superblock */
                return 0;
        }
@@ -622,18 +618,44 @@ static int init_super0(struct supertype *st, mdu_array_info_t *info,
        return 1;
 }
 
+struct devinfo { /* one device queued by add_to_super0 on the list kept in st->info */
+       int fd; /* descriptor the superblock is written to; write_init_super0 closes it and stores -1 */
+       char *devname; /* caller's pointer, stored without copying; used for Kill() and error messages */
+       mdu_disk_info_t disk; /* copy of the disk info passed to add_to_super0 */
+       struct devinfo *next; /* next queued device, NULL at the tail */
+};
+
+#ifndef MDASSEMBLE
 /* Add a device to the superblock being created */
-static void add_to_super0(struct supertype *st, mdu_disk_info_t *dinfo)
+static int add_to_super0(struct supertype *st, mdu_disk_info_t *dinfo,
+                         int fd, char *devname)
 {
+       /* Record 'dinfo' in slot dinfo->number of the superblock held in
+        * st->sb, make that slot the current this_disk, refresh the
+        * checksum, and append a devinfo entry (fd, devname and a copy of
+        * dinfo) to the tail of the list kept in st->info.
+        *
+        * 'devname' is stored by pointer, not copied.
+        *
+        * Returns 0 on success, or 1 if the devinfo entry cannot be
+        * allocated; in that case the superblock fields have already been
+        * updated but nothing has been queued.
+        */
        mdp_super_t *sb = st->sb;
        mdp_disk_t *dk = &sb->disks[dinfo->number];
+       struct devinfo *di, **dip;
 
        dk->number = dinfo->number;
        dk->major = dinfo->major;
        dk->minor = dinfo->minor;
        dk->raid_disk = dinfo->raid_disk;
        dk->state = dinfo->state;
+
+       sb->this_disk = sb->disks[dinfo->number];
+       sb->sb_csum = calc_sb0_csum(sb);
+
+       /* find the terminating link so insertion order is preserved */
+       dip = (struct devinfo **)&st->info;
+       while (*dip)
+               dip = &(*dip)->next;
+       di = malloc(sizeof(struct devinfo));
+       if (!di) {
+               fprintf(stderr, Name ": %s could not allocate device info\n",
+                       __func__);
+               return 1;
+       }
+       di->fd = fd;
+       di->devname = devname;
+       di->disk = *dinfo;
+       di->next = NULL;
+       *dip = di;
+
+       return 0;
 }
+#endif
 
 static int store_super0(struct supertype *st, int fd)
 {
@@ -660,7 +682,8 @@ static int store_super0(struct supertype *st, int fd)
        if (super->state & (1<<MD_SB_BITMAP_PRESENT)) {
                struct bitmap_super_s * bm = (struct bitmap_super_s*)(super+1);
                if (__le32_to_cpu(bm->magic) == BITMAP_MAGIC)
-                       if (write(fd, bm, sizeof(*bm)) != sizeof(*bm))
+                       if (write(fd, bm, ROUND_UP(sizeof(*bm),4096)) != 
+                           ROUND_UP(sizeof(*bm),4096))
                            return 5;
        }
 
@@ -668,32 +691,41 @@ static int store_super0(struct supertype *st, int fd)
        return 0;
 }
 
-static int write_init_super0(struct supertype *st,
-                            mdu_disk_info_t *dinfo, char *devname)
+#ifndef MDASSEMBLE
+static int write_init_super0(struct supertype *st) /* returns 0, or the first non-zero store/bitmap result */
 {
        mdp_super_t *sb = st->sb;
-       int fd = dev_open(devname, O_RDWR|O_EXCL);
-       int rv;
+       int rv = 0;
+       struct devinfo *di; /* walks the list add_to_super0 built in st->info */
 
-       if (fd < 0) {
-               fprintf(stderr, Name ": Failed to open %s to write superblock\n", devname);
-               return -1;
-       }
+       for (di = st->info ; di && ! rv ; di = di->next) { /* stops at the first failure */
 
-       sb->disks[dinfo->number].state &= ~(1<<MD_DISK_FAULTY);
+               if (di->disk.state == 1) /* NOTE(review): presumably "faulty bit only" - confirm */
+                       continue;
+               if (di->fd == -1) /* no descriptor recorded for this entry */
+                       continue;
+               Kill(di->devname, 0, 1, 1);
+               Kill(di->devname, 0, 1, 1); /* NOTE(review): issued twice in a row - confirm this is intentional */
 
-       sb->this_disk = sb->disks[dinfo->number];
-       sb->sb_csum = calc_sb0_csum(sb);
-       rv = store_super0(st, fd);
+               sb->disks[di->disk.number].state &= ~(1<<MD_DISK_FAULTY);
 
-       if (rv == 0 && (sb->state & (1<<MD_SB_BITMAP_PRESENT)))
-               rv = st->ss->write_bitmap(st, fd);
+               sb->this_disk = sb->disks[di->disk.number]; /* each device gets its own this_disk */
+               sb->sb_csum = calc_sb0_csum(sb); /* checksum must follow the this_disk change */
+               rv = store_super0(st, di->fd);
 
-       close(fd);
-       if (rv)
-               fprintf(stderr, Name ": failed to write superblock to %s\n", devname);
+               if (rv == 0 && (sb->state & (1<<MD_SB_BITMAP_PRESENT)))
+                       rv = st->ss->write_bitmap(st, di->fd);
+
+               if (rv)
+                       fprintf(stderr,
+                               Name ": failed to write superblock to %s\n",
+                               di->devname);
+               close(di->fd); /* descriptor is closed whether or not the write succeeded */
+               di->fd = -1;
+       }
        return rv;
 }
+#endif
 
 static int compare_super0(struct supertype *st, struct supertype *tst)
 {
@@ -711,7 +743,13 @@ static int compare_super0(struct supertype *st, struct supertype *tst)
        if (second->md_magic != MD_SB_MAGIC)
                return 1;
        if (!first) {
-               first = malloc(MD_SB_BYTES + sizeof(struct bitmap_super_s));
+               if (posix_memalign((void**)&first, 4096,
+                            MD_SB_BYTES + 
+                            ROUND_UP(sizeof(struct bitmap_super_s), 4096)) != 0) {
+                       fprintf(stderr, Name
+                               ": %s could not allocate superblock\n", __func__);
+                       return 1;
+               }
                memcpy(first, second, MD_SB_BYTES + sizeof(struct bitmap_super_s));
                st->sb = first;
                return 0;
@@ -753,6 +791,9 @@ static int load_super0(struct supertype *st, int fd, char *devname)
 
        free_super0(st);
 
+       if (st->subarray[0])
+               return 1;
+
        if (!get_dev_size(fd, devname, &dsize))
                return 1;
 
@@ -777,7 +818,13 @@ static int load_super0(struct supertype *st, int fd, char *devname)
                return 1;
        }
 
-       super = malloc(MD_SB_BYTES + sizeof(bitmap_super_t));
+       if (posix_memalign((void**)&super, 4096,
+                          MD_SB_BYTES +
+                          ROUND_UP(sizeof(bitmap_super_t), 4096)) != 0) {
+               fprintf(stderr, Name
+                       ": %s could not allocate superblock\n", __func__);
+               return 1;
+       }
 
        if (read(fd, super, sizeof(*super)) != MD_SB_BYTES) {
                if (devname)
@@ -811,6 +858,7 @@ static int load_super0(struct supertype *st, int fd, char *devname)
                st->ss = &super0;
                st->minor_version = super->minor_version;
                st->max_devs = MD_SB_DISKS;
+               st->info = NULL;
        }
 
        /* Now check on the bitmap superblock */
@@ -820,8 +868,8 @@ static int load_super0(struct supertype *st, int fd, char *devname)
         * valid.  If it doesn't clear the bit.  An --assemble --force
         * should get that written out.
         */
-       if (read(fd, super+1, sizeof(struct bitmap_super_s))
-           != sizeof(struct bitmap_super_s))
+       if (read(fd, super+1, ROUND_UP(sizeof(struct bitmap_super_s),4096))
+           != ROUND_UP(sizeof(struct bitmap_super_s),4096))
                goto no_bitmap;
 
        uuid_from_super0(st, uuid);
@@ -842,12 +890,14 @@ static struct supertype *match_metadata_desc0(char *arg)
        struct supertype *st = malloc(sizeof(*st));
        if (!st) return st;
 
+       memset(st, 0, sizeof(*st));
        st->ss = &super0;
+       st->info = NULL;
        st->minor_version = 90;
        st->max_devs = MD_SB_DISKS;
        st->sb = NULL;
-       /* Eliminate pointless leading 0 from some versions of mdadm -D */
-       if (strncmp(arg, "00.", 3) == 0)
+       /* we sometimes get 00.90 */
+       while (arg[0] == '0' && arg[1] == '0')
                arg++;
        if (strcmp(arg, "0") == 0 ||
            strcmp(arg, "0.90") == 0 ||
@@ -921,7 +971,7 @@ static int add_internal_bitmap0(struct supertype *st, int *chunkp,
 }
 
 
-void locate_bitmap0(struct supertype *st, int fd)
+static void locate_bitmap0(struct supertype *st, int fd)
 {
        unsigned long long dsize;
        unsigned long long offset;
@@ -941,7 +991,7 @@ void locate_bitmap0(struct supertype *st, int fd)
        lseek64(fd, offset, 0);
 }
 
-int write_bitmap0(struct supertype *st, int fd)
+static int write_bitmap0(struct supertype *st, int fd)
 {
        unsigned long long dsize;
        unsigned long long offset;
@@ -950,7 +1000,8 @@ int write_bitmap0(struct supertype *st, int fd)
        int rv = 0;
 
        int towrite, n;
-       char buf[4096];
+       char abuf[4096+4096];
+       char *buf = (char*)(((long)(abuf+4096))&~4095L);
 
        if (!get_dev_size(fd, NULL, &dsize))
                return 1;
@@ -966,21 +1017,19 @@ int write_bitmap0(struct supertype *st, int fd)
        if (lseek64(fd, offset + 4096, 0)< 0LL)
                return 3;
 
-
-       if (write(fd, ((char*)sb)+MD_SB_BYTES, sizeof(bitmap_super_t)) !=
-           sizeof(bitmap_super_t))
-               return -2;
-       towrite = 64*1024 - MD_SB_BYTES - sizeof(bitmap_super_t);
-       memset(buf, 0xff, sizeof(buf));
+       memset(buf, 0xff, 4096);
+       memcpy(buf,  ((char*)sb)+MD_SB_BYTES, sizeof(bitmap_super_t));
+       towrite = 64*1024;
        while (towrite > 0) {
                n = towrite;
-               if (n > sizeof(buf))
-                       n = sizeof(buf);
+               if (n > 4096)
+                       n = 4096;
                n = write(fd, buf, n);
                if (n > 0)
                        towrite -= n;
                else
                        break;
+               memset(buf, 0xff, 4096);
        }
        fsync(fd);
        if (towrite)
@@ -996,6 +1045,48 @@ static void free_super0(struct supertype *st)
        st->sb = NULL;
 }
 
+#ifndef MDASSEMBLE
+/* Decide whether the requested geometry can be described by 0.90 metadata.
+ *
+ * Returns 0 (not acceptable) when:
+ *  - level is LEVEL_CONTAINER,
+ *  - raiddisks exceeds MD_SB_DISKS,
+ *  - size exceeds 0x7fffffff << 9,
+ *  - subdev is given but cannot be opened exclusively, its size cannot be
+ *    determined, or it is smaller than MD_RESERVED_SECTORS * 512 bytes.
+ * Returns 1 otherwise.  When subdev is given and accepted, *freesize is set
+ * to MD_NEW_SIZE_SECTORS(device size >> 9); without subdev *freesize is
+ * left untouched.
+ *
+ * 'layout' and 'chunk' are not examined.  'verbose' only controls whether
+ * a failure to open subdev is reported on stderr.
+ */
+static int validate_geometry0(struct supertype *st, int level,
+                             int layout, int raiddisks,
+                             int chunk, unsigned long long size,
+                             char *subdev, unsigned long long *freesize,
+                             int verbose)
+{
+       unsigned long long ldsize;
+       int fd;
+
+       if (level == LEVEL_CONTAINER)
+               return 0;
+       if (raiddisks > MD_SB_DISKS)
+               return 0;
+       if (size > (0x7fffffffULL<<9))
+               return 0;
+       if (!subdev)
+               return 1;
+
+       fd = open(subdev, O_RDONLY|O_EXCL, 0);
+       if (fd < 0) {
+               if (verbose)
+                       fprintf(stderr, Name ": super0.90 cannot open %s: %s\n",
+                               subdev, strerror(errno));
+               return 0;
+       }
+
+       if (!get_dev_size(fd, subdev, &ldsize)) {
+               close(fd);
+               return 0;
+       }
+       close(fd);
+
+       /* 'size' was already range-checked above and has not changed */
+       if (ldsize < MD_RESERVED_SECTORS * 512)
+               return 0;
+       *freesize = MD_NEW_SIZE_SECTORS(ldsize >> 9);
+       return 1;
+}
+#endif /* MDASSEMBLE */
+
 struct superswitch super0 = {
 #ifndef MDASSEMBLE
        .examine_super = examine_super0,
@@ -1003,16 +1094,16 @@ struct superswitch super0 = {
        .export_examine_super = export_examine_super0,
        .detail_super = detail_super0,
        .brief_detail_super = brief_detail_super0,
-       .export_detail_super = export_detail_super0,
+       .write_init_super = write_init_super0,
+       .validate_geometry = validate_geometry0,
+       .add_to_super = add_to_super0,
 #endif
        .match_home = match_home0,
        .uuid_from_super = uuid_from_super0,
        .getinfo_super = getinfo_super0,
        .update_super = update_super0,
        .init_super = init_super0,
-       .add_to_super = add_to_super0,
        .store_super = store_super0,
-       .write_init_super = write_init_super0,
        .compare_super = compare_super0,
        .load_super = load_super0,
        .match_metadata_desc = match_metadata_desc0,
@@ -1021,6 +1112,5 @@ struct superswitch super0 = {
        .locate_bitmap = locate_bitmap0,
        .write_bitmap = write_bitmap0,
        .free_super = free_super0,
-       .major = 0,
-       .swapuuid = 0,
+       .name = "0.90",
 };
index 376b97d9a4a4e855f2eabfe6583f48ac9c11a6c3..056b93bb7dafdb9ba541903fec515e7bede42df6 100644 (file)
--- a/super1.c
+++ b/super1.c
@@ -1,7 +1,7 @@
 /*
  * mdadm - manage Linux "md" devices aka RAID arrays.
  *
- * Copyright (C) 2001-2006 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
  *
  *
  *    This program is free software; you can redistribute it and/or modify
  *    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  *
  *    Author: Neil Brown
- *    Email: <neilb@cse.unsw.edu.au>
- *    Paper: Neil Brown
- *           School of Computer Science and Engineering
- *           The University of New South Wales
- *           Sydney, 2052
- *           Australia
+ *    Email: <neilb@suse.de>
  */
 
 #include "mdadm.h"
@@ -141,13 +136,71 @@ static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb)
        return __cpu_to_le32(csum);
 }
 
+static char abuf[4096+4096]; /* bounce buffer shared by aread() and awrite(); twice 4096 so a 4096-aligned 4096-byte window always fits */
+static int aread(int fd, void *buf, int len)
+{
+       /* aligned read.
+        * On devices with a 4K sector size, we need to read
+        * the full sector and copy relevant bits into
+        * the buffer
+        *
+        * If the BLKSSZGET ioctl fails, or the reported sector size is
+        * not larger than 'len', this is a plain read(fd, buf, len).
+        * A sector size above 4096 is not supported and yields -1.
+        * Otherwise one whole sector is read into the aligned window of
+        * abuf, the file offset is adjusted so that it ends up 'len'
+        * bytes past where it started, and at most 'len' bytes are
+        * copied to 'buf'.
+        *
+        * Returns the number of bytes placed in 'buf', or the read()
+        * result when that is <= 0.
+        */
+       int bsize;
+       char *b;
+       int n;
+       if (ioctl(fd, BLKSSZGET, &bsize) != 0 ||
+           bsize <= len)
+               return read(fd, buf, len);
+       if (bsize > 4096)
+               return -1;
+       b = (char*)(((long)(abuf+4096))&~4095UL); /* round abuf+4096 down to a 4096-byte boundary */
+
+       n = read(fd, b, bsize);
+       if (n <= 0)
+               return n;
+       lseek(fd, len - n, 1); /* whence 1 is SEEK_CUR: net advance becomes 'len' */
+       if (n > len)
+               n = len;
+       memcpy(buf, b, n);
+       return n;
+}
+
+static int awrite(int fd, void *buf, int len)
+{
+       /* aligned write.
+        * On devices with a 4K sector size, we need to write
+        * the full sector.  We pre-read if the sector is larger
+        * than the write.
+        * The address must be sector-aligned.
+        *
+        * If the BLKSSZGET ioctl fails, or the reported sector size is
+        * not larger than 'len', this is a plain write(fd, buf, len).
+        * A sector size above 4096 is not supported and yields -1.
+        * Otherwise the current sector is read into the aligned window
+        * of the file-static abuf, the first 'len' bytes are replaced
+        * with 'buf', and the whole sector is written back.  The file
+        * offset ends up 'len' bytes past where it started.
+        *
+        * Returns 'len' on success, or the read()/write() result when
+        * that is <= 0.
+        */
+       int bsize;
+       char *b;
+       int n;
+       if (ioctl(fd, BLKSSZGET, &bsize) != 0 ||
+           bsize <= len)
+               return write(fd, buf, len);
+       if (bsize > 4096)
+               return -1;
+       b = (char*)(((long)(abuf+4096))&~4095UL); /* round abuf+4096 down to a 4096-byte boundary */
+
+       n = read(fd, b, bsize); /* pre-read so bytes beyond 'len' are preserved */
+       if (n <= 0)
+               return n;
+       lseek(fd, -n, 1); /* whence 1 is SEEK_CUR: step back over what was just read */
+       memcpy(b, buf, len);
+       n = write(fd, b, bsize);
+       if (n <= 0)
+               return n;
+       lseek(fd, len - n, 1); /* leave the offset 'len' bytes past the start */
+       return len;
+}
+
 #ifndef MDASSEMBLE
 static void examine_super1(struct supertype *st, char *homehost)
 {
        struct mdp_superblock_1 *sb = st->sb;
        time_t atime;
        int d;
-       int faulty;
+       int role;
        int i;
        char *c;
        int l = homehost ? strlen(homehost) : 0;
@@ -298,6 +351,8 @@ static void examine_super1(struct supertype *st, char *homehost)
        default: break;
        }
        printf("\n");
+#if 0
+       /* This turns out to just be confusing */
        printf("    Array Slot : %d (", __le32_to_cpu(sb->dev_number));
        for (i= __le32_to_cpu(sb->max_dev); i> 0 ; i--)
                if (__le16_to_cpu(sb->dev_roles[i-1]) != 0xffff)
@@ -310,6 +365,18 @@ static void examine_super1(struct supertype *st, char *homehost)
                else printf("%d", role);
        }
        printf(")\n");
+#endif
+       printf("   Device Role : ");
+       d = __le32_to_cpu(sb->dev_number);
+       if (d < sb->raid_disks)
+               role = __le16_to_cpu(sb->dev_roles[d]);
+       else
+               role = 0xFFFF;
+       if (role >= 0xFFFE)
+               printf("spare\n");
+       else
+               printf("Active device %d\n", role);
+
        printf("   Array State : ");
        for (d=0; d<__le32_to_cpu(sb->raid_disks); d++) {
                int cnt = 0;
@@ -324,10 +391,11 @@ static void examine_super1(struct supertype *st, char *homehost)
                        }
                }
                if (cnt > 1) printf("?");
-               else if (cnt == 1 && me) printf("U");
-               else if (cnt == 1) printf("u");
-               else printf ("_");
+               else if (cnt == 1) printf("A");
+               else printf (".");
        }
+#if 0
+       /* This is confusing too */
        faulty = 0;
        for (i=0; i< __le32_to_cpu(sb->max_dev); i++) {
                int role = __le16_to_cpu(sb->dev_roles[i]);
@@ -335,11 +403,13 @@ static void examine_super1(struct supertype *st, char *homehost)
                        faulty++;
        }
        if (faulty) printf(" %d failed", faulty);
+#endif
+       printf(" ('A' == active, '.' == missing)");
        printf("\n");
 }
 
 
-static void brief_examine_super1(struct supertype *st)
+static void brief_examine_super1(struct supertype *st, int verbose)
 {
        struct mdp_superblock_1 *sb = st->sb;
        int i;
@@ -353,17 +423,21 @@ static void brief_examine_super1(struct supertype *st)
        else if (sb->set_name[0])
                nm = sb->set_name;
        else
-               nm = "??";
+               nm = NULL;
 
-       printf("ARRAY /dev/md/%s level=%s ", nm, c?c:"-unknown-");
+       printf("ARRAY%s%s", nm ? " /dev/md/":"", nm);
+       if (verbose && c)
+               printf(" level=%s", c);
        sb_offset = __le64_to_cpu(sb->super_offset);
        if (sb_offset <= 4)
-               printf("metadata=1.1 ");
+               printf(" metadata=1.1 ");
        else if (sb_offset <= 8)
-               printf("metadata=1.2 ");
+               printf(" metadata=1.2 ");
        else
-               printf("metadata=1.0 ");
-       printf("num-devices=%d UUID=", __le32_to_cpu(sb->raid_disks));
+               printf(" metadata=1.0 ");
+       if (verbose)
+               printf("num-devices=%d ", __le32_to_cpu(sb->raid_disks));
+       printf("UUID=");
        for (i=0; i<16; i++) {
                if ((i&3)==0 && i != 0) printf(":");
                printf("%02x", sb->set_uuid[i]);
@@ -454,12 +528,6 @@ static void export_detail_super1(struct supertype *st)
                }
        if (len)
                printf("MD_NAME=%.*s\n", len, sb->set_name);
-       printf("MD_UUID=");
-       for (i=0; i<16; i++) {
-               if ((i&3)==0 && i != 0) printf(":");
-               printf("%02x", sb->set_uuid[i]);
-       }
-       printf("\n");
 }
 
 #endif
@@ -491,7 +559,7 @@ static void getinfo_super1(struct supertype *st, struct mdinfo *info)
        int role;
 
        info->array.major_version = 1;
-       info->array.minor_version = __le32_to_cpu(sb->feature_map);
+       info->array.minor_version = st->minor_version;
        info->array.patch_version = 0;
        info->array.raid_disks = __le32_to_cpu(sb->raid_disks);
        info->array.level = __le32_to_cpu(sb->level);
@@ -529,6 +597,8 @@ static void getinfo_super1(struct supertype *st, struct mdinfo *info)
                info->disk.raid_disk = role;
        }
        info->events = __le64_to_cpu(sb->events);
+       sprintf(info->text_version, "1.%d", st->minor_version);
+       info->safe_mode_delay = 200;
 
        memcpy(info->uuid, sb->set_uuid, 16);
 
@@ -685,7 +755,7 @@ static int update_super1(struct supertype *st, struct mdinfo *info,
            __le64_to_cpu(sb->data_offset)) {
                /* set data_size to device size less data_offset */
                struct misc_dev_info *misc = (struct misc_dev_info*)
-                       (st->sb + 1024 + sizeof(struct bitmap_super_s));
+                       (st->sb + 1024 + 512);
                printf("Size was %llu\n", (unsigned long long)
                       __le64_to_cpu(sb->data_size));
                sb->data_size = __cpu_to_le64(
@@ -703,15 +773,21 @@ static int update_super1(struct supertype *st, struct mdinfo *info,
 static int init_super1(struct supertype *st, mdu_array_info_t *info,
                       unsigned long long size, char *name, char *homehost, int *uuid)
 {
-       struct mdp_superblock_1 *sb = malloc(1024 + sizeof(bitmap_super_t) +
-                                            sizeof(struct misc_dev_info));
+       struct mdp_superblock_1 *sb;
        int spares;
        int rfd;
        char defname[10];
+
+       if (posix_memalign((void**)&sb, 512, (1024 + 512 + 
+                          sizeof(struct misc_dev_info))) != 0) {
+               fprintf(stderr, Name
+                       ": %s could not allocate superblock\n", __func__);
+               return 0;
+       }
        memset(sb, 0, 1024);
 
        st->sb = sb;
-       if (info->major_version == -1) {
+       if (info == NULL) {
                /* zeroing superblock */
                return 0;
        }
@@ -780,21 +856,48 @@ static int init_super1(struct supertype *st, mdu_array_info_t *info,
        return 1;
 }
 
+struct devinfo {       /* one device queued by add_to_super1() for write_init_super1() */
+       int fd;         /* descriptor supplied to add_to_super1(); write_init_super1() closes it and stores -1 */
+       char *devname;  /* stored by pointer, not copied; used for Kill() and error messages */
+       mdu_disk_info_t disk;   /* by-value copy of the disk info passed to add_to_super1() */
+       struct devinfo *next;   /* next entry of the singly linked list rooted at st->info */
+};
+#ifndef MDASSEMBLE
 /* Add a device to the superblock being created */
-static void add_to_super1(struct supertype *st, mdu_disk_info_t *dk)
+static int add_to_super1(struct supertype *st, mdu_disk_info_t *dk,
+                         int fd, char *devname)
 {
        struct mdp_superblock_1 *sb = st->sb;
        __u16 *rp = sb->dev_roles + dk->number;
+       struct devinfo *di, **dip;
+
        if ((dk->state & 6) == 6) /* active, sync */
                *rp = __cpu_to_le16(dk->raid_disk);
        else if ((dk->state & ~2) == 0) /* active or idle -> spare */
                *rp = 0xffff;
        else
                *rp = 0xfffe;
+
        if (dk->number >= __le32_to_cpu(sb->max_dev) &&
            __le32_to_cpu(sb->max_dev) < 384)
                sb->max_dev = __cpu_to_le32(dk->number+1);
+
+       sb->dev_number = __cpu_to_le32(dk->number);
+       sb->sb_csum = calc_sb_1_csum(sb);
+
+       dip = (struct devinfo **)&st->info;
+       while (*dip)
+               dip = &(*dip)->next;
+       di = malloc(sizeof(struct devinfo));
+       di->fd = fd;
+       di->devname = devname;
+       di->disk = *dk;
+       di->next = NULL;
+       *dip = di;
+
+       return 0;
 }
+#endif
 
 static void locate_bitmap1(struct supertype *st, int fd);
 
@@ -850,8 +953,9 @@ static int store_super1(struct supertype *st, int fd)
                return 3;
 
        sbsize = sizeof(*sb) + 2 * __le32_to_cpu(sb->max_dev);
+       sbsize = (sbsize+511)&(~511UL);
 
-       if (write(fd, sb, sbsize) != sbsize)
+       if (awrite(fd, sb, sbsize) != sbsize)
                return 4;
 
        if (sb->feature_map & __cpu_to_le32(MD_FEATURE_BITMAP_OFFSET)) {
@@ -859,7 +963,8 @@ static int store_super1(struct supertype *st, int fd)
                        (((char*)sb)+1024);
                if (__le32_to_cpu(bm->magic) == BITMAP_MAGIC) {
                        locate_bitmap1(st, fd);
-                       if (write(fd, bm, sizeof(*bm)) != sizeof(*bm))
+                       if (awrite(fd, bm, sizeof(*bm)) !=
+                           sizeof(*bm))
                            return 5;
                }
        }
@@ -882,122 +987,133 @@ static unsigned long choose_bm_space(unsigned long devsize)
        return 4*2;
 }
 
-static int write_init_super1(struct supertype *st,
-                            mdu_disk_info_t *dinfo, char *devname)
+#ifndef MDASSEMBLE
+static int write_init_super1(struct supertype *st) /* writes st->sb to each device queued on st->info; stops at the first non-zero status and returns it (0 if none) */
 {
        struct mdp_superblock_1 *sb = st->sb;
        struct supertype refst;
-       int fd = dev_open(devname, O_RDWR | O_EXCL);
        int rfd;
-       int rv;
+       int rv = 0;
        int bm_space;
-
+       struct devinfo *di;
        unsigned long long dsize, array_size;
        long long sb_offset;
 
+       for (di = st->info; di && ! rv ; di = di->next) {
+               if (di->disk.state == 1) /* state is exactly bit 0 - presumably "faulty"; confirm against MD_DISK_* */
+                       continue;
+               if (di->fd < 0) /* no usable descriptor, or already written and closed at the end of this loop */
+                       continue;
 
-       if (fd < 0) {
-               fprintf(stderr, Name ": Failed to open %s to write superblock\n",
-                       devname);
-               return -1;
-       }
+               Kill(di->devname, 0, 1, 1);
+               Kill(di->devname, 0, 1, 1); /* NOTE(review): second identical call - looks deliberate (more than one old superblock?); confirm */
 
-       sb->dev_number = __cpu_to_le32(dinfo->number);
-       if (dinfo->state & (1<<MD_DISK_WRITEMOSTLY))
-               sb->devflags |= __cpu_to_le32(WriteMostly1);
+               sb->dev_number = __cpu_to_le32(di->disk.number);
+               if (di->disk.state & (1<<MD_DISK_WRITEMOSTLY))
+                       sb->devflags |= __cpu_to_le32(WriteMostly1); /* NOTE(review): sb is shared across iterations and this flag is never cleared here */
 
-       if ((rfd = open("/dev/urandom", O_RDONLY)) < 0 ||
-           read(rfd, sb->device_uuid, 16) != 16) {
-               __u32 r[4] = {random(), random(), random(), random()};
-               memcpy(sb->device_uuid, r, 16);
-       }
-       
-       if (rfd >= 0) close(rfd);
-       sb->events = 0;
-
-       refst =*st;
-       refst.sb = NULL;
-       if (load_super1(&refst, fd, NULL)==0) {
-               struct mdp_superblock_1 *refsb = refst.sb;
-
-               memcpy(sb->device_uuid, refsb->device_uuid, 16);
-               if (memcmp(sb->set_uuid, refsb->set_uuid, 16)==0) {
-                       /* same array, so preserve events and dev_number */
-                       sb->events = refsb->events;
-                       /* bugs in 2.6.17 and earlier mean the dev_number
-                        * chosen in Manage must be preserved
-                        */
-                       if (get_linux_version() >= 2006018)
-                               sb->dev_number = refsb->dev_number;
+               if ((rfd = open("/dev/urandom", O_RDONLY)) < 0 ||
+                   read(rfd, sb->device_uuid, 16) != 16) {
+                       __u32 r[4] = {random(), random(), random(), random()}; /* fallback when /dev/urandom cannot be opened or read */
+                       memcpy(sb->device_uuid, r, 16);
+               }
+               if (rfd >= 0)
+                       close(rfd);
+
+               sb->events = 0;
+
+               refst =*st; /* shallow copy, used only to probe this device for an existing superblock */
+               refst.sb = NULL;
+               if (load_super1(&refst, di->fd, NULL)==0) {
+                       struct mdp_superblock_1 *refsb = refst.sb;
+
+                       memcpy(sb->device_uuid, refsb->device_uuid, 16);
+                       if (memcmp(sb->set_uuid, refsb->set_uuid, 16)==0) {
+                               /* same array, so preserve events and
+                                * dev_number */
+                               sb->events = refsb->events;
+                               /* bugs in 2.6.17 and earlier mean the
+                                * dev_number chosen in Manage must be preserved
+                                */
+                               if (get_linux_version() >= 2006018)
+                                       sb->dev_number = refsb->dev_number;
+                       }
+                       free(refsb);
                }
-               free(refsb);
-       }
 
-       if (!get_dev_size(fd, NULL, &dsize))
-               return 1;
-       dsize >>= 9;
+               if (!get_dev_size(di->fd, NULL, &dsize))
+                       return 1; /* NOTE(review): leaves di->fd open, unlike the dsize < 24 path below */
+               dsize >>= 9; /* shift right by 9 - presumably bytes to 512-byte sectors */
 
-       if (dsize < 24) {
-               close(fd);
-               return 2;
-       }
+               if (dsize < 24) {
+                       close(di->fd); /* NOTE(review): di->fd is not reset to -1 on this path */
+                       return 2;
+               }
 
 
-       /*
-        * Calculate the position of the superblock.
-        * It is always aligned to a 4K boundary and
-        * depending on minor_version, it can be:
-        * 0: At least 8K, but less than 12K, from end of device
-        * 1: At start of device
-        * 2: 4K from start of device.
-        * Depending on the array size, we might leave extra space
-        * for a bitmap.
-        */
-       array_size = __le64_to_cpu(sb->size);
-       /* work out how much space we left for a bitmap */
-       bm_space = choose_bm_space(array_size);
-
-       switch(st->minor_version) {
-       case 0:
-               sb_offset = dsize;
-               sb_offset -= 8*2;
-               sb_offset &= ~(4*2-1);
-               sb->super_offset = __cpu_to_le64(sb_offset);
-               sb->data_offset = __cpu_to_le64(0);
+               /*
+                * Calculate the position of the superblock.
+                * It is always aligned to a 4K boundary and
+                * depending on minor_version, it can be:
+                * 0: At least 8K, but less than 12K, from end of device
+                * 1: At start of device
+                * 2: 4K from start of device.
+                * Depending on the array size, we might leave extra space
+                * for a bitmap.
+                */
+               array_size = __le64_to_cpu(sb->size);
+               /* work out how much space we left for a bitmap */
+               bm_space = choose_bm_space(array_size);
+
+               switch(st->minor_version) {
+               case 0:
+                       sb_offset = dsize;
+                       sb_offset -= 8*2;
+                       sb_offset &= ~(4*2-1);
+                       sb->super_offset = __cpu_to_le64(sb_offset);
+                       sb->data_offset = __cpu_to_le64(0);
                if (sb_offset - bm_space < array_size)
                        bm_space = sb_offset - array_size;
-               sb->data_size = __cpu_to_le64(sb_offset - bm_space);
-               break;
-       case 1:
-               sb->super_offset = __cpu_to_le64(0);
-               if (4*2 + bm_space + __le64_to_cpu(sb->size) > dsize)
-                       bm_space = dsize - __le64_to_cpu(sb->size) - 4*2;
-               sb->data_offset = __cpu_to_le64(bm_space + 4*2);
-               sb->data_size = __cpu_to_le64(dsize - bm_space - 4*2);
-               break;
-       case 2:
-               sb_offset = 4*2;
-               sb->super_offset = __cpu_to_le64(4*2);
-               if (4*2 + 4*2 + bm_space + __le64_to_cpu(sb->size) > dsize)
-                       bm_space = dsize - __le64_to_cpu(sb->size) - 4*2 - 4*2;
-               sb->data_offset = __cpu_to_le64(4*2 + 4*2 + bm_space);
-               sb->data_size = __cpu_to_le64(dsize - 4*2 - 4*2 - bm_space );
-               break;
-       default:
-               return -EINVAL;
-       }
+                       sb->data_size = __cpu_to_le64(sb_offset - bm_space);
+                       break;
+               case 1:
+                       sb->super_offset = __cpu_to_le64(0);
+                       if (4*2 + bm_space + __le64_to_cpu(sb->size) > dsize)
+                               bm_space = dsize - __le64_to_cpu(sb->size) -4*2;
+                       sb->data_offset = __cpu_to_le64(bm_space + 4*2);
+                       sb->data_size = __cpu_to_le64(dsize - bm_space - 4*2);
+                       break;
+               case 2:
+                       sb_offset = 4*2;
+                       sb->super_offset = __cpu_to_le64(4*2);
+                       if (4*2 + 4*2 + bm_space + __le64_to_cpu(sb->size)
+                           > dsize)
+                               bm_space = dsize - __le64_to_cpu(sb->size)
+                                       - 4*2 - 4*2;
+                       sb->data_offset = __cpu_to_le64(4*2 + 4*2 + bm_space);
+                       sb->data_size = __cpu_to_le64(dsize - 4*2 - 4*2
+                                                     - bm_space );
+                       break;
+               default:
+                       return -EINVAL; /* NOTE(review): also returns with di->fd still open */
+               }
 
 
-       sb->sb_csum = calc_sb_1_csum(sb);
-       rv = store_super1(st, fd);
-       if (rv)
-               fprintf(stderr, Name ": failed to write superblock to %s\n", devname);
+               sb->sb_csum = calc_sb_1_csum(sb);
+               rv = store_super1(st, di->fd);
+               if (rv)
+                       fprintf(stderr,
+                               Name ": failed to write superblock to %s\n",
+                               di->devname);
 
-       if (rv == 0 && (__le32_to_cpu(sb->feature_map) & 1))
-               rv = st->ss->write_bitmap(st, fd);
-       close(fd);
+               if (rv == 0 && (__le32_to_cpu(sb->feature_map) & 1)) /* bit 0 of feature_map - presumably MD_FEATURE_BITMAP_OFFSET; confirm */
+                       rv = st->ss->write_bitmap(st, di->fd);
+               close(di->fd);
+               di->fd = -1; /* so the di->fd < 0 test above skips this entry on any later call */
+       }
        return rv;
 }
+#endif
 
 static int compare_super1(struct supertype *st, struct supertype *tst)
 {
@@ -1017,9 +1133,14 @@ static int compare_super1(struct supertype *st, struct supertype *tst)
                return 1;
 
        if (!first) {
-               first = malloc(1024+sizeof(bitmap_super_t) +
-                              sizeof(struct misc_dev_info));
-               memcpy(first, second, 1024+sizeof(bitmap_super_t) +
+               if (posix_memalign((void**)&first, 512,
+                              1024 + 512 +
+                              sizeof(struct misc_dev_info)) != 0) {
+                       fprintf(stderr, Name
+                               ": %s could not allocate superblock\n", __func__);
+                       return 1;
+               }
+               memcpy(first, second, 1024 + 512 + 
                       sizeof(struct misc_dev_info));
                st->sb = first;
                return 0;
@@ -1050,13 +1171,16 @@ static int load_super1(struct supertype *st, int fd, char *devname)
 
        free_super1(st);
 
+       if (st->subarray[0])
+               return 1;
+
        if (st->ss == NULL || st->minor_version == -1) {
                int bestvers = -1;
                struct supertype tst;
                __u64 bestctime = 0;
                /* guess... choose latest ctime */
+               memset(&tst, 0, sizeof(tst));
                tst.ss = &super1;
-               tst.sb = NULL;
                for (tst.minor_version = 0; tst.minor_version <= 2 ; tst.minor_version++) {
                        switch(load_super1(&tst, fd, devname)) {
                        case 0: super = tst.sb;
@@ -1129,10 +1253,15 @@ static int load_super1(struct supertype *st, int fd, char *devname)
                return 1;
        }
 
-       super = malloc(1024 + sizeof(bitmap_super_t) +
-                      sizeof(struct misc_dev_info));
+       if (posix_memalign((void**)&super, 512,
+                      1024 + 512 +
+                      sizeof(struct misc_dev_info)) != 0) {
+               fprintf(stderr, Name ": %s could not allocate superblock\n",
+                       __func__);
+               return 1;
+       }
 
-       if (read(fd, super, 1024) != 1024) {
+       if (aread(fd, super, 1024) != 1024) {
                if (devname)
                        fprintf(stderr, Name ": Cannot read superblock on %s\n",
                                devname);
@@ -1166,7 +1295,7 @@ static int load_super1(struct supertype *st, int fd, char *devname)
 
        bsb = (struct bitmap_super_s *)(((char*)super)+1024);
 
-       misc = (struct misc_dev_info*) (bsb+1);
+       misc = (struct misc_dev_info*) (((char*)super)+1024+512);
        misc->device_size = dsize;
 
        /* Now check on the bitmap superblock */
@@ -1177,8 +1306,8 @@ static int load_super1(struct supertype *st, int fd, char *devname)
         * should get that written out.
         */
        locate_bitmap1(st, fd);
-       if (read(fd, ((char*)super)+1024, sizeof(struct bitmap_super_s))
-           != sizeof(struct bitmap_super_s))
+       if (aread(fd, ((char*)super)+1024, 512)
+           != 512)
                goto no_bitmap;
 
        uuid_from_super1(st, uuid);
@@ -1198,11 +1327,12 @@ static struct supertype *match_metadata_desc1(char *arg)
        struct supertype *st = malloc(sizeof(*st));
        if (!st) return st;
 
+       memset(st, 0, sizeof(*st));
        st->ss = &super1;
        st->max_devs = 384;
        st->sb = NULL;
-       /* Eliminate pointless leading 0 from some versions of mdadm -D */
-       if (strncmp(arg, "01.", 3) == 0)
+       /* leading zeros can be safely ignored.  --detail generates them. */
+       while (*arg == '0')
                arg++;
        if (strcmp(arg, "1.0") == 0 ||
            strcmp(arg, "1.00") == 0) {
@@ -1220,7 +1350,7 @@ static struct supertype *match_metadata_desc1(char *arg)
                return st;
        }
        if (strcmp(arg, "1") == 0 ||
-           strcmp(arg, "default/large") == 0) {
+           strcmp(arg, "default") == 0) {
                st->minor_version = -1;
                return st;
        }
@@ -1414,25 +1544,27 @@ static int write_bitmap1(struct supertype *st, int fd)
        int rv = 0;
 
        int towrite, n;
-       char buf[4096];
+       char *buf = (char*)(((long)(abuf+4096))&~4095UL);
 
        locate_bitmap1(st, fd);
 
-       if (write(fd, ((char*)sb)+1024, sizeof(bitmap_super_t)) !=
-           sizeof(bitmap_super_t))
-               return -2;
+       memset(buf, 0xff, 4096);
+       memcpy(buf, ((char*)sb)+1024, sizeof(bitmap_super_t));
+
        towrite = __le64_to_cpu(bms->sync_size) / (__le32_to_cpu(bms->chunksize)>>9);
        towrite = (towrite+7) >> 3; /* bits to bytes */
-       memset(buf, 0xff, sizeof(buf));
+       towrite += sizeof(bitmap_super_t);
+       towrite = ROUND_UP(towrite, 512);
        while (towrite > 0) {
                n = towrite;
-               if (n > sizeof(buf))
-                       n = sizeof(buf);
+               if (n > 4096)
+                       n = 4096;
                n = write(fd, buf, n);
                if (n > 0)
                        towrite -= n;
                else
                        break;
+               memset(buf, 0xff, 4096);
        }
        fsync(fd);
        if (towrite)
@@ -1448,6 +1580,40 @@ static void free_super1(struct supertype *st)
        st->sb = NULL;
 }
 
+#ifndef MDASSEMBLE
+/*
+ * Report how much space a v1.x superblock leaves free on 'subdev'.
+ *
+ * Returns 0 when level is LEVEL_CONTAINER, when subdev cannot be opened
+ * (a message is printed only if 'verbose'), or when its size cannot be
+ * determined.  Returns 1 when no subdev is given, and 1 with *freesize
+ * set from avail_size1() otherwise.  layout, raiddisks, chunk and size
+ * are not consulted.
+ */
+static int validate_geometry1(struct supertype *st, int level,
+                             int layout, int raiddisks,
+                             int chunk, unsigned long long size,
+                             char *subdev, unsigned long long *freesize,
+                             int verbose)
+{
+       unsigned long long devbytes;
+       int devfd;
+
+       if (level == LEVEL_CONTAINER)
+               return 0;
+       if (subdev == NULL)
+               return 1;
+
+       devfd = open(subdev, O_RDONLY|O_EXCL, 0);
+       if (devfd < 0) {
+               if (verbose)
+                       fprintf(stderr, Name ": super1.x cannot open %s: %s\n",
+                               subdev, strerror(errno));
+               return 0;
+       }
+
+       if (get_dev_size(devfd, subdev, &devbytes)) {
+               /* descriptor is only needed for the size query */
+               close(devfd);
+               *freesize = avail_size1(st, devbytes >> 9);
+               return 1;
+       }
+
+       close(devfd);
+       return 0;
+}
+#endif /* MDASSEMBLE */
+
 struct superswitch super1 = {
 #ifndef MDASSEMBLE
        .examine_super = examine_super1,
@@ -1456,15 +1622,16 @@ struct superswitch super1 = {
        .detail_super = detail_super1,
        .brief_detail_super = brief_detail_super1,
        .export_detail_super = export_detail_super1,
+       .write_init_super = write_init_super1,
+       .validate_geometry = validate_geometry1,
+       .add_to_super = add_to_super1,
 #endif
        .match_home = match_home1,
        .uuid_from_super = uuid_from_super1,
        .getinfo_super = getinfo_super1,
        .update_super = update_super1,
        .init_super = init_super1,
-       .add_to_super = add_to_super1,
        .store_super = store_super1,
-       .write_init_super = write_init_super1,
        .compare_super = compare_super1,
        .load_super = load_super1,
        .match_metadata_desc = match_metadata_desc1,
@@ -1473,10 +1640,10 @@ struct superswitch super1 = {
        .locate_bitmap = locate_bitmap1,
        .write_bitmap = write_bitmap1,
        .free_super = free_super1,
-       .major = 1,
 #if __BYTE_ORDER == BIG_ENDIAN
        .swapuuid = 0,
 #else
        .swapuuid = 1,
 #endif
+       .name = "1.x",
 };
diff --git a/sysfs.c b/sysfs.c
index 6350242b6db175d0896cd01d9717ee6f63e03fd5..81ccb53fe5d23bfd64a60ebb74c753c58e547e40 100644 (file)
--- a/sysfs.c
+++ b/sysfs.c
@@ -2,7 +2,7 @@
  * sysfs - extract md related information from sysfs.  Part of:
  * mdadm - manage Linux "md" devices aka RAID arrays.
  *
- * Copyright (C) 2006 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2006-2009 Neil Brown <neilb@suse.de>
  *
  *
  *    This program is free software; you can redistribute it and/or modify
@@ -25,6 +25,7 @@
 
 #include       "mdadm.h"
 #include       <dirent.h>
+#include       <ctype.h>
 
 int load_sys(char *path, char *buf)
 {
@@ -34,10 +35,10 @@ int load_sys(char *path, char *buf)
                return -1;
        n = read(fd, buf, 1024);
        close(fd);
-       if (n <=0 || n >= 1024)
+       if (n <0 || n >= 1024)
                return -1;
        buf[n] = 0;
-       if (buf[n-1] == '\n')
+       if (n && buf[n-1] == '\n')
                buf[n-1] = 0;
        return 0;
 }
@@ -56,6 +57,47 @@ void sysfs_free(struct mdinfo *sra)
        }
 }
 
+/*
+ * Open a sysfs attribute of an md array.
+ *
+ * The path is /sys/block/<md name>/md/<attr>, or
+ * /sys/block/<md name>/md/<devname>/<attr> when devname is non-NULL;
+ * the md name comes from devnum2devname(devnum).  The file is opened
+ * O_RDWR, falling back to O_RDONLY if that fails with EACCES.
+ *
+ * Returns the open descriptor, or -1 if the md name cannot be
+ * resolved, the path does not fit the buffer (errno is then set to
+ * ENAMETOOLONG), or open() fails.
+ */
+int sysfs_open(int devnum, char *devname, char *attr)
+{
+       char fname[256];
+       int len;
+       int fd;
+       char *mdname = devnum2devname(devnum);
+
+       if (!mdname)
+               return -1;
+
+       /* Build the path with an explicit bound: devname and attr are
+        * caller supplied and must not be able to overrun fname. */
+       if (devname)
+               len = snprintf(fname, sizeof(fname), "/sys/block/%s/md/%s/%s",
+                              mdname, devname, attr);
+       else
+               len = snprintf(fname, sizeof(fname), "/sys/block/%s/md/%s",
+                              mdname, attr);
+       free(mdname);
+       if (len < 0 || len >= (int)sizeof(fname)) {
+               errno = ENAMETOOLONG;
+               return -1;
+       }
+
+       fd = open(fname, O_RDWR);
+       if (fd < 0 && errno == EACCES)
+               fd = open(fname, O_RDONLY);
+       return fd;
+}
+
+/*
+ * Fill mdi->sys_name with the sysfs directory name of an md array.
+ *
+ * With fd >= 0 the device number is taken from fd2devnum(fd), but only
+ * after a RAID_VERSION ioctl on fd succeeds; otherwise the 'devnum'
+ * argument is used.  A non-negative number yields "md<N>", a negative
+ * one "md_d<-1-N>".  sys_name is left as an empty string when the ioctl
+ * fails or the number is NoMdDev.
+ */
+void sysfs_init(struct mdinfo *mdi, int fd, int devnum)
+{
+       mdi->sys_name[0] = 0;
+
+       if (fd >= 0) {
+               mdu_version_t vers;
+
+               if (ioctl(fd, RAID_VERSION, &vers) != 0)
+                       return;
+               devnum = fd2devnum(fd);
+       }
+
+       if (devnum == NoMdDev)
+               return;
+
+       if (devnum < 0)
+               sprintf(mdi->sys_name, "md_d%d", -1-devnum);
+       else
+               sprintf(mdi->sys_name, "md%d", devnum);
+}
+
+
 struct mdinfo *sysfs_read(int fd, int devnum, unsigned long options)
 {
        /* Longest possible name in sysfs, mounted at /sys, is
@@ -69,55 +111,19 @@ struct mdinfo *sysfs_read(int fd, int devnum, unsigned long options)
        char *dbase;
        struct mdinfo *sra;
        struct mdinfo *dev;
-       DIR *dir;
+       DIR *dir = NULL;
        struct dirent *de;
 
        sra = malloc(sizeof(*sra));
        if (sra == NULL)
                return sra;
-       sra->next = NULL;
-
-       if (fd >= 0) {
-               struct stat stb;
-               mdu_version_t vers;
-               if (fstat(fd, &stb)) return NULL;
-               if (ioctl(fd, RAID_VERSION, &vers) != 0)
-                       return NULL;
-               if (major(stb.st_rdev) == MD_MAJOR)
-                       sprintf(sra->sys_name, "md%d", (int)minor(stb.st_rdev));
-               else if (major(stb.st_rdev) == get_mdp_major())
-                       sprintf(sra->sys_name, "md_d%d",
-                               (int)minor(stb.st_rdev)>>MdpMinorShift);
-               else {
-                       /* must be an extended-minor partition. Look at the
-                        * /sys/dev/block/%d:%d link which must look like
-                        * ../../block/mdXXX/mdXXXpYY
-                        */
-                       char path[30];
-                       char link[200];
-                       char *cp;
-                       int n;
-                       sprintf(path, "/sys/dev/block/%d:%d", major(stb.st_rdev),
-                               minor(stb.st_rdev));
-                       n = readlink(path, link, sizeof(link)-1);
-                       if (n <= 0)
-                               return NULL;
-                       link[n] = 0;
-                       cp = strrchr(link, '/');
-                       if (cp) *cp = 0;
-                       cp = strchr(link, '/');
-                       if (cp && strncmp(cp, "/md", 3) == 0)
-                               strcpy(sra->sys_name, cp+1);
-                       else
-                               return NULL;
-               }
-       } else {
-               if (devnum >= 0)
-                       sprintf(sra->sys_name, "md%d", devnum);
-               else
-                       sprintf(sra->sys_name, "md_d%d",
-                               -1-devnum);
+       memset(sra, 0, sizeof(*sra));
+       sysfs_init(sra, fd, devnum);
+       if (sra->sys_name[0] == 0) {
+               free(sra);
+               return NULL;
        }
+
        sprintf(fname, "/sys/block/%s/md/", sra->sys_name);
        base = fname + strlen(fname);
 
@@ -134,10 +140,12 @@ struct mdinfo *sysfs_read(int fd, int devnum, unsigned long options)
                        sra->array.major_version = -1;
                        sra->array.minor_version = -2;
                        strcpy(sra->text_version, buf+9);
-               } else
+               } else {
                        sscanf(buf, "%d.%d",
                               &sra->array.major_version,
                               &sra->array.minor_version);
+                       strcpy(sra->text_version, buf);
+               }
        }
        if (options & GET_LEVEL) {
                strcpy(base, "level");
@@ -151,6 +159,18 @@ struct mdinfo *sysfs_read(int fd, int devnum, unsigned long options)
                        goto abort;
                sra->array.layout = strtoul(buf, NULL, 0);
        }
+       if (options & GET_DISKS) {
+               strcpy(base, "raid_disks");
+               if (load_sys(fname, buf))
+                       goto abort;
+               sra->array.raid_disks = strtoul(buf, NULL, 0);
+       }
+       if (options & GET_DEGRADED) {
+               strcpy(base, "degraded");
+               if (load_sys(fname, buf))
+                       goto abort;
+               sra->array.failed_disks = strtoul(buf, NULL, 0);
+       }
        if (options & GET_COMPONENT) {
                strcpy(base, "component_size");
                if (load_sys(fname, buf))
@@ -177,6 +197,35 @@ struct mdinfo *sysfs_read(int fd, int devnum, unsigned long options)
                        goto abort;
                sra->mismatch_cnt = strtoul(buf, NULL, 0);
        }
+       if (options & GET_SAFEMODE) {
+               int scale = 1;
+               int dot = 0;
+               int i;
+               unsigned long msec;
+               size_t len;
+
+               strcpy(base, "safe_mode_delay");
+               if (load_sys(fname, buf))
+                       goto abort;
+
+               /* remove a period, and count digits after it */
+               len = strlen(buf);
+               for (i = 0; i < len; i++) {
+                       if (dot) {
+                               if (isdigit(buf[i])) {
+                                       buf[i-1] = buf[i];
+                                       scale *= 10;
+                               }
+                               buf[i] = 0;
+                       } else if (buf[i] == '.') {
+                               dot=1;
+                               buf[i] = 0;
+                       }
+               }
+               msec = strtoul(buf, NULL, 10);
+               msec = (msec * 1000) / scale;
+               sra->safe_mode_delay = msec;
+       }
 
        if (! (options & GET_DEVS))
                return sra;
@@ -200,22 +249,57 @@ struct mdinfo *sysfs_read(int fd, int devnum, unsigned long options)
                dev = malloc(sizeof(*dev));
                if (!dev)
                        goto abort;
-               dev->next = sra->devs;
-               sra->devs = dev;
-               strcpy(dev->sys_name, de->d_name);
 
                /* Always get slot, major, minor */
                strcpy(dbase, "slot");
-               if (load_sys(fname, buf))
-                       goto abort;
+               if (load_sys(fname, buf)) {
+                       /* hmm... unable to read 'slot' maybe the device
+                        * is going away?
+                        */
+                       strcpy(dbase, "block");
+                       if (readlink(fname, buf, sizeof(buf)) < 0 &&
+                           errno != ENAMETOOLONG) {
+                               /* ...yup device is gone */
+                               free(dev);
+                               continue;
+                       } else {
+                               /* slot is unreadable but 'block' link
+                                * still intact... something bad is happening
+                                * so abort
+                                */
+                               free(dev);
+                               goto abort;
+                       }
+                       
+               }
+               strcpy(dev->sys_name, de->d_name);
                dev->disk.raid_disk = strtoul(buf, &ep, 10);
                if (*ep) dev->disk.raid_disk = -1;
 
                strcpy(dbase, "block/dev");
-               if (load_sys(fname, buf))
-                       goto abort;
+               if (load_sys(fname, buf)) {
+                       free(dev);
+                       if (options & SKIP_GONE_DEVS)
+                               continue;
+                       else
+                               goto abort;
+               }
                sscanf(buf, "%d:%d", &dev->disk.major, &dev->disk.minor);
 
+               /* special case check for block devices that can go 'offline' */
+               if (options & SKIP_GONE_DEVS) {
+                       strcpy(dbase, "block/device/state");
+                       if (load_sys(fname, buf) == 0 &&
+                           strncmp(buf, "offline", 7) == 0) {
+                               free(dev);
+                               continue;
+                       }
+               }
+
+               /* finally add this disk to the array */
+               dev->next = sra->devs;
+               sra->devs = dev;
+
                if (options & GET_OFFSET) {
                        strcpy(dbase, "offset");
                        if (load_sys(fname, buf))
@@ -226,7 +310,7 @@ struct mdinfo *sysfs_read(int fd, int devnum, unsigned long options)
                        strcpy(dbase, "size");
                        if (load_sys(fname, buf))
                                goto abort;
-                       dev->component_size = strtoull(buf, NULL, 0);
+                       dev->component_size = strtoull(buf, NULL, 0) * 2;
                }
                if (options & GET_STATE) {
                        dev->disk.state = 0;
@@ -247,13 +331,41 @@ struct mdinfo *sysfs_read(int fd, int devnum, unsigned long options)
                        dev->errors = strtoul(buf, NULL, 0);
                }
        }
+       closedir(dir);
        return sra;
 
  abort:
+       if (dir)
+               closedir(dir);
        sysfs_free(sra);
        return NULL;
 }
 
+int sysfs_attr_match(const char *attr, const char *str)
+{
+       /* Decide whether attr, as read from a sysfs file, is the word str.
+        * attr matches when it is identical to str, or when it starts
+        * with str and the next character is a comma or a newline.
+        * Returns 1 on a match and 0 otherwise.
+        */
+       for (; *str != '\0' && *attr == *str; attr++, str++)
+               ;
+
+       /* str not fully consumed: attr ended early or differed */
+       if (*str != '\0')
+               return 0;
+       return *attr == '\0' || *attr == ',' || *attr == '\n';
+}
+
+int sysfs_match_word(const char *word, char **list)
+{
+       /* Look word up in the NULL-terminated array list, comparing with
+        * sysfs_attr_match().  Returns the index of the first matching
+        * entry, or the number of entries in list when nothing matches.
+        */
+       int idx = 0;
+
+       while (list[idx] != NULL && !sysfs_attr_match(word, list[idx]))
+               idx++;
+       return idx;
+}
+
 unsigned long long get_component_size(int fd)
 {
        /* Find out the component size of the array.
@@ -290,6 +402,7 @@ int sysfs_set_str(struct mdinfo *sra, struct mdinfo *dev,
        char fname[50];
        int n;
        int fd;
+
        sprintf(fname, "/sys/block/%s/md/%s/%s",
                sra->sys_name, dev?dev->sys_name:"", name);
        fd = open(fname, O_WRONLY);
@@ -297,8 +410,11 @@ int sysfs_set_str(struct mdinfo *sra, struct mdinfo *dev,
                return -1;
        n = write(fd, val, strlen(val));
        close(fd);
-       if (n != strlen(val))
+       if (n != strlen(val)) {
+               dprintf(Name ": failed to write '%s' to '%s' (%s)\n",
+                       val, fname, strerror(errno));
                return -1;
+       }
        return 0;
 }
 
@@ -310,6 +426,22 @@ int sysfs_set_num(struct mdinfo *sra, struct mdinfo *dev,
        return sysfs_set_str(sra, dev, name, valstr);
 }
 
+int sysfs_uevent(struct mdinfo *sra, char *event)
+{
+       /* Write the string event to /sys/block/<sra->sys_name>/uevent.
+        * Returns 0 when the whole string was written, -1 when the file
+        * cannot be opened or the write fails or is short.
+        */
+       char fname[50];
+       int n;
+       int fd;
+
+       /* bounded: a long sys_name yields a truncated path, not an overflow */
+       snprintf(fname, sizeof(fname), "/sys/block/%s/uevent",
+                sra->sys_name);
+       fd = open(fname, O_WRONLY);
+       if (fd < 0)
+               return -1;
+       n = write(fd, event, strlen(event));
+       close(fd);
+       /* previously the result of write() was ignored */
+       if (n < 0 || (size_t)n != strlen(event))
+               return -1;
+       return 0;
+}
+
 int sysfs_get_ll(struct mdinfo *sra, struct mdinfo *dev,
                       char *name, unsigned long long *val)
 {
@@ -333,3 +465,401 @@ int sysfs_get_ll(struct mdinfo *sra, struct mdinfo *dev,
                return -1;
        return 0;
 }
+
+int sysfs_get_str(struct mdinfo *sra, struct mdinfo *dev,
+                      char *name, char *val, int size)
+{
+       /* Read the sysfs attribute name of the array sra (or of its
+        * component dev, when dev is not NULL) into val and
+        * NUL-terminate it.  size is the capacity of val in bytes.
+        * Returns the number of bytes read, or -1 when the file cannot
+        * be opened, nothing could be read, or the contents leave no
+        * room in val for the terminating NUL.
+        */
+       char fname[50];
+       int n;
+       int fd;
+
+       snprintf(fname, sizeof(fname), "/sys/block/%s/md/%s/%s",
+                sra->sys_name, dev?dev->sys_name:"", name);
+       fd = open(fname, O_RDONLY);
+       if (fd < 0)
+               return -1;
+       n = read(fd, val, size);
+       close(fd);
+       /* n == size would make val[n] below write past the buffer */
+       if (n <= 0 || n >= size)
+               return -1;
+       val[n] = 0;
+       return n;
+}
+
+int sysfs_set_safemode(struct mdinfo *sra, unsigned long ms)
+{
+       /* Set the array's safe_mode_delay attribute to ms milliseconds.
+        * The value is written as "<seconds>.<milliseconds>", the
+        * fraction being zero-padded to three digits.
+        * Returns whatever sysfs_set_str() returns.
+        */
+       unsigned long sec;
+       unsigned long msec;
+       char delay[30];
+
+       sec = ms / 1000;
+       msec = ms % 1000;
+
+       /* %lu: both values are unsigned long */
+       snprintf(delay, sizeof(delay), "%lu.%03lu\n", sec, msec);
+       /*                       this '\n' ^ needed for kernels older than 2.6.28 */
+       return sysfs_set_str(sra, NULL, "safe_mode_delay", delay);
+}
+
+int sysfs_set_array(struct mdinfo *info, int vers)
+{
+       /* Push the array description held in info to the md sysfs
+        * attributes.  For external metadata (major_version -1,
+        * minor_version -2) "metadata_version" is written first; that
+        * step is refused, with return value 1, when (vers % 100) < 2 or
+        * when the write fails.  Arrays with a negative level get
+        * nothing else written.  Otherwise the results of the individual
+        * attribute writes are OR-ed together and returned (0 when all
+        * of them succeeded).
+        */
+       int failed = 0;
+       char ver[100];
+       int external = (info->array.major_version == -1 &&
+                       info->array.minor_version == -2);
+
+       ver[0] = 0;
+       if (external) {
+               strcpy(ver, "external:");
+               strcat(ver, info->text_version);
+
+               if ((vers % 100) < 2 ||
+                   sysfs_set_str(info, NULL, "metadata_version", ver) < 0) {
+                       fprintf(stderr, Name ": This kernel does not "
+                               "support external metadata.\n");
+                       return 1;
+               }
+       }
+
+       if (info->array.level < 0)
+               return 0; /* FIXME */
+
+       failed |= sysfs_set_str(info, NULL, "level",
+                               map_num(pers, info->array.level));
+       failed |= sysfs_set_num(info, NULL, "raid_disks",
+                               info->array.raid_disks);
+       failed |= sysfs_set_num(info, NULL, "chunk_size",
+                               info->array.chunk_size);
+       failed |= sysfs_set_num(info, NULL, "layout", info->array.layout);
+       failed |= sysfs_set_num(info, NULL, "component_size",
+                               info->component_size/2);
+
+       if (info->custom_array_size) {
+               int rc = sysfs_set_num(info, NULL, "array_size",
+                                      info->custom_array_size/2);
+
+               /* a missing attribute is only worth a warning */
+               if (rc && errno == ENOENT) {
+                       fprintf(stderr, Name ": This kernel does not "
+                               "have the md/array_size attribute, "
+                               "the array may be larger than expected\n");
+                       rc = 0;
+               }
+               failed |= rc;
+       }
+
+       if (info->array.level > 0)
+               failed |= sysfs_set_num(info, NULL, "resync_start",
+                                       info->resync_start);
+       return failed;
+}
+
+int sysfs_add_disk(struct mdinfo *sra, struct mdinfo *sd, int in_sync)
+{
+       /* Add the device sd (identified by sd->disk.major/minor) to the
+        * array sra through sysfs: write "new_dev", derive sd->sys_name
+        * ("dev-<name>") from the /sys/dev/block/<major>:<minor> link,
+        * then set the device's offset and size and, unless the array is
+        * a container, its slot (and "in_sync" state when in_sync is set).
+        * Returns 0 on success and non-zero on failure.
+        */
+       char dv[100];
+       char nm[512];   /* link target; sysfs paths can exceed 100 bytes */
+       char *dname;
+       int rv;
+
+       sprintf(dv, "%d:%d", sd->disk.major, sd->disk.minor);
+       rv = sysfs_set_str(sra, NULL, "new_dev", dv);
+       if (rv)
+               return rv;
+
+       memset(nm, 0, sizeof(nm));
+       sprintf(dv, "/sys/dev/block/%d:%d", sd->disk.major, sd->disk.minor);
+       rv = readlink(dv, nm, sizeof(nm));
+       /* readlink() does not terminate the string; a completely full
+        * buffer means the target may be truncated and leaves no room
+        * for the NUL
+        */
+       if (rv <= 0 || rv >= (int)sizeof(nm))
+               return -1;
+       nm[rv] = '\0';
+       dname = strrchr(nm, '/');
+       if (!dname)
+               return -1;
+       dname++;
+       strcpy(sd->sys_name, "dev-");
+       strcpy(sd->sys_name+4, dname);
+
+       rv = sysfs_set_num(sra, sd, "offset", sd->data_offset);
+       rv |= sysfs_set_num(sra, sd, "size", (sd->component_size+1) / 2);
+       if (sra->array.level != LEVEL_CONTAINER) {
+               if (in_sync)
+                       /* This can correctly fail if array isn't started,
+                        * yet, so just ignore status for now.
+                        */
+                       sysfs_set_str(sra, sd, "state", "in_sync");
+               rv |= sysfs_set_num(sra, sd, "slot", sd->disk.raid_disk);
+       }
+       return rv;
+}
+
+#if 0
+int sysfs_disk_to_sg(int fd)
+{
+       /* from an open block device, try find and open its corresponding
+        * scsi_generic interface
+        *
+        * Returns an open read-only file descriptor on a temporary
+        * character device node, or a negative value on failure.
+        * This function is compiled out by the surrounding #if 0.
+        */
+       struct stat st;
+       char path[256];
+       char sg_path[256];
+       char sg_major_minor[8];
+       char *c;
+       DIR *dir;
+       struct dirent *de;
+       int major, minor, rv;
+
+       if (fstat(fd, &st))
+               return -1;
+
+       snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/device",
+                major(st.st_rdev), minor(st.st_rdev));
+
+       dir = opendir(path);
+       if (!dir)
+               return -1;
+
+       /* find the first directory entry named "scsi_generic:..." */
+       de = readdir(dir);
+       while (de) {
+               if (strncmp("scsi_generic:", de->d_name,
+                           strlen("scsi_generic:")) == 0)
+                       break;
+               de = readdir(dir);
+       }
+       closedir(dir);
+
+       if (!de)
+               return -1;
+
+       /* NOTE(review): de->d_name is read here after closedir(dir);
+        * the entry may no longer be valid at this point - confirm
+        * before re-enabling this code.
+        */
+       snprintf(sg_path, sizeof(sg_path), "%s/%s/dev", path, de->d_name);
+       fd = open(sg_path, O_RDONLY);
+       if (fd < 0)
+               return fd;
+
+       /* the "dev" file holds "<major>:<minor>" followed by a newline */
+       rv = read(fd, sg_major_minor, sizeof(sg_major_minor));
+       close(fd);
+       if (rv < 0)
+               return -1;
+       else
+               /* NOTE(review): rv == 0 would index sg_major_minor[-1] */
+               sg_major_minor[rv - 1] = '\0';
+
+       /* NOTE(review): strchr() result is used without a NULL check */
+       c = strchr(sg_major_minor, ':');
+       *c = '\0';
+       c++;
+       major = strtol(sg_major_minor, NULL, 10);
+       minor = strtol(c, NULL, 10);
+       /* create a private device node, open it, and unlink it again */
+       snprintf(path, sizeof(path), "/dev/.tmp.md.%d:%d:%d",
+                (int) getpid(), major, minor);
+       if (mknod(path, S_IFCHR|0600, makedev(major, minor))==0) {
+                       fd = open(path, O_RDONLY);
+                       unlink(path);
+                       return fd;
+       }
+
+       return -1;
+}
+#endif
+
+int sysfs_disk_to_scsi_id(int fd, __u32 *id)
+{
+       /* from an open block device, try to retrieve its scsi_id
+        *
+        * The id is built from the "scsi_disk:a:b:c:d" entry in the
+        * device's sysfs directory: the four numbers are shifted left by
+        * 24, 16, 8 and 0 bits respectively and OR-ed together.
+        * Returns 0 and stores the result in *id on success; returns 1
+        * (leaving *id untouched) when the device cannot be examined, no
+        * such entry exists, or the entry does not have four fields.
+        */
+       struct stat st;
+       char path[256];
+       char name[256];
+       char *c1, *c2;
+       DIR *dir;
+       struct dirent *de;
+       __u32 val = 0;
+       int shift;
+       int found = 0;
+
+       if (fstat(fd, &st))
+               return 1;
+
+       snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/device",
+                major(st.st_rdev), minor(st.st_rdev));
+
+       dir = opendir(path);
+       if (!dir)
+               return 1;
+
+       while ((de = readdir(dir)) != NULL) {
+               if (strncmp("scsi_disk:", de->d_name,
+                           strlen("scsi_disk:")) == 0) {
+                       /* keep a copy: de must not be used after closedir() */
+                       snprintf(name, sizeof(name), "%s", de->d_name);
+                       found = 1;
+                       break;
+               }
+       }
+       closedir(dir);
+
+       if (!found)
+               return 1;
+
+       /* the "scsi_disk:" prefix matched, so the first ':' is present */
+       c1 = strchr(name, ':');
+       c1++;
+       for (shift = 24; shift > 0; shift -= 8) {
+               c2 = strchr(c1, ':');
+               if (!c2)
+                       return 1; /* fewer than four fields */
+               *c2 = '\0';
+               val |= (__u32)strtol(c1, NULL, 10) << shift;
+               c1 = c2 + 1;
+       }
+       val |= (__u32)strtol(c1, NULL, 10);
+       *id = val;
+
+       return 0;
+}
+
+
+int sysfs_unique_holder(int devnum, long rdev)
+{
+       /* Check that devnum is a holder of rdev,
+        * and is the only holder.
+        * we should be locked against races by
+        * an O_EXCL on devnum
+        *
+        * Returns 1 when the holders directory of rdev has at least one
+        * entry and every entry resolves to devnum, 0 otherwise.
+        * errno is set to ENOENT when the directory or a holder's "dev"
+        * file cannot be opened or parsed, and to EEXIST when a holder
+        * other than devnum is found.
+        */
+       DIR *dir;
+       struct dirent *de;
+       char dirname[100];
+       size_t l;
+       int found = 0;
+
+       sprintf(dirname, "/sys/dev/block/%d:%d/holders",
+               major(rdev), minor(rdev));
+       dir = opendir(dirname);
+       errno = ENOENT;
+       if (!dir)
+               return 0;
+       l = strlen(dirname);
+       while ((de = readdir(dir)) != NULL) {
+               char buf[10];
+               int n;
+               int mj, mn;
+               char c;
+               int fd;
+
+               if (de->d_ino == 0)
+                       continue;
+               if (de->d_name[0] == '.')
+                       continue;
+               /* bounded append of "/<holder>/dev" after the base path */
+               snprintf(dirname + l, sizeof(dirname) - l, "/%s/dev",
+                        de->d_name);
+               fd = open(dirname, O_RDONLY);
+               if (fd < 0) {
+                       errno = ENOENT;
+                       break;
+               }
+               n = read(fd, buf, sizeof(buf)-1);
+               close(fd);
+               /* a failed read must not index buf[-1]; treat it as empty
+                * so that the parse below rejects it
+                */
+               if (n < 0)
+                       n = 0;
+               buf[n] = 0;
+               if (sscanf(buf, "%d:%d%c", &mj, &mn, &c) != 3 ||
+                   c != '\n') {
+                       errno = ENOENT;
+                       break;
+               }
+               /* holders whose major is not MD_MAJOR are mapped to a
+                * negative number derived from the minor
+                */
+               if (mj != MD_MAJOR)
+                       mn = -1-(mn>>6);
+
+               if (devnum != mn) {
+                       errno = EEXIST;
+                       break;
+               }
+               found = 1;
+       }
+       closedir(dir);
+       /* de is non-NULL only when the loop was left through a break */
+       if (de)
+               return 0;
+       else
+               return found;
+}
+
+#ifndef MDASSEMBLE
+
+/* array_state values that WaitClean() accepts as "clean"; the list is
+ * NULL-terminated so that sysfs_match_word() can scan it.
+ */
+static char *clean_states[] = {
+       "clear", "inactive", "readonly", "read-auto", "clean", NULL };
+
+/* Wait for the md array behind the device node dev to reach one of the
+ * clean_states.  Messages are printed to stderr only when verbose is set.
+ * Returns 1 when dev cannot be opened, or when waiting or the final
+ * ping_monitor() call fails; returns 0 when no waiting is needed, when
+ * the sysfs attributes cannot be read, or when the wait succeeds.
+ */
+int WaitClean(char *dev, int verbose)
+{
+       int fd;
+       struct mdinfo *mdi;
+       int rv = 1;
+       int devnum;
+
+       fd = open(dev, O_RDONLY); 
+       if (fd < 0) {
+               if (verbose)
+                       fprintf(stderr, Name ": Couldn't open %s: %s\n", dev, strerror(errno));
+               return 1;
+       }
+
+       devnum = fd2devnum(fd);
+       mdi = sysfs_read(fd, devnum, GET_VERSION|GET_LEVEL|GET_SAFEMODE);
+       if (!mdi) {
+               if (verbose)
+                       fprintf(stderr, Name ": Failed to read sysfs attributes for "
+                               "%s\n", dev);
+               close(fd);
+               return 0;
+       }
+
+       /* rv stays 1 ("must wait") unless one of the checks below clears it */
+       switch(mdi->array.level) {
+       case LEVEL_LINEAR:
+       case LEVEL_MULTIPATH:
+       case 0:
+               /* safemode delay is irrelevant for these levels */
+               rv = 0;
+               
+       }
+
+       /* for internal metadata the kernel handles the final clean
+        * transition, containers can never be dirty
+        */
+       if (!is_subarray(mdi->text_version))
+               rv = 0;
+
+       /* safemode disabled ? */
+       if (mdi->safe_mode_delay == 0)
+               rv = 0;
+
+       if (rv) {
+               /* NOTE(review): state_fd is not checked for failure here;
+                * a negative value makes the first read() below fail.
+                */
+               int state_fd = sysfs_open(fd2devnum(fd), NULL, "array_state");
+               char buf[20];
+               fd_set fds;
+               struct timeval tm;
+
+               /* minimize the safe_mode_delay and prepare to wait up to 5s
+                * for writes to quiesce
+                */
+               sysfs_set_safemode(mdi, 1);
+               tm.tv_sec = 5;
+               tm.tv_usec = 0;
+
+               /* give mdmon a chance to checkpoint resync */
+               sysfs_set_str(mdi, NULL, "sync_action", "idle");
+
+               FD_ZERO(&fds);
+
+               /* wait for array_state to be clean */
+               while (1) {
+                       /* NOTE(review): buf is not NUL-terminated after
+                        * this read - confirm the matcher never runs
+                        * past the bytes actually read.
+                        */
+                       rv = read(state_fd, buf, sizeof(buf));
+                       if (rv < 0)
+                               break;
+                       /* indexes 0..4 are real entries of clean_states;
+                        * 5 is returned when nothing matched
+                        */
+                       if (sysfs_match_word(buf, clean_states) <= 4)
+                               break;
+                       /* state_fd is passed as the exceptional-condition
+                        * set; a select() timeout (rv == 0) does not end
+                        * the loop
+                        */
+                       FD_SET(state_fd, &fds);
+                       rv = select(state_fd + 1, NULL, NULL, &fds, &tm);
+                       if (rv < 0 && errno != EINTR)
+                               break;
+                       /* rewind so the next read() starts at offset 0 */
+                       lseek(state_fd, 0, SEEK_SET);
+               }
+               if (rv < 0)
+                       rv = 1;
+               else if (ping_monitor(mdi->text_version) == 0) {
+                       /* we need to ping to close the window between array
+                        * state transitioning to clean and the metadata being
+                        * marked clean
+                        */
+                       rv = 0;
+               } else
+                       rv = 1;
+               if (rv && verbose)
+                       fprintf(stderr, Name ": Error waiting for %s to be clean\n",
+                               dev);
+
+               /* restore the original safe_mode_delay */
+               sysfs_set_safemode(mdi, mdi->safe_mode_delay);
+               close(state_fd);
+       }
+
+       sysfs_free(mdi);
+       close(fd);
+
+       return rv;
+}
+#endif /* MDASSEMBLE */
diff --git a/test b/test
index 1a79bab42295102a8731d854d75e3997b32f1bf1..133f8ff41a33b41d46fe0cfe8860207a881f6fa8 100644 (file)
--- a/test
+++ b/test
@@ -39,9 +39,13 @@ mdsize1b=19988
 mdsize11=19992
 mdsize12=19988
 
+# ddf needs bigger devices as 32Meg is reserved!
+ddfsize=65536
+
 cleanup() {
-       $mdadm -Ss
-       for d in 0 1 2 3 4 5 6 7
+       udevadm settle
+       $mdadm -Ssq
+       for d in 0 1 2 3 4 5 6 7  8 9 10 11 12
        do
            losetup -d /dev/loop$d ; # rm -f $targetdir/mdtest$d
         done
@@ -50,9 +54,11 @@ cleanup() {
 trap cleanup 0 1 2 3 15
 
 devlist=
-for d in 0 1 2 3 4 5 6 7
+for d in 0 1 2 3 4 5 6 7 8 9 10 11 12
 do
-   [ -f $targetdir/mdtest$d ] || dd if=/dev/zero of=$targetdir/mdtest$d count=$size bs=1K > /dev/null 2>&1
+   sz=$size
+   if [ $d -gt 7 ]; then sz=$ddfsize ; fi
+   [ -f $targetdir/mdtest$d ] || dd if=/dev/zero of=$targetdir/mdtest$d count=$sz bs=1K > /dev/null 2>&1
    [ -b /dev/loop$d ] || mknod /dev/loop$d b 7 $d
    if [ $d -eq 7 ]
    then
@@ -63,10 +69,13 @@ do
    eval dev$d=/dev/loop$d
    eval file$d=$targetdir/mdtest$d
    eval devlist=\"\$devlist \$dev$d\"
+   #" <-- add this quote to un-confuse vim syntax highlighting
 done
 path0=$dev6
 path1=$dev7
 
+ulimit -c unlimited
+[ -f /proc/mdstat ] || modprobe md_mod
 echo 2000 > /proc/sys/dev/raid/speed_limit_max
 echo 0 > /sys/module/md_mod/parameters/start_ro
 
@@ -77,11 +86,16 @@ fi
 # mdadm always adds --quiet, and we want to see any unexpected messages
 mdadm() {
     rm -f $targetdir/stderr
+    case $* in
+       *-S* ) udevadm settle;;
+    esac
     case $* in
        *-C* ) $mdadm 2> $targetdir/stderr --quiet "$@" --auto=yes;;
         * )   $mdadm 2> $targetdir/stderr --quiet "$@"
     esac
+    rv=$?
     cat >&2 $targetdir/stderr
+    return $rv
 }
 
 # check various things
@@ -155,6 +169,7 @@ testdev() {
    dsize=$[dvsize/chunk]
    dsize=$[dsize*chunk]
    rasize=$[dsize*2*cnt]
+   if [ `/sbin/blockdev --getsize $dev` -eq 0 ]; then sleep 2 ; fi
    if [ $rasize -ne `/sbin/blockdev --getsize $dev` ]
    then
      echo "ERROR: size is wrong for $dev: $cnt * $dvsize (chunk=$chunk) = $rasize, not `/sbin/blockdev --getsize $dev`"
@@ -167,21 +182,42 @@ rotest() {
   fsck -fn $dev >&2
 }
 
+setup_environment() {
+   # Source the per-test environment file named by $1, if it exists,
+   # and run the setup_env hook that it defines.
+   # "$1" is quoted so that an empty argument makes the test false.
+   if [ -f "$1" ]; then
+      . "$1"
+      setup_env
+   fi
+}
 
+reset_environment() {
+   # Undo setup_environment: if the environment file named by $1 exists,
+   # run its reset_env hook and then drop both hooks.
+   # "$1" is quoted so that an empty argument makes the test false.
+   if [ -f "$1" ]; then
+      reset_env
+      unset setup_env
+      unset reset_env
+   fi
+}
 
 for script in tests/$prefix tests/$prefix*[^~]
 do
   if [ -f "$script" ]
   then
    rm -f $targetdir/stderr
+   # stop all arrays, just in case some script left an array active.
+   mdadm -Ssq
+   mdadm --zero $devlist 2> /dev/null
+   mdadm --zero $devlist 2> /dev/null
+   environment="tests/env-`basename $script`"
+   setup_environment $environment
    # source script in a subshell, so it has access to our
    # namespace, but cannot change it.
    if ( set -ex ; . $script )  2> $targetdir/log
    then echo "$script succeeded" 
    else cat $targetdir/log ; cat $targetdir/stderr
         echo "$script failed"
+       reset_environment $environment
        exit 1
    fi
+   reset_environment $environment
   fi
 done
 exit 0
diff --git a/tests/01r5integ b/tests/01r5integ
new file mode 100644 (file)
index 0000000..714a57f
--- /dev/null
@@ -0,0 +1,29 @@
+
+# Check integrity of raid5 in degraded mode
+# Create a 4 disk raid5, write some data to it, then
+# sha1sum it with each device failed in turn
+
+for layout in ls rs la ra
+do
+  mdadm -CR $md0 -l5 --layout $layout -n4 $dev0 $dev1 $dev2 $dev3
+  check wait
+  tar cf - /etc > $md0
+  sum=`sha1sum $md0`
+
+  for i in $dev0 $dev1 $dev2 $dev3
+  do
+    mdadm $md0 -f $i
+    mdadm $md0 -r $i
+    blockdev --flushbufs $md0
+    sum1=`sha1sum $md0`
+    if [ "$sum" != "$sum1" ]   # quoted: sha1sum prints "<hash>  <file>"
+    then
+     echo $sum does not match $sum1 with $i missing
+     exit 1
+    fi
+    mdadm $md0 -a $i
+   check wait
+  done
+  mdadm -S $md0
+done
+
diff --git a/tests/01raid6integ b/tests/01raid6integ
new file mode 100644 (file)
index 0000000..ed7cec5
--- /dev/null
@@ -0,0 +1,53 @@
+
+# Check integrity of raid6 in degraded modes
+# Create a 5 disk raid6, dump some data to it, then
+# sha1sum it with different pairs of devices failed
+
+layouts='ls rs la ra'
+lv=`uname -r`
+if expr $lv '>=' 2.6.30 > /dev/null
+then
+  layouts="$layouts parity-first dd-zero-restart ddf-N-restart ddf-N-continue \
+       left-asymmetric-6 right-asymmetric-6 left-symmetric-6 right-symmetric-6 parity-first-6"
+fi
+echo $layouts
+for layout in $layouts
+do
+  mdadm -CR $md0 -l6 --layout $layout -n5 $dev0 $dev1 $dev2 $dev3 $dev4
+  check wait
+  tar cf - /etc > $md0
+  sum=`sha1sum $md0`
+
+  totest=
+  for second in $dev0 $dev1 $dev2 $dev3 $dev4
+  do
+    mdadm $md0 -f $second
+    mdadm $md0 -r $second
+    blockdev --flushbufs $md0
+    sum1=`sha1sum $md0`
+    if [ "$sum" != "$sum1" ]   # quoted: sha1sum prints "<hash>  <file>"
+    then
+      echo $sum does not match $sum1 with $second missing
+      exit 1
+    fi
+    for first in $totest
+    do
+       mdadm $md0 -f $first
+       mdadm $md0 -r $first
+       blockdev --flushbufs $md0
+       sum1=`sha1sum $md0`
+       if [ "$sum" != "$sum1" ]
+       then
+         echo $sum does not match $sum1 with $first and $second missing
+         exit 1
+       fi
+       mdadm $md0 -a $first
+       check wait
+    done
+    mdadm $md0 -a $second
+    check wait
+    totest="$totest $second"
+  done
+  mdadm -S $md0
+done
+
index 4f03d7bdc2ee0d2516c4684567d8bff5d80e88f6..55205a36f91702e62387b5e5ffea655ba655d5c7 100644 (file)
@@ -129,3 +129,10 @@ echo "  metadata=1 devices=$dev0,$dev1,$dev2" >> $conf
 mdadm --assemble --scan --config=$conf $md2 
 $tst
 mdadm -S $md2
+
+# Now use incremental assembly.
+mdadm -I --config=$conf $dev0
+mdadm -I --config=$conf $dev1
+mdadm -I --config=$conf $dev2
+$tst
+mdadm -S $md2
index 7553a4f0b9b56fab6854f57c8fbbe1ffcf6eab22..0f2c83b6ed3a159964844747f00a005ff5a1baae 100644 (file)
@@ -113,3 +113,10 @@ echo "  metadata=1.0 devices=$dev0,$dev1,$dev2" >> $conf
 mdadm --assemble --scan --config=$conf $md1
 check state U_U
 eval $tst
+
+# And now assemble with -I
+mdadm -Ss
+mdadm -I -c $conf $dev0
+mdadm -I -c $conf $dev1
+mdadm -I -c $conf $dev2
+eval $tst
index 9c749f27175b6c9f42fffb49a2fdcb5ab34a7802..26d2300ac592ffa7d9329d272bc976281594ee60 100644 (file)
@@ -3,8 +3,8 @@ set -x
 # create an array with a name
 
 mdadm -CR $md0 -l0 -n2 --metadata=1 --name="Fred" $dev0 $dev1
-mdadm -E $dev0 | grep 'Name : Fred$' > /dev/null || exit 1
-mdadm -D $md0 | grep 'Name : Fred$' > /dev/null || exit 1
+mdadm -E $dev0 | grep 'Name : [^:]*:Fred ' > /dev/null || exit 1
+mdadm -D $md0 | grep 'Name : [^:]*:Fred ' > /dev/null || exit 1
 mdadm -S $md0
 
 mdadm -A $md0 --name="Fred" $devlist
diff --git a/tests/08imsm-overlap b/tests/08imsm-overlap
new file mode 100644 (file)
index 0000000..1a071ef
--- /dev/null
@@ -0,0 +1,25 @@
+# create raid arrays with varying degrees of overlap
+mdadm -CR $container -e imsm -n 6 $dev0 $dev1 $dev2 $dev3 $dev4 $dev5
+imsm_check container 6
+
+size=1910
+level=1
+num_disks=2
+mdadm -CR $member0 $dev0 $dev1 -n $num_disks -l $level -z $size
+mdadm -CR $member1 $dev1 $dev2 -n $num_disks -l $level -z $size
+mdadm -CR $member2 $dev2 $dev3 -n $num_disks -l $level -z $size
+mdadm -CR $member3 $dev3 $dev4 -n $num_disks -l $level -z $size
+mdadm -CR $member4 $dev4 $dev5 -n $num_disks -l $level -z $size
+
+offset=0
+imsm_check member $member0 $num_disks $level $size $offset
+offset=$((offset+size+2048))
+imsm_check member $member1 $num_disks $level $size $offset
+offset=$((offset+size+2048))
+imsm_check member $member2 $num_disks $level $size $offset
+offset=$((offset+size+2048))
+imsm_check member $member3 $num_disks $level $size $offset
+# at this point there should be more freespace at the start of the disk
+# than the end
+offset=0
+imsm_check member $member4 $num_disks $level $size $offset
diff --git a/tests/09imsm-create-fail-rebuild b/tests/09imsm-create-fail-rebuild
new file mode 100644 (file)
index 0000000..8069576
--- /dev/null
@@ -0,0 +1,56 @@
+# sanity check array creation
+
+num_disks=2
+mdadm -CR $container -e imsm -n $num_disks $dev0 $dev1
+imsm_check container $num_disks
+
+# RAID0 + RAID1
+size=10000
+level=0
+chunk=64
+offset=0
+mdadm -CR $member0 $dev0 $dev1 -n $num_disks -l $level -z $size -c $chunk
+imsm_check member $member0 $num_disks $level $size $offset $chunk
+testdev $member0 $num_disks $size $chunk
+
+offset=$(((size & ~(chunk - 1)) + 2048))
+size=5000
+level=1
+chunk=0
+mdadm -CR $member1 $dev0 $dev1 -n $num_disks -l $level -z $size
+imsm_check member $member1 $num_disks $level $size $offset $chunk
+testdev $member1 1 $size 1
+check wait
+
+mdadm -Ss
+
+# RAID10 + RAID5
+num_disks=4
+mdadm -CR $container -e imsm -n $num_disks $dev0 $dev1 $dev2 $dev3
+imsm_check container $num_disks
+
+size=10000
+level=10
+chunk=64
+offset=0
+mdadm -CR $member0 $dev0 $dev1 $dev2 $dev3 -n $num_disks -l $level -z $size -c $chunk
+imsm_check member $member0 $num_disks $level $size $offset $chunk
+testdev $member0 $((num_disks-2)) $size $chunk
+
+offset=$(((size & ~(chunk - 1)) + 2048))
+size=5000
+level=5
+mdadm -CR $member1 $dev0 $dev1 $dev2 $dev3 -n $num_disks -l $level -z $size -c $chunk
+imsm_check member $member1 $num_disks $level $size $offset $chunk
+testdev $member1 $((num_disks-1)) $size $chunk
+check wait
+
+# FAIL / REBUILD
+imsm_check_hold $container $dev0
+mdadm --fail $member0 $dev0
+mdadm --wait-clean --scan
+imsm_check_removal $container $dev0
+mdadm --add $container $dev4
+check wait
+imsm_check_hold $container $dev4
+
diff --git a/tests/10ddf-create b/tests/10ddf-create
new file mode 100644 (file)
index 0000000..db22b64
--- /dev/null
@@ -0,0 +1,76 @@
+#
+# Test basic DDF functionality.
+#
+# Create a container with 5 drives
+# create a small raid0 across them all, then a 2disk raid1
+# and a 3disk raid5 using the remaining space
+#
+# add some data, tear down the array, reassemble
+# and make sure it is still there.
+
+mdadm -CR /dev/md/ddf0 -e ddf -n 5 $dev8 $dev9 $dev10 $dev11 $dev12
+mdadm -CR r0 -l0 -n5 /dev/md/ddf0 -z 5000
+mdadm -CR r1 -l1 -n2 /dev/md/ddf0
+mdadm -CR r5 -l5 -n3 /dev/md/ddf0
+testdev /dev/md/r0 5 5000 64
+# r0 will use 4992 due to chunk size, so that leaves 27776 for the rest
+testdev /dev/md/r1 1 27776 1
+testdev /dev/md/r5 2 27776 64
+dd if=/dev/sda of=/dev/md/r0 || true
+dd if=/dev/sda of=/dev/md/r1 || true
+dd if=/dev/sda of=/dev/md/r5 || true
+
+s0=`sha1sum /dev/md/r0`
+s1=`sha1sum /dev/md/r1`
+s5=`sha1sum /dev/md/r5`
+
+
+mdadm -Ss
+mdadm -A /dev/md/ddf0  $dev8 $dev9 $dev10 $dev11 $dev12
+mdadm -I /dev/md/ddf0
+
+s0a=`sha1sum /dev/md/r0`
+s1a=`sha1sum /dev/md/r1`
+s5a=`sha1sum /dev/md/r5`
+
+if [ "$s0" != "$s0a" ]; then
+   echo r0 did not match ; exit 1;
+fi
+if [ "$s1" != "$s1a" ]; then
+   echo r1 did not match ; exit 1;
+fi
+if [ "$s5" != "$s5a" ]; then
+   echo r5 did not match ; exit 1;
+fi
+
+# failure status just means it has completed already, so ignore it.
+mdadm --wait /dev/md/r1 || true
+mdadm --wait /dev/md/r5 || true
+
+mdadm -Dbs > /var/tmp/mdadm.conf
+
+mdadm -Ss
+
+# Now try to assemble using mdadm.conf
+mdadm -Asc /var/tmp/mdadm.conf
+check nosync  # This failed once. The raid5 was resyncing.
+
+mdadm -Dbs > /tmp/mdadm.conf
+diff /tmp/mdadm.conf /var/tmp/mdadm.conf
+mdadm -Ss
+
+# and now assemble fully incrementally.
+for i in  $dev8 $dev9 $dev10 $dev11 $dev12
+do 
+  #./mdadm -I $i -vv 2>&1 | wc -l > /tmp/cnt
+  ./mdadm -I $i 2> /tmp/thing
+  wc -l < /tmp/thing > /tmp/cnt
+  # should find container and 2 devices, so 3 lines.
+  [ `cat /tmp/cnt` -eq 3 ]
+done
+check nosync
+
+mdadm -Dbs > /tmp/mdadm.conf
+diff /tmp/mdadm.conf /var/tmp/mdadm.conf
+mdadm -Ss
+rm /tmp/mdadm.conf /var/tmp/mdadm.conf
diff --git a/tests/env-08imsm-overlap b/tests/env-08imsm-overlap
new file mode 100644 (file)
index 0000000..83557d3
--- /dev/null
@@ -0,0 +1,68 @@
+imsm_check() {
+   # Usage: imsm_check container <num_disks>
+   #        imsm_check member <device> <num_disks> <level> <size> <offset>
+   #
+   # "container" looks in /proc/mdstat for an external:imsm container
+   # whose block count is (418 * num_disks) / 2.
+   # "member" compares the offset (halved) and size that sysfs reports
+   # for slots rd0..rd<num_disks-1> of <device> with the expected values.
+   # Any failed check prints diagnostics and exits with status 1.
+   case $1 in
+    container )
+      grep -s "$(((418 * $2)/2)) blocks super external:imsm" /proc/mdstat > /dev/null || {
+               echo >&2 "ERROR correctly formed container not found"; cat /proc/mdstat; exit 1;}
+      ;;
+    member )
+      member=$2
+      num_disks=$3
+      level=$4
+      size=$5
+      offset=$6
+      err=0
+
+      # stat prints the device numbers in hex; eval turns them into
+      # the shell variables major and minor
+      eval `stat -L -c "let major=0x%t; let minor=0x%T;" $member`
+      sysfs=/sys/dev/block/${major}:${minor}
+      if [ ! -f ${sysfs}/md/array_state ]; then
+           echo "member array $member not found" >&2
+           cat /proc/mdstat >&2
+           exit 1
+      fi
+      for i in `seq 0 $((num_disks-1))`
+      do
+         _offset=`cat ${sysfs}/md/rd${i}/offset`
+         if [ $offset -ne $((_offset/2)) ]; then
+            echo "offset mismatch expected $offset got $_offset" >&2
+            err=$((err+1))
+         fi
+         _size=`cat ${sysfs}/md/rd${i}/size`
+         if [ $size -ne $_size ]; then
+            echo "size mismatch expected $size got $_size" >&2
+            err=$((err+1))
+         fi
+      done
+
+      if [ $err -gt 0 ]; then
+          echo "$member failed check" >&2
+          cat /proc/mdstat >&2
+          mdadm -E /dev/loop0 >&2
+          exit 1
+      fi
+      ;;
+    * ) echo >&2 ERROR unknown check $1 ; exit 1;
+   esac
+}
+
+# Export the IMSM_* environment settings and define the container and
+# member device names used by the matching test script.
+setup_env() {
+       export IMSM_DEVNAME_AS_SERIAL=1
+       export IMSM_NO_PLATFORM=1
+       container=/dev/md/container
+       member0=/dev/md/vol0
+       member1=/dev/md/vol1
+       member2=/dev/md/vol2
+       member3=/dev/md/vol3
+       member4=/dev/md/vol4
+}
+
+# Undo setup_env: unset the IMSM_* settings, the imsm_check helper and
+# the container/member device names.
+reset_env() {
+       unset IMSM_DEVNAME_AS_SERIAL
+       unset IMSM_NO_PLATFORM
+       unset imsm_check
+       unset container
+       unset member0
+       unset member1
+       unset member2
+       unset member3
+       unset member4
+}
diff --git a/tests/env-09imsm-create-fail-rebuild b/tests/env-09imsm-create-fail-rebuild
new file mode 100644 (file)
index 0000000..b44746c
--- /dev/null
@@ -0,0 +1,98 @@
+# Expect "mdadm --remove $1 $2" to be refused.  If the removal goes
+# through, dump diagnostics to stderr and abort the test with status 1.
+imsm_check_hold() {
+   if mdadm --remove $1 $2; then
+       echo "$2 removal from $1 should have been blocked" >&2
+       cat /proc/mdstat >&2
+       mdadm -E $2 >&2
+       exit 1
+   fi
+}
+
+# Expect "mdadm --remove $1 $2" to succeed.  If it fails, dump
+# diagnostics to stderr and abort the test with status 1.
+imsm_check_removal() {
+   if ! mdadm --remove $1 $2 ; then
+       echo "$2 removal from $1 should have succeeded" >&2
+       cat /proc/mdstat >&2
+       mdadm -E $2 >&2
+       exit 1
+   fi
+}
+
+# imsm_check container <n>
+#     pass only if /proc/mdstat has a "super external:imsm" line that
+#     reports (418 * n) / 2 blocks
+# imsm_check member <dev> <num_disks> <level> <size> <offset> <chunk>
+#     compare md/chunk_size and every md/rd<i>/offset and md/rd<i>/size
+#     in the array's sysfs directory against the expected values
+# Any failed check prints diagnostics and exits the test with status 1.
+imsm_check() {
+   udevadm settle
+   case $1 in
+    container )
+      grep -s "$(((418 * $2)/2)) blocks super external:imsm" /proc/mdstat > /dev/null || {
+               echo >&2 "ERROR correctly formed container not found"; cat /proc/mdstat; exit 1;}
+      ;;
+    member )
+      member=$2
+      num_disks=$3
+      level=$4
+      size=$5
+      offset=$6
+      chunk=$7
+      err=0
+
+      # for levels other than 1 the expected size is rounded down to a
+      # multiple of chunk (the mask assumes chunk is a power of two);
+      # for level 1 the expected chunk is fixed at 64 instead
+      if [ $level -ne 1 ]; then
+         size=$((size & ~(chunk - 1)))
+      else
+         chunk=64
+      fi
+      # stat prints the node's major/minor in hex; eval turns them into
+      # the shell variables used to locate the sysfs directory
+      eval `stat -L -c "let major=0x%t; let minor=0x%T;" $member`
+      sysfs=/sys/dev/block/${major}:${minor}
+      if [ ! -f ${sysfs}/md/array_state ]; then
+           echo "member array $member not found" >&2
+           cat /proc/mdstat >&2
+           exit 1
+      fi
+      # the sysfs value is divided by 1024 before comparing - presumably
+      # bytes in sysfs versus KiB in the expectation; confirm
+      _chunk=`cat ${sysfs}/md/chunk_size`
+      if [ $chunk -ne $((_chunk/1024)) ]; then
+         echo "chunk mismatch expected $chunk got $_chunk" >&2
+         err=$((err+1))
+      fi
+      for i in `seq 0 $((num_disks-1))`
+      do
+         # the sysfs offset is halved before comparing - presumably
+         # sectors in sysfs versus KiB in the expectation; confirm
+         _offset=`cat ${sysfs}/md/rd${i}/offset`
+         if [ $offset -ne $((_offset/2)) ]; then
+           echo "offset mismatch expected $offset got $_offset" >&2
+            err=$((err+1))
+         fi
+         _size=`cat ${sysfs}/md/rd${i}/size`
+         if [ $size -ne $_size ]; then
+           echo "size mismatch expected $size got $_size" >&2
+            err=$((err+1))
+         fi
+      done
+
+      # report all mismatches found above in one go, then fail
+      if [ $err -gt 0 ]; then
+          echo "$member failed check" >&2
+          cat /proc/mdstat >&2
+         mdadm -E /dev/loop0 >&2
+          exit 1
+      fi
+      ;;
+    * ) echo >&2 ERROR unknown check $1 ; exit 1;
+   esac
+}
+
+# Export IMSM_DEVNAME_AS_SERIAL and IMSM_TEST_OROM (both set to 1) and
+# record the device node paths of the container and of members 0-4.
+setup_env() {
+       export IMSM_DEVNAME_AS_SERIAL=1 IMSM_TEST_OROM=1
+       container=/dev/md/container
+       member0=/dev/md/vol0 member1=/dev/md/vol1 member2=/dev/md/vol2
+       member3=/dev/md/vol3 member4=/dev/md/vol4
+}
+
+# Undo setup_env: drop the exported IMSM_* switches, the imsm_check
+# helper and the container/member path variables.
+reset_env() {
+       unset IMSM_DEVNAME_AS_SERIAL IMSM_TEST_OROM
+       unset imsm_check
+       unset container member0 member1 member2 member3 member4
+}
diff --git a/udev-md-raid.rules b/udev-md-raid.rules
new file mode 100644 (file)
index 0000000..300a754
--- /dev/null
@@ -0,0 +1,38 @@
+# do not edit this file, it will be overwritten on update
+
+# only block devices are handled, and only on add or change events;
+# change events jump past the (currently disabled) incremental rule
+SUBSYSTEM!="block", GOTO="md_end"
+ACTION!="add|change", GOTO="md_end"
+ACTION=="change", GOTO="md_no_incr"
+
+# import data from a raid member and activate it
+#ENV{ID_FS_TYPE}=="linux_raid_member", IMPORT{program}="/sbin/mdadm --examine --export $tempnode", RUN+="/sbin/mdadm --incremental $env{DEVNAME}"
+# import data from a raid set
+LABEL="md_no_incr"
+# everything from here on applies to md* kernel devices only
+KERNEL!="md*", GOTO="md_end"
+
+# partitions have no md/{array_state,metadata_version}, but should not
+# for that reason be ignored.
+ENV{DEVTYPE}=="partition", GOTO="md_ignore_state"
+
+# container devices have a metadata version of e.g. 'external:ddf' and
+# never leave state 'inactive'
+ATTR{md/metadata_version}=="external:[A-Za-z]*", ATTR{md/array_state}=="inactive", GOTO="md_ignore_state"
+# anything else without an array_state attribute, or whose state is
+# empty, 'clear' or 'inactive', is skipped
+TEST!="md/array_state", GOTO="md_end"
+ATTR{md/array_state}=="|clear|inactive", GOTO="md_end"
+LABEL="md_ignore_state"
+
+# import the array's properties from mdadm; MD_NAME, MD_UUID and
+# MD_DEVNAME drive the symlinks created below
+IMPORT{program}="/sbin/mdadm --detail --export $tempnode"
+ENV{DEVTYPE}=="disk", ENV{MD_NAME}=="?*", SYMLINK+="disk/by-id/md-name-$env{MD_NAME}", OPTIONS+="string_escape=replace"
+ENV{DEVTYPE}=="disk", ENV{MD_UUID}=="?*", SYMLINK+="disk/by-id/md-uuid-$env{MD_UUID}"
+ENV{DEVTYPE}=="disk", ENV{MD_DEVNAME}=="?*", SYMLINK+="md/$env{MD_DEVNAME}"
+ENV{DEVTYPE}=="partition", ENV{MD_NAME}=="?*", SYMLINK+="disk/by-id/md-name-$env{MD_NAME}-part%n", OPTIONS+="string_escape=replace"
+ENV{DEVTYPE}=="partition", ENV{MD_UUID}=="?*", SYMLINK+="disk/by-id/md-uuid-$env{MD_UUID}-part%n"
+# partition links under md/: the partition number is appended directly
+# when MD_DEVNAME ends in a non-digit, otherwise a 'p' is inserted first
+ENV{DEVTYPE}=="partition", ENV{MD_DEVNAME}=="*[^0-9]", SYMLINK+="md/$env{MD_DEVNAME}%n"
+ENV{DEVTYPE}=="partition", ENV{MD_DEVNAME}=="*[0-9]", SYMLINK+="md/$env{MD_DEVNAME}p%n"
+
+# import properties from vol_id and link the device by the filesystem
+# UUID and label it reports
+IMPORT{program}="vol_id --export $tempnode"
+OPTIONS+="link_priority=100"
+ENV{ID_FS_USAGE}=="filesystem|other|crypto", ENV{ID_FS_UUID_ENC}=="?*", SYMLINK+="disk/by-uuid/$env{ID_FS_UUID_ENC}"
+ENV{ID_FS_USAGE}=="filesystem|other", ENV{ID_FS_LABEL_ENC}=="?*", SYMLINK+="disk/by-label/$env{ID_FS_LABEL_ENC}"
+
+LABEL="md_end"
diff --git a/util.c b/util.c
index 1b2ae6b25482ecb78b9b8c3c1e1325fe17157da6..00bf80378de615e8ece5d40ab4d56990062ea4c8 100644 (file)
--- a/util.c
+++ b/util.c
@@ -1,7 +1,7 @@
 /*
  * mdadm - manage Linux "md" devices aka RAID arrays.
  *
- * Copyright (C) 2001-2006 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
  *
  *
  *    This program is free software; you can redistribute it and/or modify
  *    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  *
  *    Author: Neil Brown
- *    Email: <neilb@cse.unsw.edu.au>
- *    Paper: Neil Brown
- *           School of Computer Science and Engineering
- *           The University of New South Wales
- *           Sydney, 2052
- *           Australia
+ *    Email: <neilb@suse.de>
  */
 
 #include       "mdadm.h"
 #include       "md_p.h"
+#include       <sys/socket.h>
 #include       <sys/utsname.h>
+#include       <sys/wait.h>
+#include       <sys/un.h>
 #include       <ctype.h>
+#include       <dirent.h>
+#include       <signal.h>
 
 /*
  * following taken from linux/blkpg.h because they aren't
@@ -217,8 +217,13 @@ int enough(int level, int raid_disks, int layout, int clean,
        }
 }
 
+const int uuid_match_any[4] = { ~0, ~0, ~0, ~0 };
 int same_uuid(int a[4], int b[4], int swapuuid)
 {
+       if (memcmp(a, uuid_match_any, sizeof(int[4])) == 0 ||
+           memcmp(b, uuid_match_any, sizeof(int[4])) == 0)
+               return 1;
+
        if (swapuuid) {
                /* parse uuids are hostendian.
                 * uuid's from some superblocks are big-ending
@@ -264,6 +269,27 @@ void copy_uuid(void *a, int b[4], int swapuuid)
                memcpy(a, b, 16);
 }
 
+/* Format info->uuid into 'buf' as
+ *   "UUID-xxxxxxxx<sep>xxxxxxxx<sep>xxxxxxxx<sep>xxxxxxxx"
+ * (four groups of 8 lower-case hex digits) and return 'buf'.
+ * The uuid is first copied with copy_uuid() using st->ss->swapuuid.
+ * 'buf' must have room for at least 41 bytes:
+ * 5 ("UUID-") + 32 hex digits + 3 separators + the terminating NUL.
+ */
+char *fname_from_uuid(struct supertype *st, struct mdinfo *info, char *buf, char sep)
+{
+       int i, j;
+       char uuid[16];
+       char *c = buf;
+
+       strcpy(c, "UUID-");
+       c += strlen(c);
+       copy_uuid(uuid, info->uuid, st->ss->swapuuid);
+       for (i = 0; i < 4; i++) {
+               if (i)
+                       *c++ = sep;
+               /* within each 4-byte group the highest-indexed byte
+                * is printed first
+                */
+               for (j = 3; j >= 0; j--) {
+                       sprintf(c, "%02x", (unsigned char) uuid[j+4*i]);
+                       c += 2;
+               }
+       }
+       return buf;
+}
+
 #ifndef MDASSEMBLE
 int check_ext2(int fd, char *name)
 {
@@ -389,6 +415,9 @@ int is_standard(char *dev, int *nump)
        /* tests if dev is a "standard" md dev name.
         * i.e if the last component is "/dNN" or "/mdNN",
         * where NN is a string of digits
+        * Returns 1 if a partitionable standard,
+        *   -1 if non-partitionable,
+        *   0 if not a standard name.
         */
        char *d = strrchr(dev, '/');
        int type=0;
@@ -477,14 +506,13 @@ int nftw(const char *path, int (*han)(const char *name, const struct stat *stb,
 /*
  * Find a block device with the right major/minor number.
  * If we find multiple names, choose the shortest.
- * If we find a non-standard name, it is probably there
- * deliberately so prefer it over a standard name.
+ * If we find a name in /dev/md/, we prefer that.
  * This applies only to names for MD devices.
  */
 char *map_dev(int major, int minor, int create)
 {
        struct devmap *p;
-       char *std = NULL, *nonstd=NULL;
+       char *regular = NULL, *preferred=NULL;
        int did_check = 0;
 
        if (major == 0 && minor == 0)
@@ -511,27 +539,27 @@ char *map_dev(int major, int minor, int create)
        for (p=devlist; p; p=p->next)
                if (p->major == major &&
                    p->minor == minor) {
-                       if (is_standard(p->name, NULL)) {
-                               if (std == NULL ||
-                                   strlen(p->name) < strlen(std))
-                                       std = p->name;
+                       if (strncmp(p->name, "/dev/md/",8) == 0) {
+                               if (preferred == NULL ||
+                                   strlen(p->name) < strlen(preferred))
+                                       preferred = p->name;
                        } else {
-                               if (nonstd == NULL ||
-                                   strlen(p->name) < strlen(nonstd))
-                                       nonstd = p->name;
+                               if (regular == NULL ||
+                                   strlen(p->name) < strlen(regular))
+                                       regular = p->name;
                        }
                }
-       if (!std && !nonstd && !did_check) {
+       if (!regular && !preferred && !did_check) {
                devlist_ready = 0;
                goto retry;
        }
-       if (create && !std && !nonstd) {
+       if (create && !regular && !preferred) {
                static char buf[30];
                snprintf(buf, sizeof(buf), "%d:%d", major, minor);
-               nonstd = buf;
+               regular = buf;
        }
 
-       return nonstd ? nonstd : std;
+       return preferred ? preferred : regular;
 }
 
 unsigned long calc_csum(void *super, int bytes)
@@ -627,6 +655,23 @@ void print_r10_layout(int layout)
 }
 #endif
 
+/* Compute the usable size of an array from the size of one member.
+ * 'devsize' is rounded down to a multiple of (chunksize >> 9) and then
+ * multiplied by the number of data-bearing devices for 'level':
+ *   0: raid_disks, 1: 1, 4/5: raid_disks-1, 6: raid_disks-2,
+ *   10: raid_disks divided by both copy counts held in 'layout'
+ *       (bits 0-7 and bits 8-15).
+ * Any other level, or a RAID10 layout with a zero copy count, yields 0.
+ * The >>9 suggests chunksize is in bytes and devsize in 512-byte
+ * sectors - TODO confirm against callers.
+ */
+unsigned long long calc_array_size(int level, int raid_disks, int layout,
+                                  int chunksize, unsigned long long devsize)
+{
+       int data_disks = 0;
+       int chunk_sectors = chunksize >> 9;
+       int copies_lo, copies_hi;
+
+       switch (level) {
+       case 0: data_disks = raid_disks; break;
+       case 1: data_disks = 1; break;
+       case 4:
+       case 5: data_disks = raid_disks - 1; break;
+       case 6: data_disks = raid_disks - 2; break;
+       case 10:
+               copies_lo = layout & 255;
+               copies_hi = (layout>>8) & 255;
+               /* a zero copy count is not a valid layout; do not
+                * divide by it
+                */
+               if (copies_lo && copies_hi)
+                       data_disks = raid_disks / copies_lo / copies_hi;
+               break;
+       default:
+               break;
+       }
+       /* With a chunk smaller than one sector (e.g. chunksize 0) the
+        * mask would be zero and wipe out devsize, so only round when
+        * there is a real chunk to round to.
+        */
+       if (chunk_sectors > 0)
+               devsize &= ~(unsigned long long)(chunk_sectors - 1);
+       return data_disks * devsize;
+}
+
 int get_mdp_major(void)
 {
 static int mdp_major = -1;
@@ -655,9 +700,7 @@ static int mdp_major = -1;
        return mdp_major;
 }
 
-
 #if !defined(MDASSEMBLE) || defined(MDASSEMBLE) && defined(MDASSEMBLE_AUTO)
-
 char *get_md_name(int dev)
 {
        /* find /dev/md%d or /dev/md/%d or make a device /dev/.tmp.md%d */
@@ -712,26 +755,11 @@ void put_md_name(char *name)
                unlink(name);
 }
 
-static int dev2major(int d)
-{
-       if (d >= 0)
-               return MD_MAJOR;
-       else
-               return get_mdp_major();
-}
-
-static int dev2minor(int d)
-{
-       if (d >= 0)
-               return d;
-       return (-1-d) << MdpMinorShift;
-}
-
 int find_free_devnum(int use_partitions)
 {
        int devnum;
        for (devnum = 127; devnum != 128;
-            devnum = devnum ? devnum-1 : (1<<22)-1) {
+            devnum = devnum ? devnum-1 : (1<<20)-1) {
                char *dn;
                int _devnum;
 
@@ -768,19 +796,79 @@ int dev_open(char *dev, int flags)
        if (e > dev && *e == ':' && e[1] &&
            (minor = strtoul(e+1, &e, 0)) >= 0 &&
            *e == 0) {
-               snprintf(devname, sizeof(devname), "/dev/.tmp.md.%d:%d", major, minor);
+               snprintf(devname, sizeof(devname), "/dev/.tmp.md.%d:%d:%d",
+                        (int)getpid(), major, minor);
                if (mknod(devname, S_IFBLK|0600, makedev(major, minor))==0) {
-                       fd = open(devname, flags);
+                       fd = open(devname, flags|O_DIRECT);
                        unlink(devname);
                }
        } else
-               fd = open(dev, flags);
+               fd = open(dev, flags|O_DIRECT);
        return fd;
 }
 
-struct superswitch *superlist[] = { &super0, &super1, NULL };
+/* Open the md device numbered 'devnum' read-write.  The device is
+ * addressed as "major:minor" (from dev2major/dev2minor) and the result
+ * of dev_open() is returned unchanged.
+ */
+int open_dev(int devnum)
+{
+       char devspec[20];
+       int mj = dev2major(devnum);
+       int mn = dev2minor(devnum);
+
+       sprintf(devspec, "%d:%d", mj, mn);
+       return dev_open(devspec, O_RDWR);
+}
+
+/* Like open_dev() but with O_EXCL.  While the open fails with EBUSY it
+ * is retried every 200ms, up to 25 attempts in total.  Returns the fd,
+ * the failing dev_open() result for any other error, or -1 once the
+ * attempts are used up.
+ */
+int open_dev_excl(int devnum)
+{
+       char devspec[20];
+       int attempt = 0;
+
+       sprintf(devspec, "%d:%d", dev2major(devnum), dev2minor(devnum));
+       while (attempt < 25) {
+               int fd = dev_open(devspec, O_RDWR|O_EXCL);
+
+               /* success, or a failure other than "busy": hand it back */
+               if (fd >= 0 || errno != EBUSY)
+                       return fd;
+               usleep(200000);
+               attempt++;
+       }
+       return -1;
+}
+
+/* Return non-zero when paths 'one' and 'two' can both be stat()ed, are
+ * both block devices and carry the same device number; 0 otherwise.
+ */
+int same_dev(char *one, char *two)
+{
+       struct stat sb_one, sb_two;
+
+       if (stat(one, &sb_one) != 0 || stat(two, &sb_two) != 0)
+               return 0;
+       if (!S_ISBLK(sb_one.st_mode) || !S_ISBLK(sb_two.st_mode))
+               return 0;
+       return sb_one.st_rdev == sb_two.st_rdev;
+}
+
+/* Wait for the path 'dev' to become a block device node with the same
+ * device number as the open descriptor 'fd'.  Polls every 200ms and
+ * gives up silently after 25 rounds.  Returns at once if 'fd' cannot
+ * be fstat()ed or is not a block device.
+ */
+void wait_for(char *dev, int fd)
+{
+       struct stat target;
+       int rounds_left = 25;
+
+       if (fstat(fd, &target) != 0)
+               return;
+       if (!S_ISBLK(target.st_mode))
+               return;
+
+       while (rounds_left-- > 0) {
+               struct stat found;
+
+               if (stat(dev, &found) == 0 && S_ISBLK(found.st_mode) &&
+                   found.st_rdev == target.st_rdev)
+                       return;
+               usleep(200000);
+       }
+}
+
+struct superswitch *superlist[] = { &super0, &super1, &super_ddf, &super_imsm, NULL };
 
 #if !defined(MDASSEMBLE) || defined(MDASSEMBLE) && defined(MDASSEMBLE_AUTO)
+
 struct supertype *super_by_fd(int fd)
 {
        mdu_array_info_t array;
@@ -791,6 +879,7 @@ struct supertype *super_by_fd(int fd)
        char *verstr;
        char version[20];
        int i;
+       char *subarray = NULL;
 
        sra = sysfs_read(fd, 0, GET_VERSION);
 
@@ -810,40 +899,59 @@ struct supertype *super_by_fd(int fd)
                sprintf(version, "%d.%d", vers, minor);
                verstr = version;
        }
+       if (minor == -2 && is_subarray(verstr)) {
+               char *dev = verstr+1;
+               subarray = strchr(dev, '/');
+               int devnum;
+               if (subarray)
+                       *subarray++ = '\0';
+               devnum = devname2devnum(dev);
+               subarray = strdup(subarray);
+               if (sra)
+                       sysfs_free(sra);
+               sra = sysfs_read(-1, devnum, GET_VERSION);
+               if (sra && sra->text_version[0])
+                       verstr = sra->text_version;
+               else
+                       verstr = "-no-metadata-";
+       }
+
        for (i = 0; st == NULL && superlist[i] ; i++)
                st = superlist[i]->match_metadata_desc(verstr);
 
        if (sra)
                sysfs_free(sra);
-       if (st)
+       if (st) {
                st->sb = NULL;
+               if (subarray) {
+                       strncpy(st->subarray, subarray, 32);
+                       st->subarray[31] = 0;
+                       free(subarray);
+               } else
+                       st->subarray[0] = 0;
+       }
        return st;
 }
 #endif /* !defined(MDASSEMBLE) || defined(MDASSEMBLE) && defined(MDASSEMBLE_AUTO) */
 
 
-struct supertype *dup_super(struct supertype *st)
+struct supertype *dup_super(struct supertype *orig)
 {
-       struct supertype *stnew = NULL;
-       char *verstr = NULL;
-       char version[20];
-       int i;
+       struct supertype *st;
 
+       if (!orig)
+               return orig;
+       st = malloc(sizeof(*st));
        if (!st)
                return st;
-
-       if (st->minor_version == -1)
-               sprintf(version, "%d", st->ss->major);
-       else
-               sprintf(version, "%d.%d", st->ss->major, st->minor_version);
-       verstr = version;
-
-       for (i = 0; stnew == NULL && superlist[i] ; i++)
-               stnew = superlist[i]->match_metadata_desc(verstr);
-
-       if (stnew)
-               stnew->sb = NULL;
-       return stnew;
+       memset(st, 0, sizeof(*st));
+       st->ss = orig->ss;
+       st->max_devs = orig->max_devs;
+       st->minor_version = orig->minor_version;
+       strcpy(st->subarray, orig->subarray);
+       st->sb = NULL;
+       st->info = NULL;
+       return st;
 }
 
 struct supertype *guess_super(int fd)
@@ -858,11 +966,10 @@ struct supertype *guess_super(int fd)
        int i;
 
        st = malloc(sizeof(*st));
-       memset(st, 0, sizeof(*st));
        for (i=0 ; superlist[i]; i++) {
                int rv;
                ss = superlist[i];
-               st->ss = NULL;
+               memset(st, 0, sizeof(*st));
                rv = ss->load_super(st, fd, NULL);
                if (rv == 0) {
                        struct mdinfo info;
@@ -877,7 +984,7 @@ struct supertype *guess_super(int fd)
        }
        if (bestsuper != -1) {
                int rv;
-               st->ss = NULL;
+               memset(st, 0, sizeof(*st));
                rv = superlist[bestsuper]->load_super(st, fd, NULL);
                if (rv == 0) {
                        superlist[bestsuper]->free_super(st);
@@ -925,6 +1032,315 @@ void get_one_disk(int mdfd, mdu_array_info_t *ainf, mdu_disk_info_t *disk)
                        return;
 }
 
+int open_container(int fd)
+{
+       /* 'fd' is a block device.  Walk its sysfs "holders" directory
+        * and return a read-only fd on the first holder that can be
+        * opened (presumably the container using the device).
+        * Returns -1 if 'fd' cannot be examined, has no holders
+        * directory, or no holder can be opened.
+        */
+       char path[256];
+       char *e;
+       size_t room;
+       DIR *dir;
+       struct dirent *de;
+       int dfd, n;
+       char buf[200];
+       int holder_major, holder_minor;
+       struct stat st;
+
+       if (fstat(fd, &st) != 0)
+               return -1;
+       snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/holders",
+                (int)major(st.st_rdev), (int)minor(st.st_rdev));
+       e = path + strlen(path);
+       room = sizeof(path) - (e - path);
+
+       dir = opendir(path);
+       if (!dir)
+               return -1;
+       while ((de = readdir(dir))) {
+               if (de->d_ino == 0)
+                       continue;
+               if (de->d_name[0] == '.')
+                       continue;
+               /* d_name may be longer than what is left of path[];
+                * skip such an entry rather than overrun the buffer
+                */
+               n = snprintf(e, room, "/%s/dev", de->d_name);
+               if (n < 0 || (size_t)n >= room)
+                       continue;
+               dfd = open(path, O_RDONLY);
+               if (dfd < 0)
+                       continue;
+               n = read(dfd, buf, sizeof(buf));
+               close(dfd);
+               /* a full buffer leaves no room for the NUL below */
+               if (n <= 0 || (size_t)n >= sizeof(buf))
+                       continue;
+               buf[n] = 0;
+               if (sscanf(buf, "%d:%d", &holder_major, &holder_minor) != 2)
+                       continue;
+               sprintf(buf, "%d:%d", holder_major, holder_minor);
+               dfd = dev_open(buf, O_RDONLY);
+               if (dfd >= 0) {
+                       closedir(dir);
+                       return dfd;
+               }
+       }
+       closedir(dir);
+       return -1;
+}
+
+/* Add the device described by 'info' to an array and return the result
+ * of the underlying operation (sysfs_add_disk() or the ADD_NEW_DISK
+ * ioctl on 'mdfd').
+ */
+int add_disk(int mdfd, struct supertype *st,
+            struct mdinfo *sra, struct mdinfo *info)
+{
+       /* Add a device to an array, in one of 2 ways. */
+       int rv;
+#ifndef MDASSEMBLE
+       /* externally managed metadata: add through sysfs */
+       if (st->ss->external) {
+               rv = sysfs_add_disk(sra, info,
+                                   info->disk.state & (1<<MD_DISK_SYNC));
+               if (! rv) {
+                       /* on success make sure sra->devs knows the
+                        * device: unless 'info' itself is already on
+                        * the list, push a copy onto the front
+                        */
+                       struct mdinfo *sd2;
+                       for (sd2 = sra->devs; sd2; sd2=sd2->next)
+                               if (sd2 == info)
+                                       break;
+                       if (sd2 == NULL) {
+                               /* NOTE(review): malloc result is not checked */
+                               sd2 = malloc(sizeof(*sd2));
+                               *sd2 = *info;
+                               sd2->next = sra->devs;
+                               sra->devs = sd2;
+                       }
+               }
+       } else
+#endif
+               /* otherwise (and always when built with MDASSEMBLE)
+                * use the ioctl
+                */
+               rv = ioctl(mdfd, ADD_NEW_DISK, &info->disk);
+       return rv;
+}
+
+/* Tell the kernel about a new array on 'mdfd' and return the result of
+ * sysfs_set_array() or of the SET_ARRAY_INFO ioctl, whichever is used.
+ */
+int set_array_info(int mdfd, struct supertype *st, struct mdinfo *info)
+{
+       /* Initialise kernel's knowledge of array.
+        * This varies between externally managed arrays
+        * and older kernels
+        */
+       int vers = md_get_version(mdfd);
+       int rv;
+
+#ifndef MDASSEMBLE
+       /* externally managed metadata is configured through sysfs */
+       if (st->ss->external)
+               rv = sysfs_set_array(info, vers);
+       else
+#endif
+               if ((vers % 100) >= 1) { /* can use different versions */
+               /* pass only the metadata major/minor version; every
+                * other field of the request is left zeroed
+                */
+               mdu_array_info_t inf;
+               memset(&inf, 0, sizeof(inf));
+               inf.major_version = info->array.major_version;
+               inf.minor_version = info->array.minor_version;
+               rv = ioctl(mdfd, SET_ARRAY_INFO, &inf);
+       } else
+               /* no version choice available: issue the ioctl with
+                * no argument
+                */
+               rv = ioctl(mdfd, SET_ARRAY_INFO, NULL);
+       return rv;
+}
+
+/* Convert an md device number to its kernel name: non-negative numbers
+ * give "md<N>", negative numbers give "md_d<-1-N>".
+ * Returns a strdup()ed string which the caller should free, or NULL if
+ * strdup() fails.
+ */
+char *devnum2devname(int num)
+{
+       char name[100];
+       /* 0 is a valid non-partitioned device (md0), so it must take
+        * the "md%d" branch
+        */
+       if (num >= 0)
+               sprintf(name, "md%d", num);
+       else
+               sprintf(name, "md_d%d", -1-num);
+       return strdup(name);
+}
+
+/* Convert a kernel md name to a device number: "md_d<N>" gives -1-N,
+ * anything else is parsed as decimal digits starting at the third
+ * character (the first two characters are skipped unchecked).
+ */
+int devname2devnum(char *name)
+{
+       if (strncmp(name, "md_d", 4) == 0)
+               return -1 - strtoul(name + 4, NULL, 10);
+       return strtoul(name + 2, NULL, 10);
+}
+
+/* Map the stat result of a block device to an md device number.
+ * Returns the minor number for MD_MAJOR devices, -1-(minor>>MdpMinorShift)
+ * for devices on the mdp major, the number derived from the parent
+ * directory name in sysfs for other devices whose sysfs link sits
+ * inside an "md..." directory, and NoMdDev for everything else
+ * (including anything that is not a block device).
+ */
+int stat2devnum(struct stat *st)
+{
+       char path[64];
+       char link[200];
+       char *cp;
+       int n;
+
+       if ((S_IFMT & st->st_mode) == S_IFBLK) {
+               if (major(st->st_rdev) == MD_MAJOR)
+                       return minor(st->st_rdev);
+               else if (major(st->st_rdev) == get_mdp_major())
+                       return -1- (minor(st->st_rdev)>>MdpMinorShift);
+
+               /* must be an extended-minor partition. Look at the
+                * /sys/dev/block/%d:%d link which must look like
+                * ../../block/mdXXX/mdXXXpYY
+                */
+               snprintf(path, sizeof(path), "/sys/dev/block/%d:%d",
+                        (int)major(st->st_rdev), (int)minor(st->st_rdev));
+               n = readlink(path, link, sizeof(link)-1);
+               if (n <= 0)
+                       return NoMdDev;
+               link[n] = 0;
+               /* cut off the trailing partition component ... */
+               cp = strrchr(link, '/');
+               if (cp) *cp = 0;
+               /* ... so that the last component left is the array
+                * name.  This has to search from the end: the link
+                * starts with "../", so the first '/' never precedes
+                * "md".
+                */
+               cp = strrchr(link, '/');
+               if (cp && strncmp(cp, "/md", 3) == 0)
+                       return devname2devnum(cp+1);
+       }
+       return NoMdDev;
+}
+
+/* Return the md device number of the open descriptor 'fd' as computed
+ * by stat2devnum(), or NoMdDev if 'fd' cannot be fstat()ed.
+ */
+int fd2devnum(int fd)
+{
+       struct stat stb;
+
+       if (fstat(fd, &stb) != 0)
+               return NoMdDev;
+       return stat2devnum(&stb);
+}
+
+/* Return 1 if /var/run/mdadm/<devname>.pid names a process that
+ * kill(pid, 0) can reach, 0 otherwise (no pid file, unreadable or
+ * empty file, a pid that is not positive, or no such process).
+ */
+int mdmon_running(int devnum)
+{
+       char path[100];
+       char pid[10];
+       char *name;
+       int target;
+       int fd;
+       int n;
+
+       name = devnum2devname(devnum);
+       if (!name)
+               return 0;
+       snprintf(path, sizeof(path), "/var/run/mdadm/%s.pid", name);
+       free(name);
+       fd = open(path, O_RDONLY, 0);
+
+       if (fd < 0)
+               return 0;
+       n = read(fd, pid, sizeof(pid)-1);
+       close(fd);
+       if (n <= 0)
+               return 0;
+       /* read() does not terminate the buffer; atoi() needs a string */
+       pid[n] = 0;
+       target = atoi(pid);
+       /* 0 or a negative pid would address a process group, not mdmon */
+       if (target <= 0)
+               return 0;
+       if (kill(target, 0) == 0)
+               return 1;
+       return 0;
+}
+
+/* Send SIGUSR1 to the process named in /var/run/mdadm/<devname>.pid.
+ * Returns 1 if the signal was delivered, 0 otherwise (no pid file,
+ * unreadable or empty file, a pid that is not positive, or kill()
+ * failed).
+ */
+int signal_mdmon(int devnum)
+{
+       char path[100];
+       char pid[10];
+       char *name;
+       int target;
+       int fd;
+       int n;
+
+       name = devnum2devname(devnum);
+       if (!name)
+               return 0;
+       snprintf(path, sizeof(path), "/var/run/mdadm/%s.pid", name);
+       free(name);
+       fd = open(path, O_RDONLY, 0);
+
+       if (fd < 0)
+               return 0;
+       n = read(fd, pid, sizeof(pid)-1);
+       close(fd);
+       if (n <= 0)
+               return 0;
+       /* read() does not terminate the buffer; atoi() needs a string */
+       pid[n] = 0;
+       target = atoi(pid);
+       /* never signal pid 0 or a negative pid: that would hit a whole
+        * process group, including the caller
+        */
+       if (target <= 0)
+               return 0;
+       if (kill(target, SIGUSR1) == 0)
+               return 1;
+       return 0;
+}
+
+/* Run mdmon for the array numbered 'devnum'.
+ * The child tries, in order, an "mdmon" next to the running executable
+ * (from /proc/self/exe), "/sbin/mdmon" and "mdmon", passing the device
+ * name as the only argument.
+ * Returns 0 when MDADM_NO_MDMON is set in the environment or when the
+ * reaped child exited with status 0; -1 if fork() fails, wait() fails
+ * or the status is non-zero.
+ */
+int start_mdmon(int devnum)
+{
+       int i;
+       int len;
+       pid_t pid;
+       int status;
+       char pathbuf[1024];
+       char *paths[4] = {
+               pathbuf,
+               "/sbin/mdmon",
+               "mdmon",
+               NULL
+       };
+
+       if (check_env("MDADM_NO_MDMON"))
+               return 0;
+
+       /* readlink() does not NUL-terminate: keep one byte in reserve */
+       len = readlink("/proc/self/exe", pathbuf, sizeof(pathbuf)-1);
+       if (len > 0) {
+               char *sl;
+               pathbuf[len] = 0;
+               sl = strrchr(pathbuf, '/');
+               if (sl)
+                       sl++;
+               else
+                       sl = pathbuf;
+               /* replace the program name only if "mdmon" fits;
+                * otherwise drop this candidate
+                */
+               if (sl + sizeof("mdmon") <= pathbuf + sizeof(pathbuf))
+                       strcpy(sl, "mdmon");
+               else
+                       pathbuf[0] = '\0';
+       } else
+               pathbuf[0] = '\0';
+
+       switch(fork()) {
+       case 0:
+               /* FIXME yuk. CLOSE_EXEC?? */
+               for (i=3; i < 100; i++)
+                       close(i);
+               /* execl() only returns on failure, so fall through to
+                * the next candidate path
+                */
+               for (i=0; paths[i]; i++)
+                       if (paths[i][0])
+                               execl(paths[i], "mdmon",
+                                     devnum2devname(devnum),
+                                     NULL);
+               exit(1);
+       case -1: fprintf(stderr, Name ": cannot run mdmon. "
+                        "Array remains readonly\n");
+               return -1;
+       default: /* parent - good */
+               /* NOTE(review): wait() reaps any child, not
+                * necessarily the one forked above
+                */
+               pid = wait(&status);
+               if (pid < 0 || status != 0)
+                       return -1;
+       }
+       return 0;
+}
+
+/* Return 1 if the environment variable 'name' is set and atoi() of its
+ * value is exactly 1, otherwise 0.
+ */
+int check_env(char *name)
+{
+       const char *value = getenv(name);
+
+       return (value != NULL && atoi(value) == 1) ? 1 : 0;
+}
+
+#ifndef MDASSEMBLE
+/* Send every queued metadata update in st->updates over a connection
+ * obtained from connect_monitor() for st->container_dev, waiting for a
+ * reply after each one and freeing it, then ack and close.
+ * Returns 0 when the queue was sent, -1 if the queue was empty (in
+ * which case update_tail is reset to NULL) or the connection could
+ * not be made.
+ */
+int flush_metadata_updates(struct supertype *st)
+{
+       struct metadata_update *mu;
+       int sfd;
+
+       if (st->updates == NULL) {
+               st->update_tail = NULL;
+               return -1;
+       }
+
+       sfd = connect_monitor(devnum2devname(st->container_dev));
+       if (sfd < 0)
+               return -1;
+
+       /* pop from the head until the list is empty */
+       for (mu = st->updates; mu != NULL; mu = st->updates) {
+               st->updates = mu->next;
+
+               send_message(sfd, mu, 0);
+               wait_reply(sfd, 0);
+               free(mu->buf);
+               free(mu);
+       }
+       ack(sfd, 0);
+       wait_reply(sfd, 0);
+       close(sfd);
+       st->update_tail = NULL;
+       return 0;
+}
+
+/* Queue a metadata update of 'len' bytes at 'buf' on 'st'.  The new
+ * entry stores the 'buf' pointer itself (no copy is made) and is
+ * linked in through st->update_tail, which is then advanced.
+ */
+void append_metadata_update(struct supertype *st, void *buf, int len)
+{
+       struct metadata_update *entry = malloc(sizeof(*entry));
+
+       entry->next = NULL;
+       entry->space = NULL;
+       entry->len = len;
+       entry->buf = buf;
+       /* hook the entry in where the tail points, then move the tail
+        * to the new entry's next field
+        */
+       *st->update_tail = entry;
+       st->update_tail = &entry->next;
+}
+#endif /* MDASSEMBLE */
+
 #ifdef __TINYC__
 /* tinyc doesn't optimize this check in ioctl.h out ... */
 unsigned int __invalid_size_argument_for_IOC = 0;