/*-stamp
/mdadm
/mdadm.udeb
+/mdmon
+/swap_super
+/test_stripe
+/TAGS
--- /dev/null
+Subject: ANNOUNCE: mdadm 3.0-devel1 - A tool for managing Soft RAID under Linux
+
+I am pleased to announce the availability of
+ mdadm version 3.0-devel1
+
+It is available at the usual places:
+ countrycode=xx.
+ http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+ git://neil.brown.name/mdadm
+ http://neil.brown.name/git?p=mdadm
+
+Note that this is a "devel" release. It is not intended for
+production use yet, but rather for testing and ongoing development.
+
+The significant change which justifies the new major version number is
+that mdadm can now handle metadata updates entirely in userspace.
+This allows mdadm to support metadata formats that the kernel knows
+nothing about.
+
+Currently two such metadata formats are supported:
+ - DDF - The SNIA standard format
+ - Intel Matrix - The metadata used by recent Intel ICH controllers.
+
+The manual pages have not yet been updated, but here is a brief outline.
+
+Externally managed metadata introduces the concept of a 'container'.
+A container is a collection of (normally) physical devices which have
+a common set of metadata. A container is assembled as an md array, but
+is left 'inactive'.
+
+A container can contain one or more data arrays. These are composed from
+slices (partitions?) of various devices in the container.
+
+For example, a 5-device DDF set can contain a RAID1 using the first
+half of two devices, a RAID0 using the first half of the remaining 3 devices,
+and a RAID5 over the second half of all 5 devices.
+
+A container can be created with
+
+ mdadm --create /dev/md0 -e ddf -n5 /dev/sd[abcde]
+
+or "-e imsm" to use the Intel Matrix Storage Manager.
+
+An array can be created within a container either by giving the
+container name as the only member:
+
+ mdadm -C /dev/md1 --level raid1 -n 2 /dev/md0
+
+or by listing the component devices
+
+ mdadm -C /dev/md2 --level raid0 -n 3 /dev/sd[cde]
+
+To assemble a container, it is easiest just to pass each device in turn to
+mdadm -I
+
+ for i in /dev/sd[abcde]
+ do mdadm -I $i
+ done
+
+This will assemble the container and the components.
+
+Alternately the container can be assembled explicitly
+
+ mdadm -A /dev/md0 /dev/sd[abcde]
+
+Then the components can all be assembled with
+
+ mdadm -I /dev/md0
+
+For each container, mdadm will start a program called "mdmon" which will
+monitor the array and effect any metadata updates needed. The array is
+initially assembled readonly. It is up to "mdmon" to mark the metadata
+as 'dirty' and switch the array to 'read-write'.
+
+The version 0.90 and 1.x metadata formats supported by previous
+versions of mdadm are still supported and the kernel still performs
+the same updates it used to. The new 'mdmon' approach is only used for
+newly introduced metadata types.
+
+Any testing and feedback will be greatly appreciated.
+
+NeilBrown 18th September 2008
+
--- /dev/null
+Subject: ANNOUNCE: mdadm 3.0-devel2 - A tool for managing Soft RAID under Linux
+
+I am pleased to announce the availability of
+ mdadm version 3.0-devel2
+
+It is available at the usual places:
+ countrycode=xx.
+ http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+ git://neil.brown.name/mdadm
+ http://neil.brown.name/git?p=mdadm
+
+Note that this is a "devel" release. It should be used with
+caution, though it is believed to be close to release-candidate stage.
+
+The significant change which justifies the new major version number is
+that mdadm can now handle metadata updates entirely in userspace.
+This allows mdadm to support metadata formats that the kernel knows
+nothing about.
+
+Currently two such metadata formats are supported:
+ - DDF - The SNIA standard format
+ - Intel Matrix - The metadata used by recent Intel ICH controllers.
+
+Also the approach to device names has changed significantly.
+
+If udev is installed on the system, mdadm will not create any devices
+in /dev. Rather it allows udev to manage those devices. For this to work
+as expected, the included udev rules file should be installed.
+
+If udev is not installed, mdadm will still create devices and symlinks
+as required, and will also remove them when the array is stopped.
+
+mdadm now requires all devices which do not have a standard name (mdX
+or md_dX) to live in the directory /dev/md/. Names in this directory
+will always be created as symlinks back to the standard name in /dev.
+
+The man pages contain some information about the new externally managed
+metadata. However see below for a more condensed overview.
+
+Externally managed metadata introduces the concept of a 'container'.
+A container is a collection of (normally) physical devices which have
+a common set of metadata. A container is assembled as an md array, but
+is left 'inactive'.
+
+A container can contain one or more data arrays. These are composed from
+slices (partitions?) of various devices in the container.
+
+For example, a 5-device DDF set can contain a RAID1 using the first
+half of two devices, a RAID0 using the first half of the remaining 3 devices,
+and a RAID5 over the second half of all 5 devices.
+
+A container can be created with
+
+ mdadm --create /dev/md0 -e ddf -n5 /dev/sd[abcde]
+
+or "-e imsm" to use the Intel Matrix Storage Manager.
+
+An array can be created within a container either by giving the
+container name as the only member:
+
+ mdadm -C /dev/md1 --level raid1 -n 2 /dev/md0
+
+or by listing the component devices
+
+ mdadm -C /dev/md2 --level raid0 -n 3 /dev/sd[cde]
+
+To assemble a container, it is easiest just to pass each device in turn to
+mdadm -I
+
+ for i in /dev/sd[abcde]
+ do mdadm -I $i
+ done
+
+This will assemble the container and the components.
+
+Alternately the container can be assembled explicitly
+
+ mdadm -A /dev/md0 /dev/sd[abcde]
+
+Then the components can all be assembled with
+
+ mdadm -I /dev/md0
+
+For each container, mdadm will start a program called "mdmon" which will
+monitor the array and effect any metadata updates needed. The array is
+initially assembled readonly. It is up to "mdmon" to mark the metadata
+as 'dirty' and switch the array to 'read-write'.
+
+The version 0.90 and 1.x metadata formats supported by previous
+versions of mdadm are still supported and the kernel still performs
+the same updates it used to. The new 'mdmon' approach is only used for
+newly introduced metadata types.
+
+Any testing and feedback will be greatly appreciated.
+
+NeilBrown 5th November 2008
+
--- /dev/null
+Subject: ANNOUNCE: mdadm 3.0-devel3 - A tool for managing Soft RAID under Linux
+
+I am pleased to announce the availability of
+ mdadm version 3.0-devel3
+
+It is available at the usual places:
+ countrycode=xx.
+ http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+ git://neil.brown.name/mdadm
+ http://neil.brown.name/git?p=mdadm
+
+Note that this is a "devel" release. It should be used with
+caution, though it is believed to be close to release-candidate stage.
+
+There have been numerous improvements and additions since -devel2.
+I think we are close to a release of 3.0.
+
+I need to add lots of tests to the test suite to test the new
+functionality. And I need to review the man pages.
+
+After that I will release -rc1 followed by -final.
+
+
+The following is the same introduction to 3.x as appeared in
+previous announcements.
+
+
+Any testing and feedback will be greatly appreciated.
+
+NeilBrown 10th March 2009
+
+
+=====================================================
+
+The significant change which justifies the new major version number is
+that mdadm can now handle metadata updates entirely in userspace.
+This allows mdadm to support metadata formats that the kernel knows
+nothing about.
+
+Currently two such metadata formats are supported:
+ - DDF - The SNIA standard format
+ - Intel Matrix - The metadata used by recent Intel ICH controllers.
+
+Also the approach to device names has changed significantly.
+
+If udev is installed on the system, mdadm will not create any devices
+in /dev. Rather it allows udev to manage those devices. For this to work
+as expected, the included udev rules file should be installed.
+
+If udev is not installed, mdadm will still create devices and symlinks
+as required, and will also remove them when the array is stopped.
+
+mdadm now requires all devices which do not have a standard name (mdX
+or md_dX) to live in the directory /dev/md/. Names in this directory
+will always be created as symlinks back to the standard name in /dev.
+
+The man pages contain some information about the new externally managed
+metadata. However see below for a more condensed overview.
+
+Externally managed metadata introduces the concept of a 'container'.
+A container is a collection of (normally) physical devices which have
+a common set of metadata. A container is assembled as an md array, but
+is left 'inactive'.
+
+A container can contain one or more data arrays. These are composed from
+slices (partitions?) of various devices in the container.
+
+For example, a 5-device DDF set can contain a RAID1 using the first
+half of two devices, a RAID0 using the first half of the remaining 3 devices,
+and a RAID5 over the second half of all 5 devices.
+
+A container can be created with
+
+ mdadm --create /dev/md0 -e ddf -n5 /dev/sd[abcde]
+
+or "-e imsm" to use the Intel Matrix Storage Manager.
+
+An array can be created within a container either by giving the
+container name as the only member:
+
+ mdadm -C /dev/md1 --level raid1 -n 2 /dev/md0
+
+or by listing the component devices
+
+ mdadm -C /dev/md2 --level raid0 -n 3 /dev/sd[cde]
+
+To assemble a container, it is easiest just to pass each device in turn to
+mdadm -I
+
+ for i in /dev/sd[abcde]
+ do mdadm -I $i
+ done
+
+This will assemble the container and the components.
+
+Alternately the container can be assembled explicitly
+
+ mdadm -A /dev/md0 /dev/sd[abcde]
+
+Then the components can all be assembled with
+
+ mdadm -I /dev/md0
+
+For each container, mdadm will start a program called "mdmon" which will
+monitor the array and effect any metadata updates needed. The array is
+initially assembled readonly. It is up to "mdmon" to mark the metadata
+as 'dirty' and switch the array to 'read-write'.
+
+The version 0.90 and 1.x metadata formats supported by previous
+versions of mdadm are still supported and the kernel still performs
+the same updates it used to. The new 'mdmon' approach is only used for
+newly introduced metadata types.
--- /dev/null
+Subject: ANNOUNCE: mdadm 3.0-rc1 - A tool for managing Soft RAID under Linux
+
+I am pleased to announce the availability of
+ mdadm version 3.0-rc1
+
+It is available at the usual places:
+ countrycode=xx.
+ http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+ git://neil.brown.name/mdadm
+ http://neil.brown.name/git?p=mdadm
+
+This is a "release candidate" which means that I think it is safe
+to use and that there will be no significant change in functionality
+before release.
+
+The man pages aren't really "release candidate" yet but I will be
+working on them before the final release.
+
+The most significant changes since -devel3 relate to the names of md
+devices as they appear in /dev and /dev/md/, and in particular the names
+that are used when an array is assembled with "--incremental" or with
+"mdadm --assemble --scan" when there are no ARRAY lines in mdadm.conf.
+In these cases mdadm needs to deduce a name to use, and to try to
+avoid using a name that a different array might have a stronger claim to.
+The rules are:
+ - if the array is mentioned in mdadm.conf, use the name given there.
+ - if the array appears to have been created for "this host" using the
+ "homehost" concept, trust the name given in the metadata
+ - if the new setting "HOMEHOST <ignore>" is given (can be in mdadm.conf
+ or on command line) and the name given in the metadata is not
+ associated with some other array by mdadm.conf, then trust the
+ name given in the metadata
+ - otherwise use the name in the metadata, but in an untrusted manner.
+
+If a name is untrusted, or if the name is already in use by another
+array, then a numeric suffix like "_0", "_1" is appended to create
+a unique name for the array.
+
+That name is then used to create a device file in /dev/md/.
+
+So if all arrays needed for boot will always be listed in
+/etc/mdadm.conf, then it is appropriate to add "HOMEHOST <ignore>" to
+mdadm.conf and there is no risk of conflicting names. However if you
+want auto-assemble to assemble all arrays at boot time and you don't
+want to list them in mdadm.conf, then don't give "HOMEHOST <ignore>"
+either, or else there could be a risk of the wrong array being assembled
+for a given name.
+
+
+
+The following is the same introduction to 3.x as appeared in
+previous announcements.
+
+Any testing and feedback will be greatly appreciated.
+
+NeilBrown 11th May 2009
+
+
+=====================================================
+
+The significant change which justifies the new major version number is
+that mdadm can now handle metadata updates entirely in userspace.
+This allows mdadm to support metadata formats that the kernel knows
+nothing about.
+
+Currently two such metadata formats are supported:
+ - DDF - The SNIA standard format
+ - Intel Matrix - The metadata used by recent Intel ICH controllers.
+
+Also the approach to device names has changed significantly.
+
+If udev is installed on the system, mdadm will not create any devices
+in /dev. Rather it allows udev to manage those devices. For this to work
+as expected, the included udev rules file should be installed.
+
+If udev is not installed, mdadm will still create devices and symlinks
+as required, and will also remove them when the array is stopped.
+
+mdadm now requires all devices which do not have a standard name (mdX
+or md_dX) to live in the directory /dev/md/. Names in this directory
+will always be created as symlinks back to the standard name in /dev.
+
+The man pages contain some information about the new externally managed
+metadata. However see below for a more condensed overview.
+
+Externally managed metadata introduces the concept of a 'container'.
+A container is a collection of (normally) physical devices which have
+a common set of metadata. A container is assembled as an md array, but
+is left 'inactive'.
+
+A container can contain one or more data arrays. These are composed from
+slices (partitions?) of various devices in the container.
+
+For example, a 5-device DDF set can contain a RAID1 using the first
+half of two devices, a RAID0 using the first half of the remaining 3 devices,
+and a RAID5 over the second half of all 5 devices.
+
+A container can be created with
+
+ mdadm --create /dev/md0 -e ddf -n5 /dev/sd[abcde]
+
+or "-e imsm" to use the Intel Matrix Storage Manager.
+
+An array can be created within a container either by giving the
+container name as the only member:
+
+ mdadm -C /dev/md1 --level raid1 -n 2 /dev/md0
+
+or by listing the component devices
+
+ mdadm -C /dev/md2 --level raid0 -n 3 /dev/sd[cde]
+
+To assemble a container, it is easiest just to pass each device in turn to
+mdadm -I
+
+ for i in /dev/sd[abcde]
+ do mdadm -I $i
+ done
+
+This will assemble the container and the components.
+
+Alternately the container can be assembled explicitly
+
+ mdadm -A /dev/md0 /dev/sd[abcde]
+
+Then the components can all be assembled with
+
+ mdadm -I /dev/md0
+
+For each container, mdadm will start a program called "mdmon" which will
+monitor the array and effect any metadata updates needed. The array is
+initially assembled readonly. It is up to "mdmon" to mark the metadata
+as 'dirty' and switch the array to 'read-write'.
+
+The version 0.90 and 1.x metadata formats supported by previous
+versions of mdadm are still supported and the kernel still performs
+the same updates it used to. The new 'mdmon' approach is only used for
+newly introduced metadata types.
/*
* mdadm - manage Linux "md" devices aka RAID arrays.
*
- * Copyright (C) 2001-2006 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
*
*
* This program is free software; you can redistribute it and/or modify
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
* Author: Neil Brown
- * Email: <neilb@cse.unsw.edu.au>
- * Paper: Neil Brown
- * School of Computer Science and Engineering
- * The University of New South Wales
- * Sydney, 2052
- * Australia
+ * Email: <neilb@suse.de>
*/
#include "mdadm.h"
return 0;
}
-int Assemble(struct supertype *st, char *mddev, int mdfd,
+static int is_member_busy(char *metadata_version)
+{
+ /* check if the given member array is active.
+  *
+  * Walks the list returned by mdstat_read() looking for an entry
+  * whose metadata_version starts with "external:", whose text after
+  * that prefix passes is_subarray(), and whose remainder equals the
+  * caller's metadata_version (first character of each ignored).
+  * Returns 1 on the first such match and 0 if none is found; the
+  * list is released with free_mdstat() before returning.
+  */
+ struct mdstat_ent *mdstat = mdstat_read(1, 0);
+ struct mdstat_ent *ent;
+ int busy = 0;
+
+ for (ent = mdstat; ent; ent = ent->next) {
+ if (ent->metadata_version == NULL)
+ continue;
+ if (strncmp(ent->metadata_version, "external:", 9) != 0)
+ continue;
+ if (!is_subarray(&ent->metadata_version[9]))
+ continue;
+ /* Skip first char - it can be '/' or '-'.
+  * That char sits at index 9 of the mdstat string (right after
+  * the 9-byte "external:" prefix) and at index 0 of the caller's
+  * string, hence the [10] and +1 offsets in the comparison.
+  */
+ if (strcmp(&ent->metadata_version[10], metadata_version+1) == 0) {
+ busy = 1;
+ break;
+ }
+ }
+ free_mdstat(mdstat);
+
+ return busy;
+}
+
+int Assemble(struct supertype *st, char *mddev,
mddev_ident_t ident,
mddev_dev_t devlist, char *backup_file,
int readonly, int runstop,
- char *update, char *homehost,
+ char *update, char *homehost, int require_homehost,
int verbose, int force)
{
/*
* START_ARRAY
*
*/
- int clean = 0;
- int must_close = 0;
+ int mdfd;
+ int clean;
+ int auto_assem = (mddev == NULL && !ident->uuid_set &&
+ ident->super_minor == UnSet && ident->name[0] == 0
+ && (ident->container == NULL || ident->member == NULL));
int old_linux = 0;
- int vers = 0; /* Keep gcc quite - it really is initialised */
+ int vers = vers; /* Keep gcc quite - it really is initialised */
struct {
char *devname;
int uptodate; /* set once we decide that this device is as
int chosen_drive;
int change = 0;
int inargv = 0;
+ int report_missmatch;
int bitmap_done;
- int start_partial_ok = (runstop >= 0) && (force || devlist==NULL || mdfd < 0);
+ int start_partial_ok = (runstop >= 0) &&
+ (force || devlist==NULL || auto_assem);
unsigned int num_devs;
mddev_dev_t tmpdev;
struct mdinfo info;
+ struct mdinfo *content = NULL;
char *avail;
int nextspare = 0;
+ char *name = NULL;
+ int trustworthy;
+ char chosen_name[1024];
if (get_linux_version() < 2004000)
old_linux = 1;
- if (mdfd >= 0) {
- vers = md_get_version(mdfd);
- if (vers <= 0) {
- fprintf(stderr, Name ": %s appears not to be an md device.\n", mddev);
- return 1;
- }
- if (vers < 9000) {
- fprintf(stderr, Name ": Assemble requires driver version 0.90.0 or later.\n"
- " Upgrade your kernel or try --build\n");
- return 1;
- }
-
- if (ioctl(mdfd, GET_ARRAY_INFO, &info.array)>=0) {
- fprintf(stderr, Name ": device %s already active - cannot assemble it\n",
- mddev);
- return 1;
- }
- ioctl(mdfd, STOP_ARRAY, NULL); /* just incase it was started but has no content */
- }
/*
* If any subdevs are listed, then any that don't
* match ident are discarded. Remainder must all match and
if (!devlist &&
ident->uuid_set == 0 &&
ident->super_minor < 0 &&
+ ident->name[0] == 0 &&
+ (ident->container == NULL || ident->member == NULL) &&
ident->devices == NULL) {
fprintf(stderr, Name ": No identity information available for %s - cannot assemble.\n",
mddev ? mddev : "further assembly");
return 1;
}
+
if (devlist == NULL)
devlist = conf_get_devs();
- else if (mdfd >= 0)
+ else if (mddev)
inargv = 1;
+ report_missmatch = ((inargv && verbose >= 0) || verbose > 0);
try_again:
+ /* We come back here when doing auto-assembly and attempting some
+ * set of devices failed. Those are now marked as ->used==2 and
+ * we ignore them and try again
+ */
tmpdev = devlist; num_devs = 0;
while (tmpdev) {
/* first walk the list of devices to find a consistent set
* that match the criterea, if that is possible.
- * We flag the one we like with 'used'.
+ * We flag the ones we like with 'used'.
*/
for (tmpdev = devlist;
tmpdev;
if (ident->devices &&
!match_oneof(ident->devices, devname)) {
- if ((inargv && verbose>=0) || verbose > 0)
+ if (report_missmatch)
fprintf(stderr, Name ": %s is not one of %s\n", devname, ident->devices);
continue;
}
dfd = dev_open(devname, O_RDONLY|O_EXCL);
if (dfd < 0) {
- if ((inargv && verbose >= 0) || verbose > 0)
+ if (report_missmatch)
fprintf(stderr, Name ": cannot open device %s: %s\n",
devname, strerror(errno));
tmpdev->used = 2;
devname);
tmpdev->used = 2;
} else if (!tst && (tst = guess_super(dfd)) == NULL) {
- if ((inargv && verbose >= 0) || verbose > 0)
+ if (report_missmatch)
fprintf(stderr, Name ": no recogniseable superblock on %s\n",
devname);
tmpdev->used = 2;
+ } else if (auto_assem && st == NULL &&
+ !conf_test_metadata(tst->ss->name)) {
+ if (report_missmatch)
+ fprintf(stderr, Name ": %s has metadata type %s for which "
+ "auto-assembly is disabled\n",
+ devname, tst->ss->name);
+ tmpdev->used = 2;
} else if (tst->ss->load_super(tst,dfd, NULL)) {
- if ((inargv && verbose >= 0) || verbose > 0)
+ if (report_missmatch)
fprintf( stderr, Name ": no RAID superblock on %s\n",
devname);
} else {
- tst->ss->getinfo_super(tst, &info);
+ content = &info;
+ memset(content, 0, sizeof(*content));
+ tst->ss->getinfo_super(tst, content);
}
if (dfd >= 0) close(dfd);
+ if (tst && tst->sb && tst->ss->container_content
+ && tst->loaded_container) {
+ /* tmpdev is a container. We need to be either
+ * looking for a member, or auto-assembling
+ */
+ if (st) {
+ /* already found some components, this cannot
+ * be another one.
+ */
+ if (report_missmatch)
+ fprintf(stderr, Name ": %s is a container, but we are looking for components\n",
+ devname);
+ goto loop;
+ }
+
+ if (ident->container) {
+ if (ident->container[0] == '/' &&
+ !same_dev(ident->container, devname)) {
+ if (report_missmatch)
+ fprintf(stderr, Name ": %s is not the container required (%s)\n",
+ devname, ident->container);
+ goto loop;
+ }
+ if (ident->container[0] != '/') {
+ /* we have a uuid */
+ int uuid[4];
+ if (!parse_uuid(ident->container, uuid) ||
+ !same_uuid(content->uuid, uuid, tst->ss->swapuuid)) {
+ if (report_missmatch)
+ fprintf(stderr, Name ": %s has wrong UUID to be required container\n",
+ devname);
+ goto loop;
+ }
+ }
+ }
+ /* It is worth looking inside this container.
+ */
+ next_member:
+ if (tmpdev->content)
+ content = tmpdev->content;
+ else
+ content = tst->ss->container_content(tst);
+
+ tmpdev->content = content->next;
+ if (tmpdev->content == NULL)
+ tmpdev->used = 2;
+
+ } else if (ident->container || ident->member) {
+ /* No chance of this matching if we don't have
+ * a container */
+ if (report_missmatch)
+ fprintf(stderr, Name "%s is not a container, and one is required.\n",
+ devname);
+ goto loop;
+ }
+
if (ident->uuid_set && (!update || strcmp(update, "uuid")!= 0) &&
(!tst || !tst->sb ||
- same_uuid(info.uuid, ident->uuid, tst->ss->swapuuid)==0)) {
- if ((inargv && verbose >= 0) || verbose > 0)
+ same_uuid(content->uuid, ident->uuid, tst->ss->swapuuid)==0)) {
+ if (report_missmatch)
fprintf(stderr, Name ": %s has wrong uuid.\n",
devname);
goto loop;
}
if (ident->name[0] && (!update || strcmp(update, "name")!= 0) &&
(!tst || !tst->sb ||
- name_matches(info.name, ident->name, homehost)==0)) {
- if ((inargv && verbose >= 0) || verbose > 0)
+ name_matches(content->name, ident->name, homehost)==0)) {
+ if (report_missmatch)
fprintf(stderr, Name ": %s has wrong name.\n",
devname);
goto loop;
}
if (ident->super_minor != UnSet &&
(!tst || !tst->sb ||
- ident->super_minor != info.array.md_minor)) {
- if ((inargv && verbose >= 0) || verbose > 0)
+ ident->super_minor != content->array.md_minor)) {
+ if (report_missmatch)
fprintf(stderr, Name ": %s has wrong super-minor.\n",
devname);
goto loop;
}
if (ident->level != UnSet &&
(!tst || !tst->sb ||
- ident->level != info.array.level)) {
- if ((inargv && verbose >= 0) || verbose > 0)
+ ident->level != content->array.level)) {
+ if (report_missmatch)
fprintf(stderr, Name ": %s has wrong raid level.\n",
devname);
goto loop;
}
if (ident->raid_disks != UnSet &&
(!tst || !tst->sb ||
- ident->raid_disks!= info.array.raid_disks)) {
- if ((inargv && verbose >= 0) || verbose > 0)
+ ident->raid_disks!= content->array.raid_disks)) {
+ if (report_missmatch)
fprintf(stderr, Name ": %s requires wrong number of drives.\n",
devname);
goto loop;
}
- if (mdfd < 0) {
+ if (auto_assem) {
if (tst == NULL || tst->sb == NULL)
continue;
- if (update == NULL &&
- tst->ss->match_home(tst, homehost)==0) {
- if ((inargv && verbose >= 0) || verbose > 0)
- fprintf(stderr, Name ": %s is not built for host %s.\n",
- devname, homehost);
- /* Auto-assemble, and this is not a usable host */
- /* if update != NULL, we are updating the host
- * name... */
- goto loop;
- }
}
/* If we are this far, then we are nearly commited to this device.
* If the super_block doesn't exist, or doesn't match others,
return 1;
}
+ if (tst && tst->sb && tst->ss->container_content
+ && tst->loaded_container) {
+ /* we have the one container we need, don't keep
+ * looking. If the chosen member is active, skip.
+ */
+ if (is_member_busy(content->text_version)) {
+ if (report_missmatch)
+ fprintf(stderr, Name ": member %s in %s is already assembled\n",
+ content->text_version,
+ devname);
+ tst->ss->free_super(tst);
+ tst = NULL;
+ content = NULL;
+ if (auto_assem)
+ goto loop;
+ return 1;
+ }
+ st = tst; tst = NULL;
+ if (!auto_assem && tmpdev->next != NULL) {
+ fprintf(stderr, Name ": %s is a container, but is not "
+ "only device given: confused and aborting\n",
+ devname);
+ st->ss->free_super(st);
+ return 1;
+ }
+ break;
+ }
if (st == NULL)
st = dup_super(tst);
if (st->minor_version == -1)
* Or, if we are auto assembling, we just ignore the second
* for now.
*/
- if (mdfd < 0)
+ if (auto_assem)
goto loop;
if (homehost) {
int first = st->ss->match_home(st, homehost);
int last = tst->ss->match_home(tst, homehost);
- if (first+last == 1) {
+ if (first != last &&
+ (first == 1 || last == 1)) {
/* We can do something */
if (first) {/* just ignore this one */
- if ((inargv && verbose >= 0) || verbose > 0)
+ if (report_missmatch)
fprintf(stderr, Name ": %s misses out due to wrong homehost\n",
devname);
goto loop;
} else { /* reject all those sofar */
mddev_dev_t td;
- if ((inargv && verbose >= 0) || verbose > 0)
+ if (report_missmatch)
fprintf(stderr, Name ": %s overrides previous devices due to good homehost\n",
devname);
for (td=devlist; td != tmpdev; td=td->next)
tmpdev->used = 1;
loop:
+ if (tmpdev->content)
+ goto next_member;
if (tst)
tst->ss->free_super(tst);
}
- if (mdfd < 0) {
- /* So... it is up to me to open the device.
- * We create a name '/dev/md/XXX' based on the info in the
- * superblock, and call open_mddev on that
- */
- mdu_array_info_t inf;
- char *c;
- if (!st || !st->sb) {
- return 2;
- }
- st->ss->getinfo_super(st, &info);
- c = strchr(info.name, ':');
- if (c) c++; else c= info.name;
- if (isdigit(*c) && ((ident->autof & 7)==4 || (ident->autof&7)==6))
- /* /dev/md/d0 style for partitionable */
- xasprintf(&mddev, "/dev/md/d%s", c);
+ if (!st || !st->sb || !content)
+ return 2;
+
+ /* Now need to open the array device. Use create_mddev */
+ if (content == &info)
+ st->ss->getinfo_super(st, content);
+
+ trustworthy = FOREIGN;
+ name = content->name;
+ switch (st->ss->match_home(st, homehost)
+ ?: st->ss->match_home(st, "any")) {
+ case 1:
+ trustworthy = LOCAL;
+ name = strchr(content->name, ':');
+ if (name)
+ name++;
else
- xasprintf(&mddev, "/dev/md/%s", c);
- mdfd = open_mddev(mddev, ident->autof);
- if (mdfd < 0) {
- st->ss->free_super(st);
- free(devices);
+ name = content->name;
+ break;
+ }
+ if (!auto_assem)
+ /* If the array is listed in mdadm.conf or on
+ * command line, then we trust the name
+ * even if the array doesn't look local
+ */
+ trustworthy = LOCAL;
+
+ if (name[0] == 0 &&
+ content->array.level == LEVEL_CONTAINER) {
+ name = content->text_version;
+ trustworthy = METADATA;
+ }
+
+ if (name[0] && trustworthy != LOCAL &&
+ ! require_homehost &&
+ conf_name_is_free(name))
+ trustworthy = LOCAL;
+
+ if (trustworthy == LOCAL &&
+ strchr(name, ':'))
+ /* Ignore 'host:' prefix of name */
+ name = strchr(name, ':')+1;
+
+ mdfd = create_mddev(mddev, name, ident->autof, trustworthy,
+ chosen_name);
+ if (mdfd < 0) {
+ st->ss->free_super(st);
+ free(devices);
+ if (auto_assem)
goto try_again;
- }
- vers = md_get_version(mdfd);
- if (ioctl(mdfd, GET_ARRAY_INFO, &inf)==0) {
- for (tmpdev = devlist ;
- tmpdev && tmpdev->used != 1;
- tmpdev = tmpdev->next)
- ;
- fprintf(stderr, Name ": %s already active, cannot restart it!\n", mddev);
- if (tmpdev)
- fprintf(stderr, Name ": %s needed for %s...\n",
- mddev, tmpdev->devname);
- close(mdfd);
- mdfd = -1;
- st->ss->free_super(st);
- free(devices);
+ return 1;
+ }
+ mddev = chosen_name;
+ vers = md_get_version(mdfd);
+ if (vers < 9000) {
+ fprintf(stderr, Name ": Assemble requires driver version 0.90.0 or later.\n"
+ " Upgrade your kernel or try --build\n");
+ close(mdfd);
+ return 1;
+ }
+ if (mddev_busy(fd2devnum(mdfd))) {
+ fprintf(stderr, Name ": %s already active, cannot restart it!\n",
+ mddev);
+ for (tmpdev = devlist ;
+ tmpdev && tmpdev->used != 1;
+ tmpdev = tmpdev->next)
+ ;
+ if (tmpdev && auto_assem)
+ fprintf(stderr, Name ": %s needed for %s...\n",
+ mddev, tmpdev->devname);
+ close(mdfd);
+ mdfd = -3;
+ st->ss->free_super(st);
+ free(devices);
+ if (auto_assem)
goto try_again;
- }
- must_close = 1;
+ return 1;
}
+ ioctl(mdfd, STOP_ARRAY, NULL); /* just incase it was started but has no content */
+#ifndef MDASSEMBLE
+ if (content != &info) {
+ /* This is a member of a container. Try starting the array. */
+ return assemble_container_content(st, mdfd, content, runstop,
+ chosen_name, verbose);
+ }
+#endif
/* Ok, no bad inconsistancy, we can try updating etc */
bitmap_done = 0;
for (tmpdev = devlist; tmpdev; tmpdev=tmpdev->next) if (tmpdev->used == 1) {
tst = dup_super(st);
tst->ss->load_super(tst, dfd, NULL);
- tst->ss->getinfo_super(tst, &info);
+ tst->ss->getinfo_super(tst, content);
- memcpy(info.uuid, ident->uuid, 16);
- strcpy(info.name, ident->name);
- info.array.md_minor = minor(stb2.st_rdev);
+ memcpy(content->uuid, ident->uuid, 16);
+ strcpy(content->name, ident->name);
+ content->array.md_minor = minor(stb2.st_rdev);
- tst->ss->update_super(tst, &info, update,
+ tst->ss->update_super(tst, content, update,
devname, verbose,
ident->uuid_set, homehost);
if (strcmp(update, "uuid")==0 &&
!ident->uuid_set) {
ident->uuid_set = 1;
- memcpy(ident->uuid, info.uuid, 16);
+ memcpy(ident->uuid, content->uuid, 16);
}
if (dfd < 0)
fprintf(stderr, Name ": Cannot open %s for superblock update\n",
if (strcmp(update, "uuid")==0 &&
ident->bitmap_fd >= 0 && !bitmap_done) {
if (bitmap_update_uuid(ident->bitmap_fd,
- info.uuid,
+ content->uuid,
tst->ss->swapuuid) != 0)
fprintf(stderr, Name ": Could not update uuid on external bitmap.\n");
else
remove_partitions(dfd);
tst->ss->load_super(tst, dfd, NULL);
- tst->ss->getinfo_super(tst, &info);
+ tst->ss->getinfo_super(tst, content);
tst->ss->free_super(tst);
close(dfd);
}
if (verbose > 0)
fprintf(stderr, Name ": %s is identified as a member of %s, slot %d.\n",
- devname, mddev, info.disk.raid_disk);
+ devname, mddev, content->disk.raid_disk);
devices[devcnt].devname = devname;
devices[devcnt].uptodate = 0;
- devices[devcnt].i = info;
+ devices[devcnt].i = *content;
devices[devcnt].i.disk.major = major(stb.st_rdev);
devices[devcnt].i.disk.minor = minor(stb.st_rdev);
if (most_recent < devcnt) {
> devices[most_recent].i.events)
most_recent = devcnt;
}
- if (info.array.level == -4)
+ if (content->array.level == -4)
/* with multipath, the raid_disk from the superblock is meaningless */
i = devcnt;
else
i = devices[devcnt].i.disk.raid_disk;
if (i+1 == 0) {
- if (nextspare < info.array.raid_disks)
- nextspare = info.array.raid_disks;
+ if (nextspare < content->array.raid_disks)
+ nextspare = content->array.raid_disks;
i = nextspare++;
} else {
- if (i >= info.array.raid_disks &&
+ if (i >= content->array.raid_disks &&
i >= nextspare)
nextspare = i+1;
}
== devices[devcnt].i.events
&& (devices[best[i]].i.disk.minor
!= devices[devcnt].i.disk.minor)
- && st->ss->major == 0
- && info.array.level != -4) {
+ && st->ss == &super0
+ && content->array.level != LEVEL_MULTIPATH) {
/* two different devices with identical superblock.
* Could be a mis-detection caused by overlapping
* partitions. fail-safe.
inargv ? "the list" :
"the\n DEVICE list in mdadm.conf"
);
- if (must_close) close(mdfd);
+ close(mdfd);
return 1;
}
if (best[i] == -1
mddev);
if (st)
st->ss->free_super(st);
- if (must_close) close(mdfd);
+ close(mdfd);
return 1;
}
if (update && strcmp(update, "byteorder")==0)
st->minor_version = 90;
- st->ss->getinfo_super(st, &info);
- clean = info.array.state & 1;
+ st->ss->getinfo_super(st, content);
+ clean = content->array.state & 1;
/* now we have some devices that might be suitable.
* I wonder how many
*/
- avail = malloc(info.array.raid_disks);
- memset(avail, 0, info.array.raid_disks);
+ avail = malloc(content->array.raid_disks);
+ memset(avail, 0, content->array.raid_disks);
okcnt = 0;
sparecnt=0;
for (i=0; i< bestcnt ;i++) {
/* note: we ignore error flags in multipath arrays
* as they don't make sense
*/
- if (info.array.level != -4)
+ if (content->array.level != -4)
if (!(devices[j].i.disk.state & (1<<MD_DISK_SYNC))) {
if (!(devices[j].i.disk.state
& (1<<MD_DISK_FAULTY)))
if (devices[j].i.events+event_margin >=
devices[most_recent].i.events) {
devices[j].uptodate = 1;
- if (i < info.array.raid_disks) {
+ if (i < content->array.raid_disks) {
okcnt++;
avail[i]=1;
} else
sparecnt++;
}
}
- while (force && !enough(info.array.level, info.array.raid_disks,
- info.array.layout, 1,
+ while (force && !enough(content->array.level, content->array.raid_disks,
+ content->array.layout, 1,
avail, okcnt)) {
/* Choose the newest best drive which is
* not up-to-date, update the superblock
struct supertype *tst;
long long current_events;
chosen_drive = -1;
- for (i=0; i<info.array.raid_disks && i < bestcnt; i++) {
+ for (i=0; i<content->array.raid_disks && i < bestcnt; i++) {
int j = best[i];
if (j>=0 &&
!devices[j].uptodate &&
devices[chosen_drive].i.events = 0;
continue;
}
- info.events = devices[most_recent].i.events;
- tst->ss->update_super(tst, &info, "force-one",
+ content->events = devices[most_recent].i.events;
+ tst->ss->update_super(tst, content, "force-one",
devices[chosen_drive].devname, verbose,
0, NULL);
/* If there are any other drives of the same vintage,
* add them in as well. We can't lose and we might gain
*/
- for (i=0; i<info.array.raid_disks && i < bestcnt ; i++) {
+ for (i=0; i<content->array.raid_disks && i < bestcnt ; i++) {
int j = best[i];
if (j >= 0 &&
!devices[j].uptodate &&
if ((fd=dev_open(devices[j].devname, O_RDONLY|O_EXCL))< 0) {
fprintf(stderr, Name ": Cannot open %s: %s\n",
devices[j].devname, strerror(errno));
- if (must_close) close(mdfd);
+ close(mdfd);
return 1;
}
if (st->ss->load_super(st,fd, NULL)) {
close(fd);
fprintf(stderr, Name ": RAID superblock has disappeared from %s\n",
devices[j].devname);
- if (must_close) close(mdfd);
+ close(mdfd);
return 1;
}
close(fd);
}
if (st->sb == NULL) {
fprintf(stderr, Name ": No suitable drives found for %s\n", mddev);
- if (must_close) close(mdfd);
+ close(mdfd);
return 1;
}
- st->ss->getinfo_super(st, &info);
+ st->ss->getinfo_super(st, content);
+#ifndef MDASSEMBLE
+ sysfs_init(content, mdfd, 0);
+#endif
for (i=0; i<bestcnt; i++) {
int j = best[i];
unsigned int desired_state;
- if (i < info.array.raid_disks)
+ if (i < content->array.raid_disks)
desired_state = (1<<MD_DISK_ACTIVE) | (1<<MD_DISK_SYNC);
else
desired_state = 0;
#endif
}
if (force && !clean &&
- !enough(info.array.level, info.array.raid_disks,
- info.array.layout, clean,
+ !enough(content->array.level, content->array.raid_disks,
+ content->array.layout, clean,
avail, okcnt)) {
- change += st->ss->update_super(st, &info, "force-array",
+ change += st->ss->update_super(st, content, "force-array",
devices[chosen_drive].devname, verbose,
0, NULL);
clean = 1;
if (fd < 0) {
fprintf(stderr, Name ": Could not open %s for write - cannot Assemble array.\n",
devices[chosen_drive].devname);
- if (must_close) close(mdfd);
+ close(mdfd);
return 1;
}
if (st->ss->store_super(st, fd)) {
close(fd);
fprintf(stderr, Name ": Could not re-write superblock on %s\n",
devices[chosen_drive].devname);
- if (must_close) close(mdfd);
+ close(mdfd);
return 1;
}
close(fd);
* The code of doing this lives in Grow.c
*/
#ifndef MDASSEMBLE
- if (info.reshape_active) {
+ if (content->reshape_active) {
int err = 0;
int *fdlist = malloc(sizeof(int)* bestcnt);
for (i=0; i<bestcnt; i++) {
fdlist[i] = -1;
}
if (!err)
- err = Grow_restart(st, &info, fdlist, bestcnt, backup_file);
+ err = Grow_restart(st, content, fdlist, bestcnt, backup_file);
while (i>0) {
i--;
if (fdlist[i]>=0) close(fdlist[i]);
}
if (err) {
fprintf(stderr, Name ": Failed to restore critical section for reshape, sorry.\n");
- if (must_close) close(mdfd);
+ close(mdfd);
return err;
}
}
/* count number of in-sync devices according to the superblock.
* We must have this number to start the array without -s or -R
*/
- req_cnt = info.array.working_disks;
+ req_cnt = content->array.working_disks;
/* Almost ready to actually *do* something */
if (!old_linux) {
int rv;
- if ((vers % 100) >= 1) { /* can use different versions */
- mdu_array_info_t inf;
- memset(&inf, 0, sizeof(inf));
- inf.major_version = st->ss->major;
- inf.minor_version = st->minor_version;
- rv = ioctl(mdfd, SET_ARRAY_INFO, &inf);
- } else
- rv = ioctl(mdfd, SET_ARRAY_INFO, NULL);
+ /* First, fill in the map, so that udev can find our name
+ * as soon as we become active.
+ */
+ map_update(NULL, fd2devnum(mdfd), content->text_version,
+ content->uuid, chosen_name);
+
+ rv = set_array_info(mdfd, st, content);
if (rv) {
- fprintf(stderr, Name ": SET_ARRAY_INFO failed for %s: %s\n",
+ fprintf(stderr, Name ": failed to set array info for %s: %s\n",
mddev, strerror(errno));
- if (must_close) close(mdfd);
+ close(mdfd);
return 1;
}
if (ident->bitmap_fd >= 0) {
if (ioctl(mdfd, SET_BITMAP_FILE, ident->bitmap_fd) != 0) {
fprintf(stderr, Name ": SET_BITMAP_FILE failed.\n");
- if (must_close) close(mdfd);
+ close(mdfd);
return 1;
}
} else if (ident->bitmap_file) {
if (bmfd < 0) {
fprintf(stderr, Name ": Could not open bitmap file %s\n",
ident->bitmap_file);
- if (must_close) close(mdfd);
+ close(mdfd);
return 1;
}
if (ioctl(mdfd, SET_BITMAP_FILE, bmfd) != 0) {
fprintf(stderr, Name ": Failed to set bitmapfile for %s\n", mddev);
close(bmfd);
- if (must_close) close(mdfd);
+ close(mdfd);
return 1;
}
close(bmfd);
j = chosen_drive;
if (j >= 0 /* && devices[j].uptodate */) {
- if (ioctl(mdfd, ADD_NEW_DISK,
- &devices[j].i.disk)!=0) {
+ rv = add_disk(mdfd, st, content, &devices[j].i);
+
+ if (rv) {
fprintf(stderr, Name ": failed to add "
"%s to %s: %s\n",
devices[j].devname,
mddev,
strerror(errno));
- if (i < info.array.raid_disks
+ if (i < content->array.raid_disks
|| i == bestcnt)
okcnt--;
else
"to %s as %d\n",
devices[j].devname, mddev,
devices[j].i.disk.raid_disk);
- } else if (verbose > 0 && i < info.array.raid_disks)
+ } else if (verbose > 0 && i < content->array.raid_disks)
fprintf(stderr, Name ": no uptodate device for "
"slot %d of %s\n",
i, mddev);
}
+ if (content->array.level == LEVEL_CONTAINER) {
+ if (verbose >= 0) {
+ fprintf(stderr, Name ": Container %s has been "
+ "assembled with %d drive%s",
+ mddev, okcnt+sparecnt, okcnt+sparecnt==1?"":"s");
+ if (okcnt < content->array.raid_disks)
+ fprintf(stderr, " (out of %d)",
+ content->array.raid_disks);
+ fprintf(stderr, "\n");
+ }
+ sysfs_uevent(content, "change");
+ wait_for(chosen_name, mdfd);
+ close(mdfd);
+ return 0;
+ }
+
if (runstop == 1 ||
(runstop <= 0 &&
- ( enough(info.array.level, info.array.raid_disks,
- info.array.layout, clean, avail, okcnt) &&
+ ( enough(content->array.level, content->array.raid_disks,
+ content->array.layout, clean, avail, okcnt) &&
(okcnt >= req_cnt || start_partial_ok)
))) {
if (ioctl(mdfd, RUN_ARRAY, NULL)==0) {
if (verbose >= 0) {
fprintf(stderr, Name ": %s has been started with %d drive%s",
mddev, okcnt, okcnt==1?"":"s");
- if (okcnt < info.array.raid_disks)
- fprintf(stderr, " (out of %d)", info.array.raid_disks);
+ if (okcnt < content->array.raid_disks)
+ fprintf(stderr, " (out of %d)", content->array.raid_disks);
if (sparecnt)
fprintf(stderr, " and %d spare%s", sparecnt, sparecnt==1?"":"s");
fprintf(stderr, ".\n");
}
- if (info.reshape_active &&
- info.array.level >= 4 &&
- info.array.level <= 6) {
+ if (content->reshape_active &&
+ content->array.level >= 4 &&
+ content->array.level <= 6) {
/* might need to increase the size
* of the stripe cache - default is 256
*/
- if (256 < 4 * (info.array.chunk_size/4096)) {
+ if (256 < 4 * (content->array.chunk_size/4096)) {
struct mdinfo *sra = sysfs_read(mdfd, 0, 0);
if (sra)
sysfs_set_num(sra, NULL,
"stripe_cache_size",
- (4 * info.array.chunk_size / 4096) + 1);
+ (4 * content->array.chunk_size / 4096) + 1);
}
}
- if (must_close) {
+ wait_for(mddev, mdfd);
+ close(mdfd);
+ if (auto_assem) {
int usecs = 1;
- close(mdfd);
/* There is a nasty race with 'mdadm --monitor'.
* If it opens this device before we close it,
* it gets an incomplete open on which IO
- * doesn't work and the capacity if wrong.
+ * doesn't work and the capacity is
+ * wrong.
* If we reopen (to check for layered devices)
* before --monitor closes, we loose.
*
fprintf(stderr, Name ": failed to RUN_ARRAY %s: %s\n",
mddev, strerror(errno));
- if (!enough(info.array.level, info.array.raid_disks,
- info.array.layout, 1, avail, okcnt))
+ if (!enough(content->array.level, content->array.raid_disks,
+ content->array.layout, 1, avail, okcnt))
fprintf(stderr, Name ": Not enough devices to "
"start the array.\n");
- else if (!enough(info.array.level,
- info.array.raid_disks,
- info.array.layout, clean,
+ else if (!enough(content->array.level,
+ content->array.raid_disks,
+ content->array.layout, clean,
avail, okcnt))
fprintf(stderr, Name ": Not enough devices to "
"start the array while not clean "
"- consider --force.\n");
- if (must_close) {
+ if (auto_assem)
ioctl(mdfd, STOP_ARRAY, NULL);
- close(mdfd);
- }
+ close(mdfd);
return 1;
}
if (runstop == -1) {
fprintf(stderr, Name ": %s assembled from %d drive%s",
mddev, okcnt, okcnt==1?"":"s");
- if (okcnt != info.array.raid_disks)
- fprintf(stderr, " (out of %d)", info.array.raid_disks);
+ if (okcnt != content->array.raid_disks)
+ fprintf(stderr, " (out of %d)", content->array.raid_disks);
fprintf(stderr, ", but not started.\n");
- if (must_close) close(mdfd);
+ close(mdfd);
return 0;
}
if (verbose >= -1) {
fprintf(stderr, Name ": %s assembled from %d drive%s", mddev, okcnt, okcnt==1?"":"s");
if (sparecnt)
fprintf(stderr, " and %d spare%s", sparecnt, sparecnt==1?"":"s");
- if (!enough(info.array.level, info.array.raid_disks,
- info.array.layout, 1, avail, okcnt))
+ if (!enough(content->array.level, content->array.raid_disks,
+ content->array.layout, 1, avail, okcnt))
fprintf(stderr, " - not enough to start the array.\n");
- else if (!enough(info.array.level,
- info.array.raid_disks,
- info.array.layout, clean,
+ else if (!enough(content->array.level,
+ content->array.raid_disks,
+ content->array.layout, clean,
avail, okcnt))
fprintf(stderr, " - not enough to start the "
"array while not clean - consider "
"--force.\n");
else {
- if (req_cnt == info.array.raid_disks)
+ if (req_cnt == content->array.raid_disks)
fprintf(stderr, " - need all %d to start it", req_cnt);
else
- fprintf(stderr, " - need %d of %d to start", req_cnt, info.array.raid_disks);
+ fprintf(stderr, " - need %d of %d to start", req_cnt, content->array.raid_disks);
fprintf(stderr, " (use --run to insist).\n");
}
}
- if (must_close) {
+ if (auto_assem)
ioctl(mdfd, STOP_ARRAY, NULL);
- close(mdfd);
- }
+ close(mdfd);
return 1;
} else {
/* The "chosen_drive" is a good choice, and if necessary, the superblock has
}
}
- if (must_close) close(mdfd);
+ close(mdfd);
return 0;
}
+
+#ifndef MDASSEMBLE
+/*
+ * assemble_container_content - add the member devices of one array that
+ * lives inside a container, and start it if enough devices are present.
+ *
+ * st           metadata handler; st->container_dev names the container
+ *              whose mdmon is checked for and started when needed.
+ * mdfd         open descriptor on the md device being assembled.  It is
+ *              closed on every return path of this function.
+ * content      description of the array; content->devs is the list of
+ *              member devices that are handed to sysfs_add_disk().
+ * runstop      when > 0 the array is started even if fewer than
+ *              content->array.working_disks devices are present.
+ * chosen_name  name recorded in the map and used in messages.
+ * verbose      messages are printed to stderr when >= 0.
+ *
+ * Returns 0 once a start has been attempted (even if writing
+ * "array_state" failed), and 1 if the array could not be set up, if no
+ * new device was added, or if too few devices were present to start it.
+ */
+int assemble_container_content(struct supertype *st, int mdfd,
+			       struct mdinfo *content, int runstop,
+			       char *chosen_name, int verbose)
+{
+	struct mdinfo *dev, *sra;
+	int working = 0, preexist = 0;
+	int need_setup;
+	struct map_ent *map = NULL;
+
+	sysfs_init(content, mdfd, 0);
+
+	/* Only (re)write the array parameters if the kernel does not
+	 * already report the metadata version we expect.
+	 */
+	sra = sysfs_read(mdfd, 0, GET_VERSION);
+	need_setup = (sra == NULL ||
+		      strcmp(sra->text_version, content->text_version) != 0);
+	/* Release 'sra' before any early return so that a failure of
+	 * sysfs_set_array() below does not leak it.
+	 */
+	if (sra)
+		sysfs_free(sra);
+	if (need_setup &&
+	    sysfs_set_array(content, md_get_version(mdfd)) != 0) {
+		close(mdfd);
+		return 1;
+	}
+
+	/* Count devices that were newly added, and those the kernel
+	 * reports (EEXIST) as already being part of the array.
+	 */
+	for (dev = content->devs; dev; dev = dev->next) {
+		if (sysfs_add_disk(content, dev, 1) == 0)
+			working++;
+		else if (errno == EEXIST)
+			preexist++;
+	}
+	if (working == 0) {
+		close(mdfd);
+		return 1;/* Nothing new, don't try to start */
+	}
+
+	map_update(&map, fd2devnum(mdfd),
+		   content->text_version,
+		   content->uuid, chosen_name);
+
+	if (runstop > 0 ||
+	    (working + preexist) >= content->array.working_disks) {
+		int err;
+
+		switch(content->array.level) {
+		case LEVEL_LINEAR:
+		case LEVEL_MULTIPATH:
+		case 0:
+			err = sysfs_set_str(content, NULL, "array_state",
+					    "active");
+			break;
+		default:
+			err = sysfs_set_str(content, NULL, "array_state",
+					    "readonly");
+			/* start mdmon if needed. */
+			if (!err) {
+				if (!mdmon_running(st->container_dev))
+					start_mdmon(st->container_dev);
+				/* NOTE(review): if devnum2devname() returns
+				 * allocated memory it is never freed here -
+				 * confirm against its definition.
+				 */
+				ping_monitor(devnum2devname(st->container_dev));
+			}
+			break;
+		}
+		if (!err)
+			sysfs_set_safemode(content, content->safe_mode_delay);
+		if (verbose >= 0) {
+			if (err)
+				fprintf(stderr, Name
+					": array %s now has %d devices",
+					chosen_name, working + preexist);
+			else
+				fprintf(stderr, Name
+					": Started %s with %d devices",
+					chosen_name, working + preexist);
+			if (preexist)
+				fprintf(stderr, " (%d new)", working);
+			fprintf(stderr, "\n");
+		}
+		if (!err)
+			wait_for(chosen_name, mdfd);
+		/* FIXME should have an O_EXCL and wait for read-auto */
+		close(mdfd);
+		return 0;
+	} else {
+		if (verbose >= 0)
+			fprintf(stderr, Name
+				": %s assembled with %d devices but "
+				"not started\n",
+				chosen_name, working);
+		close(mdfd);
+		return 1;
+	}
+}
+#endif
+
/*
* mdadm - manage Linux "md" devices aka RAID arrays.
*
- * Copyright (C) 2001-2006 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
*
*
* This program is free software; you can redistribute it and/or modify
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
* Author: Neil Brown
- * Email: <neilb@cse.unsw.edu.au>
- * Paper: Neil Brown
- * School of Computer Science and Engineering
- * The University of New South Wales
- * Sydney, 2052
- * Australia
+ * Email: <neilb@suse.de>
*/
#include "mdadm.h"
#define START_MD _IO (MD_MAJOR, 2)
#define STOP_MD _IO (MD_MAJOR, 3)
-int Build(char *mddev, int mdfd, int chunk, int level, int layout,
- int raiddisks,
- mddev_dev_t devlist, int assume_clean,
- char *bitmap_file, int bitmap_chunk, int write_behind, int delay,
- int verbose, unsigned long long size)
+int Build(char *mddev, int chunk, int level, int layout,
+ int raiddisks, mddev_dev_t devlist, int assume_clean,
+ char *bitmap_file, int bitmap_chunk, int write_behind,
+ int delay, int verbose, int autof, unsigned long long size)
{
/* Build a linear or raid0 arrays without superblocks
* We cannot really do any checks, we just do it.
mddev_dev_t dv;
int bitmap_fd;
unsigned long long bitmapsize;
+ int mdfd;
+ char chosen_name[1024];
+ int uuid[4] = {0,0,0,0};
+ struct map_ent *map = NULL;
/* scan all devices, make sure they really are block devices */
for (dv = devlist; dv; dv=dv->next) {
break;
}
+ /* We need to create the device. It can have no name. */
+ map_lock(&map);
+ mdfd = create_mddev(mddev, NULL, autof, LOCAL,
+ chosen_name);
+ if (mdfd < 0) {
+ map_unlock(&map);
+ return 1;
+ }
+ mddev = chosen_name;
+
+ map_update(&map, fd2devnum(mdfd), "none", uuid, chosen_name);
+ map_unlock(&map);
vers = md_get_version(mdfd);
if (ioctl(mdfd, SET_ARRAY_INFO, &array)) {
fprintf(stderr, Name ": SET_ARRAY_INFO failed for %s: %s\n",
mddev, strerror(errno));
- return 1;
+ goto abort;
}
} else if (bitmap_file) {
fprintf(stderr, Name ": bitmaps not supported with this kernel\n");
- return 1;
+ goto abort;
}
if (bitmap_file && level <= 0) {
fprintf(stderr, Name ": bitmaps not meaningful with level %s\n",
map_num(pers, level)?:"given");
- return 1;
+ goto abort;
}
/* now add the devices */
for ((i=0), (dv = devlist) ; dv ; i++, dv=dv->next) {
if (bitmap_chunk == UnSet) {
fprintf(stderr, Name ": %s cannot be openned.",
bitmap_file);
- return 1;
+ goto abort;
}
#endif
if (vers < 9003) {
bitmapsize = size>>9; /* FIXME wrong for RAID10 */
if (CreateBitmap(bitmap_file, 1, NULL, bitmap_chunk,
delay, write_behind, bitmapsize, major)) {
- return 1;
+ goto abort;
}
bitmap_fd = open(bitmap_file, O_RDWR);
if (bitmap_fd < 0) {
fprintf(stderr, Name ": %s cannot be openned.",
bitmap_file);
- return 1;
+ goto abort;
}
}
if (bitmap_fd >= 0) {
if (ioctl(mdfd, SET_BITMAP_FILE, bitmap_fd) < 0) {
fprintf(stderr, Name ": Cannot set bitmap file for %s: %s\n",
mddev, strerror(errno));
- return 1;
+ goto abort;
}
}
}
if (verbose >= 0)
fprintf(stderr, Name ": array %s built and started.\n",
mddev);
+ wait_for(mddev, mdfd);
+ close(mdfd);
return 0;
abort:
ioctl(mdfd, STOP_ARRAY, 0);
else
ioctl(mdfd, STOP_MD, 0);
+ close(mdfd);
return 1;
}
/*
* mdadm - manage Linux "md" devices aka RAID arrays.
*
- * Copyright (C) 2001-2006 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
*
*
* This program is free software; you can redistribute it and/or modify
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
* Author: Neil Brown
- * Email: <neilb@cse.unsw.edu.au>
- * Paper: Neil Brown
- * School of Computer Science and Engineering
- * The University of New South Wales
- * Sydney, 2052
- * Australia
+ * Email: <neilb@suse.de>
*/
#include "mdadm.h"
#include "md_p.h"
#include <ctype.h>
-int Create(struct supertype *st, char *mddev, int mdfd,
+/*
+ * default_layout - pick the layout to use when the user did not give one.
+ *
+ * st       metadata handler, may be NULL.  If it provides a
+ *          default_layout() method, that method is asked first.
+ * level    RAID level of the array being created.
+ * verbose  when > 0 the built-in default that was chosen is reported
+ *          on stderr.
+ *
+ * Returns the handler's answer unless that is UnSet (or there is no
+ * handler method), in which case the built-in per-level default is
+ * returned; levels without a layout get 0.
+ */
+static int default_layout(struct supertype *st, int level, int verbose)
+{
+	int layout = UnSet;
+
+	if (st && st->ss->default_layout)
+		layout = st->ss->default_layout(level);
+
+	if (layout == UnSet) {
+		switch(level) {
+		default: /* no layout */
+			layout = 0;
+			break;
+		case 10:
+			layout = 0x102; /* near=2, far=1 */
+			/* The message must name the layout actually set
+			 * above: two near copies is "n2".
+			 */
+			if (verbose > 0)
+				fprintf(stderr,
+					Name ": layout defaults to n2\n");
+			break;
+		case 5:
+		case 6:
+			layout = map_name(r5layout, "default");
+			if (verbose > 0)
+				fprintf(stderr,
+					Name ": layout defaults to %s\n", map_num(r5layout, layout));
+			break;
+		case LEVEL_FAULTY:
+			layout = map_name(faultylayout, "default");
+
+			if (verbose > 0)
+				fprintf(stderr,
+					Name ": layout defaults to %s\n", map_num(faultylayout, layout));
+			break;
+		}
+	}
+
+	return layout;
+}
+
+
+int Create(struct supertype *st, char *mddev,
int chunk, int level, int layout, unsigned long long size, int raiddisks, int sparedisks,
char *name, char *homehost, int *uuid,
int subdevs, mddev_dev_t devlist,
int runstop, int verbose, int force, int assume_clean,
- char *bitmap_file, int bitmap_chunk, int write_behind, int delay)
+ char *bitmap_file, int bitmap_chunk, int write_behind, int delay, int autof)
{
/*
* Create a new raid array.
* if runstop==run, or raiddisks disks were used,
* RUN_ARRAY
*/
+ int mdfd;
unsigned long long minsize=0, maxsize=0;
char *mindisc = NULL;
char *maxdisc = NULL;
int second_missing = subdevs * 2;
int missing_disks = 0;
int insert_point = subdevs * 2; /* where to insert a missing drive */
+ int total_slots;
int pass;
int vers;
int rv;
int bitmap_fd;
+ int have_container = 0;
+ int container_fd = -1;
+ int need_mdmon = 0;
unsigned long long bitmapsize;
- struct mdinfo info;
+ struct mdinfo info, *infos;
+ int did_default = 0;
+ int do_default_layout = 0;
+ unsigned long safe_mode_delay = 0;
+ char chosen_name[1024];
+ struct map_ent *map = NULL;
+ unsigned long long newsize;
int major_num = BITMAP_MAJOR_HI;
memset(&info, 0, sizeof(info));
- vers = md_get_version(mdfd);
- if (vers < 9000) {
- fprintf(stderr, Name ": Create requires md driver version 0.90.0 or later\n");
- return 1;
- } else {
- mdu_array_info_t inf;
- memset(&inf, 0, sizeof(inf));
- ioctl(mdfd, GET_ARRAY_INFO, &inf);
- if (inf.working_disks != 0) {
- fprintf(stderr, Name ": another array by this name"
- " is already running.\n");
- return 1;
- }
+ if (level == UnSet) {
+ /* "ddf" and "imsm" metadata only supports one level - should possibly
+ * push this into metadata handler??
+ */
+ if (st && (st->ss == &super_ddf || st->ss == &super_imsm))
+ level = LEVEL_CONTAINER;
}
+
if (level == UnSet) {
fprintf(stderr,
Name ": a RAID level is needed to create an array.\n");
Name ": This level does not support spare devices\n");
return 1;
}
+
+ if (subdevs == 1 && strcmp(devlist->devname, "missing") != 0) {
+ /* If given a single device, it might be a container, and we can
+ * extract a device list from there
+ */
+ mdu_array_info_t inf;
+ int fd;
+
+ memset(&inf, 0, sizeof(inf));
+ fd = open(devlist->devname, O_RDONLY);
+ if (fd >= 0 &&
+ ioctl(fd, GET_ARRAY_INFO, &inf) == 0 &&
+ inf.raid_disks == 0) {
+ /* yep, looks like a container */
+ if (st) {
+ rv = st->ss->load_super(st, fd,
+ devlist->devname);
+ if (rv == 0)
+ have_container = 1;
+ } else {
+ st = guess_super(fd);
+ if (st && !(rv = st->ss->
+ load_super(st, fd,
+ devlist->devname)))
+ have_container = 1;
+ else
+ st = NULL;
+ }
+ if (have_container) {
+ subdevs = raiddisks;
+ first_missing = subdevs * 2;
+ second_missing = subdevs * 2;
+ insert_point = subdevs * 2;
+ }
+ }
+ if (fd >= 0)
+ close(fd);
+ }
+ if (st && st->ss->external && sparedisks) {
+ fprintf(stderr,
+ Name ": This metadata type does not support "
+ "spare disks are create time\n");
+ return 1;
+ }
if (subdevs > raiddisks+sparedisks) {
fprintf(stderr, Name ": You have listed more devices (%d) than are in the array(%d)!\n", subdevs, raiddisks+sparedisks);
return 1;
}
- if (subdevs < raiddisks+sparedisks) {
+ if (!have_container && subdevs < raiddisks+sparedisks) {
fprintf(stderr, Name ": You haven't given enough devices (real or missing) to create this array\n");
return 1;
}
}
/* now set some defaults */
- if (layout == UnSet)
- switch(level) {
- default: /* no layout */
- layout = 0;
- break;
- case 10:
- layout = 0x102; /* near=2, far=1 */
- if (verbose > 0)
- fprintf(stderr,
- Name ": layout defaults to n1\n");
- break;
- case 5:
- case 6:
- layout = map_name(r5layout, "default");
- if (verbose > 0)
- fprintf(stderr,
- Name ": layout defaults to %s\n", map_num(r5layout, layout));
- break;
- case LEVEL_FAULTY:
- layout = map_name(faultylayout, "default");
- if (verbose > 0)
- fprintf(stderr,
- Name ": layout defaults to %s\n", map_num(faultylayout, layout));
- break;
- }
+
+ if (layout == UnSet) {
+ do_default_layout = 1;
+ layout = default_layout(st, level, verbose);
+ }
if (level == 10)
/* check layout fits in array*/
case 1:
case LEVEL_FAULTY:
case LEVEL_MULTIPATH:
+ case LEVEL_CONTAINER:
if (chunk) {
chunk = 0;
if (verbose > 0)
fprintf(stderr, Name ": unknown level %d\n", level);
return 1;
}
+
+ if (size && chunk)
+ size &= ~(unsigned long long)(chunk - 1);
+ newsize = size * 2;
+ if (st && ! st->ss->validate_geometry(st, level, layout, raiddisks,
+ chunk, size*2, NULL, &newsize, verbose>=0))
+ return 1;
+ if (size == 0) {
+ size = newsize / 2;
+ if (size && verbose > 0)
+ fprintf(stderr, Name ": setting size to %lluK\n",
+ (unsigned long long)size);
+ }
/* now look at the subdevs */
info.array.active_disks = 0;
info.array.working_disks = 0;
dnum = 0;
- for (dv=devlist; dv; dv=dv->next, dnum++) {
+ for (dv=devlist; dv && !have_container; dv=dv->next, dnum++) {
char *dname = dv->devname;
- unsigned long long ldsize, freesize;
- int fd;
+ unsigned long long freesize;
if (strcasecmp(dname, "missing")==0) {
if (first_missing > dnum)
first_missing = dnum;
info.array.working_disks++;
if (dnum < raiddisks)
info.array.active_disks++;
- fd = open(dname, O_RDONLY|O_EXCL);
- if (fd <0 ) {
- fprintf(stderr, Name ": Cannot open %s: %s\n",
- dname, strerror(errno));
- fail=1;
- continue;
- }
- if (!get_dev_size(fd, dname, &ldsize)) {
- fail = 1;
- close(fd);
- continue;
- }
if (st == NULL) {
struct createinfo *ci = conf_get_create_info();
if (ci)
}
if (st == NULL) {
/* Need to choose a default metadata, which is different
- * depending on the sizes of devices
+ * depending on geometry of array.
*/
int i;
char *name = "default";
- if (level >= 1 && ldsize > (0x7fffffffULL<<10))
- name = "default/large";
- for(i=0; !st && superlist[i]; i++)
+ for(i=0; !st && superlist[i]; i++) {
st = superlist[i]->match_metadata_desc(name);
+ if (do_default_layout)
+ layout = default_layout(st, level, verbose);
+ if (st && !st->ss->validate_geometry
+ (st, level, layout, raiddisks,
+ chunk, size*2, dname, &freesize,
+ verbose > 0))
+ st = NULL;
+ }
if (!st) {
- fprintf(stderr, Name ": internal error - no default metadata style\n");
+ fprintf(stderr, Name ": device %s not suitable "
+ "for any style of array\n",
+ dname);
exit(2);
}
- if (st->ss->major != 0 ||
+ if (st->ss != &super0 ||
st->minor_version != 90)
- fprintf(stderr, Name ": Defaulting to version"
- " %d.%d metadata\n",
- st->ss->major,
- st->minor_version);
- }
- freesize = st->ss->avail_size(st, ldsize >> 9);
- if (freesize == 0) {
- fprintf(stderr, Name ": %s is too small: %luK\n",
- dname, (unsigned long)(ldsize>>10));
- fail = 1;
- close(fd);
- continue;
+ did_default = 1;
+ } else {
+ if (do_default_layout)
+ layout = default_layout(st, level, verbose);
+ if (!st->ss->validate_geometry(st, level, layout,
+ raiddisks,
+ chunk, size*2, dname,
+ &freesize,
+ verbose > 0)) {
+
+ fprintf(stderr,
+ Name ": %s is not suitable for "
+ "this array.\n",
+ dname);
+ fail = 1;
+ continue;
+ }
}
freesize /= 2; /* convert to K */
}
if (size && freesize < size) {
- fprintf(stderr, Name ": %s is smaller that given size."
- " %lluK < %lluK + superblock\n", dname, freesize, size);
+ fprintf(stderr, Name ": %s is smaller than given size."
+ " %lluK < %lluK + metadata\n",
+ dname, freesize, size);
fail = 1;
- close(fd);
continue;
}
if (maxdisc == NULL || (maxdisc && freesize > maxsize)) {
minsize = freesize;
}
if (runstop != 1 || verbose >= 0) {
+ int fd = open(dname, O_RDONLY);
+ if (fd <0 ) {
+ fprintf(stderr, Name ": Cannot open %s: %s\n",
+ dname, strerror(errno));
+ fail=1;
+ continue;
+ }
warn |= check_ext2(fd, dname);
warn |= check_reiser(fd, dname);
warn |= check_raid(fd, dname);
+ close(fd);
}
- close(fd);
}
+ if (have_container)
+ info.array.working_disks = raiddisks;
if (fail) {
fprintf(stderr, Name ": create aborted\n");
return 1;
}
if (size == 0) {
- if (mindisc == NULL) {
+ if (mindisc == NULL && !have_container) {
fprintf(stderr, Name ": no size and no drives given - aborting create.\n");
return 1;
}
- if (level > 0 || level == LEVEL_MULTIPATH || level == LEVEL_FAULTY) {
+ if (level > 0 || level == LEVEL_MULTIPATH
+ || level == LEVEL_FAULTY
+ || st->ss->external ) {
/* size is meaningful */
- if (minsize > 0x100000000ULL && st->ss->major == 0) {
+ if (!st->ss->validate_geometry(st, level, layout,
+ raiddisks,
+ chunk, minsize*2,
+ NULL, NULL, 0)) {
fprintf(stderr, Name ": devices too large for RAID level %d\n", level);
return 1;
}
fprintf(stderr, Name ": size set to %lluK\n", size);
}
}
- if (level > 0 && ((maxsize-size)*100 > maxsize)) {
+ if (!have_container && level > 0 && ((maxsize-size)*100 > maxsize)) {
if (runstop != 1 || verbose >= 0)
- fprintf(stderr, Name ": largest drive (%s) exceed size (%lluK) by more than 1%%\n",
+ fprintf(stderr, Name ": largest drive (%s) exceeds size (%lluK) by more than 1%%\n",
maxdisc, size);
warn = 1;
}
+ if (st->ss->detail_platform && st->ss->detail_platform(0, 1) != 0) {
+ if (runstop != 1 || verbose >= 0)
+ fprintf(stderr, Name ": %s unable to enumerate platform support\n"
+ " array may not be compatible with hardware/firmware\n",
+ st->ss->name);
+ warn = 1;
+ }
+
if (warn) {
if (runstop!= 1) {
if (!ask("Continue creating array? ")) {
* as missing, so that a reconstruct happens (faster than re-parity)
* FIX: Can we do this for raid6 as well?
*/
- if (assume_clean==0 && force == 0 && first_missing >= raiddisks) {
+ if (st->ss->external == 0 &&
+ assume_clean==0 && force == 0 && first_missing >= raiddisks) {
switch ( level ) {
case 4:
case 5:
* into a spare, else the create will fail
*/
if (assume_clean == 0 && force == 0 && first_missing < raiddisks &&
+ st->ss->external == 0 &&
second_missing >= raiddisks && level == 6) {
insert_point = raiddisks - 1;
if (insert_point == first_missing)
missing_disks++;
}
- if (level <= 0 && first_missing != subdevs * 2) {
+ if (level <= 0 && first_missing < subdevs * 2) {
fprintf(stderr,
Name ": This level does not support missing devices\n");
return 1;
}
+ /* We need to create the device */
+ map_lock(&map);
+ mdfd = create_mddev(mddev, name, autof, LOCAL, chosen_name);
+ if (mdfd < 0)
+ return 1;
+ mddev = chosen_name;
+
+ vers = md_get_version(mdfd);
+ if (vers < 9000) {
+ fprintf(stderr, Name ": Create requires md driver version 0.90.0 or later\n");
+ goto abort;
+ } else {
+ mdu_array_info_t inf;
+ memset(&inf, 0, sizeof(inf));
+ ioctl(mdfd, GET_ARRAY_INFO, &inf);
+ if (inf.working_disks != 0) {
+ fprintf(stderr, Name ": another array by this name"
+ " is already running.\n");
+ goto abort;
+ }
+ }
+
/* Ok, lets try some ioctls */
info.array.level = level;
( level == 6 && (insert_point < raiddisks
|| second_missing < raiddisks))
||
+ ( level <= 0 )
+ ||
assume_clean
- )
+ ) {
info.array.state = 1; /* clean, but one+ drive will be missing*/
- else
+ info.resync_start = ~0ULL;
+ } else {
info.array.state = 0; /* not clean, but no errors */
-
+ info.resync_start = 0;
+ }
if (level == 10) {
/* for raid10, the bitmap size is the capacity of the array,
* which is array.size * raid_disks / ncopies;
+ info.array.failed_disks;
info.array.layout = layout;
info.array.chunk_size = chunk*1024;
- info.array.major_version = st->ss->major;
if (name == NULL || *name == 0) {
/* base name on mddev */
* /dev/md/home -> home
* /dev/mdhome -> home
*/
+ /* FIXME compare this with rules in create_mddev */
name = strrchr(mddev, '/');
if (name) {
name++;
}
}
if (!st->ss->init_super(st, &info.array, size, name, homehost, uuid))
- return 1;
+ goto abort;
+
+ total_slots = info.array.nr_disks;
+ sysfs_init(&info, mdfd, 0);
+ st->ss->getinfo_super(st, &info);
+
+ if (did_default && verbose >= 0) {
+ if (is_subarray(info.text_version)) {
+ int dnum = devname2devnum(info.text_version+1);
+ char *path;
+ int mdp = get_mdp_major();
+ struct mdinfo *mdi;
+ if (dnum > 0)
+ path = map_dev(MD_MAJOR, dnum, 1);
+ else
+ path = map_dev(mdp, (-1-dnum)<< 6, 1);
+
+ mdi = sysfs_read(-1, dnum, GET_VERSION);
+
+ fprintf(stderr, Name ": Creating array inside "
+ "%s container %s\n",
+ mdi?mdi->text_version:"managed", path);
+ sysfs_free(mdi);
+ } else
+ fprintf(stderr, Name ": Defaulting to version"
+ " %s metadata\n", info.text_version);
+ }
+
+ map_update(&map, fd2devnum(mdfd), info.text_version,
+ info.uuid, chosen_name);
+ map_unlock(&map);
if (bitmap_file && vers < 9003) {
major_num = BITMAP_MAJOR_HOSTENDIAN;
if (bitmap_file && strcmp(bitmap_file, "internal")==0) {
if ((vers%100) < 2) {
fprintf(stderr, Name ": internal bitmaps not supported by this kernel.\n");
- return 1;
+ goto abort;
}
if (!st->ss->add_internal_bitmap(st, &bitmap_chunk,
delay, write_behind,
bitmapsize, 1, major_num)) {
fprintf(stderr, Name ": Given bitmap chunk size not supported.\n");
- return 1;
+ goto abort;
}
bitmap_file = NULL;
}
+ sysfs_init(&info, mdfd, 0);
- if ((vers % 100) >= 1) { /* can use different versions */
- mdu_array_info_t inf;
- memset(&inf, 0, sizeof(inf));
- inf.major_version = st->ss->major;
- inf.minor_version = st->minor_version;
- rv = ioctl(mdfd, SET_ARRAY_INFO, &inf);
- } else
- rv = ioctl(mdfd, SET_ARRAY_INFO, NULL);
+ if (st->ss->external && st->subarray[0]) {
+ /* member */
+
+ /* When creating a member, we need to be careful
+ * to negotiate with mdmon properly.
+ * If it is already running, we cannot write to
+ * the devices and must ask it to do that part.
+ * If it isn't running, we write to the devices,
+ * and then start it.
+ * We hold an exclusive open on the container
+ * device to make sure mdmon doesn't exit after
+ * we checked that it is running.
+ *
+ * For now, fail if it is already running.
+ */
+ container_fd = open_dev_excl(st->container_dev);
+ if (container_fd < 0) {
+ fprintf(stderr, Name ": Cannot get exclusive "
+ "open on container - weird.\n");
+ goto abort;
+ }
+ if (mdmon_running(st->container_dev)) {
+ if (verbose)
+ fprintf(stderr, Name ": reusing mdmon "
+ "for %s.\n",
+ devnum2devname(st->container_dev));
+ st->update_tail = &st->updates;
+ } else
+ need_mdmon = 1;
+ }
+ rv = set_array_info(mdfd, st, &info);
if (rv) {
- fprintf(stderr, Name ": SET_ARRAY_INFO failed for %s: %s\n",
+ fprintf(stderr, Name ": failed to set array info for %s: %s\n",
mddev, strerror(errno));
- return 1;
+ goto abort;
}
if (bitmap_file) {
delay, write_behind,
bitmapsize,
major_num)) {
- return 1;
+ goto abort;
}
bitmap_fd = open(bitmap_file, O_RDWR);
if (bitmap_fd < 0) {
fprintf(stderr, Name ": weird: %s cannot be openned\n",
bitmap_file);
- return 1;
+ goto abort;
}
if (ioctl(mdfd, SET_BITMAP_FILE, bitmap_fd) < 0) {
fprintf(stderr, Name ": Cannot set bitmap file for %s: %s\n",
mddev, strerror(errno));
- return 1;
+ goto abort;
}
}
-
+ infos = malloc(sizeof(*infos) * total_slots);
for (pass=1; pass <=2 ; pass++) {
mddev_dev_t moved_disk = NULL; /* the disk that was moved out of the insert point */
dv=(dv->next)?(dv->next):moved_disk, dnum++) {
int fd;
struct stat stb;
+ struct mdinfo *inf = &infos[dnum];
- info.disk.number = dnum;
+ if (dnum >= total_slots)
+ abort();
if (dnum == insert_point) {
moved_disk = dv;
+ continue;
}
- info.disk.raid_disk = info.disk.number;
- if (info.disk.raid_disk < raiddisks)
- info.disk.state = (1<<MD_DISK_ACTIVE) |
+ if (strcasecmp(dv->devname, "missing")==0)
+ continue;
+ if (have_container)
+ moved_disk = NULL;
+ if (have_container && dnum < info.array.raid_disks - 1)
+ /* repeatedly use the container */
+ moved_disk = dv;
+
+ switch(pass) {
+ case 1:
+ *inf = info;
+
+ inf->disk.number = dnum;
+ inf->disk.raid_disk = dnum;
+ if (inf->disk.raid_disk < raiddisks)
+ inf->disk.state = (1<<MD_DISK_ACTIVE) |
(1<<MD_DISK_SYNC);
- else
- info.disk.state = 0;
- if (dv->writemostly == 1)
- info.disk.state |= (1<<MD_DISK_WRITEMOSTLY);
-
- if (dnum == insert_point ||
- strcasecmp(dv->devname, "missing")==0) {
- info.disk.major = 0;
- info.disk.minor = 0;
- info.disk.state = (1<<MD_DISK_FAULTY);
- } else {
- fd = open(dv->devname, O_RDONLY|O_EXCL);
- if (fd < 0) {
- fprintf(stderr, Name ": failed to open %s after earlier success - aborting\n",
- dv->devname);
- return 1;
+ else
+ inf->disk.state = 0;
+
+ if (dv->writemostly == 1)
+ inf->disk.state |= (1<<MD_DISK_WRITEMOSTLY);
+
+ if (have_container)
+ fd = -1;
+ else {
+ if (st->ss->external && st->subarray[0])
+ fd = open(dv->devname, O_RDWR);
+ else
+ fd = open(dv->devname, O_RDWR|O_EXCL);
+
+ if (fd < 0) {
+ fprintf(stderr, Name ": failed to open %s "
+ "after earlier success - aborting\n",
+ dv->devname);
+ goto abort;
+ }
+ fstat(fd, &stb);
+ inf->disk.major = major(stb.st_rdev);
+ inf->disk.minor = minor(stb.st_rdev);
+ }
+ if (fd >= 0)
+ remove_partitions(fd);
+ if (st->ss->add_to_super(st, &inf->disk,
+ fd, dv->devname))
+ goto abort;
+ st->ss->getinfo_super(st, inf);
+ safe_mode_delay = inf->safe_mode_delay;
+
+ if (have_container && verbose > 0)
+ fprintf(stderr, Name ": Using %s for device %d\n",
+ map_dev(inf->disk.major,
+ inf->disk.minor,
+ 0), dnum);
+
+ if (!have_container) {
+ /* getinfo_super might have lost these ... */
+ inf->disk.major = major(stb.st_rdev);
+ inf->disk.minor = minor(stb.st_rdev);
}
- fstat(fd, &stb);
- info.disk.major = major(stb.st_rdev);
- info.disk.minor = minor(stb.st_rdev);
- remove_partitions(fd);
- close(fd);
- }
- switch(pass){
- case 1:
- st->ss->add_to_super(st, &info.disk);
break;
case 2:
- if (info.disk.state == 1) break;
- Kill(dv->devname, 0, 1); /* Just be sure it is clean */
- Kill(dv->devname, 0, 1); /* and again, there could be two superblocks */
- st->ss->write_init_super(st, &info.disk,
- dv->devname);
-
- if (ioctl(mdfd, ADD_NEW_DISK, &info.disk)) {
- fprintf(stderr, Name ": ADD_NEW_DISK for %s failed: %s\n",
+ inf->errors = 0;
+ rv = 0;
+
+ rv = add_disk(mdfd, st, &info, inf);
+
+ if (rv) {
+ fprintf(stderr,
+ Name ": ADD_NEW_DISK for %s "
+ "failed: %s\n",
dv->devname, strerror(errno));
st->ss->free_super(st);
- return 1;
+ goto abort;
}
-
break;
}
- if (dv == moved_disk && dnum != insert_point) break;
+ if (!have_container &&
+ dv == moved_disk && dnum != insert_point) break;
+ }
+ if (pass == 1) {
+ st->ss->write_init_super(st);
+ flush_metadata_updates(st);
}
}
+ free(infos);
st->ss->free_super(st);
- /* param is not actually used */
- if (runstop == 1 || subdevs >= raiddisks) {
- mdu_param_t param;
- if (ioctl(mdfd, RUN_ARRAY, ¶m)) {
- fprintf(stderr, Name ": RUN_ARRAY failed: %s\n",
- strerror(errno));
- Manage_runstop(mddev, mdfd, -1, 0);
- return 1;
+ if (level == LEVEL_CONTAINER) {
+ /* No need to start. But we should signal udev to
+ * create links */
+ sysfs_uevent(&info, "change");
+ if (verbose >= 0)
+ fprintf(stderr, Name ": container %s prepared.\n", mddev);
+ wait_for(chosen_name, mdfd);
+ } else if (runstop == 1 || subdevs >= raiddisks) {
+ if (st->ss->external) {
+ switch(level) {
+ case LEVEL_LINEAR:
+ case LEVEL_MULTIPATH:
+ case 0:
+ sysfs_set_str(&info, NULL, "array_state",
+ "active");
+ need_mdmon = 0;
+ break;
+ default:
+ sysfs_set_str(&info, NULL, "array_state",
+ "readonly");
+ break;
+ }
+ sysfs_set_safemode(&info, safe_mode_delay);
+ } else {
+ /* param is not actually used */
+ mdu_param_t param;
+ if (ioctl(mdfd, RUN_ARRAY, ¶m)) {
+ fprintf(stderr, Name ": RUN_ARRAY failed: %s\n",
+ strerror(errno));
+ Manage_runstop(mddev, mdfd, -1, 0);
+ goto abort;
+ }
}
if (verbose >= 0)
fprintf(stderr, Name ": array %s started.\n", mddev);
+ if (st->ss->external && st->subarray[0]) {
+ if (need_mdmon)
+ start_mdmon(st->container_dev);
+
+ ping_monitor(devnum2devname(st->container_dev));
+ close(container_fd);
+ }
+ wait_for(chosen_name, mdfd);
} else {
fprintf(stderr, Name ": not starting array - not enough devices.\n");
}
+ close(mdfd);
return 0;
+
+ abort:
+ if (mdfd >= 0)
+ close(mdfd);
+ return 1;
}
/*
* mdadm - manage Linux "md" devices aka RAID arrays.
*
- * Copyright (C) 2001-2006 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
*
*
* This program is free software; you can redistribute it and/or modify
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
* Author: Neil Brown
- * Email: <neilb@cse.unsw.edu.au>
- * Paper: Neil Brown
- * School of Computer Science and Engineering
- * The University of New South Wales
- * Sydney, 2052
- * Australia
+ * Email: <neilb@suse.de>
*/
#include "mdadm.h"
#include "md_p.h"
#include "md_u.h"
+#include <dirent.h>
int Detail(char *dev, int brief, int export, int test, char *homehost)
{
int max_disks = MD_SB_DISKS; /* just a default */
struct mdinfo info;
struct mdinfo *sra;
+ char *member = NULL;
+ char *container = NULL;
int rv = test ? 4 : 1;
int avail_disks = 0;
stb.st_rdev = 0;
rv = 0;
- if (st) max_disks = st->max_devs;
+ if (st)
+ max_disks = st->max_devs;
+
+ if (sra && is_subarray(sra->text_version) &&
+ strchr(sra->text_version+1, '/')) {
+ /* This is a subarray of some container.
+ * We want the name of the container, and the member
+ */
+ char *s = strchr(sra->text_version+1, '/');
+ int dn;
+ *s++ = '\0';
+ member = s;
+ dn = devname2devnum(sra->text_version+1);
+ container = map_dev(dev2major(dn), dev2minor(dn), 1);
+ }
/* try to load a superblock */
for (d= 0; d<max_disks; d++) {
continue;
if ((dv=map_dev(disk.major, disk.minor, 1))) {
if ((!st || !st->sb) &&
- (disk.state & (1<<MD_DISK_ACTIVE))) {
+ (array.raid_disks == 0 ||
+ (disk.state & (1<<MD_DISK_ACTIVE)))) {
/* try to read the superblock from this device
* to get more info
*/
if (fd2 >=0 && st &&
st->ss->load_super(st, fd2, NULL) == 0) {
st->ss->getinfo_super(st, &info);
- if (info.array.ctime != array.ctime ||
- info.array.level != array.level)
+ if (array.raid_disks != 0 && /* container */
+ (info.array.ctime != array.ctime ||
+ info.array.level != array.level))
st->ss->free_super(st);
}
if (fd2 >= 0) close(fd2);
c = map_num(pers, array.level);
if (export) {
- if (c)
- printf("MD_LEVEL=%s\n", c);
- printf("MD_DEVICES=%d\n", array.raid_disks);
- if (sra && sra->array.major_version < 0)
- printf("MD_METADATA=%s\n", sra->text_version);
- else
- printf("MD_METADATA=%d.%02d\n",
- array.major_version, array.minor_version);
+ if (array.raid_disks) {
+ if (c)
+ printf("MD_LEVEL=%s\n", c);
+ printf("MD_DEVICES=%d\n", array.raid_disks);
+ } else {
+ printf("MD_LEVEL=container\n");
+ printf("MD_DEVICES=%d\n", array.nr_disks);
+ }
+ if (container) {
+ printf("MD_CONTAINER=%s\n", container);
+ printf("MD_MEMBER=%s\n", member);
+ } else {
+ if (sra && sra->array.major_version < 0)
+ printf("MD_METADATA=%s\n", sra->text_version);
+ else
+ printf("MD_METADATA=%d.%02d\n",
+ array.major_version, array.minor_version);
+ }
+
+ if (st && st->sb) {
+ struct mdinfo info;
+ char nbuf[64];
+ struct map_ent *mp, *map = NULL;
+ st->ss->getinfo_super(st, &info);
+ fname_from_uuid(st, &info, nbuf, ':');
+ printf("MD_UUID=%s\n", nbuf+5);
+ mp = map_by_uuid(&map, info.uuid);
+ if (mp && mp->path &&
+ strncmp(mp->path, "/dev/md/", 8) == 0)
+ printf("MD_DEVNAME=%s\n", mp->path+8);
- if (st && st->sb)
- st->ss->export_detail_super(st);
+ if (st->ss->export_detail_super)
+ st->ss->export_detail_super(st);
+ } else {
+ struct map_ent *mp, *map = NULL;
+ mp = map_by_devnum(&map, fd2devnum(fd));
+ if (mp && mp->path &&
+ strncmp(mp->path, "/dev/md/", 8) == 0)
+ printf("MD_DEVNAME=%s\n", mp->path+8);
+ }
goto out;
}
if (brief) {
mdu_bitmap_file_t bmf;
- printf("ARRAY %s level=%s num-devices=%d", dev,
- c?c:"-unknown-",
- array.raid_disks );
- if (sra && sra->array.major_version < 0)
- printf(" metadata=%s", sra->text_version);
- else
- printf(" metadata=%d.%02d",
- array.major_version, array.minor_version);
+ printf("ARRAY %s", dev);
+ if (brief > 1) {
+ 	/* Verbose brief output: add the level and device count.
+ 	 * Each field starts with its own space so that it stays
+ 	 * separated from the "ARRAY <dev>" text already printed,
+ 	 * in the same way as the container=/member=/metadata=
+ 	 * fields that follow.
+ 	 */
+ 	if (array.raid_disks)
+ 		printf(" level=%s num-devices=%d",
+ 		       c?c:"-unknown-",
+ 		       array.raid_disks );
+ 	else
+ 		/* raid_disks == 0 is reported as a container */
+ 		printf(" level=container num-devices=%d",
+ 		       array.nr_disks);
+ }
+ if (container) {
+ printf(" container=%s", container);
+ printf(" member=%s", member);
+ } else {
+ if (sra && sra->array.major_version < 0)
+ printf(" metadata=%s", sra->text_version);
+ else
+ printf(" metadata=%d.%02d",
+ array.major_version, array.minor_version);
+ }
/* Only try GET_BITMAP_FILE for 0.90.01 and later */
if (vers >= 9001 &&
printf("%s:\n", dev);
+ if (container)
+ printf(" Container : %s, member %s\n", container, member);
+ else {
if (sra && sra->array.major_version < 0)
printf(" Version : %s\n", sra->text_version);
else
printf(" Version : %d.%02d\n",
array.major_version, array.minor_version);
+ }
atime = array.ctime;
- printf(" Creation Time : %.24s\n", ctime(&atime));
+ if (atime)
+ printf(" Creation Time : %.24s\n", ctime(&atime));
if (array.raid_disks == 0) c = "container";
printf(" Raid Level : %s\n", c?c:"-unknown-");
if (larray_size)
printf(" Used Dev Size : %d%s\n", array.size,
human_size((long long)array.size<<10));
}
- printf(" Raid Devices : %d\n", array.raid_disks);
+ if (array.raid_disks)
+ printf(" Raid Devices : %d\n", array.raid_disks);
printf(" Total Devices : %d\n", array.nr_disks);
- printf("Preferred Minor : %d\n", array.md_minor);
+ if (!container &&
+ ((sra == NULL && array.major_version == 0) ||
+ (sra && sra->array.major_version == 0)))
+ printf("Preferred Minor : %d\n", array.md_minor);
if (sra == NULL || sra->array.major_version >= 0)
printf(" Persistence : Superblock is %spersistent\n",
array.not_persistent?"not ":"");
} else if (array.state & (1<<MD_SB_BITMAP_PRESENT))
printf(" Intent Bitmap : Internal\n\n");
atime = array.utime;
- printf(" Update Time : %.24s\n", ctime(&atime));
- printf(" State : %s%s%s%s\n",
- (array.state&(1<<MD_SB_CLEAN))?"clean":"active",
- array.active_disks < array.raid_disks? ", degraded":"",
- (!e || e->percent < 0) ? "" :
- (e->resync) ? ", resyncing": ", recovering",
- larray_size ? "": ", Not Started");
- printf(" Active Devices : %d\n", array.active_disks);
+ if (atime)
+ printf(" Update Time : %.24s\n", ctime(&atime));
+ if (array.raid_disks)
+ printf(" State : %s%s%s%s\n",
+ (array.state&(1<<MD_SB_CLEAN))?"clean":"active",
+ array.active_disks < array.raid_disks? ", degraded":"",
+ (!e || e->percent < 0) ? "" :
+ (e->resync) ? ", resyncing": ", recovering",
+ larray_size ? "": ", Not Started");
+ if (array.raid_disks)
+ printf(" Active Devices : %d\n", array.active_disks);
printf("Working Devices : %d\n", array.working_disks);
- printf(" Failed Devices : %d\n", array.failed_disks);
- printf(" Spare Devices : %d\n", array.spare_disks);
+ if (array.raid_disks) {
+ printf(" Failed Devices : %d\n", array.failed_disks);
+ printf(" Spare Devices : %d\n", array.spare_disks);
+ }
printf("\n");
if (array.level == 5) {
c = map_num(r5layout, array.layout);
if (st && st->sb)
st->ss->detail_super(st, homehost);
- printf(" Number Major Minor RaidDevice State\n");
+ if (array.raid_disks == 0 && sra && sra->array.major_version == -1
+     && sra->array.minor_version == -2 && sra->text_version[0] != '/') {
+ 	/* This looks like a container.  Find any active arrays
+ 	 * that claim to be a member, by scanning every block
+ 	 * device's md/metadata_version attribute in sysfs.
+ 	 */
+ 	DIR *dir = opendir("/sys/block");
+ 	struct dirent *de;
+
+ 	printf(" Member Arrays :");
+
+ 	while (dir && (de = readdir(dir)) != NULL) {
+ 		char path[200];
+ 		char vbuf[1024];
+ 		int nlen = strlen(sra->sys_name);
+ 		int len;
+ 		int dn;
+ 		if (de->d_name[0] == '.')
+ 			continue;
+ 		/* d_name can be longer than path[] has room for, so
+ 		 * bound the write and skip names that do not fit.
+ 		 */
+ 		len = snprintf(path, sizeof(path),
+ 			       "/sys/block/%s/md/metadata_version",
+ 			       de->d_name);
+ 		if (len < 0 || len >= (int)sizeof(path))
+ 			continue;
+ 		if (load_sys(path, vbuf) < 0)
+ 			continue;
+ 		/* Accept only "external:" followed by a subarray
+ 		 * reference whose container part is exactly our own
+ 		 * sys_name.  The subarray test must look at the text
+ 		 * just read into vbuf (after "external:"), not at
+ 		 * sys_name, which is normally shorter than 9 bytes.
+ 		 */
+ 		if (strncmp(vbuf, "external:", 9) != 0 ||
+ 		    !is_subarray(vbuf+9) ||
+ 		    strncmp(vbuf+10, sra->sys_name, nlen) != 0 ||
+ 		    vbuf[10+nlen] != '/')
+ 			continue;
+ 		dn = devname2devnum(de->d_name);
+ 		printf(" %s", map_dev(dev2major(dn),
+ 				      dev2minor(dn), 1));
+ 	}
+ 	if (dir)
+ 		closedir(dir);
+ 	printf("\n\n");
+ }
+
+ if (array.raid_disks)
+ printf(" Number Major Minor RaidDevice State\n");
+ else
+ printf(" Number Major Minor RaidDevice\n");
}
disks = malloc(max_disks * sizeof(mdu_disk_info_t));
for (d=0; d<max_disks; d++) {
else
printf(" %5d %5d %5d %5d ",
disk.number, disk.major, disk.minor, disk.raid_disk);
+ }
+ if (!brief && array.raid_disks) {
+
if (disk.state & (1<<MD_DISK_FAULTY)) {
printf(" faulty");
if (disk.raid_disk < array.raid_disks &&
}
if (!brief) printf("\n");
}
- if (spares && brief) printf(" spares=%d", spares);
+ if (spares && brief && array.raid_disks) printf(" spares=%d", spares);
if (brief && st && st->sb)
st->ss->brief_detail_super(st);
st->ss->free_super(st);
close(fd);
return rv;
}
+
+int Detail_Platform(struct superswitch *ss, int scan, int verbose)
+{
+	/* Report platform capabilities for the metadata format 'ss'.
+	 * With 'scan' set, every other format found in superlist is
+	 * reported as well.
+	 * The result starts out as 1; it is replaced by the return
+	 * value of ss->detail_platform when that handler exists, and
+	 * the results of scanned handlers are OR-ed into it.
+	 */
+	int result = 1;
+	int idx;
+
+	if (ss) {
+		if (ss->detail_platform)
+			result = ss->detail_platform(verbose, 0);
+		else if (verbose)
+			fprintf(stderr, Name ": %s metadata is platform independent\n",
+				ss->name ? ss->name : "[no name]");
+	} else if (!scan && verbose)
+		fprintf(stderr, Name ": specify a metadata type or --scan\n");
+
+	if (!scan)
+		return result;
+
+	for (idx = 0; superlist[idx] != NULL; idx++) {
+		struct superswitch *meta = superlist[idx];
+
+		/* 'ss' itself was already handled above */
+		if (meta == ss)
+			continue;
+
+		if (verbose)
+			fprintf(stderr, Name ": checking metadata %s\n",
+				meta->name ? meta->name : "[no name]");
+
+		if (meta->detail_platform)
+			result |= meta->detail_platform(verbose, 0);
+		else if (verbose)
+			fprintf(stderr, Name ": %s metadata is platform independent\n",
+				meta->name ? meta->name : "[no name]");
+	}
+
+	return result;
+}
/*
* mdadm - manage Linux "md" devices aka RAID arrays.
*
- * Copyright (C) 2001-2006 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
*
*
* This program is free software; you can redistribute it and/or modify
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
* Author: Neil Brown
- * Email: <neilb@cse.unsw.edu.au>
- * Paper: Neil Brown
- * School of Computer Science and Engineering
- * The University of New South Wales
- * Sydney, 2052
- * Australia
+ * Email: <neilb@suse.de>
*/
#include "mdadm.h"
st->ss->getinfo_super(st, &ap->info);
st->ss->free_super(st);
}
- if (!(ap->info.disk.state & MD_DISK_SYNC))
+ if (!(ap->info.disk.state & (1<<MD_DISK_SYNC)))
ap->spares++;
d = dl_strdup(devlist->devname);
dl_add(ap->devs, d);
} else if (export) {
- st->ss->export_examine_super(st);
+ if (st->ss->export_examine_super)
+ st->ss->export_examine_super(st);
} else {
printf("%s:\n",devlist->devname);
st->ss->examine_super(st, homehost);
for (ap=arrays; ap; ap=ap->next) {
char sep='=';
char *d;
- ap->st->ss->brief_examine_super(ap->st);
+ ap->st->ss->brief_examine_super(ap->st, brief > 1);
if (ap->spares) printf(" spares=%d", ap->spares);
if (brief > 1) {
printf(" devices");
/*
* mdadm - manage Linux "md" devices aka RAID arrays.
*
- * Copyright (C) 2001-2006 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
*
*
* This program is free software; you can redistribute it and/or modify
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
* Author: Neil Brown
- * Email: <neilb@cse.unsw.edu.au>
- * Paper: Neil Brown
- * School of Computer Science and Engineering
- * The University of New South Wales
- * Sydney, 2052
- * Australia
+ * Email: <neilb@suse.de>
*/
#include "mdadm.h"
#include "dlink.h"
return 1;
}
- nfd = open(newdev, O_RDWR|O_EXCL);
+ nfd = open(newdev, O_RDWR|O_EXCL|O_DIRECT);
if (nfd < 0) {
fprintf(stderr, Name ": cannot open %s\n", newdev);
return 1;
__u64 arraystart;
__u64 length;
__u32 sb_csum; /* csum of preceeding bytes. */
-};
+ __u8 pad[512-68];
+} __attribute__((aligned(512))) bsb;
int bsb_csum(char *buf, int len)
{
struct mdu_array_info_s array;
char *c;
- struct mdp_backup_super bsb;
struct supertype *st;
int nlevel, olevel;
* a leading superblock 4K earlier.
*/
for (i=array.raid_disks; i<d; i++) {
- char buf[4096];
+ char abuf[4096+512];
+ char *buf = (char*)(((unsigned long)abuf+511)& ~511);
if (i==d-1 && backup_file) {
/* This is the backup file */
offsets[i] = 8;
fprintf(stderr, Name ": could not seek...\n");
goto abort;
}
- memset(buf, 0, sizeof(buf));
+ memset(buf, 0, 4096);
bsb.devstart = __cpu_to_le64(offsets[i]);
bsb.sb_csum = bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum)-((char*)&bsb));
memcpy(buf, &bsb, sizeof(bsb));
if (lseek64(fdlist[i], (offsets[i]+last_block)<<9, 0) < 0 ||
write(fdlist[i], &bsb, sizeof(bsb)) != sizeof(bsb) ||
fsync(fdlist[i]) != 0) {
- fprintf(stderr, Name ": %s: fail to save metadata for critical region backups.\n",
+ fprintf(stderr, Name ": %s: failed to save metadata for critical region backups.\n",
devname);
goto abort_resume;
}
/* wait for reshape to pass the critical region */
while(1) {
unsigned long long comp;
+
if (sysfs_get_ll(sra, NULL, "sync_completed", &comp)<0) {
sleep(5);
break;
}
if (comp >= nstripe)
break;
+ if (comp == 0) {
+ /* Maybe it finished already */
+ char action[20];
+ if (sysfs_get_str(sra, NULL, "sync_action",
+ action, 20) > 0 &&
+ strncmp(action, "reshape", 7) != 0)
+ break;
+ }
sleep(1);
}
for (i=old_disks-(backup_file?1:0); i<cnt; i++) {
struct mdinfo dinfo;
- struct mdp_backup_super bsb;
char buf[4096];
int fd;
* Incremental.c - support --incremental. Part of:
* mdadm - manage Linux "md" devices aka RAID arrays.
*
- * Copyright (C) 2006 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2006-2009 Neil Brown <neilb@suse.de>
*
*
* This program is free software; you can redistribute it and/or modify
char *array_name);
int Incremental(char *devname, int verbose, int runstop,
- struct supertype *st, char *homehost, int autof)
+ struct supertype *st, char *homehost, int require_homehost,
+ int autof)
{
/* Add this device to an array, creating the array if necessary
* and starting the array if sensible or - if runstop>0 - if possible.
* 2/ Find metadata, reject if none appropriate (check
* version/name from args)
* 3/ Check if there is a match in mdadm.conf
- * 3a/ if not, check for homehost match. If no match, reject.
+ * 3a/ if not, check for homehost match. If no match, assemble as
+ * a 'foreign' array.
* 4/ Determine device number.
* - If in mdadm.conf with std name, use that
* - UUID in /var/run/mdadm.map use that
* - Choose a free, high number.
* - Use a partitioned device unless strong suggestion not to.
* e.g. auto=md
+ * Don't choose partitioned for containers.
* 5/ Find out if array already exists
* 5a/ if it does not
* - choose a name, from mdadm.conf or 'name' field in array.
* - add the device
* 6/ Make sure /var/run/mdadm.map contains this array.
* 7/ Is there enough devices to possibly start the array?
+ * For a container, this means running Incremental_container.
* 7a/ if not, finish with success.
* 7b/ if yes,
* - read all metadata and arrange devices like -A does
* start the array (auto-readonly).
*/
struct stat stb;
- struct mdinfo info, info2;
+ struct mdinfo info;
struct mddev_ident_s *array_list, *match;
char chosen_name[1024];
int rv;
- int devnum;
struct map_ent *mp, *map = NULL;
int dfd, mdfd;
char *avail;
int active_disks;
+ int trustworthy = FOREIGN;
+ char *name_to_use;
+ mdu_array_info_t ainf;
+
struct createinfo *ci = conf_get_create_info();
- char *name;
- /* 1/ Check if devices is permitted by mdadm.conf */
+ /* 1/ Check if device is permitted by mdadm.conf */
if (!conf_test_dev(devname)) {
if (verbose >= 0)
close(dfd);
return 1;
}
- st->ss->getinfo_super(st, &info);
close (dfd);
+ memset(&info, 0, sizeof(info));
+ st->ss->getinfo_super(st, &info);
/* 3/ Check if there is a match in mdadm.conf */
array_list = conf_get_ident(NULL);
if (array_list->uuid_set &&
same_uuid(array_list->uuid, info.uuid, st->ss->swapuuid)
== 0) {
- if (verbose >= 2)
+ if (verbose >= 2 && array_list->devname)
fprintf(stderr, Name
": UUID differs from %s.\n",
array_list->devname);
}
if (array_list->name[0] &&
strcasecmp(array_list->name, info.name) != 0) {
- if (verbose >= 2)
+ if (verbose >= 2 && array_list->devname)
fprintf(stderr, Name
": Name differs from %s.\n",
array_list->devname);
}
if (array_list->devices &&
!match_oneof(array_list->devices, devname)) {
- if (verbose >= 2)
+ if (verbose >= 2 && array_list->devname)
fprintf(stderr, Name
": Not a listed device for %s.\n",
array_list->devname);
}
if (array_list->super_minor != UnSet &&
array_list->super_minor != info.array.md_minor) {
- if (verbose >= 2)
+ if (verbose >= 2 && array_list->devname)
fprintf(stderr, Name
": Different super-minor to %s.\n",
array_list->devname);
!array_list->name[0] &&
!array_list->devices &&
array_list->super_minor == UnSet) {
- if (verbose >= 2)
+ if (verbose >= 2 && array_list->devname)
fprintf(stderr, Name
": %s doesn't have any identifying information.\n",
array_list->devname);
/* FIXME, should I check raid_disks and level too?? */
if (match) {
- if (verbose >= 0)
- fprintf(stderr, Name
+ if (verbose >= 0) {
+ if (match->devname && array_list->devname)
+ fprintf(stderr, Name
": we match both %s and %s - cannot decide which to use.\n",
- match->devname, array_list->devname);
+ match->devname, array_list->devname);
+ else
+ fprintf(stderr, Name
+ ": multiple lines in mdadm.conf match\n");
+ }
return 2;
}
match = array_list;
}
+ if (match && match->devname
+ && strcasecmp(match->devname, "<ignore>") == 0) {
+ if (verbose >= 0)
+ fprintf(stderr, Name ": array containing %s is explicitly"
+ " ignored by mdadm.conf\n",
+ devname);
+ return 1;
+ }
+
+ if (!match && !conf_test_metadata(st->ss->name)) {
+ if (verbose >= 1)
+ fprintf(stderr, Name
+ ": %s has metadata type %s for which "
+ "auto-assembly is disabled\n",
+ devname, st->ss->name);
+ return 1;
+ }
+
/* 3a/ if not, check for homehost match. If no match, continue
* but don't trust the 'name' in the array. Thus a 'random' minor
* number will be assigned, and the device name will be based
* on that. */
- name = info.name;
- if (!match) {
- if (homehost == NULL ||
- st->ss->match_home(st, homehost) == 0) {
- if (verbose >= 0)
- fprintf(stderr, Name
- ": not found in mdadm.conf and not identified by homehost.\n");
- name = NULL;
- }
- }
- /* 4/ Determine device number. */
- /* - If in mdadm.conf with std name, get number from name. */
- /* - UUID in /var/run/mdadm.map get number from mapping */
- /* - If name is suggestive, use that. unless in use with */
- /* different uuid. */
- /* - Choose a free, high number. */
- /* - Use a partitioned device unless strong suggestion not to. */
- /* e.g. auto=md */
+ if (match)
+ trustworthy = LOCAL;
+ else if ((homehost == NULL ||
+ st->ss->match_home(st, homehost) != 1) &&
+ st->ss->match_home(st, "any") != 1)
+ trustworthy = FOREIGN;
+ else
+ trustworthy = LOCAL;
/* There are three possible sources for 'autof': command line,
* ARRAY line in mdadm.conf, or CREATE line in mdadm.conf.
if (autof == 0)
autof = ci->autof;
- if (match && (rv = is_standard(match->devname, &devnum))) {
- devnum = (rv > 0) ? (-1-devnum) : devnum;
- } else if ((mp = map_by_uuid(&map, info.uuid)) != NULL)
- devnum = mp->devnum;
- else {
- /* Have to guess a bit. */
- int use_partitions = 1;
- char *np, *ep;
- if ((autof&7) == 3 || (autof&7) == 5)
- use_partitions = 0;
- np = name ? strchr(name, ':') : ":NONAME";
- if (np)
- np++;
- else
- np = name;
- devnum = strtoul(np, &ep, 10);
- if (ep > np && *ep == 0) {
- /* This is a number. Let check that it is unused. */
- if (mddev_busy(use_partitions ? (-1-devnum) : devnum))
- devnum = -1;
- } else
- devnum = -1;
-
- if (devnum < 0) {
- /* Haven't found anything yet, choose something free */
- devnum = find_free_devnum(use_partitions);
-
- if (devnum == NoMdDev) {
- fprintf(stderr, Name
- ": No spare md devices!!\n");
- return 2;
- }
- } else
- devnum = use_partitions ? (-1-devnum) : devnum;
+ if (st->ss->container_content && st->loaded_container) {
+ /* This is a pre-built container array, so we do something
+ * rather different.
+ */
+ return Incremental_container(st, devname, verbose, runstop,
+ autof, trustworthy);
}
- mdfd = open_mddev_devnum(match ? match->devname : NULL,
- devnum,
- name,
- chosen_name, autof >> 3);
- if (mdfd < 0) {
- fprintf(stderr, Name ": failed to open %s: %s.\n",
- chosen_name, strerror(errno));
- return 2;
+
+ name_to_use = info.name;
+ if (name_to_use[0] == 0 &&
+ info.array.level == LEVEL_CONTAINER &&
+ trustworthy == LOCAL) {
+ name_to_use = info.text_version;
+ trustworthy = METADATA;
}
- /* 5/ Find out if array already exists */
- if (! mddev_busy(devnum)) {
- /* 5a/ if it does not */
- /* - choose a name, from mdadm.conf or 'name' field in array. */
- /* - create the array */
- /* - add the device */
- mdu_array_info_t ainf;
- mdu_disk_info_t disk;
- char md[20];
+ if (name_to_use[0] && trustworthy != LOCAL &&
+ ! require_homehost &&
+ conf_name_is_free(name_to_use))
+ trustworthy = LOCAL;
+
+ /* strip "hostname:" prefix from name if we have decided
+ * to treat it as LOCAL
+ */
+ if (trustworthy == LOCAL && strchr(name_to_use, ':') != NULL)
+ name_to_use = strchr(name_to_use, ':')+1;
+
+ /* 4/ Check if array exists.
+ */
+ map_lock(&map);
+ mp = map_by_uuid(&map, info.uuid);
+ if (mp)
+ mdfd = open_dev(mp->devnum);
+ else
+ mdfd = -1;
+
+ if (mdfd < 0) {
struct mdinfo *sra;
+ struct mdinfo dinfo;
- memset(&ainf, 0, sizeof(ainf));
- ainf.major_version = st->ss->major;
- ainf.minor_version = st->minor_version;
- if (ioctl(mdfd, SET_ARRAY_INFO, &ainf) != 0) {
- fprintf(stderr, Name
- ": SET_ARRAY_INFO failed for %s: %s\b",
+ /* Couldn't find an existing array, maybe make a new one */
+ mdfd = create_mddev(match ? match->devname : NULL,
+ name_to_use, autof, trustworthy, chosen_name);
+
+ if (mdfd < 0)
+ return 1;
+
+ sysfs_init(&info, mdfd, 0);
+
+ if (set_array_info(mdfd, st, &info) != 0) {
+ fprintf(stderr, Name ": failed to set array info for %s: %s\n",
chosen_name, strerror(errno));
close(mdfd);
return 2;
}
- sprintf(md, "%d.%d\n", st->ss->major, st->minor_version);
- sra = sysfs_read(mdfd, devnum, GET_VERSION);
- sysfs_set_str(sra, NULL, "metadata_version", md);
- memset(&disk, 0, sizeof(disk));
- disk.major = major(stb.st_rdev);
- disk.minor = minor(stb.st_rdev);
- sysfs_free(sra);
- if (ioctl(mdfd, ADD_NEW_DISK, &disk) != 0) {
+
+ dinfo = info;
+ dinfo.disk.major = major(stb.st_rdev);
+ dinfo.disk.minor = minor(stb.st_rdev);
+ if (add_disk(mdfd, st, &info, &dinfo) != 0) {
fprintf(stderr, Name ": failed to add %s to %s: %s.\n",
devname, chosen_name, strerror(errno));
ioctl(mdfd, STOP_ARRAY, 0);
close(mdfd);
return 2;
}
- sra = sysfs_read(mdfd, devnum, GET_DEVS);
+ sra = sysfs_read(mdfd, fd2devnum(mdfd), GET_DEVS);
if (!sra || !sra->devs || sra->devs->disk.raid_disk >= 0) {
/* It really should be 'none' - must be old buggy
* kernel, and mdadm -I may not be able to complete.
sysfs_free(sra);
return 2;
}
+ info.array.working_disks = 1;
+ sysfs_free(sra);
+ /* 6/ Make sure /var/run/mdadm.map contains this array. */
+ map_update(&map, fd2devnum(mdfd),
+ info.text_version,
+ info.uuid, chosen_name);
} else {
/* 5b/ if it does */
/* - check one drive in array to make sure metadata is a reasonably */
/* - add the device */
char dn[20];
int dfd2;
- mdu_disk_info_t disk;
int err;
struct mdinfo *sra;
struct supertype *st2;
- sra = sysfs_read(mdfd, devnum, (GET_VERSION | GET_DEVS |
- GET_STATE));
+ struct mdinfo info2, *d;
- if (sra->array.major_version != st->ss->major ||
- sra->array.minor_version != st->minor_version) {
- if (verbose >= 0)
+ if (mp->path)
+ strcpy(chosen_name, mp->path);
+ else
+ strcpy(chosen_name, devnum2devname(mp->devnum));
+
+ sra = sysfs_read(mdfd, fd2devnum(mdfd), (GET_DEVS | GET_STATE));
+
+ if (sra->devs) {
+ sprintf(dn, "%d:%d", sra->devs->disk.major,
+ sra->devs->disk.minor);
+ dfd2 = dev_open(dn, O_RDONLY);
+ st2 = dup_super(st);
+ if (st2->ss->load_super(st2, dfd2, NULL) ||
+ st->ss->compare_super(st, st2) != 0) {
fprintf(stderr, Name
- ": %s has different metadata to chosen array %s %d.%d %d.%d.\n",
- devname, chosen_name,
- sra->array.major_version,
- sra->array.minor_version,
- st->ss->major, st->minor_version);
- close(mdfd);
- return 1;
- }
- sprintf(dn, "%d:%d", sra->devs->disk.major,
- sra->devs->disk.minor);
- dfd2 = dev_open(dn, O_RDONLY);
- st2 = dup_super(st);
- if (st2->ss->load_super(st2, dfd2, NULL)) {
- fprintf(stderr, Name
- ": Strange error loading metadata for %s.\n",
- chosen_name);
- close(mdfd);
+ ": metadata mismatch between %s and "
+ "chosen array %s\n",
+ devname, chosen_name);
+ close(mdfd);
+ close(dfd2);
+ return 2;
+ }
close(dfd2);
- return 2;
- }
- close(dfd2);
- st2->ss->getinfo_super(st2, &info2);
- st2->ss->free_super(st2);
- if (info.array.level != info2.array.level ||
- memcmp(info.uuid, info2.uuid, 16) != 0 ||
- info.array.raid_disks != info2.array.raid_disks) {
- fprintf(stderr, Name
- ": unexpected difference between %s and %s.\n",
- chosen_name, devname);
- close(mdfd);
- return 2;
+ memset(&info2, 0, sizeof(info2));
+ st2->ss->getinfo_super(st2, &info2);
+ st2->ss->free_super(st2);
+ if (info.array.level != info2.array.level ||
+ memcmp(info.uuid, info2.uuid, 16) != 0 ||
+ info.array.raid_disks != info2.array.raid_disks) {
+ fprintf(stderr, Name
+ ": unexpected difference between %s and %s.\n",
+ chosen_name, devname);
+ close(mdfd);
+ return 2;
+ }
}
- memset(&disk, 0, sizeof(disk));
- disk.major = major(stb.st_rdev);
- disk.minor = minor(stb.st_rdev);
- err = ioctl(mdfd, ADD_NEW_DISK, &disk);
+ info2.disk.major = major(stb.st_rdev);
+ info2.disk.minor = minor(stb.st_rdev);
+ /* add disk needs to know about containers */
+ if (st->ss->external)
+ sra->array.level = LEVEL_CONTAINER;
+ err = add_disk(mdfd, st, sra, &info2);
if (err < 0 && errno == EBUSY) {
/* could be another device present with the same
* disk.number. Find and reject any such
*/
find_reject(mdfd, st, sra, info.disk.number,
info.events, verbose, chosen_name);
- err = ioctl(mdfd, ADD_NEW_DISK, &disk);
+ err = add_disk(mdfd, st, sra, &info2);
}
if (err < 0) {
fprintf(stderr, Name ": failed to add %s to %s: %s.\n",
close(mdfd);
return 2;
}
+ info.array.working_disks = 0;
+ for (d = sra->devs; d; d=d->next)
+ info.array.working_disks ++;
+
}
- /* 6/ Make sure /var/run/mdadm.map contains this array. */
- map_update(&map, devnum,
- info.array.major_version,
- info.array.minor_version,
- info.uuid, chosen_name);
/* 7/ Is there enough devices to possibly start the array? */
/* 7a/ if not, finish with success. */
+ if (info.array.level == LEVEL_CONTAINER) {
+ /* Try to assemble within the container */
+ map_unlock(&map);
+ sysfs_uevent(&info, "change");
+ if (verbose >= 0)
+ fprintf(stderr, Name
+ ": container %s now has %d devices\n",
+ chosen_name, info.array.working_disks);
+ wait_for(chosen_name, mdfd);
+ close(mdfd);
+ if (runstop < 0)
+ return 0; /* don't try to assemble */
+ rv = Incremental(chosen_name, verbose, runstop,
+ NULL, homehost, require_homehost, autof);
+ if (rv == 1)
+ /* Don't fail the whole -I if a subarray didn't
+ * have enough devices to start yet
+ */
+ rv = 0;
+ return rv;
+ }
avail = NULL;
active_disks = count_active(st, mdfd, &avail, &info);
if (enough(info.array.level, info.array.raid_disks,
info.array.layout, info.array.state & 1,
- avail, active_disks) == 0) {
+ avail, active_disks) == 0 ||
+ (runstop < 0 && active_disks < info.array.raid_disks)) {
free(avail);
if (verbose >= 0)
fprintf(stderr, Name
": %s attached to %s, not enough to start (%d).\n",
devname, chosen_name, active_disks);
+ map_unlock(&map);
close(mdfd);
return 0;
}
/* are enough, */
/* + add any bitmap file */
/* + start the array (auto-readonly). */
-{
- mdu_array_info_t ainf;
if (ioctl(mdfd, GET_ARRAY_INFO, &ainf) == 0) {
if (verbose >= 0)
fprintf(stderr, Name
": %s attached to %s which is already active.\n",
devname, chosen_name);
- close (mdfd);
+ close(mdfd);
+ map_unlock(&map);
return 0;
}
-}
+
+ map_unlock(&map);
if (runstop > 0 || active_disks >= info.array.working_disks) {
struct mdinfo *sra;
/* Let's try to start it */
}
close(bmfd);
}
- sra = sysfs_read(mdfd, devnum, 0);
+ sra = sysfs_read(mdfd, fd2devnum(mdfd), 0);
if ((sra == NULL || active_disks >= info.array.working_disks)
- && name != NULL)
+ && trustworthy != FOREIGN)
rv = ioctl(mdfd, RUN_ARRAY, NULL);
else
rv = sysfs_set_str(sra, NULL,
": %s attached to %s, which has been started.\n",
devname, chosen_name);
rv = 0;
+ wait_for(chosen_name, mdfd);
} else {
fprintf(stderr, Name
": %s attached to %s, but failed to start: %s.\n",
devs = conf_get_ident(NULL);
for (me = mapl ; me ; me = me->next) {
- char path[1024];
mdu_array_info_t array;
mdu_bitmap_file_t bmf;
struct mdinfo *sra;
- int mdfd = open_mddev_devnum(me->path, me->devnum,
- NULL, path, 0);
+ int mdfd = open_dev(me->devnum);
+
if (mdfd < 0)
continue;
if (ioctl(mdfd, GET_ARRAY_INFO, &array) == 0 ||
}
/* Ok, we can try this one. Maybe it needs a bitmap */
for (mddev = devs ; mddev ; mddev = mddev->next)
- if (strcmp(mddev->devname, me->path) == 0)
+ if (mddev->devname && me->path
+ && devname_matches(mddev->devname, me->path))
break;
if (mddev && mddev->bitmap_file) {
/*
if (verbose >= 0)
fprintf(stderr, Name
": started array %s\n",
- me->path);
+ me->path ?: devnum2devname(me->devnum));
} else {
fprintf(stderr, Name
": failed to start array %s: %s\n",
- me->path, strerror(errno));
+ me->path ?: devnum2devname(me->devnum),
+ strerror(errno));
rv = 1;
}
}
}
return rv;
}
+
+/* Translate a container identifier, as written in mdadm.conf, into the
+ * name that devnum2devname() reports for that container.
+ *
+ * An identifier starting with '/' is opened as a device node and its
+ * device number is taken from the open descriptor.  Anything else is
+ * parsed as a UUID and looked up in the map file.
+ *
+ * Returns NULL if the node cannot be opened, the UUID does not parse,
+ * or the map has no entry for it.  Otherwise returns whatever
+ * devnum2devname() returned; the caller in this file free()s it.
+ */
+static char *container2devname(char *devname)
+{
+	int uuid[4];
+	struct map_ent *maplist = NULL;
+	struct map_ent *ent;
+	char *result = NULL;
+	int fd;
+
+	if (devname[0] == '/') {
+		/* A path: ask the open device which md array it is. */
+		fd = open(devname, O_RDONLY);
+		if (fd < 0)
+			return NULL;
+		result = devnum2devname(fd2devnum(fd));
+		close(fd);
+		return result;
+	}
+
+	/* Not a path: it has to be a UUID known to the map file. */
+	if (!parse_uuid(devname, uuid))
+		return NULL;
+
+	ent = map_by_uuid(&maplist, uuid);
+	if (ent != NULL)
+		result = devnum2devname(ent->devnum);
+	map_free(maplist);
+
+	return result;
+}
+
+/* Assemble every member array described by the metadata of a container.
+ *
+ * st          - supertype whose ->ss->container_content() lists the
+ *               member arrays
+ * devname     - not used in this function
+ * verbose     - >0 enables the informational messages below; also
+ *               passed on to assemble_container_content()
+ * runstop     - passed on to assemble_container_content()
+ * autof       - passed on to create_mddev()
+ * trustworthy - passed on to create_mddev(); raised to LOCAL once a
+ *               matching mdadm.conf entry has been found
+ *
+ * Returns 0 when the whole list has been walked, 2 if a member is
+ * explicitly ignored by mdadm.conf or its md device cannot be opened.
+ * The map lock taken here is released on every return path.
+ */
+int Incremental_container(struct supertype *st, char *devname, int verbose,
+			  int runstop, int autof, int trustworthy)
+{
+	/* Collect the contents of this container and for each
+	 * array, choose a device name and assemble the array.
+	 */
+
+	struct mdinfo *list = st->ss->container_content(st);
+	struct mdinfo *ra;
+	struct map_ent *map = NULL;
+
+	map_lock(&map);
+
+	for (ra = list ; ra ; ra = ra->next) {
+		int mdfd;
+		char chosen_name[1024];
+		struct map_ent *mp;
+		struct mddev_ident_s *match = NULL;
+
+		mp = map_by_uuid(&map, ra->uuid);
+
+		if (mp) {
+			/* Already known to the map: reuse its name.
+			 * devnum2devname() returns allocated memory and
+			 * may return NULL, so copy with a bound and
+			 * release it.  The name is built before the
+			 * open so that errno from a failed open_dev()
+			 * is still intact for the message below.
+			 */
+			char *dn = NULL;
+
+			if (mp->path == NULL)
+				dn = devnum2devname(mp->devnum);
+			snprintf(chosen_name, sizeof(chosen_name), "%s",
+				 mp->path ? mp->path : (dn ? dn : ""));
+			free(dn);
+			mdfd = open_dev(mp->devnum);
+		} else {
+
+			/* Check in mdadm.conf for container == devname and
+			 * member == ra->text_version after second slash.
+			 */
+			char *sub = strchr(ra->text_version+1, '/');
+			struct mddev_ident_s *array_list;
+			if (sub) {
+				sub++;
+				array_list = conf_get_ident(NULL);
+			} else
+				array_list = NULL;
+			for(; array_list ; array_list = array_list->next) {
+				char *dn;
+				if (array_list->member == NULL ||
+				    array_list->container == NULL)
+					continue;
+				if (strcmp(array_list->member, sub) != 0)
+					continue;
+				if (array_list->uuid_set &&
+				    !same_uuid(ra->uuid, array_list->uuid, st->ss->swapuuid))
+					continue;
+				dn = container2devname(array_list->container);
+				if (dn == NULL)
+					continue;
+				/* text_version is "/<container>/<member>";
+				 * the container part must equal dn exactly.
+				 */
+				if (strncmp(dn, ra->text_version+1,
+					    strlen(dn)) != 0 ||
+				    ra->text_version[strlen(dn)+1] != '/') {
+					free(dn);
+					continue;
+				}
+				free(dn);
+				/* we have a match */
+				match = array_list;
+				if (verbose>0)
+					fprintf(stderr, Name ": match found for member %s\n",
+						array_list->member);
+				break;
+			}
+
+			if (match && match->devname &&
+			    strcasecmp(match->devname, "<ignore>") == 0) {
+				if (verbose > 0)
+					fprintf(stderr, Name ": array %s/%s is "
+						"explicitly ignored by mdadm.conf\n",
+						match->container, match->member);
+				/* Do not leave the map locked behind us. */
+				map_unlock(&map);
+				return 2;
+			}
+			if (match)
+				trustworthy = LOCAL;
+
+			mdfd = create_mddev(match ? match->devname : NULL,
+					    ra->name,
+					    autof,
+					    trustworthy,
+					    chosen_name);
+		}
+
+		if (mdfd < 0) {
+			/* Report first: map_unlock() might disturb errno. */
+			fprintf(stderr, Name ": failed to open %s: %s.\n",
+				chosen_name, strerror(errno));
+			map_unlock(&map);
+			return 2;
+		}
+
+		assemble_container_content(st, mdfd, ra, runstop,
+					   chosen_name, verbose);
+	}
+	map_unlock(&map);
+	return 0;
+}
/*
* mdadm - manage Linux "md" devices aka RAID arrays.
*
- * Copyright (C) 2001-2006 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
*
*
* This program is free software; you can redistribute it and/or modify
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
* Author: Neil Brown
- * Email: <neilb@cse.unsw.edu.au>
- * Paper: Neil Brown
- * School of Computer Science and Engineering
- * The University of New South Wales
- * Sydney, 2052
- * Australia
+ * Email: <neilb@suse.de>
*
* Added by Dale Stephenson
* steph@snapserver.com
#include "md_u.h"
#include "md_p.h"
-int Kill(char *dev, int force, int quiet)
+int Kill(char *dev, int force, int quiet, int noexcl)
{
/*
* Nothing fancy about Kill. It just zeroes out a superblock
int fd, rv = 0;
struct supertype *st;
+ if (force)
+ noexcl = 1;
fd = open(dev, O_RDWR|(force ? 0 : O_EXCL));
if (fd < 0) {
if (!quiet)
if (force && rv >= 2)
rv = 0; /* ignore bad data in superblock */
if (rv== 0 || (force && rv >= 2)) {
- mdu_array_info_t info;
- info.major_version = -1; /* zero superblock */
st->ss->free_super(st);
- st->ss->init_super(st, &info, 0, "", NULL, NULL);
+ st->ss->init_super(st, NULL, 0, "", NULL, NULL);
if (st->ss->store_super(st, fd)) {
if (!quiet)
fprintf(stderr, Name ": Could not zero superblock on %s\n",
# e.g. make CXFLAGS=-O to optimise
TCC = tcc
UCLIBC_GCC = $(shell for nm in i386-uclibc-linux-gcc i386-uclibc-gcc; do which $$nm > /dev/null && { echo $$nm ; exit; } ; done; echo false No uclibc found )
-DIET_GCC = diet gcc
+#DIET_GCC = diet gcc
+# sorry, but diet-libc doesn't know about posix_memalign,
+# so we cannot use it any more.
+DIET_GCC = gcc -DHAVE_STDINT_H
KLIBC=/home/src/klibc/klibc-0.77
CC = $(CROSS_COMPILE)gcc
CXFLAGS = -ggdb
CWFLAGS = -Wall -Werror -Wstrict-prototypes
+ifdef WARN_UNUSED
+CWFLAGS += -Wp,-D_FORTIFY_SOURCE=2 -O
+endif
ifdef DEBIAN
CPPFLAGS= -DDEBIAN
OBJS = mdadm.o config.o mdstat.o ReadMe.o util.o Manage.o Assemble.o Build.o \
Create.o Detail.o Examine.o Grow.o Monitor.o dlink.o Kill.o Query.o \
Incremental.o \
- mdopen.o super0.o super1.o bitmap.o restripe.o sysfs.o sha1.o \
- mapfile.o
+ mdopen.o super0.o super1.o super-ddf.o super-intel.o bitmap.o \
+ restripe.o sysfs.o sha1.o mapfile.o crc32.o sg_io.o msg.o \
+ platform-intel.o probe_roms.o
+
SRCS = mdadm.c config.c mdstat.c ReadMe.c util.c Manage.c Assemble.c Build.c \
Create.c Detail.c Examine.c Grow.c Monitor.c dlink.c Kill.c Query.c \
Incremental.c \
- mdopen.c super0.c super1.c bitmap.c restripe.c sysfs.c sha1.c \
- mapfile.c
+ mdopen.c super0.c super1.c super-ddf.c super-intel.c bitmap.c \
+ restripe.c sysfs.c sha1.c mapfile.c crc32.c sg_io.c msg.c \
+ platform-intel.c probe_roms.c
+
+MON_OBJS = mdmon.o monitor.o managemon.o util.o mdstat.o sysfs.o config.o \
+ Kill.o sg_io.o dlink.o ReadMe.o super0.o super1.o super-intel.o \
+ super-ddf.o sha1.o crc32.o msg.o bitmap.o \
+ platform-intel.o probe_roms.o
+
STATICSRC = pwgr.c
STATICOBJS = pwgr.o
ASSEMBLE_SRCS := mdassemble.c Assemble.c Manage.c config.c dlink.c util.c \
- super0.c super1.c sha1.c sysfs.c
-ASSEMBLE_AUTO_SRCS := mdopen.c mdstat.c
+ super0.c super1.c super-ddf.c super-intel.c sha1.c crc32.c sg_io.c mdstat.c \
+ platform-intel.c probe_roms.c sysfs.c
+ASSEMBLE_AUTO_SRCS := mdopen.c
ASSEMBLE_FLAGS:= $(CFLAGS) -DMDASSEMBLE
ifdef MDASSEMBLE_AUTO
ASSEMBLE_SRCS += $(ASSEMBLE_AUTO_SRCS)
ASSEMBLE_FLAGS += -DMDASSEMBLE_AUTO
endif
-all : mdadm mdadm.man md.man mdadm.conf.man
+all : mdadm mdmon mdadm.man md.man mdadm.conf.man mdmon.man
everything: all mdadm.static swap_super test_stripe \
mdassemble mdassemble.auto mdassemble.static mdassemble.man \
mdadm.O2 : $(SRCS) mdadm.h
$(CC) -o mdadm.O2 $(CFLAGS) -DHAVE_STDINT_H -O2 $(SRCS)
+mdmon : $(MON_OBJS)
+ $(CC) $(LDFLAGS) -o mdmon $(MON_OBJS) $(LDLIBS)
+msg.o: msg.c msg.h
+
test_stripe : restripe.c mdadm.h
$(CC) $(CXFLAGS) $(LDFLAGS) -o test_stripe -DMAIN restripe.c
mdadm.man : mdadm.8
nroff -man mdadm.8 > mdadm.man
+mdmon.man : mdmon.8
+ nroff -man mdmon.8 > mdmon.man
+
md.man : md.4
nroff -man md.4 > md.man
mdassemble.man : mdassemble.8
nroff -man mdassemble.8 > mdassemble.man
-$(OBJS) : mdadm.h bitmap.h
+$(OBJS) : mdadm.h mdmon.h bitmap.h
+$(MON_OBJS) : mdadm.h mdmon.h bitmap.h
sha1.o : sha1.c sha1.h md5.h
$(CC) $(CFLAGS) -DHAVE_STDINT_H -o sha1.o -c sha1.c
-install : mdadm install-man
+install : mdadm mdmon install-man install-udev
$(INSTALL) -D $(STRIP) -m 755 mdadm $(DESTDIR)$(BINDIR)/mdadm
+ $(INSTALL) -D $(STRIP) -m 755 mdmon $(DESTDIR)$(BINDIR)/mdmon
install-static : mdadm.static install-man
$(INSTALL) -D $(STRIP) -m 755 mdadm.static $(DESTDIR)$(BINDIR)/mdadm
install-klibc : mdadm.klibc install-man
$(INSTALL) -D $(STRIP) -m 755 mdadm.klibc $(DESTDIR)$(BINDIR)/mdadm
-install-man: mdadm.8 md.4 mdadm.conf.5
+install-man: mdadm.8 md.4 mdadm.conf.5 mdmon.8
$(INSTALL) -D -m 644 mdadm.8 $(DESTDIR)$(MAN8DIR)/mdadm.8
+ $(INSTALL) -D -m 644 mdmon.8 $(DESTDIR)$(MAN8DIR)/mdmon.8
$(INSTALL) -D -m 644 md.4 $(DESTDIR)$(MAN4DIR)/md.4
$(INSTALL) -D -m 644 mdadm.conf.5 $(DESTDIR)$(MAN5DIR)/mdadm.conf.5
+install-udev: udev-md-raid.rules
+ $(INSTALL) -D -m 644 udev-md-raid.rules $(DESTDIR)/lib/udev/rules.d/64-md-raid.rules
+
uninstall:
- rm -f $(DESTDIR)$(MAN8DIR)/mdadm.8 md.4 $(DESTDIR)$(MAN4DIR)/md.4 $(DESTDIR)$(MAN5DIR)/mdadm.conf.5 $(DESTDIR)$(BINDIR)/mdadm
+ rm -f $(DESTDIR)$(MAN8DIR)/mdadm.8 $(DESTDIR)$(MAN8DIR)/mdmon.8 $(DESTDIR)$(MAN4DIR)/md.4 $(DESTDIR)$(MAN5DIR)/mdadm.conf.5 $(DESTDIR)$(BINDIR)/mdadm
-test: mdadm test_stripe swap_super
+test: mdadm mdmon test_stripe swap_super
@echo "Please run 'sh ./test' as root"
clean :
- rm -f mdadm $(OBJS) $(STATICOBJS) core *.man mdadm.tcc mdadm.uclibc mdadm.static *.orig *.porig *.rej *.alt \
+ rm -f mdadm mdmon $(OBJS) $(MON_OBJS) $(STATICOBJS) core *.man \
+ mdadm.tcc mdadm.uclibc mdadm.static *.orig *.porig *.rej *.alt \
mdadm.Os mdadm.O2 \
mdassemble mdassemble.static mdassemble.auto mdassemble.uclibc \
mdassemble.klibc swap_super \
/*
* mdadm - manage Linux "md" devices aka RAID arrays.
*
- * Copyright (C) 2001-2006 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
*
*
* This program is free software; you can redistribute it and/or modify
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
* Author: Neil Brown
- * Email: <neilb@cse.unsw.edu.au>
- * Paper: Neil Brown
- * School of Computer Science and Engineering
- * The University of New South Wales
- * Sydney, 2052
- * Australia
+ * Email: <neilb@suse.de>
*/
#include "mdadm.h"
#include "md_u.h"
#include "md_p.h"
+#include <ctype.h>
#define REGISTER_DEV _IO (MD_MAJOR, 1)
#define START_MD _IO (MD_MAJOR, 2)
*
*/
mdu_array_info_t array;
+#ifndef MDASSEMBLE
+ struct mdinfo *mdi;
+#endif
if (md_get_version(fd) < 9000) {
fprintf(stderr, Name ": need md driver version 0.90.0 or later\n");
return 1;
}
+#ifndef MDASSEMBLE
+ /* If this is an externally-managed array, we need to modify the
+ * metadata_version so that mdmon doesn't undo our change.
+ */
+ mdi = sysfs_read(fd, -1, GET_LEVEL|GET_VERSION);
+ if (mdi &&
+ mdi->array.major_version == -1 &&
+ mdi->array.level > 0 &&
+ is_subarray(mdi->text_version)) {
+ char vers[64];
+ strcpy(vers, "external:");
+ strcat(vers, mdi->text_version);
+ if (readonly > 0) {
+ int rv;
+ /* We set readonly ourselves. */
+ vers[9] = '-';
+ sysfs_set_str(mdi, NULL, "metadata_version", vers);
+
+ close(fd);
+ rv = sysfs_set_str(mdi, NULL, "array_state", "readonly");
+
+ if (rv < 0) {
+ fprintf(stderr, Name ": failed to set readonly for %s: %s\n",
+ devname, strerror(errno));
+
+ vers[9] = mdi->text_version[0];
+ sysfs_set_str(mdi, NULL, "metadata_version", vers);
+ return 1;
+ }
+ } else {
+ char *cp;
+ /* We cannot set read/write - must signal mdmon */
+ vers[9] = '/';
+ sysfs_set_str(mdi, NULL, "metadata_version", vers);
+
+ cp = strchr(vers+10, '/');
+ if (*cp)
+ *cp = 0;
+ ping_monitor(vers+10);
+ }
+ return 0;
+ }
+#endif
if (ioctl(fd, GET_ARRAY_INFO, &array)) {
fprintf(stderr, Name ": %s does not appear to be active.\n",
devname);
#ifndef MDASSEMBLE
+/* Remove the device nodes belonging to md device 'devnum'.
+ *
+ * devnum - >= 0 selects /dev/md<N>, negative selects /dev/md_d<-1-N>
+ * path   - optional extra name (may be NULL); it and its partition
+ *          variants are unlinked only if they are symlinks whose
+ *          target is exactly the matching standard name
+ *
+ * The whole-device name and partitions 1..15 are handled.  All
+ * unlink() failures are ignored: this is best-effort cleanup.
+ */
+static void remove_devices(int devnum, char *path)
+{
+	/* Remove all 'standard' devices for 'devnum', including
+	 * partitions.  Also remove names at 'path' - possibly with
+	 * partition suffixes - which link to those names.
+	 */
+	char base[40];
+	char *path2 = NULL;
+	char link[1024];
+	int n;
+	int part;
+	char *be;
+	char *pe = NULL;
+
+	if (devnum >= 0)
+		sprintf(base, "/dev/md%d", devnum);
+	else
+		sprintf(base, "/dev/md_d%d", -1-devnum);
+	be = base + strlen(base);
+
+	/* An empty path cannot name a symlink, and pe[-1] below needs at
+	 * least one character.  If the allocation fails we still remove
+	 * the standard names and just skip the symlinks.
+	 */
+	if (path && *path)
+		path2 = malloc(strlen(path)+20);
+	if (path2) {
+		strcpy(path2, path);
+		pe = path2 + strlen(path2);
+	}
+
+	for (part = 0; part < 16; part++) {
+		if (part) {
+			sprintf(be, "p%d", part);
+			if (path2) {
+				/* A name ending in a digit gets a 'p'
+				 * separator before the partition number.
+				 */
+				if (isdigit((unsigned char)pe[-1]))
+					sprintf(pe, "p%d", part);
+				else
+					sprintf(pe, "%d", part);
+			}
+		}
+		/* FIXME test if really is md device ?? */
+		unlink(base);
+		if (path2) {
+			/* readlink() does not NUL-terminate and returns
+			 * -1 on error, so compare by length explicitly.
+			 */
+			n = readlink(path2, link, sizeof(link));
+			if (n > 0 && (size_t)n == strlen(base) &&
+			    strncmp(link, base, n) == 0)
+				unlink(path2);
+		}
+	}
+	free(path2);
+}
+
+
int Manage_runstop(char *devname, int fd, int runstop, int quiet)
{
/* Run or stop the array. array must already be configured
* required >= 0.90.0
+ * Only print failure messages if quiet == 0;
+ * quiet > 0 means really be quiet
+ * quiet < 0 means we will try again if it fails.
*/
mdu_param_t param; /* unused */
if (runstop == -1 && md_get_version(fd) < 9000) {
if (ioctl(fd, STOP_MD, 0)) {
- if (!quiet) fprintf(stderr, Name ": stopping device %s failed: %s\n",
- devname, strerror(errno));
+ if (quiet == 0) fprintf(stderr,
+ Name ": stopping device %s "
+ "failed: %s\n",
+ devname, strerror(errno));
return 1;
}
}
} else if (runstop < 0){
struct map_ent *map = NULL;
struct stat stb;
- if (ioctl(fd, STOP_ARRAY, NULL)) {
- if (quiet==0) {
- fprintf(stderr, Name ": fail to stop array %s: %s\n",
+ struct mdinfo *mdi;
+ int devnum;
+ /* If this is an mdmon managed array, just write 'inactive'
+ * to the array state and let mdmon clear up.
+ */
+ devnum = fd2devnum(fd);
+ mdi = sysfs_read(fd, -1, GET_LEVEL|GET_VERSION);
+ if (mdi &&
+ mdi->array.level > 0 &&
+ is_subarray(mdi->text_version)) {
+ /* This is mdmon managed. */
+ close(fd);
+ if (sysfs_set_str(mdi, NULL,
+ "array_state", "inactive") < 0) {
+ if (quiet == 0)
+ fprintf(stderr, Name
+ ": failed to stop array %s: %s\n",
+ devname, strerror(errno));
+ return 1;
+ }
+
+ /* Give monitor a chance to act */
+ ping_monitor(mdi->text_version);
+
+ fd = open(devname, O_RDONLY);
+ } else if (mdi &&
+ mdi->array.major_version == -1 &&
+ mdi->array.minor_version == -2 &&
+ !is_subarray(mdi->text_version)) {
+ /* container, possibly mdmon-managed.
+ * Make sure mdmon isn't opening it, which
+ * would interfere with the 'stop'
+ */
+ ping_monitor(mdi->sys_name);
+ }
+
+ if (fd >= 0 && ioctl(fd, STOP_ARRAY, NULL)) {
+ if (quiet == 0) {
+ fprintf(stderr, Name
+ ": failed to stop array %s: %s\n",
devname, strerror(errno));
if (errno == EBUSY)
fprintf(stderr, "Perhaps a running "
"process, mounted filesystem "
"or active volume group?\n");
}
+ if (mdi)
+ sysfs_free(mdi);
return 1;
}
+ /* prior to 2.6.28, KOBJ_CHANGE was not sent when an md array
+ * was stopped, so we'll do it here just to be sure. Drop any
+ * partitions as well...
+ */
+ if (fd >= 0)
+ ioctl(fd, BLKRRPART, 0);
+ if (mdi)
+ sysfs_uevent(mdi, "change");
+
+
+ if (devnum != NoMdDev &&
+ (stat("/dev/.udev", &stb) != 0 ||
+ check_env("MDADM_NO_UDEV"))) {
+ struct map_ent *mp = map_by_devnum(&map, devnum);
+ remove_devices(devnum, mp ? mp->path : NULL);
+ }
+
+
if (quiet <= 0)
fprintf(stderr, Name ": stopped %s\n", devname);
- if (fstat(fd, &stb) == 0) {
- int devnum;
- if (major(stb.st_rdev) == MD_MAJOR)
- devnum = minor(stb.st_rdev);
- else
- devnum = -1-(minor(stb.st_rdev)>>6);
+ if (devnum != NoMdDev) {
map_delete(&map, devnum);
map_write(map);
map_free(map);
struct supertype *st, *tst;
int duuid[4];
int ouuid[4];
+ int lfd = -1;
if (ioctl(fd, GET_ARRAY_INFO, &array)) {
fprintf(stderr, Name ": cannot get array info for %s\n",
unsigned long long ldsize;
char dvname[20];
char *dnprintable = dv->devname;
+ int err;
next = dv->next;
jnext = 0;
return 1;
case 'a':
/* add the device */
-
+ if (tst->subarray[0]) {
+ fprintf(stderr, Name ": Cannot add disks to a"
+ " \'member\' array, perform this"
+ " operation on the parent container\n");
+ return 1;
+ }
/* Make sure it isn't in use (in 2.6 or later) */
- tfd = dev_open(dv->devname, O_RDONLY|O_EXCL);
+ tfd = dev_open(dv->devname, O_RDONLY|O_EXCL|O_DIRECT);
if (tfd < 0) {
fprintf(stderr, Name ": Cannot open %s: %s\n",
dv->devname, strerror(errno));
}
close(tfd);
- if (array.major_version == 0 &&
+
+ if (!tst->ss->external &&
+ array.major_version == 0 &&
md_get_version(fd)%100 < 2) {
if (ioctl(fd, HOT_ADD_DISK,
(unsigned long)stb.st_rdev)==0) {
return 1;
}
- if (array.not_persistent == 0) {
+ if (array.not_persistent == 0 || tst->ss->external) {
/* need to find a sample superblock to copy, and
- * a spare slot to use
+ * a spare slot to use.
+ * For 'external' array (well, container based),
+ * We can just load the metadata for the array.
*/
- for (j = 0; j < tst->max_devs; j++) {
+ if (tst->ss->external) {
+ tst->ss->load_super(tst, fd, NULL);
+ } else for (j = 0; j < tst->max_devs; j++) {
char *dev;
int dfd;
disc.number = j;
close(dfd);
break;
}
+ /* FIXME this is a bad test to be using */
if (!tst->sb) {
fprintf(stderr, Name ": cannot find valid superblock in this array - HELP\n");
return 1;
disc.minor = minor(stb.st_rdev);
disc.number =j;
disc.state = 0;
- if (array.not_persistent==0) {
+ if (array.not_persistent==0 || tst->ss->external) {
+ int dfd;
if (dv->writemostly == 1)
disc.state |= 1 << MD_DISK_WRITEMOSTLY;
- tst->ss->add_to_super(tst, &disc);
- if (tst->ss->write_init_super(tst, &disc,
- dv->devname))
+ dfd = dev_open(dv->devname, O_RDWR | O_EXCL|O_DIRECT);
+ if (tst->ss->add_to_super(tst, &disc, dfd,
+ dv->devname)) {
+ close(dfd);
+ return 1;
+ }
+ /* write_init_super will close 'dfd' */
+ if (tst->ss->external)
+ /* mdmon will write the metadata */
+ close(dfd);
+ else if (tst->ss->write_init_super(tst))
return 1;
} else if (dv->re_add) {
/* this had better be raid1.
}
if (dv->writemostly == 1)
disc.state |= (1 << MD_DISK_WRITEMOSTLY);
- if (ioctl(fd,ADD_NEW_DISK, &disc)) {
+ if (tst->ss->external) {
+ /* add a disk to an external metadata container
+ * only if mdmon is around to see it
+ */
+ struct mdinfo new_mdi;
+ struct mdinfo *sra;
+ int container_fd;
+ int devnum = fd2devnum(fd);
+
+ container_fd = open_dev_excl(devnum);
+ if (container_fd < 0) {
+ fprintf(stderr, Name ": add failed for %s:"
+ " could not get exclusive access to container\n",
+ dv->devname);
+ return 1;
+ }
+
+ if (!mdmon_running(devnum)) {
+ fprintf(stderr, Name ": add failed for %s: mdmon not running\n",
+ dv->devname);
+ close(container_fd);
+ return 1;
+ }
+
+ sra = sysfs_read(container_fd, -1, 0);
+ if (!sra) {
+ fprintf(stderr, Name ": add failed for %s: sysfs_read failed\n",
+ dv->devname);
+ close(container_fd);
+ return 1;
+ }
+ sra->array.level = LEVEL_CONTAINER;
+ /* Need to set data_offset and component_size */
+ tst->ss->getinfo_super(tst, &new_mdi);
+ new_mdi.disk.major = disc.major;
+ new_mdi.disk.minor = disc.minor;
+ if (sysfs_add_disk(sra, &new_mdi, 0) != 0) {
+ fprintf(stderr, Name ": add new device to external metadata"
+ " failed for %s\n", dv->devname);
+ close(container_fd);
+ return 1;
+ }
+ ping_monitor(devnum2devname(devnum));
+ sysfs_free(sra);
+ close(container_fd);
+ } else if (ioctl(fd, ADD_NEW_DISK, &disc)) {
fprintf(stderr, Name ": add new device failed for %s as %d: %s\n",
dv->devname, j, strerror(errno));
return 1;
case 'r':
/* hot remove */
+ if (tst->subarray[0]) {
+ fprintf(stderr, Name ": Cannot remove disks from a"
+ " \'member\' array, perform this"
+ " operation on the parent container\n");
+ return 1;
+ }
+ if (tst->ss->external) {
+ /* To remove a device from a container, we must
+ * check that it isn't in use in an array.
+ * This involves looking in the 'holders'
+ * directory - there must be just one entry,
+ * the container.
+ * To ensure that it doesn't get used as a
+ * hot spare while we are checking, we
+ * get an O_EXCL open on the container
+ */
+ int dnum = fd2devnum(fd);
+ lfd = open_dev_excl(dnum);
+ if (lfd < 0) {
+ fprintf(stderr, Name
+ ": Cannot get exclusive access "
+ " to container - odd\n");
+ return 1;
+ }
+ /* in the detached case it is not possible to
+ * check if we are the unique holder, so just
+ * rely on the 'detached' checks
+ */
+ if (strcmp(dv->devname, "detached") == 0 ||
+ sysfs_unique_holder(dnum, stb.st_rdev))
+ /* pass */;
+ else {
+ fprintf(stderr, Name
+ ": %s is %s, cannot remove.\n",
+ dnprintable,
+ errno == EEXIST ? "still in use":
+ "not a member");
+ close(lfd);
+ return 1;
+ }
+ }
/* FIXME check that it is a current member */
- if (ioctl(fd, HOT_REMOVE_DISK, (unsigned long)stb.st_rdev)) {
+ err = ioctl(fd, HOT_REMOVE_DISK, (unsigned long)stb.st_rdev);
+ if (err && errno == ENODEV) {
+ /* Old kernels rejected this if no personality
+ * registered */
+ struct mdinfo *sra = sysfs_read(fd, 0, GET_DEVS);
+ struct mdinfo *dv = NULL;
+ if (sra)
+ dv = sra->devs;
+ for ( ; dv ; dv=dv->next)
+ if (dv->disk.major == major(stb.st_rdev) &&
+ dv->disk.minor == minor(stb.st_rdev))
+ break;
+ if (dv)
+ err = sysfs_set_str(sra, dv,
+ "state", "remove");
+ else
+ err = -1;
+ if (sra)
+ sysfs_free(sra);
+ }
+ if (err) {
fprintf(stderr, Name ": hot remove failed "
"for %s: %s\n", dnprintable,
strerror(errno));
+ if (lfd >= 0)
+ close(lfd);
return 1;
}
+ if (tst->ss->external) {
+ /*
+ * Before dropping our exclusive open we make an
+ * attempt at preventing mdmon from seeing an
+ * 'add' event before reconciling this 'remove'
+ * event.
+ */
+ char *name = devnum2devname(fd2devnum(fd));
+
+ if (!name) {
+ fprintf(stderr, Name ": unable to get container name\n");
+ return 1;
+ }
+
+ ping_manager(name);
+ free(name);
+ }
+ close(lfd);
if (verbose >= 0)
fprintf(stderr, Name ": hot removed %s\n",
dnprintable);
/*
* mdadm - manage Linux "md" devices aka RAID arrays.
*
- * Copyright (C) 2001-2006 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
*
*
* This program is free software; you can redistribute it and/or modify
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
* Author: Neil Brown
- * Email: <neilb@cse.unsw.edu.au>
- * Paper: Neil Brown
- * School of Computer Science and Engineering
- * The University of New South Wales
- * Sydney, 2052
- * Australia
+ * Email: <neilb@suse.de>
*/
#include "mdadm.h"
if (devlist == NULL) {
mddev_ident_t mdlist = conf_get_ident(NULL);
for (; mdlist; mdlist=mdlist->next) {
- struct state *st = malloc(sizeof *st);
+ struct state *st;
+ if (mdlist->devname == NULL)
+ continue;
+ if (strcasecmp(mdlist->devname, "<ignore>") == 0)
+ continue;
+ st = malloc(sizeof *st);
if (st == NULL)
continue;
- st->devname = strdup(mdlist->devname);
+ if (mdlist->devname[0] == '/')
+ st->devname = strdup(mdlist->devname);
+ else {
+ st->devname = malloc(8+strlen(mdlist->devname)+1);
+ strcpy(strcpy(st->devname, "/dev/md/"),
+ mdlist->devname);
+ }
st->utime = 0;
st->next = statelist;
st->err = 0;
mse = mse2;
}
+ if (array.utime == 0)
+ /* external arrays don't update utime */
+ array.utime = time(0);
+
if (st->utime == array.utime &&
st->failed == array.failed_disks &&
st->working == array.working_disks &&
strerror(errno));
return 2;
}
- if (major(stb.st_rdev) == MD_MAJOR)
- devnum = minor(stb.st_rdev);
- else
- devnum = -1-(minor(stb.st_rdev)/64);
+ devnum = stat2devnum(&stb);
while(1) {
struct mdstat_ent *ms = mdstat_read(1, 0);
break;
if (!e || e->percent < 0) {
+ if (e && e->metadata_version &&
+ strncmp(e->metadata_version, "external:", 9) == 0) {
+ if (is_subarray(&e->metadata_version[9]))
+ ping_monitor(&e->metadata_version[9]);
+ else
+ ping_monitor(devnum2devname(devnum));
+ }
free_mdstat(ms);
return rv;
}
/*
* mdadm - manage Linux "md" devices aka RAID arrays.
*
- * Copyright (C) 2002-2006 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2002-2009 Neil Brown <neilb@suse.de>
*
*
* This program is free software; you can redistribute it and/or modify
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
* Author: Neil Brown
- * Email: <neilb@cse.unsw.edu.au>
- * Paper: Neil Brown
- * School of Computer Science and Engineering
- * The University of New South Wales
- * Sydney, 2052
- * Australia
+ * Email: <neilb@suse.de>
*/
#include "mdadm.h"
if (superror == 0) {
/* array might be active... */
st->ss->getinfo_super(st, &info);
- if (st->ss->major == 0) {
+ if (st->ss == &super0) {
mddev = get_md_name(info.array.md_minor);
disc.number = info.disk.number;
activity = "undetected";
activity,
map_num(pers, info.array.level),
mddev);
- if (st->ss->major == 0)
+ if (st->ss == &super0)
put_md_name(mddev);
}
return 0;
/*
* mdadm - manage Linux "md" devices aka RAID arrays.
*
- * Copyright (C) 2001-2007 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
*
*
* This program is free software; you can redistribute it and/or modify
#include "mdadm.h"
-char Version[] = Name " - v2.6.9 - 10th March 2009\n";
+char Version[] = Name " - v3.0-rc1 - 11th May 2009\n";
/*
* File: ReadMe.c
{"query", 0, 0, 'Q'},
{"examine-bitmap", 0, 0, 'X'},
{"auto-detect", 0, 0, AutoDetect},
+ {"detail-platform", 0, 0, DetailPlatform},
/* synonyms */
{"monitor", 0, 0, 'F'},
{"write-mostly",0, 0, 'W'},
{"re-add", 0, 0, ReAdd},
{"homehost", 1, 0, HomeHost},
+#if 0
{"auto-update-homehost", 0, 0, AutoHomeHost},
+#endif
{"symlinks", 1, 0, Symlinks},
/* For assemble */
{"readwrite", 0, 0, 'w'},
{"no-degraded",0,0, NoDegraded },
{"wait", 0, 0, 'W'},
+ {"wait-clean", 0, 0, Waitclean },
/* For Detail/Examine */
{"brief", 0, 0, 'b'},
" --size= -z : Size (in K) of each drive in RAID1/4/5/6/10 - optional\n"
" --force -f : Honour devices as listed on command line. Don't\n"
" : insert a missing drive for RAID5.\n"
-" --auto(=p) -a : Automatically allocate new (partitioned) md array if needed.\n"
" --assume-clean : Assume the array is already in-sync. This is dangerous.\n"
" --bitmap-chunk= : chunksize of bitmap in bitmap file (Kilobytes)\n"
" --delay= -d : seconds between bitmap updates\n"
" --scan -s : scan config file for missing information\n"
" --force -f : Assemble the array even if some superblocks appear out-of-date\n"
" --update= -U : Update superblock: try '-A --update=?' for list of options.\n"
-" --auto(=p) -a : Automatically allocate new (partitioned) md array if needed.\n"
" --no-degraded : Do not start any degraded arrays - default unless --scan.\n"
"\n"
" For detail or examine:\n"
" --query -Q : Display general information about how a\n"
" device relates to the md driver\n"
" --detail -D : Display details of an array\n"
+" --detail-platform : Display hardware/firmware details\n"
" --examine -E : Examine superblock on an array component\n"
" --examine-bitmap -X: Display contents of a bitmap file\n"
" --zero-superblock : erase the MD superblock from a device.\n"
/* name/number mappings */
mapping_t r5layout[] = {
- { "left-asymmetric", 0},
- { "right-asymmetric", 1},
- { "left-symmetric", 2},
- { "right-symmetric", 3},
-
- { "default", 2},
- { "la", 0},
- { "ra", 1},
- { "ls", 2},
- { "rs", 3},
+ { "left-asymmetric", ALGORITHM_LEFT_ASYMMETRIC},
+ { "right-asymmetric", ALGORITHM_RIGHT_ASYMMETRIC},
+ { "left-symmetric", ALGORITHM_LEFT_SYMMETRIC},
+ { "right-symmetric", ALGORITHM_RIGHT_SYMMETRIC},
+
+ { "default", ALGORITHM_LEFT_SYMMETRIC},
+ { "la", ALGORITHM_LEFT_ASYMMETRIC},
+ { "ra", ALGORITHM_RIGHT_ASYMMETRIC},
+ { "ls", ALGORITHM_LEFT_SYMMETRIC},
+ { "rs", ALGORITHM_RIGHT_SYMMETRIC},
+
+ { "parity-first", ALGORITHM_PARITY_0},
+ { "parity-last", ALGORITHM_PARITY_N},
+ { "ddf-zero-restart", ALGORITHM_RIGHT_ASYMMETRIC},
+ { "ddf-N-restart", ALGORITHM_LEFT_ASYMMETRIC},
+ { "ddf-N-continue", ALGORITHM_LEFT_SYMMETRIC},
+
+ { NULL, 0}
+};
+mapping_t r6layout[] = {
+ { "left-asymmetric", ALGORITHM_LEFT_ASYMMETRIC},
+ { "right-asymmetric", ALGORITHM_RIGHT_ASYMMETRIC},
+ { "left-symmetric", ALGORITHM_LEFT_SYMMETRIC},
+ { "right-symmetric", ALGORITHM_RIGHT_SYMMETRIC},
+
+ { "default", ALGORITHM_LEFT_SYMMETRIC},
+ { "la", ALGORITHM_LEFT_ASYMMETRIC},
+ { "ra", ALGORITHM_RIGHT_ASYMMETRIC},
+ { "ls", ALGORITHM_LEFT_SYMMETRIC},
+ { "rs", ALGORITHM_RIGHT_SYMMETRIC},
+
+ { "parity-first", ALGORITHM_PARITY_0},
+ { "parity-last", ALGORITHM_PARITY_N},
+ { "ddf-zero-restart", ALGORITHM_ROTATING_ZERO_RESTART},
+ { "ddf-N-restart", ALGORITHM_ROTATING_N_RESTART},
+ { "ddf-N-continue", ALGORITHM_ROTATING_N_CONTINUE},
+
+ { "left-asymmetric-6", ALGORITHM_LEFT_ASYMMETRIC_6},
+ { "right-asymmetric-6", ALGORITHM_RIGHT_ASYMMETRIC_6},
+ { "left-symmetric-6", ALGORITHM_LEFT_SYMMETRIC_6},
+ { "right-symmetric-6", ALGORITHM_RIGHT_SYMMETRIC_6},
+ { "parity-first-6", ALGORITHM_PARITY_0_6},
+
{ NULL, 0}
};
{ "raid10", 10},
{ "10", 10},
{ "faulty", LEVEL_FAULTY},
+ { "container", LEVEL_CONTAINER},
{ NULL, 0}
};
+ - add 'name' field to metadata type and use it.
+ - use validate_geometry more
+ - metadata should be able to check/reject bitmap stuff.
+
+DDF:
+ Three new metadata types:
+ ddf - used only to create a container.
+ ddf-bvd - used to create an array in a container
+ ddf-svd - used to create a secondary array from bvds.
+
+ Usage:
+ mdadm -C /dev/ddf1 /dev/sd[abcdef]
+ mdadm -C /dev/md1 -e ddf /dev/sd[a-f]
+ mdadm -C /dev/md1 -l container /dev/sd[a-f]
+
+ Each of these creates a new ddf container using all those
+ devices. The name 'ddf*' signals that ddf metadata should be used.
+ '-e ddf' only supports one level - 'container'. 'container' is only
+ supported by ddf.
+
+ mdadm -C /dev/md1 -l0 -n4 /dev/ddf1 # or maybe not ???
+ mdadm -C /dev/md1 -l1 -n2 /dev/sda /dev/sdb
+ If exactly one device is given, and it is a container, we select
+ devices from that container.
+ If devices are given that are already in use, they must be in use by
+ a container, and the array is created in the container.
+ If devices given are bvds, we slip under the hood to make
+ the svd arrays.
+
+ mdadm -A /dev/ddf ......
+ base drives make a container. Anything in that container is started
+ auto-read-only.
+ if /dev/ddf is already assembled, we assemble bvds and svds inside it.
+
+
2005-dec-20
Want an incremental assembly mode to work nicely with udev.
Core usage would be something like
*/
unsigned long long total_bits = 0, read_bits = 0, dirty_bits = 0;
bitmap_info_t *info;
- char *buf, *unaligned;
+ void *buf;
int n, skip;
- unaligned = malloc(8192*2);
- buf = (char*) ((unsigned long)unaligned | 8191)+1;
+ if (posix_memalign(&buf, 512, 8192) != 0) {
+ fprintf(stderr, Name ": failed to allocate 8192 bytes\n");
+ return NULL;
+ }
n = read(fd, buf, 8192);
info = malloc(sizeof(*info));
fprintf(stderr, Name ": failed to read superblock of bitmap "
"file: %s\n", strerror(errno));
free(info);
- free(unaligned);
return NULL;
}
memcpy(&info->sb, buf, sizeof(info->sb));
/*
* mdadm - manage Linux "md" devices aka RAID arrays.
*
- * Copyright (C) 2001-2006 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
*
*
* This program is free software; you can redistribute it and/or modify
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
* Author: Neil Brown
- * Email: <neilb@cse.unsw.edu.au>
- * Paper: Neil Brown
- * School of Computer Science and Engineering
- * The University of New South Wales
- * Sydney, 2052
- * Australia
+ * Email: <neilb@suse.de>
*/
#include "mdadm.h"
* with a key word, and not be indented, or must start with a
* non-key-word and must be indented.
*
- * Keywords are DEVICE and ARRAY
+ * Keywords are DEVICE and ARRAY ... and several others.
* DEV{ICE} introduces some devices that might contain raid components.
* e.g.
* DEV style=0 /dev/sda* /dev/hd*
char DefaultConfFile[] = CONFFILE;
char DefaultAltConfFile[] = CONFFILE2;
-enum linetype { Devices, Array, Mailaddr, Mailfrom, Program, CreateDev, Homehost, LTEnd };
+enum linetype { Devices, Array, Mailaddr, Mailfrom, Program, CreateDev,
+ Homehost, AutoMode, LTEnd };
char *keywords[] = {
[Devices] = "devices",
[Array] = "array",
[Program] = "program",
[CreateDev]= "create",
[Homehost] = "homehost",
+ [AutoMode] = "auto",
[LTEnd] = NULL
};
d->devname = strdup(name);
d->next = rv;
d->used = 0;
+ d->content = NULL;
rv = d;
}
fclose(f);
return rv;
}
+mddev_dev_t load_containers(void)
+{
+	/* Build a device list from the entries returned by mdstat_read().
+	 * An entry is kept when its metadata_version starts with
+	 * "external:" and the text after that prefix is not a subarray
+	 * according to is_subarray().
+	 * Each kept entry becomes a node named "/dev/<ent->dev>" that is
+	 * pushed on the front of the list, so the result is in reverse
+	 * order.  Entries for which allocation fails are skipped.
+	 * Returns NULL if mdstat_read() gave nothing or nothing qualified.
+	 */
+	struct mdstat_ent *stat_list = mdstat_read(1, 0);
+	struct mdstat_ent *e;
+	mddev_dev_t head = NULL;
+
+	if (stat_list == NULL)
+		return NULL;
+
+	for (e = stat_list; e != NULL; e = e->next) {
+		mddev_dev_t node;
+
+		if (e->metadata_version == NULL)
+			continue;
+		if (strncmp(e->metadata_version, "external:", 9) != 0)
+			continue;
+		if (is_subarray(&e->metadata_version[9]))
+			continue;
+
+		node = malloc(sizeof(*node));
+		if (node == NULL)
+			continue;
+		if (asprintf(&node->devname, "/dev/%s", e->dev) < 0) {
+			free(node);
+			continue;
+		}
+		node->used = 0;
+		node->content = NULL;
+		node->next = head;
+		head = node;
+	}
+	free_mdstat(stat_list);
+
+	return head;
+}
+
struct createinfo createinfo = {
.autof = 2, /* by default, create devices with standard names */
.symlinks = 1,
struct conf_dev *cd;
for (w=dl_next(line); w != line; w=dl_next(w)) {
- if (w[0] == '/' || strcasecmp(w, "partitions") == 0) {
+ if (w[0] == '/' || strcasecmp(w, "partitions") == 0 ||
+ strcasecmp(w, "containers") == 0) {
cd = malloc(sizeof(*cd));
cd->name = strdup(w);
cd->next = cdevlist;
mddev_ident_t mddevlist = NULL;
mddev_ident_t *mddevlp = &mddevlist;
+static int is_number(char *w)
+{
+	/* Return non-zero iff 'w' holds one or more decimal digits and
+	 * nothing else.  The empty string is not a number.
+	 */
+	int digits = 0;
+
+	/* The <ctype.h> functions require an argument representable as
+	 * unsigned char (or EOF); plain char may be negative, so convert
+	 * before calling isdigit().
+	 */
+	while (*w && isdigit((unsigned char)*w)) {
+		digits++;
+		w++;
+	}
+	return (digits && ! *w);
+}
+
void arrayline(char *line)
{
char *w;
mis.bitmap_fd = -1;
mis.bitmap_file = NULL;
mis.name[0] = 0;
+ mis.container = NULL;
+ mis.member = NULL;
for (w=dl_next(line); w!=line; w=dl_next(w)) {
- if (w[0] == '/') {
- if (mis.devname)
- fprintf(stderr, Name ": only give one device per ARRAY line: %s and %s\n",
- mis.devname, w);
- else mis.devname = w;
+ if (w[0] == '/' || strchr(w, '=') == NULL) {
+ /* This names the device, or is '<ignore>'.
+ * The rules match those in create_mddev.
+ * 'w' must be:
+ * /dev/md/{anything}
+ * /dev/mdNN
+ * /dev/md_dNN
+ * <ignore>
+ * or anything that doesn't start '/' or '<'
+ */
+ if (strcasecmp(w, "<ignore>") == 0 ||
+ strncmp(w, "/dev/md/", 8) == 0 ||
+ (w[0] != '/' && w[0] != '<') ||
+ (strncmp(w, "/dev/md", 7) == 0 &&
+ is_number(w+7)) ||
+ (strncmp(w, "/dev/md_d", 9) == 0 &&
+ is_number(w+9))
+ ) {
+ /* This is acceptable */;
+ if (mis.devname)
+ fprintf(stderr, Name ": only give one "
+ "device per ARRAY line: %s and %s\n",
+ mis.devname, w);
+ else
+ mis.devname = w;
+ }else {
+ fprintf(stderr, Name ": %s is an invalid name for "
+ "an md device - ignored.\n", w);
+ }
} else if (strncasecmp(w, "uuid=", 5)==0 ) {
if (mis.uuid_set)
fprintf(stderr, Name ": only specify uuid once, %s ignored.\n",
} else if (strncasecmp(w, "auto=", 5) == 0 ) {
/* whether to create device special files as needed */
mis.autof = parse_auto(w+5, "auto type", 0);
+ } else if (strncasecmp(w, "member=", 7) == 0) {
+ /* subarray within a container */
+ mis.member = strdup(w+7);
+ } else if (strncasecmp(w, "container=", 10) == 0) {
+ /* the container holding this subarray. Either a device name
+ * or a uuid */
+ mis.container = strdup(w+10);
} else {
fprintf(stderr, Name ": unrecognised word on ARRAY line: %s\n",
w);
}
}
- if (mis.devname == NULL)
- fprintf(stderr, Name ": ARRAY line with no device\n");
- else if (mis.uuid_set == 0 && mis.devices == NULL && mis.super_minor == UnSet && mis.name[0] == 0)
+ if (mis.uuid_set == 0 && mis.devices == NULL &&
+ mis.super_minor == UnSet && mis.name[0] == 0 &&
+ (mis.container == NULL || mis.member == NULL))
fprintf(stderr, Name ": ARRAY line %s has no identity information.\n", mis.devname);
else {
mi = malloc(sizeof(*mi));
*mi = mis;
- mi->devname = strdup(mis.devname);
+ mi->devname = mis.devname ? strdup(mis.devname) : NULL;
mi->next = NULL;
*mddevlp = mi;
mddevlp = &mi->next;
if (alert_mail_from == NULL)
alert_mail_from = strdup(w);
else {
- char *t= NULL;
- xasprintf(&t, "%s %s", alert_mail_from, w);
- free(alert_mail_from);
- alert_mail_from = t;
+ char *t = NULL;
+
+ if (xasprintf(&t, "%s %s", alert_mail_from, w) > 0) {
+ free(alert_mail_from);
+ alert_mail_from = t;
+ }
}
}
}
}
static char *home_host = NULL;
+static int require_homehost = 1;
void homehostline(char *line)
{
char *w;
for (w=dl_next(line); w != line ; w=dl_next(w)) {
- if (home_host == NULL)
+ if (strcasecmp(w, "<ignore>")==0)
+ require_homehost = 0;
+ else if (home_host == NULL)
home_host = strdup(w);
else
fprintf(stderr, Name ": excess host name on HOMEHOST line: %s - ignored\n",
}
}
+static char *auto_options = NULL;
+void autoline(char *line)
+{
+	/* Record the word list of the first AUTO line.
+	 * The pointer itself is stored, not a copy.
+	 * Any later AUTO line is reported on stderr and otherwise ignored.
+	 */
+	if (auto_options == NULL) {
+		auto_options = line;
+		return;
+	}
+	fprintf(stderr, Name ": AUTO line may only be give once."
+		" Subsequent lines ignored\n");
+}
int loaded = 0;
case Homehost:
homehostline(line);
break;
+ case AutoMode:
+ autoline(line);
+ break;
default:
fprintf(stderr, Name ": Unknown keyword %s\n", line);
}
return alert_program;
}
-char *conf_get_homehost(void)
+char *conf_get_homehost(int *require_homehostp)
{
load_conffile();
+ if (require_homehostp)
+ *require_homehostp = require_homehost;
return home_host;
}
mddev_ident_t rv;
load_conffile();
rv = mddevlist;
- while (dev && rv && strcmp(dev, rv->devname)!=0)
+ while (dev && rv && (rv->devname == NULL
+ || !devname_matches(dev, rv->devname)))
rv = rv->next;
return rv;
}
+static void append_dlist(mddev_dev_t *dlp, mddev_dev_t list)
+{
+	/* Hook 'list' onto the end of the list headed at *dlp:
+	 * walk to the first NULL link and overwrite it with 'list'.
+	 */
+	mddev_dev_t *tail;
+
+	for (tail = dlp; *tail != NULL; tail = &(*tail)->next)
+		;
+	*tail = list;
+}
+
mddev_dev_t conf_get_devs()
{
glob_t globbuf;
load_conffile();
- if (cdevlist == NULL)
- /* default to 'partitions */
+ if (cdevlist == NULL) {
+ /* default to 'partitions' and 'containers' */
dlist = load_partitions();
+ append_dlist(&dlist, load_containers());
+ }
for (cd=cdevlist; cd; cd=cd->next) {
- if (strcasecmp(cd->name, "partitions")==0 && dlist == NULL)
- dlist = load_partitions();
+ if (strcasecmp(cd->name, "partitions")==0)
+ append_dlist(&dlist, load_partitions());
+ else if (strcasecmp(cd->name, "containers")==0)
+ append_dlist(&dlist, load_containers());
else {
glob(cd->name, flags, NULL, &globbuf);
flags |= GLOB_APPEND;
t->devname = strdup(globbuf.gl_pathv[i]);
t->next = dlist;
t->used = 0;
+ t->content = NULL;
dlist = t;
/* printf("one dev is %s\n", t->devname);*/
}
return 0;
}
+int conf_test_metadata(const char *version)
+{
+	/* Decide whether arrays with the given metadata version may be
+	 * auto-assembled.  Returns 1 for yes, 0 for no.
+	 * Without an AUTO line the answer is always yes.
+	 * Otherwise the words of the AUTO line are examined in order and
+	 * the first one that matches decides:
+	 *   +version    - that version can be assembled
+	 *   -version    - that version cannot be auto-assembled
+	 *   yes or +all - any other version can be assembled
+	 *   no or -all  - no other version can be assembled.
+	 * Words that start with neither '+' nor '-' (other than yes/no)
+	 * are skipped.  If no word matches, the answer is yes.
+	 */
+	char *word;
+
+	load_conffile();
+	if (auto_options == NULL)
+		return 1;
+
+	word = dl_next(auto_options);
+	while (word != auto_options) {
+		int verdict;
+
+		if (strcasecmp(word, "yes") == 0)
+			return 1;
+		if (strcasecmp(word, "no") == 0)
+			return 0;
+
+		switch (word[0]) {
+		case '+':
+			verdict = 1;
+			break;
+		case '-':
+			verdict = 0;
+			break;
+		default:
+			verdict = -1;	/* not a +/- word */
+			break;
+		}
+
+		if (verdict >= 0) {
+			char *tag = word + 1;
+
+			if (strcasecmp(tag, "all") == 0 ||
+			    strcasecmp(tag, version) == 0)
+				return verdict;
+			if (version[1] == '.') {
+				/* a single character such as '0' matches
+				 * a version such as '0.90'
+				 */
+				if (strlen(tag) == 1 && tag[0] == version[0])
+					return verdict;
+				/* for a version of the form 'N.x', a word
+				 * starting 'N.' (1.whatever) matches
+				 */
+				if (version[2] == 'x' &&
+				    strncmp(tag, version, 2) == 0)
+					return verdict;
+			}
+		}
+		word = dl_next(word);
+	}
+	return 1;
+}
int match_oneof(char *devices, char *devname)
{
}
return 0;
}
+
+static char *strip_dev_prefix(char *s)
+{
+	/* Reduce a device name to its distinguishing part: drop a leading
+	 * "/dev/md/" or else a leading "/dev/", then drop "md" when it is
+	 * immediately followed by a digit (so "md3" becomes "3").
+	 */
+	if (strncmp(s, "/dev/md/", 8) == 0)
+		s += 8;
+	else if (strncmp(s, "/dev/", 5) == 0)
+		s += 5;
+
+	/* isdigit() needs an unsigned char value; plain char may be negative */
+	if (strncmp(s, "md", 2) == 0 && isdigit((unsigned char)s[2]))
+		s += 2;
+	return s;
+}
+
+int devname_matches(char *name, char *match)
+{
+	/* See if the given array name matches the
+	 * given match from config file.
+	 *
+	 * Both strings are normalised the same way: first strip any
+	 * /dev/md/ or /dev/ prefix, then allow a numeric match of
+	 * mdNN with NN by dropping "md" before a digit, and finally
+	 * compare what is left with strcmp.
+	 * Returns 1 on a match, 0 otherwise.
+	 */
+	return (strcmp(strip_dev_prefix(name), strip_dev_prefix(match)) == 0);
+}
+
+int conf_name_is_free(char *name)
+{
+	/* Check if this name is already taken by an ARRAY entry in
+	 * the config file.
+	 * It can be taken either by a match on devname, name, or
+	 * even super-minor (compared as its decimal text).
+	 * Returns 0 if some entry claims the name, 1 if it is free.
+	 */
+	mddev_ident_t dev;
+
+	load_conffile();
+	for (dev = mddevlist; dev; dev = dev->next) {
+		if (dev->devname && devname_matches(name, dev->devname))
+			return 0;
+		if (dev->name[0] && devname_matches(name, dev->name))
+			return 0;
+		if (dev->super_minor != UnSet) {
+			/* only format the number when it will be used,
+			 * and bound the write to the buffer size
+			 */
+			char nbuf[32];
+
+			snprintf(nbuf, sizeof(nbuf), "%d", dev->super_minor);
+			if (devname_matches(name, nbuf))
+				return 0;
+		}
+	}
+	return 1;
+}
+
+struct mddev_ident_s *conf_match(struct mdinfo *info, struct supertype *st)
+{
+	/* Find the one ARRAY entry from the config file that agrees with
+	 * 'info'.
+	 * An entry is rejected when any identity field it sets (uuid,
+	 * name, devices, super-minor) disagrees with 'info', or when it
+	 * sets none of those fields at all.
+	 * Returns the single surviving entry, NULL if there is none, and
+	 * NULL (after a message on stderr) if more than one survives.
+	 *
+	 * 'verbose' and 'devname' are fixed at 0 and NULL here, so the
+	 * "verbose >= 2" diagnostics and the devices comparison never
+	 * trigger as the code stands.
+	 */
+	struct mddev_ident_s *cand, *found = NULL;
+	int verbose = 0;
+	char *devname = NULL;
+
+	for (cand = conf_get_ident(NULL); cand; cand = cand->next) {
+		int rejected = 1;
+
+		if (cand->uuid_set &&
+		    same_uuid(cand->uuid, info->uuid, st->ss->swapuuid)
+		    == 0) {
+			if (verbose >= 2 && cand->devname)
+				fprintf(stderr, Name
+					": UUID differs from %s.\n",
+					cand->devname);
+		} else if (cand->name[0] &&
+			   strcasecmp(cand->name, info->name) != 0) {
+			if (verbose >= 2 && cand->devname)
+				fprintf(stderr, Name
+					": Name differs from %s.\n",
+					cand->devname);
+		} else if (cand->devices && devname &&
+			   !match_oneof(cand->devices, devname)) {
+			if (verbose >= 2 && cand->devname)
+				fprintf(stderr, Name
+					": Not a listed device for %s.\n",
+					cand->devname);
+		} else if (cand->super_minor != UnSet &&
+			   cand->super_minor != info->array.md_minor) {
+			if (verbose >= 2 && cand->devname)
+				fprintf(stderr, Name
+					": Different super-minor to %s.\n",
+					cand->devname);
+		} else if (!cand->uuid_set &&
+			   !cand->name[0] &&
+			   !cand->devices &&
+			   cand->super_minor == UnSet) {
+			if (verbose >= 2 && cand->devname)
+				fprintf(stderr, Name
+					": %s doesn't have any identifying information.\n",
+					cand->devname);
+		} else
+			rejected = 0;
+
+		if (rejected)
+			continue;
+		/* FIXME, should I check raid_disks and level too?? */
+
+		if (found == NULL) {
+			found = cand;
+			continue;
+		}
+
+		/* a second entry also fits: ambiguous, give up */
+		if (verbose >= 0) {
+			if (found->devname && cand->devname)
+				fprintf(stderr, Name
+					": we match both %s and %s - cannot decide which to use.\n",
+					found->devname, cand->devname);
+			else
+				fprintf(stderr, Name
+					": multiple lines in mdadm.conf match\n");
+		}
+		return NULL;
+	}
+	return found;
+}
--- /dev/null
+/* crc32.c -- compute the CRC-32 of a data stream
+ * Copyright (C) 1995-2003 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ *
+ * Thanks to Rodney Brown <rbrown64@csc.com.au> for his contribution of faster
+ * CRC methods: exclusive-oring 32 bits of data at a time, and pre-computing
+ * tables for updating the shift register in one step with three exclusive-ors
+ * instead of four steps with four exclusive-ors. This results about a factor
+ * of two increase in speed on a Power PC G4 (PPC7455) using gcc -O3.
+ */
+
+/* @(#) $Id$ */
+
+/*
+ Note on the use of DYNAMIC_CRC_TABLE: there is no mutex or semaphore
+ protection on the static variables used to control the first-use generation
+ of the crc tables. Therefore, if you #define DYNAMIC_CRC_TABLE, you should
+ first call get_crc_table() to initialize the tables before allowing more than
+ one thread to use crc32().
+ */
+
+#ifdef MAKECRCH
+# include <stdio.h>
+# ifndef DYNAMIC_CRC_TABLE
+# define DYNAMIC_CRC_TABLE
+# endif /* !DYNAMIC_CRC_TABLE */
+#endif /* MAKECRCH */
+
+/* #include "zutil.h" / * for STDC and FAR definitions */
+#define STDC
+#define FAR
+#define Z_NULL ((void*)0)
+#define OF(X) X
+#define ZEXPORT
+typedef long ptrdiff_t;
+#define NOBYFOUR
+
+#define local static
+
+/* Find a four-byte integer type for crc32_little() and crc32_big(). */
+#ifndef NOBYFOUR
+# ifdef STDC /* need ANSI C limits.h to determine sizes */
+# include <limits.h>
+# define BYFOUR
+# if (UINT_MAX == 0xffffffffUL)
+ typedef unsigned int u4;
+# else
+# if (ULONG_MAX == 0xffffffffUL)
+ typedef unsigned long u4;
+# else
+# if (USHRT_MAX == 0xffffffffUL)
+ typedef unsigned short u4;
+# else
+# undef BYFOUR /* can't find a four-byte integer type! */
+# endif
+# endif
+# endif
+# endif /* STDC */
+#endif /* !NOBYFOUR */
+
+/* Definitions for doing the crc four data bytes at a time. */
+#ifdef BYFOUR
+# define REV(w) (((w)>>24)+(((w)>>8)&0xff00)+ \
+ (((w)&0xff00)<<8)+(((w)&0xff)<<24))
+ local unsigned long crc32_little OF((unsigned long,
+ const unsigned char FAR *, unsigned));
+ local unsigned long crc32_big OF((unsigned long,
+ const unsigned char FAR *, unsigned));
+# define TBLS 8
+#else
+# define TBLS 1
+#endif /* BYFOUR */
+
+#ifdef DYNAMIC_CRC_TABLE
+
+local volatile int crc_table_empty = 1;
+local unsigned long FAR crc_table[TBLS][256];
+local void make_crc_table OF((void));
+#ifdef MAKECRCH
+ local void write_table OF((FILE *, const unsigned long FAR *));
+#endif /* MAKECRCH */
+
+/*
+ Generate tables for a byte-wise 32-bit CRC calculation on the polynomial:
+ x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1.
+
+ Polynomials over GF(2) are represented in binary, one bit per coefficient,
+ with the lowest powers in the most significant bit. Then adding polynomials
+ is just exclusive-or, and multiplying a polynomial by x is a right shift by
+ one. If we call the above polynomial p, and represent a byte as the
+ polynomial q, also with the lowest power in the most significant bit (so the
+ byte 0xb1 is the polynomial x^7+x^3+x+1), then the CRC is (q*x^32) mod p,
+ where a mod b means the remainder after dividing a by b.
+
+ This calculation is done using the shift-register method of multiplying and
+ taking the remainder. The register is initialized to zero, and for each
+ incoming bit, x^32 is added mod p to the register if the bit is a one (where
+ x^32 mod p is p+x^32 = x^26+...+1), and the register is multiplied mod p by
+ x (which is shifting right by one and adding x^32 mod p if the bit shifted
+ out is a one). We start with the highest power (least significant bit) of
+ q and repeat for all eight bits of q.
+
+ The first table is simply the CRC of all possible eight bit values. This is
+ all the information needed to generate CRCs on data a byte at a time for all
+ combinations of CRC register values and incoming bytes. The remaining tables
+ allow for word-at-a-time CRC calculation for both big-endian and little-
+ endian machines, where a word is four bytes.
+*/
+local void make_crc_table()
+{ /* Fill crc_table[] at run time.  This definition sits inside
+   * "#ifdef DYNAMIC_CRC_TABLE", so it is only compiled when that macro
+   * is defined.  Table 0 is always built; tables 1..7 only when BYFOUR
+   * is defined.  When MAKECRCH is defined the tables are additionally
+   * written to a file named "crc32.h", opened by relative path.
+   */
+ unsigned long c;
+ int n, k;
+ unsigned long poly; /* polynomial exclusive-or pattern */
+ /* terms of polynomial defining this crc (except x^32): */
+ static volatile int first = 1; /* flag to limit concurrent making */
+ static const unsigned char p[] = {0,1,2,4,5,7,8,10,11,12,16,22,23,26};
+
+ /* See if another task is already doing this (not thread-safe, but better
+ than nothing -- significantly reduces duration of vulnerability in
+ case the advice about DYNAMIC_CRC_TABLE is ignored) */
+ if (first) {
+ first = 0;
+
+ /* make exclusive-or pattern from polynomial (0xedb88320UL) */
+ poly = 0UL;
+ for (n = 0; n < sizeof(p)/sizeof(unsigned char); n++) /* one bit per listed term */
+ poly |= 1UL << (31 - p[n]);
+
+ /* generate a crc for every 8-bit value */
+ for (n = 0; n < 256; n++) {
+ c = (unsigned long)n;
+ for (k = 0; k < 8; k++)
+ c = c & 1 ? poly ^ (c >> 1) : c >> 1;
+ crc_table[0][n] = c;
+ }
+
+#ifdef BYFOUR
+ /* generate crc for each value followed by one, two, and three zeros,
+ and then the byte reversal of those as well as the first table */
+ for (n = 0; n < 256; n++) {
+ c = crc_table[0][n];
+ crc_table[4][n] = REV(c);
+ for (k = 1; k < 4; k++) {
+ c = crc_table[0][c & 0xff] ^ (c >> 8);
+ crc_table[k][n] = c;
+ crc_table[k + 4][n] = REV(c);
+ }
+ }
+#endif /* BYFOUR */
+
+ crc_table_empty = 0; /* cleared only after every table entry is written */
+ }
+ else { /* not first */
+ /* wait for the other guy to finish (not efficient, but rare) */
+ while (crc_table_empty) /* busy-wait on the volatile flag */
+ ;
+ }
+
+#ifdef MAKECRCH
+ /* write out CRC tables to crc32.h */
+ {
+ FILE *out;
+
+ out = fopen("crc32.h", "w");
+ if (out == NULL) return; /* cannot create the file: give up without a message */
+ fprintf(out, "/* crc32.h -- tables for rapid CRC calculation\n");
+ fprintf(out, " * Generated automatically by crc32.c\n */\n\n");
+ fprintf(out, "local const unsigned long FAR ");
+ fprintf(out, "crc_table[TBLS][256] =\n{\n {\n");
+ write_table(out, crc_table[0]);
+# ifdef BYFOUR
+ fprintf(out, "#ifdef BYFOUR\n");
+ for (k = 1; k < 8; k++) {
+ fprintf(out, " },\n {\n");
+ write_table(out, crc_table[k]);
+ }
+ fprintf(out, "#endif\n");
+# endif /* BYFOUR */
+ fprintf(out, " }\n};\n");
+ fclose(out);
+ }
+#endif /* MAKECRCH */
+}
+
+#ifdef MAKECRCH
+local void write_table(FILE *out, const unsigned long FAR *table)
+{
+	/* Print the 256 entries of one table to 'out' as hexadecimal
+	 * unsigned long constants, five per line, comma separated, with
+	 * no comma after the last entry.
+	 */
+	int idx;
+
+	for (idx = 0; idx < 256; idx++) {
+		/* text before the value: only at the start of a line */
+		const char *lead = (idx % 5) ? "" : " ";
+		/* text after the value: end of table, end of line, or mid-line */
+		const char *tail;
+
+		if (idx == 255)
+			tail = "\n";
+		else if (idx % 5 == 4)
+			tail = ",\n";
+		else
+			tail = ", ";
+
+		fprintf(out, "%s0x%08lxUL%s", lead, table[idx], tail);
+	}
+}
+#endif /* MAKECRCH */
+
+#else /* !DYNAMIC_CRC_TABLE */
+/* ========================================================================
+ * Tables of CRC-32s of all single-byte values, made by make_crc_table().
+ */
+#include "crc32.h"
+#endif /* DYNAMIC_CRC_TABLE */
+
+/* =========================================================================
+ * This function can be used by asm versions of crc32()
+ */
+const unsigned long FAR * ZEXPORT get_crc_table(void)
+{
+	/* Return the address of the first entry of crc_table.
+	 * When the tables are generated at run time (DYNAMIC_CRC_TABLE)
+	 * they are built first if still empty.
+	 */
+#ifdef DYNAMIC_CRC_TABLE
+	if (crc_table_empty != 0)
+		make_crc_table();
+#endif /* DYNAMIC_CRC_TABLE */
+	return &crc_table[0][0];
+}
+
+/* ========================================================================= */
+#define DO1 crc = crc_table[0][((int)crc ^ (*buf++)) & 0xff] ^ (crc >> 8)
+#define DO8 DO1; DO1; DO1; DO1; DO1; DO1; DO1; DO1
+
+/* ========================================================================= */
+unsigned long ZEXPORT crc32(
+	unsigned long crc,
+	const unsigned char FAR *buf,
+	unsigned len)
+{
+	/* Fold 'len' bytes at 'buf' into the running value 'crc' and
+	 * return the updated value.  A NULL 'buf' yields 0.
+	 * In the byte-wise path below, the xor with 0xffffffffUL before
+	 * and after the loop is commented out in this copy, so the value
+	 * is neither pre- nor post-inverted there.
+	 */
+	if (buf == Z_NULL)
+		return 0UL;
+
+#ifdef DYNAMIC_CRC_TABLE
+	if (crc_table_empty)
+		make_crc_table();
+#endif /* DYNAMIC_CRC_TABLE */
+
+#ifdef BYFOUR
+	if (sizeof(void *) == sizeof(ptrdiff_t)) {
+		u4 endian = 1;
+
+		/* the first byte of a word holding 1 is non-zero only
+		 * when the low-order byte is stored first
+		 */
+		return *((unsigned char *)(&endian))
+			? crc32_little(crc, buf, len)
+			: crc32_big(crc, buf, len);
+	}
+#endif /* BYFOUR */
+/* crc = crc ^ 0xffffffffUL;*/
+	/* eight bytes per pass while that many remain */
+	for (; len >= 8; len -= 8) {
+		DO8;
+	}
+	/* then the 0..7 leftover bytes one at a time */
+	for (; len != 0; len--) {
+		DO1;
+	}
+	return crc /* ^ 0xffffffffUL*/;
+}
+
+#ifdef BYFOUR
+
+/* ========================================================================= */
+#define DOLIT4 c ^= *buf4++; \
+ c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
+ crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24]
+#define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4
+
+/* ========================================================================= */
+local unsigned long crc32_little(crc, buf, len)
+ unsigned long crc;
+ const unsigned char FAR *buf;
+ unsigned len;
+{ /* Word-at-a-time CRC update; crc32() selects this when the first
+   * byte of a word holding 1 is non-zero.  Only compiled under BYFOUR.
+   * NOTE(review): this file defines NOBYFOUR, which prevents BYFOUR
+   * from being defined here, so this looks compiled out unless BYFOUR
+   * comes from elsewhere - confirm.
+   * Unlike the byte-wise loop in crc32(), the value is inverted on
+   * entry and again on exit.
+   */
+ register u4 c;
+ register const u4 FAR *buf4;
+
+ c = (u4)crc;
+ c = ~c;
+ while (len && ((ptrdiff_t)buf & 3)) { /* single bytes until the low two address bits are zero */
+ c = crc_table[0][(c ^ *buf++) & 0xff] ^ (c >> 8);
+ len--;
+ }
+
+ buf4 = (const u4 FAR *)buf;
+ while (len >= 32) { /* DOLIT32 is eight DOLIT4 steps, each reading *buf4++ */
+ DOLIT32;
+ len -= 32;
+ }
+ while (len >= 4) {
+ DOLIT4;
+ len -= 4;
+ }
+ buf = (const unsigned char FAR *)buf4;
+
+ if (len) do { /* remaining 1..3 bytes */
+ c = crc_table[0][(c ^ *buf++) & 0xff] ^ (c >> 8);
+ } while (--len);
+ c = ~c;
+ return (unsigned long)c;
+}
+
+/* ========================================================================= */
+#define DOBIG4 c ^= *++buf4; \
+ c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
+ crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
+#define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4
+
+/* ========================================================================= */
+local unsigned long crc32_big(crc, buf, len)
+ unsigned long crc;
+ const unsigned char FAR *buf;
+ unsigned len;
+{ /* Word-at-a-time CRC update; crc32() selects this when the first
+   * byte of a word holding 1 is zero.  Only compiled under BYFOUR.
+   * NOTE(review): this file defines NOBYFOUR, which prevents BYFOUR
+   * from being defined here, so this looks compiled out unless BYFOUR
+   * comes from elsewhere - confirm.
+   * The running value is kept byte-reversed (REV) while working and
+   * is inverted on entry and again on exit.
+   */
+ register u4 c;
+ register const u4 FAR *buf4;
+
+ c = REV((u4)crc);
+ c = ~c;
+ while (len && ((ptrdiff_t)buf & 3)) { /* single bytes until the low two address bits are zero */
+ c = crc_table[4][(c >> 24) ^ *buf++] ^ (c << 8);
+ len--;
+ }
+
+ buf4 = (const u4 FAR *)buf;
+ buf4--; /* DOBIG4 reads through *++buf4, so start one word early */
+ while (len >= 32) {
+ DOBIG32;
+ len -= 32;
+ }
+ while (len >= 4) {
+ DOBIG4;
+ len -= 4;
+ }
+ buf4++; /* undo the bias so buf4 points at the first unread word */
+ buf = (const unsigned char FAR *)buf4;
+
+ if (len) do { /* remaining 1..3 bytes */
+ c = crc_table[4][(c >> 24) ^ *buf++] ^ (c << 8);
+ } while (--len);
+ c = ~c;
+ return (unsigned long)(REV(c));
+}
+
+#endif /* BYFOUR */
--- /dev/null
+/* crc32.h -- tables for rapid CRC calculation
+ * Generated automatically by crc32.c
+ */
+
+local const unsigned long FAR crc_table[TBLS][256] =
+{
+ {
+ 0x00000000UL, 0x77073096UL, 0xee0e612cUL, 0x990951baUL, 0x076dc419UL,
+ 0x706af48fUL, 0xe963a535UL, 0x9e6495a3UL, 0x0edb8832UL, 0x79dcb8a4UL,
+ 0xe0d5e91eUL, 0x97d2d988UL, 0x09b64c2bUL, 0x7eb17cbdUL, 0xe7b82d07UL,
+ 0x90bf1d91UL, 0x1db71064UL, 0x6ab020f2UL, 0xf3b97148UL, 0x84be41deUL,
+ 0x1adad47dUL, 0x6ddde4ebUL, 0xf4d4b551UL, 0x83d385c7UL, 0x136c9856UL,
+ 0x646ba8c0UL, 0xfd62f97aUL, 0x8a65c9ecUL, 0x14015c4fUL, 0x63066cd9UL,
+ 0xfa0f3d63UL, 0x8d080df5UL, 0x3b6e20c8UL, 0x4c69105eUL, 0xd56041e4UL,
+ 0xa2677172UL, 0x3c03e4d1UL, 0x4b04d447UL, 0xd20d85fdUL, 0xa50ab56bUL,
+ 0x35b5a8faUL, 0x42b2986cUL, 0xdbbbc9d6UL, 0xacbcf940UL, 0x32d86ce3UL,
+ 0x45df5c75UL, 0xdcd60dcfUL, 0xabd13d59UL, 0x26d930acUL, 0x51de003aUL,
+ 0xc8d75180UL, 0xbfd06116UL, 0x21b4f4b5UL, 0x56b3c423UL, 0xcfba9599UL,
+ 0xb8bda50fUL, 0x2802b89eUL, 0x5f058808UL, 0xc60cd9b2UL, 0xb10be924UL,
+ 0x2f6f7c87UL, 0x58684c11UL, 0xc1611dabUL, 0xb6662d3dUL, 0x76dc4190UL,
+ 0x01db7106UL, 0x98d220bcUL, 0xefd5102aUL, 0x71b18589UL, 0x06b6b51fUL,
+ 0x9fbfe4a5UL, 0xe8b8d433UL, 0x7807c9a2UL, 0x0f00f934UL, 0x9609a88eUL,
+ 0xe10e9818UL, 0x7f6a0dbbUL, 0x086d3d2dUL, 0x91646c97UL, 0xe6635c01UL,
+ 0x6b6b51f4UL, 0x1c6c6162UL, 0x856530d8UL, 0xf262004eUL, 0x6c0695edUL,
+ 0x1b01a57bUL, 0x8208f4c1UL, 0xf50fc457UL, 0x65b0d9c6UL, 0x12b7e950UL,
+ 0x8bbeb8eaUL, 0xfcb9887cUL, 0x62dd1ddfUL, 0x15da2d49UL, 0x8cd37cf3UL,
+ 0xfbd44c65UL, 0x4db26158UL, 0x3ab551ceUL, 0xa3bc0074UL, 0xd4bb30e2UL,
+ 0x4adfa541UL, 0x3dd895d7UL, 0xa4d1c46dUL, 0xd3d6f4fbUL, 0x4369e96aUL,
+ 0x346ed9fcUL, 0xad678846UL, 0xda60b8d0UL, 0x44042d73UL, 0x33031de5UL,
+ 0xaa0a4c5fUL, 0xdd0d7cc9UL, 0x5005713cUL, 0x270241aaUL, 0xbe0b1010UL,
+ 0xc90c2086UL, 0x5768b525UL, 0x206f85b3UL, 0xb966d409UL, 0xce61e49fUL,
+ 0x5edef90eUL, 0x29d9c998UL, 0xb0d09822UL, 0xc7d7a8b4UL, 0x59b33d17UL,
+ 0x2eb40d81UL, 0xb7bd5c3bUL, 0xc0ba6cadUL, 0xedb88320UL, 0x9abfb3b6UL,
+ 0x03b6e20cUL, 0x74b1d29aUL, 0xead54739UL, 0x9dd277afUL, 0x04db2615UL,
+ 0x73dc1683UL, 0xe3630b12UL, 0x94643b84UL, 0x0d6d6a3eUL, 0x7a6a5aa8UL,
+ 0xe40ecf0bUL, 0x9309ff9dUL, 0x0a00ae27UL, 0x7d079eb1UL, 0xf00f9344UL,
+ 0x8708a3d2UL, 0x1e01f268UL, 0x6906c2feUL, 0xf762575dUL, 0x806567cbUL,
+ 0x196c3671UL, 0x6e6b06e7UL, 0xfed41b76UL, 0x89d32be0UL, 0x10da7a5aUL,
+ 0x67dd4accUL, 0xf9b9df6fUL, 0x8ebeeff9UL, 0x17b7be43UL, 0x60b08ed5UL,
+ 0xd6d6a3e8UL, 0xa1d1937eUL, 0x38d8c2c4UL, 0x4fdff252UL, 0xd1bb67f1UL,
+ 0xa6bc5767UL, 0x3fb506ddUL, 0x48b2364bUL, 0xd80d2bdaUL, 0xaf0a1b4cUL,
+ 0x36034af6UL, 0x41047a60UL, 0xdf60efc3UL, 0xa867df55UL, 0x316e8eefUL,
+ 0x4669be79UL, 0xcb61b38cUL, 0xbc66831aUL, 0x256fd2a0UL, 0x5268e236UL,
+ 0xcc0c7795UL, 0xbb0b4703UL, 0x220216b9UL, 0x5505262fUL, 0xc5ba3bbeUL,
+ 0xb2bd0b28UL, 0x2bb45a92UL, 0x5cb36a04UL, 0xc2d7ffa7UL, 0xb5d0cf31UL,
+ 0x2cd99e8bUL, 0x5bdeae1dUL, 0x9b64c2b0UL, 0xec63f226UL, 0x756aa39cUL,
+ 0x026d930aUL, 0x9c0906a9UL, 0xeb0e363fUL, 0x72076785UL, 0x05005713UL,
+ 0x95bf4a82UL, 0xe2b87a14UL, 0x7bb12baeUL, 0x0cb61b38UL, 0x92d28e9bUL,
+ 0xe5d5be0dUL, 0x7cdcefb7UL, 0x0bdbdf21UL, 0x86d3d2d4UL, 0xf1d4e242UL,
+ 0x68ddb3f8UL, 0x1fda836eUL, 0x81be16cdUL, 0xf6b9265bUL, 0x6fb077e1UL,
+ 0x18b74777UL, 0x88085ae6UL, 0xff0f6a70UL, 0x66063bcaUL, 0x11010b5cUL,
+ 0x8f659effUL, 0xf862ae69UL, 0x616bffd3UL, 0x166ccf45UL, 0xa00ae278UL,
+ 0xd70dd2eeUL, 0x4e048354UL, 0x3903b3c2UL, 0xa7672661UL, 0xd06016f7UL,
+ 0x4969474dUL, 0x3e6e77dbUL, 0xaed16a4aUL, 0xd9d65adcUL, 0x40df0b66UL,
+ 0x37d83bf0UL, 0xa9bcae53UL, 0xdebb9ec5UL, 0x47b2cf7fUL, 0x30b5ffe9UL,
+ 0xbdbdf21cUL, 0xcabac28aUL, 0x53b39330UL, 0x24b4a3a6UL, 0xbad03605UL,
+ 0xcdd70693UL, 0x54de5729UL, 0x23d967bfUL, 0xb3667a2eUL, 0xc4614ab8UL,
+ 0x5d681b02UL, 0x2a6f2b94UL, 0xb40bbe37UL, 0xc30c8ea1UL, 0x5a05df1bUL,
+ 0x2d02ef8dUL
+#ifdef BYFOUR
+ },
+ {
+ 0x00000000UL, 0x191b3141UL, 0x32366282UL, 0x2b2d53c3UL, 0x646cc504UL,
+ 0x7d77f445UL, 0x565aa786UL, 0x4f4196c7UL, 0xc8d98a08UL, 0xd1c2bb49UL,
+ 0xfaefe88aUL, 0xe3f4d9cbUL, 0xacb54f0cUL, 0xb5ae7e4dUL, 0x9e832d8eUL,
+ 0x87981ccfUL, 0x4ac21251UL, 0x53d92310UL, 0x78f470d3UL, 0x61ef4192UL,
+ 0x2eaed755UL, 0x37b5e614UL, 0x1c98b5d7UL, 0x05838496UL, 0x821b9859UL,
+ 0x9b00a918UL, 0xb02dfadbUL, 0xa936cb9aUL, 0xe6775d5dUL, 0xff6c6c1cUL,
+ 0xd4413fdfUL, 0xcd5a0e9eUL, 0x958424a2UL, 0x8c9f15e3UL, 0xa7b24620UL,
+ 0xbea97761UL, 0xf1e8e1a6UL, 0xe8f3d0e7UL, 0xc3de8324UL, 0xdac5b265UL,
+ 0x5d5daeaaUL, 0x44469febUL, 0x6f6bcc28UL, 0x7670fd69UL, 0x39316baeUL,
+ 0x202a5aefUL, 0x0b07092cUL, 0x121c386dUL, 0xdf4636f3UL, 0xc65d07b2UL,
+ 0xed705471UL, 0xf46b6530UL, 0xbb2af3f7UL, 0xa231c2b6UL, 0x891c9175UL,
+ 0x9007a034UL, 0x179fbcfbUL, 0x0e848dbaUL, 0x25a9de79UL, 0x3cb2ef38UL,
+ 0x73f379ffUL, 0x6ae848beUL, 0x41c51b7dUL, 0x58de2a3cUL, 0xf0794f05UL,
+ 0xe9627e44UL, 0xc24f2d87UL, 0xdb541cc6UL, 0x94158a01UL, 0x8d0ebb40UL,
+ 0xa623e883UL, 0xbf38d9c2UL, 0x38a0c50dUL, 0x21bbf44cUL, 0x0a96a78fUL,
+ 0x138d96ceUL, 0x5ccc0009UL, 0x45d73148UL, 0x6efa628bUL, 0x77e153caUL,
+ 0xbabb5d54UL, 0xa3a06c15UL, 0x888d3fd6UL, 0x91960e97UL, 0xded79850UL,
+ 0xc7cca911UL, 0xece1fad2UL, 0xf5facb93UL, 0x7262d75cUL, 0x6b79e61dUL,
+ 0x4054b5deUL, 0x594f849fUL, 0x160e1258UL, 0x0f152319UL, 0x243870daUL,
+ 0x3d23419bUL, 0x65fd6ba7UL, 0x7ce65ae6UL, 0x57cb0925UL, 0x4ed03864UL,
+ 0x0191aea3UL, 0x188a9fe2UL, 0x33a7cc21UL, 0x2abcfd60UL, 0xad24e1afUL,
+ 0xb43fd0eeUL, 0x9f12832dUL, 0x8609b26cUL, 0xc94824abUL, 0xd05315eaUL,
+ 0xfb7e4629UL, 0xe2657768UL, 0x2f3f79f6UL, 0x362448b7UL, 0x1d091b74UL,
+ 0x04122a35UL, 0x4b53bcf2UL, 0x52488db3UL, 0x7965de70UL, 0x607eef31UL,
+ 0xe7e6f3feUL, 0xfefdc2bfUL, 0xd5d0917cUL, 0xcccba03dUL, 0x838a36faUL,
+ 0x9a9107bbUL, 0xb1bc5478UL, 0xa8a76539UL, 0x3b83984bUL, 0x2298a90aUL,
+ 0x09b5fac9UL, 0x10aecb88UL, 0x5fef5d4fUL, 0x46f46c0eUL, 0x6dd93fcdUL,
+ 0x74c20e8cUL, 0xf35a1243UL, 0xea412302UL, 0xc16c70c1UL, 0xd8774180UL,
+ 0x9736d747UL, 0x8e2de606UL, 0xa500b5c5UL, 0xbc1b8484UL, 0x71418a1aUL,
+ 0x685abb5bUL, 0x4377e898UL, 0x5a6cd9d9UL, 0x152d4f1eUL, 0x0c367e5fUL,
+ 0x271b2d9cUL, 0x3e001cddUL, 0xb9980012UL, 0xa0833153UL, 0x8bae6290UL,
+ 0x92b553d1UL, 0xddf4c516UL, 0xc4eff457UL, 0xefc2a794UL, 0xf6d996d5UL,
+ 0xae07bce9UL, 0xb71c8da8UL, 0x9c31de6bUL, 0x852aef2aUL, 0xca6b79edUL,
+ 0xd37048acUL, 0xf85d1b6fUL, 0xe1462a2eUL, 0x66de36e1UL, 0x7fc507a0UL,
+ 0x54e85463UL, 0x4df36522UL, 0x02b2f3e5UL, 0x1ba9c2a4UL, 0x30849167UL,
+ 0x299fa026UL, 0xe4c5aeb8UL, 0xfdde9ff9UL, 0xd6f3cc3aUL, 0xcfe8fd7bUL,
+ 0x80a96bbcUL, 0x99b25afdUL, 0xb29f093eUL, 0xab84387fUL, 0x2c1c24b0UL,
+ 0x350715f1UL, 0x1e2a4632UL, 0x07317773UL, 0x4870e1b4UL, 0x516bd0f5UL,
+ 0x7a468336UL, 0x635db277UL, 0xcbfad74eUL, 0xd2e1e60fUL, 0xf9ccb5ccUL,
+ 0xe0d7848dUL, 0xaf96124aUL, 0xb68d230bUL, 0x9da070c8UL, 0x84bb4189UL,
+ 0x03235d46UL, 0x1a386c07UL, 0x31153fc4UL, 0x280e0e85UL, 0x674f9842UL,
+ 0x7e54a903UL, 0x5579fac0UL, 0x4c62cb81UL, 0x8138c51fUL, 0x9823f45eUL,
+ 0xb30ea79dUL, 0xaa1596dcUL, 0xe554001bUL, 0xfc4f315aUL, 0xd7626299UL,
+ 0xce7953d8UL, 0x49e14f17UL, 0x50fa7e56UL, 0x7bd72d95UL, 0x62cc1cd4UL,
+ 0x2d8d8a13UL, 0x3496bb52UL, 0x1fbbe891UL, 0x06a0d9d0UL, 0x5e7ef3ecUL,
+ 0x4765c2adUL, 0x6c48916eUL, 0x7553a02fUL, 0x3a1236e8UL, 0x230907a9UL,
+ 0x0824546aUL, 0x113f652bUL, 0x96a779e4UL, 0x8fbc48a5UL, 0xa4911b66UL,
+ 0xbd8a2a27UL, 0xf2cbbce0UL, 0xebd08da1UL, 0xc0fdde62UL, 0xd9e6ef23UL,
+ 0x14bce1bdUL, 0x0da7d0fcUL, 0x268a833fUL, 0x3f91b27eUL, 0x70d024b9UL,
+ 0x69cb15f8UL, 0x42e6463bUL, 0x5bfd777aUL, 0xdc656bb5UL, 0xc57e5af4UL,
+ 0xee530937UL, 0xf7483876UL, 0xb809aeb1UL, 0xa1129ff0UL, 0x8a3fcc33UL,
+ 0x9324fd72UL
+ },
+ {
+ 0x00000000UL, 0x01c26a37UL, 0x0384d46eUL, 0x0246be59UL, 0x0709a8dcUL,
+ 0x06cbc2ebUL, 0x048d7cb2UL, 0x054f1685UL, 0x0e1351b8UL, 0x0fd13b8fUL,
+ 0x0d9785d6UL, 0x0c55efe1UL, 0x091af964UL, 0x08d89353UL, 0x0a9e2d0aUL,
+ 0x0b5c473dUL, 0x1c26a370UL, 0x1de4c947UL, 0x1fa2771eUL, 0x1e601d29UL,
+ 0x1b2f0bacUL, 0x1aed619bUL, 0x18abdfc2UL, 0x1969b5f5UL, 0x1235f2c8UL,
+ 0x13f798ffUL, 0x11b126a6UL, 0x10734c91UL, 0x153c5a14UL, 0x14fe3023UL,
+ 0x16b88e7aUL, 0x177ae44dUL, 0x384d46e0UL, 0x398f2cd7UL, 0x3bc9928eUL,
+ 0x3a0bf8b9UL, 0x3f44ee3cUL, 0x3e86840bUL, 0x3cc03a52UL, 0x3d025065UL,
+ 0x365e1758UL, 0x379c7d6fUL, 0x35dac336UL, 0x3418a901UL, 0x3157bf84UL,
+ 0x3095d5b3UL, 0x32d36beaUL, 0x331101ddUL, 0x246be590UL, 0x25a98fa7UL,
+ 0x27ef31feUL, 0x262d5bc9UL, 0x23624d4cUL, 0x22a0277bUL, 0x20e69922UL,
+ 0x2124f315UL, 0x2a78b428UL, 0x2bbade1fUL, 0x29fc6046UL, 0x283e0a71UL,
+ 0x2d711cf4UL, 0x2cb376c3UL, 0x2ef5c89aUL, 0x2f37a2adUL, 0x709a8dc0UL,
+ 0x7158e7f7UL, 0x731e59aeUL, 0x72dc3399UL, 0x7793251cUL, 0x76514f2bUL,
+ 0x7417f172UL, 0x75d59b45UL, 0x7e89dc78UL, 0x7f4bb64fUL, 0x7d0d0816UL,
+ 0x7ccf6221UL, 0x798074a4UL, 0x78421e93UL, 0x7a04a0caUL, 0x7bc6cafdUL,
+ 0x6cbc2eb0UL, 0x6d7e4487UL, 0x6f38fadeUL, 0x6efa90e9UL, 0x6bb5866cUL,
+ 0x6a77ec5bUL, 0x68315202UL, 0x69f33835UL, 0x62af7f08UL, 0x636d153fUL,
+ 0x612bab66UL, 0x60e9c151UL, 0x65a6d7d4UL, 0x6464bde3UL, 0x662203baUL,
+ 0x67e0698dUL, 0x48d7cb20UL, 0x4915a117UL, 0x4b531f4eUL, 0x4a917579UL,
+ 0x4fde63fcUL, 0x4e1c09cbUL, 0x4c5ab792UL, 0x4d98dda5UL, 0x46c49a98UL,
+ 0x4706f0afUL, 0x45404ef6UL, 0x448224c1UL, 0x41cd3244UL, 0x400f5873UL,
+ 0x4249e62aUL, 0x438b8c1dUL, 0x54f16850UL, 0x55330267UL, 0x5775bc3eUL,
+ 0x56b7d609UL, 0x53f8c08cUL, 0x523aaabbUL, 0x507c14e2UL, 0x51be7ed5UL,
+ 0x5ae239e8UL, 0x5b2053dfUL, 0x5966ed86UL, 0x58a487b1UL, 0x5deb9134UL,
+ 0x5c29fb03UL, 0x5e6f455aUL, 0x5fad2f6dUL, 0xe1351b80UL, 0xe0f771b7UL,
+ 0xe2b1cfeeUL, 0xe373a5d9UL, 0xe63cb35cUL, 0xe7fed96bUL, 0xe5b86732UL,
+ 0xe47a0d05UL, 0xef264a38UL, 0xeee4200fUL, 0xeca29e56UL, 0xed60f461UL,
+ 0xe82fe2e4UL, 0xe9ed88d3UL, 0xebab368aUL, 0xea695cbdUL, 0xfd13b8f0UL,
+ 0xfcd1d2c7UL, 0xfe976c9eUL, 0xff5506a9UL, 0xfa1a102cUL, 0xfbd87a1bUL,
+ 0xf99ec442UL, 0xf85cae75UL, 0xf300e948UL, 0xf2c2837fUL, 0xf0843d26UL,
+ 0xf1465711UL, 0xf4094194UL, 0xf5cb2ba3UL, 0xf78d95faUL, 0xf64fffcdUL,
+ 0xd9785d60UL, 0xd8ba3757UL, 0xdafc890eUL, 0xdb3ee339UL, 0xde71f5bcUL,
+ 0xdfb39f8bUL, 0xddf521d2UL, 0xdc374be5UL, 0xd76b0cd8UL, 0xd6a966efUL,
+ 0xd4efd8b6UL, 0xd52db281UL, 0xd062a404UL, 0xd1a0ce33UL, 0xd3e6706aUL,
+ 0xd2241a5dUL, 0xc55efe10UL, 0xc49c9427UL, 0xc6da2a7eUL, 0xc7184049UL,
+ 0xc25756ccUL, 0xc3953cfbUL, 0xc1d382a2UL, 0xc011e895UL, 0xcb4dafa8UL,
+ 0xca8fc59fUL, 0xc8c97bc6UL, 0xc90b11f1UL, 0xcc440774UL, 0xcd866d43UL,
+ 0xcfc0d31aUL, 0xce02b92dUL, 0x91af9640UL, 0x906dfc77UL, 0x922b422eUL,
+ 0x93e92819UL, 0x96a63e9cUL, 0x976454abUL, 0x9522eaf2UL, 0x94e080c5UL,
+ 0x9fbcc7f8UL, 0x9e7eadcfUL, 0x9c381396UL, 0x9dfa79a1UL, 0x98b56f24UL,
+ 0x99770513UL, 0x9b31bb4aUL, 0x9af3d17dUL, 0x8d893530UL, 0x8c4b5f07UL,
+ 0x8e0de15eUL, 0x8fcf8b69UL, 0x8a809decUL, 0x8b42f7dbUL, 0x89044982UL,
+ 0x88c623b5UL, 0x839a6488UL, 0x82580ebfUL, 0x801eb0e6UL, 0x81dcdad1UL,
+ 0x8493cc54UL, 0x8551a663UL, 0x8717183aUL, 0x86d5720dUL, 0xa9e2d0a0UL,
+ 0xa820ba97UL, 0xaa6604ceUL, 0xaba46ef9UL, 0xaeeb787cUL, 0xaf29124bUL,
+ 0xad6fac12UL, 0xacadc625UL, 0xa7f18118UL, 0xa633eb2fUL, 0xa4755576UL,
+ 0xa5b73f41UL, 0xa0f829c4UL, 0xa13a43f3UL, 0xa37cfdaaUL, 0xa2be979dUL,
+ 0xb5c473d0UL, 0xb40619e7UL, 0xb640a7beUL, 0xb782cd89UL, 0xb2cddb0cUL,
+ 0xb30fb13bUL, 0xb1490f62UL, 0xb08b6555UL, 0xbbd72268UL, 0xba15485fUL,
+ 0xb853f606UL, 0xb9919c31UL, 0xbcde8ab4UL, 0xbd1ce083UL, 0xbf5a5edaUL,
+ 0xbe9834edUL
+ },
+ {
+ 0x00000000UL, 0xb8bc6765UL, 0xaa09c88bUL, 0x12b5afeeUL, 0x8f629757UL,
+ 0x37def032UL, 0x256b5fdcUL, 0x9dd738b9UL, 0xc5b428efUL, 0x7d084f8aUL,
+ 0x6fbde064UL, 0xd7018701UL, 0x4ad6bfb8UL, 0xf26ad8ddUL, 0xe0df7733UL,
+ 0x58631056UL, 0x5019579fUL, 0xe8a530faUL, 0xfa109f14UL, 0x42acf871UL,
+ 0xdf7bc0c8UL, 0x67c7a7adUL, 0x75720843UL, 0xcdce6f26UL, 0x95ad7f70UL,
+ 0x2d111815UL, 0x3fa4b7fbUL, 0x8718d09eUL, 0x1acfe827UL, 0xa2738f42UL,
+ 0xb0c620acUL, 0x087a47c9UL, 0xa032af3eUL, 0x188ec85bUL, 0x0a3b67b5UL,
+ 0xb28700d0UL, 0x2f503869UL, 0x97ec5f0cUL, 0x8559f0e2UL, 0x3de59787UL,
+ 0x658687d1UL, 0xdd3ae0b4UL, 0xcf8f4f5aUL, 0x7733283fUL, 0xeae41086UL,
+ 0x525877e3UL, 0x40edd80dUL, 0xf851bf68UL, 0xf02bf8a1UL, 0x48979fc4UL,
+ 0x5a22302aUL, 0xe29e574fUL, 0x7f496ff6UL, 0xc7f50893UL, 0xd540a77dUL,
+ 0x6dfcc018UL, 0x359fd04eUL, 0x8d23b72bUL, 0x9f9618c5UL, 0x272a7fa0UL,
+ 0xbafd4719UL, 0x0241207cUL, 0x10f48f92UL, 0xa848e8f7UL, 0x9b14583dUL,
+ 0x23a83f58UL, 0x311d90b6UL, 0x89a1f7d3UL, 0x1476cf6aUL, 0xaccaa80fUL,
+ 0xbe7f07e1UL, 0x06c36084UL, 0x5ea070d2UL, 0xe61c17b7UL, 0xf4a9b859UL,
+ 0x4c15df3cUL, 0xd1c2e785UL, 0x697e80e0UL, 0x7bcb2f0eUL, 0xc377486bUL,
+ 0xcb0d0fa2UL, 0x73b168c7UL, 0x6104c729UL, 0xd9b8a04cUL, 0x446f98f5UL,
+ 0xfcd3ff90UL, 0xee66507eUL, 0x56da371bUL, 0x0eb9274dUL, 0xb6054028UL,
+ 0xa4b0efc6UL, 0x1c0c88a3UL, 0x81dbb01aUL, 0x3967d77fUL, 0x2bd27891UL,
+ 0x936e1ff4UL, 0x3b26f703UL, 0x839a9066UL, 0x912f3f88UL, 0x299358edUL,
+ 0xb4446054UL, 0x0cf80731UL, 0x1e4da8dfUL, 0xa6f1cfbaUL, 0xfe92dfecUL,
+ 0x462eb889UL, 0x549b1767UL, 0xec277002UL, 0x71f048bbUL, 0xc94c2fdeUL,
+ 0xdbf98030UL, 0x6345e755UL, 0x6b3fa09cUL, 0xd383c7f9UL, 0xc1366817UL,
+ 0x798a0f72UL, 0xe45d37cbUL, 0x5ce150aeUL, 0x4e54ff40UL, 0xf6e89825UL,
+ 0xae8b8873UL, 0x1637ef16UL, 0x048240f8UL, 0xbc3e279dUL, 0x21e91f24UL,
+ 0x99557841UL, 0x8be0d7afUL, 0x335cb0caUL, 0xed59b63bUL, 0x55e5d15eUL,
+ 0x47507eb0UL, 0xffec19d5UL, 0x623b216cUL, 0xda874609UL, 0xc832e9e7UL,
+ 0x708e8e82UL, 0x28ed9ed4UL, 0x9051f9b1UL, 0x82e4565fUL, 0x3a58313aUL,
+ 0xa78f0983UL, 0x1f336ee6UL, 0x0d86c108UL, 0xb53aa66dUL, 0xbd40e1a4UL,
+ 0x05fc86c1UL, 0x1749292fUL, 0xaff54e4aUL, 0x322276f3UL, 0x8a9e1196UL,
+ 0x982bbe78UL, 0x2097d91dUL, 0x78f4c94bUL, 0xc048ae2eUL, 0xd2fd01c0UL,
+ 0x6a4166a5UL, 0xf7965e1cUL, 0x4f2a3979UL, 0x5d9f9697UL, 0xe523f1f2UL,
+ 0x4d6b1905UL, 0xf5d77e60UL, 0xe762d18eUL, 0x5fdeb6ebUL, 0xc2098e52UL,
+ 0x7ab5e937UL, 0x680046d9UL, 0xd0bc21bcUL, 0x88df31eaUL, 0x3063568fUL,
+ 0x22d6f961UL, 0x9a6a9e04UL, 0x07bda6bdUL, 0xbf01c1d8UL, 0xadb46e36UL,
+ 0x15080953UL, 0x1d724e9aUL, 0xa5ce29ffUL, 0xb77b8611UL, 0x0fc7e174UL,
+ 0x9210d9cdUL, 0x2aacbea8UL, 0x38191146UL, 0x80a57623UL, 0xd8c66675UL,
+ 0x607a0110UL, 0x72cfaefeUL, 0xca73c99bUL, 0x57a4f122UL, 0xef189647UL,
+ 0xfdad39a9UL, 0x45115eccUL, 0x764dee06UL, 0xcef18963UL, 0xdc44268dUL,
+ 0x64f841e8UL, 0xf92f7951UL, 0x41931e34UL, 0x5326b1daUL, 0xeb9ad6bfUL,
+ 0xb3f9c6e9UL, 0x0b45a18cUL, 0x19f00e62UL, 0xa14c6907UL, 0x3c9b51beUL,
+ 0x842736dbUL, 0x96929935UL, 0x2e2efe50UL, 0x2654b999UL, 0x9ee8defcUL,
+ 0x8c5d7112UL, 0x34e11677UL, 0xa9362eceUL, 0x118a49abUL, 0x033fe645UL,
+ 0xbb838120UL, 0xe3e09176UL, 0x5b5cf613UL, 0x49e959fdUL, 0xf1553e98UL,
+ 0x6c820621UL, 0xd43e6144UL, 0xc68bceaaUL, 0x7e37a9cfUL, 0xd67f4138UL,
+ 0x6ec3265dUL, 0x7c7689b3UL, 0xc4caeed6UL, 0x591dd66fUL, 0xe1a1b10aUL,
+ 0xf3141ee4UL, 0x4ba87981UL, 0x13cb69d7UL, 0xab770eb2UL, 0xb9c2a15cUL,
+ 0x017ec639UL, 0x9ca9fe80UL, 0x241599e5UL, 0x36a0360bUL, 0x8e1c516eUL,
+ 0x866616a7UL, 0x3eda71c2UL, 0x2c6fde2cUL, 0x94d3b949UL, 0x090481f0UL,
+ 0xb1b8e695UL, 0xa30d497bUL, 0x1bb12e1eUL, 0x43d23e48UL, 0xfb6e592dUL,
+ 0xe9dbf6c3UL, 0x516791a6UL, 0xccb0a91fUL, 0x740cce7aUL, 0x66b96194UL,
+ 0xde0506f1UL
+ },
+ {
+ 0x00000000UL, 0x96300777UL, 0x2c610eeeUL, 0xba510999UL, 0x19c46d07UL,
+ 0x8ff46a70UL, 0x35a563e9UL, 0xa395649eUL, 0x3288db0eUL, 0xa4b8dc79UL,
+ 0x1ee9d5e0UL, 0x88d9d297UL, 0x2b4cb609UL, 0xbd7cb17eUL, 0x072db8e7UL,
+ 0x911dbf90UL, 0x6410b71dUL, 0xf220b06aUL, 0x4871b9f3UL, 0xde41be84UL,
+ 0x7dd4da1aUL, 0xebe4dd6dUL, 0x51b5d4f4UL, 0xc785d383UL, 0x56986c13UL,
+ 0xc0a86b64UL, 0x7af962fdUL, 0xecc9658aUL, 0x4f5c0114UL, 0xd96c0663UL,
+ 0x633d0ffaUL, 0xf50d088dUL, 0xc8206e3bUL, 0x5e10694cUL, 0xe44160d5UL,
+ 0x727167a2UL, 0xd1e4033cUL, 0x47d4044bUL, 0xfd850dd2UL, 0x6bb50aa5UL,
+ 0xfaa8b535UL, 0x6c98b242UL, 0xd6c9bbdbUL, 0x40f9bcacUL, 0xe36cd832UL,
+ 0x755cdf45UL, 0xcf0dd6dcUL, 0x593dd1abUL, 0xac30d926UL, 0x3a00de51UL,
+ 0x8051d7c8UL, 0x1661d0bfUL, 0xb5f4b421UL, 0x23c4b356UL, 0x9995bacfUL,
+ 0x0fa5bdb8UL, 0x9eb80228UL, 0x0888055fUL, 0xb2d90cc6UL, 0x24e90bb1UL,
+ 0x877c6f2fUL, 0x114c6858UL, 0xab1d61c1UL, 0x3d2d66b6UL, 0x9041dc76UL,
+ 0x0671db01UL, 0xbc20d298UL, 0x2a10d5efUL, 0x8985b171UL, 0x1fb5b606UL,
+ 0xa5e4bf9fUL, 0x33d4b8e8UL, 0xa2c90778UL, 0x34f9000fUL, 0x8ea80996UL,
+ 0x18980ee1UL, 0xbb0d6a7fUL, 0x2d3d6d08UL, 0x976c6491UL, 0x015c63e6UL,
+ 0xf4516b6bUL, 0x62616c1cUL, 0xd8306585UL, 0x4e0062f2UL, 0xed95066cUL,
+ 0x7ba5011bUL, 0xc1f40882UL, 0x57c40ff5UL, 0xc6d9b065UL, 0x50e9b712UL,
+ 0xeab8be8bUL, 0x7c88b9fcUL, 0xdf1ddd62UL, 0x492dda15UL, 0xf37cd38cUL,
+ 0x654cd4fbUL, 0x5861b24dUL, 0xce51b53aUL, 0x7400bca3UL, 0xe230bbd4UL,
+ 0x41a5df4aUL, 0xd795d83dUL, 0x6dc4d1a4UL, 0xfbf4d6d3UL, 0x6ae96943UL,
+ 0xfcd96e34UL, 0x468867adUL, 0xd0b860daUL, 0x732d0444UL, 0xe51d0333UL,
+ 0x5f4c0aaaUL, 0xc97c0dddUL, 0x3c710550UL, 0xaa410227UL, 0x10100bbeUL,
+ 0x86200cc9UL, 0x25b56857UL, 0xb3856f20UL, 0x09d466b9UL, 0x9fe461ceUL,
+ 0x0ef9de5eUL, 0x98c9d929UL, 0x2298d0b0UL, 0xb4a8d7c7UL, 0x173db359UL,
+ 0x810db42eUL, 0x3b5cbdb7UL, 0xad6cbac0UL, 0x2083b8edUL, 0xb6b3bf9aUL,
+ 0x0ce2b603UL, 0x9ad2b174UL, 0x3947d5eaUL, 0xaf77d29dUL, 0x1526db04UL,
+ 0x8316dc73UL, 0x120b63e3UL, 0x843b6494UL, 0x3e6a6d0dUL, 0xa85a6a7aUL,
+ 0x0bcf0ee4UL, 0x9dff0993UL, 0x27ae000aUL, 0xb19e077dUL, 0x44930ff0UL,
+ 0xd2a30887UL, 0x68f2011eUL, 0xfec20669UL, 0x5d5762f7UL, 0xcb676580UL,
+ 0x71366c19UL, 0xe7066b6eUL, 0x761bd4feUL, 0xe02bd389UL, 0x5a7ada10UL,
+ 0xcc4add67UL, 0x6fdfb9f9UL, 0xf9efbe8eUL, 0x43beb717UL, 0xd58eb060UL,
+ 0xe8a3d6d6UL, 0x7e93d1a1UL, 0xc4c2d838UL, 0x52f2df4fUL, 0xf167bbd1UL,
+ 0x6757bca6UL, 0xdd06b53fUL, 0x4b36b248UL, 0xda2b0dd8UL, 0x4c1b0aafUL,
+ 0xf64a0336UL, 0x607a0441UL, 0xc3ef60dfUL, 0x55df67a8UL, 0xef8e6e31UL,
+ 0x79be6946UL, 0x8cb361cbUL, 0x1a8366bcUL, 0xa0d26f25UL, 0x36e26852UL,
+ 0x95770cccUL, 0x03470bbbUL, 0xb9160222UL, 0x2f260555UL, 0xbe3bbac5UL,
+ 0x280bbdb2UL, 0x925ab42bUL, 0x046ab35cUL, 0xa7ffd7c2UL, 0x31cfd0b5UL,
+ 0x8b9ed92cUL, 0x1daede5bUL, 0xb0c2649bUL, 0x26f263ecUL, 0x9ca36a75UL,
+ 0x0a936d02UL, 0xa906099cUL, 0x3f360eebUL, 0x85670772UL, 0x13570005UL,
+ 0x824abf95UL, 0x147ab8e2UL, 0xae2bb17bUL, 0x381bb60cUL, 0x9b8ed292UL,
+ 0x0dbed5e5UL, 0xb7efdc7cUL, 0x21dfdb0bUL, 0xd4d2d386UL, 0x42e2d4f1UL,
+ 0xf8b3dd68UL, 0x6e83da1fUL, 0xcd16be81UL, 0x5b26b9f6UL, 0xe177b06fUL,
+ 0x7747b718UL, 0xe65a0888UL, 0x706a0fffUL, 0xca3b0666UL, 0x5c0b0111UL,
+ 0xff9e658fUL, 0x69ae62f8UL, 0xd3ff6b61UL, 0x45cf6c16UL, 0x78e20aa0UL,
+ 0xeed20dd7UL, 0x5483044eUL, 0xc2b30339UL, 0x612667a7UL, 0xf71660d0UL,
+ 0x4d476949UL, 0xdb776e3eUL, 0x4a6ad1aeUL, 0xdc5ad6d9UL, 0x660bdf40UL,
+ 0xf03bd837UL, 0x53aebca9UL, 0xc59ebbdeUL, 0x7fcfb247UL, 0xe9ffb530UL,
+ 0x1cf2bdbdUL, 0x8ac2bacaUL, 0x3093b353UL, 0xa6a3b424UL, 0x0536d0baUL,
+ 0x9306d7cdUL, 0x2957de54UL, 0xbf67d923UL, 0x2e7a66b3UL, 0xb84a61c4UL,
+ 0x021b685dUL, 0x942b6f2aUL, 0x37be0bb4UL, 0xa18e0cc3UL, 0x1bdf055aUL,
+ 0x8def022dUL
+ },
+ {
+ 0x00000000UL, 0x41311b19UL, 0x82623632UL, 0xc3532d2bUL, 0x04c56c64UL,
+ 0x45f4777dUL, 0x86a75a56UL, 0xc796414fUL, 0x088ad9c8UL, 0x49bbc2d1UL,
+ 0x8ae8effaUL, 0xcbd9f4e3UL, 0x0c4fb5acUL, 0x4d7eaeb5UL, 0x8e2d839eUL,
+ 0xcf1c9887UL, 0x5112c24aUL, 0x1023d953UL, 0xd370f478UL, 0x9241ef61UL,
+ 0x55d7ae2eUL, 0x14e6b537UL, 0xd7b5981cUL, 0x96848305UL, 0x59981b82UL,
+ 0x18a9009bUL, 0xdbfa2db0UL, 0x9acb36a9UL, 0x5d5d77e6UL, 0x1c6c6cffUL,
+ 0xdf3f41d4UL, 0x9e0e5acdUL, 0xa2248495UL, 0xe3159f8cUL, 0x2046b2a7UL,
+ 0x6177a9beUL, 0xa6e1e8f1UL, 0xe7d0f3e8UL, 0x2483dec3UL, 0x65b2c5daUL,
+ 0xaaae5d5dUL, 0xeb9f4644UL, 0x28cc6b6fUL, 0x69fd7076UL, 0xae6b3139UL,
+ 0xef5a2a20UL, 0x2c09070bUL, 0x6d381c12UL, 0xf33646dfUL, 0xb2075dc6UL,
+ 0x715470edUL, 0x30656bf4UL, 0xf7f32abbUL, 0xb6c231a2UL, 0x75911c89UL,
+ 0x34a00790UL, 0xfbbc9f17UL, 0xba8d840eUL, 0x79dea925UL, 0x38efb23cUL,
+ 0xff79f373UL, 0xbe48e86aUL, 0x7d1bc541UL, 0x3c2ade58UL, 0x054f79f0UL,
+ 0x447e62e9UL, 0x872d4fc2UL, 0xc61c54dbUL, 0x018a1594UL, 0x40bb0e8dUL,
+ 0x83e823a6UL, 0xc2d938bfUL, 0x0dc5a038UL, 0x4cf4bb21UL, 0x8fa7960aUL,
+ 0xce968d13UL, 0x0900cc5cUL, 0x4831d745UL, 0x8b62fa6eUL, 0xca53e177UL,
+ 0x545dbbbaUL, 0x156ca0a3UL, 0xd63f8d88UL, 0x970e9691UL, 0x5098d7deUL,
+ 0x11a9ccc7UL, 0xd2fae1ecUL, 0x93cbfaf5UL, 0x5cd76272UL, 0x1de6796bUL,
+ 0xdeb55440UL, 0x9f844f59UL, 0x58120e16UL, 0x1923150fUL, 0xda703824UL,
+ 0x9b41233dUL, 0xa76bfd65UL, 0xe65ae67cUL, 0x2509cb57UL, 0x6438d04eUL,
+ 0xa3ae9101UL, 0xe29f8a18UL, 0x21cca733UL, 0x60fdbc2aUL, 0xafe124adUL,
+ 0xeed03fb4UL, 0x2d83129fUL, 0x6cb20986UL, 0xab2448c9UL, 0xea1553d0UL,
+ 0x29467efbUL, 0x687765e2UL, 0xf6793f2fUL, 0xb7482436UL, 0x741b091dUL,
+ 0x352a1204UL, 0xf2bc534bUL, 0xb38d4852UL, 0x70de6579UL, 0x31ef7e60UL,
+ 0xfef3e6e7UL, 0xbfc2fdfeUL, 0x7c91d0d5UL, 0x3da0cbccUL, 0xfa368a83UL,
+ 0xbb07919aUL, 0x7854bcb1UL, 0x3965a7a8UL, 0x4b98833bUL, 0x0aa99822UL,
+ 0xc9fab509UL, 0x88cbae10UL, 0x4f5def5fUL, 0x0e6cf446UL, 0xcd3fd96dUL,
+ 0x8c0ec274UL, 0x43125af3UL, 0x022341eaUL, 0xc1706cc1UL, 0x804177d8UL,
+ 0x47d73697UL, 0x06e62d8eUL, 0xc5b500a5UL, 0x84841bbcUL, 0x1a8a4171UL,
+ 0x5bbb5a68UL, 0x98e87743UL, 0xd9d96c5aUL, 0x1e4f2d15UL, 0x5f7e360cUL,
+ 0x9c2d1b27UL, 0xdd1c003eUL, 0x120098b9UL, 0x533183a0UL, 0x9062ae8bUL,
+ 0xd153b592UL, 0x16c5f4ddUL, 0x57f4efc4UL, 0x94a7c2efUL, 0xd596d9f6UL,
+ 0xe9bc07aeUL, 0xa88d1cb7UL, 0x6bde319cUL, 0x2aef2a85UL, 0xed796bcaUL,
+ 0xac4870d3UL, 0x6f1b5df8UL, 0x2e2a46e1UL, 0xe136de66UL, 0xa007c57fUL,
+ 0x6354e854UL, 0x2265f34dUL, 0xe5f3b202UL, 0xa4c2a91bUL, 0x67918430UL,
+ 0x26a09f29UL, 0xb8aec5e4UL, 0xf99fdefdUL, 0x3accf3d6UL, 0x7bfde8cfUL,
+ 0xbc6ba980UL, 0xfd5ab299UL, 0x3e099fb2UL, 0x7f3884abUL, 0xb0241c2cUL,
+ 0xf1150735UL, 0x32462a1eUL, 0x73773107UL, 0xb4e17048UL, 0xf5d06b51UL,
+ 0x3683467aUL, 0x77b25d63UL, 0x4ed7facbUL, 0x0fe6e1d2UL, 0xccb5ccf9UL,
+ 0x8d84d7e0UL, 0x4a1296afUL, 0x0b238db6UL, 0xc870a09dUL, 0x8941bb84UL,
+ 0x465d2303UL, 0x076c381aUL, 0xc43f1531UL, 0x850e0e28UL, 0x42984f67UL,
+ 0x03a9547eUL, 0xc0fa7955UL, 0x81cb624cUL, 0x1fc53881UL, 0x5ef42398UL,
+ 0x9da70eb3UL, 0xdc9615aaUL, 0x1b0054e5UL, 0x5a314ffcUL, 0x996262d7UL,
+ 0xd85379ceUL, 0x174fe149UL, 0x567efa50UL, 0x952dd77bUL, 0xd41ccc62UL,
+ 0x138a8d2dUL, 0x52bb9634UL, 0x91e8bb1fUL, 0xd0d9a006UL, 0xecf37e5eUL,
+ 0xadc26547UL, 0x6e91486cUL, 0x2fa05375UL, 0xe836123aUL, 0xa9070923UL,
+ 0x6a542408UL, 0x2b653f11UL, 0xe479a796UL, 0xa548bc8fUL, 0x661b91a4UL,
+ 0x272a8abdUL, 0xe0bccbf2UL, 0xa18dd0ebUL, 0x62defdc0UL, 0x23efe6d9UL,
+ 0xbde1bc14UL, 0xfcd0a70dUL, 0x3f838a26UL, 0x7eb2913fUL, 0xb924d070UL,
+ 0xf815cb69UL, 0x3b46e642UL, 0x7a77fd5bUL, 0xb56b65dcUL, 0xf45a7ec5UL,
+ 0x370953eeUL, 0x763848f7UL, 0xb1ae09b8UL, 0xf09f12a1UL, 0x33cc3f8aUL,
+ 0x72fd2493UL
+ },
+ {
+ 0x00000000UL, 0x376ac201UL, 0x6ed48403UL, 0x59be4602UL, 0xdca80907UL,
+ 0xebc2cb06UL, 0xb27c8d04UL, 0x85164f05UL, 0xb851130eUL, 0x8f3bd10fUL,
+ 0xd685970dUL, 0xe1ef550cUL, 0x64f91a09UL, 0x5393d808UL, 0x0a2d9e0aUL,
+ 0x3d475c0bUL, 0x70a3261cUL, 0x47c9e41dUL, 0x1e77a21fUL, 0x291d601eUL,
+ 0xac0b2f1bUL, 0x9b61ed1aUL, 0xc2dfab18UL, 0xf5b56919UL, 0xc8f23512UL,
+ 0xff98f713UL, 0xa626b111UL, 0x914c7310UL, 0x145a3c15UL, 0x2330fe14UL,
+ 0x7a8eb816UL, 0x4de47a17UL, 0xe0464d38UL, 0xd72c8f39UL, 0x8e92c93bUL,
+ 0xb9f80b3aUL, 0x3cee443fUL, 0x0b84863eUL, 0x523ac03cUL, 0x6550023dUL,
+ 0x58175e36UL, 0x6f7d9c37UL, 0x36c3da35UL, 0x01a91834UL, 0x84bf5731UL,
+ 0xb3d59530UL, 0xea6bd332UL, 0xdd011133UL, 0x90e56b24UL, 0xa78fa925UL,
+ 0xfe31ef27UL, 0xc95b2d26UL, 0x4c4d6223UL, 0x7b27a022UL, 0x2299e620UL,
+ 0x15f32421UL, 0x28b4782aUL, 0x1fdeba2bUL, 0x4660fc29UL, 0x710a3e28UL,
+ 0xf41c712dUL, 0xc376b32cUL, 0x9ac8f52eUL, 0xada2372fUL, 0xc08d9a70UL,
+ 0xf7e75871UL, 0xae591e73UL, 0x9933dc72UL, 0x1c259377UL, 0x2b4f5176UL,
+ 0x72f11774UL, 0x459bd575UL, 0x78dc897eUL, 0x4fb64b7fUL, 0x16080d7dUL,
+ 0x2162cf7cUL, 0xa4748079UL, 0x931e4278UL, 0xcaa0047aUL, 0xfdcac67bUL,
+ 0xb02ebc6cUL, 0x87447e6dUL, 0xdefa386fUL, 0xe990fa6eUL, 0x6c86b56bUL,
+ 0x5bec776aUL, 0x02523168UL, 0x3538f369UL, 0x087faf62UL, 0x3f156d63UL,
+ 0x66ab2b61UL, 0x51c1e960UL, 0xd4d7a665UL, 0xe3bd6464UL, 0xba032266UL,
+ 0x8d69e067UL, 0x20cbd748UL, 0x17a11549UL, 0x4e1f534bUL, 0x7975914aUL,
+ 0xfc63de4fUL, 0xcb091c4eUL, 0x92b75a4cUL, 0xa5dd984dUL, 0x989ac446UL,
+ 0xaff00647UL, 0xf64e4045UL, 0xc1248244UL, 0x4432cd41UL, 0x73580f40UL,
+ 0x2ae64942UL, 0x1d8c8b43UL, 0x5068f154UL, 0x67023355UL, 0x3ebc7557UL,
+ 0x09d6b756UL, 0x8cc0f853UL, 0xbbaa3a52UL, 0xe2147c50UL, 0xd57ebe51UL,
+ 0xe839e25aUL, 0xdf53205bUL, 0x86ed6659UL, 0xb187a458UL, 0x3491eb5dUL,
+ 0x03fb295cUL, 0x5a456f5eUL, 0x6d2fad5fUL, 0x801b35e1UL, 0xb771f7e0UL,
+ 0xeecfb1e2UL, 0xd9a573e3UL, 0x5cb33ce6UL, 0x6bd9fee7UL, 0x3267b8e5UL,
+ 0x050d7ae4UL, 0x384a26efUL, 0x0f20e4eeUL, 0x569ea2ecUL, 0x61f460edUL,
+ 0xe4e22fe8UL, 0xd388ede9UL, 0x8a36abebUL, 0xbd5c69eaUL, 0xf0b813fdUL,
+ 0xc7d2d1fcUL, 0x9e6c97feUL, 0xa90655ffUL, 0x2c101afaUL, 0x1b7ad8fbUL,
+ 0x42c49ef9UL, 0x75ae5cf8UL, 0x48e900f3UL, 0x7f83c2f2UL, 0x263d84f0UL,
+ 0x115746f1UL, 0x944109f4UL, 0xa32bcbf5UL, 0xfa958df7UL, 0xcdff4ff6UL,
+ 0x605d78d9UL, 0x5737bad8UL, 0x0e89fcdaUL, 0x39e33edbUL, 0xbcf571deUL,
+ 0x8b9fb3dfUL, 0xd221f5ddUL, 0xe54b37dcUL, 0xd80c6bd7UL, 0xef66a9d6UL,
+ 0xb6d8efd4UL, 0x81b22dd5UL, 0x04a462d0UL, 0x33cea0d1UL, 0x6a70e6d3UL,
+ 0x5d1a24d2UL, 0x10fe5ec5UL, 0x27949cc4UL, 0x7e2adac6UL, 0x494018c7UL,
+ 0xcc5657c2UL, 0xfb3c95c3UL, 0xa282d3c1UL, 0x95e811c0UL, 0xa8af4dcbUL,
+ 0x9fc58fcaUL, 0xc67bc9c8UL, 0xf1110bc9UL, 0x740744ccUL, 0x436d86cdUL,
+ 0x1ad3c0cfUL, 0x2db902ceUL, 0x4096af91UL, 0x77fc6d90UL, 0x2e422b92UL,
+ 0x1928e993UL, 0x9c3ea696UL, 0xab546497UL, 0xf2ea2295UL, 0xc580e094UL,
+ 0xf8c7bc9fUL, 0xcfad7e9eUL, 0x9613389cUL, 0xa179fa9dUL, 0x246fb598UL,
+ 0x13057799UL, 0x4abb319bUL, 0x7dd1f39aUL, 0x3035898dUL, 0x075f4b8cUL,
+ 0x5ee10d8eUL, 0x698bcf8fUL, 0xec9d808aUL, 0xdbf7428bUL, 0x82490489UL,
+ 0xb523c688UL, 0x88649a83UL, 0xbf0e5882UL, 0xe6b01e80UL, 0xd1dadc81UL,
+ 0x54cc9384UL, 0x63a65185UL, 0x3a181787UL, 0x0d72d586UL, 0xa0d0e2a9UL,
+ 0x97ba20a8UL, 0xce0466aaUL, 0xf96ea4abUL, 0x7c78ebaeUL, 0x4b1229afUL,
+ 0x12ac6fadUL, 0x25c6adacUL, 0x1881f1a7UL, 0x2feb33a6UL, 0x765575a4UL,
+ 0x413fb7a5UL, 0xc429f8a0UL, 0xf3433aa1UL, 0xaafd7ca3UL, 0x9d97bea2UL,
+ 0xd073c4b5UL, 0xe71906b4UL, 0xbea740b6UL, 0x89cd82b7UL, 0x0cdbcdb2UL,
+ 0x3bb10fb3UL, 0x620f49b1UL, 0x55658bb0UL, 0x6822d7bbUL, 0x5f4815baUL,
+ 0x06f653b8UL, 0x319c91b9UL, 0xb48adebcUL, 0x83e01cbdUL, 0xda5e5abfUL,
+ 0xed3498beUL
+ },
+ {
+ 0x00000000UL, 0x6567bcb8UL, 0x8bc809aaUL, 0xeeafb512UL, 0x5797628fUL,
+ 0x32f0de37UL, 0xdc5f6b25UL, 0xb938d79dUL, 0xef28b4c5UL, 0x8a4f087dUL,
+ 0x64e0bd6fUL, 0x018701d7UL, 0xb8bfd64aUL, 0xddd86af2UL, 0x3377dfe0UL,
+ 0x56106358UL, 0x9f571950UL, 0xfa30a5e8UL, 0x149f10faUL, 0x71f8ac42UL,
+ 0xc8c07bdfUL, 0xada7c767UL, 0x43087275UL, 0x266fcecdUL, 0x707fad95UL,
+ 0x1518112dUL, 0xfbb7a43fUL, 0x9ed01887UL, 0x27e8cf1aUL, 0x428f73a2UL,
+ 0xac20c6b0UL, 0xc9477a08UL, 0x3eaf32a0UL, 0x5bc88e18UL, 0xb5673b0aUL,
+ 0xd00087b2UL, 0x6938502fUL, 0x0c5fec97UL, 0xe2f05985UL, 0x8797e53dUL,
+ 0xd1878665UL, 0xb4e03addUL, 0x5a4f8fcfUL, 0x3f283377UL, 0x8610e4eaUL,
+ 0xe3775852UL, 0x0dd8ed40UL, 0x68bf51f8UL, 0xa1f82bf0UL, 0xc49f9748UL,
+ 0x2a30225aUL, 0x4f579ee2UL, 0xf66f497fUL, 0x9308f5c7UL, 0x7da740d5UL,
+ 0x18c0fc6dUL, 0x4ed09f35UL, 0x2bb7238dUL, 0xc518969fUL, 0xa07f2a27UL,
+ 0x1947fdbaUL, 0x7c204102UL, 0x928ff410UL, 0xf7e848a8UL, 0x3d58149bUL,
+ 0x583fa823UL, 0xb6901d31UL, 0xd3f7a189UL, 0x6acf7614UL, 0x0fa8caacUL,
+ 0xe1077fbeUL, 0x8460c306UL, 0xd270a05eUL, 0xb7171ce6UL, 0x59b8a9f4UL,
+ 0x3cdf154cUL, 0x85e7c2d1UL, 0xe0807e69UL, 0x0e2fcb7bUL, 0x6b4877c3UL,
+ 0xa20f0dcbUL, 0xc768b173UL, 0x29c70461UL, 0x4ca0b8d9UL, 0xf5986f44UL,
+ 0x90ffd3fcUL, 0x7e5066eeUL, 0x1b37da56UL, 0x4d27b90eUL, 0x284005b6UL,
+ 0xc6efb0a4UL, 0xa3880c1cUL, 0x1ab0db81UL, 0x7fd76739UL, 0x9178d22bUL,
+ 0xf41f6e93UL, 0x03f7263bUL, 0x66909a83UL, 0x883f2f91UL, 0xed589329UL,
+ 0x546044b4UL, 0x3107f80cUL, 0xdfa84d1eUL, 0xbacff1a6UL, 0xecdf92feUL,
+ 0x89b82e46UL, 0x67179b54UL, 0x027027ecUL, 0xbb48f071UL, 0xde2f4cc9UL,
+ 0x3080f9dbUL, 0x55e74563UL, 0x9ca03f6bUL, 0xf9c783d3UL, 0x176836c1UL,
+ 0x720f8a79UL, 0xcb375de4UL, 0xae50e15cUL, 0x40ff544eUL, 0x2598e8f6UL,
+ 0x73888baeUL, 0x16ef3716UL, 0xf8408204UL, 0x9d273ebcUL, 0x241fe921UL,
+ 0x41785599UL, 0xafd7e08bUL, 0xcab05c33UL, 0x3bb659edUL, 0x5ed1e555UL,
+ 0xb07e5047UL, 0xd519ecffUL, 0x6c213b62UL, 0x094687daUL, 0xe7e932c8UL,
+ 0x828e8e70UL, 0xd49eed28UL, 0xb1f95190UL, 0x5f56e482UL, 0x3a31583aUL,
+ 0x83098fa7UL, 0xe66e331fUL, 0x08c1860dUL, 0x6da63ab5UL, 0xa4e140bdUL,
+ 0xc186fc05UL, 0x2f294917UL, 0x4a4ef5afUL, 0xf3762232UL, 0x96119e8aUL,
+ 0x78be2b98UL, 0x1dd99720UL, 0x4bc9f478UL, 0x2eae48c0UL, 0xc001fdd2UL,
+ 0xa566416aUL, 0x1c5e96f7UL, 0x79392a4fUL, 0x97969f5dUL, 0xf2f123e5UL,
+ 0x05196b4dUL, 0x607ed7f5UL, 0x8ed162e7UL, 0xebb6de5fUL, 0x528e09c2UL,
+ 0x37e9b57aUL, 0xd9460068UL, 0xbc21bcd0UL, 0xea31df88UL, 0x8f566330UL,
+ 0x61f9d622UL, 0x049e6a9aUL, 0xbda6bd07UL, 0xd8c101bfUL, 0x366eb4adUL,
+ 0x53090815UL, 0x9a4e721dUL, 0xff29cea5UL, 0x11867bb7UL, 0x74e1c70fUL,
+ 0xcdd91092UL, 0xa8beac2aUL, 0x46111938UL, 0x2376a580UL, 0x7566c6d8UL,
+ 0x10017a60UL, 0xfeaecf72UL, 0x9bc973caUL, 0x22f1a457UL, 0x479618efUL,
+ 0xa939adfdUL, 0xcc5e1145UL, 0x06ee4d76UL, 0x6389f1ceUL, 0x8d2644dcUL,
+ 0xe841f864UL, 0x51792ff9UL, 0x341e9341UL, 0xdab12653UL, 0xbfd69aebUL,
+ 0xe9c6f9b3UL, 0x8ca1450bUL, 0x620ef019UL, 0x07694ca1UL, 0xbe519b3cUL,
+ 0xdb362784UL, 0x35999296UL, 0x50fe2e2eUL, 0x99b95426UL, 0xfcdee89eUL,
+ 0x12715d8cUL, 0x7716e134UL, 0xce2e36a9UL, 0xab498a11UL, 0x45e63f03UL,
+ 0x208183bbUL, 0x7691e0e3UL, 0x13f65c5bUL, 0xfd59e949UL, 0x983e55f1UL,
+ 0x2106826cUL, 0x44613ed4UL, 0xaace8bc6UL, 0xcfa9377eUL, 0x38417fd6UL,
+ 0x5d26c36eUL, 0xb389767cUL, 0xd6eecac4UL, 0x6fd61d59UL, 0x0ab1a1e1UL,
+ 0xe41e14f3UL, 0x8179a84bUL, 0xd769cb13UL, 0xb20e77abUL, 0x5ca1c2b9UL,
+ 0x39c67e01UL, 0x80fea99cUL, 0xe5991524UL, 0x0b36a036UL, 0x6e511c8eUL,
+ 0xa7166686UL, 0xc271da3eUL, 0x2cde6f2cUL, 0x49b9d394UL, 0xf0810409UL,
+ 0x95e6b8b1UL, 0x7b490da3UL, 0x1e2eb11bUL, 0x483ed243UL, 0x2d596efbUL,
+ 0xc3f6dbe9UL, 0xa6916751UL, 0x1fa9b0ccUL, 0x7ace0c74UL, 0x9461b966UL,
+ 0xf10605deUL
+#endif
+ }
+};
ANNOUNCE-2.6.7
ANNOUNCE-2.6.8
ANNOUNCE-2.6.9
+ANNOUNCE-3.0-devel1
+ANNOUNCE-3.0-devel2
+ANNOUNCE-3.0-devel3
+ANNOUNCE-3.0-rc1
Assemble.c
bitmap.c
bitmap.h
ChangeLog
config.c
COPYING
+crc32.c
+crc32.h
Create.c
Detail.c
dlink.c
kernel-patch-2.6.18
kernel-patch-2.6.18.6
kernel-patch-2.6.19
+kernel-patch-2.6.25
+kernel-patch-2.6.27
Kill.c
makedist
Makefile
Manage.c
+managemon.c
mapfile.c
md.4
md5.h
mdadm.spec
mdassemble.8
mdassemble.c
+mdmon.8
+mdmon.c
+mdmon.h
mdopen.c
md_p.h
mdstat.c
misc/
misc/syslog-events
mkinitramfs
+monitor.c
Monitor.c
+msg.c
+msg.h
+platform-intel.c
+platform-intel.h
+probe_roms.c
+probe_roms.h
pwgr.c
Query.c
raid5extend.c
ReadMe.c
README.initramfs
restripe.c
+sg_io.c
sha1.c
sha1.h
super0.c
super1.c
+super-ddf.c
+super-intel.c
swap_super.c
sysfs.c
test
tests/00raid6
tests/01r1fail
tests/01r5fail
+tests/01r5integ
+tests/01raid6integ
tests/02lineargrow
tests/02r1add
tests/02r1grow
tests/07autodetect
tests/07reshape5intr
tests/07testreshape5
+tests/08imsm-overlap
+tests/09imsm-create-fail-rebuild
+tests/10ddf-create
tests/check
+tests/env-08imsm-overlap
+tests/env-09imsm-create-fail-rebuild
tests/testdev
tests/ToTest
TODO
+udev-md-raid.rules
util.c
--- /dev/null
+Status: ok
+
+Support adding a spare to a live md array with external metadata.
+
+i.e. extend the 'md/dev-XXX/slot' attribute so that you can
+tell a device to fill a vacant slot in an md array.
+
+
+Signed-off-by: Neil Brown <neilb@suse.de>
+
+### Diffstat output
+ ./drivers/md/md.c | 44 ++++++++++++++++++++++++++++++++++++++++----
+ ./drivers/md/multipath.c | 7 ++++++-
+ ./drivers/md/raid1.c | 7 ++++++-
+ ./drivers/md/raid10.c | 10 ++++++++--
+ ./drivers/md/raid5.c | 10 ++++++++--
+ 5 files changed, 68 insertions(+), 10 deletions(-)
+
+diff .prev/drivers/md/md.c ./drivers/md/md.c
+--- .prev/drivers/md/md.c 2008-06-05 09:19:56.000000000 +1000
++++ ./drivers/md/md.c 2008-06-10 10:41:21.000000000 +1000
+@@ -1932,7 +1932,7 @@ slot_store(mdk_rdev_t *rdev, const char
+ slot = -1;
+ else if (e==buf || (*e && *e!= '\n'))
+ return -EINVAL;
+- if (rdev->mddev->pers) {
++ if (rdev->mddev->pers && slot == -1) {
+ /* Setting 'slot' on an active array requires also
+ * updating the 'rd%d' link, and communicating
+ * with the personality with ->hot_*_disk.
+@@ -1940,8 +1940,6 @@ slot_store(mdk_rdev_t *rdev, const char
+ * failed/spare devices. This normally happens automatically,
+ * but not when the metadata is externally managed.
+ */
+- if (slot != -1)
+- return -EBUSY;
+ if (rdev->raid_disk == -1)
+ return -EEXIST;
+ /* personality does all needed checks */
+@@ -1955,6 +1953,44 @@ slot_store(mdk_rdev_t *rdev, const char
+ sysfs_remove_link(&rdev->mddev->kobj, nm);
+ set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
+ md_wakeup_thread(rdev->mddev->thread);
++ } else if (rdev->mddev->pers) {
++ mdk_rdev_t *rdev2;
++ struct list_head *tmp;
++ /* Activating a spare .. or possibly reactivating
++ * if we every get bitmaps working here.
++ */
++
++ if (rdev->raid_disk != -1)
++ return -EBUSY;
++
++ if (rdev->mddev->pers->hot_add_disk == NULL)
++ return -EINVAL;
++
++ rdev_for_each(rdev2, tmp, rdev->mddev)
++ if (rdev2->raid_disk == slot)
++ return -EEXIST;
++
++ rdev->raid_disk = slot;
++ if (test_bit(In_sync, &rdev->flags))
++ rdev->saved_raid_disk = slot;
++ else
++ rdev->saved_raid_disk = -1;
++ err = rdev->mddev->pers->
++ hot_add_disk(rdev->mddev, rdev);
++ if (err != 1) {
++ rdev->raid_disk = -1;
++ if (err == 0)
++ return -EEXIST;
++ return err;
++ }
++ sprintf(nm, "rd%d", rdev->raid_disk);
++ if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm))
++ printk(KERN_WARNING
++ "md: cannot register "
++ "%s for %s\n",
++ nm, mdname(rdev->mddev));
++
++ /* don't wakeup anyone, leave that to userspace. */
+ } else {
+ if (slot >= rdev->mddev->raid_disks)
+ return -ENOSPC;
+@@ -4205,7 +4241,7 @@ static int add_new_disk(mddev_t * mddev,
+ super_types[mddev->major_version].
+ validate_super(mddev, rdev);
+ err = mddev->pers->hot_add_disk(mddev, rdev);
+- if (err)
++ if (err < 0)
+ unbind_rdev_from_array(rdev);
+ }
+ if (err)
+
+diff .prev/drivers/md/multipath.c ./drivers/md/multipath.c
+--- .prev/drivers/md/multipath.c 2008-05-30 14:49:31.000000000 +1000
++++ ./drivers/md/multipath.c 2008-06-10 10:35:03.000000000 +1000
+@@ -284,10 +284,15 @@ static int multipath_add_disk(mddev_t *m
+ int found = 0;
+ int path;
+ struct multipath_info *p;
++ int first = 0;
++ int last = mddev->raid_disks - 1;
++
++ if (rdev->raid_disk >= 0)
++ first = last = rdev->raid_disk;
+
+ print_multipath_conf(conf);
+
+- for (path=0; path<mddev->raid_disks; path++)
++ for (path = first; path <= last; path++)
+ if ((p=conf->multipaths+path)->rdev == NULL) {
+ q = rdev->bdev->bd_disk->queue;
+ blk_queue_stack_limits(mddev->queue, q);
+
+diff .prev/drivers/md/raid10.c ./drivers/md/raid10.c
+--- .prev/drivers/md/raid10.c 2008-05-30 14:49:31.000000000 +1000
++++ ./drivers/md/raid10.c 2008-06-10 10:28:53.000000000 +1000
+@@ -1116,6 +1116,8 @@ static int raid10_add_disk(mddev_t *mdde
+ int found = 0;
+ int mirror;
+ mirror_info_t *p;
++ int first = 0;
++ int last = mddev->raid_disks - 1;
+
+ if (mddev->recovery_cp < MaxSector)
+ /* only hot-add to in-sync arrays, as recovery is
+@@ -1125,12 +1127,16 @@ static int raid10_add_disk(mddev_t *mdde
+ if (!enough(conf))
+ return 0;
+
++ if (rdev->raid_disk)
++ first = last = rdev->raid_disk;
++
+ if (rdev->saved_raid_disk >= 0 &&
++ rdev->saved_raid_disk >= first &&
+ conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
+ mirror = rdev->saved_raid_disk;
+ else
+- mirror = 0;
+- for ( ; mirror < mddev->raid_disks; mirror++)
++ mirror = first;
++ for ( ; mirror <= last ; mirror++)
+ if ( !(p=conf->mirrors+mirror)->rdev) {
+
+ blk_queue_stack_limits(mddev->queue,
+
+diff .prev/drivers/md/raid1.c ./drivers/md/raid1.c
+--- .prev/drivers/md/raid1.c 2008-05-30 14:49:31.000000000 +1000
++++ ./drivers/md/raid1.c 2008-06-10 10:41:00.000000000 +1000
+@@ -1103,8 +1103,13 @@ static int raid1_add_disk(mddev_t *mddev
+ int found = 0;
+ int mirror = 0;
+ mirror_info_t *p;
++ int first = 0;
++ int last = mddev->raid_disks - 1;
+
+- for (mirror=0; mirror < mddev->raid_disks; mirror++)
++ if (rdev->raid_disk >= 0)
++ first = last = rdev->raid_disk;
++
++ for (mirror = first; mirror <= last; mirror++)
+ if ( !(p=conf->mirrors+mirror)->rdev) {
+
+ blk_queue_stack_limits(mddev->queue,
+
+diff .prev/drivers/md/raid5.c ./drivers/md/raid5.c
+--- .prev/drivers/md/raid5.c 2008-05-30 14:49:35.000000000 +1000
++++ ./drivers/md/raid5.c 2008-06-10 10:27:51.000000000 +1000
+@@ -4399,21 +4399,27 @@ static int raid5_add_disk(mddev_t *mddev
+ int found = 0;
+ int disk;
+ struct disk_info *p;
++ int first = 0;
++ int last = conf->raid_disks - 1;
+
+ if (mddev->degraded > conf->max_degraded)
+ /* no point adding a device */
+ return 0;
+
++ if (rdev->raid_disk >= 0)
++ first = last = rdev->raid_disk;
++
+ /*
+ * find the disk ... but prefer rdev->saved_raid_disk
+ * if possible.
+ */
+ if (rdev->saved_raid_disk >= 0 &&
++ rdev->saved_raid_disk >= first &&
+ conf->disks[rdev->saved_raid_disk].rdev == NULL)
+ disk = rdev->saved_raid_disk;
+ else
+- disk = 0;
+- for ( ; disk < conf->raid_disks; disk++)
++ disk = first;
++ for ( ; disk <= last ; disk++)
+ if ((p=conf->disks + disk)->rdev == NULL) {
+ clear_bit(In_sync, &rdev->flags);
+ rdev->raid_disk = disk;
--- /dev/null
+touch_mnt_namespace when the mount flags change
+
+From: Dan Williams <dan.j.williams@intel.com>
+
+Daemons that need to be launched while the rootfs is read-only can now
+poll /proc/mounts to be notified when their O_RDWR requests may no
+longer end in EROFS.
+
+Cc: Kay Sievers <kay.sievers@vrfy.org>
+Cc: Neil Brown <neilb@suse.de>
+Signed-off-by: Dan Williams <dan.j.williams@intel.com>
+---
+
+ fs/namespace.c | 7 ++++++-
+ 1 files changed, 6 insertions(+), 1 deletions(-)
+
+
+diff --git a/fs/namespace.c b/fs/namespace.c
+index 6e283c9..1bd5ba2 100644
+--- a/fs/namespace.c
++++ b/fs/namespace.c
+@@ -1553,8 +1553,13 @@ static noinline int do_remount(struct nameidata *nd, int flags, int mnt_flags,
+ if (!err)
+ nd->path.mnt->mnt_flags = mnt_flags;
+ up_write(&sb->s_umount);
+- if (!err)
++ if (!err) {
+ security_sb_post_remount(nd->path.mnt, flags, data);
++
++ spin_lock(&vfsmount_lock);
++ touch_mnt_namespace(nd->path.mnt->mnt_ns);
++ spin_unlock(&vfsmount_lock);
++ }
+ return err;
+ }
+
version=`echo $7 | sed 's/v//'`
grep "^.TH MDADM 8 .. v$version" mdadm.8 > /dev/null 2>&1 ||
{
- echo mdadm.8 does not mention verion $version.
+ echo mdadm.8 does not mention version $version.
+ exit 1
+ }
+grep "^.TH MDMON 8 .. v$version" mdmon.8 > /dev/null 2>&1 ||
+ {
+ echo mdmon.8 does not mention version $version.
exit 1
}
rpmv=`echo $version | tr - _`
--- /dev/null
+/*
+ * mdmon - monitor external metadata arrays
+ *
+ * Copyright (C) 2007-2009 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2007-2009 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+/*
+ * The management thread for monitoring active md arrays.
+ * This thread does things which might block such as memory
+ * allocation.
+ * In particular:
+ *
+ * - Find out about new arrays in this container.
+ * Allocate the data structures and open the files.
+ *
+ * For this we watch /proc/mdstat and find new arrays with
+ * metadata type that confirms sharing. e.g. "md4"
+ * When we find a new array we slip it into the list of
+ * arrays and signal 'monitor' by writing to a pipe.
+ *
+ * - Respond to reshape requests by allocating new data structures
+ * and opening new files.
+ *
+ * These come as a change to raid_disks. We allocate a new
+ * version of the data structures and slip it into the list.
+ * 'monitor' will notice and release the old version.
+ * Changes to level, chunksize, layout.. do not need re-allocation.
+ * Reductions in raid_disks don't really either, but we handle
+ * them the same way for consistency.
+ *
+ * - When a device is added to the container, we add it to the metadata
+ * as a spare.
+ *
+ * - Deal with degraded array
+ * We only do this when first noticing the array is degraded.
+ * This can be when we first see the array, when sync completes or
+ * when recovery completes.
+ *
+ * Check if number of failed devices suggests recovery is needed, and
+ * skip if not.
+ * Ask metadata to allocate a spare device
+ * Add device as not in_sync and give a role
+ * Update metadata.
+ * Open sysfs files and pass to monitor.
+ * Make sure that monitor Starts recovery....
+ *
+ * - Pass on metadata updates from external programs such as
+ * mdadm creating a new array.
+ *
+ * This is most-messy.
+ * It might involve adding a new array or changing the status of
+ * a spare, or any reconfig that the kernel doesn't get involved in.
+ *
+ * The required updates are received via a named pipe. There will
+ * be one named pipe for each container. Each message contains a
+ * sync marker: 0x5a5aa5a5, a byte count, and the message. This is
+ * passed to the metadata handler which will interpret and process it.
+ * For 'DDF', messages are internal data blocks with the leading
+ * 'magic number' signifying what sort of data it is.
+ *
+ */
+
+/*
+ * We select on /proc/mdstat and the named pipe.
+ * We create new arrays or updated versions of arrays and slip
+ * them into the head of the list, then signal 'monitor' via a pipe write.
+ * 'monitor' will notice and place the old array on a return list.
+ * Metadata updates are placed on a queue just like they arrive
+ * from the named pipe.
+ *
+ * When new arrays are found based on correct metadata string, we
+ * need to identify them with an entry in the metadata. Maybe we require
+ * the metadata to be mdX/NN where NN is the index into an appropriate table.
+ *
+ */
+
+/*
+ * List of tasks:
+ * - Watch for spares to be added to the container, and write updated
+ * metadata to them.
+ * - Watch for new arrays using this container, confirm they match metadata
+ * and if so, start monitoring them
+ * - Watch for spares being added to monitored arrays. This shouldn't
+ * happen, as we should do all the adding. Just remove them.
+ * - Watch for change in raid-disks, chunk-size, etc. Update metadata and
+ * start a reshape.
+ */
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include "mdadm.h"
+#include "mdmon.h"
+#include <sys/syscall.h>
+#include <sys/socket.h>
+#include <signal.h>
+
+/* Close every file descriptor held by 'aa': first the state_fd of each
+ * member device on the aa->info.devs list, then the array-level
+ * action_fd, info.state_fd and resync_start_fd, in that order.
+ * Nothing is freed and the descriptor fields are left unchanged.
+ */
+static void close_aa(struct active_array *aa)
+{
+	struct mdinfo *dev = aa->info.devs;
+
+	while (dev != NULL) {
+		close(dev->state_fd);
+		dev = dev->next;
+	}
+
+	close(aa->action_fd);
+	close(aa->info.state_fd);
+	close(aa->resync_start_fd);
+}
+
+/* Release an active_array together with its private device list.
+ * The file descriptors are closed only when ->container is unset;
+ * ->container is set for a clone, whose fds may still be in use.
+ */
+static void free_aa(struct active_array *aa)
+{
+	struct mdinfo *dev, *following;
+
+	dprintf("%s: devnum: %d\n", __func__, aa->devnum);
+
+	if (aa->container == NULL)
+		close_aa(aa);
+
+	/* remember the successor before each node is freed */
+	for (dev = aa->info.devs; dev != NULL; dev = following) {
+		following = dev->next;
+		free(dev);
+	}
+	free(aa);
+}
+
+/* Make a private copy of an active_array.
+ * The struct is copied by value (so the fds and ->container are the
+ * same as in the original), the list linkage fields are cleared, and
+ * the device list is rebuilt from copies of those devices whose
+ * state_fd is valid.  Returns the new array.
+ * NOTE(review): neither malloc() result is checked before it is used.
+ */
+static struct active_array *duplicate_aa(struct active_array *aa)
+{
+ struct active_array *newa = malloc(sizeof(*newa));
+ struct mdinfo **dp1, **dp2;
+
+ *newa = *aa;
+ newa->next = NULL;
+ newa->replaces = NULL;
+ newa->info.next = NULL;
+
+ dp2 = &newa->info.devs; /* where the next copied device gets linked */
+
+ for (dp1 = &aa->info.devs; *dp1; dp1 = &(*dp1)->next) {
+ struct mdinfo *d;
+ if ((*dp1)->state_fd < 0)
+ continue; /* devices without an open state fd are not copied */
+
+ d = malloc(sizeof(*d));
+ *d = **dp1;
+ *dp2 = d;
+ dp2 = & d->next;
+ }
+ *dp2 = NULL; /* terminate the copied list */
+
+ return newa;
+}
+
+/* Send SIGUSR1 to the thread identified by mon_tid in this process.
+ * The raw syscall is issued directly; it is the equivalent of
+ * tgkill(getpid(), mon_tid, SIGUSR1).
+ */
+static void wakeup_monitor(void)
+{
+	int self = getpid();
+
+	syscall(SYS_tgkill, self, mon_tid, SIGUSR1);
+}
+
+/* If an array is waiting on 'discard_this', free it, clear the
+ * matching 'pending_discard' marker and wake the monitor.
+ * Does nothing when 'discard_this' is empty.
+ */
+static void remove_old(void)
+{
+	if (!discard_this)
+		return;
+
+	discard_this->next = NULL;
+	free_aa(discard_this);
+	if (discard_this == pending_discard)
+		pending_discard = NULL;
+	discard_this = NULL;
+	wakeup_monitor();
+}
+
+/* Publish 'new' at the head of container->arrays as the replacement
+ * for 'old' ('old' may be NULL, as manage() passes NULL for a brand
+ * new array).  Only one replacement can be outstanding: any earlier
+ * one is waited for and discarded first.
+ */
+static void replace_array(struct supertype *container,
+ struct active_array *old,
+ struct active_array *new)
+{
+ /* To replace an array, we add it to the top of the list
+ * marked with ->replaces to point to the original.
+ * 'monitor' will take the original out of the list
+ * and put it on 'discard_this'. We take it from there
+ * and discard it.
+ */
+ remove_old();
+ /* a previous replacement is still pending: poll once a second
+ * until the old array shows up on discard_this, then free it
+ */
+ while (pending_discard) {
+ while (discard_this == NULL)
+ sleep(1);
+ remove_old();
+ }
+ pending_discard = old;
+ new->replaces = old;
+ new->next = container->arrays;
+ container->arrays = new;
+ wakeup_monitor();
+}
+
+/* Metadata update queues:
+ * update_queue_pending - appended to by queue_metadata_update();
+ *	check_update_queue() moves it to update_queue once that is empty.
+ * update_queue - the batch currently handed over; the monitor is
+ *	woken when it is filled.
+ * update_queue_handled - entries that check_update_queue() frees;
+ *	presumably put here by the monitor when it is done - confirm.
+ */
+struct metadata_update *update_queue = NULL;
+struct metadata_update *update_queue_handled = NULL;
+struct metadata_update *update_queue_pending = NULL;
+
+/* Housekeeping for the metadata update queues:
+ * free everything on update_queue_handled, then, if update_queue is
+ * empty and updates are pending, hand the pending list over and wake
+ * the monitor.  'container' is not used.
+ */
+void check_update_queue(struct supertype *container)
+{
+	struct metadata_update *done;
+
+	while ((done = update_queue_handled) != NULL) {
+		update_queue_handled = done->next;
+		free(done->buf);
+		free(done->space);	/* free(NULL) is a no-op */
+		free(done);
+	}
+
+	if (update_queue != NULL || update_queue_pending == NULL)
+		return;
+
+	update_queue = update_queue_pending;
+	update_queue_pending = NULL;
+	wakeup_monitor();
+}
+
+/* Append 'mu' (possibly a chain linked through ->next) to the end of
+ * update_queue_pending.
+ */
+static void queue_metadata_update(struct metadata_update *mu)
+{
+	struct metadata_update **tail = &update_queue_pending;
+
+	/* advance to the NULL link that terminates the list */
+	for (; *tail != NULL; tail = &(*tail)->next)
+		;
+	*tail = mu;
+}
+
+/* A device has appeared in the container: record it and make sure the
+ * container metadata covers it.
+ *
+ * st: the container; 'sd' is pushed onto st->devs, so that list
+ *     takes ownership of it.
+ * sd: the new device (->disk.major / ->disk.minor identify it).
+ *
+ * If the device can be opened and already carries metadata that
+ * compare_super() accepts, with a raid_disk slot assigned, nothing
+ * more is done.  Otherwise it is added via add_to_super() and
+ * write_init_super(), and the resulting update is queued.
+ */
+static void add_disk_to_container(struct supertype *st, struct mdinfo *sd)
+{
+	int dfd;
+	char nm[32];	/* room for two full-width ints, ':' and the NUL */
+	struct supertype *st2;
+	struct metadata_update *update = NULL;
+	struct mdinfo info;
+	mdu_disk_info_t dk = {
+		.number = -1,
+		.major = sd->disk.major,
+		.minor = sd->disk.minor,
+		.raid_disk = -1,
+		.state = 0,
+	};
+
+	dprintf("%s: add %d:%d to container\n",
+		__func__, sd->disk.major, sd->disk.minor);
+
+	sd->next = st->devs;
+	st->devs = sd;
+
+	snprintf(nm, sizeof(nm), "%d:%d", sd->disk.major, sd->disk.minor);
+	dfd = dev_open(nm, O_RDWR);
+	if (dfd < 0)
+		return;
+
+	/* Check the metadata and see if it is already part of this
+	 * array
+	 */
+	st2 = dup_super(st);
+	if (st2->ss->load_super(st2, dfd, NULL) == 0) {
+		/* NOTE(review): info is filled from 'st', not from the
+		 * freshly loaded 'st2' - confirm this is intended.
+		 */
+		st2->ss->getinfo_super(st, &info);
+		if (st->ss->compare_super(st, st2) == 0 &&
+		    info.disk.raid_disk >= 0) {
+			/* Looks like a good member of array.
+			 * Just accept it.
+			 * mdadm will incorporate any parts into
+			 * active arrays.
+			 */
+			st2->ss->free_super(st2);
+			/* dfd was only used to read the superblock and
+			 * is not handed to anything on this path, so
+			 * close it rather than leak a descriptor for
+			 * every accepted device.
+			 */
+			close(dfd);
+			return;
+		}
+	}
+	st2->ss->free_super(st2);
+
+	/* dfd is passed to add_to_super() and left open, as before;
+	 * presumably the metadata handler keeps it - confirm.
+	 */
+	st->update_tail = &update;
+	st->ss->add_to_super(st, &dk, dfd, NULL);
+	st->ss->write_init_super(st);
+	queue_metadata_update(update);
+	st->update_tail = NULL;
+}
+
+/* Bring container->devs in line with the devices the kernel reports
+ * for the container.
+ *
+ * mdstat:    the mdstat entry for the container device itself.
+ * container: the container being managed.
+ *
+ * Nothing is done while mdstat->devcnt equals container->devcnt.
+ * Otherwise the sysfs device list is read; entries no longer present
+ * are dropped from container->devs and new ones are passed to
+ * add_disk_to_container().  If the list cannot be read, or memory runs
+ * out while adding a device, container->devcnt is left at -1 so the
+ * comparison is retried on the next call.
+ */
+static void manage_container(struct mdstat_ent *mdstat,
+			     struct supertype *container)
+{
+	/* The only thing of interest here is if a new device
+	 * has been added to the container.  We add it to the
+	 * array ignoring any metadata on it.
+	 * FIXME should we look for compatible metadata and take hints
+	 * about spare assignment.... probably not.
+	 */
+	struct mdinfo **cdp, *cd, *di, *mdi;
+	int found;
+	int incomplete = 0;	/* set when an addition had to be skipped */
+
+	if (mdstat->devcnt == container->devcnt)
+		return;
+
+	/* read /sys/block/NAME/md/dev-??/block/dev to find out
+	 * what is there, and compare with container->info.devs
+	 * To see what is removed and what is added.
+	 * These need to be remove from, or added to, the array
+	 */
+	mdi = sysfs_read(-1, mdstat->devnum, GET_DEVS|SKIP_GONE_DEVS);
+	if (!mdi) {
+		/* invalidate the current count so we can try again */
+		container->devcnt = -1;
+		return;
+	}
+
+	/* check for removals */
+	for (cdp = &container->devs; *cdp; ) {
+		found = 0;
+		for (di = mdi->devs; di; di = di->next)
+			if (di->disk.major == (*cdp)->disk.major &&
+			    di->disk.minor == (*cdp)->disk.minor) {
+				found = 1;
+				break;
+			}
+		if (!found) {
+			cd = *cdp;
+			*cdp = (*cdp)->next;
+			free(cd);
+		} else
+			cdp = &(*cdp)->next;
+	}
+
+	/* check for additions */
+	for (di = mdi->devs; di; di = di->next) {
+		for (cd = container->devs; cd; cd = cd->next)
+			if (di->disk.major == cd->disk.major &&
+			    di->disk.minor == cd->disk.minor)
+				break;
+		if (!cd) {
+			struct mdinfo *newd = malloc(sizeof(*newd));
+
+			if (!newd) {
+				/* remember the failure; the count is
+				 * decided once, after the loop
+				 */
+				incomplete = 1;
+				continue;
+			}
+			*newd = *di;
+			add_disk_to_container(container, newd);
+		}
+	}
+	sysfs_free(mdi);
+
+	/* Only record the new count when every device was handled.
+	 * Storing it unconditionally would overwrite the -1 retry
+	 * marker and the skipped device would never be picked up.
+	 */
+	container->devcnt = incomplete ? -1 : mdstat->devcnt;
+}
+
+/* Per-array work done by the manager for an already monitored member.
+ * Copies raid_disks and chunk_size from mdstat into the array info and,
+ * when ->check_degraded is set, asks the metadata handler for spares
+ * and swaps in a copy of the array that includes them.
+ */
+static void manage_member(struct mdstat_ent *mdstat,
+ struct active_array *a)
+{
+ /* Compare mdstat info with known state of member array.
+ * We do not need to look for device state changes here, that
+ * is dealt with by the monitor.
+ *
+ * We just look for changes which suggest that a reshape is
+ * being requested.
+ * Unfortunately decreases in raid_disks don't show up in
+ * mdstat until the reshape completes FIXME.
+ *
+ * Actually, we also want to handle degraded arrays here by
+ * trying to find and assign a spare.
+ * We do that whenever the monitor tells us to.
+ */
+ // FIXME
+ a->info.array.raid_disks = mdstat->raid_disks;
+ a->info.array.chunk_size = mdstat->chunk_size;
+ // MORE
+
+ if (a->check_degraded) {
+ struct metadata_update *updates = NULL;
+ struct mdinfo *newdev;
+ struct active_array *newa;
+
+ a->check_degraded = 0;
+
+ /* The array may not be degraded, this is just a good time
+ * to check.
+ */
+ newdev = a->container->ss->activate_spare(a, &updates);
+ if (newdev) {
+ struct mdinfo *d;
+ /* Cool, we can add a device or several. */
+ newa = duplicate_aa(a);
+ /* suspend recovery - maybe not needed */
+
+ /* Add device to array and set offset/size/slot.
+ * and open files for each newdev */
+ for (d = newdev; d ; d = d->next) {
+ struct mdinfo *newd;
+ if (sysfs_add_disk(&newa->info, d, 0) < 0)
+ continue; /* this spare could not be added; try the next */
+ /* NOTE(review): malloc() result is used unchecked */
+ newd = malloc(sizeof(*newd));
+ *newd = *d;
+ newd->next = newa->info.devs;
+ newa->info.devs = newd;
+
+ newd->state_fd = sysfs_open(a->devnum,
+ newd->sys_name,
+ "state");
+ newd->prev_state
+ = read_dev_state(newd->state_fd);
+ newd->curr_state = newd->prev_state;
+ }
+ /* queue the metadata changes, publish the new array, then
+ * write "recover" to sync_action
+ */
+ queue_metadata_update(updates);
+ replace_array(a->container, a, newa);
+ sysfs_set_str(&a->info, NULL, "sync_action", "recover");
+ }
+ }
+}
+
+/* Report whether an active_array has everything needed for monitoring.
+ * Returns 1 only when every device has a valid state_fd, the array
+ * state fd is valid, arrays with level > 0 also have valid action and
+ * resync_start fds, and ->container is set; otherwise returns 0.
+ */
+static int aa_ready(struct active_array *aa)
+{
+	struct mdinfo *dev = aa->info.devs;
+
+	while (dev != NULL) {
+		if (dev->state_fd < 0)
+			return 0;
+		dev = dev->next;
+	}
+
+	if (aa->info.state_fd < 0)
+		return 0;
+
+	if (aa->info.array.level > 0) {
+		if (aa->action_fd < 0)
+			return 0;
+		if (aa->resync_start_fd < 0)
+			return 0;
+	}
+
+	return aa->container != NULL;
+}
+
+/* Build an active_array for a member array that is not yet monitored
+ * and hand it to the monitor.
+ *
+ * mdstat:    mdstat entry of the member array.
+ * container: the container it belongs to.
+ * victim:    existing entry to be replaced, or NULL (see manage()).
+ *
+ * Returns silently if the array is not active or if allocation or the
+ * sysfs read fails.
+ */
+static void manage_new(struct mdstat_ent *mdstat,
+ struct supertype *container,
+ struct active_array *victim)
+{
+ /* A new array has appeared in this container.
+ * Hopefully it is already recorded in the metadata.
+ * Check, then create the new array to report it to
+ * the monitor.
+ */
+
+ struct active_array *new;
+ struct mdinfo *mdi, *di;
+ char *inst;
+ int i;
+ int failed = 0;
+
+ /* check if array is ready to be monitored */
+ if (!mdstat->active)
+ return;
+
+ mdi = sysfs_read(-1, mdstat->devnum,
+ GET_LEVEL|GET_CHUNK|GET_DISKS|GET_COMPONENT|
+ GET_DEGRADED|GET_DEVS|GET_OFFSET|GET_SIZE|GET_STATE);
+
+ new = malloc(sizeof(*new));
+
+ if (!new || !mdi) {
+ if (mdi)
+ sysfs_free(mdi);
+ if (new)
+ free(new);
+ return;
+ }
+ memset(new, 0, sizeof(*new));
+
+ new->devnum = mdstat->devnum;
+ /* NOTE(review): the string returned by devnum2devname() is not
+ * released here; if it is heap allocated this leaks - confirm.
+ */
+ strcpy(new->info.sys_name, devnum2devname(new->devnum));
+
+ new->prev_state = new->curr_state = new->next_state = inactive;
+ new->prev_action= new->curr_action= new->next_action= idle;
+
+ new->container = container;
+
+ /* skip a 10 character prefix, the container name and one more
+ * character; presumably "external:/<container>/" - confirm
+ */
+ inst = &mdstat->metadata_version[10+strlen(container->devname)+1];
+
+ new->info.array = mdi->array;
+ new->info.component_size = mdi->component_size;
+
+ /* one pass per slot: copy the sysfs entry whose raid_disk matches */
+ for (i = 0; i < new->info.array.raid_disks; i++) {
+ struct mdinfo *newd = malloc(sizeof(*newd));
+
+ for (di = mdi->devs; di; di = di->next)
+ if (i == di->disk.raid_disk)
+ break;
+
+ if (di && newd) {
+ memcpy(newd, di, sizeof(*newd));
+
+ newd->state_fd = sysfs_open(new->devnum,
+ newd->sys_name,
+ "state");
+
+ newd->prev_state = read_dev_state(newd->state_fd);
+ newd->curr_state = newd->prev_state;
+ } else {
+ /* slot has no device, or the allocation failed */
+ if (newd)
+ free(newd);
+
+ failed++;
+ if (failed > new->info.array.failed_disks) {
+ /* we cannot properly monitor without all working disks */
+ new->container = NULL;
+ break;
+ }
+ continue;
+ }
+ sprintf(newd->sys_name, "rd%d", i);
+ newd->next = new->info.devs;
+ new->info.devs = newd;
+ }
+
+ new->action_fd = sysfs_open(new->devnum, NULL, "sync_action");
+ new->info.state_fd = sysfs_open(new->devnum, NULL, "array_state");
+ new->resync_start_fd = sysfs_open(new->devnum, NULL, "resync_start");
+ new->metadata_fd = sysfs_open(new->devnum, NULL, "metadata_version");
+ get_resync_start(new);
+ dprintf("%s: inst: %d action: %d state: %d\n", __func__, atoi(inst),
+ new->action_fd, new->info.state_fd);
+
+ sysfs_free(mdi);
+
+ /* if everything checks out tell the metadata handler we want to
+ * manage this instance
+ */
+ if (!aa_ready(new) || container->ss->open_new(container, new, inst) < 0) {
+ fprintf(stderr, "mdmon: failed to monitor %s\n",
+ mdstat->metadata_version);
+ /* clearing ->container makes free_aa() close the fds too */
+ new->container = NULL;
+ free_aa(new);
+ } else {
+ replace_array(container, victim, new);
+ if (failed) {
+ /* some slots were empty: look for a spare straight away */
+ new->check_degraded = 1;
+ manage_member(mdstat, new);
+ }
+ }
+}
+
+/* Walk a freshly read mdstat list and dispatch every entry that
+ * concerns this container:
+ *  - the container device itself goes to manage_container();
+ *  - a member already on container->arrays with ->container set goes
+ *    to manage_member();
+ *  - a member that is unknown, or whose ->container is unset, goes to
+ *    manage_new().
+ * Entries that are not members of this container are ignored.
+ */
+void manage(struct mdstat_ent *mdstat, struct supertype *container)
+{
+	struct mdstat_ent *ent;
+
+	for (ent = mdstat; ent != NULL; ent = ent->next) {
+		struct active_array *a;
+
+		if (ent->devnum == container->devnum) {
+			manage_container(ent, container);
+			continue;
+		}
+
+		/* Not for this array */
+		if (!is_container_member(ent, container->devname))
+			continue;
+
+		/* Looks like a member of this container */
+		a = container->arrays;
+		while (a != NULL && a->devnum != ent->devnum)
+			a = a->next;
+
+		if (a != NULL && a->container)
+			manage_member(ent, a);
+
+		/* checked after manage_member() has run, as before */
+		if (a == NULL || !a->container)
+			manage_new(ent, container, a);
+	}
+}
+
+/* Act on one message received on the control socket.
+ *
+ * container: the container this manager serves.
+ * msg:       the received message; ->len selects the action:
+ *      0   ping_monitor: wait for the update queues to drain, then
+ *          wait for the monitor loop counter to advance
+ *     -1   ping_manager: wait for the update queues to drain, then
+ *          run manage() on a fresh mdstat
+ *   other  a metadata update: ownership of msg->buf moves to a new
+ *          metadata_update which is queued (skipped once sigterm is
+ *          set)
+ *
+ * If the metadata_update cannot be allocated the update is dropped and
+ * msg->buf is freed.
+ */
+static void handle_message(struct supertype *container, struct metadata_update *msg)
+{
+	/* queue this metadata update through to the monitor */
+
+	struct metadata_update *mu;
+
+	if (msg->len <= 0)
+		while (update_queue_pending || update_queue) {
+			check_update_queue(container);
+			usleep(15*1000);
+		}
+
+	if (msg->len == 0) { /* ping_monitor */
+		int cnt;
+
+		cnt = monitor_loop_cnt;
+		if (cnt & 1)
+			cnt += 2; /* wait until next pselect */
+		else
+			cnt += 3; /* wait for 2 pselects */
+		wakeup_monitor();
+
+		while (monitor_loop_cnt - cnt < 0)
+			usleep(10 * 1000);
+	} else if (msg->len == -1) { /* ping_manager */
+		struct mdstat_ent *mdstat = mdstat_read(1, 0);
+
+		manage(mdstat, container);
+		free_mdstat(mdstat);
+	} else if (!sigterm) {
+		mu = malloc(sizeof(*mu));
+		if (!mu) {
+			/* Out of memory: drop this update instead of
+			 * dereferencing NULL.  read_sock() forgets
+			 * msg->buf before the next message, so it must
+			 * be released here.
+			 */
+			free(msg->buf);
+			msg->buf = NULL;
+			return;
+		}
+		mu->len = msg->len;
+		mu->buf = msg->buf;
+		msg->buf = NULL;
+		mu->space = NULL;
+		mu->next = NULL;
+		if (container->ss->prepare_update)
+			container->ss->prepare_update(container, mu);
+		queue_metadata_update(mu);
+	}
+}
+
+/* Accept one connection on container->sock and serve it: each message
+ * that receive_message() delivers is passed to handle_message() and
+ * acknowledged.  The connection is closed as soon as receiving or
+ * acknowledging fails.  Returns at once if accept() fails.
+ */
+void read_sock(struct supertype *container)
+{
+	struct metadata_update msg;
+	long flags;
+	int tmo = 3; /* 3 second timeout before hanging up the socket */
+	int fd = accept(container->sock, NULL, NULL);
+
+	if (fd < 0)
+		return;
+
+	/* switch the accepted socket to non-blocking mode */
+	flags = fcntl(fd, F_GETFL, 0);
+	flags |= O_NONBLOCK;
+	fcntl(fd, F_SETFL, flags);
+
+	for (;;) {
+		msg.buf = NULL;
+
+		/* read and validate the message */
+		if (receive_message(fd, &msg, tmo) != 0)
+			break;
+
+		handle_message(container, &msg);
+		if (ack(fd, tmo) < 0)
+			break;
+	}
+
+	close(fd);
+}
+
+/* exit_now: when non-zero, do_manager() calls exit(0) at the top of
+ *	its loop.
+ * manager_ready: set to 1 by do_manager() once a pass of its loop has
+ *	got as far as checking the update queue.
+ */
+int exit_now = 0;
+int manager_ready = 0;
+/* Main loop of the manager; never returns (the process exits from
+ * here once exit_now is set).
+ * While no update is in flight each pass re-reads mdstat, runs
+ * manage(), serves one socket connection and re-creates the control
+ * socket when needed; it then discards replaced arrays, services the
+ * update queues and waits for the next event.
+ */
+void do_manager(struct supertype *container)
+{
+ struct mdstat_ent *mdstat;
+ sigset_t set;
+ int proc_fd;
+
+ /* fetch the current signal mask (nothing is changed because the
+ * new-set argument is NULL) and drop the signals that should be
+ * able to interrupt the waits below
+ */
+ sigprocmask(SIG_UNBLOCK, NULL, &set);
+ sigdelset(&set, SIGUSR1);
+ sigdelset(&set, SIGHUP);
+ sigdelset(&set, SIGALRM);
+ sigdelset(&set, SIGTERM);
+ /* NOTE(review): the comment at the top of the file speaks of
+ * /proc/mdstat, but /proc/mounts is what gets opened - confirm
+ */
+ proc_fd = open("/proc/mounts", O_RDONLY);
+
+ do {
+
+ if (exit_now)
+ exit(0);
+
+ /* Can only 'manage' things if 'monitor' is not making
+ * structural changes to metadata, so need to check
+ * update_queue
+ */
+ if (update_queue == NULL) {
+ mdstat = mdstat_read(1, 0);
+
+ manage(mdstat, container);
+
+ read_sock(container);
+
+ /* (re)create the control socket and pid file on request
+ * or when there is no socket yet
+ */
+ if (container->sock < 0 || socket_hup_requested) {
+ close(container->sock);
+ container->sock = make_control_sock(container->devname);
+ make_pidfile(container->devname, 0);
+ socket_hup_requested = 0;
+ }
+ /* still no socket: arrange a SIGALRM in 30 seconds */
+ if (container->sock < 0)
+ alarm(30);
+
+ free_mdstat(mdstat);
+ }
+ remove_old();
+
+ check_update_queue(container);
+
+ manager_ready = 1;
+
+ if (sigterm)
+ wakeup_monitor();
+
+ if (update_queue == NULL) {
+ /* wait on the control socket, or on proc_fd while
+ * there is no socket
+ */
+ if (container->sock < 0)
+ mdstat_wait_fd(proc_fd, &set);
+ else
+ mdstat_wait_fd(container->sock, &set);
+ } else
+ /* If an update is happening, just wait for signal */
+ pselect(0, NULL, NULL, NULL, NULL, &set);
+ } while(1);
+}
* mapfile - manage /var/run/mdadm.map. Part of:
* mdadm - manage Linux "md" devices aka RAID arrays.
*
- * Copyright (C) 2006 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2006-2009 Neil Brown <neilb@suse.de>
*
*
* This program is free software; you can redistribute it and/or modify
* also allows the array device name to be easily found.
*
* The map file is line based with space separated fields. The fields are:
- * Device id - mdX or mdpX where is a number.
- * metadata - 0.90 1.0 1.1 1.2
+ * Device id - mdX or mdpX where X is a number.
+ * metadata - 0.90 1.0 1.1 1.2 ddf ...
* UUID - uuid of the array
* path - path where device created: /dev/md/home
*
+ * The preferred location for the map file is /var/run/mdadm.map.
+ * However /var/run may not exist or be writable in early boot. And if
+ * no-one has created /var/run/mdadm, we still want to survive.
+ * So possible locations are:
+ * /var/run/mdadm/map /var/run/mdadm.map /dev/.mdadm.map
+ * the last, because udev requires a writable /dev very early.
+ * We read from the first one that exists and write to the first
+ * one that we can.
*/
+#include "mdadm.h"
+#include <ctype.h>
+#define mapnames(base) { #base, #base ".new", #base ".lock"}
+char *mapname[3][3] = {
+ mapnames(/var/run/mdadm/map),
+ mapnames(/var/run/mdadm.map),
+ mapnames(/dev/.mdadm.map)
+};
-#include "mdadm.h"
+int mapmode[3] = { O_RDONLY, O_RDWR|O_CREAT, O_RDWR|O_CREAT | O_TRUNC };
+char *mapsmode[3] = { "r", "w", "w"};
+/* Open one of the map files as a stdio stream.
+ *
+ * modenum: which file of each location to open; it indexes the second
+ *	dimension of mapname[] as well as mapmode[] and mapsmode[].
+ * choice:  set to the index of the location that could be opened.
+ *
+ * The locations are tried in order and the first one that open()
+ * accepts is used.  Returns the stream, or NULL if no location could
+ * be opened or the stream could not be created.
+ */
+FILE *open_map(int modenum, int *choice)
+{
+	int i;
+	for (i = 0 ; i < 3 ; i++) {
+		FILE *f;
+		int fd = open(mapname[i][modenum], mapmode[modenum], 0600);
+		if (fd < 0)
+			continue;
+		*choice = i;
+		f = fdopen(fd, mapsmode[modenum]);
+		if (!f)
+			/* fdopen() failed, so no stream owns the
+			 * descriptor; close it instead of leaking it
+			 */
+			close(fd);
+		return f;
+	}
+	return NULL;
+}
int map_write(struct map_ent *mel)
{
FILE *f;
int err;
- int subdir = 1;
+ int which;
+
+ f = open_map(1, &which);
- f = fopen("/var/run/mdadm/map.new", "w");
- if (!f) {
- f = fopen("/var/run/mdadm.map.new", "w");
- subdir = 0;
- }
if (!f)
return 0;
- while (mel) {
+ for (; mel; mel = mel->next) {
+ if (mel->bad)
+ continue;
if (mel->devnum < 0)
fprintf(f, "mdp%d ", -1-mel->devnum);
else
fprintf(f, "md%d ", mel->devnum);
- fprintf(f, "%d.%d ", mel->major, mel->minor);
+ fprintf(f, "%s ", mel->metadata);
fprintf(f, "%08x:%08x:%08x:%08x ", mel->uuid[0],
mel->uuid[1], mel->uuid[2], mel->uuid[3]);
- fprintf(f, "%s\n", mel->path);
- mel = mel->next;
+ fprintf(f, "%s\n", mel->path?:"");
}
fflush(f);
err = ferror(f);
fclose(f);
if (err) {
- if (subdir)
- unlink("/var/run/mdadm/map.new");
- else
- unlink("/var/run/mdadm.map.new");
+ unlink(mapname[which][1]);
return 0;
}
- if (subdir)
- return rename("/var/run/mdadm/map.new",
- "/var/run/mdadm/map") == 0;
- else
- return rename("/var/run/mdadm.map.new",
- "/var/run/mdadm.map") == 0;
+ return rename(mapname[which][1],
+ mapname[which][0]) == 0;
+}
+
+
+static FILE *lf = NULL;
+static int lwhich = 0;
+/* Take the map lock (if this process does not hold it yet) and
+ * (re)load the map into *melp.
+ * The lock is a lockf() lock on the lock file opened through
+ * open_map(2, ...); the stream is kept in 'lf' until map_unlock().
+ * Returns 0 on success, -1 if the lock file cannot be opened or locked.
+ */
+int map_lock(struct map_ent **melp)
+{
+ if (lf == NULL) {
+ lf = open_map(2, &lwhich);
+ if (lf == NULL)
+ return -1;
+ if (lockf(fileno(lf), F_LOCK, 0) != 0) {
+ fclose(lf);
+ lf = NULL;
+ return -1;
+ }
+ }
+ /* drop any map the caller already holds, then read it again */
+ if (*melp)
+ map_free(*melp);
+ map_read(melp);
+ return 0;
+}
+
+void map_unlock(struct map_ent **melp)
+{
+ if (lf)
+ fclose(lf);
+ unlink(mapname[lwhich][2]);
+ lf = NULL;
}
void map_add(struct map_ent **melp,
- int devnum, int major, int minor, int uuid[4], char *path)
+ int devnum, char *metadata, int uuid[4], char *path)
{
struct map_ent *me = malloc(sizeof(*me));
me->devnum = devnum;
- me->major = major;
- me->minor = minor;
+ strcpy(me->metadata, metadata);
memcpy(me->uuid, uuid, 16);
- me->path = strdup(path);
+ me->path = path ? strdup(path) : NULL;
me->next = *melp;
+ me->bad = 0;
*melp = me;
}
FILE *f;
char buf[8192];
char path[200];
- int devnum, major, minor, uuid[4];
+ int devnum, uuid[4];
+ char metadata[30];
char nam[4];
+ int which;
*melp = NULL;
- f = fopen("/var/run/mdadm/map", "r");
- if (!f)
- f = fopen("/var/run/mdadm.map", "r");
+ f = open_map(0, &which);
if (!f) {
RebuildMap();
- f = fopen("/var/run/mdadm/map", "r");
+ f = open_map(0, &which);
}
- if (!f)
- f = fopen("/var/run/mdadm.map", "r");
if (!f)
return;
while (fgets(buf, sizeof(buf), f)) {
- if (sscanf(buf, " md%1[p]%d %d.%d %x:%x:%x:%x %200s",
- nam, &devnum, &major, &minor, uuid, uuid+1,
- uuid+2, uuid+3, path) == 9) {
- if (nam[0] == 'p')
+ path[0] = 0;
+ if (sscanf(buf, " %3[mdp]%d %s %x:%x:%x:%x %200s",
+ nam, &devnum, metadata, uuid, uuid+1,
+ uuid+2, uuid+3, path) >= 7) {
+ if (strncmp(nam, "md", 2) != 0)
+ continue;
+ if (nam[2] == 'p')
devnum = -1 - devnum;
- map_add(melp, devnum, major, minor, uuid, path);
+ map_add(melp, devnum, metadata, uuid, path);
}
}
fclose(f);
}
}
-int map_update(struct map_ent **mpp, int devnum, int major, int minor,
+int map_update(struct map_ent **mpp, int devnum, char *metadata,
int *uuid, char *path)
{
struct map_ent *map, *mp;
for (mp = map ; mp ; mp=mp->next)
if (mp->devnum == devnum) {
- mp->major = major;
- mp->minor = minor;
+ strcpy(mp->metadata, metadata);
memcpy(mp->uuid, uuid, 16);
free(mp->path);
- mp->path = strdup(path);
+ mp->path = path ? strdup(path) : NULL;
break;
}
if (!mp)
- map_add(&map, devnum, major, minor, uuid, path);
- *mpp = NULL;
+ map_add(&map, devnum, metadata, uuid, path);
+ if (mpp)
+ *mpp = NULL;
rv = map_write(map);
map_free(map);
return rv;
if (!*map)
map_read(map);
- for (mp = *map ; mp ; mp = mp->next)
- if (memcmp(uuid, mp->uuid, 16) == 0)
- return mp;
+ for (mp = *map ; mp ; mp = mp->next) {
+ if (memcmp(uuid, mp->uuid, 16) != 0)
+ continue;
+ if (!mddev_busy(mp->devnum)) {
+ mp->bad = 1;
+ continue;
+ }
+ return mp;
+ }
+ return NULL;
+}
+
+struct map_ent *map_by_devnum(struct map_ent **map, int devnum)
+{
+ struct map_ent *mp;
+ if (!*map)
+ map_read(map);
+
+ for (mp = *map ; mp ; mp = mp->next) {
+ if (mp->devnum != devnum)
+ continue;
+ if (!mddev_busy(mp->devnum)) {
+ mp->bad = 1;
+ continue;
+ }
+ return mp;
+ }
return NULL;
+}
+
+struct map_ent *map_by_name(struct map_ent **map, char *name)
+{
+ struct map_ent *mp;
+ if (!*map)
+ map_read(map);
+ for (mp = *map ; mp ; mp = mp->next) {
+ if (!mp->path)
+ continue;
+ if (strncmp(mp->path, "/dev/md/", 8) != 0)
+ continue;
+ if (strcmp(mp->path+8, name) != 0)
+ continue;
+ if (!mddev_busy(mp->devnum)) {
+ mp->bad = 1;
+ continue;
+ }
+ return mp;
+ }
+ return NULL;
}
void RebuildMap(void)
struct mdstat_ent *md;
struct map_ent *map = NULL;
int mdp = get_mdp_major();
+ int require_homehost;
+ char sys_hostname[256];
+ char *homehost = conf_get_homehost(&require_homehost);
+
+ if (homehost == NULL || strcmp(homehost, "<system>")==0) {
+ if (gethostname(sys_hostname, sizeof(sys_hostname)) == 0) {
+ sys_hostname[sizeof(sys_hostname)-1] = 0;
+ homehost = sys_hostname;
+ }
+ }
for (md = mdstat ; md ; md = md->next) {
- struct mdinfo *sra = sysfs_read(-1, md->devnum, GET_DEVS);
+ struct mdinfo *sra = sysfs_read(-1, md->devnum, GET_DEVS|SKIP_GONE_DEVS);
struct mdinfo *sd;
+ if (!sra)
+ continue;
+
for (sd = sra->devs ; sd ; sd = sd->next) {
+ char namebuf[100];
char dn[30];
int dfd;
int ok;
if (ok != 0)
continue;
st->ss->getinfo_super(st, &info);
- if (md->devnum > 0)
+ if (md->devnum >= 0)
path = map_dev(MD_MAJOR, md->devnum, 0);
else
path = map_dev(mdp, (-1-md->devnum)<< 6, 0);
- map_add(&map, md->devnum, st->ss->major,
- st->minor_version,
- info.uuid, path ? : "/unknown");
+ if (path == NULL ||
+ strncmp(path, "/dev/md/", 8) != 0) {
+ /* We would really like a name that provides
+ * an MD_DEVNAME for udev.
+ * The name needs to be unique both in /dev/md/
+ * and in this mapfile.
+ * It needs to match what -I or -As would come
+ * up with.
+ * That means:
+ * Check if array is in mdadm.conf
+ * - if so use that.
+ * determine trustworthy from homehost etc
+ * find a unique name based on metadata name.
+ *
+ */
+ struct mddev_ident_s *match = conf_match(&info, st);
+ struct stat stb;
+ if (match && match->devname && match->devname[0] == '/') {
+ path = match->devname;
+ if (path[0] != '/') {
+ strcpy(namebuf, "/dev/md/");
+ strcat(namebuf, path);
+ path = namebuf;
+ }
+ } else {
+ int unum = 0;
+ char *sep = "_";
+ const char *name;
+ int conflict = 1;
+ if ((homehost == NULL ||
+ st->ss->match_home(st, homehost) != 1) &&
+ st->ss->match_home(st, "any") != 1 &&
+ (require_homehost
+ || ! conf_name_is_free(info.name)))
+ /* require a numeric suffix */
+ unum = 0;
+ else
+ /* allow name to be used as-is if no conflict */
+ unum = -1;
+ name = info.name;
+ if (!*name) {
+ name = st->ss->name;
+ if (!isdigit(name[strlen(name)-1]) &&
+ unum == -1) {
+ unum = 0;
+ sep = "";
+ }
+ }
+ if (strchr(name, ':'))
+ /* probably a uniquifying
+ * hostname prefix. Allow
+ * without a suffix
+ */
+ unum = -1;
+
+ while (conflict) {
+ if (unum >= 0)
+ sprintf(namebuf, "/dev/md/%s%s%d",
+ name, sep, unum);
+ else
+ sprintf(namebuf, "/dev/md/%s",
+ name);
+ unum++;
+ if (lstat(namebuf, &stb) != 0 &&
+ (map == NULL ||
+ !map_by_name(&map, namebuf+8)))
+ conflict = 0;
+ }
+ path = namebuf;
+ }
+ }
+ map_add(&map, md->devnum,
+ info.text_version,
+ info.uuid, path);
st->ss->free_super(st);
break;
}
+ sysfs_free(sra);
}
- free_mdstat(mdstat);
map_write(map);
map_free(map);
+ for (md = mdstat ; md ; md = md->next) {
+ struct mdinfo *sra = sysfs_read(-1, md->devnum, GET_VERSION);
+ sysfs_uevent(sra, "change");
+ sysfs_free(sra);
+ }
+ free_mdstat(mdstat);
}
.BI /dev/md n
.br
.BI /dev/md/ n
+.br
+.BR /dev/md/ name
.SH DESCRIPTION
The
.B md
MULTIPATH (a set of different interfaces to the same device),
and FAULTY (a layer over a single device into which errors can be injected).
-.SS MD SUPER BLOCK
-Each device in an array may have a
-.I superblock
-which records information about the structure and state of the array.
+.SS MD METADATA
+Each device in an array may have some
+.I metadata
+stored in the device. This metadata is sometimes called a
+.BR superblock .
+The metadata records information about the structure and state of the array.
This allows the array to be reliably re-assembled after a shutdown.
From Linux kernel version 2.6.10,
.B md
-provides support for two different formats of this superblock, and
+provides support for two different formats of metadata, and
other formats can be added. Prior to this release, only one format is
supported.
and 12K from the end of the device, on a 4K boundary, though
variations can be stored at the start of the device (version 1.1) or 4K from
the start of the device (version 1.2).
-This superblock format stores multibyte data in a
+This metadata format stores multibyte data in a
processor-independent format and supports up to hundreds of
component devices (version 0.90 only supports 28).
-The superblock contains, among other things:
+The metadata contains, among other things:
.TP
LEVEL
The manner in which the devices are arranged into the array
a 128 bit Universally Unique Identifier that identifies the array that
contains this device.
+.PP
When a version 0.90 array is being reshaped (e.g. adding extra devices
to a RAID5), the version number is temporarily set to 0.91. This
ensures that if the reshape process is stopped in the middle (e.g. by
would cause data corruption) but will be left untouched until a kernel
that can complete the reshape processes is used.
-.SS ARRAYS WITHOUT SUPERBLOCKS
+.SS ARRAYS WITHOUT METADATA
While it is usually best to create arrays with superblocks so that
they can be assembled reliably, there are some circumstances when an
array without superblocks is preferred. These include:
the array elsewhere. While not encouraged for general us, it does
have special-purpose uses and is supported.
+.SS ARRAYS WITH EXTERNAL METADATA
+
+From release 2.6.28, the
+.I md
+driver supports arrays with externally managed metadata. That is,
+the metadata is not managed by the kernel but rather by a user-space
+program which is external to the kernel. This allows support for a
+variety of metadata formats without cluttering the kernel with lots of
+details.
+.PP
+.I md
+is able to communicate with the user-space program through various
+sysfs attributes so that it can make appropriate changes to the
+metadata \- for example to mark a device as faulty. When necessary,
+.I md
+will wait for the program to acknowledge the event by writing to a
+sysfs attribute.
+The manual page for
+.IR mdmon (8)
+contains more detail about this interaction.
+
+.SS CONTAINERS
+Many metadata formats use a single block of metadata to describe a
+number of different arrays which all use the same set of devices.
+In this case it is helpful for the kernel to know about the full set
+of devices as a whole. This set is known to md as a
+.IR container .
+A container is an
+.I md
+array with externally managed metadata and with device offset and size
+so that it just covers the metadata part of the devices. The
+remainder of each device is available to be incorporated into various
+arrays.
+
.SS LINEAR
A linear array simply catenates the available space on each
striped array.
A RAID0 array is configured at creation with a
.B "Chunk Size"
-which must be a power of two, and at least 4 kibibytes.
+which must be a power of two (prior to Linux 2.6.31), and at least 4
+kibibytes.
The RAID0 driver assigns the first chunk of the array to the first
device, the second chunk to the second device, and so on until all
-drives have been assigned one chunk. This collection of chunks forms
-a
+drives have been assigned one chunk. This collection of chunks forms a
.BR stripe .
Further chunks are gathered into stripes in the same way, and are
assigned to the remaining space in the drives.
spindle. In theory, having an N-disk RAID1 will allow N sequential
threads to read from all disks.
+Individual devices in a RAID1 can be marked as "write-mostly".
+These drives are excluded from the normal read balancing and will only
+be read from when there is no other option. This can be useful for
+devices connected over a slow link.
+
.SS RAID4
A RAID4 array is like a RAID0 array with an extra device for storing
devices, often fibre channel interfaces, that all refer the the same
real device. If one of these interfaces fails (e.g. due to cable
problems), the multipath driver will attempt to redirect requests to
-another interface.
+another interface.
+
+The MULTIPATH driver is not receiving any ongoing development and
+should be considered a legacy driver. The device-mapper based
+multipath drivers should be preferred for new installations.
.SS FAULTY
The FAULTY md module is provided for testing purposes. A faulty array
.B md/stripe_cache_size
This is only available on RAID5 and RAID6. It records the size (in
pages per device) of the stripe cache which is used for synchronising
-all read and write operations to the array. The default is 128.
+all write operations to the array and all read operations if the array
+is degraded. The default is 256. Valid values are 17 to 32768.
Increasing this number can increase performance in some situations, at
-some cost in system memory.
+some cost in system memory. Note, setting this value too high can
+result in an "out of memory" condition for the system.
+memory_consumed = system_page_size * nr_disks * stripe_cache_size
+
+.TP
+.B md/preread_bypass_threshold
+This is only available on RAID5 and RAID6. This variable sets the
+number of times MD will service a full-stripe-write before servicing a
+stripe that requires some "prereading". For fairness this defaults to
+1. Valid values are 0 to stripe_cache_size. Setting this to 0
+maximizes sequential-write throughput at the cost of fairness to threads
+doing small or random writes.
.SS KERNEL PARAMETERS
.TP
.B md_mod.start_ro=1
+.TP
+.B /sys/module/md_mod/parameters/start_ro
This tells md to start all arrays in read-only mode. This is a soft
read-only that will automatically switch to read-write on the first
write request. However until that write request, nothing is written
.TP
.B md_mod.start_dirty_degraded=1
+.TP
+.B /sys/module/md_mod/parameters/start_dirty_degraded
As mentioned above, md will not normally start a RAID4, RAID5, or
RAID6 that is both dirty and degraded as this situation can imply
hidden data loss. This can be awkward if the root filesystem is
speed for times when non-rebuild activity is current on an array.
The speed is in Kibibytes per second, and is a per-device rate, not a
per-array rate (which means that an array with more disks will shuffle
-more data for a given speed). The default is 100.
+more data for a given speed). The default is 1000.
.TP
.B /proc/sys/dev/raid/speed_limit_max
A readable and writable file that reflects the current "goal" rebuild
speed for times when no non-rebuild activity is current on an array.
-The default is 100,000.
+The default is 200,000.
.SH SEE ALSO
.BR mdadm (8),
.\" the Free Software Foundation; either version 2 of the License, or
.\" (at your option) any later version.
.\" See file COPYING in distribution for details.
-.TH MDADM 8 "" v2.6.9
+.TH MDADM 8 "" v3.0-rc1
.SH NAME
mdadm \- manage MD devices
.I aka
.SH DESCRIPTION
RAID devices are virtual devices created from two or more
-real block devices. This allows multiple devices (typically disk
+real block devices. This allows multiple devices (typically disk
drives or partitions thereof) to be combined into a single device to
hold (for example) a single filesystem.
Some RAID levels include redundancy and so can survive some degree of
.BR RAID6 ,
.BR RAID10 ,
.BR MULTIPATH ,
+.BR FAULTY ,
and
-.BR FAULTY .
+.BR CONTAINER .
.B MULTIPATH
is not a Software RAID mechanism, but does involve
multiple devices:
each device is a path to one common physical storage device.
+New installations should not use md/multipath as it is not well
+supported and has no ongoing development. Use the Device Mapper based
+multipath-tools instead.
.B FAULTY
is also not true RAID, and it only involves one device. It
provides a layer over a true device that can be used to inject faults.
-.\".I mdadm
-.\"is a program that can be used to create, manage, and monitor
-.\"MD devices. As
-.\"such it provides a similar set of functionality to the
-.\".B raidtools
-.\"packages.
-.\"The key differences between
-.\".I mdadm
-.\"and
-.\".B raidtools
-.\"are:
-.\".IP \(bu 4
-.\".I mdadm
-.\"is a single program and not a collection of programs.
-.\".IP \(bu 4
-.\".I mdadm
-.\"can perform (almost) all of its functions without having a
-.\"configuration file and does not use one by default. Also
-.\".I mdadm
-.\"helps with management of the configuration
-.\"file.
-.\".IP \(bu 4
-.\".I mdadm
-.\"can provide information about your arrays (through Query, Detail, and Examine)
-.\"that
-.\".B raidtools
-.\"cannot.
-.\".P
-.\".I mdadm
-.\"does not use
-.\".IR /etc/raidtab ,
-.\"the
-.\".B raidtools
-.\"configuration file, at all. It has a different configuration file
-.\"with a different format and a different purpose.
+.B CONTAINER
+is different again. A
+.B CONTAINER
+is a collection of devices that are
+managed as a set. This is similar to the set of devices connected to
+a hardware RAID controller. The set of devices may contain a number
+of different RAID arrays each utilising some (or all) of the blocks from a
+number of the devices in the set. For example, two devices in a 5-device set
+might form a RAID1 using the whole devices. The remaining three might
+have a RAID5 over the first half of each device, and a RAID0 over the
+second half.
+
+With a
+.BR CONTAINER ,
+there is one set of metadata that describes all of
+the arrays in the container. So when
+.I mdadm
+creates a
+.B CONTAINER
+device, the device just represents the metadata. Other normal arrays (RAID1
+etc) can be created inside the container.
.SH MODES
mdadm has several major modes of operation:
.TP
.B Assemble
Assemble the components of a previously created
-array into an active array. Components can be explicitly given
+array into an active array. Components can be explicitly given
or can be searched for.
.I mdadm
checks that the components
.TP
.B Build
-Build an array that doesn't have per-device superblocks. For these
+Build an array that doesn't have per-device metadata (superblocks). For these
sorts of arrays,
.I mdadm
cannot differentiate between initial creation and subsequent assembly
.TP
.B Create
-Create a new array with per-device superblocks.
-.\"It can progress
-.\"in several step create-add-add-run or it can all happen with one command.
+Create a new array with per-device metadata (superblocks).
+Appropriate metadata is written to each device, and then the array
+comprising those devices is activated. A 'resync' process is started
+to make sure that the array is consistent (e.g. both sides of a mirror
+contain the same data) but the content of the device is left otherwise
+untouched.
+The array can be used as soon as it has been created. There is no
+need to wait for the initial resync to finish.
.TP
.B "Follow or Monitor"
Monitor one or more md devices and act on any state changes. This is
-only meaningful for raid1, 4, 5, 6, 10 or multipath arrays, as
-only these have interesting state. raid0 or linear never have
+only meaningful for RAID1, 4, 5, 6, 10 or multipath arrays, as
+only these have interesting state. RAID0 or Linear never have
missing, spare, or failed drives, so there is nothing to monitor.
.TP
.I mdadm
has a chance to include it in some array as appropriate.
+If a
+.B CONTAINER
+is passed to
+.I mdadm
+in this mode, then any arrays within that container will be assembled
+and started.
+
.TP
.B Manage
This is for doing things to specific components of an array such as
is compiled into the kernel \(em not if it is a module.
Arrays can be auto-detected by the kernel if all the components are in
primary MS-DOS partitions with partition type
-.BR FD .
+.BR FD ,
+and all use v0.90 metadata.
In-kernel autodetect is not recommended for new installations. Using
.I mdadm
to detect and assemble arrays \(em possibly in an
.BR \-\-fail ,
or
.BR \-\-remove ,
-then the MANAGE mode is assume.
+then the MANAGE mode is assumed.
Anything other than these will cause the
.B Misc
mode to be assumed.
then nothing will be read, but
.I mdadm
will act as though the config file contained exactly
-.B "DEVICE partitions"
+.B "DEVICE partitions containers"
and will read
.B /proc/partitions
-to find a list of devices to scan.
+to find a list of devices to scan, and
+.B /proc/mdstat
+to find a list of containers to examine.
If the word
.B "none"
is given for the config file, then
.TP
.B \-e ", " \-\-metadata=
-Declare the style of superblock (raid metadata) to be used. The
+Declare the style of RAID metadata (superblock) to be used. The
default is 0.90 for
.BR \-\-create ,
and to guess for other operations.
The different sub-versions store the superblock at different locations
on the device, either at the end (for 1.0), at the start (for 1.1) or
4K from the start (for 1.2).
+.IP ddf
+Use the "Industry Standard" DDF (Disk Data Format) format defined by
+SNIA.
+When creating a DDF array a
+.B CONTAINER
+will be created, and normal arrays can be created in that container.
+.IP imsm
+Use the Intel(R) Matrix Storage Manager metadata format. This creates a
+.B CONTAINER
+which is managed in a similar manner to DDF, and is supported by an
+option-rom on some platforms:
+.IP
+.B http://www.intel.com/design/chipsets/matrixstorage_sb.htm
+.PP
.RE
.TP
When creating an array, the
.B homehost
-will be recorded in the superblock. For version-1 superblocks, it will
+will be recorded in the metadata. For version-1 superblocks, it will
be prefixed to the array name. For version-0.90 superblocks, part of
the SHA1 hash of the hostname will be stored in the later half of the
UUID.
for the given homehost will be reported as such.
When using Auto-Assemble, only arrays tagged for the given homehost
-will be assembled.
+will be allowed to use 'local' names (i.e. not ending in '_' followed
+by a digit string). See below under
+.BR "Auto Assembly" .
.SH For create, build, or grow:
.I component-devices
(including "\fBmissing\fP" devices)
that are listed on the command line for
-.BR \-\-create .
+.BR \-\-create .
Setting a value of 1 is probably
a mistake and so requires that
.B \-\-force
be specified first. A value of 1 will then be allowed for linear,
-multipath, raid0 and raid1. It is never allowed for raid4 or raid5.
+multipath, RAID0 and RAID1. It is never allowed for RAID4, RAID5 or RAID6.
.br
This number can only be changed using
.B \-\-grow
-for RAID1, RAID5 and RAID6 arrays, and only on kernels which provide
-necessary support.
+for RAID1, RAID4, RAID5 and RAID6 arrays, and only on kernels which provide
+the necessary support.
.TP
.BR \-x ", " \-\-spare\-devices=
Specify the number of spare (eXtra) devices in the initial array.
Spares can also be added
and removed later. The number of component devices listed
-on the command line must equal the number of raid devices plus the
+on the command line must equal the number of RAID devices plus the
number of spare devices.
-
.TP
.BR \-z ", " \-\-size=
-Amount (in Kibibytes) of space to use from each drive in RAID level 1/4/5/6.
+Amount (in Kibibytes) of space to use from each drive in RAID levels 1/4/5/6.
This must be a multiple of the chunk size, and must leave about 128Kb
of space at the end of the drive for the RAID superblock.
If this is not specified
This value can be set with
.B \-\-grow
-for RAID level 1/4/5/6. If the array was created with a size smaller
+for RAID level 1/4/5/6. If the array was created with a size smaller
than the currently active drives, the extra space can be accessed
using
.BR \-\-grow .
.B max
which means to choose the largest size that fits on all current drives.
+This value can not be used with
+.B CONTAINER
+metadata such as DDF and IMSM.
+
.TP
.BR \-c ", " \-\-chunk=
Specify chunk size of kibibytes. The default is 64.
+This is only meaningful for RAID0, RAID4, RAID5, RAID6, and RAID10.
.TP
.BR \-\-rounding=
-Specify rounding factor for linear array (==chunk size)
+Specify rounding factor for a Linear array. The size of each
+component will be rounded down to a multiple of this size.
+This is a synonym for
+.B \-\-chunk
+but highlights the different meaning for Linear as compared to other
+RAID levels.
.TP
.BR \-l ", " \-\-level=
-Set raid level. When used with
+Set RAID level. When used with
.BR \-\-create ,
options are: linear, raid0, 0, stripe, raid1, 1, mirror, raid4, 4,
-raid5, 5, raid6, 6, raid10, 10, multipath, mp, faulty. Obviously some of these are synonymous.
+raid5, 5, raid6, 6, raid10, 10, multipath, mp, faulty, container.
+Obviously some of these are synonymous.
+
+When a
+.B CONTAINER
+metadata type is requested, only the
+.B container
+level is permitted, and it does not need to be explicitly given.
When used with
.BR \-\-build ,
.TP
.BR \-p ", " \-\-layout=
-This option configures the fine details of data layout for raid5,
-and raid10 arrays, and controls the failure modes for
+This option configures the fine details of data layout for RAID5, RAID6,
+and RAID10 arrays, and controls the failure modes for
.IR faulty .
-The layout of the raid5 parity block can be one of
+The layout of the RAID5 parity block can be one of
.BR left\-asymmetric ,
.BR left\-symmetric ,
.BR right\-asymmetric ,
The default is
.BR left\-symmetric .
+It is also possible to cause RAID5 to use a RAID4-like layout by
+choosing
+.BR parity\-first ,
+or
+.BR parity\-last .
+
+Finally for RAID5 there are DDF\-compatible layouts,
+.BR ddf\-zero\-restart ,
+.BR ddf\-N\-restart ,
+and
+.BR ddf\-N\-continue .
+
+These same layouts are available for RAID6. There are also 4 layouts
+that will provide an intermediate stage for converting between RAID5
+and RAID6. These provide a layout which is identical to the
+corresponding RAID5 layout on the first N\-1 devices, and has the 'Q'
+syndrome (the second 'parity' block used by RAID6) on the last device.
+These layouts are:
+.BR left\-symmetric\-6 ,
+.BR right\-symmetric\-6 ,
+.BR left\-asymmetric\-6 ,
+.BR right\-asymmetric\-6 ,
+and
+.BR parity\-first\-6 .
+
When setting the failure mode for level
.I faulty,
the options are:
by a small number. The default is 'n2'. The supported options are:
.I 'n'
-signals 'near' copies. Multiple copies of one data block are at
+signals 'near' copies. Multiple copies of one data block are at
similar offsets in different devices.
.I 'o'
.I 'f'
signals 'far' copies
(multiple copies have very different offsets).
-See md(4) for more detail about 'near' and 'far'.
+See md(4) for more detail about 'near', 'offset', and 'far'.
The number is the number of copies of each datablock. 2 is normal, 3
can be useful. This number can be at most equal to the number of
.TP
.BR \-\-bitmap\-chunk=
-Set the chunksize of the bitmap. Each bit corresponds to that many
+Set the chunksize of the bitmap. Each bit corresponds to that many
Kilobytes of storage.
When using a file based bitmap, the default is to use the smallest
size that is at-least 4 and requires no more than 2^21 chunks.
bitmap, the chunksize is automatically determined to make best use of
available space.
-
.TP
.BR \-W ", " \-\-write\-mostly
-subsequent devices lists in a
+subsequent devices listed in a
.BR \-\-build ,
.BR \-\-create ,
or
.TP
.BR \-\-write\-behind=
Specify that write-behind mode should be enabled (valid for RAID1
-only). If an argument is specified, it will set the maximum number
-of outstanding writes allowed. The default value is 256.
+only). If an argument is specified, it will set the maximum number
+of outstanding writes allowed. The default value is 256.
A write-intent bitmap is required in order to use write-behind
mode, and write-behind is only attempted on drives marked as
.IR write-mostly .
data will be affected unless you actually write to the array. It can
also be used when creating a RAID1 or RAID10 if you want to avoid the
initial resync, however this practice \(em while normally safe \(em is not
-recommended. Use this only if you really know what you are doing.
+recommended. Use this only if you really know what you are doing.
.TP
.BR \-\-backup\-file=
This is needed when
.B \-\-grow
is used to increase the number of
-raid-devices in a RAID5 if there are no spare devices available.
-See the section below on RAID_DEVICE CHANGES. The file should be
-stored on a separate device, not on the raid array being reshaped.
+raid-devices in a RAID5 if there are no spare devices available.
+See the GROW MODE section below on RAID\-DEVICES CHANGES. The file
+should be stored on a separate device, not on the RAID array being
+reshaped.
.TP
.BR \-N ", " \-\-name=
Set a
.B name
for the array. This is currently only effective when creating an
-array with a version-1 superblock. The name is a simple textual
-string that can be used to identify array components when assembling.
+array with a version-1 superblock, or an array in a DDF container.
+The name is a simple textual string that can be used to identify array
+components when assembling. If name is needed but not specified, it
+is taken from the basename of the device that is being created.
+e.g. when creating
+.I /dev/md/home
+the
+.B name
+will default to
+.IR home .
.TP
.BR \-R ", " \-\-run
accept the geometry and layout specified without question. Normally
.I mdadm
will not allow creation of an array with only one device, and will try
-to create a raid5 array with one missing drive (as this makes the
+to create a RAID5 array with one missing drive (as this makes the
initial resync work faster). With
.BR \-\-force ,
.I mdadm
will not try to be so clever.
.TP
-.BR \-a ", " "\-\-auto{=no,yes,md,mdp,part,p}{NN}"
-Instruct mdadm to create the device file if needed, possibly allocating
+.BR \-a ", " "\-\-auto{=yes,md,mdp,part,p}{NN}"
+Instruct mdadm how to create the device file if needed, possibly allocating
an unused minor number. "md" causes a non-partitionable array
-to be used. "mdp", "part" or "p" causes a partitionable array (2.6 and
+to be used (though since Linux 2.6.28, these array devices are in fact
+partitionable). "mdp", "part" or "p" causes a partitionable array (2.6 and
later) to be used. "yes" requires the named md device to have
a 'standard' format, and the type and minor number will be determined
-from this. See DEVICE NAMES below.
+from this. With mdadm 3.0, device creation is normally left up to
+.I udev
+so this option is unlikely to be needed.
+See DEVICE NAMES below.
The argument can also come immediately after
"\-a". e.g. "\-ap".
end of this option (e.g.
.BR \-\-auto=p7 ).
If the device name ends with a digit, the partition names add a 'p',
-and a number, e.g. "/dev/home1p3". If there is no
-trailing digit, then the partition names just have a number added,
-e.g. "/dev/scratch3".
+and a number, e.g.
+.IR /dev/md/home1p3 .
+If there is no trailing digit, then the partition names just have a
+number added, e.g.
+.IR /dev/md/scratch3 .
If the md device name is in a 'standard' format as described in DEVICE
NAMES, then it will be created, if necessary, with the appropriate
-number based on that name. If the device name is not in one of these
-formats, then a unused minor number will be allocated. The minor
+device number based on that name. If the device name is not in one of these
+formats, then an unused device number will be allocated. The device
number will be considered unused if there is no active array for that
number, and there is no entry in /dev for that number and with a
-non-standard name.
-
-.TP
-.BR \-\-symlink = no
-Normally when
-.B \-\-auto
-causes
-.I mdadm
-to create devices in
-.B /dev/md/
-it will also create symlinks from
-.B /dev/
-with names starting with
-.B md
-or
-.BR md_ .
-Use
-.B \-\-symlink=no
-to suppress this, or
-.B \-\-symlink=yes
-to enforce this even if it is suppressing
-.IR mdadm.conf .
-
+non-standard name. Names that are not in 'standard' format are only
+allowed in "/dev/md/".
+
+.\".TP
+.\".BR \-\-symlink = no
+.\"Normally when
+.\".B \-\-auto
+.\"causes
+.\".I mdadm
+.\"to create devices in
+.\".B /dev/md/
+.\"it will also create symlinks from
+.\".B /dev/
+.\"with names starting with
+.\".B md
+.\"or
+.\".BR md_ .
+.\"Use
+.\".B \-\-symlink=no
+.\"to suppress this, or
+.\".B \-\-symlink=yes
+.\"to enforce this even if it is suppressing
+.\".IR mdadm.conf .
+.\"
.SH For assemble:
.TP
.BR \-u ", " \-\-uuid=
-uuid of array to assemble. Devices which don't have this uuid are
+uuid of array to assemble. Devices which don't have this uuid are
excluded
.TP
.B \-\-super\-minor=dev
will look for super blocks with a minor number of 0.
+.B \-\-super\-minor
+is only relevant for v0.90 metadata, and should not normally be used.
+Using
+.B \-\-uuid
+is much safer.
+
.TP
.BR \-N ", " \-\-name=
Specify the name of the array to assemble. This must be the name
.TP
.BR \-f ", " \-\-force
-Assemble the array even if some superblocks appear out-of-date
+Assemble the array even if the metadata on some devices appears to be
+out-of-date. If
+.I mdadm
+cannot find enough working devices to start the array, but can find
+some devices that are recorded as having failed, then it will mark
+those devices as working so that the array can be started.
+An array which requires
+.B \-\-force
+to be started may contain data corruption. Use it carefully.
.TP
.BR \-R ", " \-\-run
reports a different "Preferred Minor" to
.BR \-\-detail .
In some cases this update will be performed automatically
-by the kernel driver. In particular the update happens automatically
+by the kernel driver. In particular the update happens automatically
at the first write to an array with redundancy (RAID level 1 or
greater) on a 2.6 (or later) kernel.
.B resync
option will cause the array to be marked
.I dirty
-meaning that any redundancy in the array (e.g. parity for raid5,
-copies for raid1) may be incorrect. This will cause the raid system
+meaning that any redundancy in the array (e.g. parity for RAID5,
+copies for RAID1) may be incorrect. This will cause the RAID system
to perform a "resync" pass to make sure that all redundant information
is correct.
The
.B summaries
-option will correct the summaries in the superblock. That is the
+option will correct the summaries in the superblock. That is the
counts of total, working, active, failed, and spare devices.
The
to determine the maximum usable amount of space on each device and
update the relevant field in the metadata.
+.ig XX
.TP
.B \-\-auto\-update\-homehost
This flag is only meaningful with auto-assembly (see discussion below).
.I mdadm
will rescan for any arrays at all and will assemble them and update the
homehost to match the current host.
+.XX
.SH For Manage mode:
.TP
.BR \-a ", " \-\-add
-hot-add listed devices.
+hot-add listed devices. For arrays with redundancy, the listed
+devices become available as spares. If the array is degraded, it will
+immediately start recovering data on to one of these spares.
.TP
.BR \-\-re\-add
-re-add a device that was recently removed from an array.
+re-add a device that was recently removed from an array. This is only
+needed for arrays that have been built (i.e. with
+.BR \-\-build ).
+For created arrays, devices are always re-added if that is possible.
+When re-adding a device, if nothing has changed on the array since the
+device was removed, no recovery is performed. Also, if the array has
+a write-intent bitmap, then the recovery performed after a re-add will
+be limited to those blocks which, according to the bitmap, might have
+changed since the device was removed.
.TP
.BR \-r ", " \-\-remove
.TP
.BR \-\-write\-mostly
Subsequent devices that are added or re-added will have the 'write-mostly'
-flag set. This is only valid for RAID! and means that the 'md' driver
+flag set. This is only valid for RAID1 and means that the 'md' driver
will avoid reading from these devices if possible.
.TP
.BR \-\-readwrite
Subsequent devices that are added or re-added will have the 'write-mostly'
flag cleared.
-
.P
-Each of these options require that the first device listed is the array
+Each of these options requires that the first device listed is the array
to be acted upon, and the remainder are component devices to be added,
-removed, or marked as faulty. Several different operations can be
+removed, marked as faulty, etc. Several different operations can be
specified for different devices, e.g.
.in +5
mdadm /dev/md0 \-\-add /dev/sda1 \-\-fail /dev/sdb1 \-\-remove /dev/sdb1
.TP
.BR \-D ", " \-\-detail
-Print detail of one or more md devices.
+Print details of one or more md devices.
+
+.TP
+.BR \-\-detail\-platform
+Print details of the platform's RAID capabilities (firmware / hardware
+topology) for a given metadata format.
.TP
.BR \-Y ", " \-\-export
.TP
.BR \-E ", " \-\-examine
-Print content of md superblock on device(s).
+Print contents of the metadata stored on the named device(s).
+Note the contrast between
+.B \-\-examine
+and
+.BR \-\-detail .
+.B \-\-examine
+applies to devices which are components of an array, while
+.B \-\-detail
+applies to a whole array which is currently active.
.TP
.B \-\-sparc2.2
-If an array was created on a 2.2 Linux kernel patched with RAID
-support, the superblock will have been created incorrectly, or at
-least incompatibly with 2.4 and later kernels. Using the
+If an array was created on a SPARC machine with a 2.2 Linux kernel
+patched with RAID support, the superblock will have been created
+incorrectly, or at least incompatibly with 2.4 and later kernels.
+Using the
.B \-\-sparc2.2
flag with
.B \-\-examine
.BR \-X ", " \-\-examine\-bitmap
Report information about a bitmap file.
The argument is either an external bitmap file or an array component
-in case of an internal bitmap.
+in case of an internal bitmap. Note that running this on an array
+device (e.g.
+.BR /dev/md0 )
+does not report the bitmap for that array.
.TP
.BR \-R ", " \-\-run
-start a partially built array.
+start a partially assembled array. If
+.B \-\-assemble
+did not find enough devices to fully start the array, it might leave
+it partially assembled. If you wish, you can then use
+.B \-\-run
+to start the array in degraded mode.
.TP
.BR \-S ", " \-\-stop
.BR \-\-detail ,
the exit status of
.I mdadm
-is set to reflect the status of the device.
+is set to reflect the status of the device. See below in
+.B MISC MODE
+for details.
.TP
.BR \-W ", " \-\-wait
will return with success if it actually waited for every device
listed, otherwise it will return failure.
+.TP
+.BR \-\-wait\-clean
+For each md device given, or each device in /proc/mdstat if
+.B \-\-scan
+is given, arrange for the array to be marked clean as soon as possible.
+Also, quiesce resync so that the monitor for external metadata arrays
+(mdmon) has an opportunity to checkpoint the resync position.
+.I mdadm
+will return with success if the array uses external metadata and we
+successfully waited. For native arrays this returns immediately as the
+kernel handles both dirty-clean transitions and resync checkpointing in
+the kernel at shutdown. No action is taken if safe-mode handling is
+disabled.
+
.SH For Incremental Assembly mode:
.TP
.BR \-\-rebuild\-map ", " \-r
Run any array assembled as soon as a minimal number of devices are
available, rather than waiting until all expected devices are present.
+.TP
+.B \-\-no\-degraded
+This allows the hot-plug system to prevent arrays from running when it knows
+that more disks may arrive later in the discovery process.
+
.TP
.BR \-\-scan ", " \-s
Only meaningful with
Give a delay in seconds.
.I mdadm
polls the md arrays and then waits this many seconds before polling
-again. The default is 60 seconds.
+again. The default is 60 seconds. Since 2.6.16, there is no need to
+reduce this as the kernel alerts
+.I mdadm
+immediately when there is any change.
.TP
.BR \-f ", " \-\-daemonise
Tell
.I mdadm
to run as a background daemon if it decides to monitor anything. This
-causes it to fork and run in the child, and to disconnect form the
+causes it to fork and run in the child, and to disconnect from the
terminal. The process id of the child is written to stdout.
This is useful with
.B \-\-scan
.HP 12
Usage:
.B mdadm \-\-assemble \-\-scan
-.I md-devices-and-options...
+.I md-devices-and-options...
.HP 12
Usage:
.B mdadm \-\-assemble \-\-scan
-.I options...
+.I options...
.PP
-This usage assembles one or more raid arrays from pre-existing components.
+This usage assembles one or more RAID arrays from pre-existing components.
For each array, mdadm needs to know the md device, the identity of the
-array, and a number of component-devices. These can be found in a number of ways.
+array, and a number of component-devices. These can be found in a number of ways.
In the first usage example (without the
.BR \-\-scan )
In the second usage example, all devices listed are treated as md
devices and assembly is attempted.
In the third (where no devices are listed) all md devices that are
-listed in the configuration file are assembled.
+listed in the configuration file are assembled. If no arrays are
+described by the configuration file, then any arrays that
+can be found on unused devices will be assembled.
If precisely one device is listed, but
.B \-\-scan
The identity can be given with the
.B \-\-uuid
-option, with the
+option, the
+.B \-\-name
+option, or the
.B \-\-super\-minor
option, will be taken from the md-device record in the config file, or
will be taken from the super block of the first component-device
Devices can be given on the
.B \-\-assemble
-command line or in the config file. Only devices which have an md
+command line or in the config file. Only devices which have an md
superblock which contains the right identity will be considered for
any array.
.BR \-\-scan .
In the later case,
.B /etc/mdadm.conf
+or
+.B /etc/mdadm/mdadm.conf
is used.
If
Normally the array will be started after it is assembled. However if
.B \-\-scan
-is not given and insufficient drives were listed to start a complete
-(non-degraded) array, then the array is not started (to guard against
-usage errors). To insist that the array be started in this case (as
-may work for RAID1, 4, 5, 6, or 10), give the
+is not given and not all expected drives were listed, then the array
+is not started (to guard against usage errors). To insist that the
+array be started in this case (as may work for RAID1, 4, 5, 6, or 10),
+give the
.B \-\-run
flag.
-If the md device does not exist, then it will be created providing the
-intent is clear. i.e. the name must be in a standard form, or the
-.B \-\-auto
-option must be given to clarify how and whether the device should be
-created.
-This can be useful for handling partitioned devices (which don't have
-a stable device number \(em it can change after a reboot) and when using
-"udev" to manage your
+If
+.I udev
+is active,
+.I mdadm
+does not create any entries in
.B /dev
-tree (udev cannot handle md devices because of the unusual device
-initialisation conventions).
+but leaves that to
+.IR udev .
+It does record information in
+.B /var/run/mdadm/map
+which will allow
+.I udev
+to choose the correct name.
-If the option to "auto" is "mdp" or "part" or (on the command line
-only) "p", then mdadm will create a partitionable array, using the
-first free one that is not in use and does not already have an entry
-in /dev (apart from numeric /dev/md* entries).
+If
+.I mdadm
+detects that udev is not configured, it will create the devices in
+.B /dev
+itself.
-If the option to "auto" is "yes" or "md" or (on the command line)
-nothing, then mdadm will create a traditional, non-partitionable md
-array.
+In Linux kernels prior to version 2.6.28 there were two distinctly
+different types of md devices that could be created: one that could be
+partitioned using standard partitioning tools and one that could not.
+Since 2.6.28 that distinction is no longer relevant as both types of
+devices can be partitioned.
+.I mdadm
+will normally create the type that originally could not be partitioned
+as it has a well defined major number (9).
-It is expected that the "auto" functionality will be used to create
-device entries with meaningful names such as "/dev/md/home" or
-"/dev/md/root", rather than names based on the numerical array number.
+Prior to 2.6.28, it is important that mdadm chooses the correct type
+of array device to use. This can be controlled with the
+.B \-\-auto
+option. In particular, a value of "mdp" or "part" or "p" tells mdadm
+to use a partitionable device rather than the default.
-When using option "auto" to create a partitionable array, the device
-files for the first 4 partitions are also created. If a different
-number is required it can be simply appended to the auto option.
-e.g. "auto=part8". Partition names are created by appending a digit
-string to the device name, with an intervening "p" if the device name
-ends with a digit.
+In the no-udev case, the value given to
+.B \-\-auto
+can be suffixed by a number. This tells
+.I mdadm
+to create that number of partition devices rather than the default of 4.
-The
+The value given to
.B \-\-auto
-option is also available in Build and Create modes. As those modes do
-not use a config file, the "auto=" config option does not apply to
-these modes.
+can also be given in the configuration file as a word starting
+.B auto=
+on the ARRAY line for the relevant array.
.SS Auto Assembly
When
will first attempt to assemble all the arrays listed in the config
file.
-If a
-.B homehost
-has been specified (either in the config file or on the command line),
-.I mdadm
-will look further for possible arrays and will try to assemble
-anything that it finds which is tagged as belonging to the given
-homehost. This is the only situation where
-.I mdadm
-will assemble arrays without being given specific device name or
-identity information for the array.
+If no arrays are listed in the config (other than those marked
+.BR <ignore> )
+it will look through the available devices for possible arrays and
+will try to assemble anything that it finds. Arrays which are tagged
+as belonging to the given homehost will be assembled and started
+normally. Arrays which do not obviously belong to this host are given
+names that are expected not to conflict with anything local, and are
+started "read-auto" so that nothing is written to any device until the
+array is written to. i.e. automatic resync etc is delayed.
If
.I mdadm
If the array uses version-1 metadata, then the
.B name
from the superblock is used to similarly create a name in
-.BR /dev/md
+.B /dev/md/
(the name will have any 'host' prefix stripped first).
+.ig XX
If
.I mdadm
cannot find any array for the given host at all, and if
The reason for requiring arrays to be tagged with the homehost for
auto assembly is to guard against problems that can arise when moving
devices from one host to another.
+.XX
.SH BUILD MODE
.PP
This usage is similar to
.BR \-\-create .
-The difference is that it creates an array without a superblock. With
+The difference is that it creates an array without a superblock. With
these arrays there is no difference between initially creating the array and
subsequently assembling the array, except that hopefully there is useful
data there in the second case.
-The level may raid0, linear, multipath, or faulty, or one of their
-synonyms. All devices must be listed and the array will be started
-once complete.
+The level may be raid0, linear, raid1, raid10, multipath, or faulty, or
+one of their synonyms. All devices must be listed and the array will
+be started once complete. It will often be appropriate to use
+.B \-\-assume\-clean
+with levels raid1 or raid10.
.SH CREATE MODE
.BI \-\-level= Y
.br
.BI \-\-raid\-devices= Z
-.I devices
+.I devices
.PP
This usage will initialise a new md array, associate some devices with
it, and activate the array.
-If the
-.B \-\-auto
-option is given (as described in more detail in the section on
-Assemble mode), then the md device will be created with a suitable
-device number if necessary.
+The named device will normally not exist when
+.I "mdadm \-\-create"
+is run, but will be created by
+.I udev
+once the array becomes active.
-As devices are added, they are checked to see if they contain raid
-superblocks or filesystems. They are also checked to see if the variance in
+As devices are added, they are checked to see if they contain RAID
+superblocks or filesystems. They are also checked to see if the variance in
device size exceeds 1%.
If any discrepancy is found, the array will not automatically be run, though
When creating a RAID5 array,
.I mdadm
will automatically create a degraded array with an extra spare drive.
-This is because building the spare into a degraded array is in general faster than resyncing
-the parity on a non-degraded, but not clean, array. This feature can
-be overridden with the
+This is because building the spare into a degraded array is in general
+faster than resyncing the parity on a non-degraded, but not clean,
+array. This feature can be overridden with the
.B \-\-force
option.
.B home
will be used.
-When creating a partition based array, using
-.I mdadm
-with version-1.x metadata, the partition type should be set to
+When creating a partition based array, using
+.I mdadm
+with version-1.x metadata, the partition type should be set to
.B 0xDA
-(non fs-data). This type selection allows for greater precision since
+(non fs-data). This type selection allows for greater precision since
using any other [RAID auto-detect (0xFD) or a GNU/Linux partition (0x83)],
might create problems in the event of array recovery through a live cdrom.
.\".B \-\-size
.\"is given, the apparent size of the smallest drive given is used.
+When creating an array within a
+.B CONTAINER
+.I mdadm
+can be given either the list of devices to use, or simply the name of
+the container. The former case gives control over which devices in
+the container will be used for the array. The latter case allows
+.I mdadm
+to automatically choose which devices to use based on how much spare
+space is available.
+
The General Management options that are valid with
.B \-\-create
are:
.B \-\-readonly
start the array readonly \(em not supported yet.
-
.SH MANAGE MODE
.HP 12
Usage:
This usage will allow individual devices in an array to be failed,
removed or added. It is possible to perform multiple operations with
-on command. For example:
+on command. For example:
.br
.B " mdadm /dev/md0 \-f /dev/hda1 \-r /dev/hda1 \-a /dev/hda1"
.br
in as a spare. However only one md array can be affected by a single
command.
+When a device is added to an active array, mdadm checks to see if it
+has metadata on it which suggests that it was recently a member of the
+array. If it does, it tries to "re-add" the device. If there have
+been no changes since the device was removed, or if the array has a
+write-intent bitmap which has recorded whatever changes there were,
+then the device will immediately become a full member of the array and
+those differences recorded in the bitmap will be resolved.
+
.SH MISC MODE
.HP 12
Usage:
.B mdadm
.I options ...
-.I devices ...
+.I devices ...
.PP
MISC mode includes a number of distinct operations that
.TP
.B \-\-detail
The device should be an active md device.
-.B mdadm
+.B mdadm
will display a detailed description of the array.
.B \-\-brief
or
There was an error while trying to get information about the device.
.RE
+.TP
+.B \-\-detail\-platform
+Print detail of the platform's RAID capabilities (firmware / hardware
+topology). If the metadata is specified with
+.B \-e
+or
+.B \-\-metadata=
+then the return status will be:
+.RS
+.TP
+0
+metadata successfully enumerated its platform components on this system
+.TP
+1
+metadata is platform independent
+.TP
+2
+metadata failed to find its platform components on this system
+.RE
+
.TP
.B \-\-examine
The device should be a component of an md array.
.B \-\-scan
causes all devices listed in the config file to be examined.
-
.SH MONITOR MODE
.HP 12
If any devices are listed on the command line,
.I mdadm
-will only monitor those devices. Otherwise all arrays listed in the
+will only monitor those devices. Otherwise all arrays listed in the
configuration file will be monitored. Further, if
.B \-\-scan
is given, then any other md devices that appear in
.B NewArray
A new md array has been detected in the
.B /proc/mdstat
-file. (syslog priority: Info)
+file. (syslog priority: Info)
.TP
.B DegradedArray
array.
For this to work, the kernel must support the necessary change.
Various types of growth are being added during 2.6 development,
-including restructuring a raid5 array to have more active devices.
+including restructuring a RAID5 array to have more active devices.
Currently the only support available is to
.IP \(bu 4
remove a write-intent bitmap from such an array.
.PP
+GROW mode is not currently supported for
+.B CONTAINERS
+or arrays inside containers.
+
.SS SIZE CHANGES
Normally when an array is built the "size" it taken from the smallest
of the drives. If all the small drives in an arrays are, one at a
stored in the array will not automatically grow to use the space. The
filesystem will need to be explicitly told to use the extra space.
-.SS RAID-DEVICES CHANGES
+Also the size of an array cannot be changed while it has an active
+bitmap. If an array has a bitmap, it must be removed before the size
+can be changed. Once the change is complete a new bitmap can be created.
+
+.SS RAID\-DEVICES CHANGES
A RAID1 array can work with any number of devices from 1 upwards
(though 1 is not very useful). There may be times which you want to
Increasing the number of active devices in a RAID5 is much more
effort. Every block in the array will need to be read and written
back to a new location. From 2.6.17, the Linux Kernel is able to do
-this safely, including restart and interrupted "reshape".
+this safely, including restarting an interrupted "reshape".
-When relocating the first few stripes on a raid5, it is not possible
+When relocating the first few stripes on a RAID5, it is not possible
to keep the data on disk completely consistent and crash-proof. To
provide the required safety, mdadm disables writes to the array while
this "critical section" is reshaped, and takes a backup of the data
A write-intent bitmap can be added to, or removed from, an active
array. Either internal bitmaps, or bitmaps stored in a separate file,
can be added. Note that if you add a bitmap stored in a file which is
-in a filesystem that is on the raid array being affected, the system
+in a filesystem that is on the RAID array being affected, the system
will deadlock. The bitmap must be on a separate filesystem.
.SH INCREMENTAL MODE
Usage:
.B mdadm \-\-incremental \-\-run \-\-scan
-
.PP
This mode is designed to be used in conjunction with a device
discovery system. As devices are found in a system, they can be
.B "mdadm \-\-incremental"
to be conditionally added to an appropriate array.
+If the device passed is a
+.B CONTAINER
+device created by a previous call to
+.IR mdadm ,
+then rather than trying to add that device to an array, all the arrays
+described by the metadata of the container will be started.
+
.I mdadm
performs a number of tests to determine if the device is part of an
array, and which array it should be part of. If an appropriate array
(active or spare) parts of that array. It does not currently support
automatic inclusion of a new drive as a spare in some array.
-.B "mdadm \-\-incremental"
-requires a bug-fix in all kernels through 2.6.19.
-Hopefully, this will be fixed in 2.6.20; alternately, apply the patch
-which is included with the mdadm source distribution. If
-.I mdadm
-detects that this bug is present, it will abort any attempt to use
-.BR \-\-incremental .
-
The tests that
.I mdadm
makes are as follow:
.I md
metadata is found, the device is rejected.
+.ig XX
.IP +
Does the metadata match an expected array?
The metadata can match in two ways. Either there is an array listed
.I mdadm
is not able to positively identify the array as belonging to the
current host, the device will be rejected.
+.XX
-.IP +
.I mdadm
keeps a list of arrays that it has partially assembled in
.B /var/run/mdadm/map
(or
.B /var/run/mdadm.map
-if the directory doesn't exist). If no array exists which matches
+if the directory doesn't exist. Or maybe even
+.BR /dev/.mdadm.map ).
+If no array exists which matches
the metadata on the new device,
.I mdadm
must choose a device name and unit number. It does this based on any
suggests that a non-partitionable array is preferred, that will be
honoured.
-.IP +
+If the array is not found in the config file and its metadata does not
+identify it as belonging to the "homehost", then
+.I mdadm
+will choose a name for the array which is certain not to conflict with
+any array which does belong to this host. It does this by adding an
+underscore and a small number to the name preferred by the metadata.
+
Once an appropriate array is found or created and the device is added,
.I mdadm
must decide if the array is ready to be started. It will
may be passed to
.I mdadm
in which case the array will be run as soon as there are enough
-devices present for the data to be accessible. For a raid1, that
-means one device will start the array. For a clean raid5, the array
+devices present for the data to be accessible. For a RAID1, that
+means one device will start the array. For a clean RAID5, the array
will be started as soon as all but one drive is present.
Note that neither of these approaches is really ideal. If it can
happens. Further devices that are found before the first write can
still be added safely.
+.SH ENVIRONMENT
+This section describes environment variables that affect how mdadm
+operates.
+
+.TP
+.B MDADM_NO_MDMON
+Setting this value to 1 will prevent mdadm from automatically launching
+mdmon. This variable is intended primarily for debugging mdadm/mdmon.
+
+.TP
+.B MDADM_NO_UDEV
+Normally,
+.I mdadm
+does not create any device nodes in /dev, but leaves that task to
+.IR udev .
+If
+.I udev
+appears not to be configured, or if this environment variable is set
+to '1', then
+.I mdadm
+will create any devices that are needed.
+
.SH EXAMPLES
.B " mdadm \-\-query /dev/name-of-device"
.br
-This will find out if a given device is a raid array, or is part of
+This will find out if a given device is a RAID array, or is part of
one, and will provide brief information about the device.
.B " mdadm \-\-assemble \-\-scan"
Any devices which are components of /dev/md4 will be marked as faulty
and then remove from the array.
+.B " mdadm \-\-create /dev/md/ddf \-\-metadata=ddf \-\-raid\-disks 6 /dev/sd[a-f]"
+.br
+Create a DDF array over 6 devices.
+
+.B " mdadm \-\-create /dev/md/home \-n3 \-l5 \-z 30000000 /dev/md/ddf"
+.br
+Create a RAID5 array over any 3 devices in the given DDF set. Use
+only 30 gigabytes of each device.
+
+.B " mdadm \-A /dev/md/ddf1 /dev/sd[a-f]"
+.br
+Assemble a pre-existing ddf array.
+
+.B " mdadm \-I /dev/md/ddf1"
+.br
+Assemble all arrays contained in the ddf array, assigning names as
+appropriate.
+
.B " mdadm \-\-create \-\-help"
.br
Provide help about the Create mode.
.br
Provide general help.
-
.SH FILES
.SS /proc/mdstat
is given in Misc mode, and to monitor array reconstruction
on Monitor mode.
-
.SS /etc/mdadm.conf
The config file lists which devices may be scanned to see if
.B /var/run/mdadm
does not exist as a directory, then
.B /var/run/mdadm.map
-is used instead.
+is used instead. If
+.B /var/run
+is not available (as may be the case during early boot),
+.B /dev/.mdadm.map
+is used on the basis that
+.B /dev
+is usually available very early in boot.
.SH DEVICE NAMES
-While entries in the /dev directory can have any format you like,
.I mdadm
-has an understanding of 'standard' formats which it uses to guide its
-behaviour when creating device files via the
-.B \-\-auto
-option.
+understands two sorts of names for array devices.
+
+The first is the so-called 'standard' format name, which matches the
+names used by the kernel and which appear in
+.IR /proc/mdstat .
+
+The second sort can be freely chosen, but must reside in
+.IR /dev/md/ .
+When giving a device name to
+.I mdadm
+to create or assemble an array, either a full path name such as
+.I /dev/md0
+or
+.I /dev/md/home
+can be given, or just the suffix of the second sort of name, such as
+.I home
+can be given.
+
+When
+.I mdadm
+chooses device names during auto-assembly or incremental assembly, it
+will sometimes add a small sequence number to the end of the name to
+avoid conflicts between multiple arrays that have the same name. If
+.I mdadm
+can reasonably determine that the array really is meant for this host,
+either by a hostname in the metadata, or by the presence of the array
+in /etc/mdadm.conf, then it will leave off the suffix if possible.
+Also if the homehost is specified as
+.B <ignore>
+.I mdadm
+will only use a suffix if a different array of the same name already
+exists or is listed in the config file.
The standard names for non-partitioned arrays (the only sort of md
-array available in 2.4 and earlier) are either of
+array available in 2.4 and earlier) are of the form
.IP
/dev/mdNN
-.br
-/dev/md/NN
.PP
where NN is a number.
The standard names for partitionable arrays (as available from 2.6
-onwards) are either of
+onwards) are of the form
.IP
-/dev/md/dNN
-.br
/dev/md_dNN
.PP
Partition numbers should be indicated by added "pMM" to these, thus "/dev/md/d1p2".
+.PP
+From kernel version 2.6.28, the "non-partitioned array" can actually
+be partitioned. So the "md_dNN" names are no longer needed, and
+partitions such as "/dev/mdNNpXX" are possible.
.SH NOTE
.I mdadm
.\"for new releases of the RAID driver check out:
.\"
.\".IP
-.\".UR ftp://ftp.kernel.org/pub/linux/kernel/people/mingo/raid-patches
+.\".UR ftp://ftp.kernel.org/pub/linux/kernel/people/mingo/raid-patches
.\"ftp://ftp.kernel.org/pub/linux/kernel/people/mingo/raid-patches
.\".UE
.\".PP
.PP
Related man pages:
.PP
+.IR mdmon (8),
.IR mdadm.conf (5),
.IR md (4).
.PP
/*
* mdadm - manage Linux "md" devices aka RAID arrays.
*
- * Copyright (C) 2001-2006 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
*
*
* This program is free software; you can redistribute it and/or modify
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
* Author: Neil Brown
- * Email: <neilb@cse.unsw.edu.au>
- * Paper: Neil Brown
- * School of Computer Science and Engineering
- * The University of New South Wales
- * Sydney, 2052
- * Australia
+ * Email: <neilb@suse.de>
*
* Additions for bitmap and write-behind RAID options, Copyright (C) 2003-2004,
* Paul Clements, SteelEye Technology, Inc.
char *homehost = NULL;
char sys_hostname[256];
+ int require_homehost = 1;
char *mailaddr = NULL;
char *program = NULL;
int delay = 0;
ident.bitmap_fd = -1;
ident.bitmap_file = NULL;
ident.name[0] = 0;
+ ident.container = NULL;
+ ident.member = NULL;
while ((option_index = -1) ,
(opt=getopt_long(argc, argv,
shortopt, long_options,
&option_index)) != -1) {
int newmode = mode;
- /* firstly, some mode-independant options */
+ /* firstly, some mode-independent options */
switch(opt) {
case 'h':
if (option_index > 0 &&
continue;
case HomeHost:
- homehost = optarg;
+ if (strcasecmp(optarg, "<ignore>") == 0)
+ require_homehost = 0;
+ else
+ homehost = optarg;
continue;
case ':':
case 'o':
case 'w':
case 'W':
+ case Waitclean:
+ case DetailPlatform:
case 'K': if (!mode) newmode = MISC; break;
}
if (mode && newmode == mode) {
dv->writemostly = writemostly;
dv->re_add = re_add;
dv->used = 0;
+ dv->content = NULL;
dv->next = NULL;
*devlistend = dv;
devlistend = &dv->next;
dv->disposition = devmode;
dv->writemostly = writemostly;
dv->re_add = re_add;
+ dv->used = 0;
+ dv->content = NULL;
dv->next = NULL;
*devlistend = dv;
devlistend = &dv->next;
}
continue;
+#if 0
case O(ASSEMBLE,AutoHomeHost):
auto_update_home = 1;
continue;
+#endif
case O(INCREMENTAL, 'e'):
case O(CREATE,'e'):
case O(ASSEMBLE,'e'):
optarg);
exit(2);
}
- if (level != 0 && level != -1 && level != 1 && level != -4 && level != -5 && mode == BUILD) {
+ if (level != 0 && level != LEVEL_LINEAR && level != 1 &&
+ level != LEVEL_MULTIPATH && level != LEVEL_FAULTY &&
+ level != 10 &&
+ mode == BUILD) {
fprintf(stderr, Name ": Raid level %s not permitted with --build.\n",
optarg);
exit(2);
exit(2);
case 5:
- case 6:
layout = map_name(r5layout, optarg);
if (layout==UnSet) {
fprintf(stderr, Name ": layout %s not understood for raid5.\n",
exit(2);
}
break;
+ case 6:
+ layout = map_name(r6layout, optarg);
+ if (layout==UnSet) {
+ fprintf(stderr, Name ": layout %s not understood for raid6.\n",
+ optarg);
+ exit(2);
+ }
+ break;
case 10:
/* 'f', 'o' or 'n' followed by a number <= raid_disks */
" 'summaries', 'homehost', 'byteorder', 'devicesize'.\n");
exit(outf == stdout ? 0 : 2);
+ case O(INCREMENTAL,NoDegraded):
case O(ASSEMBLE,NoDegraded): /* --no-degraded */
runstop = -1; /* --stop isn't allowed for --assemble,
* so we overload slightly */
case O(MISC,'o'):
case O(MISC,'w'):
case O(MISC,'W'):
+ case O(MISC, Waitclean):
+ case O(MISC, DetailPlatform):
if (devmode && devmode != opt &&
(devmode == 'E' || (opt == 'E' && devmode != 'Q'))) {
fprintf(stderr, Name ": --examine/-E cannot be given with -%c\n",
fprintf(stderr, Name ": --super-minor=dev is incompatible with --auto\n");
exit(2);
}
- if (mode == MANAGE || mode == GROW)
- autof=1; /* Don't create */
- mdfd = open_mddev(devlist->devname, autof);
- if (mdfd < 0)
+ if (mode == MANAGE || mode == GROW) {
+ mdfd = open_mddev(devlist->devname, 1);
+ if (mdfd < 0)
+ exit(1);
+ } else
+ /* non-existent device is OK */
+ mdfd = open_mddev(devlist->devname, 0);
+ if (mdfd == -2) {
+ fprintf(stderr, Name ": device %s exists but is not an "
+ "md array.\n", devlist->devname);
exit(1);
+ }
if ((int)ident.super_minor == -2) {
struct stat stb;
+ if (mdfd < 0) {
+ fprintf(stderr, Name ": --super-minor=dev given, and "
+ "listed device %s doesn't exist.\n",
+ devlist->devname);
+ exit(1);
+ }
fstat(mdfd, &stb);
ident.super_minor = minor(stb.st_rdev);
}
+ if (mdfd >= 0 && mode != MANAGE && mode != GROW) {
+ /* We don't really want this open yet, we just might
+ * have wanted to check some things
+ */
+ close(mdfd);
+ mdfd = -1;
+ }
}
if (raiddisks) {
}
if (homehost == NULL)
- homehost = conf_get_homehost();
- if (homehost && strcmp(homehost, "<system>")==0) {
+ homehost = conf_get_homehost(&require_homehost);
+ if (homehost == NULL || strcmp(homehost, "<system>")==0) {
if (gethostname(sys_hostname, sizeof(sys_hostname)) == 0) {
sys_hostname[sizeof(sys_hostname)-1] = 0;
homehost = sys_hostname;
}
}
+ ident.autof = autof;
+
rv = 0;
switch(mode) {
case MANAGE:
fprintf(stderr, Name ": %s not identified in config file.\n",
devlist->devname);
rv |= 1;
- } else {
- mdfd = open_mddev(devlist->devname,
- array_ident->autof ? array_ident->autof : autof);
- if (mdfd < 0)
- rv |= 1;
- else {
- rv |= Assemble(ss, devlist->devname, mdfd, array_ident,
- NULL, backup_file,
- readonly, runstop, update, homehost, verbose-quiet, force);
+ if (mdfd >= 0)
close(mdfd);
- }
+ } else {
+ if (array_ident->autof == 0)
+ array_ident->autof = autof;
+ rv |= Assemble(ss, devlist->devname, array_ident,
+ NULL, backup_file,
+ readonly, runstop, update,
+ homehost, require_homehost,
+ verbose-quiet, force);
}
} else if (!scan)
- rv = Assemble(ss, devlist->devname, mdfd, &ident,
+ rv = Assemble(ss, devlist->devname, &ident,
devlist->next, backup_file,
- readonly, runstop, update, homehost, verbose-quiet, force);
+ readonly, runstop, update,
+ homehost, require_homehost,
+ verbose-quiet, force);
else if (devs_found>0) {
if (update && devs_found > 1) {
fprintf(stderr, Name ": can only update a single array at a time\n");
rv |= 1;
continue;
}
- mdfd = open_mddev(dv->devname,
- array_ident->autof ?array_ident->autof : autof);
- if (mdfd < 0) {
- rv |= 1;
- continue;
- }
- rv |= Assemble(ss, dv->devname, mdfd, array_ident,
+ if (array_ident->autof == 0)
+ array_ident->autof = autof;
+ rv |= Assemble(ss, dv->devname, array_ident,
NULL, backup_file,
- readonly, runstop, update, homehost, verbose-quiet, force);
- close(mdfd);
+ readonly, runstop, update,
+ homehost, require_homehost,
+ verbose-quiet, force);
}
} else {
mddev_ident_t array_list = conf_get_ident(NULL);
exit(1);
}
for (; array_list; array_list = array_list->next) {
- mdu_array_info_t array;
- mdfd = open_mddev(array_list->devname,
- array_list->autof ? array_list->autof : autof);
- if (mdfd < 0) {
- rv |= 1;
+ if (array_list->devname &&
+ strcasecmp(array_list->devname, "<ignore>") == 0)
continue;
- }
- if (ioctl(mdfd, GET_ARRAY_INFO, &array)>=0)
- /* already assembled, skip */
- cnt++;
- else {
- rv |= Assemble(ss, array_list->devname, mdfd,
- array_list,
- NULL, NULL,
- readonly, runstop, NULL, homehost, verbose-quiet, force);
- if (rv == 0) cnt++;
- }
- close(mdfd);
- }
- if (homehost) {
+ if (array_list->autof == 0)
+ array_list->autof = autof;
+
+ rv |= Assemble(ss, array_list->devname,
+ array_list,
+ NULL, NULL,
+ readonly, runstop, NULL,
+ homehost, require_homehost,
+ verbose-quiet, force);
+ cnt++;
+ }
+ if (homehost && cnt == 0) {
/* Maybe we can auto-assemble something.
- * Repeatedly call Assemble in auto-assmble mode
+ * Repeatedly call Assemble in auto-assemble mode
* until it fails
*/
int rv2;
mddev_dev_t devlist = conf_get_devs();
acnt = 0;
do {
- rv2 = Assemble(ss, NULL, -1,
+ rv2 = Assemble(ss, NULL,
&ident,
devlist, NULL,
- readonly, runstop, NULL, homehost, verbose-quiet, force);
+ readonly, runstop, NULL,
+ homehost, require_homehost,
+ verbose-quiet, force);
if (rv2==0) {
cnt++;
acnt++;
} while (rv2!=2);
/* Incase there are stacked devices, we need to go around again */
} while (acnt);
+#if 0
if (cnt == 0 && auto_update_home && homehost) {
/* Nothing found, maybe we need to bootstrap homehost info */
do {
acnt = 0;
do {
- rv2 = Assemble(ss, NULL, -1,
+ rv2 = Assemble(ss, NULL,
&ident,
NULL, NULL,
- readonly, runstop, "homehost", homehost, verbose-quiet, force);
+ readonly, runstop, "homehost",
+ homehost, require_homehost,
+ verbose-quiet, force);
if (rv2==0) {
cnt++;
acnt++;
/* Incase there are stacked devices, we need to go around again */
} while (acnt);
}
+#endif
if (cnt == 0 && rv == 0) {
fprintf(stderr, Name ": No arrays found in config file or automatically\n");
rv = 1;
break;
}
}
- rv = Build(devlist->devname, mdfd, chunk, level, layout,
+ rv = Build(devlist->devname, chunk, level, layout,
raiddisks, devlist->next, assume_clean,
- bitmap_file, bitmap_chunk, write_behind, delay,
- verbose-quiet, size);
+ bitmap_file, bitmap_chunk, write_behind,
+ delay, verbose-quiet, autof, size);
break;
case CREATE:
if (delay == 0) delay = DEFAULT_BITMAP_DELAY;
break;
}
- rv = Create(ss, devlist->devname, mdfd, chunk, level, layout, size<0 ? 0 : size,
+ rv = Create(ss, devlist->devname, chunk, level, layout, size<0 ? 0 : size,
raiddisks, sparedisks, ident.name, homehost,
ident.uuid_set ? ident.uuid : NULL,
devs_found-1, devlist->next, runstop, verbose-quiet, force, assume_clean,
- bitmap_file, bitmap_chunk, write_behind, delay);
+ bitmap_file, bitmap_chunk, write_behind, delay, autof);
break;
case MISC:
if (devmode == 'E') {
rv = Examine(devlist, scan?(verbose>1?0:verbose+1):brief,
export, scan,
SparcAdjust, ss, homehost);
+ } else if (devmode == DetailPlatform) {
+ rv = Detail_Platform(ss ? ss->ss : NULL, ss ? scan : 1, verbose);
} else {
if (devlist == NULL) {
- if (devmode=='D' && scan) {
- /* apply --detail to all devices in /proc/mdstat */
+ if ((devmode=='D' || devmode == Waitclean) && scan) {
+ /* apply --detail or --wait-clean to
+ * all devices in /proc/mdstat
+ */
struct mdstat_ent *ms = mdstat_read(0, 1);
struct mdstat_ent *e;
+ struct map_ent *map = NULL;
+ int v = verbose>1?0:verbose+1;
+
for (e=ms ; e ; e=e->next) {
- char *name = get_md_name(e->devnum);
+ char *name;
+ struct map_ent *me;
+ me = map_by_devnum(&map, e->devnum);
+ if (me && me->path
+ && strcmp(me->path, "/unknown") != 0)
+ name = me->path;
+ else
+ name = get_md_name(e->devnum);
if (!name) {
fprintf(stderr, Name ": cannot find device file for %s\n",
e->dev);
continue;
}
- rv |= Detail(name, verbose>1?0:verbose+1,
- export, test, homehost);
+ if (devmode == 'D')
+ rv |= Detail(name, v,
+ export, test,
+ homehost);
+ else
+ rv |= WaitClean(name, v);
put_md_name(name);
}
free_mdstat(ms);
export, test, homehost);
continue;
case 'K': /* Zero superblock */
- rv |= Kill(dv->devname, force, quiet); continue;
+ rv |= Kill(dv->devname, force, quiet,0);
+ continue;
case 'Q':
rv |= Query(dv->devname); continue;
case 'X':
rv |= ExamineBitmap(dv->devname, brief, ss); continue;
case 'W':
rv |= Wait(dv->devname); continue;
+ case Waitclean:
+ rv |= WaitClean(dv->devname, verbose-quiet); continue;
}
mdfd = open_mddev(dv->devname, 1);
if (mdfd>=0) {
rv = 1;
break;
}
+ if (delay == 0) {
+ if (get_linux_version() > 20616)
+ /* mdstat responds to poll */
+ delay = 1000;
+ else
+ delay = 60;
+ }
rv= Monitor(devlist, mailaddr, program,
delay?delay:60, daemonise, scan, oneshot,
dosyslog, test, pidfile);
break;
}
rv = Incremental(devlist->devname, verbose-quiet, runstop,
- ss, homehost, autof);
+ ss, homehost, require_homehost, autof);
break;
case AUTODETECT:
autodetect();
Alternatively, a
.B device
-line can contain the word
+line can contain either or both of the words
+.B containers
+and
.BR partitions .
-This will cause
+The word
+.B containers
+will cause
+.I mdadm
+to look for assembled CONTAINER arrays and include them as a source
+for assembling further arrays.
+
+The word
+.I partitions
+will cause
.I mdadm
to read
.I /proc/partitions
.I /dev
to find the name that matches the numbers.
-If no DEVICE line is present, then "DEVICE partitions" is assumed.
+If no DEVICE line is present, then "DEVICE partitions containers" is assumed.
For example:
.IP
.br
DEV /dev/sd*
.br
-DEVICE /dev/discs/disc*/disc
+DEVICE /dev/disk/by-path/pci*
.br
DEVICE partitions
.TP
.B ARRAY
The ARRAY lines identify actual arrays. The second word on the line
-should be the name of the device where the array is normally
+may be the name of the device where the array is normally
assembled, such as
-.BR /dev/md1 .
+.B /dev/md1
+or
+.BR /dev/md/backup .
+If the name does not start with a slash
+.RB (' / '),
+it is treated as being in
+.BR /dev/md/ .
+Alternately the word
+.B <ignore>
+(complete with angle brackets) can be given in which case any array
+which matches the rest of the line will never be automatically assembled.
+If no device name is given,
+.I mdadm
+will use various heuristics to determine an appropriate name.
+
Subsequent words identify the array, or identify the array as a member
of a group. If multiple identities are given,
then a component device must match ALL identities to be considered a
match. Each identity word has a tag, and equals sign, and some value.
The tags are:
-
.RS 4
.TP
.B uuid=
.TP
.B spares=
The value is a number of spare devices to expect the array to have.
+The sole use of this keyword and value is as follows:
.B mdadm \-\-monitor
will report an array if it is found to have fewer than this number of
spares when
.TP
.B auto=
-This option declares to
+This option is rarely needed with mdadm-3.0, particularly if used with
+the Linux kernel v2.6.28 or later.
+It tells
.I mdadm
-that it should try to create the device file of the array if it
-doesn't already exist, or exists but with the wrong device number.
+whether to use partitionable array or non-partitionable arrays and,
+in the absence of
+.IR udev ,
+how many partition devices to create. From 2.6.28 all md array
+devices are partitionable, hence this option is not needed.
The value of this option can be "yes" or "md" to indicate that a
traditional, non-partitionable md array should be created, or "mdp",
recognised for comparability with the output of
.BR "mdadm \-Es" .
+.TP
+.B container=
+Specify that this array is a member array of some container. The
+value given can be either a path name in /dev, or a UUID of the
+container array.
+
+.TP
+.B member=
+Specify that this array is a member array of some container. Each
+type of container has some way to enumerate member arrays, often a
+simple sequence number. The value identifies which member of a
+container the array is. It will usually accompany a "container=" word.
.RE
.TP
.B homehost
line gives a default value for the
.B --homehost=
-option to mdadm. There should be exactly one other word on the line.
-It should either exactly
+option to mdadm. There should normally be only one other word on the line.
+It should either be a host name, or one of the special words
.B <system>
-or a host name.
+and
+.BR <ignore> .
If
.B <system>
is given, then the
.BR gethostname ( 2 )
systemcall is used to get the host name.
+
+If
+.B <ignore>
+is given, then a flag is set so that when arrays are being
+auto-assembled the checking of the recorded
+.I homehost
+is disabled.
+If
+.B <ignore>
+is given it is also possible to give an explicit name which will be
+used when creating arrays. This is the only case when there can be
+more than one other word on the
+.B HOMEHOST
+line.
+
When arrays are created, this host name will be stored in the
-metadata. When arrays are assembled using auto-assembly, only arrays
-with this host name stored in the metadata will be considered.
+metadata. When arrays are assembled using auto-assembly, arrays which
+do not record the correct homehost name in their metadata will be
+assembled using a "foreign" name. A "foreign" name always ends with a
+digit string preceded by an underscore to differentiate it
+from any possible local name. e.g.
+.B /dev/md/1_1
+or
+.BR /dev/md/home_0 .
+.TP
+.B AUTO
+A list of names of metadata formats can be given, each preceded by a
+plus or minus sign. Also the word
+.I all
+preceded by plus or minus is allowed and is usually last.
+
+When
+.I mdadm
+is auto-assembling an array, either via
+.I --assemble
+or
+.I --incremental
+and it finds metadata of a given type, it checks that metadata type
+against those listed in this line. The first match wins, where
+.I all
+matches anything.
+If a match is found that was preceded by a plus sign, the auto
+assembly is allowed. If the match was preceded by a minus sign, the
+auto assembly is disallowed. If no match is found, the auto assembly
+is allowed.
+
+This can be used to disable all auto-assembly (so that only arrays
+explicitly listed in mdadm.conf or on the command line are assembled),
+or to disable assembly of certain metadata types which might be
+handled by other software.
+
+The known metadata types are
+.BR 0.90 ,
+.BR 1.x ,
+.BR ddf ,
+.BR imsm .
.SH EXAMPLE
DEVICE /dev/sd[bcdjkl]1
CREATE group=system mode=0640 auto=part\-8
.br
HOMEHOST <system>
+.br
+AUTO +1.x -all
.SH SEE ALSO
.BR mdadm (8),
/*
* mdadm - manage Linux "md" devices aka RAID arrays.
*
- * Copyright (C) 2001-2006 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
*
*
* This program is free software; you can redistribute it and/or modify
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
* Author: Neil Brown
- * Email: <neilb@cse.unsw.edu.au>
- * Paper: Neil Brown
- * School of Computer Science and Engineering
- * The University of New South Wales
- * Sydney, 2052
- * Australia
+ * Email: <neilb@suse.de>
*/
#define _GNU_SOURCE
#include "md_u.h"
#include "md_p.h"
#include "bitmap.h"
+#include "msg.h"
#include <endian.h>
/* Redhat don't like to #include <asm/byteorder.h>, and
#define __le16_to_cpu(_x) (_x)
#define __le32_to_cpu(_x) (_x)
#define __le64_to_cpu(_x) (_x)
+
+#define __cpu_to_be16(_x) bswap_16(_x)
+#define __cpu_to_be32(_x) bswap_32(_x)
+#define __cpu_to_be64(_x) bswap_64(_x)
+#define __be16_to_cpu(_x) bswap_16(_x)
+#define __be32_to_cpu(_x) bswap_32(_x)
+#define __be64_to_cpu(_x) bswap_64(_x)
#elif BYTE_ORDER == BIG_ENDIAN
#define __cpu_to_le16(_x) bswap_16(_x)
#define __cpu_to_le32(_x) bswap_32(_x)
#define __le16_to_cpu(_x) bswap_16(_x)
#define __le32_to_cpu(_x) bswap_32(_x)
#define __le64_to_cpu(_x) bswap_64(_x)
+
+#define __cpu_to_be16(_x) (_x)
+#define __cpu_to_be32(_x) (_x)
+#define __cpu_to_be64(_x) (_x)
+#define __be16_to_cpu(_x) (_x)
+#define __be32_to_cpu(_x) (_x)
+#define __be64_to_cpu(_x) (_x)
#else
# error "unknown endianness."
#endif
int uuid[4];
char name[33];
unsigned long long data_offset;
- unsigned long long component_size;
+ unsigned long long component_size; /* same as array.size, except in
+ * sectors and up to 64bits.
+ */
+ unsigned long long custom_array_size; /* size for non-default sized
+ * arrays (in sectors)
+ */
int reshape_active;
unsigned long long reshape_progress;
+ unsigned long long resync_start;
+ unsigned long safe_mode_delay; /* ms delay to mark clean */
int new_level, delta_disks, new_layout, new_chunk;
int errors;
int cache_size; /* size of raid456 stripe cache*/
int mismatch_cnt;
char text_version[50];
+ int container_member; /* for assembling external-metadata arrays
+ * This is to be used internally by metadata
+ * handler only */
+
char sys_name[20];
struct mdinfo *devs;
struct mdinfo *next;
+
+ /* Device info for mdmon: */
+ int state_fd;
+ #define DS_FAULTY 1
+ #define DS_INSYNC 2
+ #define DS_WRITE_MOSTLY 4
+ #define DS_SPARE 8
+ #define DS_BLOCKED 16
+ #define DS_REMOVE 1024
+ #define DS_UNBLOCK 2048
+ int prev_state, curr_state, next_state;
+
};
struct createinfo {
AutoHomeHost,
Symlinks,
AutoDetect,
+ Waitclean,
+ DetailPlatform,
};
/* structures read from config file */
char *bitmap_file;
int bitmap_fd;
+ char *container; /* /dev/whatever name of container, or
+ * uuid of container. You would expect
+ * this to be the 'devname' or UUID
+ * of some other entry.
+ */
+ char *member; /* subarray within a container */
+
struct mddev_ident_s *next;
} *mddev_ident_t;
char writemostly; /* 1 for 'set writemostly', 2 for 'clear writemostly' */
char re_add;
char used; /* set when used */
+ struct mdinfo *content; /* If devname is a container, this might list
+ * the remaining member arrays. */
struct mddev_dev_s *next;
} *mddev_dev_t;
char *pattern; /* U or up, _ for down */
int percent; /* -1 if no resync */
int resync; /* 1 if resync, 0 if recovery */
+ int devcnt;
+ int raid_disks;
+ int chunk_size;
+ char * metadata_version;
struct mdstat_ent *next;
};
extern struct mdstat_ent *mdstat_read(int hold, int start);
extern void free_mdstat(struct mdstat_ent *ms);
extern void mdstat_wait(int seconds);
+extern void mdstat_wait_fd(int fd, const sigset_t *sigmask);
extern int mddev_busy(int devnum);
struct map_ent {
struct map_ent *next;
int devnum;
- int major,minor;
+ char metadata[20];
int uuid[4];
+ int bad;
char *path;
};
-extern int map_update(struct map_ent **mpp, int devnum, int major, int minor,
+extern int map_update(struct map_ent **mpp, int devnum, char *metadata,
int uuid[4], char *path);
extern struct map_ent *map_by_uuid(struct map_ent **map, int uuid[4]);
+extern struct map_ent *map_by_devnum(struct map_ent **map, int devnum);
+extern struct map_ent *map_by_name(struct map_ent **map, char *name);
extern void map_read(struct map_ent **melp);
extern int map_write(struct map_ent *mel);
extern void map_delete(struct map_ent **mapp, int devnum);
extern void map_free(struct map_ent *map);
extern void map_add(struct map_ent **melp,
- int devnum, int major, int minor, int uuid[4], char *path);
+ int devnum, char *metadata, int uuid[4], char *path);
+extern int map_lock(struct map_ent **melp);
+extern void map_unlock(struct map_ent **melp);
/* various details can be requested */
-#define GET_LEVEL 1
-#define GET_LAYOUT 2
-#define GET_COMPONENT 4
-#define GET_CHUNK 8
-#define GET_CACHE 16
-#define GET_MISMATCH 32
-#define GET_VERSION 64
-
-#define GET_DEVS 1024 /* gets role, major, minor */
-#define GET_OFFSET 2048
-#define GET_SIZE 4096
-#define GET_STATE 8192
-#define GET_ERROR 16384
+enum sysfs_read_flags {
+ GET_LEVEL = (1 << 0),
+ GET_LAYOUT = (1 << 1),
+ GET_COMPONENT = (1 << 2),
+ GET_CHUNK = (1 << 3),
+ GET_CACHE = (1 << 4),
+ GET_MISMATCH = (1 << 5),
+ GET_VERSION = (1 << 6),
+ GET_DISKS = (1 << 7),
+ GET_DEGRADED = (1 << 8),
+ GET_SAFEMODE = (1 << 9),
+ GET_DEVS = (1 << 10), /* gets role, major, minor */
+ GET_OFFSET = (1 << 11),
+ GET_SIZE = (1 << 12),
+ GET_STATE = (1 << 13),
+ GET_ERROR = (1 << 14),
+ SKIP_GONE_DEVS = (1 << 15),
+};
/* If fd >= 0, get the array it is open on,
* else use devnum. >=0 -> major9. <0.....
*/
+extern int sysfs_open(int devnum, char *devname, char *attr);
+extern void sysfs_init(struct mdinfo *mdi, int fd, int devnum);
extern void sysfs_free(struct mdinfo *sra);
extern struct mdinfo *sysfs_read(int fd, int devnum, unsigned long options);
+extern int sysfs_attr_match(const char *attr, const char *str);
+extern int sysfs_match_word(const char *word, char **list);
extern int sysfs_set_str(struct mdinfo *sra, struct mdinfo *dev,
char *name, char *val);
extern int sysfs_set_num(struct mdinfo *sra, struct mdinfo *dev,
char *name, unsigned long long val);
+extern int sysfs_uevent(struct mdinfo *sra, char *event);
extern int sysfs_get_ll(struct mdinfo *sra, struct mdinfo *dev,
char *name, unsigned long long *val);
+extern int sysfs_get_str(struct mdinfo *sra, struct mdinfo *dev,
+ char *name, char *val, int size);
+extern int sysfs_set_safemode(struct mdinfo *sra, unsigned long ms);
+extern int sysfs_set_array(struct mdinfo *info, int vers);
+extern int sysfs_add_disk(struct mdinfo *sra, struct mdinfo *sd,
+ int in_sync);
+extern int sysfs_disk_to_scsi_id(int fd, __u32 *id);
+extern int sysfs_unique_holder(int devnum, long rdev);
+extern int load_sys(char *path, char *buf);
extern int save_stripes(int *source, unsigned long long *offsets,
extern char *map_num(mapping_t *map, int num);
extern int map_name(mapping_t *map, char *name);
-extern mapping_t r5layout[], pers[], modes[], faultylayout[];
+extern mapping_t r5layout[], r6layout[], pers[], modes[], faultylayout[];
extern char *map_dev(int major, int minor, int create);
+struct active_array;
+struct metadata_update;
+/* A superswitch provides entry points to a metadata handler.
+ *
+ * The super_switch primarily operates on some "metadata" that
+ * is accessed via the 'supertype'.
+ * This metadata has one of three possible sources.
+ * 1/ It is read from a single device. In this case it may not completely
+ * describe the array or arrays as some information might be on other
+ * devices.
+ * 2/ It is read from all devices in a container. In this case all
+ * information is present.
+ * 3/ It is created by ->init_super / ->add_to_super. In this case it will
+ * be complete once enough ->add_to_super calls have completed.
+ *
+ * When creating an array inside a container, the metadata will be
+ * formed by a combination of 2 and 3. The metadata or the array is read,
+ * then new information is added.
+ *
+ * The metadata must sometimes have a concept of a 'current' array
+ * and a 'current' device.
+ * The 'current' array is set by init_super to be the newly created array,
+ * or is set by super_by_fd when it finds it is looking at an array inside
+ * a container.
+ *
+ * The 'current' device is either the device that the metadata was read from
+ * in case 1, or the last device added by add_to_super in case 3.
+ * Case 2 does not identify a 'current' device.
+ */
extern struct superswitch {
+
+ /* Used to report details of metadata read from a component
+ * device. ->load_super has been called.
+ */
void (*examine_super)(struct supertype *st, char *homehost);
- void (*brief_examine_super)(struct supertype *st);
+ void (*brief_examine_super)(struct supertype *st, int verbose);
void (*export_examine_super)(struct supertype *st);
+
+ /* Used to report details of an active array.
+ * ->load_super was possibly given a 'component' string.
+ */
void (*detail_super)(struct supertype *st, char *homehost);
void (*brief_detail_super)(struct supertype *st);
void (*export_detail_super)(struct supertype *st);
+
+ /* Optional: platform hardware / firmware details */
+ int (*detail_platform)(int verbose, int enumerate_only);
+
+ /* Used:
+ * to get uuid for storing in bitmap metadata
+ * and 'reshape' backup-data metadata
+ * To see if a device is being re-added to an array it was part of.
+ */
void (*uuid_from_super)(struct supertype *st, int uuid[4]);
+
+ /* Extract generic details from metadata. This could be details about
+ * the container, or about an individual array within the container.
+ * The determination is made either by:
+ * load_super being given a 'component' string.
+ * validate_geometry determining what to create.
+ * The info includes both array information and device information.
+ * The particular device should be:
+ * The last device added by add_to_super
+ * The device the metadata was loaded from by load_super
+ */
void (*getinfo_super)(struct supertype *st, struct mdinfo *info);
+
+ /* Check if the given metadata is flagged as belonging to "this"
+ * host. 0 for 'no', 1 for 'yes', -1 for "Don't record homehost"
+ */
int (*match_home)(struct supertype *st, char *homehost);
+
+ /* Make one of several generic modifications to metadata
+ * prior to assembly (or other times).
+ * sparc2.2 - first bug in early 0.90 metadata
+ * super-minor - change name of 0.90 metadata
+ * summaries - 'correct' any redundant data
+ * resync - mark array as dirty to trigger a resync.
+ * uuid - set new uuid - only 0.90 or 1.x
+ * name - change the name of the array (where supported)
+ * homehost - change which host this array is tied to.
+ * devicesize - If metadata is at start of device, change recorded
+ * device size to match actual device size
+ * byteorder - swap bytes for 0.90 metadata
+ *
+ * force-one - mark that device as uptodate, not old or failed.
+ * force-array - mark array as clean if it would not otherwise
+ * assemble
+ * assemble - not sure how this is different from force-one...
+ * linear-grow-new - add a new device to a linear array, but don't
+ * change the size: so superblock still matches
+ * linear-grow-update - now change the size of the array.
+ */
int (*update_super)(struct supertype *st, struct mdinfo *info,
char *update,
char *devname, int verbose,
int uuid_set, char *homehost);
+
+ /* Create new metadata for new array as described. This could
+ * be a new container, or an array in a pre-existing container.
+ * Also used to zero metadata prior to writing it to invalidate old
+ * metadata.
+ */
int (*init_super)(struct supertype *st, mdu_array_info_t *info,
unsigned long long size, char *name,
char *homehost, int *uuid);
- void (*add_to_super)(struct supertype *st, mdu_disk_info_t *dinfo);
+
+ /* update the metadata to include new device, either at create or
+ * when hot-adding a spare.
+ */
+ int (*add_to_super)(struct supertype *st, mdu_disk_info_t *dinfo,
+ int fd, char *devname);
+
+ /* Write metadata to one device when fixing problems or adding
+ * a new device.
+ */
int (*store_super)(struct supertype *st, int fd);
- int (*write_init_super)(struct supertype *st, mdu_disk_info_t *dinfo,
- char *devname);
+
+ /* Write all metadata for this array.
+ */
+ int (*write_init_super)(struct supertype *st);
int (*compare_super)(struct supertype *st, struct supertype *tst);
int (*load_super)(struct supertype *st, int fd, char *devname);
struct supertype * (*match_metadata_desc)(char *arg);
void (*locate_bitmap)(struct supertype *st, int fd);
int (*write_bitmap)(struct supertype *st, int fd);
void (*free_super)(struct supertype *st);
- int major;
+
+ /* validate_geometry is called with an st returned by
+ * match_metadata_desc.
+ * It should check that the geometry described is compatible with
+ * the metadata type. It will be called repeatedly as devices are
+ * added to validate changing size and new devices. If there are
+ * inter-device dependencies, it should record sufficient details
+ * so these can be validated.
+ * Both 'size' and '*freesize' are in sectors. chunk is bytes.
+ */
+ int (*validate_geometry)(struct supertype *st, int level, int layout,
+ int raiddisks,
+ int chunk, unsigned long long size,
+ char *subdev, unsigned long long *freesize,
+ int verbose);
+
+ struct mdinfo *(*container_content)(struct supertype *st);
+ /* Allow a metadata handler to override mdadm's default layouts */
+ int (*default_layout)(int level); /* optional */
+
+/* for mdmon */
+ int (*open_new)(struct supertype *c, struct active_array *a,
+ char *inst);
+
+ /* Tell the metadata handler the current state of the array.
+ * This covers whether it is known to be consistent (no pending writes)
+ * and how far along a resync is known to have progressed
+ * (in a->resync_start).
+ * resync status is really irrelevant if the array is not consistent,
+ * but some metadata (DDF!) have a place to record the distinction.
+ * If 'consistent' is '2', then the array can mark it dirty if a
+ * resync/recovery/whatever is required, or leave it clean if not.
+ * Return value is 0 if dirty (not consistent) and 1 if clean.
+ * it is only really important if consistent is passed in as '2'.
+ */
+ int (*set_array_state)(struct active_array *a, int consistent);
+
+ /* When the state of a device might have changed, we call set_disk to
+ * tell the metadata what the current state is.
+ * Typically this happens on spare->in_sync and (spare|in_sync)->faulty
+ * transitions.
+ * set_disk might be called when the state of the particular disk has
+ * not in fact changed.
+ */
+ void (*set_disk)(struct active_array *a, int n, int state);
+ void (*sync_metadata)(struct supertype *st);
+ void (*process_update)(struct supertype *st,
+ struct metadata_update *update);
+ void (*prepare_update)(struct supertype *st,
+ struct metadata_update *update);
+
+ /* activate_spare will check if the array is degraded and, if it
+ * is, try to find some spare space in the container.
+ * On success, it adds appropriate updates (for process_update) to
+ * the 'updates' list and returns a list of 'mdinfo' identifying
+ * the device, or devices as there might be multiple missing
+ * devices and multiple spares available.
+ */
+ struct mdinfo *(*activate_spare)(struct active_array *a,
+ struct metadata_update **updates);
+
int swapuuid; /* true if uuid is bigending rather than hostendian */
-} super0, super1, *superlist[];
+ int external;
+ const char *name; /* canonical metadata name */
+} super0, super1, super_ddf, *superlist[];
+
+extern struct superswitch super_imsm;
+
+struct metadata_update {
+ int len;
+ char *buf;
+ void *space; /* allocated space that monitor will use */
+ struct metadata_update *next;
+};
+/* A supertype holds a particular collection of metadata.
+ * It identifies the metadata type by the superswitch, and the particular
+ * sub-version of that metadata type.
+ * metadata read in or created is stored in 'sb' and 'info'.
+ * There are also fields used by mdmon to track containers.
+ *
+ * A supertype may refer to:
+ * Just an array, possibly in a container
+ * A container, not identifying any particular array
+ * Info read from just one device, not yet fully describing the array/container.
+ *
+ *
+ * A supertype is created by:
+ * super_by_fd
+ * guess_super
+ * dup_super
+ */
struct supertype {
struct superswitch *ss;
int minor_version;
int max_devs;
+ int container_dev; /* devnum of container */
+ char subarray[32]; /* name of array inside container */
void *sb;
+ void *info;
+ int loaded_container; /* Set if load_super found a container,
+ * not just one device */
+
+ struct metadata_update *updates;
+ struct metadata_update **update_tail;
+
+ /* extra stuff used by mdmon */
+ struct active_array *arrays;
+ int sock; /* listen to external programs */
+ int devnum;
+ char *devname; /* e.g. md0. This appears in metadata_version:
+ * external:/md0/12
+ */
+ int devcnt;
+
+ struct mdinfo *devs;
+
};
extern struct supertype *super_by_fd(int fd);
extern int get_dev_size(int fd, char *dname, unsigned long long *sizep);
extern void get_one_disk(int mdfd, mdu_array_info_t *ainf,
mdu_disk_info_t *disk);
+void wait_for(char *dev, int fd);
#if __GNUC__ < 3
struct stat64;
int *fdlist, int cnt, char *backup_file);
-extern int Assemble(struct supertype *st, char *mddev, int mdfd,
+extern int Assemble(struct supertype *st, char *mddev,
mddev_ident_t ident,
mddev_dev_t devlist, char *backup_file,
int readonly, int runstop,
- char *update, char *homehost,
+ char *update, char *homehost, int require_homehost,
int verbose, int force);
-extern int Build(char *mddev, int mdfd, int chunk, int level, int layout,
- int raiddisks,
- mddev_dev_t devlist, int assume_clean,
+extern int Build(char *mddev, int chunk, int level, int layout,
+ int raiddisks, mddev_dev_t devlist, int assume_clean,
char *bitmap_file, int bitmap_chunk, int write_behind,
- int delay, int verbose, unsigned long long size);
+ int delay, int verbose, int autof, unsigned long long size);
-extern int Create(struct supertype *st, char *mddev, int mdfd,
+extern int Create(struct supertype *st, char *mddev,
int chunk, int level, int layout, unsigned long long size, int raiddisks, int sparedisks,
char *name, char *homehost, int *uuid,
int subdevs, mddev_dev_t devlist,
int runstop, int verbose, int force, int assume_clean,
- char *bitmap_file, int bitmap_chunk, int write_behind, int delay);
+ char *bitmap_file, int bitmap_chunk, int write_behind, int delay, int autof);
extern int Detail(char *dev, int brief, int export, int test, char *homehost);
+extern int Detail_Platform(struct superswitch *ss, int scan, int verbose);
extern int Query(char *dev);
extern int Examine(mddev_dev_t devlist, int brief, int export, int scan,
int SparcAdjust, struct supertype *forcest, char *homehost);
int period, int daemonise, int scan, int oneshot,
int dosyslog, int test, char *pidfile);
-extern int Kill(char *dev, int force, int quiet);
+extern int Kill(char *dev, int force, int quiet, int noexcl);
extern int Wait(char *dev);
+extern int WaitClean(char *dev, int verbose);
extern int Incremental(char *devname, int verbose, int runstop,
- struct supertype *st, char *homehost, int autof);
+ struct supertype *st, char *homehost, int require_homehost,
+ int autof);
+extern int Incremental_container(struct supertype *st, char *devname,
+ int verbose, int runstop, int autof,
+ int trustworthy);
extern void RebuildMap(void);
extern int IncrementalScan(int verbose);
extern int get_mdp_major(void);
extern int dev_open(char *dev, int flags);
+extern int open_dev(int devnum);
+extern int open_dev_excl(int devnum);
extern int is_standard(char *dev, int *nump);
+extern int same_dev(char *one, char *two);
extern int parse_auto(char *str, char *msg, int config);
extern mddev_ident_t conf_get_ident(char *dev);
extern mddev_dev_t conf_get_devs(void);
extern int conf_test_dev(char *devname);
+extern int conf_test_metadata(const char *version);
extern struct createinfo *conf_get_create_info(void);
extern void set_conffile(char *file);
extern char *conf_get_mailaddr(void);
extern char *conf_get_mailfrom(void);
extern char *conf_get_program(void);
-extern char *conf_get_homehost(void);
+extern char *conf_get_homehost(int *require_homehostp);
extern char *conf_line(FILE *file);
extern char *conf_word(FILE *file, int allow_key);
+extern int conf_name_is_free(char *name);
+extern int devname_matches(char *name, char *match);
+extern struct mddev_ident_s *conf_match(struct mdinfo *info, struct supertype *st);
+
extern void free_line(char *line);
extern int match_oneof(char *devices, char *devname);
extern void uuid_from_super(int uuid[4], mdp_super_t *super);
+extern const int uuid_match_any[4];
extern int same_uuid(int a[4], int b[4], int swapuuid);
extern void copy_uuid(void *a, int b[4], int swapuuid);
+extern char *fname_from_uuid(struct supertype *st,
+ struct mdinfo *info, char *buf, char sep);
extern unsigned long calc_csum(void *super, int bytes);
extern int enough(int level, int raid_disks, int layout, int clean,
char *avail, int avail_disks);
extern int ask(char *mesg);
extern unsigned long long get_component_size(int fd);
extern void remove_partitions(int fd);
-
+extern unsigned long long calc_array_size(int level, int raid_disks, int layout,
+ int chunksize, unsigned long long devsize);
+extern int flush_metadata_updates(struct supertype *st);
+extern void append_metadata_update(struct supertype *st, void *buf, int len);
+extern int assemble_container_content(struct supertype *st, int mdfd,
+ struct mdinfo *content, int runstop,
+ char *chosen_name, int verbose);
+
+extern int add_disk(int mdfd, struct supertype *st,
+ struct mdinfo *sra, struct mdinfo *info);
+extern int set_array_info(int mdfd, struct supertype *st, struct mdinfo *info);
extern char *human_size(long long bytes);
extern char *human_size_brief(long long bytes);
extern char DefaultConfFile[];
-extern int open_mddev(char *dev, int autof);
-extern int open_mddev_devnum(char *devname, int devnum, char *name,
- char *chosen_name, int parts);
+extern int create_mddev(char *dev, char *name, int autof, int trustworthy,
+ char *chosen);
+/* values for 'trustworthy' */
+#define LOCAL 1
+#define FOREIGN 2
+#define METADATA 3
+extern int open_mddev(char *dev, int report_errors);
+extern int open_container(int fd);
+
+extern int mdmon_running(int devnum);
+extern int signal_mdmon(int devnum);
+extern int check_env(char *name);
+extern int start_mdmon(int devnum);
+
+extern char *devnum2devname(int num);
+extern int devname2devnum(char *name);
+extern int stat2devnum(struct stat *st);
+extern int fd2devnum(int fd);
+
+static inline int dev2major(int d)
+{
+ if (d >= 0)
+ return MD_MAJOR;
+ else
+ return get_mdp_major();
+}
+
+static inline int dev2minor(int d)
+{
+ if (d >= 0)
+ return d;
+ return (-1-d) << MdpMinorShift;
+}
+
+static inline int ROUND_UP(int a, int base)
+{
+ return ((a+base-1)/base)*base;
+}
+
+static inline int is_subarray(char *vers)
+{
+ /* The version string for a 'subarray' (an array in a container)
+ * is
+ * /containername/componentname for normal read-write arrays
+ * -containername/componentname for read-only arrays.
+ * containername is e.g. md0, md_d1
+ * componentname is dependent on the metadata. e.g. '1' 'S1' ...
+ */
+ return (*vers == '/' || *vers == '-');
+}
+#ifdef DEBUG
+#define dprintf(fmt, arg...) \
+ fprintf(stderr, fmt, ##arg)
+#else
+#define dprintf(fmt, arg...) \
+ ({ if (0) fprintf(stderr, fmt, ##arg); 0; })
+#endif
#include <assert.h>
#include <stdarg.h>
static inline int xasprintf(char **strp, const char *fmt, ...) {
#define LEVEL_LINEAR (-1)
#define LEVEL_FAULTY (-5)
+/* kernel module doesn't know about these */
+#define LEVEL_CONTAINER (-100)
+#define LEVEL_UNSUPPORTED (-200)
+
/* faulty stuff */
#define makedev(M,m) (((M)<<8) | (m))
#endif
-/* for raid5 */
+/* for raid4/5/6 */
#define ALGORITHM_LEFT_ASYMMETRIC 0
#define ALGORITHM_RIGHT_ASYMMETRIC 1
#define ALGORITHM_LEFT_SYMMETRIC 2
#define ALGORITHM_RIGHT_SYMMETRIC 3
+
+/* Define non-rotating (raid4) algorithms. These allow
+ * conversion of raid4 to raid5.
+ */
+#define ALGORITHM_PARITY_0 4 /* P or P,Q are initial devices */
+#define ALGORITHM_PARITY_N 5 /* P or P,Q are final devices. */
+
+/* DDF RAID6 layouts differ from md/raid6 layouts in two ways.
+ * Firstly, the exact positioning of the parity block is slightly
+ * different between the 'LEFT_*' modes of md and the "_N_*" modes
+ * of DDF.
+ * Secondly, the order of datablocks over which the Q syndrome is computed
+ * is different.
+ * Consequently we have different layouts for DDF/raid6 than md/raid6.
+ * These layouts are from the DDFv1.2 spec.
+ * Interestingly DDFv1.2-Errata-A does not specify N_CONTINUE but
+ * leaves RLQ=3 as 'Vendor Specific'
+ */
+
+#define ALGORITHM_ROTATING_ZERO_RESTART 8 /* DDF PRL=6 RLQ=1 */
+#define ALGORITHM_ROTATING_N_RESTART 9 /* DDF PRL=6 RLQ=2 */
+#define ALGORITHM_ROTATING_N_CONTINUE 10 /*DDF PRL=6 RLQ=3 */
+
+
+/* For every RAID5 algorithm we define a RAID6 algorithm
+ * with exactly the same layout for data and parity, and
+ * with the Q block always on the last device (N-1).
+ * This allows trivial conversion from RAID5 to RAID6
+ */
+#define ALGORITHM_LEFT_ASYMMETRIC_6 16
+#define ALGORITHM_RIGHT_ASYMMETRIC_6 17
+#define ALGORITHM_LEFT_SYMMETRIC_6 18
+#define ALGORITHM_RIGHT_SYMMETRIC_6 19
+#define ALGORITHM_PARITY_0_6 20
+#define ALGORITHM_PARITY_N_6 ALGORITHM_PARITY_N
+
Summary: mdadm is used for controlling Linux md devices (aka RAID arrays)
Name: mdadm
-Version: 2.6.9
+Version: 3.0_rc1
Release: 1
Source: http://www.kernel.org/pub/linux/utils/raid/mdadm/mdadm-%{version}.tgz
URL: http://neil.brown.name/blog/mdadm
.\" -*- nroff -*-
-.TH MDASSEMBLE 8 "" v2.6.9
+.TH MDASSEMBLE 8 "" v3.0-rc1
.SH NAME
mdassemble \- assemble MD devices
.I aka
/*
* mdassemble - assemble Linux "md" devices aka RAID arrays.
*
- * Copyright (C) 2001-2006 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
* Copyright (C) 2003 Luca Berra <bluca@vodka.it>
*
*
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
* Author: Neil Brown
- * Email: <neilb@cse.unsw.edu.au>
- * Paper: Neil Brown
- * School of Computer Science and Engineering
- * The University of New South Wales
- * Sydney, 2052
- * Australia
+ * Email: <neilb@suse.de>
*/
#include "mdadm.h"
#ifndef MDASSEMBLE_AUTO
/* from mdopen.c */
-int open_mddev(char *dev, int autof/*unused */)
+int open_mddev(char *dev, int report_errors/*unused*/)
{
int mdfd = open(dev, O_RDWR);
if (mdfd < 0)
}
return mdfd;
}
+int create_mddev(char *dev, char *name, int autof/*unused*/, int trustworthy,
+ char *chosen)
+{
+ return open_mddev(dev, 0);
+}
#endif
+int map_update(struct map_ent **mpp, int devnum, char *metadata,
+ int *uuid, char *path)
+{
+ return 0;
+}
+struct map_ent *map_by_name(struct map_ent **mpp, char *name)
+{
+ return NULL;
+}
int rv;
int mdfd = -1;
} else
for (; array_list; array_list = array_list->next) {
mdu_array_info_t array;
- mdfd = open_mddev(array_list->devname, array_list->autof);
- if (mdfd < 0) {
- rv |= 1;
+ if (strcasecmp(array_list->devname, "<ignore>") == 0)
continue;
- }
- if (ioctl(mdfd, GET_ARRAY_INFO, &array) < 0) {
- rv |= Assemble(array_list->st, array_list->devname, mdfd,
- array_list, NULL, NULL,
- readonly, runstop, NULL, NULL, verbose, force);
- } else {
+ mdfd = open_mddev(array_list->devname, 0);
+ if (mdfd >= 0 && ioctl(mdfd, GET_ARRAY_INFO, &array) == 0) {
rv |= Manage_ro(array_list->devname, mdfd, -1); /* make it readwrite */
+ continue;
}
- close(mdfd);
+ if (mdfd >= 0)
+ close(mdfd);
+ rv |= Assemble(array_list->st, array_list->devname,
+ array_list, NULL, NULL,
+ readonly, runstop, NULL, NULL, 0,
+ verbose, force);
}
return rv;
}
--- /dev/null
+.\" See file COPYING in distribution for details.
+.TH MDMON 8 "" v3.0-rc1
+.SH NAME
+mdmon \- monitor MD external metadata arrays
+
+.SH SYNOPSIS
+
+.BI mdmon " CONTAINER [NEWROOT]"
+
+.SH OVERVIEW
+The 2.6.27 kernel brings the ability to support external metadata arrays.
+External metadata implies that user space handles all updates to the metadata.
+The kernel's responsibility is to notify user space when a "metadata event"
+occurs, like disk failures and clean-to-dirty transitions. The kernel, in
+important cases, waits for user space to take action on these notifications.
+
+.SH DESCRIPTION
+.SS Metadata updates:
+To service metadata update requests a daemon,
+.IR mdmon ,
+is introduced.
+.I Mdmon
+is tasked with polling the sysfs namespace looking for changes in
+.BR array_state ,
+.BR sync_action ,
+and per disk
+.BR state
+attributes. When a change is detected it calls a per metadata type
+handler to make modifications to the metadata. The following actions
+are taken:
+.RS
+.TP
+.B array_state \- inactive
+Clear the dirty bit for the volume and let the array be stopped
+.TP
+.B array_state \- write pending
+Set the dirty bit for the array and then set
+.B array_state
+to
+.BR active .
+Writes
+are blocked until userspace writes
+.BR active .
+.TP
+.B array_state \- active-idle
+The safe mode timer has expired so set array state to clean to block writes to the array
+.TP
+.B array_state \- clean
+Clear the dirty bit for the volume
+.TP
+.B array_state \- read-only
+This is the initial state that all arrays start at.
+.I mdmon
+takes one of the three actions:
+.RS
+.TP
+1/
+Transition the array to read-auto keeping the dirty bit clear if the metadata
+handler determines that the array does not need resyncing or other modification
+.TP
+2/
+Transition the array to active if the metadata handler determines a resync or
+some other manipulation is necessary
+.TP
+3/
+Leave the array read\-only if the volume is marked to not be monitored; for
+example, the metadata version has been set to "external:\-dev/md127" instead of
+"external:/dev/md127"
+.RE
+.TP
+.B sync_action \- resync\-to\-idle
+Notify the metadata handler that a resync may have completed. If a resync
+process is idled before it completes this event allows the metadata handler to
+checkpoint resync.
+.TP
+.B sync_action \- recover\-to\-idle
+A spare may have completed rebuilding so tell the metadata handler about the
+state of each disk. This is the metadata handler's opportunity to clear
+any "out-of-sync" bits and clear the volume's degraded status. If a recovery
+process is idled before it completes this event allows the metadata handler to
+checkpoint recovery.
+.TP
+.B <disk>/state \- faulty
+A disk failure kicks off a series of events. First, notify the metadata
+handler that a disk has failed, and then notify the kernel that it can unblock
+writes that were dependent on this disk. After unblocking the kernel this disk
+is set to be removed+ from the member array. Finally the disk is marked failed
+in all other member arrays in the container.
+.IP
++ Note This behavior differs slightly from native MD arrays where
+removal is reserved for a
+.B mdadm --remove
+event. In the external metadata case the container holds the final
+reference on a block device and a
+.B mdadm --remove <container> <victim>
+call is still required.
+.RE
+
+.SS Containers:
+.P
+External metadata formats, like DDF, differ from the native MD metadata
+formats in that they define a set of disks and a series of sub-arrays
+within those disks. MD metadata in comparison defines a 1:1
+relationship between a set of block devices and a raid array. For
+example to create 2 arrays at different raid levels on a single
+set of disks, MD metadata requires the disks be partitioned and then
+each array can then be created with a subset of those partitions. The
+supported external formats perform this disk carving internally.
+.P
+Container devices simply hold references to all member disks and allow
+tools like
+.I mdmon
+to determine which active arrays belong to which
+container. Some array management commands like disk removal and disk
+add are now only valid at the container level. Attempts to perform
+these actions on member arrays are blocked with error messages like:
+.IP
+"mdadm: Cannot remove disks from a \'member\' array, perform this
+operation on the parent container"
+.P
+Containers are identified in /proc/mdstat with a metadata version string
+"external:<metadata name>". Member devices are identified by
+"external:/<container device>/<member index>", or "external:-<container
+device>/<member index>" if the array is to remain readonly.
+
+.SH OPTIONS
+.TP
+CONTAINER
+The
+.B container
+device to monitor. It can be a full path like /dev/md/container, a simple md
+device name like md127, or /proc/mdstat which tells
+.I mdmon
+to scan for containers and launch an
+.I mdmon
+instance for each one found.
+.TP
+[NEWROOT]
+In order to support an external metadata raid array as the rootfs
+.I mdmon
+needs to be started in the initramfs environment. Once the initramfs
+environment mounts the final rootfs
+.I mdmon
+needs to be restarted in the new namespace. When NEWROOT is specified
+.I mdmon
+will terminate any
+.I mdmon
+instances that are running in the current namespace,
+.IR chroot (2)
+to NEWROOT, and continue monitoring the container.
+.PP
+Note that
+.I mdmon
+is automatically started by
+.I mdadm
+when needed and so does not need to be considered when working with
+RAID arrays. The only times it is run other than by
+.I mdadm
+is when the boot scripts need to restart it after mounting the new
+root filesystem.
+
+.SH SEE ALSO
+.IR mdadm (8),
+.IR md (4).
--- /dev/null
+/*
+ * mdmon - monitor external metadata arrays
+ *
+ * Copyright (C) 2007-2009 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2007-2009 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+/*
+ * md array manager.
+ * When md arrays have user-space managed metadata, this is the program
+ * that does the managing.
+ *
+ * Given one argument: the name of the array (e.g. /dev/md0) that is
+ * the container.
+ * We fork off a helper that runs high priority and mlocked. It responds to
+ * device failures and other events that might stop writeout, or that are
+ * trivial to deal with.
+ * The main thread then watches for new arrays being created in the container
+ * and starts monitoring them too ... along with a few other tasks.
+ *
+ * The main thread communicates with the priority thread by writing over
+ * a pipe.
+ * Separate programs can communicate with the main thread via Unix-domain
+ * socket.
+ * The two threads share address space and open file table.
+ *
+ */
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <sys/wait.h>
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+#include <fcntl.h>
+#include <signal.h>
+#include <dirent.h>
+
+#include <sched.h>
+
+#include "mdadm.h"
+#include "mdmon.h"
+
+struct active_array *discard_this;
+struct active_array *pending_discard;
+
+int mon_tid, mgr_tid;
+
+int sigterm;
+
+/* Entry point handed to clone()/__clone2() by clone_monitor().
+ * 'v' is the container's struct supertype; do_monitor() is run on it and
+ * 0 is returned once do_monitor() comes back.
+ */
+int run_child(void *v)
+{
+	do_monitor((struct supertype *)v);
+	return 0;
+}
+
+#ifdef __ia64__
+int __clone2(int (*fn)(void *),
+ void *child_stack_base, size_t stack_size,
+ int flags, void *arg, ...
+ /* pid_t *pid, struct user_desc *tls, pid_t *ctid */ );
+#endif
+int clone_monitor(struct supertype *container)
+{
+	/* Stack for the cloned monitor; static, so it outlives this call. */
+	static char stack[4096];
+	/* The monitor shares fs info, open files, address space and signal
+	 * handlers with the caller and lives in the same thread group.
+	 */
+	const int clone_flags =
+		CLONE_FS|CLONE_FILES|CLONE_VM|CLONE_SIGHAND|CLONE_THREAD;
+
+#ifdef __ia64__
+	/* __clone2() takes the stack base and its size */
+	mon_tid = __clone2(run_child, stack, sizeof(stack),
+			   clone_flags, container);
+#else
+	/* clone() takes the top of the stack; start 64 bytes below the end */
+	mon_tid = clone(run_child, stack + sizeof(stack) - 64,
+			clone_flags, container);
+#endif
+
+	/* remember the manager's (i.e. our own) thread id as well */
+	mgr_tid = syscall(SYS_gettid);
+
+	return mon_tid;
+}
+
+/* Map an external-metadata name ("ddf" or "imsm") to its superswitch.
+ * Returns NULL when the name is not one of the two handled here.
+ */
+static struct superswitch *find_metadata_methods(char *vers)
+{
+	struct superswitch *methods = NULL;
+
+	if (!strcmp(vers, "ddf"))
+		methods = &super_ddf;
+	else if (!strcmp(vers, "imsm"))
+		methods = &super_imsm;
+
+	return methods;
+}
+
+
+/* Create /var/run/mdadm/<devname>.pid containing our pid.
+ *
+ * 'o_excl' is OR-ed into the open(2) flags; pass O_EXCL to fail when the
+ * file already exists, or 0 to reuse an existing file.
+ *
+ * Returns 0 on success, -1 if SIGTERM has already been seen, and a
+ * negative errno value (-ENAMETOOLONG if the path does not fit) when the
+ * file cannot be created or written.
+ */
+int make_pidfile(char *devname, int o_excl)
+{
+	char path[100];
+	char pid[32];
+	int fd;
+	int n;
+	int err;
+
+	if (sigterm)
+		return -1;
+
+	n = snprintf(path, sizeof(path), "/var/run/mdadm/%s.pid", devname);
+	if (n < 0 || (size_t)n >= sizeof(path))
+		return -ENAMETOOLONG;
+
+	fd = open(path, O_RDWR|O_CREAT|o_excl, 0600);
+	if (fd < 0)
+		return -errno;
+	snprintf(pid, sizeof(pid), "%d\n", (int)getpid());
+	n = write(fd, pid, strlen(pid));
+	/* capture errno now: close() is allowed to overwrite it */
+	err = errno;
+	close(fd);
+	if (n < 0)
+		return -err;
+	return 0;
+}
+
+/* Decide whether the array described by 'mdstat' is a member of the
+ * container called 'container'.
+ *
+ * The metadata version must start with "external:", the text after that
+ * prefix must satisfy is_subarray(), and the container name must follow
+ * (one character further on) and be terminated by '/'.
+ * Returns 1 for a member, 0 otherwise.
+ */
+int is_container_member(struct mdstat_ent *mdstat, char *container)
+{
+	char *ver = mdstat->metadata_version;
+	size_t clen;
+
+	if (ver == NULL)
+		return 0;
+	if (strncmp(ver, "external:", 9) != 0)
+		return 0;
+	if (!is_subarray(ver + 9))
+		return 0;
+
+	clen = strlen(container);
+	if (strncmp(ver + 10, container, clen) != 0)
+		return 0;
+
+	return ver[10 + clen] == '/';
+}
+
+void remove_pidfile(char *devname);
+/* Terminate the mdmon instance recorded in /var/run/mdadm/<devname>.pid.
+ *
+ * The pid is read from the pid file; nothing happens if the file is
+ * unreadable, if the pid is not positive, if it is our own pid, or if
+ * /proc/<pid>/cmdline does not mention "mdmon".  Otherwise the process
+ * is sent SIGTERM, WaitClean() is called on every member array of the
+ * container, and finally the pid file and socket are removed.
+ */
+static void try_kill_monitor(char *devname)
+{
+	char buf[100];
+	int fd;
+	int n;
+	pid_t pid;
+	struct mdstat_ent *mdstat, *ent;
+
+	snprintf(buf, sizeof(buf), "/var/run/mdadm/%s.pid", devname);
+	fd = open(buf, O_RDONLY);
+	if (fd < 0)
+		return;
+
+	/* leave room for the terminator that strtoul() relies on */
+	n = read(fd, buf, sizeof(buf) - 1);
+	close(fd);
+	if (n < 0)
+		return;
+	buf[n] = 0;
+	pid = strtoul(buf, NULL, 10);
+
+	/* kill() gives 0 and negative pids a group/broadcast meaning */
+	if (pid <= 0)
+		return;
+
+	/* first rule of survival... don't off yourself */
+	if (pid == getpid())
+		return;
+
+	/* kill this process if it is mdmon */
+	snprintf(buf, sizeof(buf), "/proc/%lu/cmdline", (unsigned long) pid);
+	fd = open(buf, O_RDONLY);
+	if (fd < 0)
+		return;
+
+	n = read(fd, buf, sizeof(buf) - 1);
+	/* close on every path: this descriptor used to be leaked */
+	close(fd);
+	if (n < 0)
+		return;
+	buf[n] = 0;
+
+	if (!strstr(buf, "mdmon"))
+		return;
+
+	kill(pid, SIGTERM);
+
+	/* walk with a cursor so the list head is still there to be freed */
+	mdstat = mdstat_read(0, 0);
+	for (ent = mdstat; ent; ent = ent->next)
+		if (is_container_member(ent, devname)) {
+			snprintf(buf, sizeof(buf), "/dev/%s", ent->dev);
+			WaitClean(buf, 0);
+		}
+	free_mdstat(mdstat);
+	remove_pidfile(devname);
+}
+
+/* Unlink /var/run/mdadm/<devname>.pid and /var/run/mdadm/<devname>.sock.
+ * Does nothing once SIGTERM has been seen (sigterm set).
+ */
+void remove_pidfile(char *devname)
+{
+	char pidpath[100];
+	char sockpath[100];
+
+	if (!sigterm) {
+		sprintf(pidpath, "/var/run/mdadm/%s.pid", devname);
+		unlink(pidpath);
+
+		sprintf(sockpath, "/var/run/mdadm/%s.sock", devname);
+		unlink(sockpath);
+	}
+}
+
+/* Create the non-blocking, listening Unix-domain control socket
+ * /var/run/mdadm/<devname>.sock, removing any stale socket first.
+ *
+ * Returns the socket descriptor, or -1 if SIGTERM has already been seen,
+ * the path does not fit, or socket()/bind()/listen() fails.
+ */
+int make_control_sock(char *devname)
+{
+	char path[100];
+	int sfd;
+	int n;
+	long fl;
+	struct sockaddr_un addr;
+
+	if (sigterm)
+		return -1;
+
+	n = snprintf(path, sizeof(path), "/var/run/mdadm/%s.sock", devname);
+	if (n < 0 || (size_t)n >= sizeof(path))
+		return -1;
+	unlink(path);
+	sfd = socket(PF_LOCAL, SOCK_STREAM, 0);
+	if (sfd < 0)
+		return -1;
+
+	/* zero the whole address so no stack garbage is handed to bind() */
+	memset(&addr, 0, sizeof(addr));
+	addr.sun_family = PF_LOCAL;
+	snprintf(addr.sun_path, sizeof(addr.sun_path), "%s", path);
+	if (bind(sfd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
+	    listen(sfd, 10) < 0) {
+		close(sfd);
+		return -1;
+	}
+	/* only OR in O_NONBLOCK when the current flags were really read */
+	fl = fcntl(sfd, F_GETFL, 0);
+	if (fl >= 0)
+		fcntl(sfd, F_SETFL, fl | O_NONBLOCK);
+	return sfd;
+}
+
+/* Flag raised by hup() below; exported through mdmon.h.
+ * NOTE(review): presumably polled by the manager to re-create the control
+ * socket - confirm in the code that reads it.
+ */
+int socket_hup_requested;
+/* Signal handler (installed for SIGHUP in mdmon()): only records that the
+ * signal arrived by setting socket_hup_requested.
+ */
+static void hup(int sig)
+{
+ socket_hup_requested = 1;
+}
+
+/* Signal handler (installed for SIGTERM in mdmon()): sets the global
+ * 'sigterm' flag, which make_pidfile(), remove_pidfile() and
+ * make_control_sock() test before touching /var/run/mdadm.
+ */
+static void term(int sig)
+{
+ sigterm = 1;
+}
+
+/* Deliberately empty signal handler (installed for SIGUSR1 and SIGALRM in
+ * mdmon()): it takes no action of its own when the signal is delivered.
+ */
+static void wake_me(int sig)
+{
+
+}
+
+/* if we are debugging and starting mdmon by hand then don't fork */
+static int do_fork(void)
+{
+	/* forking is the default; only a DEBUG build can turn it off */
+	int fork_wanted = 1;
+
+#ifdef DEBUG
+	if (check_env("MDADM_NO_MDMON"))
+		fork_wanted = 0;
+#endif
+
+	return fork_wanted;
+}
+
+/* Print the one-line synopsis on stderr and exit with status 2. */
+void usage(void)
+{
+	fputs("Usage: mdmon /device/name/for/container [target_dir]\n",
+	      stderr);
+	exit(2);
+}
+
+int mdmon(char *devname, int devnum, int scan, char *switchroot);
+
+/* mdmon entry point.
+ *
+ * argv[1] names the container: "/proc/mdstat" (start one monitor per
+ * container found there), an "md..." device name, or a path to a device
+ * node.  The optional argv[2] is a new root directory passed on to
+ * mdmon().  Any other argument count prints the usage text and exits.
+ *
+ * Returns the (OR-ed, in scan mode) status of the mdmon() calls, or exits
+ * with 1 when the argument does not resolve to an md device name.
+ */
+int main(int argc, char *argv[])
+{
+	char *container_name = NULL;
+	char *switchroot = NULL;
+	int devnum;
+	char *devname;
+	int scan = 0;
+	int status = 0;
+
+	switch (argc) {
+	case 3:
+		switchroot = argv[2];
+		/* fall through */
+	case 2:
+		container_name = argv[1];
+		break;
+	default:
+		usage();
+	}
+
+	if (strcmp(container_name, "/proc/mdstat") == 0) {
+		struct mdstat_ent *mdstat, *e;
+
+		/* launch an mdmon instance for each container found */
+		scan = 1;
+		mdstat = mdstat_read(0, 0);
+		for (e = mdstat; e; e = e->next) {
+			/* metadata_version can be NULL (is_container_member()
+			 * checks for that too), so test it before comparing
+			 */
+			if (e->metadata_version == NULL ||
+			    strncmp(e->metadata_version, "external:", 9) != 0 ||
+			    is_subarray(&e->metadata_version[9]))
+				continue;
+
+			devname = devnum2devname(e->devnum);
+			if (!devname) {
+				fprintf(stderr,
+					"mdmon: failed to allocate container name string\n");
+				status |= 1;
+				continue;
+			}
+			/* update cmdline so this mdmon instance can be
+			 * distinguished from others in a call to ps(1)
+			 */
+			if (strlen(devname) <= strlen(container_name)) {
+				memset(container_name, 0, strlen(container_name));
+				sprintf(container_name, "%s", devname);
+			}
+			status |= mdmon(devname, e->devnum, scan,
+					switchroot);
+		}
+		free_mdstat(mdstat);
+
+		return status;
+	} else if (strncmp(container_name, "md", 2) == 0) {
+		devnum = devname2devnum(container_name);
+		devname = devnum2devname(devnum);
+		/* only accept the name if it round-trips unchanged */
+		if (devname && strcmp(container_name, devname) != 0)
+			devname = NULL;
+	} else {
+		struct stat st;
+
+		devnum = NoMdDev;
+		if (stat(container_name, &st) == 0)
+			devnum = stat2devnum(&st);
+		if (devnum == NoMdDev)
+			devname = NULL;
+		else
+			devname = devnum2devname(devnum);
+	}
+
+	if (!devname) {
+		fprintf(stderr, "mdmon: %s is not a valid md device name\n",
+			container_name);
+		exit(1);
+	}
+	return mdmon(devname, devnum, scan, switchroot);
+}
+
+/* Start monitoring one container.
+ *
+ * devname/devnum identify the container's md device.  When 'scan' is set,
+ * or do_fork() says so, the work is done in a forked child and the parent
+ * returns the status the child reports over a pipe (or the child's exit
+ * status if it dies before reporting).  'switchroot', when not NULL, is a
+ * directory to chroot() into after any monitor found in the current
+ * namespace has been told to terminate.
+ *
+ * The process that does the monitoring never returns from here: it exits
+ * with a non-zero status on a setup failure, or with 0 once do_manager()
+ * returns.  A return value of 1 means setup failed before forking.
+ */
+int mdmon(char *devname, int devnum, int scan, char *switchroot)
+{
+	int mdfd;
+	struct mdinfo *mdi, *di;
+	struct supertype *container;
+	sigset_t set;
+	struct sigaction act;
+	int pfd[2];
+	int status;
+	int ignore;
+
+	dprintf("starting mdmon for %s in %s\n",
+		devname, switchroot ? : "/");
+	mdfd = open_dev(devnum);
+	if (mdfd < 0) {
+		fprintf(stderr, "mdmon: %s: %s\n", devname,
+			strerror(errno));
+		return 1;
+	}
+	if (md_get_version(mdfd) < 0) {
+		fprintf(stderr, "mdmon: %s: Not an md device\n",
+			devname);
+		close(mdfd);
+		return 1;
+	}
+
+	/* Fork, and have the child tell us when they are ready */
+	if (do_fork() || scan) {
+		if (pipe(pfd) != 0) {
+			fprintf(stderr, "mdmon: failed to create pipe\n");
+			close(mdfd);
+			return 1;
+		}
+		switch(fork()) {
+		case -1:
+			fprintf(stderr, "mdmon: failed to fork: %s\n",
+				strerror(errno));
+			close(pfd[0]);
+			close(pfd[1]);
+			close(mdfd);
+			return 1;
+		case 0: /* child */
+			close(pfd[0]);
+			break;
+		default: /* parent */
+			close(pfd[1]);
+			if (read(pfd[0], &status, sizeof(status)) != sizeof(status)) {
+				wait(&status);
+				status = WEXITSTATUS(status);
+			}
+			/* in scan mode the parent goes on to start further
+			 * monitors, so do not leave descriptors behind
+			 */
+			close(pfd[0]);
+			close(mdfd);
+			return status;
+		}
+	} else
+		pfd[0] = pfd[1] = -1;
+
+	container = malloc(sizeof(*container));
+	if (!container) {
+		fprintf(stderr, "mdmon: failed to allocate container\n");
+		exit(3);
+	}
+	container->devnum = devnum;
+	container->devname = devname;
+	container->arrays = NULL;
+	container->subarray[0] = 0;
+
+	if (!container->devname) {
+		fprintf(stderr, "mdmon: failed to allocate container name string\n");
+		exit(3);
+	}
+
+	mdi = sysfs_read(mdfd, container->devnum,
+			 GET_VERSION|GET_LEVEL|GET_DEVS|SKIP_GONE_DEVS);
+
+	if (!mdi) {
+		fprintf(stderr, "mdmon: failed to load sysfs info for %s\n",
+			container->devname);
+		exit(3);
+	}
+	if (mdi->array.level != UnSet) {
+		fprintf(stderr, "mdmon: %s is not a container - cannot monitor\n",
+			devname);
+		exit(3);
+	}
+	if (mdi->array.major_version != -1 ||
+	    mdi->array.minor_version != -2) {
+		fprintf(stderr, "mdmon: %s does not use external metadata - cannot monitor\n",
+			devname);
+		exit(3);
+	}
+
+	container->ss = find_metadata_methods(mdi->text_version);
+	if (container->ss == NULL) {
+		fprintf(stderr, "mdmon: %s uses unknown metadata: %s\n",
+			devname, mdi->text_version);
+		exit(3);
+	}
+
+	/* keep a private copy of the device list; mdi is freed below */
+	container->devs = NULL;
+	for (di = mdi->devs; di; di = di->next) {
+		struct mdinfo *cd = malloc(sizeof(*cd));
+
+		if (!cd) {
+			fprintf(stderr, "mdmon: failed to allocate device info for %s\n",
+				devname);
+			exit(3);
+		}
+		*cd = *di;
+		cd->next = container->devs;
+		container->devs = cd;
+	}
+	sysfs_free(mdi);
+
+	/* SIGUSR is sent between parent and child.  So both block it
+	 * and enable it only with pselect.
+	 */
+	sigemptyset(&set);
+	sigaddset(&set, SIGUSR1);
+	sigaddset(&set, SIGHUP);
+	sigaddset(&set, SIGALRM);
+	sigaddset(&set, SIGTERM);
+	sigprocmask(SIG_BLOCK, &set, NULL);
+	/* 'act' lives on the stack: give sa_mask a defined (empty) value
+	 * before handing the structure to sigaction()
+	 */
+	sigemptyset(&act.sa_mask);
+	act.sa_handler = wake_me;
+	act.sa_flags = 0;
+	sigaction(SIGUSR1, &act, NULL);
+	sigaction(SIGALRM, &act, NULL);
+	act.sa_handler = hup;
+	sigaction(SIGHUP, &act, NULL);
+	act.sa_handler = term;
+	sigaction(SIGTERM, &act, NULL);
+	act.sa_handler = SIG_IGN;
+	sigaction(SIGPIPE, &act, NULL);
+
+	if (switchroot) {
+		/* we assume that /sys /proc /dev are available in
+		 * the new root (see nash:setuproot)
+		 *
+		 * kill any monitors in the current namespace and change
+		 * to the new one
+		 */
+		try_kill_monitor(container->devname);
+		if (chroot(switchroot) != 0) {
+			fprintf(stderr, "mdmon: failed to chroot to '%s': %s\n",
+				switchroot, strerror(errno));
+			exit(4);
+		}
+	}
+
+	/* If this fails, we hope it already exists
+	 * pid file lives in /var/run/mdadm/mdXX.pid
+	 */
+	mkdir("/var", 0600);
+	mkdir("/var/run", 0600);
+	mkdir("/var/run/mdadm", 0600);
+	ignore = chdir("/");
+	if (make_pidfile(container->devname, O_EXCL) < 0) {
+		if (ping_monitor(container->devname) == 0) {
+			fprintf(stderr, "mdmon: %s already managed\n",
+				container->devname);
+			exit(3);
+		} else {
+			int err;
+
+			/* cleanup the old monitor, this one is taking over */
+			try_kill_monitor(container->devname);
+			err = make_pidfile(container->devname, 0);
+			if (err < 0) {
+				fprintf(stderr, "mdmon: %s Cannot create pidfile\n",
+					container->devname);
+				if (err == -EROFS) {
+					/* FIXME implement a mechanism to
+					 * prevent duplicate monitor instances
+					 */
+					fprintf(stderr,
+						"mdmon: continuing on read-only file system\n");
+				} else
+					exit(3);
+			}
+		}
+	}
+	container->sock = make_control_sock(container->devname);
+
+	if (container->ss->load_super(container, mdfd, devname)) {
+		fprintf(stderr, "mdmon: Cannot load metadata for %s\n",
+			devname);
+		exit(3);
+	}
+	close(mdfd);
+
+	/* Ok, this is close enough.  We can say goodbye to our parent now.
+	 * There is no pipe (pfd[1] == -1) when we did not fork.
+	 */
+	status = 0;
+	if (pfd[1] >= 0) {
+		if (write(pfd[1], &status, sizeof(status)) < 0)
+			fprintf(stderr, "mdmon: failed to notify our parent: %d\n",
+				getppid());
+		close(pfd[1]);
+	}
+
+	/* detach: new session, stdin/stdout (and, unless DEBUG, stderr)
+	 * redirected to /dev/null
+	 */
+	setsid();
+	close(0);
+	open("/dev/null", O_RDWR);
+	close(1);
+	ignore = dup(0);
+#ifndef DEBUG
+	close(2);
+	ignore = dup(0);
+#endif
+
+	mlockall(MCL_FUTURE);
+
+	if (clone_monitor(container) < 0) {
+		fprintf(stderr, "mdmon: failed to start monitor process: %s\n",
+			strerror(errno));
+		exit(2);
+	}
+
+	do_manager(container);
+
+	exit(0);
+}
--- /dev/null
+/*
+ * mdmon - monitor external metadata arrays
+ *
+ * Copyright (C) 2007-2009 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2007-2009 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+/* Array states tracked per active array (see the prev_state, curr_state
+ * and next_state members of struct active_array).
+ * NOTE(review): the names look like the words of the md sysfs
+ * 'array_state' attribute, with bad_word presumably standing for an
+ * unrecognised word - confirm against the code that parses it.
+ */
+enum array_state { clear, inactive, suspended, readonly, read_auto,
+ clean, active, write_pending, active_idle, bad_word};
+
+/* Sync actions tracked per active array (see prev_action, curr_action and
+ * next_action in struct active_array).
+ * NOTE(review): presumably the words of the sysfs 'sync_action' attribute,
+ * with bad_action for anything unrecognised - confirm against the parser.
+ */
+enum sync_action { idle, reshape, resync, recover, check, repair, bad_action };
+
+
+/* Per-array bookkeeping for one array inside a monitored container.
+ * Instances are chained through 'next'.
+ */
+struct active_array {
+ struct mdinfo info;
+ struct supertype *container; /* container this array belongs to */
+ struct active_array *next, *replaces;
+
+ /* NOTE(review): presumably open descriptors on this array's sysfs
+  * sync_action and resync_start attributes - confirm where they are
+  * opened.
+  */
+ int action_fd;
+ int resync_start_fd;
+ int metadata_fd; /* for monitoring rw/ro status */
+
+ /* previous / current / requested-next value of each attribute */
+ enum array_state prev_state, curr_state, next_state;
+ enum sync_action prev_action, curr_action, next_action;
+
+ int check_degraded; /* flag set by mon, read by manage */
+
+ int devnum;
+
+ /* compared with info.component_size by is_resync_complete() */
+ unsigned long long resync_start;
+};
+
+/*
+ * Metadata updates are handled by the monitor thread,
+ * as it has exclusive access to the metadata.
+ * When the manager wants to update metadata, either
+ * for its own reason (e.g. committing a spare) or
+ * on behalf of mdadm, it creates a metadata_update
+ * structure and queues it to the monitor.
+ * Updates are created and processed by code under the
+ * superswitch. All common code sees them as opaque
+ * blobs.
+ */
+extern struct metadata_update *update_queue, *update_queue_handled;
+
+#define MD_MAJOR 9
+
+extern struct active_array *container;
+extern struct active_array *discard_this;
+extern struct active_array *pending_discard;
+extern struct md_generic_cmd *active_cmd;
+
+
+void remove_pidfile(char *devname);
+void do_monitor(struct supertype *container);
+void do_manager(struct supertype *container);
+int make_control_sock(char *devname);
+int make_pidfile(char *devname, int o_excl);
+extern int socket_hup_requested;
+extern int sigterm;
+
+int read_dev_state(int fd);
+int get_resync_start(struct active_array *a);
+int is_container_member(struct mdstat_ent *mdstat, char *container);
+
+struct mdstat_ent *mdstat_read(int hold, int start);
+
+extern int exit_now, manager_ready;
+extern int mon_tid, mgr_tid;
+extern int monitor_loop_cnt;
+
+/* helper routine to determine resync completion since MaxSector is a
+ * moving target
+ */
+static inline int is_resync_complete(struct active_array *a)
+{
+	/* complete once resync_start has reached the component size */
+	return a->resync_start >= a->info.component_size;
+}
+
/*
* mdadm - manage Linux "md" devices aka RAID arrays.
*
- * Copyright (C) 2001-2006 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
*
*
* This program is free software; you can redistribute it and/or modify
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
* Author: Neil Brown
- * Email: <neilb@cse.unsw.edu.au>
- * Paper: Neil Brown
- * School of Computer Science and Engineering
- * The University of New South Wales
- * Sydney, 2052
- * Australia
+ * Email: <neilb@suse.de>
*/
#include "mdadm.h"
#include <ctype.h>
-void make_dev_symlink(char *dev)
-{
- char *new = strdup(dev);
-
- if (!new) return;
- /* /dev/md/0 -> /dev/md0
- * /dev/md/d0 -> /dev/md_d0
- */
- if (isdigit(new[8]))
- strcpy(new+7, new+8);
- else
- new[7] = '_';
- if (symlink(dev+5, new))
- perror(new);
-}
-
-
-void make_parts(char *dev, int cnt, int symlinks)
+void make_parts(char *dev, int cnt)
{
/* make 'cnt' partition devices for 'dev'
- * We use the major/minor from dev and add 1..cnt
+ * If dev is a device name we use the
+ * major/minor from dev and add 1..cnt
+ * If it is a symlink, we make similar symlinks.
* If dev ends with a digit, we add "p%d" else "%d"
* If the name exists, we use it's owner/mode,
* else that of dev
*/
struct stat stb;
- int major_num, minor_num;
+ int major_num = major_num; /* quiet gcc -Os unitialized warning */
+ int minor_num = minor_num; /* quiet gcc -Os unitialized warning */
+ int odig = odig; /* quiet gcc -Os unitialized warning */
int i;
int nlen = strlen(dev) + 20;
char *name = malloc(nlen);
int dig = isdigit(dev[strlen(dev)-1]);
+ char orig[1024];
+ char sym[1024];
+ int err;
if (cnt==0) cnt=4;
- if (stat(dev, &stb)!= 0)
+ if (lstat(dev, &stb)!= 0)
return;
- if (!S_ISBLK(stb.st_mode))
- return;
- major_num = major(stb.st_rdev);
- minor_num = minor(stb.st_rdev);
+ if (S_ISLNK(stb.st_mode)) {
+ int len = readlink(dev, orig, sizeof(orig));
+ if (len < 0 || len > 1000)
+ return;
+ orig[len] = 0;
+ odig = isdigit(orig[len-1]);
+ } else if (S_ISBLK(stb.st_mode)) {
+ major_num = major(stb.st_rdev);
+ minor_num = minor(stb.st_rdev);
+ } else
+ return;
for (i=1; i <= cnt ; i++) {
struct stat stb2;
snprintf(name, nlen, "%s%s%d", dev, dig?"p":"", i);
} else {
stb2 = stb;
}
- if (mknod(name, S_IFBLK | 0600, makedev(major_num, minor_num+i)))
- perror("mknod");
- if (chown(name, stb2.st_uid, stb2.st_gid))
- perror("chown");
- if (chmod(name, stb2.st_mode & 07777))
- perror("chmod");
- if (symlinks && strncmp(name, "/dev/md/", 8) == 0)
- make_dev_symlink(name);
- stat(name, &stb2);
- add_dev(name, &stb2, 0, NULL);
+ if (S_ISBLK(stb.st_mode)) {
+ if (mknod(name, S_IFBLK | 0600,
+ makedev(major_num, minor_num+i)))
+ perror("mknod");
+ if (chown(name, stb2.st_uid, stb2.st_gid))
+ perror("chown");
+ if (chmod(name, stb2.st_mode & 07777))
+ perror("chmod");
+ err = 0;
+ } else {
+ snprintf(sym, sizeof(sym), "%s%s%d", orig, odig?"p":"", i);
+ err = symlink(sym, name);
+ }
+
+ if (err == 0 && stat(name, &stb2) == 0)
+ add_dev(name, &stb2, 0, NULL);
}
}
/*
- * Open a given md device, and check that it really is one.
- * If 'autof' is given, then we need to create, or recreate, the md device.
- * If the name already exists, and is not a block device, we fail.
- * If it exists and is not an md device, is not the right type (partitioned or not),
- * or is currently in-use, we remove the device, but remember the owner and mode.
- * If it now doesn't exist, we find a new md array and create the device.
- * Default ownership/mode comes from config file.
+ * We need a new md device to assemble/build/create an array.
+ * 'dev' is a name given us by the user (command line or mdadm.conf)
+ * It might start with /dev or /dev/md and might end with a digit
+ * string.
+ * If it starts with just /dev, it must be /dev/mdX or /dev/md_dX
+ * If it ends with a digit string, then it must be as above, or
+ * 'trustworthy' must be 'METADATA' and the 'dev' must be
+ * /dev/md/'name'NN or 'name'NN
+ * If it doesn't end with a digit string, it must be /dev/md/'name'
+ * or 'name' or must be NULL.
+ * If the digit string is present, it gives the minor number to use
+ * If not, we choose a high, unused minor number.
+ * If the 'dev' is a standard name, it decides whether to use 'md' or 'mdp'.
+ * else if the name is 'd[0-9]+' then we use mdp
+ * else if trustworthy is 'METADATA' we use md
+ * else the choice depends on 'autof'.
+ * If name is NULL it is assumed to match whatever dev provides.
+ * If both name and dev are NULL, we choose a name 'mdXX' or 'mdpXX'
+ *
+ * If 'name' is given, and 'trustworthy' is 'foreign' and name is not
+ * supported by 'dev', we add a "_%d" suffix based on the minor number
+ * and use that.
+ *
+ * If udev is configured, we create a temporary device, open it, and
+ * unlink it.
+ * If not, we create the /dev/mdXX device, and if name is usable,
+ * /dev/md/name
+ * In any case we return /dev/md/name or (if that isn't available)
+ * /dev/mdXX in 'chosen'.
+ *
+ * When we create devices, we use uid/gid/umask from config file.
*/
-int open_mddev(char *dev, int autof)
+
+int create_mddev(char *dev, char *name, int autof, int trustworthy,
+ char *chosen)
{
int mdfd;
struct stat stb;
- int major_num = MD_MAJOR;
- int minor_num = 0;
- int must_remove = 0;
- int num;
+ int num = -1;
+ int use_mdp = -1;
struct createinfo *ci = conf_get_create_info();
int parts;
+ char *cname;
+ char devname[20];
+ char cbuf[400];
+ if (chosen == NULL)
+ chosen = cbuf;
+
if (autof == 0)
autof = ci->autof;
parts = autof >> 3;
autof &= 7;
- if (autof && autof != 1) {
- /* autof is set, so we need to check that the name is ok,
- * and possibly create one if not
- */
- int std;
- stb.st_mode = 0;
- if (stat(dev, &stb)==0 && ! S_ISBLK(stb.st_mode)) {
- fprintf(stderr, Name ": %s is not a block device.\n",
- dev);
- return -1;
- }
- /* check major number is correct */
- num = -1;
- std = is_standard(dev, &num);
- if (std>0) major_num = get_mdp_major();
- switch(autof) {
- case 2: /* only create is_standard names */
- if (!std && !stb.st_mode) {
- fprintf(stderr, Name
- ": %s does not exist and is not a 'standard' name "
- "so it cannot be created\n", dev);
- return -1;
- }
- break;
- case 3: /* create md, reject std>0 */
- if (std > 0) {
- fprintf(stderr, Name ": that --auto option "
- "not compatable with device named %s\n", dev);
- return -1;
- }
- break;
- case 4: /* create mdp, reject std<0 */
- if (std < 0) {
- fprintf(stderr, Name ": that --auto option "
- "not compatable with device named %s\n", dev);
+ strcpy(chosen, "/dev/md/");
+ cname = chosen + strlen(chosen);
+
+
+ if (dev) {
+
+ if (strncmp(dev, "/dev/md/", 8) == 0) {
+ strcpy(cname, dev+8);
+ } else if (strncmp(dev, "/dev/", 5) == 0) {
+ char *e = dev + strlen(dev);
+ while (e > dev && isdigit(e[-1]))
+ e--;
+ if (e[0])
+ num = strtoul(e, NULL, 10);
+ strcpy(cname, dev+5);
+ cname[e-(dev+5)] = 0;
+ /* name *must* be mdXX or md_dXX in this context */
+ if (num < 0 ||
+ (strcmp(cname, "md") != 0 && strcmp(cname, "md_d") != 0)) {
+ fprintf(stderr, Name ": %s is an invalid name "
+ "for an md device. Try /dev/md/%s\n",
+ dev, dev+5);
return -1;
}
- major_num = get_mdp_major();
- break;
- case 5: /* default to md if not standard */
- break;
- case 6: /* default to mdp if not standard */
- if (std == 0) major_num = get_mdp_major();
- break;
+ if (strcmp(cname, "md") == 0)
+ use_mdp = 0;
+ else
+ use_mdp = 1;
+ /* recreate name: /dev/md/0 or /dev/md/d0 */
+ sprintf(cname, "%s%d", use_mdp?"d":"", num);
+ } else
+ strcpy(cname, dev);
+
+ /* 'cname' must not contain a slash, and may not be
+ * empty.
+ */
+ if (strchr(cname, '/') != NULL) {
+ fprintf(stderr, Name ": %s is an invalid name "
+ "for an md device.\n", dev);
+ return -1;
}
- /* major is final. num is -1 if not standard */
- if (stb.st_mode && major(stb.st_rdev) != major_num)
- must_remove = 1;
- if (stb.st_mode && !must_remove) {
- /* looks ok, see if it is available */
- mdfd = open(dev, O_RDWR);
- if (mdfd < 0) {
- fprintf(stderr, Name ": error opening %s: %s\n",
- dev, strerror(errno));
- return -1;
- } else if (md_get_version(mdfd) <= 0) {
- fprintf(stderr, Name ": %s does not appear to be an md device\n",
- dev);
- close(mdfd);
- return -1;
- }
- if (major_num != MD_MAJOR && parts > 0)
- make_parts(dev, parts, ci->symlinks);
- return mdfd;
+ if (cname[0] == 0) {
+ fprintf(stderr, Name ": %s is an invalid name "
+ "for an md device (empty!).", dev);
+ return -1;
}
- /* Ok, need to find a minor that is not in use.
- * If the device name is in a 'standard' format,
- * intuit the minor from that, else
- * easiest to read /proc/mdstat, and hunt through for
- * an unused number
- */
if (num < 0) {
- /* need to pick an unused number */
- int num = find_free_devnum(major_num != MD_MAJOR);
-
- if (major_num == MD_MAJOR)
- minor_num = num;
+ /* If cname is 'N' or 'dN', we get dev number
+ * from there.
+ */
+ char *sp = cname;
+ char *ep;
+ if (cname[0] == 'd')
+ sp++;
+ num = strtoul(sp, &ep, 10);
+ if (ep == sp || *ep || num < 0)
+ num = -1;
+ else if (cname[0] == 'd')
+ use_mdp = 1;
else
- minor_num = (-1-num) << MdpMinorShift;
- } else if (major_num == MD_MAJOR)
- minor_num = num;
- else
- minor_num = num << MdpMinorShift;
- /* major and minor have been chosen */
-
- /* If it was a 'standard' name and it is in-use, then
- * the device could already be correct
- */
- if (stb.st_mode && major(stb.st_rdev) == major_num &&
- minor(stb.st_rdev) == minor_num)
- ;
- else {
- if (major(makedev(major_num,minor_num)) != major_num ||
- minor(makedev(major_num,minor_num)) != minor_num) {
- fprintf(stderr, Name ": Need newer C library to use more than 4 partitionable md devices, sorry\n");
- return -1;
- }
- if (must_remove)
- unlink(dev);
-
- if (strncmp(dev, "/dev/md/", 8) == 0) {
- if (mkdir("/dev/md",0700)==0) {
- if (chown("/dev/md", ci->uid, ci->gid))
- perror("chown /dev/md");
- if (chmod("/dev/md", ci->mode| ((ci->mode>>2) & 0111)))
- perror("chmod /dev/md");
- }
- }
- if (mknod(dev, S_IFBLK|0600, makedev(major_num, minor_num))!= 0) {
- fprintf(stderr, Name ": failed to create %s\n", dev);
- return -1;
- }
- if (must_remove) {
- if (chown(dev, stb.st_uid, stb.st_gid))
- perror("chown");
- if (chmod(dev, stb.st_mode & 07777))
- perror("chmod");
- } else {
- if (chown(dev, ci->uid, ci->gid))
- perror("chown");
- if (chmod(dev, ci->mode))
- perror("chmod");
- }
- stat(dev, &stb);
- add_dev(dev, &stb, 0, NULL);
- if (ci->symlinks && strncmp(dev, "/dev/md/", 8) == 0)
- make_dev_symlink(dev);
- if (major_num != MD_MAJOR)
- make_parts(dev,parts, ci->symlinks);
+ use_mdp = 0;
}
}
- mdfd = open(dev, O_RDWR);
- if (mdfd < 0)
- fprintf(stderr, Name ": error opening %s: %s\n",
- dev, strerror(errno));
- else if (md_get_version(mdfd) <= 0) {
- fprintf(stderr, Name ": %s does not appear to be an md device\n",
- dev);
- close(mdfd);
- mdfd = -1;
- }
- return mdfd;
-}
-
-int open_mddev_devnum(char *devname, int devnum, char *name,
- char *chosen_name, int parts)
-{
- /* Open the md device with number 'devnum', possibly using 'devname',
- * possibly constructing a name with 'name', but in any case, copying
- * the name into 'chosen_name'
- */
- int major_num, minor_num;
- struct stat stb;
- int i;
- struct createinfo *ci = conf_get_create_info();
-
- if (devname)
- strcpy(chosen_name, devname);
- else if (name && *name && strchr(name,'/') == NULL) {
- char *n = strchr(name, ':');
- if (n) n++; else n = name;
- if (isdigit(*n) && devnum < 0)
- sprintf(chosen_name, "/dev/md/d%s", n);
- else
- sprintf(chosen_name, "/dev/md/%s", n);
- } else {
- if (devnum >= 0)
- sprintf(chosen_name, "/dev/md%d", devnum);
+ /* Now determine device number */
+ /* named 'METADATA' cannot use 'mdp'. */
+ if (name && name[0] == 0)
+ name = NULL;
+ if (name && trustworthy == METADATA && use_mdp == 1) {
+ fprintf(stderr, Name ": %s is not allowed for a %s container. "
+ "Consider /dev/md%d.\n", dev, name, num);
+ return -1;
+ }
+ if (name && trustworthy == METADATA)
+ use_mdp = 0;
+ if (use_mdp == -1) {
+ if (autof == 4 || autof == 6)
+ use_mdp = 1;
else
- sprintf(chosen_name, "/dev/md/d%d", -1-devnum);
+ use_mdp = 0;
}
- if (devnum >= 0) {
- major_num = MD_MAJOR;
- minor_num = devnum;
- } else {
- major_num = get_mdp_major();
- minor_num = (-1-devnum) << 6;
+ if (num < 0 && trustworthy == LOCAL && name) {
+ /* if name is numeric, possibly prefixed by
+ * 'md' or '/dev/md', use that for num
+ * if it is not already in use */
+ char *ep;
+ char *n2 = name;
+ if (strncmp(n2, "/dev/", 5) == 0)
+ n2 += 5;
+ if (strncmp(n2, "md", 2) == 0)
+ n2 += 2;
+ if (*n2 == '/')
+ n2++;
+ num = strtoul(n2, &ep, 10);
+ if (ep == n2 || *ep)
+ num = -1;
+ else if (mddev_busy(use_mdp ? (-1-num) : num))
+ num = -1;
}
- if (stat(chosen_name, &stb) == 0) {
- /* It already exists. Check it is right. */
- if ( ! S_ISBLK(stb.st_mode) ||
- stb.st_rdev != makedev(major_num, minor_num)) {
- errno = EEXIST;
+
+ if (num < 0) {
+ /* need to choose a free number. */
+ num = find_free_devnum(use_mdp);
+ if (num == NoMdDev) {
+ fprintf(stderr, Name ": No avail md devices - aborting\n");
return -1;
}
} else {
- /* special case: if --incremental is suggesting a name
- * in /dev/md/, we make sure the directory exists.
+ num = use_mdp ? (-1-num) : num;
+ if (mddev_busy(num)) {
+ fprintf(stderr, Name ": %s is already in use.\n",
+ dev);
+ return -1;
+ }
+ }
+
+ if (num < 0)
+ sprintf(devname, "/dev/md_d%d", -1-num);
+ else
+ sprintf(devname, "/dev/md%d", num);
+
+ if (cname[0] == 0 && name) {
+ /* Need to find a name if we can
+ * We don't completely trust 'name'. Truncate to
+ * reasonable length and remove '/'
*/
- if (strncmp(chosen_name, "/dev/md/", 8) == 0) {
+ char *cp;
+ struct map_ent *map = NULL;
+ int conflict = 1;
+ int unum = 0;
+ int cnlen;
+ strncpy(cname, name, 200);
+ cname[200] = 0;
+ while ((cp = strchr(cname, '/')) != NULL)
+ *cp = '-';
+ if (trustworthy == LOCAL ||
+ (trustworthy == FOREIGN && strchr(cname, ':') != NULL)) {
+ /* Only need suffix if there is a conflict */
+ if (map_by_name(&map, cname) == NULL)
+ conflict = 0;
+ }
+ cnlen = strlen(cname);
+ while (conflict) {
+ if (trustworthy == METADATA && !isdigit(cname[cnlen-1]))
+ sprintf(cname+cnlen, "%d", unum);
+ else
+ /* add _%d to FOREIGN arrays that don't have
+ * a 'host:' prefix
+ */
+ sprintf(cname+cnlen, "_%d", unum);
+ unum++;
+ if (map_by_name(&map, cname) == NULL)
+ conflict = 0;
+ }
+ }
+
+ if (dev)
+ strcpy(chosen, dev);
+ else if (cname[0] == 0)
+ strcpy(chosen, devname);
+
+ /* We have a device number and name.
+ * If we cannot detect udev, we need to make
+ * devices and links ourselves.
+ */
+ if (stat("/dev/.udev", &stb) != 0 ||
+ check_env("MDADM_NO_UDEV")) {
+ /* Make sure 'devname' exists and 'chosen' is a symlink to it */
+ if (lstat(devname, &stb) == 0) {
+ /* Must be the correct device, else error */
+ if ((stb.st_mode&S_IFMT) != S_IFBLK ||
+ stb.st_rdev != makedev(dev2major(num),dev2minor(num))) {
+ fprintf(stderr, Name ": %s exists but looks wrong, please fix\n",
+ devname);
+ return -1;
+ }
+ } else {
+ if (mknod(devname, S_IFBLK|0600,
+ makedev(dev2major(num),dev2minor(num))) != 0) {
+ fprintf(stderr, Name ": failed to create %s\n",
+ devname);
+ return -1;
+ }
+ if (chown(devname, ci->uid, ci->gid))
+ perror("chown");
+ if (chmod(devname, ci->mode))
+ perror("chmod");
+ stat(devname, &stb);
+ add_dev(devname, &stb, 0, NULL);
+ }
+ if (use_mdp == 1)
+ make_parts(devname, parts);
+ if (strcmp(chosen, devname) != 0) {
+
if (mkdir("/dev/md",0700)==0) {
if (chown("/dev/md", ci->uid, ci->gid))
perror("chown /dev/md");
- if (chmod("/dev/md", ci->mode|
- ((ci->mode>>2) & 0111)))
+ if (chmod("/dev/md", ci->mode| ((ci->mode>>2) & 0111)))
perror("chmod /dev/md");
}
- }
- if (mknod(chosen_name, S_IFBLK | 0600,
- makedev(major_num, minor_num)) != 0) {
- return -1;
+ if (dev && strcmp(chosen, dev) == 0)
+ /* We know we are allowed to use this name */
+ unlink(chosen);
+
+ if (lstat(chosen, &stb) == 0) {
+ char buf[300];
+ if ((stb.st_mode & S_IFMT) != S_IFLNK ||
+ readlink(chosen, buf, 300) <0 ||
+ strcmp(buf, devname) != 0) {
+ fprintf(stderr, Name ": %s exists - ignoring\n",
+ chosen);
+ strcpy(chosen, devname);
+ }
+ } else if (symlink(devname, chosen) != 0)
+ fprintf(stderr, Name ": failed to create %s: %s\n",
+ chosen, strerror(errno));
+ if (use_mdp && strcmp(chosen, devname) != 0)
+ make_parts(chosen, parts);
}
- /* FIXME chown/chmod ?? */
}
+ mdfd = open_dev_excl(num);
+ if (mdfd < 0)
+ fprintf(stderr, Name ": unexpected failure opening %s\n",
+ devname);
+ return mdfd;
+}
- /* Simple locking to avoid --incr being called for the same
- * array multiple times in parallel.
- */
- for (i = 0; i < 25 ; i++) {
- int fd;
- fd = open(chosen_name, O_RDWR|O_EXCL);
- if (fd >= 0 || errno != EBUSY) {
- if (devnum < 0)
- make_parts(chosen_name, parts, ci->symlinks);
- return fd;
- }
- usleep(200000);
+/* Open this and check that it is an md device.
+ * On success, return filedescriptor.
+ * On failure, return -1 if it doesn't exist,
+ * or -2 if it exists but is not an md device.
+ */
+int open_mddev(char *dev, int report_errors)
+{
+ int mdfd = open(dev, O_RDWR);
+ if (mdfd < 0) {
+ if (report_errors)
+ fprintf(stderr, Name ": error opening %s: %s\n",
+ dev, strerror(errno));
+ return -1;
}
- return -1;
+ if (md_get_version(mdfd) <= 0) {
+ close(mdfd);
+ if (report_errors)
+ fprintf(stderr, Name ": %s does not appear to be "
+ "an md device\n", dev);
+ return -2;
+ }
+ return mdfd;
}
* mdstat - parse /proc/mdstat file. Part of:
* mdadm - manage Linux "md" devices aka RAID arrays.
*
- * Copyright (C) 2002-2006 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2002-2009 Neil Brown <neilb@suse.de>
*
*
* This program is free software; you can redistribute it and/or modify
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
* Author: Neil Brown
- * Email: <neilb@cse.unsw.edu.au>
- * Paper: Neil Brown
- * School of Computer Science and Engineering
- * The University of New South Wales
- * Sydney, 2052
- * Australia
+ * Email: <neilb@suse.de>
*/
/*
#include "mdadm.h"
#include "dlink.h"
#include <sys/select.h>
+#include <ctype.h>
void free_mdstat(struct mdstat_ent *ms)
{
if (ms->dev) free(ms->dev);
if (ms->level) free(ms->level);
if (ms->pattern) free(ms->pattern);
+ if (ms->metadata_version) free(ms->metadata_version);
t = ms;
ms = ms->next;
free(t);
ent->percent = -1;
ent->active = -1;
ent->resync = 0;
+ ent->metadata_version = NULL;
+ ent->raid_disks = 0;
+ ent->chunk_size = 0;
+ ent->devcnt = 0;
ent->dev = strdup(line);
ent->devnum = devnum;
in_devs = 1;
} else if (in_devs && strcmp(w, "blocks")==0)
in_devs = 0;
- else if (in_devs && strncmp(w, "md", 2)==0) {
- /* This has an md device as a component.
- * If that device is already in the list,
- * make sure we insert before there.
- */
- struct mdstat_ent **ih;
- int dn2;
- if (strncmp(w, "md_d", 4)==0)
- dn2 = -1-strtoul(w+4, &ep, 10);
- else
- dn2 = strtoul(w+2, &ep, 10);
- ih = &all;
- while (ih != insert_here && *ih &&
- (*ih)->devnum != dn2)
- ih = & (*ih)->next;
- insert_here = ih;
+ else if (in_devs) {
+ ent->devcnt++;
+ if (strncmp(w, "md", 2)==0) {
+ /* This has an md device as a component.
+ * If that device is already in the
+ * list, make sure we insert before
+ * there.
+ */
+ struct mdstat_ent **ih;
+ int dn2 = devname2devnum(w);
+ ih = &all;
+ while (ih != insert_here && *ih &&
+ (*ih)->devnum != dn2)
+ ih = & (*ih)->next;
+ insert_here = ih;
+ }
+ } else if (strcmp(w, "super") == 0 &&
+ dl_next(w) != line) {
+ w = dl_next(w);
+ ent->metadata_version = strdup(w);
+ } else if (w[0] == '[' && isdigit(w[1])) {
+ ent->raid_disks = atoi(w+1);
} else if (!ent->pattern &&
w[0] == '[' &&
(w[1] == 'U' || w[1] == '_')) {
{
fd_set fds;
struct timeval tm;
+ int maxfd = 0;
FD_ZERO(&fds);
- if (mdstat_fd >= 0)
+ if (mdstat_fd >= 0) {
FD_SET(mdstat_fd, &fds);
+ maxfd = mdstat_fd;
+ }
tm.tv_sec = seconds;
tm.tv_usec = 0;
- select(mdstat_fd >2 ? mdstat_fd+1:3, NULL, NULL, &fds, &tm);
+ select(maxfd + 1, NULL, NULL, &fds, &tm);
+}
+
+/* Wait until something happens on /proc/mdstat (mdstat_fd) or on 'fd'.
+ * 'fd' may be negative, in which case only mdstat_fd (if open) is
+ * watched.  'sigmask' is handed to pselect() unchanged, so it is the
+ * signal mask in effect while waiting.
+ * The result of pselect() is not reported; callers re-check state
+ * themselves after this returns.
+ */
+void mdstat_wait_fd(int fd, const sigset_t *sigmask)
+{
+	fd_set fds, rfds;
+	int maxfd = fd;
+
+	FD_ZERO(&fds);
+	FD_ZERO(&rfds);
+	if (mdstat_fd >= 0)
+		FD_SET(mdstat_fd, &fds);
+	if (fd >= 0) {
+		struct stat stb;
+
+		/* Only trust st_mode when fstat() succeeded; previously a
+		 * failed fstat() left 'stb' uninitialised and the choice of
+		 * set was arbitrary.  On failure fall back to 'readable'.
+		 */
+		if (fstat(fd, &stb) == 0 &&
+		    (stb.st_mode & S_IFMT) == S_IFREG)
+			/* Must be a /proc or /sys fd, so expect
+			 * POLLPRI
+			 * i.e. an 'exceptional' event.
+			 */
+			FD_SET(fd, &fds);
+		else
+			FD_SET(fd, &rfds);
+	}
+	if (mdstat_fd > maxfd)
+		maxfd = mdstat_fd;
+
+	pselect(maxfd + 1, &rfds, NULL, &fds,
+		NULL, sigmask);
+}
int mddev_busy(int devnum)
--- /dev/null
+/*
+ * mdmon - monitor external metadata arrays
+ *
+ * Copyright (C) 2007-2009 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2007-2009 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "mdadm.h"
+#include "mdmon.h"
+#include <sys/syscall.h>
+#include <sys/select.h>
+#include <signal.h>
+
+/* Attribute words for array_state.  read_state() casts the index that
+ * sysfs_match_word() returns for this table to 'enum array_state', so
+ * the order here presumably has to match that enum's declaration
+ * (in mdmon.h?) - confirm before reordering.
+ */
+static char *array_states[] = {
+ "clear", "inactive", "suspended", "readonly", "read-auto",
+ "clean", "active", "write-pending", "active-idle", NULL };
+/* Attribute words for sync_action; used the same way by read_action()
+ * for 'enum sync_action'.
+ */
+static char *sync_actions[] = {
+ "idle", "reshape", "resync", "recover", "check", "repair", NULL
+};
+
+/* Write the string 'attr' (without its terminating NUL) to 'fd'.
+ * Returns whatever write() returned.
+ */
+static int write_attr(char *attr, int fd)
+{
+	size_t nbytes = strlen(attr);
+
+	return write(fd, attr, nbytes);
+}
+
+/* Add 'fd' to the set 'fds' and raise '*maxfd' to it if it is larger.
+ * A negative 'fd' is ignored.
+ */
+static void add_fd(fd_set *fds, int *maxfd, int fd)
+{
+	if (fd >= 0) {
+		FD_SET(fd, fds);
+		if (*maxfd < fd)
+			*maxfd = fd;
+	}
+}
+
+/* Re-read an attribute file from its start into 'buf' (capacity 'len').
+ * The result is always NUL-terminated and one trailing newline is
+ * stripped.
+ * Returns the number of bytes read (counting a stripped newline), or 0
+ * with an empty string in 'buf' if 'fd' is negative or nothing could
+ * be read.
+ */
+static int read_attr(char *buf, int len, int fd)
+{
+	int got = 0;
+
+	if (fd >= 0) {
+		lseek(fd, 0, SEEK_SET);
+		got = read(fd, buf, len - 1);
+	}
+	if (got <= 0) {
+		buf[0] = 0;
+		return 0;
+	}
+
+	buf[got] = 0;
+	if (buf[got - 1] == '\n')
+		buf[got - 1] = 0;
+	return got;
+}
+
+/* Refresh a->resync_start from the attribute behind a->resync_start_fd.
+ * The word "none" is stored as ~0ULL; anything else is parsed as a
+ * decimal number.
+ * Returns 1 when a value was stored, or 0 (leaving a->resync_start
+ * untouched) when nothing could be read.
+ */
+int get_resync_start(struct active_array *a)
+{
+	char buf[30];
+	int n = read_attr(buf, sizeof(buf), a->resync_start_fd);
+
+	if (n <= 0)
+		return n;
+
+	a->resync_start = (strncmp(buf, "none", 4) == 0)
+		? ~0ULL
+		: strtoull(buf, NULL, 10);
+	return 1;
+}
+
+
+/* Read the array_state attribute from 'fd' and translate it through
+ * the array_states[] table.  Returns bad_word if nothing could be read.
+ */
+static enum array_state read_state(int fd)
+{
+	char word[20];
+
+	if (read_attr(word, sizeof(word), fd) <= 0)
+		return bad_word;
+
+	return (enum array_state) sysfs_match_word(word, array_states);
+}
+
+/* Read the sync_action attribute from 'fd' and translate it through
+ * the sync_actions[] table.  Returns bad_action if nothing could be
+ * read.
+ */
+static enum sync_action read_action( int fd)
+{
+	char word[20];
+
+	if (read_attr(word, sizeof(word), fd) <= 0)
+		return bad_action;
+
+	return (enum sync_action) sysfs_match_word(word, sync_actions);
+}
+
+/* Read a per-device state attribute from 'fd' and convert its
+ * comma-separated words into a mask of DS_* flags.
+ * Returns 0 if nothing could be read or no known word was found.
+ */
+int read_dev_state(int fd)
+{
+	char buf[60];
+	char *word;
+	char *comma;
+	int flags = 0;
+
+	if (read_attr(buf, 60, fd) <= 0)
+		return 0;
+
+	/* visit the start of each comma-separated word in turn */
+	for (word = buf; word; word = comma ? comma + 1 : NULL) {
+		if (sysfs_attr_match(word, "faulty"))
+			flags |= DS_FAULTY;
+		if (sysfs_attr_match(word, "in_sync"))
+			flags |= DS_INSYNC;
+		if (sysfs_attr_match(word, "write_mostly"))
+			flags |= DS_WRITE_MOSTLY;
+		if (sysfs_attr_match(word, "spare"))
+			flags |= DS_SPARE;
+		if (sysfs_attr_match(word, "blocked"))
+			flags |= DS_BLOCKED;
+		comma = strchr(word, ',');
+	}
+	return flags;
+}
+
+/* Send SIGUSR1 to the thread whose id is 'mgr_tid' within this
+ * process.
+ */
+static void signal_manager(void)
+{
+	/* i.e. tgkill(getpid(), mgr_tid, SIGUSR1), issued via syscall() */
+	syscall(SYS_tgkill, getpid(), mgr_tid, SIGUSR1);
+}
+
+/* Monitor a set of active md arrays - all of which share the
+ * same metadata - and respond to events that require
+ * metadata update.
+ *
+ * New arrays are detected by another thread which allocates
+ * required memory and attaches the data structure to our list.
+ *
+ * Events:
+ * Array stops.
+ * This is detected by array_state going to 'clear' or 'inactive'.
+ * while we thought it was active.
+ * Response is to mark metadata as clean and 'clear' the array(??)
+ * write-pending
+ * array_state is 'write-pending'
+ * We mark metadata as 'dirty' then set array to 'active'.
+ * active_idle
+ * Either ignore, or mark clean, then mark metadata as clean.
+ *
+ * device fails
+ * detected by rd-N/state reporting "faulty"
+ * mark device as 'failed' in metadata, let the kernel release the
+ * device by writing '-blocked' to rd/state, and finally write 'remove' to
+ * rd/state. Before a disk can be replaced it must be failed and removed
+ * from all container members, this will be preemptive for the other
+ * arrays... safe?
+ *
+ * sync completes
+ * sync_action was 'resync' and becomes 'idle' and resync_start becomes
+ * MaxSector
+ * Notify metadata that sync is complete.
+ *
+ * recovery completes
+ * sync_action changes from 'recover' to 'idle'
+ * Check each device state and mark metadata if 'faulty' or 'in_sync'.
+ *
+ * deal with resync
+ * This only happens on finding a new array... mdadm will have set
+ * 'resync_start' to the correct value. If 'resync_start' indicates that a
+ * resync needs to occur, set the array to the 'active' state rather than the
+ * initial read-auto state.
+ *
+ *
+ *
+ * We wait for a change (poll/select) on array_state, sync_action, and
+ * each rd-X/state file.
+ * When we get any change, we check everything. So read each state file,
+ * then decide what to do.
+ *
+ * The core action is to write new metadata to all devices in the array.
+ * This is done at most once on any wakeup.
+ * After that we might:
+ * - update the array_state
+ * - set the role of some devices.
+ * - request a sync_action
+ *
+ */
+
+/* Sample the current state of array 'a' (array_state, sync_action and
+ * each member device's state), tell the metadata handler
+ * (a->container->ss) about anything that changed, flush the metadata
+ * with sync_metadata(), and then write the chosen follow-up states
+ * back to the array's attribute files.
+ *
+ * Side effects visible to the caller:
+ *  - a->prev_state / a->prev_action / mdi->prev_state are advanced to
+ *    the values just read.
+ *  - a->check_degraded is set and the manager is signalled when a
+ *    device failed or a resync/recovery finished.
+ *  - a->container is set to NULL when the array was found stopped.
+ *
+ * Returns non-zero ('dirty') when the array was seen in write_pending,
+ * active_idle, active, suspended or an unrecognised state, or when a
+ * readonly array is being moved to 'active'; otherwise 0.
+ *
+ * NOTE(review): the second argument of set_array_state() is passed as
+ * 1, 0 or 2 below; it looks like a 'consistent' flag with 2 meaning
+ * "ask the handler" - confirm against the superswitch definition.
+ */
+static int read_and_act(struct active_array *a)
+{
+ int check_degraded = 0;
+ int deactivate = 0;
+ struct mdinfo *mdi;
+ int dirty = 0;
+
+ /* bad_word/bad_action double as "nothing to write back" below */
+ a->next_state = bad_word;
+ a->next_action = bad_action;
+
+ a->curr_state = read_state(a->info.state_fd);
+ a->curr_action = read_action(a->action_fd);
+ for (mdi = a->info.devs; mdi ; mdi = mdi->next) {
+ mdi->next_state = 0;
+ if (mdi->state_fd >= 0)
+ mdi->curr_state = read_dev_state(mdi->state_fd);
+ }
+
+ if (a->curr_state <= inactive &&
+ a->prev_state > inactive) {
+ /* array has been stopped */
+ get_resync_start(a);
+ a->container->ss->set_array_state(a, 1);
+ a->next_state = clear;
+ deactivate = 1;
+ }
+ if (a->curr_state == write_pending) {
+ get_resync_start(a);
+ a->container->ss->set_array_state(a, 0);
+ a->next_state = active;
+ dirty = 1;
+ }
+ if (a->curr_state == active_idle) {
+ /* Set array to 'clean' FIRST, then mark clean
+ * in the metadata
+ */
+ a->next_state = clean;
+ dirty = 1;
+ }
+ if (a->curr_state == clean) {
+ get_resync_start(a);
+ a->container->ss->set_array_state(a, 1);
+ }
+ if (a->curr_state == active ||
+ a->curr_state == suspended ||
+ a->curr_state == bad_word)
+ dirty = 1;
+ if (a->curr_state == readonly) {
+ /* Well, I'm ready to handle things. If readonly
+ * wasn't requested, transition to read-auto.
+ */
+ char buf[64];
+ read_attr(buf, sizeof(buf), a->metadata_fd);
+ if (strncmp(buf, "external:-", 10) == 0) {
+ /* explicit request for readonly array. Leave it alone */
+ ;
+ } else {
+ get_resync_start(a);
+ if (a->container->ss->set_array_state(a, 2))
+ a->next_state = read_auto; /* array is clean */
+ else {
+ a->next_state = active; /* Now active for recovery etc */
+ dirty = 1;
+ }
+ }
+ }
+
+ if (!deactivate &&
+ a->curr_action == idle &&
+ a->prev_action == resync) {
+ /* A resync has finished. The endpoint is recorded in
+ * 'resync_start'. We don't update the metadata
+ * until the array goes inactive or readonly though.
+ * Just check if we need to fiddle spares.
+ */
+ get_resync_start(a);
+ a->container->ss->set_array_state(a, a->curr_state <= clean);
+ check_degraded = 1;
+ }
+
+ if (!deactivate &&
+ a->curr_action == idle &&
+ a->prev_action == recover) {
+ /* A recovery has finished. Some disks may be in sync now,
+ * and the array may no longer be degraded
+ */
+ for (mdi = a->info.devs ; mdi ; mdi = mdi->next) {
+ a->container->ss->set_disk(a, mdi->disk.raid_disk,
+ mdi->curr_state);
+ if (! (mdi->curr_state & DS_INSYNC))
+ check_degraded = 1;
+ }
+ }
+
+ /* Check for failures and if found:
+ * 1/ Record the failure in the metadata and unblock the device.
+ * FIXME update the kernel to stop notifying on failed drives when
+ * the array is readonly and we have cleared 'blocked'
+ * 2/ Try to remove the device if the array is writable, or can be
+ * made writable.
+ */
+ for (mdi = a->info.devs ; mdi ; mdi = mdi->next) {
+ if (mdi->curr_state & DS_FAULTY) {
+ a->container->ss->set_disk(a, mdi->disk.raid_disk,
+ mdi->curr_state);
+ check_degraded = 1;
+ mdi->next_state |= DS_UNBLOCK;
+ if (a->curr_state == read_auto) {
+ a->container->ss->set_array_state(a, 0);
+ a->next_state = active;
+ }
+ if (a->curr_state > readonly)
+ mdi->next_state |= DS_REMOVE;
+ }
+ }
+
+ /* Metadata is flushed before any of the state changes chosen
+ * above are written back to the array.
+ */
+ a->container->ss->sync_metadata(a->container);
+ dprintf("%s(%d): state:%s action:%s next(", __func__, a->info.container_member,
+ array_states[a->curr_state], sync_actions[a->curr_action]);
+
+ /* Effect state changes in the array */
+ if (a->next_state != bad_word) {
+ dprintf(" state:%s", array_states[a->next_state]);
+ write_attr(array_states[a->next_state], a->info.state_fd);
+ }
+ if (a->next_action != bad_action) {
+ write_attr(sync_actions[a->next_action], a->action_fd);
+ dprintf(" action:%s", sync_actions[a->next_action]);
+ }
+ for (mdi = a->info.devs; mdi ; mdi = mdi->next) {
+ if (mdi->next_state & DS_UNBLOCK) {
+ dprintf(" %d:-blocked", mdi->disk.raid_disk);
+ write_attr("-blocked", mdi->state_fd);
+ }
+
+ if ((mdi->next_state & DS_REMOVE) && mdi->state_fd >= 0) {
+ int remove_result;
+
+ /* the kernel may not be able to immediately remove the
+ * disk, we can simply wait until the next event to try
+ * again.
+ */
+ remove_result = write_attr("remove", mdi->state_fd);
+ if (remove_result > 0) {
+ dprintf(" %d:removed", mdi->disk.raid_disk);
+ close(mdi->state_fd);
+ mdi->state_fd = -1;
+ }
+ }
+ if (mdi->next_state & DS_INSYNC) {
+ write_attr("+in_sync", mdi->state_fd);
+ dprintf(" %d:+in_sync", mdi->disk.raid_disk);
+ }
+ }
+ dprintf(" )\n");
+
+ /* move curr_ to prev_ */
+ a->prev_state = a->curr_state;
+
+ a->prev_action = a->curr_action;
+
+ for (mdi = a->info.devs; mdi ; mdi = mdi->next) {
+ mdi->prev_state = mdi->curr_state;
+ mdi->next_state = 0;
+ }
+
+ if (check_degraded) {
+ /* manager will do the actual check */
+ a->check_degraded = 1;
+ signal_manager();
+ }
+
+ /* A NULL container marks this array as no longer monitored;
+ * wait_and_act() hands such arrays to the manager for disposal.
+ */
+ if (deactivate)
+ a->container = NULL;
+
+ return dirty;
+}
+
+/* Return the member device of 'a' whose disk major/minor numbers match,
+ * or NULL if the array has no such device.
+ */
+static struct mdinfo *
+find_device(struct active_array *a, int major, int minor)
+{
+	struct mdinfo *dev = a->info.devs;
+
+	while (dev &&
+	       (dev->disk.major != major || dev->disk.minor != minor))
+		dev = dev->next;
+
+	return dev;
+}
+
+/* For every array on the list starting at 'aa' that still has a
+ * container, look up the device with the same major/minor as 'failed'
+ * and, unless it is already flagged DS_FAULTY, write "faulty" to its
+ * state attribute.
+ */
+static void reconcile_failed(struct active_array *aa, struct mdinfo *failed)
+{
+	struct active_array *a = aa;
+
+	while (a) {
+		struct mdinfo *victim = NULL;
+
+		if (a->container)
+			victim = find_device(a, failed->disk.major,
+					     failed->disk.minor);
+		if (victim && !(victim->curr_state & DS_FAULTY))
+			write_attr("faulty", victim->state_fd);
+
+		a = a->next;
+	}
+}
+
+#ifdef DEBUG
+/* Debug aid: for each descriptor set in 'fds', print its number and the
+ * last path component of what /proc/<pid>/fd/<n> links to (or
+ * "unknown" if the link cannot be read) on stderr.
+ */
+static void dprint_wake_reasons(fd_set *fds)
+{
+	char proc_path[256];
+	char link[256];
+	int fd;
+
+	fprintf(stderr, "monitor: wake ( ");
+	for (fd = 0; fd < FD_SETSIZE; fd++) {
+		char *leaf;
+		int n;
+
+		if (!FD_ISSET(fd, fds))
+			continue;
+
+		sprintf(proc_path, "/proc/%d/fd/%d",
+			(int) getpid(), fd);
+		n = readlink(proc_path, link, sizeof(link) - 1);
+		if (n < 0) {
+			fprintf(stderr, "%d:unknown ", fd);
+			continue;
+		}
+		link[n] = '\0';
+
+		leaf = strrchr(link, '/');
+		if (leaf)
+			leaf++;
+		else
+			leaf = link;
+		fprintf(stderr, "%d:%s ", fd, leaf);
+	}
+	fprintf(stderr, ")\n");
+}
+#endif
+
+/* Progress counter for the monitor loop: wait_and_act() sets the low
+ * bit just before calling pselect() and adds one once it returns, so
+ * the value is odd while the monitor is waiting in pselect().
+ */
+int monitor_loop_cnt;
+
+/* One iteration of the monitor loop for 'container'.
+ *
+ *  1. Walk container->arrays: arrays whose ->container is NULL are
+ *     unlinked and handed to the manager through 'discard_this' (one
+ *     at a time); for the rest, the array_state, sync_action and
+ *     per-device state descriptors are collected into one fd set.
+ *  2. If the manager is ready and either no arrays remain or SIGTERM
+ *     was seen with nothing dirty, try to exit the process.
+ *  3. Unless 'nowait' is set, block in pselect() until one of the
+ *     collected descriptors reports an exceptional condition or a
+ *     signal arrives.
+ *  4. Apply any queued metadata updates.
+ *  5. Run read_and_act() on every array that still has a container and
+ *     propagate device failures to the other arrays.
+ *
+ * Returns 1 if read_and_act() ran for at least one array, else 0; no
+ * path in this function returns a negative value.
+ */
+static int wait_and_act(struct supertype *container, int nowait)
+{
+ fd_set rfds;
+ int maxfd = 0;
+ struct active_array **aap = &container->arrays;
+ struct active_array *a, **ap;
+ int rv;
+ struct mdinfo *mdi;
+ static unsigned int dirty_arrays = ~0; /* start at some non-zero value */
+
+ FD_ZERO(&rfds);
+
+ for (ap = aap ; *ap ;) {
+ a = *ap;
+ /* once an array has been deactivated we want to
+ * ask the manager to discard it.
+ */
+ if (!a->container) {
+ /* 'discard_this' holds a single array; if it is
+ * still occupied, leave this one on the list for
+ * a later pass.
+ */
+ if (discard_this) {
+ ap = &(*ap)->next;
+ continue;
+ }
+ *ap = a->next;
+ a->next = NULL;
+ discard_this = a;
+ signal_manager();
+ continue;
+ }
+
+ add_fd(&rfds, &maxfd, a->info.state_fd);
+ add_fd(&rfds, &maxfd, a->action_fd);
+ for (mdi = a->info.devs ; mdi ; mdi = mdi->next)
+ add_fd(&rfds, &maxfd, mdi->state_fd);
+
+ ap = &(*ap)->next;
+ }
+
+ if (manager_ready && (*aap == NULL || (sigterm && !dirty_arrays))) {
+ /* No interesting arrays, or we have been told to
+ * terminate and everything is clean. Lets see about
+ * exiting. Note that blocking at this point is not a
+ * problem as there are no active arrays, there is
+ * nothing that we need to be ready to do.
+ */
+ int fd = open_dev_excl(container->devnum);
+ if (fd >= 0 || errno != EBUSY) {
+ /* OK, we are safe to leave */
+ if (sigterm && !dirty_arrays)
+ dprintf("caught sigterm, all clean... exiting\n");
+ else
+ dprintf("no arrays to monitor... exiting\n");
+ remove_pidfile(container->devname);
+ exit_now = 1;
+ signal_manager();
+ exit(0);
+ }
+ }
+
+ if (!nowait) {
+ sigset_t set;
+ /* a NULL 'set' argument only fetches the current mask */
+ sigprocmask(SIG_UNBLOCK, NULL, &set);
+ /* let SIGUSR1 be delivered while we wait in pselect() */
+ sigdelset(&set, SIGUSR1);
+ monitor_loop_cnt |= 1;
+ /* the fds are passed as the 'exceptfds' argument */
+ rv = pselect(maxfd+1, NULL, NULL, &rfds, NULL, &set);
+ monitor_loop_cnt += 1;
+ if (rv == -1 && errno == EINTR)
+ rv = 0;
+ #ifdef DEBUG
+ dprint_wake_reasons(&rfds);
+ #endif
+
+ }
+
+ if (update_queue) {
+ struct metadata_update *this;
+
+ for (this = update_queue; this ; this = this->next)
+ container->ss->process_update(container, this);
+
+ /* pass the processed list back and wake the manager */
+ update_queue_handled = update_queue;
+ update_queue = NULL;
+ signal_manager();
+ container->ss->sync_metadata(container);
+ }
+
+ /* the pselect() result is not used beyond this point */
+ rv = 0;
+ dirty_arrays = 0;
+ for (a = *aap; a ; a = a->next) {
+ int is_dirty;
+
+ if (a->replaces && !discard_this) {
+ /* unlink the array that 'a' replaces from the rest
+ * of the list and hand it to the manager
+ */
+ struct active_array **ap;
+ for (ap = &a->next; *ap && *ap != a->replaces;
+ ap = & (*ap)->next)
+ ;
+ if (*ap)
+ *ap = (*ap)->next;
+ discard_this = a->replaces;
+ a->replaces = NULL;
+ /* FIXME check if device->state_fd need to be cleared?*/
+ signal_manager();
+ }
+ if (a->container) {
+ is_dirty = read_and_act(a);
+ rv |= 1;
+ dirty_arrays += is_dirty;
+ /* when terminating stop manipulating the array after it
+ * is clean, but make sure read_and_act() is given a
+ * chance to handle 'active_idle'
+ */
+ if (sigterm && !is_dirty)
+ a->container = NULL; /* stop touching this array */
+ }
+ }
+
+ /* propagate failures across container members */
+ for (a = *aap; a ; a = a->next) {
+ if (!a->container)
+ continue;
+ for (mdi = a->info.devs ; mdi ; mdi = mdi->next)
+ if (mdi->curr_state & DS_FAULTY)
+ reconcile_failed(*aap, mdi);
+ }
+
+ return rv;
+}
+
+/* Monitor main loop: call wait_and_act() repeatedly for 'container'
+ * until it returns a negative value.  Only the very first call is made
+ * with 'nowait' set, so the initial pass does not block.
+ */
+void do_monitor(struct supertype *container)
+{
+	int nowait = 1;
+
+	for (;;) {
+		if (wait_and_act(container, nowait) < 0)
+			break;
+		nowait = 0;
+	}
+}
--- /dev/null
+/*
+ * Copyright (C) 2008 Intel Corporation
+ *
+ * mdmon socket / message handling
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include <unistd.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include "mdadm.h"
+#include "mdmon.h"
+
+/* Framing words: send_message() writes start_magic before the length
+ * word and end_magic after the payload; receive_message() rejects a
+ * message whose framing words do not match.
+ */
+static const __u32 start_magic = 0x5a5aa5a5;
+static const __u32 end_magic = 0xa5a55a5a;
+
+/* Write exactly 'len' bytes from 'buf' to 'fd'.
+ * Before each write() the descriptor is checked for writability with
+ * select(); 'tmo' is the timeout in seconds and tmo == 0 means wait
+ * without a timeout.
+ * Returns 0 once everything has been written, or -1 if select() timed
+ * out or failed, or write() returned an error or wrote nothing.
+ */
+static int send_buf(int fd, const void* buf, int len, int tmo)
+{
+	/* Walk the data through a byte pointer: arithmetic directly on a
+	 * 'void *' is a GNU extension, not standard C.
+	 */
+	const char *p = buf;
+	fd_set set;
+	int rv;
+	struct timeval timeout = {tmo, 0};
+	struct timeval *ptmo = tmo ? &timeout : NULL;
+
+	while (len) {
+		FD_ZERO(&set);
+		FD_SET(fd, &set);
+		rv = select(fd+1, NULL, &set, NULL, ptmo);
+		if (rv <= 0)
+			return -1;
+		rv = write(fd, p, len);
+		if (rv <= 0)
+			return -1;
+		len -= rv;
+		p += rv;
+	}
+	return 0;
+}
+
+/* Read exactly 'len' bytes from 'fd' into 'buf'.
+ * Before each read() the descriptor is checked for readability with
+ * select(); 'tmo' is the timeout in seconds and tmo == 0 means wait
+ * without a timeout.
+ * Returns 0 once 'len' bytes have arrived, or -1 if select() timed out
+ * or failed, or read() returned an error or end-of-file.
+ */
+static int recv_buf(int fd, void* buf, int len, int tmo)
+{
+	/* Walk the buffer through a byte pointer: arithmetic directly on
+	 * a 'void *' is a GNU extension, not standard C.
+	 */
+	char *p = buf;
+	fd_set set;
+	int rv;
+	struct timeval timeout = {tmo, 0};
+	struct timeval *ptmo = tmo ? &timeout : NULL;
+
+	while (len) {
+		FD_ZERO(&set);
+		FD_SET(fd, &set);
+		rv = select(fd+1, &set, NULL, NULL, ptmo);
+		if (rv <= 0)
+			return -1;
+		rv = read(fd, p, len);
+		if (rv <= 0)
+			return -1;
+		len -= rv;
+		p += rv;
+	}
+	return 0;
+}
+
+
+/* Send one framed message on 'fd': start_magic, a 32-bit length, the
+ * payload (only when the length is positive) and end_magic.
+ * 'tmo' is passed to send_buf() for every part.
+ * Returns 0 if every part was sent, otherwise the first non-zero
+ * result from send_buf(); nothing further is sent after a failure.
+ */
+int send_message(int fd, struct metadata_update *msg, int tmo)
+{
+	__s32 len = msg->len;
+	int rv;
+
+	rv = send_buf(fd, &start_magic, 4, tmo);
+	rv = rv ?: send_buf(fd, &len, 4, tmo);
+	if (len > 0)
+		rv = rv ?: send_buf(fd, msg->buf, msg->len, tmo);
+	/* chain the trailer like the other parts: an unconditional
+	 * assignment here used to hide an earlier failure
+	 */
+	rv = rv ?: send_buf(fd, &end_magic, 4, tmo);
+
+	return rv;
+}
+
+/* Receive one framed message from 'fd' into 'msg'.
+ *
+ * The stream must carry start_magic, a 32-bit length, that many payload
+ * bytes (only when the length is positive) and end_magic.  Lengths above
+ * MSG_MAX_LEN are rejected.
+ *
+ * On success returns 0 with msg->len set to the received length and
+ * msg->buf pointing at a malloc()ed payload (NULL when the length is
+ * zero or negative); the caller owns that buffer.
+ * On failure returns -1.  A payload buffer allocated along the way is
+ * freed and msg->buf is reset to NULL so the caller is not left holding
+ * a dangling pointer.
+ */
+int receive_message(int fd, struct metadata_update *msg, int tmo)
+{
+	__u32 magic;
+	__s32 len;
+	int rv;
+
+	rv = recv_buf(fd, &magic, 4, tmo);
+	if (rv < 0 || magic != start_magic)
+		return -1;
+	rv = recv_buf(fd, &len, 4, tmo);
+	if (rv < 0 || len > MSG_MAX_LEN)
+		return -1;
+	if (len > 0) {
+		msg->buf = malloc(len);
+		if (msg->buf == NULL)
+			return -1;
+		rv = recv_buf(fd, msg->buf, len, tmo);
+		if (rv < 0)
+			goto fail;
+	} else
+		msg->buf = NULL;
+	rv = recv_buf(fd, &magic, 4, tmo);
+	if (rv < 0 || magic != end_magic)
+		goto fail;
+	msg->len = len;
+	return 0;
+
+fail:
+	free(msg->buf);
+	msg->buf = NULL;
+	return -1;
+}
+
+/* Send an empty (zero-length) message on 'fd' and hand back the result
+ * of send_message().
+ */
+int ack(int fd, int tmo)
+{
+	struct metadata_update empty = { .len = 0 };
+	int rv = send_message(fd, &empty, tmo);
+
+	return rv;
+}
+
+/* Wait for one message on 'fd' and discard it.
+ *
+ * Returns whatever receive_message() returned: 0 when a well-formed
+ * message arrived, -1 otherwise.
+ */
+int wait_reply(int fd, int tmo)
+{
+	struct metadata_update msg;
+	int err = receive_message(fd, &msg, tmo);
+
+	/* the caller only cares that a reply arrived; release any payload
+	 * that came with it instead of leaking it (msg.buf is NULL when
+	 * there was none)
+	 */
+	if (err == 0)
+		free(msg.buf);
+
+	return err;
+}
+
+/* Connect to the unix socket /var/run/mdadm/<name>.sock.
+ *
+ * When is_subarray() accepts 'devname', its first character is skipped
+ * and <name> is the text up to the next '/'; otherwise <name> is
+ * 'devname' itself.
+ *
+ * Returns the connected socket descriptor (switched to non-blocking mode
+ * when possible), or -1 if the name is malformed, the resulting path
+ * does not fit in sockaddr_un.sun_path, or the socket cannot be created
+ * or connected.
+ */
+int connect_monitor(char *devname)
+{
+	struct sockaddr_un addr;
+	const char *name = devname;
+	int name_len;
+	int sfd;
+	int n;
+	long fl;
+
+	if (is_subarray(devname)) {
+		char *c;
+
+		name = devname + 1;
+		c = strchr(name, '/');
+		if (!c)
+			return -1;
+		name_len = c - name;
+	} else
+		name_len = (int)strlen(name);
+
+	/* build the path straight into the address, with a bounds check
+	 * instead of unchecked sprintf()/strcpy()
+	 */
+	memset(&addr, 0, sizeof(addr));
+	addr.sun_family = PF_LOCAL;
+	n = snprintf(addr.sun_path, sizeof(addr.sun_path),
+		     "/var/run/mdadm/%.*s.sock", name_len, name);
+	if (n < 0 || (size_t)n >= sizeof(addr.sun_path))
+		return -1;
+
+	sfd = socket(PF_LOCAL, SOCK_STREAM, 0);
+	if (sfd < 0)
+		return -1;
+
+	if (connect(sfd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
+		close(sfd);
+		return -1;
+	}
+
+	/* non-blocking mode stays best effort, but a failed F_GETFL must
+	 * not be turned into a flag word for F_SETFL
+	 */
+	fl = fcntl(sfd, F_GETFL, 0);
+	if (fl >= 0)
+		fcntl(sfd, F_SETFL, fl | O_NONBLOCK);
+
+	return sfd;
+}
+
+/* Give the monitor a chance to update the metadata: send it an empty
+ * message and wait for its reply.  Returns 0 when both steps succeed,
+ * otherwise a negative value.
+ */
+int ping_monitor(char *devname)
+{
+	int rv = -1;
+	int sock = connect_monitor(devname);
+
+	if (sock < 0)
+		return sock;
+
+	/* ping the existing socket, then check that an answer comes back */
+	if (ack(sock, 20) == 0 && wait_reply(sock, 20) == 0)
+		rv = 0;
+
+	close(sock);
+	return rv;
+}
+
+/* Give the manager a chance to look at the updated container state.
+ * It would notice the change in /proc/mdstat anyway; pinging encourages
+ * that detection to happen while an exclusive open() on the container
+ * is still active.
+ *
+ * A message with length -1 is sent and one reply is awaited.
+ */
+int ping_manager(char *devname)
+{
+	struct metadata_update ping = { .len = -1 };
+	int sock = connect_monitor(devname);
+	int rv;
+
+	if (sock < 0)
+		return sock;
+
+	rv = send_message(sock, &ping, 20);
+
+	/* the reply is only worth waiting for if the ping went out */
+	if (rv == 0 && wait_reply(sock, 20) != 0)
+		rv = -1;
+
+	close(sock);
+	return rv;
+}
--- /dev/null
+/*
+ * Copyright (C) 2008 Intel Corporation
+ *
+ * mdmon socket / message handling
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+
+/* forward declarations */
+struct mdinfo;
+struct metadata_update;
+
+/* Framed message helpers for the mdmon socket.  'tmo' is the timeout in
+ * seconds used for each select() wait; 0 means wait without a timeout.
+ * All of these return 0 on success and a negative value on failure,
+ * except connect_monitor(), which returns a socket descriptor or -1.
+ */
+extern int receive_message(int fd, struct metadata_update *msg, int tmo);
+extern int send_message(int fd, struct metadata_update *msg, int tmo);
+extern int ack(int fd, int tmo);
+extern int wait_reply(int fd, int tmo);
+extern int connect_monitor(char *devname);
+extern int ping_monitor(char *devname);
+extern int ping_manager(char *devname);
+
+#define MSG_MAX_LEN (4*1024*1024) /* receive_message() rejects longer payloads */
--- /dev/null
+/*
+ * Intel(R) Matrix Storage Manager hardware and firmware support routines
+ *
+ * Copyright (C) 2008 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+#include "mdadm.h"
+#include "platform-intel.h"
+#include "probe_roms.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <dirent.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+/* Release every node of the list headed by '*list', together with each
+ * node's path string, and leave '*list' set to NULL.
+ */
+void free_sys_dev(struct sys_dev **list)
+{
+	struct sys_dev *cur = *list;
+
+	while (cur) {
+		struct sys_dev *following = cur->next;
+
+		free(cur->path);
+		free(cur);
+		cur = following;
+	}
+	*list = NULL;
+}
+
+/* Search sysfs for devices driven by 'driver' on 'bus'.
+ *
+ * Every entry of /sys/bus/<bus>/drivers/<driver> whose 'subsystem' link
+ * ends in a component starting with 'bus' yields one list node holding
+ * the canonical path of that entry (NULL if it cannot be resolved).
+ *
+ * Returns the head of a list to be released with free_sys_dev(), or NULL
+ * when the driver directory cannot be opened, nothing matched, or an
+ * allocation failed (nodes collected so far are freed in that case).
+ */
+struct sys_dev *find_driver_devices(const char *bus, const char *driver)
+{
+	static const char suffix[] = "/subsystem";
+	char path[256];
+	char link[256];
+	char *c;
+	ssize_t n;
+	int len;
+	DIR *driver_dir;
+	struct dirent *de;
+	struct sys_dev *head = NULL;
+	struct sys_dev *tail = NULL;
+
+	len = snprintf(path, sizeof(path), "/sys/bus/%s/drivers/%s", bus, driver);
+	if (len < 0 || (size_t)len >= sizeof(path))
+		return NULL;
+	driver_dir = opendir(path);
+	if (!driver_dir)
+		return NULL;
+	for (de = readdir(driver_dir); de; de = readdir(driver_dir)) {
+		struct sys_dev *node;
+
+		/* is 'de' a device? check that the 'subsystem' link exists and
+		 * that its target matches 'bus'
+		 */
+		len = snprintf(path, sizeof(path), "/sys/bus/%s/drivers/%s/%s%s",
+			       bus, driver, de->d_name, suffix);
+		if (len < 0 || (size_t)len >= sizeof(path))
+			continue; /* name does not fit: cannot be examined */
+		/* readlink() does not NUL-terminate; leave room and do it here */
+		n = readlink(path, link, sizeof(link) - 1);
+		if (n < 0)
+			continue;
+		link[n] = '\0';
+		c = strrchr(link, '/');
+		if (!c)
+			continue;
+		if (strncmp(bus, c+1, strlen(bus)) != 0)
+			continue;
+
+		node = malloc(sizeof(*node));
+		if (!node) {
+			free_sys_dev(&head);
+			break;
+		}
+
+		/* drop the "/subsystem" suffix again to get the entry itself,
+		 * then generate its canonical path name
+		 */
+		path[len - (int)(sizeof(suffix) - 1)] = '\0';
+		node->path = canonicalize_file_name(path);
+		node->next = NULL;
+
+		/* start / extend the list */
+		if (tail)
+			tail->next = node;
+		else
+			head = node;
+		tail = node;
+	}
+	/* the directory stream was previously never closed */
+	closedir(driver_dir);
+
+	return head;
+}
+
+/* Read the hexadecimal id stored in the file <dev_path>/vendor.
+ *
+ * Exactly 7 bytes are expected from the file; the last one is replaced
+ * by a terminator and the remainder is parsed as a base-16 number.
+ *
+ * Returns the id, or 0xffff when 'dev_path' is NULL, the file cannot be
+ * opened, or fewer than 7 bytes could be read.
+ */
+__u16 devpath_to_vendor(const char *dev_path)
+{
+	char vendor[7];
+	__u16 id = 0xffff;
+	int fd;
+	int n;
+
+	/* find_driver_devices() may hand out nodes whose path is NULL */
+	if (!dev_path)
+		return id;
+
+	char path[strlen(dev_path) + strlen("/vendor") + 1];
+
+	sprintf(path, "%s/vendor", dev_path);
+
+	fd = open(path, O_RDONLY);
+	if (fd < 0)
+		return 0xffff;
+
+	n = read(fd, vendor, sizeof(vendor));
+	if (n == sizeof(vendor)) {
+		vendor[n - 1] = '\0';
+		id = strtoul(vendor, NULL, 16);
+	}
+	close(fd);
+
+	return id;
+}
+
+/* Returns 1 if any device bound to the pci "ahci" driver reports vendor
+ * id 0x8086, otherwise 0.
+ */
+static int platform_has_intel_ahci(void)
+{
+	struct sys_dev *ahci_list = find_driver_devices("pci", "ahci");
+	struct sys_dev *cur = ahci_list;
+	int found = 0;
+
+	while (cur && !found) {
+		if (devpath_to_vendor(cur->path) == 0x8086)
+			found = 1;
+		cur = cur->next;
+	}
+
+	free_sys_dev(&ahci_list);
+
+	return found;
+}
+
+
+/* copy of the IMSM option-rom table most recently located by scan(), or
+ * the synthetic table built by find_imsm_orom()
+ */
+static struct imsm_orom imsm_orom;
+
+/* Look for the "$VER" signature in the memory between 'start' and 'end',
+ * testing every fourth byte offset.  On a match the table found there is
+ * copied into imsm_orom.
+ *
+ * Offsets so close to 'end' that a whole struct imsm_orom would not fit
+ * are not examined, so the copy never reads beyond 'end'.
+ *
+ * Returns 1 if a table was found, otherwise 0.
+ */
+static int scan(const void *start, const void *end)
+{
+	/* byte pointers: arithmetic on 'void *' is a GNU extension */
+	const char *base = start;
+	int len = (const char *)end - base;
+	int offset;
+
+	for (offset = 0; offset + (int)sizeof(imsm_orom) <= len; offset += 4) {
+		if (memcmp(base + offset, "$VER", 4) == 0) {
+			memcpy(&imsm_orom, base + offset, sizeof(imsm_orom));
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
+/* Return the IMSM option-rom table, or NULL if none is available.
+ *
+ * The table is static data, so it is looked up once and remembered.
+ * With IMSM_TEST_OROM set in the environment a synthetic table is
+ * built; otherwise an Intel AHCI controller must be present and the
+ * adapter ROMs are scanned for the table.
+ */
+const struct imsm_orom *find_imsm_orom(void)
+{
+	static int populated = 0;
+
+	if (!populated) {
+		if (check_env("IMSM_TEST_OROM")) {
+			const __u16 rlc = IMSM_OROM_RLC_RAID0 |
+					  IMSM_OROM_RLC_RAID1 |
+					  IMSM_OROM_RLC_RAID10 |
+					  IMSM_OROM_RLC_RAID5;
+
+			/* every member not named here ends up zero */
+			imsm_orom = (struct imsm_orom) {
+				.rlc = rlc,
+				.sss = IMSM_OROM_SSS_4kB | IMSM_OROM_SSS_8kB |
+				       IMSM_OROM_SSS_16kB | IMSM_OROM_SSS_32kB |
+				       IMSM_OROM_SSS_64kB | IMSM_OROM_SSS_128kB |
+				       IMSM_OROM_SSS_256kB | IMSM_OROM_SSS_512kB |
+				       IMSM_OROM_SSS_1MB | IMSM_OROM_SSS_2MB,
+				.dpa = 6,
+				.tds = 6,
+				.vpa = 2,
+				.vphba = 4,
+				.attr = rlc | IMSM_OROM_ATTR_ChecksumVerify,
+			};
+			populated = 1;
+		} else if (platform_has_intel_ahci() && probe_roms_init() == 0) {
+			/* scan option-rom memory looking for an imsm signature */
+			probe_roms();
+			populated = scan_adapter_roms(scan);
+			probe_roms_exit();
+		}
+	}
+
+	return populated ? &imsm_orom : NULL;
+}
+
+/* Resolve /sys/dev/block/<major>:<minor>/device to its canonical path.
+ *
+ * Returns the string produced by canonicalize_file_name(), which the
+ * callers in this file release with free(), or NULL when the path
+ * cannot be built or resolved.
+ */
+char *devt_to_devpath(dev_t dev)
+{
+	/* sized for two full-width unsigned numbers */
+	char device[48];
+	int n;
+
+	n = snprintf(device, sizeof(device), "/sys/dev/block/%u:%u/device",
+		     (unsigned int)major(dev), (unsigned int)minor(dev));
+	if (n < 0 || (size_t)n >= sizeof(device))
+		return NULL;
+	return canonicalize_file_name(device);
+}
+
+/* Return the device path for the block device open on 'fd'.  NULL is
+ * returned when fstat() fails, when 'fd' is not a block device, or when
+ * devt_to_devpath() finds no path (the original note says this is the
+ * case for a partition).
+ */
+static char *diskfd_to_devpath(int fd)
+{
+	struct stat sb;
+
+	if (fstat(fd, &sb) == 0 && S_ISBLK(sb.st_mode))
+		return devt_to_devpath(sb.st_rdev);
+
+	return NULL;
+}
+
+/* Returns 1 when 'disk_path' begins with 'hba_path', otherwise 0.
+ * A NULL argument on either side gives 0.
+ */
+int path_attached_to_hba(const char *disk_path, const char *hba_path)
+{
+	if (disk_path == NULL || hba_path == NULL)
+		return 0;
+
+	return strncmp(disk_path, hba_path, strlen(hba_path)) == 0;
+}
+
+/* Returns 1 when the sysfs device path of block device 'dev' begins with
+ * 'hba_path', otherwise 0 (also when no path could be determined).
+ */
+int devt_attached_to_hba(dev_t dev, const char *hba_path)
+{
+	char *resolved = devt_to_devpath(dev);
+	int attached = path_attached_to_hba(resolved, hba_path);
+
+	free(resolved);
+
+	return attached;
+}
+
+/* Returns 1 when the sysfs device path of the disk open on 'fd' begins
+ * with 'hba_path', otherwise 0 (also when no path could be determined).
+ */
+int disk_attached_to_hba(int fd, const char *hba_path)
+{
+	char *resolved = diskfd_to_devpath(fd);
+	int attached = path_attached_to_hba(resolved, hba_path);
+
+	free(resolved);
+
+	return attached;
+}
+
--- /dev/null
+/*
+ * Intel(R) Matrix Storage Manager hardware and firmware support routines
+ *
+ * Copyright (C) 2008 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+#include <asm/types.h>
+#include <strings.h>
+
+/* The IMSM OROM Version Table definition.
+ *
+ * The struct is packed and is copied straight out of option-rom memory,
+ * so the member order and widths must not change.  The bit definitions
+ * for each bitmap member follow that member.
+ */
+struct imsm_orom {
+	__u8 signature[4];
+	__u8 table_ver_major; /* Currently 2 (can change with future revs) */
+	__u8 table_ver_minor; /* Currently 2 (can change with future revs) */
+	__u16 major_ver; /* Example: 8 as in 8.6.0.1020 */
+	__u16 minor_ver; /* Example: 6 as in 8.6.0.1020 */
+	__u16 hotfix_ver; /* Example: 0 as in 8.6.0.1020 */
+	__u16 build; /* Example: 1020 as in 8.6.0.1020 */
+	__u8 len; /* number of bytes in this entire table */
+	__u8 checksum; /* checksum of all the bytes in this table */
+	__u16 rlc; /* RAID Level Capability */
+	/* we assume the cpu is x86 as the orom should not be found
+	 * anywhere else
+	 */
+	#define IMSM_OROM_RLC_RAID0 (1 << 0)
+	#define IMSM_OROM_RLC_RAID1 (1 << 1)
+	#define IMSM_OROM_RLC_RAID10 (1 << 2)
+	#define IMSM_OROM_RLC_RAID1E (1 << 3)
+	#define IMSM_OROM_RLC_RAID5 (1 << 4)
+	#define IMSM_OROM_RLC_RAID_CNG (1 << 5)
+	__u16 sss; /* Strip Size Supported */
+	#define IMSM_OROM_SSS_2kB (1 << 0)
+	#define IMSM_OROM_SSS_4kB (1 << 1)
+	#define IMSM_OROM_SSS_8kB (1 << 2)
+	#define IMSM_OROM_SSS_16kB (1 << 3)
+	#define IMSM_OROM_SSS_32kB (1 << 4)
+	#define IMSM_OROM_SSS_64kB (1 << 5)
+	#define IMSM_OROM_SSS_128kB (1 << 6)
+	#define IMSM_OROM_SSS_256kB (1 << 7)
+	#define IMSM_OROM_SSS_512kB (1 << 8)
+	#define IMSM_OROM_SSS_1MB (1 << 9)
+	#define IMSM_OROM_SSS_2MB (1 << 10)
+	#define IMSM_OROM_SSS_4MB (1 << 11)
+	#define IMSM_OROM_SSS_8MB (1 << 12)
+	#define IMSM_OROM_SSS_16MB (1 << 13)
+	#define IMSM_OROM_SSS_32MB (1 << 14)
+	#define IMSM_OROM_SSS_64MB (1 << 15)
+	__u16 dpa; /* Disks Per Array supported */
+	__u16 tds; /* Total Disks Supported */
+	__u8 vpa; /* # Volumes Per Array supported */
+	__u8 vphba; /* # Volumes Per Host Bus Adapter supported */
+	/* Attributes supported. This should map to the
+	 * attributes in the MPB. Also, lower 16 bits
+	 * should match/duplicate RLC bits above.
+	 */
+	__u32 attr;
+	#define IMSM_OROM_ATTR_RAID0 IMSM_OROM_RLC_RAID0
+	#define IMSM_OROM_ATTR_RAID1 IMSM_OROM_RLC_RAID1
+	#define IMSM_OROM_ATTR_RAID10 IMSM_OROM_RLC_RAID10
+	#define IMSM_OROM_ATTR_RAID1E IMSM_OROM_RLC_RAID1E
+	#define IMSM_OROM_ATTR_RAID5 IMSM_OROM_RLC_RAID5
+	#define IMSM_OROM_ATTR_RAID_CNG IMSM_OROM_RLC_RAID_CNG
+	#define IMSM_OROM_ATTR_2TB (1 << 29)
+	#define IMSM_OROM_ATTR_PM (1 << 30)
+	/* unsigned constant: shifting a signed 1 into bit 31 is undefined */
+	#define IMSM_OROM_ATTR_ChecksumVerify (1U << 31)
+	__u32 reserved1;
+	__u32 reserved2;
+} __attribute__((packed));
+
+/* Each helper below returns 1 when the corresponding RAID level bit is
+ * set in orom->rlc, otherwise 0.
+ */
+static inline int imsm_orom_has_raid0(const struct imsm_orom *orom)
+{
+	return (orom->rlc & IMSM_OROM_RLC_RAID0) != 0;
+}
+
+static inline int imsm_orom_has_raid1(const struct imsm_orom *orom)
+{
+	return (orom->rlc & IMSM_OROM_RLC_RAID1) != 0;
+}
+
+static inline int imsm_orom_has_raid1e(const struct imsm_orom *orom)
+{
+	return (orom->rlc & IMSM_OROM_RLC_RAID1E) != 0;
+}
+
+static inline int imsm_orom_has_raid10(const struct imsm_orom *orom)
+{
+	return (orom->rlc & IMSM_OROM_RLC_RAID10) != 0;
+}
+
+static inline int imsm_orom_has_raid5(const struct imsm_orom *orom)
+{
+	return (orom->rlc & IMSM_OROM_RLC_RAID5) != 0;
+}
+
+/**
+ * imsm_orom_has_chunk - check if the orom supports the given chunk size
+ * @orom: orom pointer from find_imsm_orom
+ * @chunk: chunk size in kibibytes
+ *
+ * Only the lowest set bit of @chunk is looked at.  Bit 0 of orom->sss
+ * stands for 2kB, bit 1 for 4kB and so on, so a 1kB chunk has no bit
+ * of its own.
+ *
+ * Returns 1 when the matching bit is set in orom->sss; 0 otherwise,
+ * including for @chunk == 0 and for a 1kB chunk.
+ */
+static inline int imsm_orom_has_chunk(const struct imsm_orom *orom, int chunk)
+{
+	int fs = ffs(chunk);
+
+	/* 0: no bit set at all; 1: a 1kB chunk, which used to end up as a
+	 * shift by -1
+	 */
+	if (fs < 2)
+		return 0;
+	/* ffs() counts from 1, and the sss bitmap starts at 2kB */
+	return !!(orom->sss & (1 << (fs - 2)));
+}
+
+/* singly linked list node as built by find_driver_devices() */
+struct sys_dev {
+ char *path; /* canonical sysfs path (may be NULL); freed by free_sys_dev() */
+ struct sys_dev *next; /* next node, NULL at the end of the list */
+};
+
+/* list of devices bound to 'driver' on 'bus'; release with free_sys_dev() */
+struct sys_dev *find_driver_devices(const char *bus, const char *driver);
+/* id read from <dev_path>/vendor, 0xffff when it cannot be read */
+__u16 devpath_to_vendor(const char *dev_path);
+/* frees the whole list and sets *list to NULL */
+void free_sys_dev(struct sys_dev **list);
+/* IMSM option-rom table, or NULL if none was found */
+const struct imsm_orom *find_imsm_orom(void);
+/* 1 if the disk open on 'fd' has a sysfs path starting with 'hba_path' */
+int disk_attached_to_hba(int fd, const char *hba_path);
+/* canonical path of /sys/dev/block/<major>:<minor>/device; callers free() it */
+char *devt_to_devpath(dev_t dev);
+/* 1 if 'disk_path' starts with 'hba_path', 0 otherwise or if either is NULL */
+int path_attached_to_hba(const char *disk_path, const char *hba_path);
--- /dev/null
+/*
+ * probe_roms - scan for Adapter ROMS
+ *
+ * (based on linux-2.6:arch/x86/kernel/probe_roms_32.c)
+ *
+ * Copyright (C) 2008 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "probe_roms.h"
+#include <unistd.h>
+#include <signal.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <asm/types.h>
+
+/* mapping of the option-rom window of /dev/mem; MAP_FAILED while unmapped */
+static void *rom_mem = MAP_FAILED;
+/* descriptor for /dev/mem; -1 while closed */
+static int rom_fd = -1;
+static const int rom_len = 0xf0000 - 0xc0000; /* option-rom memory region */
+/* set by the SIGBUS handler and checked after every probing read; it is
+ * shared with a signal handler, hence volatile sig_atomic_t
+ */
+static volatile sig_atomic_t _sigbus;
+#define ARRAY_SIZE(x) (sizeof(x)/sizeof((x)[0]))
+
+/* SIGBUS handler: only records that the signal arrived */
+static void sigbus(int sig)
+{
+	(void)sig;
+	_sigbus = 1;
+}
+
+/* Read one byte from 'ptr' into '*val'.  Returns -1 when the SIGBUS flag
+ * is found set after the read, otherwise 0; the flag is cleared either
+ * way.
+ */
+static int probe_address8(const __u8 *ptr, __u8 *val)
+{
+	int faulted;
+
+	*val = *ptr;
+	faulted = _sigbus != 0;
+	_sigbus = 0;
+
+	return faulted ? -1 : 0;
+}
+
+/* Read one 16-bit word from 'ptr' into '*val'.  Returns -1 when the
+ * SIGBUS flag is found set after the read, otherwise 0; the flag is
+ * cleared either way.
+ */
+static int probe_address16(const __u16 *ptr, __u16 *val)
+{
+	int faulted;
+
+	*val = *ptr;
+	faulted = _sigbus != 0;
+	_sigbus = 0;
+
+	return faulted ? -1 : 0;
+}
+
+/* Undo probe_roms_init(): restore the default SIGBUS action, close the
+ * /dev/mem descriptor and drop the mapping.  Safe to call when nothing
+ * is open or mapped.
+ */
+void probe_roms_exit(void)
+{
+	signal(SIGBUS, SIG_DFL);
+
+	if (rom_fd >= 0)
+		close(rom_fd);
+	rom_fd = -1;
+
+	if (rom_mem != MAP_FAILED)
+		munmap(rom_mem, rom_len);
+	rom_mem = MAP_FAILED;
+}
+
+/* Prepare for ROM probing: install the SIGBUS handler, open /dev/mem
+ * read-only and map rom_len bytes starting at physical offset 0xc0000.
+ *
+ * Returns 0 on success.  On any failure everything set up so far is
+ * undone and -1 is returned.
+ */
+int probe_roms_init(void)
+{
+	int fd;
+
+	if (signal(SIGBUS, sigbus) == SIG_ERR)
+		goto fail;
+
+	fd = open("/dev/mem", O_RDONLY);
+	if (fd < 0)
+		goto fail;
+
+	rom_mem = mmap(NULL, rom_len, PROT_READ, MAP_PRIVATE, fd, 0xc0000);
+	if (rom_mem == MAP_FAILED) {
+		/* rom_fd is not set yet, so probe_roms_exit() would not
+		 * close this descriptor
+		 */
+		close(fd);
+		goto fail;
+	}
+
+	rom_fd = fd;
+	return 0;
+
+fail:
+	probe_roms_exit();
+	return -1;
+}
+
+/**
+ * isa_bus_to_virt - convert physical address to mmap'd region
+ * @addr - address to convert
+ *
+ * The result is rom_mem plus the distance of @addr from 0xc0000, so it
+ * is only valid between a successful call to probe_roms_init and the
+ * corresponding probe_roms_exit
+ */
+static void *isa_bus_to_virt(unsigned long addr)
+{
+	unsigned long offset = addr - 0xc0000;
+
+	return (char *)rom_mem + offset;
+}
+
+/* a labelled range of physical addresses; 'end' is the last byte */
+struct resource {
+ unsigned long start;
+ unsigned long end;
+ const char *name;
+};
+
+/* probe_roms() stops looking for adapter ROMs at the start of this range */
+static struct resource system_rom_resource = {
+ .name = "System ROM",
+ .start = 0xf0000,
+ .end = 0xfffff,
+};
+
+/* if probe_roms() finds a valid ROM here, the adapter search stops at
+ * the start of this range instead
+ */
+static struct resource extension_rom_resource = {
+ .name = "Extension ROM",
+ .start = 0xe0000,
+ .end = 0xeffff,
+};
+
+/* Slots filled in by probe_roms().  scan_adapter_roms() stops at the
+ * first slot whose start is 0.  The start of slot 0 also serves as the
+ * upper bound of the video ROM search.
+ */
+static struct resource adapter_rom_resources[] = { {
+ .name = "Adapter ROM",
+ .start = 0xc8000,
+ .end = 0,
+}, {
+ .name = "Adapter ROM",
+ .start = 0,
+ .end = 0,
+}, {
+ .name = "Adapter ROM",
+ .start = 0,
+ .end = 0,
+}, {
+ .name = "Adapter ROM",
+ .start = 0,
+ .end = 0,
+}, {
+ .name = "Adapter ROM",
+ .start = 0,
+ .end = 0,
+}, {
+ .name = "Adapter ROM",
+ .start = 0,
+ .end = 0,
+} };
+
+/* default range; probe_roms() updates start and end from what it finds */
+static struct resource video_rom_resource = {
+ .name = "Video ROM",
+ .start = 0xc0000,
+ .end = 0xc7fff,
+};
+
+#define ROMSIGNATURE 0xaa55
+
+/* Returns non-zero when the 16-bit word at 'rom' can be read and equals
+ * ROMSIGNATURE.
+ */
+static int romsignature(const unsigned char *rom)
+{
+	unsigned short word = 0;
+
+	if (probe_address16((const unsigned short *)rom, &word) != 0)
+		return 0;
+
+	return word == ROMSIGNATURE;
+}
+
+/* Returns non-zero when all 'length' bytes starting at 'rom' could be
+ * read and their sum, truncated to 8 bits, is zero.
+ */
+static int romchecksum(const unsigned char *rom, unsigned long length)
+{
+	unsigned char sum = 0;
+	unsigned char byte;
+
+	while (length) {
+		if (probe_address8(rom++, &byte) != 0)
+			break;
+		sum += byte;
+		length--;
+	}
+
+	return !length && !sum;
+}
+
+/* Let 'fn' examine each of the adapter roms found by probe_roms(), in
+ * slot order, until it reports a hit or a slot with start 0 is reached.
+ *
+ * Returns the first non-zero value 'fn' returned, or 0 if there was none
+ * or the ROM window is not open.
+ */
+int scan_adapter_roms(scan_fn fn)
+{
+	int i;
+
+	if (rom_fd < 0)
+		return 0;
+
+	for (i = 0; i < ARRAY_SIZE(adapter_rom_resources); i++) {
+		struct resource *res = &adapter_rom_resources[i];
+		int found;
+
+		if (!res->start)
+			break;
+
+		found = fn(isa_bus_to_virt(res->start),
+			   isa_bus_to_virt(res->end));
+		if (found)
+			return found;
+	}
+
+	return 0;
+}
+
+/* Walk the option-rom window in 2048-byte steps, record the extent of the
+ * video ROM in video_rom_resource and the extents of adapter ROMs in
+ * adapter_rom_resources.  Does nothing unless probe_roms_init() has set
+ * rom_fd.
+ */
+void probe_roms(void)
+{
+ const void *rom;
+ unsigned long start, length, upper;
+ unsigned char c;
+ int i;
+
+ if (rom_fd < 0)
+ return;
+
+ /* video rom */
+ upper = adapter_rom_resources[0].start;
+ for (start = video_rom_resource.start; start < upper; start += 2048) {
+ rom = isa_bus_to_virt(start);
+ if (!romsignature(rom))
+ continue;
+
+ video_rom_resource.start = start;
+
+ /* byte 2 of the ROM holds its length in 512-byte units */
+ if (probe_address8(rom + 2, &c) != 0)
+ continue;
+
+ /* 0 < length <= 0x7f * 512, historically */
+ length = c * 512;
+
+ /* if checksum okay, trust length byte */
+ if (length && romchecksum(rom, length))
+ video_rom_resource.end = start + length - 1;
+ break; /* unconditional: the search ends at the first readable header */
+ }
+
+ /* first 2k boundary after the video ROM, but not below 'upper' */
+ start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
+ if (start < upper)
+ start = upper;
+
+ /* system rom */
+ upper = system_rom_resource.start;
+
+ /* check for extension rom (ignore length byte!) */
+ rom = isa_bus_to_virt(extension_rom_resource.start);
+ if (romsignature(rom)) {
+ length = extension_rom_resource.end - extension_rom_resource.start + 1;
+ if (romchecksum(rom, length))
+ upper = extension_rom_resource.start;
+ }
+
+ /* check for adapter roms on 2k boundaries */
+ for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
+ rom = isa_bus_to_virt(start);
+ if (!romsignature(rom))
+ continue;
+
+ if (probe_address8(rom + 2, &c) != 0)
+ continue;
+
+ /* 0 < length <= 0x7f * 512, historically */
+ length = c * 512;
+
+ /* but accept any length that fits if checksum okay */
+ if (!length || start + length > upper || !romchecksum(rom, length))
+ continue;
+
+ adapter_rom_resources[i].start = start;
+ adapter_rom_resources[i].end = start + length - 1;
+
+ /* round the last byte down to a 2k boundary; the loop increment
+ * then moves on to the first boundary past this ROM
+ */
+ start = adapter_rom_resources[i++].end & ~2047UL;
+ }
+}
+
--- /dev/null
+/*
+ * probe_roms - scan for Adapter ROMS
+ *
+ * Copyright (C) 2008 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+/* undo probe_roms_init(): restore SIGBUS handling, close and unmap */
+void probe_roms_exit(void);
+/* open and map the option-rom window; 0 on success, -1 on failure */
+int probe_roms_init(void);
+/* callback handed the first and last byte address of one adapter ROM;
+ * a non-zero return stops the scan
+ */
+typedef int (*scan_fn)(const void *start, const void *end);
+/* run 'fn' over the ROMs recorded by probe_roms(); returns fn's first
+ * non-zero result, else 0
+ */
+int scan_adapter_roms(scan_fn fn);
+/* locate video and adapter ROMs inside the mapped window */
+void probe_roms(void);
/*
* mdadm - manage Linux "md" devices aka RAID arrays.
*
- * Copyright (C) 2006 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2006-2009 Neil Brown <neilb@suse.de>
*
*
* This program is free software; you can redistribute it and/or modify
int nwrites, int *dest,
unsigned long long start, unsigned long long length)
{
- char buf[8192];
+ char abuf[8192+512];
+ char *buf = (char*)(((unsigned long)abuf+511)&~511UL);
int cpos = start % chunk_size; /* where in chunk we are up to */
int len;
int data_disks = raid_disks - (level == 0 ? 0 : level <=5 ? 1 : 2);
unsigned long long offset;
int i;
len = chunk_size - cpos;
- if (len > sizeof(buf)) len = sizeof(buf);
+ if (len > 8192) len = 8192;
if (len > length) len = length;
/* len bytes to be moved from one device */
--- /dev/null
+/*
+ * Copyright (C) 2007-2008 Intel Corporation
+ *
+ * Retrieve drive serial numbers for scsi disks
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+#include <string.h>
+#include <scsi/scsi.h>
+#include <scsi/sg.h>
+#include <sys/ioctl.h>
+
+/* Send a 6-byte INQUIRY command (bytes 1 and 2 set to 1 and 0x80) to
+ * 'fd' through the SG_IO ioctl, asking for up to 'buf_len' bytes of
+ * response in 'buf'.  Per the file header this is how the drive serial
+ * number is retrieved.
+ *
+ * Returns the result of ioctl().
+ */
+int scsi_get_serial(int fd, void *buf, size_t buf_len)
+{
+	unsigned char cdb[] = {INQUIRY, 1, 0x80, 0, buf_len, 0};
+	unsigned char sense_buf[32];
+	struct sg_io_hdr hdr;
+
+	memset(&hdr, 0, sizeof(hdr));
+
+	hdr.interface_id = 'S';
+	hdr.timeout = 5000;
+
+	/* command block */
+	hdr.cmdp = cdb;
+	hdr.cmd_len = sizeof(cdb);
+
+	/* data comes back from the device into the caller's buffer */
+	hdr.dxfer_direction = SG_DXFER_FROM_DEV;
+	hdr.dxferp = buf;
+	hdr.dxfer_len = buf_len;
+
+	/* sense data */
+	hdr.sbp = sense_buf;
+	hdr.mx_sb_len = sizeof(sense_buf);
+
+	return ioctl(fd, SG_IO, &hdr);
+}
--- /dev/null
+/*
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2006-2009 Neil Brown <neilb@suse.de>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: Neil Brown
+ * Email: <neil@brown.name>
+ *
+ * Specifications for DDF taken from Common RAID DDF Specification Revision 1.2
+ * (July 28 2006). Reused by permission of SNIA.
+ */
+
+#define HAVE_STDINT_H 1
+#include "mdadm.h"
+#include "mdmon.h"
+#include "sha1.h"
+#include <values.h>
+
+/* a non-official T10 name for creation GUIDs */
+static char T10[] = "Linux-MD";
+
+/* DDF timestamps are 1980 based, so we need to add the number of
+ * seconds in the decade of the seventies to convert to linux timestamps.
+ * 10 years with 2 leap years.
+ */
+#define DECADE (3600*24*(365*10+2))
+unsigned long crc32(
+ unsigned long crc,
+ const unsigned char *buf,
+ unsigned len);
+
+/* The DDF metadata handling.
+ * DDF metadata lives at the end of the device.
+ * The last 512 byte block provides an 'anchor' which is used to locate
+ * the rest of the metadata which usually lives immediately behind the anchor.
+ *
+ * Note:
+ * - all multibyte numeric fields are bigendian.
+ * - all strings are space padded.
+ *
+ */
+
+/* Primary Raid Level (PRL) */
+#define DDF_RAID0 0x00
+#define DDF_RAID1 0x01
+#define DDF_RAID3 0x03
+#define DDF_RAID4 0x04
+#define DDF_RAID5 0x05
+#define DDF_RAID1E 0x11
+#define DDF_JBOD 0x0f
+#define DDF_CONCAT 0x1f
+#define DDF_RAID5E 0x15
+#define DDF_RAID5EE 0x25
+#define DDF_RAID6 0x06
+
+/* Raid Level Qualifier (RLQ) */
+#define DDF_RAID0_SIMPLE 0x00
+#define DDF_RAID1_SIMPLE 0x00 /* just 2 devices in this plex */
+#define DDF_RAID1_MULTI 0x01 /* exactly 3 devices in this plex */
+#define DDF_RAID3_0 0x00 /* parity in first extent */
+#define DDF_RAID3_N 0x01 /* parity in last extent */
+#define DDF_RAID4_0 0x00 /* parity in first extent */
+#define DDF_RAID4_N 0x01 /* parity in last extent */
+/* these apply to raid5e and raid5ee as well */
+#define DDF_RAID5_0_RESTART 0x00 /* same as 'right asymmetric' - layout 1 */
+#define DDF_RAID6_0_RESTART 0x01 /* raid6 different from raid5 here!!! */
+#define DDF_RAID5_N_RESTART 0x02 /* same as 'left asymmetric' - layout 0 */
+#define DDF_RAID5_N_CONTINUE 0x03 /* same as 'left symmetric' - layout 2 */
+
+#define DDF_RAID1E_ADJACENT 0x00 /* raid10 nearcopies==2 */
+#define DDF_RAID1E_OFFSET 0x01 /* raid10 offsetcopies==2 */
+
+/* Secondary RAID Level (SRL) */
+#define DDF_2STRIPED 0x00 /* This is weirder than RAID0 !! */
+#define DDF_2MIRRORED 0x01
+#define DDF_2CONCAT 0x02
+#define DDF_2SPANNED 0x03 /* This is also weird - be careful */
+
+/* Magic numbers */
+#define DDF_HEADER_MAGIC __cpu_to_be32(0xDE11DE11)
+#define DDF_CONTROLLER_MAGIC __cpu_to_be32(0xAD111111)
+#define DDF_PHYS_RECORDS_MAGIC __cpu_to_be32(0x22222222)
+#define DDF_PHYS_DATA_MAGIC __cpu_to_be32(0x33333333)
+#define DDF_VIRT_RECORDS_MAGIC __cpu_to_be32(0xDDDDDDDD)
+#define DDF_VD_CONF_MAGIC __cpu_to_be32(0xEEEEEEEE)
+#define DDF_SPARE_ASSIGN_MAGIC __cpu_to_be32(0x55555555)
+#define DDF_VU_CONF_MAGIC __cpu_to_be32(0x88888888)
+#define DDF_VENDOR_LOG_MAGIC __cpu_to_be32(0x01dBEEF0)
+#define DDF_BBM_LOG_MAGIC __cpu_to_be32(0xABADB10C)
+
+#define DDF_GUID_LEN 24
+#define DDF_REVISION_0 "01.00.00"
+#define DDF_REVISION_2 "01.02.00"
+
+/* DDF header.  The declared members add up to 512 bytes (running totals
+ * are noted inside).  As stated at the top of this section, multibyte
+ * numeric fields are big-endian.
+ */
+struct ddf_header {
+ __u32 magic; /* DDF_HEADER_MAGIC */
+ __u32 crc;
+ char guid[DDF_GUID_LEN];
+ char revision[8]; /* 01.02.00 */
+ __u32 seq; /* starts at '1' */
+ __u32 timestamp;
+ __u8 openflag;
+ __u8 foreignflag;
+ __u8 enforcegroups;
+ __u8 pad0; /* 0xff */
+ __u8 pad1[12]; /* 12 * 0xff */
+ /* 64 bytes so far */
+ __u8 header_ext[32]; /* reserved: fill with 0xff */
+ __u64 primary_lba;
+ __u64 secondary_lba;
+ __u8 type; /* one of the DDF_HEADER_* values */
+ __u8 pad2[3]; /* 0xff */
+ __u32 workspace_len; /* sectors for vendor space -
+ * at least 32768(sectors) */
+ __u64 workspace_lba;
+ __u16 max_pd_entries; /* one of 15, 63, 255, 1023, 4095 */
+ __u16 max_vd_entries; /* 2^(4,6,8,10,12)-1 : i.e. as above */
+ __u16 max_partitions; /* i.e. max num of configuration
+ record entries per disk */
+ __u16 config_record_len; /* 1 +ROUNDUP(max_primary_element_entries
+ *12/512) */
+ __u16 max_primary_element_entries; /* 16, 64, 256, 1024, or 4096 */
+ __u8 pad3[54]; /* 0xff */
+ /* 192 bytes so far */
+ __u32 controller_section_offset;
+ __u32 controller_section_length;
+ __u32 phys_section_offset;
+ __u32 phys_section_length;
+ __u32 virt_section_offset;
+ __u32 virt_section_length;
+ __u32 config_section_offset;
+ __u32 config_section_length;
+ __u32 data_section_offset;
+ __u32 data_section_length;
+ __u32 bbm_section_offset;
+ __u32 bbm_section_length;
+ __u32 diag_space_offset;
+ __u32 diag_space_length;
+ __u32 vendor_offset;
+ __u32 vendor_length;
+ /* 256 bytes so far */
+ __u8 pad4[256]; /* 0xff */
+};
+
+/* type field */
+#define DDF_HEADER_ANCHOR 0x00
+#define DDF_HEADER_PRIMARY 0x01
+#define DDF_HEADER_SECONDARY 0x02
+
+/* The content of the 'controller section' - global scope.
+ * The declared members add up to 512 bytes.
+ */
+struct ddf_controller_data {
+ __u32 magic; /* DDF_CONTROLLER_MAGIC */
+ __u32 crc;
+ char guid[DDF_GUID_LEN];
+ struct controller_type {
+ __u16 vendor_id;
+ __u16 device_id;
+ __u16 sub_vendor_id;
+ __u16 sub_device_id;
+ } type;
+ char product_id[16];
+ __u8 pad[8]; /* 0xff */
+ __u8 vendor_data[448];
+};
+
+/* The content of phys_section - global scope.
+ * A 64-byte header followed by a variable number of 64-byte entries.
+ */
+struct phys_disk {
+ __u32 magic; /* DDF_PHYS_RECORDS_MAGIC */
+ __u32 crc;
+ __u16 used_pdes;
+ __u16 max_pdes;
+ __u8 pad[52];
+ struct phys_disk_entry {
+ char guid[DDF_GUID_LEN];
+ __u32 refnum;
+ __u16 type; /* bitmap, see DDF_Forced_PD_GUID etc. below */
+ __u16 state; /* bitmap, see DDF_Online etc. below */
+ __u64 config_size; /* DDF structures must be after here */
+ char path[18]; /* another horrible structure really */
+ __u8 pad[6];
+ } entries[0]; /* zero-length array: entries follow the header */
+};
+
+/* phys_disk_entry.type is a bitmap - bigendian remember */
+#define DDF_Forced_PD_GUID 1
+#define DDF_Active_in_VD 2
+#define DDF_Global_Spare 4 /* VD_CONF records are ignored */
+#define DDF_Spare 8 /* overrides Global_spare */
+#define DDF_Foreign 16
+#define DDF_Legacy 32 /* no DDF on this device */
+
+#define DDF_Interface_mask 0xf00
+#define DDF_Interface_SCSI 0x100
+#define DDF_Interface_SAS 0x200
+#define DDF_Interface_SATA 0x300
+#define DDF_Interface_FC 0x400
+
+/* phys_disk_entry.state is a bigendian bitmap */
+#define DDF_Online 1
+#define DDF_Failed 2 /* overrides 1,4,8 */
+#define DDF_Rebuilding 4
+#define DDF_Transition 8
+#define DDF_SMART 16
+#define DDF_ReadErrors 32
+#define DDF_Missing 64
+
+/* The content of the virt_section global scope.
+ * A 64-byte header followed by a variable number of 64-byte entries.
+ */
+struct virtual_disk {
+ __u32 magic; /* DDF_VIRT_RECORDS_MAGIC */
+ __u32 crc;
+ __u16 populated_vdes;
+ __u16 max_vdes;
+ __u8 pad[52];
+ struct virtual_entry {
+ char guid[DDF_GUID_LEN];
+ __u16 unit;
+ __u16 pad0; /* 0xffff */
+ __u16 guid_crc;
+ __u16 type; /* bitmap, see DDF_Shared etc. below */
+ __u8 state; /* see DDF_state_* below */
+ __u8 init_state; /* see DDF_init_* and DDF_access_* below */
+ __u8 pad1[14];
+ char name[16];
+ } entries[0]; /* zero-length array: entries follow the header */
+};
+
+/* virtual_entry.type is a bitmap - bigendian */
+#define DDF_Shared 1
+#define DDF_Enforce_Groups 2
+#define DDF_Unicode 4
+#define DDF_Owner_Valid 8
+
+/* virtual_entry.state is a bigendian bitmap */
+#define DDF_state_mask 0x7
+#define DDF_state_optimal 0x0
+#define DDF_state_degraded 0x1
+#define DDF_state_deleted 0x2
+#define DDF_state_missing 0x3
+#define DDF_state_failed 0x4
+#define DDF_state_part_optimal 0x5
+
+#define DDF_state_morphing 0x8
+#define DDF_state_inconsistent 0x10
+
+/* virtual_entry.init_state is a bigendian bitmap */
+#define DDF_initstate_mask 0x03
+#define DDF_init_not 0x00
+#define DDF_init_quick 0x01 /* initialisation is in progress.
+ * i.e. 'state_inconsistent' */
+#define DDF_init_full 0x02
+
+#define DDF_access_mask 0xc0
+#define DDF_access_rw 0x00
+#define DDF_access_ro 0x80
+#define DDF_access_blocked 0xc0
+
+/* The content of the config_section - local scope
+ * It has multiple records each config_record_len sectors
+ * They can be vd_config or spare_assign
+ */
+
+/* One virtual-disk configuration record, laid out as stored on disk
+ * (multi-byte fields are big-endian).  The fixed part is followed by a
+ * variable-length tail; see phys_refnum below.
+ */
+struct vd_config {
+ __u32 magic; /* DDF_VD_CONF_MAGIC */
+ __u32 crc;
+ char guid[DDF_GUID_LEN];
+ __u32 timestamp;
+ __u32 seqnum; /* load_ddf_local() keeps the copy with the higher seqnum */
+ __u8 pad0[24];
+ __u16 prim_elmnt_count;
+ __u8 chunk_shift; /* 0 == 512, 1==1024 etc */
+ __u8 prl; /* primary RAID level, a DDF_RAID* value */
+ __u8 rlq;
+ __u8 sec_elmnt_count;
+ __u8 sec_elmnt_seq;
+ __u8 srl; /* secondary RAID level, a DDF_2* value */
+ __u64 blocks; /* blocks per component could be different
+ * on different component devices...(only
+ * for concat I hope) */
+ __u64 array_blocks; /* blocks in array */
+ __u8 pad1[8];
+ __u32 spare_refs[8];
+ __u8 cache_pol[8];
+ __u8 bg_rate;
+ __u8 pad2[3];
+ __u8 pad3[52];
+ __u8 pad4[192];
+ __u8 v0[32]; /* reserved- 0xff */
+ __u8 v1[32]; /* reserved- 0xff */
+ __u8 v2[16]; /* reserved- 0xff */
+ __u8 v3[16]; /* reserved- 0xff */
+ __u8 vendor[32];
+ __u32 phys_refnum[0]; /* refnum of each disk in sequence */
+ /*__u64 lba_offset[0]; LBA offset in each phys. Note extents in a
+ bvd are always the same size */
+ /* The lba_offset table is not a declared member: load_ddf_local()
+ * locates it at &phys_refnum[mppe], i.e. directly after the
+ * refnum table.
+ */
+};
+
+/* vd_config.cache_pol[7] is a bitmap; each define below is one bit of
+ * that single byte.
+ */
+#define DDF_cache_writeback 1 /* else writethrough */
+#define DDF_cache_wadaptive 2 /* only applies if writeback */
+#define DDF_cache_readahead 4
+#define DDF_cache_radaptive 8 /* only if doing read-ahead */
+#define DDF_cache_ifnobatt 16 /* even to write cache if battery is poor */
+#define DDF_cache_wallowed 32 /* enable write caching */
+#define DDF_cache_rallowed 64 /* enable read caching */
+
+/* A spare-assignment record.  It lives in the config section alongside
+ * vd_config records and is told apart by its magic number.
+ * 'spare_ents' is a trailing variable-length array.
+ */
+struct spare_assign {
+ __u32 magic; /* DDF_SPARE_ASSIGN_MAGIC */
+ __u32 crc;
+ __u32 timestamp;
+ __u8 reserved[7];
+ __u8 type; /* bitmap of DDF_spare_* flags */
+ __u16 populated; /* SAEs used */
+ __u16 max; /* max SAEs */
+ __u8 pad[8];
+ struct spare_assign_entry {
+ char guid[DDF_GUID_LEN];
+ __u16 secondary_element;
+ __u8 pad[6];
+ } spare_ents[0];
+};
+/* spare_assign.type is a bitmap */
+#define DDF_spare_dedicated 0x1 /* else global */
+#define DDF_spare_revertible 0x2 /* else committable */
+#define DDF_spare_active 0x4 /* else not active */
+#define DDF_spare_affinity 0x8 /* enclosure affinity */
+
+/* The data_section contents - local scope.
+ * The members add up to 512 bytes (assuming DDF_GUID_LEN is 24), matching
+ * the single block that load_section() reads into a caller-supplied buffer.
+ */
+struct disk_data {
+ __u32 magic; /* DDF_PHYS_DATA_MAGIC */
+ __u32 crc;
+ char guid[DDF_GUID_LEN]; /* matched against phys_disk entries to find pdnum */
+ __u32 refnum; /* crc of some magic drive data ... */
+ __u8 forced_ref; /* set when above was not result of magic */
+ __u8 forced_guid; /* set if guid was forced rather than magic */
+ __u8 vendor[32];
+ __u8 pad[442];
+};
+
+/* bbm_section content.
+ * Layout of the bad-block log header followed by a variable-length array
+ * of remapped-block entries.  Nothing in this part of the file reads or
+ * writes it yet.
+ */
+struct bad_block_log {
+ __u32 magic;
+ __u32 crc;
+ __u16 entry_count;
+ __u32 spare_count;
+ __u8 pad[10];
+ __u64 first_spare;
+ struct mapped_block {
+ __u64 defective_start;
+ __u32 replacement_start;
+ __u16 remap_count;
+ __u8 pad[2];
+ } entries[0];
+};
+
+/* Struct for internally holding ddf structures */
+/* The DDF structure stored on each device is potentially
+ * quite different, as some data is global and some is local.
+ * The global data is:
+ * - ddf header
+ * - controller_data
+ * - Physical disk records
+ * - Virtual disk records
+ * The local data is:
+ * - Configuration records
+ * - Physical Disk data section
+ * ( and Bad block and vendor which I don't care about yet).
+ *
+ * The local data is parsed into separate lists as it is read
+ * and reconstructed for writing. This means that we only need
+ * to make config changes once and they are automatically
+ * propagated to all devices.
+ * Note that the ddf_super has space for the conf and disk data
+ * for this disk and also for a list of all such data.
+ * The list is only used for the superblock that is being
+ * built in Create or Assemble to describe the whole array.
+ */
+struct ddf_super {
+ struct ddf_header anchor, primary, secondary;
+ struct ddf_controller_data controller;
+ struct ddf_header *active; /* points at 'primary' or 'secondary' */
+ struct phys_disk *phys; /* allocated by load_section() */
+ struct virtual_disk *virt; /* allocated by load_section() */
+ int pdsize, vdsize; /* byte sizes of the phys and virt sections */
+ int max_part, mppe, conf_rec_len; /* copied from the active header, in CPU byte order */
+ int currentdev;
+ int updates_pending;
+ /* One entry per virtual-disk config record.  The 512-byte 'space'
+ * union pads the bookkeeping fields so that 'conf' starts at a
+ * 512-byte offset within the allocation.
+ */
+ struct vcl {
+ union {
+ char space[512];
+ struct {
+ struct vcl *next;
+ __u64 *lba_offset; /* location in 'conf' of
+ * the lba table */
+ int vcnum; /* index into ->virt */
+ __u64 *block_sizes; /* NULL if all the same */
+ };
+ };
+ struct vd_config conf;
+ } *conflist, *currentconf; /* currentconf != NULL selects a member array rather than the container */
+ /* One entry per member device, padded the same way so that 'disk'
+ * starts at a 512-byte offset within the allocation.
+ */
+ struct dl {
+ union {
+ char space[512];
+ struct {
+ struct dl *next;
+ int major, minor;
+ char *devname;
+ int fd; /* -1 when the descriptor is not kept open */
+ unsigned long long size; /* sectors */
+ int pdnum; /* index in ->phys */
+ struct spare_assign *spare;
+ void *mdupdate; /* hold metadata update */
+
+ /* These fields used by auto-layout */
+ int raiddisk; /* slot to fill in autolayout */
+ __u64 esize;
+ };
+ };
+ struct disk_data disk;
+ struct vcl *vlist[0]; /* max_part in size */
+ } *dlist, *add_list;
+};
+
+/* Fallback for builds where <stddef.h> has not already supplied offsetof:
+ * the byte offset of field 'f' within type 't'.
+ */
+#ifndef offsetof
+#define offsetof(t,f) ((size_t)&(((t*)0)->f))
+#endif
+
+
+/*
+ * Compute the checksum of a DDF structure of 'len' bytes at 'buf'.
+ *
+ * Every checksummed structure keeps its crc at the same offset as
+ * struct ddf_header does, so the buffer is viewed as a header.  The
+ * stored crc is replaced by 0xffffffff while the sum is taken and put
+ * back afterwards, leaving the buffer unchanged on return.
+ *
+ * The result is returned already converted to big-endian so that it can
+ * be compared with, or stored into, the on-disk crc field directly.
+ */
+static int calc_crc(void *buf, int len)
+{
+	struct ddf_header *hdr = buf;
+	const __u32 saved = hdr->crc;
+	__u32 sum;
+
+	hdr->crc = 0xffffffff;
+	sum = crc32(0, buf, len);
+	hdr->crc = saved;
+
+	return __cpu_to_be32(sum);
+}
+
+/*
+ * Read one DDF header (primary or secondary) from sector 'lba' of 'fd'
+ * into 'hdr' and check it against the anchor header.
+ *
+ * 'size' is the device size in sectors and 'type' the header type the
+ * caller expects.  The header is accepted only if its magic and crc are
+ * valid, its guid, revision and primary/secondary LBAs equal the
+ * anchor's, its type is the expected one, and everything from 'pad2' to
+ * the end of the 512-byte block is identical to the anchor.
+ *
+ * Returns 1 when the header is acceptable, 0 otherwise.
+ */
+static int load_ddf_header(int fd, unsigned long long lba,
+			   unsigned long long size,
+			   int type,
+			   struct ddf_header *hdr, struct ddf_header *anchor)
+{
+	/* The header must lie before the last sector of the device */
+	if (lba >= size-1)
+		return 0;
+	if (lseek64(fd, lba<<9, 0) < 0)
+		return 0;
+	if (read(fd, hdr, 512) != 512)
+		return 0;
+
+	/* Self-consistency first ... */
+	if (hdr->magic != DDF_HEADER_MAGIC)
+		return 0;
+	if (calc_crc(hdr, 512) != hdr->crc)
+		return 0;
+
+	/* ... then agreement with the anchor */
+	if (memcmp(anchor->guid, hdr->guid, DDF_GUID_LEN) != 0)
+		return 0;
+	if (memcmp(anchor->revision, hdr->revision, 8) != 0)
+		return 0;
+	if (anchor->primary_lba != hdr->primary_lba)
+		return 0;
+	if (anchor->secondary_lba != hdr->secondary_lba)
+		return 0;
+	if (hdr->type != type)
+		return 0;
+	if (memcmp(anchor->pad2, hdr->pad2, 512 -
+		   offsetof(struct ddf_header, pad2)) != 0)
+		return 0;
+
+	return 1;
+}
+
+/*
+ * Read one section of the active DDF header from 'fd'.
+ *
+ * 'offset_be' and 'len_be' are the big-endian section offset and length
+ * (both in sectors) taken from the header; the offset is relative to the
+ * LBA of whichever header (primary or secondary) is active.
+ *
+ * If 'buf' is NULL a 512-byte aligned buffer is allocated and returned;
+ * the caller must free() it.  If 'buf' is supplied it must hold one
+ * sector, and only single-sector sections are accepted.
+ * When 'check' is set the length must be one of 2, 8, 32, 128 or 512
+ * sectors.  Sections longer than 1024 sectors are always refused.
+ *
+ * Returns the buffer holding the data, or NULL on any failure (an
+ * internally allocated buffer is released before returning NULL).
+ */
+static void *load_section(int fd, struct ddf_super *super, void *buf,
+			  __u32 offset_be, __u32 len_be, int check)
+{
+	unsigned long long start = __be32_to_cpu(offset_be);
+	unsigned long long sectors = __be32_to_cpu(len_be);
+	const int allocated = (buf == NULL);
+
+	if (check) {
+		switch (sectors) {
+		case 2:
+		case 8:
+		case 32:
+		case 128:
+		case 512:
+			break;
+		default:
+			return NULL;
+		}
+	}
+
+	if (sectors > 1024)
+		return NULL;
+
+	if (!allocated) {
+		/* All pre-allocated sections are a single block */
+		if (sectors != 1)
+			return NULL;
+	} else {
+		if (posix_memalign(&buf, 512, sectors<<9) != 0)
+			buf = NULL;
+		if (buf == NULL)
+			return NULL;
+	}
+
+	if (super->active->type == 1)
+		start += __be64_to_cpu(super->active->primary_lba);
+	else
+		start += __be64_to_cpu(super->active->secondary_lba);
+
+	if (lseek64(fd, start<<9, 0) != (start<<9))
+		goto fail;
+	if (read(fd, buf, sectors<<9) != (sectors<<9))
+		goto fail;
+
+	return buf;
+
+fail:
+	if (allocated)
+		free(buf);
+	return NULL;
+}
+
+/*
+ * Load the anchor, primary and (if usable) secondary DDF headers from
+ * 'fd' into 'super' and choose which header is 'active'.
+ *
+ * The anchor is the last 512-byte block of the device.  The primary
+ * header must load and agree with the anchor.  The secondary header is
+ * optional; it becomes the active header only if it is newer than the
+ * primary and not marked open, or equally new while the primary is
+ * marked open and the secondary is not.
+ *
+ * If 'devname' is non-NULL, failures are reported on stderr.
+ *
+ * Returns 0 on success, 1 if the device size could not be determined or
+ * the anchor block could not be read, 2 if the metadata found is not
+ * acceptable DDF.
+ */
+static int load_ddf_headers(int fd, struct ddf_super *super, char *devname)
+{
+	unsigned long long dsize;
+
+	/* get_dev_size() returns 0 on failure and then leaves dsize unset;
+	 * a device shorter than one block cannot hold an anchor either.
+	 */
+	if (get_dev_size(fd, NULL, &dsize) == 0 || dsize < 512) {
+		if (devname)
+			fprintf(stderr,
+				Name ": Cannot determine size of %s\n",
+				devname);
+		return 1;
+	}
+
+	if (lseek64(fd, dsize-512, 0) < 0) {
+		if (devname)
+			fprintf(stderr,
+				Name": Cannot seek to anchor block on %s: %s\n",
+				devname, strerror(errno));
+		return 1;
+	}
+	if (read(fd, &super->anchor, 512) != 512) {
+		if (devname)
+			fprintf(stderr,
+				Name ": Cannot read anchor block on %s: %s\n",
+				devname, strerror(errno));
+		return 1;
+	}
+	if (super->anchor.magic != DDF_HEADER_MAGIC) {
+		if (devname)
+			fprintf(stderr, Name ": no DDF anchor found on %s\n",
+				devname);
+		return 2;
+	}
+	if (calc_crc(&super->anchor, 512) != super->anchor.crc) {
+		if (devname)
+			fprintf(stderr, Name ": bad CRC on anchor on %s\n",
+				devname);
+		return 2;
+	}
+	if (memcmp(super->anchor.revision, DDF_REVISION_0, 8) != 0 &&
+	    memcmp(super->anchor.revision, DDF_REVISION_2, 8) != 0) {
+		if (devname)
+			fprintf(stderr, Name ": can only support super revision"
+				" %.8s and earlier, not %.8s on %s\n",
+				DDF_REVISION_2, super->anchor.revision,devname);
+		return 2;
+	}
+	if (load_ddf_header(fd, __be64_to_cpu(super->anchor.primary_lba),
+			    dsize >> 9, 1,
+			    &super->primary, &super->anchor) == 0) {
+		if (devname)
+			fprintf(stderr,
+				Name ": Failed to load primary DDF header "
+				"on %s\n", devname);
+		return 2;
+	}
+	super->active = &super->primary;
+	if (load_ddf_header(fd, __be64_to_cpu(super->anchor.secondary_lba),
+			    dsize >> 9, 2,
+			    &super->secondary, &super->anchor)) {
+		/* Prefer the secondary only when it is strictly better */
+		if ((__be32_to_cpu(super->primary.seq)
+		     < __be32_to_cpu(super->secondary.seq) &&
+		     !super->secondary.openflag)
+		    || (__be32_to_cpu(super->primary.seq)
+			== __be32_to_cpu(super->secondary.seq) &&
+			super->primary.openflag && !super->secondary.openflag)
+			)
+			super->active = &super->secondary;
+	}
+	return 0;
+}
+
+/*
+ * Load the sections that are common to every device of a container:
+ * controller data, physical disk records and virtual disk records.
+ *
+ * The controller data is read into super->controller; the phys and virt
+ * sections are allocated by load_section() and their byte sizes are
+ * recorded in pdsize/vdsize.  On success the config and disk lists are
+ * reset to empty and the per-container limits are copied from the active
+ * header in CPU byte order.
+ *
+ * Returns 0 on success.  If any section fails to load, both allocated
+ * sections are released, their pointers are cleared and 2 is returned.
+ * 'devname' is not used.
+ */
+static int load_ddf_global(int fd, struct ddf_super *super, char *devname)
+{
+	struct ddf_header *hdr = super->active;
+	void *ctrl;
+
+	ctrl = load_section(fd, super, &super->controller,
+			    hdr->controller_section_offset,
+			    hdr->controller_section_length,
+			    0);
+
+	super->phys = load_section(fd, super, NULL,
+				   hdr->phys_section_offset,
+				   hdr->phys_section_length,
+				   1);
+	super->pdsize = __be32_to_cpu(hdr->phys_section_length) * 512;
+
+	super->virt = load_section(fd, super, NULL,
+				   hdr->virt_section_offset,
+				   hdr->virt_section_length,
+				   1);
+	super->vdsize = __be32_to_cpu(hdr->virt_section_length) * 512;
+
+	if (ctrl == NULL || super->phys == NULL || super->virt == NULL) {
+		free(super->phys);
+		free(super->virt);
+		super->phys = NULL;
+		super->virt = NULL;
+		return 2;
+	}
+
+	super->conflist = NULL;
+	super->dlist = NULL;
+
+	super->max_part = __be16_to_cpu(hdr->max_partitions);
+	super->mppe = __be16_to_cpu(hdr->max_primary_element_entries);
+	super->conf_rec_len = __be16_to_cpu(hdr->config_record_len);
+
+	return 0;
+}
+
+/*
+ * Load the per-device sections of the DDF metadata on 'fd' and merge
+ * them into 'super'.
+ *
+ * A new 'struct dl' describing this device is allocated, filled from the
+ * physical disk data section and pushed onto super->dlist.  'devname' (if
+ * given) is duplicated into it, and 'fd' is remembered only when 'keep'
+ * is non-zero.
+ *
+ * The config section is then scanned one record (conf_rec_len sectors)
+ * at a time:
+ *  - the first spare-assignment record is copied to dl->spare;
+ *  - each vd_config record is added to super->conflist, or replaces the
+ *    copy already there when its sequence number is higher, and is
+ *    referenced from dl->vlist[].
+ *
+ * The on-disk lengths are not trusted: a config section that cannot be
+ * loaded, a zero record length, a trailing partial record and more
+ * records than dl->vlist[] can hold are all handled without touching
+ * memory outside the buffers.
+ *
+ * Returns 0 on success, 1 on allocation failure or when a non-empty
+ * config section cannot be read or parsed.
+ */
+static int load_ddf_local(int fd, struct ddf_super *super,
+			  char *devname, int keep)
+{
+	struct dl *dl;
+	struct stat stb;
+	char *conf;
+	int i;
+	int confsec;
+	int vnum;
+	int max_virt_disks = __be16_to_cpu(super->active->max_vd_entries);
+	unsigned int conflen =
+		__be32_to_cpu(super->active->config_section_length);
+	int rec_len = super->conf_rec_len;
+	unsigned long long dsize;
+
+	/* First the local disk info */
+	if (posix_memalign((void**)&dl, 512,
+			   sizeof(*dl) +
+			   (super->max_part) * sizeof(dl->vlist[0])) != 0) {
+		fprintf(stderr, Name ": %s could not allocate disk info buffer\n",
+			__func__);
+		return 1;
+	}
+
+	load_section(fd, super, &dl->disk,
+		     super->active->data_section_offset,
+		     super->active->data_section_length,
+		     0);
+	dl->devname = devname ? strdup(devname) : NULL;
+
+	fstat(fd, &stb);
+	dl->major = major(stb.st_rdev);
+	dl->minor = minor(stb.st_rdev);
+	dl->next = super->dlist;
+	dl->fd = keep ? fd : -1;
+
+	dl->size = 0;
+	if (get_dev_size(fd, devname, &dsize))
+		dl->size = dsize >> 9;
+	dl->spare = NULL;
+	for (i=0 ; i < super->max_part ; i++)
+		dl->vlist[i] = NULL;
+	super->dlist = dl;
+	dl->pdnum = -1;
+	for (i=0; i < __be16_to_cpu(super->active->max_pd_entries); i++)
+		if (memcmp(super->phys->entries[i].guid,
+			   dl->disk.guid, DDF_GUID_LEN) == 0)
+			dl->pdnum = i;
+
+	/* Now the config list. */
+	/* 'conf' is an array of config entries, some of which are
+	 * probably invalid.  Those which are good need to be copied into
+	 * the conflist
+	 */
+
+	conf = load_section(fd, super, NULL,
+			    super->active->config_section_offset,
+			    super->active->config_section_length,
+			    0);
+
+	/* A non-empty section must have been read (which also bounds its
+	 * length to 1024 sectors) and must be divisible into records;
+	 * a zero record length would otherwise loop forever.
+	 */
+	if (conflen != 0 && (conf == NULL || rec_len <= 0)) {
+		fprintf(stderr, Name
+			": %s could not load config section\n",
+			__func__);
+		free(conf);
+		return 1;
+	}
+
+	vnum = 0;
+	/* Only visit records that lie entirely inside the buffer */
+	for (confsec = 0;
+	     (unsigned int)(confsec + rec_len) <= conflen;
+	     confsec += rec_len) {
+		struct vd_config *vd =
+			(struct vd_config *)((char*)conf + confsec*512);
+		struct vcl *vcl;
+
+		if (vd->magic == DDF_SPARE_ASSIGN_MAGIC) {
+			if (dl->spare)
+				continue;
+			if (posix_memalign((void**)&dl->spare, 512,
+					   rec_len*512) != 0) {
+				fprintf(stderr, Name
+					": %s could not allocate spare info buf\n",
+					__func__);
+				free(conf);
+				return 1;
+			}
+
+			memcpy(dl->spare, vd, rec_len*512);
+			continue;
+		}
+		if (vd->magic != DDF_VD_CONF_MAGIC)
+			continue;
+		for (vcl = super->conflist; vcl; vcl = vcl->next) {
+			if (memcmp(vcl->conf.guid,
+				   vd->guid, DDF_GUID_LEN) == 0)
+				break;
+		}
+
+		if (vcl) {
+			/* vlist[] has exactly max_part slots */
+			if (vnum < super->max_part)
+				dl->vlist[vnum++] = vcl;
+			if (__be32_to_cpu(vd->seqnum) <=
+			    __be32_to_cpu(vcl->conf.seqnum))
+				continue;
+		} else {
+			if (posix_memalign((void**)&vcl, 512,
+					   (rec_len*512 +
+					    offsetof(struct vcl, conf))) != 0) {
+				fprintf(stderr, Name
+					": %s could not allocate vcl buf\n",
+					__func__);
+				free(conf);
+				return 1;
+			}
+			vcl->next = super->conflist;
+			vcl->block_sizes = NULL; /* FIXME not for CONCAT */
+			/* stays -1 if no virtual disk entry matches below */
+			vcl->vcnum = -1;
+			super->conflist = vcl;
+			if (vnum < super->max_part)
+				dl->vlist[vnum++] = vcl;
+		}
+		memcpy(&vcl->conf, vd, rec_len*512);
+		vcl->lba_offset = (__u64*)
+			&vcl->conf.phys_refnum[super->mppe];
+
+		for (i=0; i < max_virt_disks ; i++)
+			if (memcmp(super->virt->entries[i].guid,
+				   vcl->conf.guid, DDF_GUID_LEN)==0)
+				break;
+		if (i < max_virt_disks)
+			vcl->vcnum = i;
+	}
+	free(conf);
+
+	return 0;
+}
+
+#ifndef MDASSEMBLE
+static int load_super_ddf_all(struct supertype *st, int fd,
+ void **sbp, char *devname, int keep_fd);
+#endif
+/*
+ * Load DDF metadata from 'fd' into st->sb.
+ *
+ * Unless built for mdassemble, 'fd' is first tried as a container, in
+ * which case the metadata of all member devices is loaded.  Otherwise
+ * 'fd' is treated as a single component device: it must be larger than
+ * 32MB and a whole number of sectors, and its headers, global sections
+ * and local sections are loaded in turn.
+ *
+ * If 'devname' is non-NULL, problems are reported on stderr.
+ *
+ * Returns 0 on success.  On failure a non-zero code is returned, st->sb
+ * is not set by this path, and everything allocated here is released.
+ */
+static int load_super_ddf(struct supertype *st, int fd,
+			  char *devname)
+{
+	unsigned long long dsize;
+	struct ddf_super *super;
+	int rv;
+
+#ifndef MDASSEMBLE
+	/* if 'fd' is a container, load metadata from all the devices */
+	if (load_super_ddf_all(st, fd, &st->sb, devname, 1) == 0)
+		return 0;
+#endif
+	if (st->subarray[0])
+		return 1; /* FIXME Is this correct */
+
+	if (get_dev_size(fd, devname, &dsize) == 0)
+		return 1;
+
+	/* 32M is a lower bound */
+	if (dsize <= 32*1024*1024) {
+		if (devname)
+			fprintf(stderr,
+				Name ": %s is too small for ddf: "
+				"size is %llu sectors.\n",
+				devname, dsize>>9);
+		return 1;
+	}
+	if (dsize & 511) {
+		if (devname)
+			fprintf(stderr,
+				Name ": %s is an odd size for ddf: "
+				"size is %llu bytes.\n",
+				devname, dsize);
+		return 1;
+	}
+
+	if (posix_memalign((void**)&super, 512, sizeof(*super))!= 0) {
+		fprintf(stderr, Name ": malloc of %zu failed.\n",
+			sizeof(*super));
+		return 1;
+	}
+	memset(super, 0, sizeof(*super));
+
+	rv = load_ddf_headers(fd, super, devname);
+	if (rv) {
+		free(super);
+		return rv;
+	}
+
+	/* Have valid headers and have chosen the best. Let's read in the rest*/
+
+	rv = load_ddf_global(fd, super, devname);
+
+	if (rv) {
+		/* load_ddf_global() has already released its own buffers */
+		if (devname)
+			fprintf(stderr,
+				Name ": Failed to load all information "
+				"sections on %s\n", devname);
+		free(super);
+		return rv;
+	}
+
+	rv = load_ddf_local(fd, super, devname, 0);
+
+	if (rv) {
+		if (devname)
+			fprintf(stderr,
+				Name ": Failed to load all information "
+				"sections on %s\n", devname);
+		/* The global sections, and whatever the partial local
+		 * load managed to link in, would otherwise be leaked.
+		 * No descriptor was kept (keep == 0), so nothing needs
+		 * closing.
+		 */
+		free(super->phys);
+		free(super->virt);
+		while (super->conflist) {
+			struct vcl *v = super->conflist;
+			super->conflist = v->next;
+			free(v->block_sizes);
+			free(v);
+		}
+		while (super->dlist) {
+			struct dl *d = super->dlist;
+			super->dlist = d->next;
+			free(d->spare);
+			/* devname was strdup()ed by load_ddf_local() */
+			free(d->devname);
+			free(d);
+		}
+		free(super);
+		return rv;
+	}
+
+	/* Should possibly check the sections .... */
+
+	st->sb = super;
+	if (st->ss == NULL) {
+		st->ss = &super_ddf;
+		st->minor_version = 0;
+		st->max_devs = 512;
+	}
+	st->loaded_container = 0;
+	return 0;
+
+}
+
+/*
+ * Release everything hanging off st->sb and clear st->sb.
+ *
+ * Frees the phys and virt sections, every config-list entry (with its
+ * block_sizes table) and every disk-list entry (closing its descriptor
+ * when one was kept, and freeing its spare record).  Does nothing if no
+ * superblock is loaded.
+ */
+static void free_super_ddf(struct supertype *st)
+{
+	struct ddf_super *ddf = st->sb;
+	struct vcl *conf;
+	struct dl *disk;
+
+	if (!ddf)
+		return;
+
+	free(ddf->phys);
+	free(ddf->virt);
+
+	while ((conf = ddf->conflist) != NULL) {
+		ddf->conflist = conf->next;
+		free(conf->block_sizes);
+		free(conf);
+	}
+
+	while ((disk = ddf->dlist) != NULL) {
+		ddf->dlist = disk->next;
+		if (disk->fd >= 0)
+			close(disk->fd);
+		free(disk->spare);
+		free(disk);
+	}
+
+	free(ddf);
+	st->sb = NULL;
+}
+
+/*
+ * Return a freshly allocated supertype for DDF if 'arg' names this
+ * metadata format ("ddf" or "default"), otherwise NULL.
+ *
+ * NULL is also returned when the allocation fails, rather than writing
+ * through a NULL pointer.  The caller owns the returned structure.
+ */
+static struct supertype *match_metadata_desc_ddf(char *arg)
+{
+	/* 'ddf' only support containers */
+	struct supertype *st;
+
+	if (strcmp(arg, "ddf") != 0 &&
+	    strcmp(arg, "default") != 0)
+		return NULL;
+
+	/* calloc zero-fills, which the original did with memset */
+	st = calloc(1, sizeof(*st));
+	if (st == NULL)
+		return NULL;
+
+	st->ss = &super_ddf;
+	st->max_devs = 512;
+	st->minor_version = 0;
+	st->sb = NULL;
+	return st;
+}
+
+
+#ifndef MDASSEMBLE
+
+/* Name tables used with map_num() when printing metadata.
+ * Each table ends with a { NULL, 0 } entry.
+ */
+
+/* Indexed by virtual_entry.state & 7 */
+static mapping_t ddf_state[] = {
+ { "Optimal", 0},
+ { "Degraded", 1},
+ { "Deleted", 2},
+ { "Missing", 3},
+ { "Failed", 4},
+ { "Partially Optimal", 5},
+ { "-reserved-", 6},
+ { "-reserved-", 7},
+ { NULL, 0}
+};
+
+/* Indexed by virtual_entry.init_state & 3 */
+static mapping_t ddf_init_state[] = {
+ { "Not Initialised", 0},
+ { "QuickInit in Progress", 1},
+ { "Fully Initialised", 2},
+ { "*UNKNOWN*", 3},
+ { NULL, 0}
+};
+/* Indexed by (virtual_entry.init_state >> 6) & 3 */
+static mapping_t ddf_access[] = {
+ { "Read/Write", 0},
+ { "Reserved", 1},
+ { "Read Only", 2},
+ { "Blocked (no access)", 3},
+ { NULL ,0}
+};
+
+/* Names for vd_config.prl (primary RAID level) */
+static mapping_t ddf_level[] = {
+ { "RAID0", DDF_RAID0},
+ { "RAID1", DDF_RAID1},
+ { "RAID3", DDF_RAID3},
+ { "RAID4", DDF_RAID4},
+ { "RAID5", DDF_RAID5},
+ { "RAID1E",DDF_RAID1E},
+ { "JBOD", DDF_JBOD},
+ { "CONCAT",DDF_CONCAT},
+ { "RAID5E",DDF_RAID5E},
+ { "RAID5EE",DDF_RAID5EE},
+ { "RAID6", DDF_RAID6},
+ { NULL, 0}
+};
+/* Names for vd_config.srl (secondary RAID level) */
+static mapping_t ddf_sec_level[] = {
+ { "Striped", DDF_2STRIPED},
+ { "Mirrored", DDF_2MIRRORED},
+ { "Concat", DDF_2CONCAT},
+ { "Spanned", DDF_2SPANNED},
+ { NULL, 0}
+};
+#endif
+
+/* A number-to-number translation pair, searched by map_num1():
+ * num1 is the key and num2 the value.
+ */
+struct num_mapping {
+ int num1, num2;
+};
+/* Translates a DDF primary RAID level (vd_config.prl) into an md level.
+ * Levels md cannot handle map to LEVEL_UNSUPPORTED.  The table ends with
+ * a { MAXINT, MAXINT } sentinel, which is also what an unknown key yields.
+ */
+static struct num_mapping ddf_level_num[] = {
+ { DDF_RAID0, 0 },
+ { DDF_RAID1, 1 },
+ { DDF_RAID3, LEVEL_UNSUPPORTED },
+ { DDF_RAID4, 4 },
+ { DDF_RAID5, 5 },
+ { DDF_RAID1E, LEVEL_UNSUPPORTED },
+ { DDF_JBOD, LEVEL_UNSUPPORTED },
+ { DDF_CONCAT, LEVEL_LINEAR },
+ { DDF_RAID5E, LEVEL_UNSUPPORTED },
+ { DDF_RAID5EE, LEVEL_UNSUPPORTED },
+ { DDF_RAID6, 6},
+ { MAXINT, MAXINT }
+};
+
+/*
+ * Look 'num' up among the num1 keys of 'map' and return the paired num2.
+ * 'map' must end with an entry whose num1 is MAXINT; that entry's num2 is
+ * returned when no key matches.
+ */
+static int map_num1(struct num_mapping *map, int num)
+{
+	struct num_mapping *entry = map;
+
+	while (entry->num1 != MAXINT && entry->num1 != num)
+		entry++;
+
+	return entry->num2;
+}
+
+/*
+ * Return 1 if every one of the DDF_GUID_LEN bytes of 'guid' is 0xff,
+ * otherwise 0.
+ */
+static int all_ff(char *guid)
+{
+	char *p = guid;
+	char *end = guid + DDF_GUID_LEN;
+
+	while (p < end) {
+		if (*p != (char)0xff)
+			return 0;
+		p++;
+	}
+	return 1;
+}
+
+#ifndef MDASSEMBLE
+/*
+ * Print a DDF GUID on stdout.
+ *
+ * GUIDs are partly ASCII and partly binary, and tend to be space padded.
+ * The whole GUID is printed in hex, in colon-separated groups of four
+ * bytes.  Then, on a new line and in parentheses, come the leading run
+ * of printable ASCII characters (trailing spaces are not counted) and,
+ * when 'tstamp' is non-zero, the time stamp held in bytes 16-19
+ * (seconds offset by DECADE) rendered in local time.
+ */
+static void print_guid(char *guid, int tstamp)
+{
+	int len = DDF_GUID_LEN;
+	int pos;
+
+	for (pos = 0; pos < DDF_GUID_LEN; pos++) {
+		if (pos != 0 && (pos & 3) == 0)
+			printf(":");
+		printf("%02X", guid[pos]&255);
+	}
+
+	printf("\n (");
+
+	/* ignore the space padding at the end */
+	while (len && guid[len-1] == ' ')
+		len--;
+
+	/* stop at the first byte that is not printable ASCII */
+	for (pos = 0;
+	     pos < len && guid[pos] >= 0x20 && guid[pos] < 0x7f;
+	     pos++)
+		fputc(guid[pos], stdout);
+
+	if (tstamp) {
+		char tbuf[100];
+		time_t then = __be32_to_cpu(*(__u32*)(guid+16)) + DECADE;
+		struct tm *tm = localtime(&then);
+
+		strftime(tbuf, 100, " %D %T",tm);
+		fputs(tbuf, stdout);
+	}
+	printf(")");
+}
+
+/*
+ * Print the configuration details of virtual disk number 'n', which is
+ * identified by 'guid'.
+ *
+ * Every config record in sb->conflist that has a valid crc and this
+ * guid is reported: the member count with the physical disk index of
+ * each member ("--" when its refnum is not among the used physical disk
+ * entries), chunk size, RAID level, secondary level details when there
+ * is more than one secondary element, and the device and array sizes.
+ */
+static void examine_vd(int n, struct ddf_super *sb, char *guid)
+{
+	const int rec_bytes = sb->conf_rec_len * 512;
+	struct vcl *node;
+
+	for (node = sb->conflist; node != NULL; node = node->next) {
+		struct vd_config *vc = &node->conf;
+		int members;
+		int slot;
+
+		if (calc_crc(vc, rec_bytes) != vc->crc)
+			continue;
+		if (memcmp(vc->guid, guid, DDF_GUID_LEN) != 0)
+			continue;
+
+		/* Ok, we know about this VD, let's give more details */
+		members = __be16_to_cpu(vc->prim_elmnt_count);
+		printf(" Raid Devices[%d] : %d (", n, members);
+		for (slot = 0; slot < members; slot++) {
+			int used = __be16_to_cpu(sb->phys->used_pdes);
+			int idx = 0;
+
+			while (idx < used &&
+			       vc->phys_refnum[slot] !=
+			       sb->phys->entries[idx].refnum)
+				idx++;
+
+			if (slot)
+				printf(" ");
+			if (idx < used)
+				printf("%d", idx);
+			else
+				printf("--");
+		}
+		printf(")\n");
+
+		if (vc->chunk_shift != 255)
+			printf(" Chunk Size[%d] : %d sectors\n", n,
+			       1 << vc->chunk_shift);
+		printf(" Raid Level[%d] : %s\n", n,
+		       map_num(ddf_level, vc->prl)?:"-unknown-");
+
+		if (vc->sec_elmnt_count != 1) {
+			printf(" Secondary Position[%d] : %d of %d\n", n,
+			       vc->sec_elmnt_seq, vc->sec_elmnt_count);
+			printf(" Secondary Level[%d] : %s\n", n,
+			       map_num(ddf_sec_level, vc->srl) ?: "-unknown-");
+		}
+
+		printf(" Device Size[%d] : %llu\n", n,
+		       (unsigned long long)__be64_to_cpu(vc->blocks)/2);
+		printf(" Array Size[%d] : %llu\n", n,
+		       (unsigned long long)__be64_to_cpu(vc->array_blocks)/2);
+	}
+}
+
+/*
+ * Print the virtual disk table: the populated count, then for each of
+ * the first 'populated_vdes' entries its GUID (with time stamp), unit,
+ * state, initialisation state, access mode and name, followed by the
+ * matching configuration details from examine_vd().
+ */
+static void examine_vds(struct ddf_super *sb)
+{
+	int total = __be16_to_cpu(sb->virt->populated_vdes);
+	int n;
+
+	printf(" Virtual Disks : %d\n", total);
+
+	for (n = 0; n < total; n++) {
+		struct virtual_entry *ve = &sb->virt->entries[n];
+		char *morphing = (ve->state & 8) ? "Morphing, ": "";
+		char *consistent = (ve->state & 16)? "Not Consistent"
+						   : "Consistent";
+
+		printf("\n");
+		printf(" VD GUID[%d] : ", n);
+		print_guid(ve->guid, 1);
+		printf("\n");
+		printf(" unit[%d] : %d\n", n, __be16_to_cpu(ve->unit));
+		printf(" state[%d] : %s, %s%s\n", n,
+		       map_num(ddf_state, ve->state & 7),
+		       morphing, consistent);
+		printf(" init state[%d] : %s\n", n,
+		       map_num(ddf_init_state, ve->init_state&3));
+		/* access mode lives in the top two bits of init_state */
+		printf(" access[%d] : %s\n", n,
+		       map_num(ddf_access, (ve->init_state>>6) & 3));
+		printf(" Name[%d] : %.16s\n", n, ve->name);
+		examine_vd(n, sb, ve->guid);
+	}
+
+	if (total)
+		printf("\n");
+}
+
+/*
+ * Print the physical disk table: one line per used entry giving its
+ * index, reference number, configured size in K, the device name (when
+ * a loaded device with the same refnum can be mapped to a name), and
+ * the decoded type and state flags.
+ */
+static void examine_pds(struct ddf_super *sb)
+{
+	int used = __be16_to_cpu(sb->phys->used_pdes);
+	int n;
+
+	printf(" Physical Disks : %d\n", used);
+	printf(" Number RefNo Size Device Type/State\n");
+
+	for (n = 0; n < used; n++) {
+		struct phys_disk_entry *pd = &sb->phys->entries[n];
+		int type = __be16_to_cpu(pd->type);
+		int state = __be16_to_cpu(pd->state);
+		struct dl *dl;
+
+		printf(" %3d %08x ", n,
+		       __be32_to_cpu(pd->refnum));
+		printf("%8lluK ",
+		       (unsigned long long)__be64_to_cpu(pd->config_size)>>1);
+
+		/* find a loaded device for this entry that has a name */
+		for (dl = sb->dlist; dl != NULL; dl = dl->next) {
+			char *dv;
+
+			if (dl->disk.refnum != pd->refnum)
+				continue;
+			dv = map_dev(dl->major, dl->minor, 0);
+			if (dv == NULL)
+				continue;
+			printf("%-15s", dv);
+			break;
+		}
+		if (dl == NULL)
+			printf("%15s","");
+
+		printf(" %s%s%s%s%s",
+		       (type&2) ? "active":"",
+		       (type&4) ? "Global-Spare":"",
+		       (type&8) ? "spare" : "",
+		       (type&16)? ", foreign" : "",
+		       (type&32)? "pass-through" : "");
+		printf("/%s%s%s%s%s%s%s",
+		       (state&1)? "Online": "Offline",
+		       (state&2)? ", Failed": "",
+		       (state&4)? ", Rebuilding": "",
+		       (state&8)? ", in-transition": "",
+		       (state&16)? ", SMART-errors": "",
+		       (state&32)? ", Unrecovered-Read-Errors": "",
+		       (state&64)? ", Missing" : "");
+		printf("\n");
+	}
+}
+
+/*
+ * Print a full description of the loaded DDF metadata: anchor magic and
+ * revision, controller and container GUIDs, the sequence number of the
+ * active header, whether a secondary header was loaded, and then the
+ * virtual and physical disk tables.  'homehost' is not used.
+ */
+static void examine_super_ddf(struct supertype *st, char *homehost)
+{
+	struct ddf_super *sb = st->sb;
+	char *redundant;
+
+	printf(" Magic : %08x\n", __be32_to_cpu(sb->anchor.magic));
+	printf(" Version : %.8s\n", sb->anchor.revision);
+
+	printf("Controller GUID : ");
+	print_guid(sb->controller.guid, 0);
+	printf("\n");
+
+	printf(" Container GUID : ");
+	print_guid(sb->anchor.guid, 1);
+	printf("\n");
+
+	printf(" Seq : %08x\n", __be32_to_cpu(sb->active->seq));
+
+	if (sb->secondary.magic == DDF_HEADER_MAGIC)
+		redundant = "yes";
+	else
+		redundant = "no";
+	printf(" Redundant hdr : %s\n", redundant);
+
+	examine_vds(sb);
+	examine_pds(sb);
+}
+
+static void getinfo_super_ddf(struct supertype *st, struct mdinfo *info);
+
+static void uuid_from_super_ddf(struct supertype *st, int uuid[4]);
+
+/*
+ * Print "ARRAY" lines for the loaded metadata: one for the container
+ * itself, then one per virtual disk slot whose GUID is not all 0xff.
+ * 'verbose' is not used.
+ *
+ * To obtain each member's UUID a temporary 'struct vcl' carrying just
+ * the member's GUID is installed as ddf->currentconf while
+ * uuid_from_super_ddf() runs.  The previous value of currentconf is put
+ * back before returning, so that the superblock never keeps a pointer to
+ * this function's stack frame.
+ */
+static void brief_examine_super_ddf(struct supertype *st, int verbose)
+{
+	/* We just write a generic DDF ARRAY entry
+	 */
+	struct ddf_super *ddf = st->sb;
+	struct vcl *saved_conf = ddf->currentconf;
+	struct mdinfo info;
+	int i;
+	char nbuf[64];
+
+	getinfo_super_ddf(st, &info);
+	fname_from_uuid(st, &info, nbuf, ':');
+	printf("ARRAY metadata=ddf UUID=%s\n", nbuf + 5);
+
+	for (i=0; i<__be16_to_cpu(ddf->virt->max_vdes); i++) {
+		struct virtual_entry *ve = &ddf->virt->entries[i];
+		struct vcl vcl;
+		char nbuf1[64];
+
+		if (all_ff(ve->guid))
+			continue;
+		/* only the guid is read by uuid_from_super_ddf() */
+		memcpy(vcl.conf.guid, ve->guid, DDF_GUID_LEN);
+		ddf->currentconf =&vcl;
+		uuid_from_super_ddf(st, info.uuid);
+		/* 'vcl' goes out of scope at the end of this iteration */
+		ddf->currentconf = saved_conf;
+		fname_from_uuid(st, &info, nbuf1, ':');
+		printf("ARRAY container=%s member=%d UUID=%s\n",
+		       nbuf+5, i, nbuf1+5);
+	}
+}
+
+/*
+ * Print the metadata type, level and UUID as KEY=value lines
+ * (MD_METADATA, MD_LEVEL, MD_UUID), one per line.
+ */
+static void export_examine_super_ddf(struct supertype *st)
+{
+	char uuid_text[64];
+	struct mdinfo info;
+
+	getinfo_super_ddf(st, &info);
+	fname_from_uuid(st, &info, uuid_text, ':');
+
+	printf("MD_METADATA=ddf\n");
+	printf("MD_LEVEL=container\n");
+	/* the first five characters of the formatted name are skipped */
+	printf("MD_UUID=%s\n", uuid_text+5);
+}
+
+
+/* Placeholder for the detailed report: it currently prints nothing
+ * and uses neither of its arguments.
+ */
+static void detail_super_ddf(struct supertype *st, char *homehost)
+{
+ /* FIXME later
+ * Could print DDF GUID
+ * Need to find which array
+ * If whole, briefly list all arrays
+ * If one, give name
+ */
+}
+
+/*
+ * Print " UUID=<uuid>" (no newline) for the loaded metadata.
+ *
+ * FIXME I really need to know which array we are detailing.
+ * Can that be stored in ddf_super??
+ */
+static void brief_detail_super_ddf(struct supertype *st)
+{
+	char uuid_text[64];
+	struct mdinfo info;
+
+	getinfo_super_ddf(st, &info);
+	fname_from_uuid(st, &info, uuid_text,':');
+	/* the first five characters of the formatted name are skipped */
+	printf(" UUID=%s", uuid_text + 5);
+}
+#endif
+
+/*
+ * Return 1 if the loaded metadata belongs to 'homehost', else 0.
+ *
+ * It matches when the controller GUID starts with the eight bytes of
+ * T10 (i.e. it was written by a Linux-MD controller) and the controller
+ * vendor_data holds exactly 'homehost' followed by a NUL byte.
+ */
+static int match_home_ddf(struct supertype *st, char *homehost)
+{
+	struct ddf_super *ddf = st->sb;
+	int len = strlen(homehost);
+
+	if (memcmp(ddf->controller.guid, T10, 8) != 0)
+		return 0;
+	/* the name and its terminator must fit in vendor_data */
+	if (!(len < sizeof(ddf->controller.vendor_data)))
+		return 0;
+	if (memcmp(ddf->controller.vendor_data, homehost,len) != 0)
+		return 0;
+
+	return ddf->controller.vendor_data[len] == 0;
+}
+
+#ifndef MDASSEMBLE
+/*
+ * Return the config record of the virtual disk whose index in the
+ * virtual disk table is 'inst', or NULL if the config list has none.
+ */
+static struct vd_config *find_vdcr(struct ddf_super *ddf, int inst)
+{
+	struct vcl *node = ddf->conflist;
+
+	while (node != NULL) {
+		if (node->vcnum == inst)
+			return &node->conf;
+		node = node->next;
+	}
+	return NULL;
+}
+#endif
+
+/*
+ * Return the index of the physical disk entry whose refnum equals
+ * 'phys_refnum' (compared as stored, without byte swapping), searching
+ * all max_pdes entries.  Returns -1 when there is no such entry.
+ */
+static int find_phys(struct ddf_super *ddf, __u32 phys_refnum)
+{
+	int limit = __be16_to_cpu(ddf->phys->max_pdes);
+	int idx = 0;
+
+	while (idx < limit) {
+		if (ddf->phys->entries[idx].refnum == phys_refnum)
+			return idx;
+		idx++;
+	}
+	return -1;
+}
+
+/*
+ * Derive a 16-byte md-style uuid from a DDF GUID.
+ *
+ * The uuid is wanted in two situations: to identify a data array
+ * (bitmap files, reshape backup headers, re-adding a device) and to
+ * identify the device set as a whole.  A truncated hash of the GUID is
+ * good enough for both, as long as everyone derives it the same way.
+ *
+ * When a current configuration is selected (ddf->currentconf) its
+ * virtual disk GUID is used, otherwise the container GUID from the
+ * anchor header.  The uuid is the first 16 bytes of the SHA1 of that
+ * GUID.
+ *
+ * For a secondary virtual disk this identifies the BVD; the
+ * Secondary_Element_Seq is not (yet) mixed in.
+ */
+static void uuid_from_super_ddf(struct supertype *st, int uuid[4])
+{
+	struct ddf_super *ddf = st->sb;
+	struct vcl *current = ddf->currentconf;
+	char *guid = current ? current->conf.guid : ddf->anchor.guid;
+	struct sha1_ctx ctx;
+	char digest[20];
+
+	sha1_init_ctx(&ctx);
+	sha1_process_bytes(guid, DDF_GUID_LEN, &ctx);
+	sha1_finish_ctx(&ctx, digest);
+
+	memcpy(uuid, digest, 4*4);
+}
+
+static void getinfo_super_ddf_bvd(struct supertype *st, struct mdinfo *info);
+
+/*
+ * Fill 'info' from the loaded DDF metadata.
+ *
+ * When a member array is selected (ddf->currentconf is set) the work is
+ * delegated to getinfo_super_ddf_bvd().  Otherwise the container itself
+ * is described: level LEVEL_CONTAINER, one raid disk per used physical
+ * disk entry, creation time taken from bytes 16-19 of the anchor GUID.
+ *
+ * If a device has been loaded, its position is looked up in the
+ * physical disk table.  data_offset is set to that entry's config_size
+ * and component_size to what remains of the device beyond it.  When the
+ * device's refnum is not in the table, raid_disk is -1 and both values
+ * are set to 0 instead of indexing the table with -1.
+ */
+static void getinfo_super_ddf(struct supertype *st, struct mdinfo *info)
+{
+	struct ddf_super *ddf = st->sb;
+
+	if (ddf->currentconf) {
+		getinfo_super_ddf_bvd(st, info);
+		return;
+	}
+
+	info->array.raid_disks = __be16_to_cpu(ddf->phys->used_pdes);
+	info->array.level = LEVEL_CONTAINER;
+	info->array.layout = 0;
+	info->array.md_minor = -1;
+	info->array.ctime = DECADE + __be32_to_cpu(*(__u32*)
+						   (ddf->anchor.guid+16));
+	info->array.utime = 0;
+	info->array.chunk_size = 0;
+
+
+	info->disk.major = 0;
+	info->disk.minor = 0;
+	if (ddf->dlist) {
+		int pd = find_phys(ddf, ddf->dlist->disk.refnum);
+
+		info->disk.number = __be32_to_cpu(ddf->dlist->disk.refnum);
+		info->disk.raid_disk = pd;
+
+		if (pd >= 0) {
+			info->data_offset = __be64_to_cpu(ddf->phys->
+							  entries[pd].
+							  config_size);
+			info->component_size = ddf->dlist->size -
+				info->data_offset;
+		} else {
+			/* find_phys() found no entry for this device */
+			info->data_offset = 0;
+			info->component_size = 0;
+		}
+	} else {
+		info->disk.number = -1;
+		info->disk.raid_disk = -1;
+//		info->disk.raid_disk = find refnum in the table and use index;
+	}
+	info->disk.state = (1 << MD_DISK_SYNC);
+
+
+	info->reshape_active = 0;
+	info->name[0] = 0;
+
+	info->array.major_version = -1;
+	info->array.minor_version = -2;
+	strcpy(info->text_version, "ddf");
+	info->safe_mode_delay = 0;
+
+	uuid_from_super_ddf(st, info->uuid);
+
+}
+
+static int rlq_to_layout(int rlq, int prl, int raiddisks);
+
+/*
+ * Fill 'info' for the member array selected by ddf->currentconf.
+ *
+ * Geometry (raid disks, level, layout, chunk size) and times come from
+ * the selected config record.  data_offset and component_size are set
+ * only when ddf->currentdev is a valid index into the record's tables.
+ * The array is reported as fully in sync (resync_start = ~0) only when
+ * its virtual disk entry is consistent and fully initialised.
+ * text_version becomes "/<container device>/<subarray>" and the name is
+ * copied from the virtual disk entry with spaces turned into NULs.
+ *
+ * FIXME this returns BVD info - what if we want SVD ??
+ * NOTE(review): info->disk.raid_disk is read here before this function
+ * assigns it, so it is whatever the caller left there - confirm intent.
+ */
+static void getinfo_super_ddf_bvd(struct supertype *st, struct mdinfo *info)
+{
+	struct ddf_super *ddf = st->sb;
+	struct vcl *vc = ddf->currentconf;
+	struct vd_config *conf = &vc->conf;
+	struct virtual_entry *ve;
+	struct dl *dl;
+	int cd = ddf->currentdev;
+	int j;
+
+	info->array.raid_disks = __be16_to_cpu(conf->prim_elmnt_count);
+	info->array.level = map_num1(ddf_level_num, conf->prl);
+	info->array.layout = rlq_to_layout(conf->rlq, conf->prl,
+					   info->array.raid_disks);
+	info->array.md_minor = -1;
+	info->array.ctime = DECADE +
+		__be32_to_cpu(*(__u32*)(conf->guid+16));
+	info->array.utime = DECADE + __be32_to_cpu(conf->timestamp);
+	info->array.chunk_size = 512 << conf->chunk_shift;
+	info->custom_array_size = 0;
+
+	if (cd >= 0 && cd < ddf->mppe) {
+		info->data_offset = __be64_to_cpu(vc->lba_offset[cd]);
+		info->component_size = vc->block_sizes
+			? vc->block_sizes[cd]
+			: __be64_to_cpu(conf->blocks);
+	}
+
+	dl = ddf->dlist;
+	while (dl != NULL && dl->raiddisk != info->disk.raid_disk)
+		dl = dl->next;
+	info->disk.major = dl ? dl->major : 0;
+	info->disk.minor = dl ? dl->minor : 0;
+
+	/* the member index from the config list selects the entry used
+	 * for the resync decision ...
+	 */
+	info->container_member = vc->vcnum;
+	ve = &ddf->virt->entries[info->container_member];
+
+	info->resync_start = 0;
+	if (!(ve->state & DDF_state_inconsistent) &&
+	    (ve->init_state & DDF_initstate_mask) == DDF_init_full)
+		info->resync_start = ~0ULL;
+
+	uuid_from_super_ddf(st, info->uuid);
+
+	/* ... while the one named by the subarray string is what is
+	 * reported and used for the name
+	 */
+	info->container_member = atoi(st->subarray);
+	info->array.major_version = -1;
+	info->array.minor_version = -2;
+	sprintf(info->text_version, "/%s/%s",
+		devnum2devname(st->container_dev),
+		st->subarray);
+	info->safe_mode_delay = 200;
+
+	ve = &ddf->virt->entries[info->container_member];
+	memcpy(info->name, ve->name, 16);
+	info->name[16]=0;
+	for (j = 0; j < 16; j++) {
+		if (info->name[j] == ' ')
+			info->name[j] = 0;
+	}
+}
+
+
+/* Apply the named 'update' to the metadata.
+ * In this version every branch is a placeholder: nothing is modified
+ * and the function always returns 0.
+ */
+static int update_super_ddf(struct supertype *st, struct mdinfo *info,
+ char *update,
+ char *devname, int verbose,
+ int uuid_set, char *homehost)
+{
+ /* For 'assemble' and 'force' we need to return non-zero if any
+ * change was made. For others, the return value is ignored.
+ * Update options are:
+ * force-one : This device looks a bit old but needs to be included,
+ * update age info appropriately.
+ * assemble: clear any 'faulty' flag to allow this device to
+ * be assembled.
+ * force-array: Array is degraded but being forced, mark it clean
+ * if that will be needed to assemble it.
+ *
+ * newdev: not used ????
+ * grow: Array has gained a new device - this is currently for
+ * linear only
+ * resync: mark as dirty so a resync will happen.
+ * uuid: Change the uuid of the array to match what is given
+ * homehost: update the recorded homehost
+ * name: update the name - preserving the homehost
+ * _reshape_progress: record new reshape_progress position.
+ *
+ * Following are not relevant for this version:
+ * sparc2.2 : update from old dodgy metadata
+ * super-minor: change the preferred_minor number
+ * summaries: update redundant counters.
+ */
+ int rv = 0;
+// struct ddf_super *ddf = st->sb;
+// struct vd_config *vd = find_vdcr(ddf, info->container_member);
+// struct virtual_entry *ve = find_ve(ddf);
+
+ /* we don't need to handle "force-*" or "assemble" as
+ * there is no need to 'trick' the kernel. When the metadata is
+ * first updated to activate the array, all the implied modifications
+ * will just happen.
+ */
+
+ if (strcmp(update, "grow") == 0) {
+ /* FIXME */
+ }
+ if (strcmp(update, "resync") == 0) {
+// info->resync_checkpoint = 0;
+ }
+ /* We ignore UUID updates as they make even less sense
+ * with DDF
+ */
+ if (strcmp(update, "homehost") == 0) {
+ /* homehost is stored in controller->vendor_data,
+ * or it is when we are the vendor
+ */
+// if (info->vendor_is_local)
+// strcpy(ddf->controller.vendor_data, homehost);
+ }
+ if (strcmp(update, "name") == 0) {
+ /* name is stored in virtual_entry->name */
+// memset(ve->name, ' ', 16);
+// strncpy(ve->name, info->name, 16);
+ }
+ if (strcmp(update, "_reshape_progress") == 0) {
+ /* We don't support reshape yet */
+ }
+
+// update_all_csum(ddf);
+
+ return rv;
+}
+
+__u32 random32(void)
+{
+	/* Return 32 random bits.  /dev/urandom is preferred; if it
+	 * cannot be opened, or does not deliver 4 bytes, fall back to
+	 * random().
+	 */
+	__u32 value;
+	int fd = open("/dev/urandom", O_RDONLY);
+
+	if (fd < 0)
+		return random();
+
+	if (read(fd, &value, 4) != 4)
+		value = random();
+	close(fd);
+	return value;
+}
+
+static void make_header_guid(char *guid)
+{
+	/* Fill 'guid' with a DDF Header or Virtual Disk GUID.
+	 *
+	 * 24 bytes of fiction are required:
+	 *   bytes  0-7  : 'vendor-id', copied from T10 ("Linux-MD")
+	 *   bytes  8-15 : controller type, 0xDEADBEEF 0x00000000
+	 *   bytes 16-19 : timestamp (seconds, offset by DECADE)
+	 *   bytes 20-23 : random number
+	 * The fixed words and the timestamp are stored big-endian.
+	 */
+	const __u32 ctrl_hi = __cpu_to_be32(0xdeadbeef);
+	const __u32 ctrl_lo = __cpu_to_be32(0);
+	__u32 word;
+
+	memcpy(guid, T10, sizeof(T10));
+	memcpy(guid + 8, &ctrl_hi, 4);
+	memcpy(guid + 12, &ctrl_lo, 4);
+
+	word = __cpu_to_be32(time(0) - DECADE);
+	memcpy(guid + 16, &word, 4);
+
+	word = random32();
+	memcpy(guid + 20, &word, 4);
+}
+
+static int init_super_ddf_bvd(struct supertype *st,
+ mdu_array_info_t *info,
+ unsigned long long size,
+ char *name, char *homehost,
+ int *uuid); /* forward declaration: called by init_super_ddf when st->sb is already set; defined later in this file */
+
+static int init_super_ddf(struct supertype *st,
+			  mdu_array_info_t *info,
+			  unsigned long long size, char *name, char *homehost,
+			  int *uuid)
+{
+	/* This is primarily called by Create when creating a new array.
+	 * We will then get add_to_super called for each component, and then
+	 * write_init_super called to write it out to each device.
+	 * For DDF, Create can create on fresh devices or on a pre-existing
+	 * array.
+	 * To create on a pre-existing array (st->sb already set) the work
+	 * is handed to init_super_ddf_bvd.
+	 * The rest of this function is just for fresh drives.
+	 *
+	 * We need to create the entire 'ddf' structure which includes:
+	 *  DDF headers - these are easy.
+	 *  Controller data - a Sector describing this controller .. not that
+	 *                  this is a controller exactly.
+	 *  Physical Disk Record - one entry per device, so
+	 *			leave plenty of space.
+	 *  Virtual Disk Records - again, just leave plenty of space.
+	 *                   This just lists VDs, doesn't give details
+	 *  Config records - describes the VDs that use this disk
+	 *  DiskData  - describes 'this' device.
+	 *  BadBlockManagement - empty
+	 *  Diag Space - empty
+	 *  Vendor Logs - Could we put bitmaps here?
+	 *
+	 * Returns 1 when a superblock was set up (st->sb is set), 0 when
+	 * 'info' is NULL (st->sb is cleared) or when an allocation failed.
+	 * On allocation failure everything allocated so far is released
+	 * and st->sb is left untouched.
+	 */
+	struct ddf_super *ddf;
+	char hostname[17];
+	int hostlen;
+	int max_phys_disks, max_virt_disks;
+	unsigned long long sector;
+	int clen;
+	int i;
+	int pdsize, vdsize;
+	struct phys_disk *pd;
+	struct virtual_disk *vd;
+
+	if (!info) {
+		st->sb = NULL;
+		return 0;
+	}
+	if (st->sb)
+		return init_super_ddf_bvd(st, info, size, name, homehost,
+					  uuid);
+
+	if (posix_memalign((void**)&ddf, 512, sizeof(*ddf)) != 0) {
+		fprintf(stderr, Name ": %s could not allocate superblock\n", __func__);
+		return 0;
+	}
+	memset(ddf, 0, sizeof(*ddf));
+	ddf->dlist = NULL; /* no physical disks yet */
+	ddf->conflist = NULL; /* No virtual disks yet */
+
+	/* At least 32MB *must* be reserved for the ddf.  So let's just
+	 * start 32MB from the end, and put the primary header there.
+	 * Don't do secondary for now.
+	 * We don't know exactly where that will be yet as it could be
+	 * different on each device.  So just set up the lengths.
+	 */
+
+	ddf->anchor.magic = DDF_HEADER_MAGIC;
+	make_header_guid(ddf->anchor.guid);
+
+	memcpy(ddf->anchor.revision, DDF_REVISION_2, 8);
+	ddf->anchor.seq = __cpu_to_be32(1);
+	ddf->anchor.timestamp = __cpu_to_be32(time(0) - DECADE);
+	ddf->anchor.openflag = 0xFF;
+	ddf->anchor.foreignflag = 0;
+	ddf->anchor.enforcegroups = 0; /* Is this best?? */
+	ddf->anchor.pad0 = 0xff;
+	memset(ddf->anchor.pad1, 0xff, 12);
+	memset(ddf->anchor.header_ext, 0xff, 32);
+	ddf->anchor.primary_lba = ~(__u64)0;
+	ddf->anchor.secondary_lba = ~(__u64)0;
+	ddf->anchor.type = DDF_HEADER_ANCHOR;
+	memset(ddf->anchor.pad2, 0xff, 3);
+	ddf->anchor.workspace_len = __cpu_to_be32(32768); /* Must be reserved */
+	ddf->anchor.workspace_lba = ~(__u64)0; /* Put this at bottom
+						  of 32M reserved.. */
+	max_phys_disks = 1023; /* Should be enough */
+	ddf->anchor.max_pd_entries = __cpu_to_be16(max_phys_disks);
+	max_virt_disks = 255;
+	ddf->anchor.max_vd_entries = __cpu_to_be16(max_virt_disks); /* ?? */
+	ddf->anchor.max_partitions = __cpu_to_be16(64); /* ?? */
+	ddf->max_part = 64;
+	ddf->mppe = 256;
+	ddf->conf_rec_len = 1 + ROUND_UP(ddf->mppe * (4+8), 512)/512;
+	ddf->anchor.config_record_len = __cpu_to_be16(ddf->conf_rec_len);
+	ddf->anchor.max_primary_element_entries = __cpu_to_be16(ddf->mppe);
+	memset(ddf->anchor.pad3, 0xff, 54);
+	/* controller section is one sector long immediately
+	 * after the ddf header */
+	sector = 1;
+	ddf->anchor.controller_section_offset = __cpu_to_be32(sector);
+	ddf->anchor.controller_section_length = __cpu_to_be32(1);
+	sector += 1;
+
+	/* phys is 8 sectors after that */
+	pdsize = ROUND_UP(sizeof(struct phys_disk) +
+			  sizeof(struct phys_disk_entry)*max_phys_disks,
+			  512);
+	/* only these section lengths are accepted; anything else means
+	 * the structure sizes are not what this code expects */
+	switch(pdsize/512) {
+	case 2: case 8: case 32: case 128: case 512: break;
+	default: abort();
+	}
+	ddf->anchor.phys_section_offset = __cpu_to_be32(sector);
+	ddf->anchor.phys_section_length =
+		__cpu_to_be32(pdsize/512); /* max_primary_element_entries/8 */
+	sector += pdsize/512;
+
+	/* virt is another 32 sectors */
+	vdsize = ROUND_UP(sizeof(struct virtual_disk) +
+			  sizeof(struct virtual_entry) * max_virt_disks,
+			  512);
+	switch(vdsize/512) {
+	case 2: case 8: case 32: case 128: case 512: break;
+	default: abort();
+	}
+	ddf->anchor.virt_section_offset = __cpu_to_be32(sector);
+	ddf->anchor.virt_section_length =
+		__cpu_to_be32(vdsize/512); /* max_vd_entries/8 */
+	sector += vdsize/512;
+
+	/* one config record per partition plus one more
+	 * (__write_init_super_ddf writes the 'spare' record last) */
+	clen = ddf->conf_rec_len * (ddf->max_part+1);
+	ddf->anchor.config_section_offset = __cpu_to_be32(sector);
+	ddf->anchor.config_section_length = __cpu_to_be32(clen);
+	sector += clen;
+
+	ddf->anchor.data_section_offset = __cpu_to_be32(sector);
+	ddf->anchor.data_section_length = __cpu_to_be32(1);
+	sector += 1;
+
+	ddf->anchor.bbm_section_length = __cpu_to_be32(0);
+	ddf->anchor.bbm_section_offset = __cpu_to_be32(0xFFFFFFFF);
+	ddf->anchor.diag_space_length = __cpu_to_be32(0);
+	ddf->anchor.diag_space_offset = __cpu_to_be32(0xFFFFFFFF);
+	ddf->anchor.vendor_length = __cpu_to_be32(0);
+	ddf->anchor.vendor_offset = __cpu_to_be32(0xFFFFFFFF);
+
+	memset(ddf->anchor.pad4, 0xff, 256);
+
+	memcpy(&ddf->primary, &ddf->anchor, 512);
+	memcpy(&ddf->secondary, &ddf->anchor, 512);
+
+	ddf->primary.openflag = 1; /* I guess.. */
+	ddf->primary.type = DDF_HEADER_PRIMARY;
+
+	ddf->secondary.openflag = 1; /* I guess.. */
+	ddf->secondary.type = DDF_HEADER_SECONDARY;
+
+	ddf->active = &ddf->primary;
+
+	ddf->controller.magic = DDF_CONTROLLER_MAGIC;
+
+	/* 24 more bytes of fiction required.
+	 * first 8 are a 'vendor-id'  - "Linux-MD"
+	 * Remaining 16 are serial number.... maybe a hostname would do?
+	 * The hostname is right-aligned in the GUID and the gap after
+	 * the vendor-id is filled with spaces.
+	 */
+	memcpy(ddf->controller.guid, T10, sizeof(T10));
+	if (gethostname(hostname, sizeof(hostname)) != 0)
+		hostname[0] = 0; /* buffer content is unspecified on failure */
+	hostname[sizeof(hostname) - 1] = 0;
+	hostlen = strlen(hostname);
+	memcpy(ddf->controller.guid + 24 - hostlen, hostname, hostlen);
+	for (i = strlen(T10) ; i+hostlen < 24; i++)
+		ddf->controller.guid[i] = ' ';
+
+	ddf->controller.type.vendor_id = __cpu_to_be16(0xDEAD);
+	ddf->controller.type.device_id = __cpu_to_be16(0xBEEF);
+	ddf->controller.type.sub_vendor_id = 0;
+	ddf->controller.type.sub_device_id = 0;
+	memcpy(ddf->controller.product_id, "What Is My PID??", 16);
+	memset(ddf->controller.pad, 0xff, 8);
+	memset(ddf->controller.vendor_data, 0xff, 448);
+	if (homehost && strlen(homehost) < 440)
+		strcpy((char*)ddf->controller.vendor_data, homehost);
+
+	if (posix_memalign((void**)&pd, 512, pdsize) != 0) {
+		fprintf(stderr, Name ": %s could not allocate pd\n", __func__);
+		free(ddf);
+		return 0;
+	}
+	ddf->phys = pd;
+	ddf->pdsize = pdsize;
+
+	/* entries are all-ones (unused); only the header is zeroed */
+	memset(pd, 0xff, pdsize);
+	memset(pd, 0, sizeof(*pd));
+	pd->magic = DDF_PHYS_DATA_MAGIC;
+	pd->used_pdes = __cpu_to_be16(0);
+	pd->max_pdes = __cpu_to_be16(max_phys_disks);
+	memset(pd->pad, 0xff, 52);
+
+	if (posix_memalign((void**)&vd, 512, vdsize) != 0) {
+		fprintf(stderr, Name ": %s could not allocate vd\n", __func__);
+		free(pd);
+		free(ddf);
+		return 0;
+	}
+	ddf->virt = vd;
+	ddf->vdsize = vdsize;
+	memset(vd, 0, vdsize);
+	vd->magic = DDF_VIRT_RECORDS_MAGIC;
+	vd->populated_vdes = __cpu_to_be16(0);
+	vd->max_vdes = __cpu_to_be16(max_virt_disks);
+	memset(vd->pad, 0xff, 52);
+
+	/* an all-ones entry marks a free virtual disk slot */
+	for (i=0; i<max_virt_disks; i++)
+		memset(&vd->entries[i], 0xff, sizeof(struct virtual_entry));
+
+	st->sb = ddf;
+	ddf->updates_pending = 1;
+	return 1;
+}
+
+static int chunk_to_shift(int chunksize)
+{
+	/* Express 'chunksize' as a shift count: the zero-based position
+	 * of the lowest set bit of chunksize/512.  When chunksize/512 is
+	 * zero, ffs() yields 0 and the result is -1.
+	 */
+	int units = chunksize / 512;
+
+	return ffs(units) - 1;
+}
+
+static int level_to_prl(int level)
+{
+	/* Translate an md level number into the DDF primary RAID level
+	 * constant.  Returns -1 for levels with no mapping.
+	 */
+	if (level == LEVEL_LINEAR)
+		return DDF_CONCAT;
+	if (level == 0)
+		return DDF_RAID0;
+	if (level == 1)
+		return DDF_RAID1;
+	if (level == 4)
+		return DDF_RAID4;
+	if (level == 5)
+		return DDF_RAID5;
+	if (level == 6)
+		return DDF_RAID6;
+	return -1;
+}
+static int layout_to_rlq(int level, int layout, int raiddisks)
+{
+	/* Translate an md (level, layout, raiddisks) triple into the DDF
+	 * RAID level qualifier.  Returns -1 when there is no mapping.
+	 *
+	 * Each level is handled on its own: a layout that is not known
+	 * for RAID5 must not fall through and be looked up in the RAID6
+	 * table, so every inner switch is followed by a break.
+	 */
+	switch(level) {
+	case 0:
+		return DDF_RAID0_SIMPLE;
+	case 1:
+		switch(raiddisks) {
+		case 2: return DDF_RAID1_SIMPLE;
+		case 3: return DDF_RAID1_MULTI;
+		default: return -1;
+		}
+	case 4:
+		switch(layout) {
+		case 0: return DDF_RAID4_N;
+		}
+		break;
+	case 5:
+		switch(layout) {
+		case ALGORITHM_LEFT_ASYMMETRIC:
+			return DDF_RAID5_N_RESTART;
+		case ALGORITHM_RIGHT_ASYMMETRIC:
+			return DDF_RAID5_0_RESTART;
+		case ALGORITHM_LEFT_SYMMETRIC:
+			return DDF_RAID5_N_CONTINUE;
+		case ALGORITHM_RIGHT_SYMMETRIC:
+			return -1; /* not mentioned in standard */
+		}
+		break;
+	case 6:
+		/* The RAID5 qualifier names are used for two of the RAID6
+		 * layouts here; rlq_to_layout does the same in reverse.
+		 */
+		switch(layout) {
+		case ALGORITHM_ROTATING_N_RESTART:
+			return DDF_RAID5_N_RESTART;
+		case ALGORITHM_ROTATING_ZERO_RESTART:
+			return DDF_RAID6_0_RESTART;
+		case ALGORITHM_ROTATING_N_CONTINUE:
+			return DDF_RAID5_N_CONTINUE;
+		}
+		break;
+	}
+	return -1;
+}
+
+static int rlq_to_layout(int rlq, int prl, int raiddisks)
+{
+	/* Translate a DDF (primary RAID level, RAID level qualifier)
+	 * pair back into an md layout number.  Returns -1 when the
+	 * combination is not supported.  'raiddisks' is not consulted.
+	 */
+	if (prl == DDF_RAID0)
+		return 0; /* hopefully rlq == DDF_RAID0_SIMPLE */
+
+	if (prl == DDF_RAID1)
+		return 0; /* hopefully rlq == SIMPLE or MULTI depending
+			     on raiddisks*/
+
+	if (prl == DDF_RAID4) {
+		/* anything but DDF_RAID4_N is not supported.
+		 * FIXME the -1 isn't checked */
+		return rlq == DDF_RAID4_N ? 0 : -1;
+	}
+
+	if (prl == DDF_RAID5) {
+		switch(rlq) {
+		case DDF_RAID5_N_RESTART:
+			return ALGORITHM_LEFT_ASYMMETRIC;
+		case DDF_RAID5_0_RESTART:
+			return ALGORITHM_RIGHT_ASYMMETRIC;
+		case DDF_RAID5_N_CONTINUE:
+			return ALGORITHM_LEFT_SYMMETRIC;
+		default:
+			return -1;
+		}
+	}
+
+	if (prl == DDF_RAID6) {
+		switch(rlq) {
+		case DDF_RAID5_N_RESTART:
+			return ALGORITHM_ROTATING_N_RESTART;
+		case DDF_RAID6_0_RESTART:
+			return ALGORITHM_ROTATING_ZERO_RESTART;
+		case DDF_RAID5_N_CONTINUE:
+			return ALGORITHM_ROTATING_N_CONTINUE;
+		default:
+			return -1;
+		}
+	}
+
+	return -1;
+}
+
+#ifndef MDASSEMBLE
+struct extent {
+	/* One used region of a physical disk as reported by
+	 * get_extents: 'start' is taken from the configuration's
+	 * lba_offset, 'size' from its 'blocks' field.  An entry with
+	 * size 0 terminates the list.
+	 */
+	unsigned long long start;
+	unsigned long long size;
+};
+static int cmp_extent(const void *av, const void *bv)
+{
+	/* qsort comparator: order extents by ascending 'start'.
+	 * Yields -1, 0 or 1.
+	 */
+	const struct extent *lhs = av;
+	const struct extent *rhs = bv;
+
+	return (lhs->start > rhs->start) - (lhs->start < rhs->start);
+}
+
+static struct extent *get_extents(struct ddf_super *ddf, struct dl *dl)
+{
+	/* Find the list of used extents on the given physical device
+	 * 'dl' of the given ddf.
+	 * Returns a malloced array of 'struct extent' sorted by start,
+	 * which the caller must free, or NULL if allocation fails.
+	 * The array is terminated by an entry with size 0 whose start
+	 * is the device's config_size, so callers can treat the end of
+	 * the usable space like the start of one more extent.
+	 *
+	 * FIXME ignore DDF_Legacy devices?
+	 */
+	struct extent *rv;
+	int n = 0;
+	int i, j;
+
+	/* at most max_part extents plus the terminator */
+	rv = malloc(sizeof(struct extent) * (ddf->max_part + 2));
+	if (!rv)
+		return NULL;
+
+	for (i = 0; i < ddf->max_part; i++) {
+		struct vcl *v = dl->vlist[i];
+		int members;
+
+		if (v == NULL)
+			continue;
+		/* prim_elmnt_count is stored big-endian like the other
+		 * on-disk fields, so convert before using it as a bound.
+		 */
+		members = __be16_to_cpu(v->conf.prim_elmnt_count);
+		for (j = 0; j < members; j++)
+			if (v->conf.phys_refnum[j] == dl->disk.refnum) {
+				/* This device plays role 'j' in 'v'. */
+				rv[n].start = __be64_to_cpu(v->lba_offset[j]);
+				rv[n].size = __be64_to_cpu(v->conf.blocks);
+				n++;
+				break;
+			}
+	}
+	qsort(rv, n, sizeof(*rv), cmp_extent);
+
+	rv[n].start = __be64_to_cpu(ddf->phys->entries[dl->pdnum].config_size);
+	rv[n].size = 0;
+	return rv;
+}
+#endif
+
+static int init_super_ddf_bvd(struct supertype *st,
+			      mdu_array_info_t *info,
+			      unsigned long long size,
+			      char *name, char *homehost,
+			      int *uuid)
+{
+	/* We are creating a BVD inside a pre-existing container,
+	 * so st->sb is already set.
+	 * We need to create a new vd_config and a new virtual_entry.
+	 *
+	 * Returns 1 on success, 0 if the container has no free virtual
+	 * disk slot or memory cannot be allocated.  The container is
+	 * only modified once nothing can fail any more.
+	 * On success the new configuration becomes ddf->currentconf and
+	 * st->subarray holds its index in decimal.
+	 */
+	struct ddf_super *ddf = st->sb;
+	int venum;
+	int i;
+	unsigned long long dev_blocks;
+	struct virtual_entry *ve;
+	struct vcl *vcl;
+	struct vd_config *vc;
+
+	if (__be16_to_cpu(ddf->virt->populated_vdes)
+	    >= __be16_to_cpu(ddf->virt->max_vdes)) {
+		fprintf(stderr, Name": This ddf already has the "
+			"maximum of %d virtual devices\n",
+			__be16_to_cpu(ddf->virt->max_vdes));
+		return 0;
+	}
+
+	/* a free slot is one whose guid is still all 0xff */
+	for (venum = 0; venum < __be16_to_cpu(ddf->virt->max_vdes); venum++)
+		if (all_ff(ddf->virt->entries[venum].guid))
+			break;
+	if (venum == __be16_to_cpu(ddf->virt->max_vdes)) {
+		fprintf(stderr, Name ": Cannot find spare slot for "
+			"virtual disk - DDF is corrupt\n");
+		return 0;
+	}
+
+	/* Allocate the vd_config before touching the virtual entry
+	 * table, so that a failure leaves the container unchanged.
+	 */
+	if (posix_memalign((void**)&vcl, 512,
+		  (offsetof(struct vcl, conf) + ddf->conf_rec_len * 512)) != 0) {
+		fprintf(stderr, Name ": %s could not allocate vd_config\n", __func__);
+		return 0;
+	}
+
+	ve = &ddf->virt->entries[venum];
+
+	/* A Virtual Disk GUID contains the T10 Vendor ID, controller type,
+	 * timestamp, random number
+	 */
+	make_header_guid(ve->guid);
+	ve->unit = __cpu_to_be16(info->md_minor);
+	ve->pad0 = 0xFFFF;
+	ve->guid_crc = crc32(0, (unsigned char*)ddf->anchor.guid, DDF_GUID_LEN);
+	ve->type = 0;
+	ve->state = DDF_state_degraded; /* Will be modified as devices are added */
+	if (info->state & 1) /* clean */
+		ve->init_state = DDF_init_full;
+	else
+		ve->init_state = DDF_init_not;
+
+	memset(ve->pad1, 0xff, 14);
+	memset(ve->name, ' ', 16);
+	/* NOTE(review): strncpy pads with NULs, so for names shorter
+	 * than 16 bytes the space padding above is overwritten -
+	 * confirm which padding is wanted.
+	 */
+	if (name)
+		strncpy(ve->name, name, 16);
+	ddf->virt->populated_vdes =
+		__cpu_to_be16(__be16_to_cpu(ddf->virt->populated_vdes)+1);
+
+	/* Now fill in the new vd_config */
+	vcl->lba_offset = (__u64*) &vcl->conf.phys_refnum[ddf->mppe];
+	vcl->vcnum = venum;
+	sprintf(st->subarray, "%d", venum);
+	vcl->block_sizes = NULL; /* FIXME not for CONCAT */
+
+	vc = &vcl->conf;
+
+	/* info->size is doubled to get the per-device block count;
+	 * widen first so the doubling cannot overflow.
+	 */
+	dev_blocks = (unsigned long long)info->size * 2;
+
+	vc->magic = DDF_VD_CONF_MAGIC;
+	memcpy(vc->guid, ve->guid, DDF_GUID_LEN);
+	vc->timestamp = __cpu_to_be32(time(0)-DECADE);
+	vc->seqnum = __cpu_to_be32(1);
+	memset(vc->pad0, 0xff, 24);
+	vc->prim_elmnt_count = __cpu_to_be16(info->raid_disks);
+	vc->chunk_shift = chunk_to_shift(info->chunk_size);
+	vc->prl = level_to_prl(info->level);
+	vc->rlq = layout_to_rlq(info->level, info->layout, info->raid_disks);
+	vc->sec_elmnt_count = 1;
+	vc->sec_elmnt_seq = 0;
+	vc->srl = 0;
+	vc->blocks = __cpu_to_be64(dev_blocks);
+	vc->array_blocks = __cpu_to_be64(
+		calc_array_size(info->level, info->raid_disks, info->layout,
+				info->chunk_size, dev_blocks));
+	memset(vc->pad1, 0xff, 8);
+	for (i = 0; i < 8; i++)
+		vc->spare_refs[i] = 0xffffffff;
+	memset(vc->cache_pol, 0, 8);
+	vc->bg_rate = 0x80;
+	memset(vc->pad2, 0xff, 3);
+	memset(vc->pad3, 0xff, 52);
+	memset(vc->pad4, 0xff, 192);
+	memset(vc->v0, 0xff, 32);
+	memset(vc->v1, 0xff, 32);
+	memset(vc->v2, 0xff, 16);
+	memset(vc->v3, 0xff, 16);
+	memset(vc->vendor, 0xff, 32);
+
+	/* no member devices yet: all refnums unset, all offsets zero */
+	memset(vc->phys_refnum, 0xff, 4*ddf->mppe);
+	memset(vc->phys_refnum+ddf->mppe, 0x00, 8*ddf->mppe);
+
+	vcl->next = ddf->conflist;
+	ddf->conflist = vcl;
+	ddf->currentconf = vcl;
+	ddf->updates_pending = 1;
+	return 1;
+}
+
+#ifndef MDASSEMBLE
+static void add_to_super_ddf_bvd(struct supertype *st,
+				 mdu_disk_info_t *dk, int fd, char *devname)
+{
+	/* fd and devname identify a device within the ddf container (st).
+	 * dk identifies a location in the new BVD.
+	 * We need to find suitable free space in that device and update
+	 * the phys_refnum and lba_offset for the newly created vd_config.
+	 * We might also want to update the type in the phys_disk
+	 * section.
+	 *
+	 * Alternately: fd == -1 and we have already chosen which device to
+	 * use and recorded it in the dlist entry's 'raiddisk'.
+	 *
+	 * Nothing is changed if the device is not found, is not marked
+	 * in-sync, has no free extent big enough, or has no free slot
+	 * left in its list of configurations.
+	 */
+	struct dl *dl;
+	struct ddf_super *ddf = st->sb;
+	struct vd_config *vc;
+	__u64 *lba_offset;
+	int working;
+	int i;
+	int slot;
+	unsigned long long blocks, pos, esize;
+	struct extent *ex;
+
+	if (fd == -1) {
+		for (dl = ddf->dlist; dl ; dl = dl->next)
+			if (dl->raiddisk == dk->raid_disk)
+				break;
+	} else {
+		for (dl = ddf->dlist; dl ; dl = dl->next)
+			if (dl->major == dk->major &&
+			    dl->minor == dk->minor)
+				break;
+	}
+	if (!dl || ! (dk->state & (1<<MD_DISK_SYNC)))
+		return;
+
+	vc = &ddf->currentconf->conf;
+	lba_offset = ddf->currentconf->lba_offset;
+
+	ex = get_extents(ddf, dl);
+	if (!ex)
+		return;
+
+	i = 0; pos = 0;
+	blocks = __be64_to_cpu(vc->blocks);
+	if (ddf->currentconf->block_sizes)
+		blocks = ddf->currentconf->block_sizes[dk->raid_disk];
+
+	/* First fit: walk the gaps between used extents; the list ends
+	 * with a zero-sized entry at the end of the usable space.
+	 */
+	do {
+		esize = ex[i].start - pos;
+		if (esize >= blocks)
+			break;
+		pos = ex[i].start + ex[i].size;
+		i++;
+	} while (ex[i-1].size);
+
+	free(ex);
+	if (esize < blocks)
+		return;
+
+	/* Make sure the device can record one more configuration
+	 * before anything is modified.
+	 */
+	for (slot = 0; slot < ddf->max_part ; slot++)
+		if (dl->vlist[slot] == NULL)
+			break;
+	if (slot == ddf->max_part)
+		return;
+
+	ddf->currentdev = dk->raid_disk;
+	vc->phys_refnum[dk->raid_disk] = dl->disk.refnum;
+	lba_offset[dk->raid_disk] = __cpu_to_be64(pos);
+	dl->vlist[slot] = ddf->currentconf;
+
+	if (fd >= 0)
+		dl->fd = fd;
+	if (devname)
+		dl->devname = devname;
+
+	/* Check how many working raid_disks, and if we can mark
+	 * array as optimal yet
+	 */
+	working = 0;
+
+	for (i=0; i < __be16_to_cpu(vc->prim_elmnt_count); i++)
+		if (vc->phys_refnum[i] != 0xffffffff)
+			working++;
+
+	/* Find which virtual_entry */
+	i = ddf->currentconf->vcnum;
+	if (working == __be16_to_cpu(vc->prim_elmnt_count))
+		ddf->virt->entries[i].state =
+			(ddf->virt->entries[i].state & ~DDF_state_mask)
+			| DDF_state_optimal;
+
+	/* RAID6 with exactly one member missing is 'partially optimal' */
+	if (vc->prl == DDF_RAID6 &&
+	    working+1 == __be16_to_cpu(vc->prim_elmnt_count))
+		ddf->virt->entries[i].state =
+			(ddf->virt->entries[i].state & ~DDF_state_mask)
+			| DDF_state_part_optimal;
+
+	ddf->phys->entries[dl->pdnum].type &= ~__cpu_to_be16(DDF_Global_Spare);
+	ddf->phys->entries[dl->pdnum].type |= __cpu_to_be16(DDF_Active_in_VD);
+	ddf->updates_pending = 1;
+}
+
+/* add a device to a container, either while creating it or while
+ * expanding a pre-existing container
+ */
+static int add_to_super_ddf(struct supertype *st,
+			     mdu_disk_info_t *dk, int fd, char *devname)
+{
+	/* Add the device open on 'fd' to the container.
+	 *
+	 * If a BVD is currently being created (ddf->currentconf is set)
+	 * the request is passed to add_to_super_ddf_bvd and 0 is
+	 * returned.  Otherwise a disk_data record and a phys_disk entry
+	 * are created for the device.  With st->update_tail set the new
+	 * entry is prepared as a separate update and the device is put
+	 * on ddf->add_list; without it the in-memory tables are updated
+	 * directly and the device is put on ddf->dlist.
+	 *
+	 * Returns 0 on success, 1 if the device cannot be examined,
+	 * is too small, the physical disk table is full, or memory
+	 * cannot be allocated.
+	 */
+	struct ddf_super *ddf = st->sb;
+	struct dl *dd;
+	time_t now;
+	struct tm *tm;
+	unsigned long long size;
+	struct phys_disk_entry *pde;
+	int n, i;
+	__u32 rnd;
+	struct stat stb;
+
+	if (ddf->currentconf) {
+		add_to_super_ddf_bvd(st, dk, fd, devname);
+		return 0;
+	}
+
+	/* This is device numbered dk->number.  We need to create
+	 * a phys_disk entry and a more detailed disk_data entry.
+	 */
+	if (fstat(fd, &stb) != 0) {
+		fprintf(stderr, Name ": %s could not stat device: %s\n",
+			__func__, strerror(errno));
+		return 1;
+	}
+	if (!get_dev_size(fd, NULL, &size)) {
+		fprintf(stderr, Name ": %s could not determine device size\n",
+			__func__);
+		return 1;
+	}
+	/* We are required to reserve 32Meg; a smaller device would
+	 * make the config_size computation below wrap around.
+	 */
+	if (size < 32*1024*1024) {
+		fprintf(stderr, Name ": %s device is too small for DDF\n",
+			__func__);
+		return 1;
+	}
+
+	/* the new entry goes into slot 'n', which must exist */
+	n = __be16_to_cpu(ddf->phys->used_pdes);
+	if (n >= __be16_to_cpu(ddf->phys->max_pdes)) {
+		fprintf(stderr, Name
+			": %s no room for another physical disk\n",
+			__func__);
+		return 1;
+	}
+
+	if (posix_memalign((void**)&dd, 512,
+		           sizeof(*dd) + sizeof(dd->vlist[0]) * ddf->max_part) != 0) {
+		fprintf(stderr, Name
+			": %s could not allocate buffer for new disk, aborting\n",
+			__func__);
+		return 1;
+	}
+	dd->major = major(stb.st_rdev);
+	dd->minor = minor(stb.st_rdev);
+	dd->devname = devname;
+	dd->fd = fd;
+	dd->spare = NULL;
+
+	dd->disk.magic = DDF_PHYS_DATA_MAGIC;
+	/* GUID: vendor id, creation date as YYYYMMDD, then 8 random
+	 * bytes.  The NUL written by sprintf lands on byte 16 and is
+	 * overwritten by the random data.
+	 */
+	now = time(0);
+	tm = localtime(&now);
+	sprintf(dd->disk.guid, "%8s%04d%02d%02d",
+		T10, tm->tm_year+1900, tm->tm_mon+1, tm->tm_mday);
+	rnd = random32();
+	memcpy(dd->disk.guid + 16, &rnd, 4);
+	rnd = random32();
+	memcpy(dd->disk.guid + 20, &rnd, 4);
+
+	do {
+		/* Cannot be bothered finding a CRC of some irrelevant details*/
+		dd->disk.refnum = random32();
+		/* retry until the refnum is not used by any other entry */
+		for (i = __be16_to_cpu(ddf->active->max_pd_entries) - 1;
+		     i >= 0; i--)
+			if (ddf->phys->entries[i].refnum == dd->disk.refnum)
+				break;
+	} while (i >= 0);
+
+	dd->disk.forced_ref = 1;
+	dd->disk.forced_guid = 1;
+	memset(dd->disk.vendor, ' ', 32);
+	memcpy(dd->disk.vendor, "Linux", 5);
+	memset(dd->disk.pad, 0xff, 442);
+	for (i = 0; i < ddf->max_part ; i++)
+		dd->vlist[i] = NULL;
+
+	pde = &ddf->phys->entries[n];
+	dd->pdnum = n;
+
+	if (st->update_tail) {
+		/* Build a one-entry phys_disk record instead of touching
+		 * the live table; used_pdes carries the target index.
+		 */
+		int len = (sizeof(struct phys_disk) +
+			   sizeof(struct phys_disk_entry));
+		struct phys_disk *pd;
+
+		pd = malloc(len);
+		if (!pd) {
+			fprintf(stderr, Name
+				": %s could not allocate metadata update\n",
+				__func__);
+			free(dd);
+			return 1;
+		}
+		pd->magic = DDF_PHYS_RECORDS_MAGIC;
+		pd->used_pdes = __cpu_to_be16(n);
+		pde = &pd->entries[0];
+		dd->mdupdate = pd;
+	} else {
+		n++;
+		ddf->phys->used_pdes = __cpu_to_be16(n);
+	}
+
+	memcpy(pde->guid, dd->disk.guid, DDF_GUID_LEN);
+	pde->refnum = dd->disk.refnum;
+	pde->type = __cpu_to_be16(DDF_Forced_PD_GUID | DDF_Global_Spare);
+	pde->state = __cpu_to_be16(DDF_Online);
+	/* We are required to reserve 32Meg, and record the size in sectors */
+	pde->config_size = __cpu_to_be64( (size - 32*1024*1024) / 512);
+	sprintf(pde->path, "%17.17s","Information: nil") ;
+	memset(pde->pad, 0xff, 6);
+
+	dd->size = size >> 9;
+	if (st->update_tail) {
+		dd->next = ddf->add_list;
+		ddf->add_list = dd;
+	} else {
+		dd->next = ddf->dlist;
+		ddf->dlist = dd;
+		ddf->updates_pending = 1;
+	}
+
+	return 0;
+}
+
+/*
+ * This is the write_init_super method for a ddf container. It is
+ * called when creating a container or adding another device to a
+ * container.
+ */
+
+static unsigned char null_conf[4096+512];
+
+static int __write_init_super_ddf(struct supertype *st, int do_close)
+{
+	/* Write the complete DDF metadata to every device on
+	 * ddf->dlist that has an open fd.
+	 *
+	 * Per device: primary header, controller data, physical disk
+	 * records, virtual disk records, one config record per
+	 * partition slot plus the spare record, the disk data, and
+	 * finally the anchor in the last sector.
+	 * A device that cannot be positioned or written completely is
+	 * abandoned and the next one is tried.
+	 *
+	 * If 'do_close' is set all device fds are closed afterwards.
+	 *
+	 * Returns non-zero if any attempted device was not written
+	 * successfully, 0 otherwise.
+	 */
+	struct ddf_super *ddf = st->sb;
+	int i;
+	struct dl *d;
+	int n_config;
+	int conf_size;
+	int attempts = 0;
+	int successes = 0;
+	unsigned long long size, sector;
+
+	/* try to write updated metadata,
+	 * if we catch a failure move on to the next disk
+	 */
+	for (d = ddf->dlist; d; d=d->next) {
+		int fd = d->fd;
+
+		if (fd < 0)
+			continue;
+
+		attempts++;
+		/* We need to fill in the primary, (secondary) and workspace
+		 * lba's in the headers, set their checksums,
+		 * Also checksum phys, virt....
+		 *
+		 * Then write everything out, finally the anchor is written.
+		 */
+		if (!get_dev_size(fd, NULL, &size))
+			continue;
+		size /= 512;
+		ddf->anchor.workspace_lba = __cpu_to_be64(size - 32*1024*2);
+		ddf->anchor.primary_lba = __cpu_to_be64(size - 16*1024*2);
+		ddf->anchor.seq = __cpu_to_be32(1);
+		memcpy(&ddf->primary, &ddf->anchor, 512);
+		memcpy(&ddf->secondary, &ddf->anchor, 512);
+
+		ddf->anchor.openflag = 0xFF; /* 'open' means nothing */
+		ddf->anchor.seq = 0xFFFFFFFF; /* no sequencing in anchor */
+		ddf->anchor.crc = calc_crc(&ddf->anchor, 512);
+
+		ddf->primary.openflag = 0;
+		ddf->primary.type = DDF_HEADER_PRIMARY;
+
+		ddf->secondary.openflag = 0;
+		ddf->secondary.type = DDF_HEADER_SECONDARY;
+
+		ddf->primary.crc = calc_crc(&ddf->primary, 512);
+		ddf->secondary.crc = calc_crc(&ddf->secondary, 512);
+
+		sector = size - 16*1024*2;
+		if (lseek64(fd, sector<<9, 0) < 0)
+			continue;
+		if (write(fd, &ddf->primary, 512) != 512)
+			continue;
+
+		ddf->controller.crc = calc_crc(&ddf->controller, 512);
+		if (write(fd, &ddf->controller, 512) != 512)
+			continue;
+
+		ddf->phys->crc = calc_crc(ddf->phys, ddf->pdsize);
+
+		if (write(fd, ddf->phys, ddf->pdsize) != ddf->pdsize)
+			continue;
+
+		ddf->virt->crc = calc_crc(ddf->virt, ddf->vdsize);
+		if (write(fd, ddf->virt, ddf->vdsize) != ddf->vdsize)
+			continue;
+
+		/* Now write lots of config records. */
+		n_config = ddf->max_part;
+		conf_size = ddf->conf_rec_len * 512;
+		for (i = 0 ; i <= n_config ; i++) {
+			struct vcl *c;
+
+			/* the last record is the spare assignment;
+			 * vlist only has n_config entries */
+			if (i == n_config)
+				c = (struct vcl*)d->spare;
+			else
+				c = d->vlist[i];
+
+			if (c) {
+				c->conf.crc = calc_crc(&c->conf, conf_size);
+				if (write(fd, &c->conf, conf_size) != conf_size)
+					break;
+			} else {
+				/* unused record: fill it with 0xff, written
+				 * from a 512-aligned window of null_conf in
+				 * pieces of at most 4096 bytes */
+				char *null_aligned = (char*)((((unsigned long)null_conf)+511)&~511UL);
+				int piece_max = sizeof(null_conf)-512;
+				int togo = conf_size;
+
+				if (null_conf[0] != 0xff)
+					memset(null_conf, 0xff, sizeof(null_conf));
+				while (togo > 0) {
+					int len = togo > piece_max ? piece_max : togo;
+
+					if (write(fd, null_aligned, len) != len)
+						break;
+					togo -= len;
+				}
+				if (togo > 0)
+					break;
+			}
+		}
+		if (i <= n_config)
+			continue;
+		d->disk.crc = calc_crc(&d->disk, 512);
+		if (write(fd, &d->disk, 512) != 512)
+			continue;
+
+		/* Maybe do the same for secondary */
+
+		if (lseek64(fd, (size-1)*512, SEEK_SET) < 0)
+			continue;
+		if (write(fd, &ddf->anchor, 512) != 512)
+			continue;
+		successes++;
+	}
+
+	if (do_close)
+		for (d = ddf->dlist; d; d=d->next) {
+			if (d->fd >= 0)
+				close(d->fd);
+			d->fd = -1;
+		}
+
+	return attempts != successes;
+}
+
+static int write_init_super_ddf(struct supertype *st)
+{
+	/* Commit newly initialised metadata.
+	 *
+	 * Without st->update_tail everything is written straight to the
+	 * devices (and their fds are closed) by __write_init_super_ddf.
+	 * With st->update_tail the changes are queued as metadata
+	 * updates instead: either the phys_disk record of the disk at
+	 * the head of ddf->add_list, or - for a newly created VD - a
+	 * one-entry virtual_disk record followed by its vd_config.
+	 *
+	 * Returns 0 on success, non-zero on failure.
+	 */
+	struct ddf_super *ddf;
+	struct virtual_disk *vd;
+	struct vd_config *vc;
+	int vd_len, vc_len;
+
+	if (!st->update_tail)
+		return __write_init_super_ddf(st, 1);
+
+	/* queue the virtual_disk and vd_config as metadata updates */
+	ddf = st->sb;
+
+	if (!ddf->currentconf) {
+		int len = (sizeof(struct phys_disk) +
+			   sizeof(struct phys_disk_entry));
+
+		/* adding a disk to the container. */
+		if (!ddf->add_list)
+			return 0;
+
+		append_metadata_update(st, ddf->add_list->mdupdate, len);
+		ddf->add_list->mdupdate = NULL;
+		return 0;
+	}
+
+	/* Newly created VD */
+
+	/* Get both buffers before queueing anything so that a failed
+	 * allocation cannot leave half of the update queued.
+	 */
+	vd_len = sizeof(struct virtual_disk) + sizeof(struct virtual_entry);
+	vc_len = ddf->conf_rec_len * 512;
+	vd = malloc(vd_len);
+	vc = malloc(vc_len);
+	if (!vd || !vc) {
+		fprintf(stderr, Name ": %s could not allocate metadata update\n",
+			__func__);
+		free(vd);
+		free(vc);
+		return 1;
+	}
+
+	/* First the virtual disk.  We have a slightly fake header:
+	 * populated_vdes carries the index of the entry being sent.
+	 */
+	*vd = *ddf->virt;
+	vd->entries[0] = ddf->virt->entries[ddf->currentconf->vcnum];
+	vd->populated_vdes = __cpu_to_be16(ddf->currentconf->vcnum);
+	append_metadata_update(st, vd, vd_len);
+
+	/* Then the vd_config */
+	memcpy(vc, &ddf->currentconf->conf, vc_len);
+	append_metadata_update(st, vc, vc_len);
+
+	/* FIXME I need to close the fds! */
+	return 0;
+}
+
+#endif
+
+static __u64 avail_size_ddf(struct supertype *st, __u64 devsize)
+{
+	/* Space left for data once the last 32Meg (32*1024*2 units of
+	 * 'devsize') have been set aside; 0 if the device is not
+	 * bigger than that reservation.
+	 */
+	const __u64 reserved = 32*1024*2;
+
+	return devsize > reserved ? devsize - reserved : 0;
+}
+
+#ifndef MDASSEMBLE
+
+static int reserve_space(struct supertype *st, int raiddisks,
+			 unsigned long long size, int chunk,
+			 unsigned long long *freesize)
+{
+	/* Find 'raiddisks' spare extents at least 'size' big (but
+	 * only caring about multiples of 'chunk') and remember
+	 * them.
+	 * If they cannot be found, fail.
+	 *
+	 * With size == 0 the largest size that at least 'raiddisks'
+	 * devices can provide is chosen, rounded down to a multiple of
+	 * 'chunk', and stored in *freesize.
+	 *
+	 * The chosen devices are marked by setting their 'raiddisk' to
+	 * 0..raiddisks-1; all others are left at -1.
+	 * Returns 1 on success, 0 on failure.
+	 */
+	struct dl *dl;
+	struct ddf_super *ddf = st->sb;
+	int cnt = 0;
+
+	for (dl = ddf->dlist; dl ; dl=dl->next) {
+		dl->raiddisk = -1;
+		dl->esize = 0;
+	}
+	/* Now find largest extent on each device */
+	for (dl = ddf->dlist ; dl ; dl=dl->next) {
+		struct extent *e = get_extents(ddf, dl);
+		unsigned long long pos = 0;
+		int i = 0;
+		int found = 0;
+		unsigned long long minsize = size;
+
+		if (size == 0)
+			minsize = chunk;
+
+		if (!e)
+			continue;
+		/* minsize grows to the largest acceptable gap seen */
+		do {
+			unsigned long long esize;
+			esize = e[i].start - pos;
+			if (esize >= minsize) {
+				found = 1;
+				minsize = esize;
+			}
+			pos = e[i].start + e[i].size;
+			i++;
+		} while (e[i-1].size);
+		if (found) {
+			cnt++;
+			dl->esize = minsize;
+		}
+		free(e);
+	}
+	if (cnt < raiddisks) {
+		fprintf(stderr, Name ": not enough devices with space to create array.\n");
+		return 0; /* No enough free spaces large enough */
+	}
+	if (size == 0) {
+		/* choose the largest size of which there are at least 'raiddisk' */
+		for (dl = ddf->dlist ; dl ; dl=dl->next) {
+			struct dl *dl2;
+			if (dl->esize <= size)
+				continue;
+			/* This is bigger than 'size', see if there are enough.
+			 * Every device counts, including those earlier in
+			 * the list than 'dl'.
+			 */
+			cnt = 0;
+			for (dl2 = ddf->dlist; dl2 ; dl2=dl2->next)
+				if (dl2->esize >= dl->esize)
+					cnt++;
+			if (cnt >= raiddisks)
+				size = dl->esize;
+		}
+		if (chunk) {
+			size = size / chunk;
+			size *= chunk;
+		}
+		*freesize = size;
+		if (size < 32) {
+			fprintf(stderr, Name ": not enough spare devices to create array.\n");
+			return 0;
+		}
+	}
+	/* We have a 'size' of which there are enough spaces.
+	 * We simply do a first-fit */
+	cnt = 0;
+	for (dl = ddf->dlist ; dl && cnt < raiddisks ; dl=dl->next) {
+		if (dl->esize < size)
+			continue;
+
+		dl->raiddisk = cnt;
+		cnt++;
+	}
+	return 1;
+}
+
+
+
+static int
+validate_geometry_ddf_container(struct supertype *st,
+ int level, int layout, int raiddisks,
+ int chunk, unsigned long long size,
+ char *dev, unsigned long long *freesize,
+ int verbose); /* forward declaration: used by validate_geometry_ddf for LEVEL_CONTAINER requests */
+
+static int validate_geometry_ddf_bvd(struct supertype *st,
+ int level, int layout, int raiddisks,
+ int chunk, unsigned long long size,
+ char *dev, unsigned long long *freesize,
+ int verbose); /* forward declaration: used by validate_geometry_ddf once a container is loaded in st->sb */
+
+static int validate_geometry_ddf(struct supertype *st,
+				 int level, int layout, int raiddisks,
+				 int chunk, unsigned long long size,
+				 char *dev, unsigned long long *freesize,
+				 int verbose)
+{
+	/* Check whether an array with the given geometry can be created.
+	 *
+	 * ddf potentially supports lots of things, but it depends on
+	 * what devices are offered (and maybe kernel version?)
+	 * If given unused devices, we will make a container.
+	 * If given devices in a container, we will make a BVD.
+	 * If given BVDs, we make an SVD, changing all the GUIDs in the process.
+	 *
+	 * Returns 1 if the request is acceptable, 0 if it is not.
+	 * Messages are only printed when 'verbose' is set.
+	 */
+	int fd;
+	struct mdinfo *sra;
+	int cfd;
+	struct ddf_super *ddf;
+
+	if (level == LEVEL_CONTAINER) {
+		/* Must be a fresh device to add to a container */
+		return validate_geometry_ddf_container(st, level, layout,
+						       raiddisks, chunk,
+						       size, dev, freesize,
+						       verbose);
+	}
+
+	if (!dev) {
+		/* Initial sanity check.  Exclude illegal levels. */
+		int i;
+		for (i=0; ddf_level_num[i].num1 != MAXINT; i++)
+			if (ddf_level_num[i].num2 == level)
+				break;
+		if (ddf_level_num[i].num1 == MAXINT)
+			return 0;
+		/* Should check layout? etc */
+
+		if (st->sb && freesize) {
+			/* --create was given a container to create in.
+			 * So we need to check that there are enough
+			 * free spaces and return the amount of space.
+			 * We may as well remember which drives were
+			 * chosen so that add_to_super/getinfo_super
+			 * can return them.
+			 */
+			return reserve_space(st, raiddisks, size, chunk, freesize);
+		}
+		return 1;
+	}
+
+	if (st->sb) {
+		/* A container has already been opened, so we are
+		 * creating in there.  Maybe a BVD, maybe an SVD.
+		 * Should make a distinction one day.
+		 */
+		return validate_geometry_ddf_bvd(st, level, layout, raiddisks,
+						 chunk, size, dev, freesize,
+						 verbose);
+	}
+	/* This is the first device for the array.
+	 * If it is a container, we read it in and do automagic allocations,
+	 * no other devices should be given.
+	 * Otherwise it must be a member device of a container, and we
+	 * do manual allocation.
+	 * Later we should check for a BVD and make an SVD.
+	 */
+	fd = open(dev, O_RDONLY|O_EXCL, 0);
+	if (fd >= 0) {
+		/* NOTE(review): the result of sysfs_read is never
+		 * released in this function - confirm who owns it. */
+		sra = sysfs_read(fd, 0, GET_VERSION);
+		close(fd);
+		if (sra && sra->array.major_version == -1 &&
+		    strcmp(sra->text_version, "ddf") == 0) {
+
+			/* load super */
+			/* find space for 'n' devices. */
+			/* remember the devices */
+			/* Somehow return the fact that we have enough */
+		}
+
+		/* creating directly on an unused device or on a
+		 * container device is not implemented yet */
+		if (verbose)
+			fprintf(stderr,
+				Name ": ddf: Cannot create this array "
+				"on device %s\n",
+				dev);
+		return 0;
+	}
+	if (errno != EBUSY || (fd = open(dev, O_RDONLY, 0)) < 0) {
+		if (verbose)
+			fprintf(stderr, Name ": ddf: Cannot open %s: %s\n",
+				dev, strerror(errno));
+		return 0;
+	}
+	/* Well, it is in use by someone, maybe a 'ddf' container. */
+	cfd = open_container(fd);
+	if (cfd < 0) {
+		close(fd);
+		if (verbose)
+			fprintf(stderr, Name ": ddf: Cannot use %s: %s\n",
+				dev, strerror(EBUSY));
+		return 0;
+	}
+	sra = sysfs_read(cfd, 0, GET_VERSION);
+	close(fd);
+	if (!(sra && sra->array.major_version == -1 &&
+	      strcmp(sra->text_version, "ddf") == 0)) {
+		/* device may belong to a different container */
+		close(cfd);
+		return 0;
+	}
+
+	/* This is a member of a ddf container.  Load the container
+	 * and try to create a bvd
+	 */
+	if (load_super_ddf_all(st, cfd, (void **)&ddf, NULL, 1) == 0) {
+		st->sb = ddf;
+		st->container_dev = fd2devnum(cfd);
+		close(cfd);
+		return validate_geometry_ddf_bvd(st, level, layout,
+						 raiddisks, chunk, size,
+						 dev, freesize,
+						 verbose);
+	}
+	close(cfd);
+
+	/* NOTE(review): the container could not be loaded, yet the
+	 * request is reported as acceptable - confirm this is intended. */
+	return 1;
+}
+
+static int
+validate_geometry_ddf_container(struct supertype *st,
+				int level, int layout, int raiddisks,
+				int chunk, unsigned long long size,
+				char *dev, unsigned long long *freesize,
+				int verbose)
+{
+	/* Check whether 'dev' can become a member of a new container.
+	 *
+	 * Only LEVEL_CONTAINER is accepted.  Without a device name the
+	 * answer is simply yes.  Otherwise the device must open
+	 * exclusively and be big enough to leave room after the DDF
+	 * reservation; that room is stored in *freesize.
+	 * Returns 1 if acceptable, 0 if not.
+	 */
+	unsigned long long ldsize;
+	int have_size;
+	int fd;
+
+	if (level != LEVEL_CONTAINER)
+		return 0;
+	if (!dev)
+		return 1;
+
+	fd = open(dev, O_RDONLY|O_EXCL, 0);
+	if (fd < 0) {
+		if (verbose)
+			fprintf(stderr, Name ": ddf: Cannot open %s: %s\n",
+				dev, strerror(errno));
+		return 0;
+	}
+
+	/* the fd is only needed for the size query */
+	have_size = get_dev_size(fd, dev, &ldsize);
+	close(fd);
+	if (!have_size)
+		return 0;
+
+	*freesize = avail_size_ddf(st, ldsize >> 9);
+	return *freesize != 0;
+}
+
+/*
+ * Check whether a BVD (a data array, not a container) can be created
+ * inside the DDF container already loaded into st->sb.
+ *
+ * Without 'dev' this is the general test: at least 'raiddisks' devices of
+ * the container must each have one free gap of at least 'size' (8 is used
+ * when 'size' is 0).
+ * With 'dev', that block device must be a member of the container; the
+ * size of its largest free gap is stored in *freesize.
+ *
+ * Free gaps are derived from the list returned by get_extents(): the list
+ * ends with an entry whose size is 0, and the gap in front of that final
+ * entry is counted as well.
+ *
+ * Returns 1 if acceptable, 0 otherwise.  'layout' and 'chunk' are not
+ * examined here.
+ */
+static int validate_geometry_ddf_bvd(struct supertype *st,
+				     int level, int layout, int raiddisks,
+				     int chunk, unsigned long long size,
+				     char *dev, unsigned long long *freesize,
+				     int verbose)
+{
+	struct stat stb;
+	struct ddf_super *ddf = st->sb;
+	struct dl *dl;
+	unsigned long long pos = 0;
+	unsigned long long maxsize;
+	struct extent *e;
+	int i;
+
+	/* ddf/bvd supports lots of things, but not containers */
+	if (level == LEVEL_CONTAINER)
+		return 0;
+	/* We must have the container info already read in. */
+	if (!ddf)
+		return 0;
+
+	if (!dev) {
+		/* General test:  make sure there is space for
+		 * 'raiddisks' device extents of size 'size'.
+		 */
+		unsigned long long minsize = size;
+		int dcnt = 0;
+
+		if (minsize == 0)
+			minsize = 8;
+		for (dl = ddf->dlist; dl ; dl = dl->next) {
+			int found = 0;
+
+			pos = 0;
+			i = 0;
+			e = get_extents(ddf, dl);
+			if (!e)
+				continue;
+			do {
+				unsigned long long esize;
+
+				/* gap between the end of the previous
+				 * extent and the start of this one */
+				esize = e[i].start - pos;
+				if (esize >= minsize)
+					found = 1;
+				pos = e[i].start + e[i].size;
+				i++;
+			} while (e[i-1].size);
+			if (found)
+				dcnt++;
+			free(e);
+		}
+		if (dcnt < raiddisks) {
+			if (verbose)
+				fprintf(stderr,
+					Name ": ddf: Not enough devices with "
+					"space for this array (%d < %d)\n",
+					dcnt, raiddisks);
+			return 0;
+		}
+		return 1;
+	}
+
+	/* This device must be a member of the set */
+	if (stat(dev, &stb) < 0)
+		return 0;
+	if ((S_IFMT & stb.st_mode) != S_IFBLK)
+		return 0;
+	for (dl = ddf->dlist ; dl ; dl = dl->next) {
+		if (dl->major == major(stb.st_rdev) &&
+		    dl->minor == minor(stb.st_rdev))
+			break;
+	}
+	if (!dl) {
+		if (verbose)
+			fprintf(stderr, Name ": ddf: %s is not in the "
+				"same DDF set\n",
+				dev);
+		return 0;
+	}
+
+	/* Find the largest free gap on this device */
+	e = get_extents(ddf, dl);
+	maxsize = 0;
+	i = 0;
+	if (e) {
+		do {
+			unsigned long long esize;
+
+			esize = e[i].start - pos;
+			if (esize >= maxsize)
+				maxsize = esize;
+			pos = e[i].start + e[i].size;
+			i++;
+		} while (e[i-1].size);
+		/* the extent list is ours to release, exactly as in the
+		 * general test above; it used to be leaked here */
+		free(e);
+	}
+	*freesize = maxsize;
+	// FIXME here I am
+
+	return 1;
+}
+
+/*
+ * Load the DDF metadata of the whole container open on 'fd' into a newly
+ * allocated ddf_super and hand it back through *sbp.
+ *
+ * The array must report text_version "ddf" with major_version -1 and
+ * minor_version -2 in sysfs.  The member whose active header carries the
+ * highest sequence number supplies the headers and the global sections;
+ * the device-local sections are then loaded from every member.
+ *
+ * keep_fd: when non-zero the members are opened O_RDWR, the value is
+ *          passed on to load_ddf_local() and the descriptors are not
+ *          closed here.
+ * devname: not used; every helper below is passed NULL instead.
+ *
+ * Returns 0 on success, 2 when a member cannot be opened while walking
+ * the member list, and 1 for every other failure.
+ *
+ * NOTE(review): no return path in this function releases 'sra', and the
+ * failure returns after posix_memalign() do not release 'super' either -
+ * confirm whether that is intended.
+ */
+static int load_super_ddf_all(struct supertype *st, int fd,
+ void **sbp, char *devname, int keep_fd)
+{
+ struct mdinfo *sra;
+ struct ddf_super *super;
+ struct mdinfo *sd, *best = NULL;
+ int bestseq = 0;
+ int seq;
+ char nm[20];
+ int dfd;
+ int devnum = fd2devnum(fd);
+ enum sysfs_read_flags flags;
+
+ flags = GET_LEVEL|GET_VERSION|GET_DEVS|GET_STATE;
+ /* SKIP_GONE_DEVS is only requested while mdmon_running() reports true */
+ if (mdmon_running(devnum))
+ flags |= SKIP_GONE_DEVS;
+
+ sra = sysfs_read(fd, 0, flags);
+ if (!sra)
+ return 1;
+ /* refuse anything that is not an externally managed "ddf" array */
+ if (sra->array.major_version != -1 ||
+ sra->array.minor_version != -2 ||
+ strcmp(sra->text_version, "ddf") != 0)
+ return 1;
+
+ /* 512-byte aligned, zero-filled working copy of the metadata */
+ if (posix_memalign((void**)&super, 512, sizeof(*super)) != 0)
+ return 1;
+ memset(super, 0, sizeof(*super));
+
+ /* first, try each device, and choose the best ddf */
+ for (sd = sra->devs ; sd ; sd = sd->next) {
+ int rv;
+ /* members are opened by their "major:minor" name */
+ sprintf(nm, "%d:%d", sd->disk.major, sd->disk.minor);
+ dfd = dev_open(nm, O_RDONLY);
+ if (dfd < 0)
+ return 2;
+ rv = load_ddf_headers(dfd, super, NULL);
+ close(dfd);
+ if (rv == 0) {
+ seq = __be32_to_cpu(super->active->seq);
+ /* a header with openflag set ranks one sequence number lower */
+ if (super->active->openflag)
+ seq--;
+ if (!best || seq > bestseq) {
+ bestseq = seq;
+ best = sd;
+ }
+ }
+ }
+ if (!best)
+ return 1;
+ /* OK, load this ddf */
+ sprintf(nm, "%d:%d", best->disk.major, best->disk.minor);
+ dfd = dev_open(nm, O_RDONLY);
+ if (dfd < 0)
+ return 1;
+ /* NOTE(review): the results of these two loads are not checked */
+ load_ddf_headers(dfd, super, NULL);
+ load_ddf_global(dfd, super, NULL);
+ close(dfd);
+ /* Now we need the device-local bits */
+ for (sd = sra->devs ; sd ; sd = sd->next) {
+ int rv;
+
+ sprintf(nm, "%d:%d", sd->disk.major, sd->disk.minor);
+ dfd = dev_open(nm, keep_fd? O_RDWR : O_RDONLY);
+ if (dfd < 0)
+ return 2;
+ rv = load_ddf_headers(dfd, super, NULL);
+ if (rv == 0)
+ rv = load_ddf_local(dfd, super, NULL, keep_fd);
+ /* with keep_fd set the descriptor is left open on purpose */
+ if (!keep_fd) close(dfd);
+ if (rv)
+ return 1;
+ }
+ /* A non-empty st->subarray selects one configuration by its number */
+ if (st->subarray[0]) {
+ struct vcl *v;
+
+ for (v = super->conflist; v; v = v->next)
+ if (v->vcnum == atoi(st->subarray))
+ super->currentconf = v;
+ if (!super->currentconf)
+ return 1;
+ }
+ *sbp = super;
+ /* fill in the supertype only if no handler has been chosen yet */
+ if (st->ss == NULL) {
+ st->ss = &super_ddf;
+ st->minor_version = 0;
+ st->max_devs = 512;
+ st->container_dev = fd2devnum(fd);
+ }
+ st->loaded_container = 1;
+ return 0;
+}
+#endif /* MDASSEMBLE */
+
+/*
+ * Build a linked list of mdinfo records, one per entry of ddf->conflist,
+ * each carrying a list of its member devices in ->devs.
+ *
+ * Returns the head of the list (NULL when conflist is empty).  Records
+ * are prepended, so the list is in reverse conflist order.
+ * ddf->currentconf is changed temporarily and is NULL on return.
+ *
+ * NOTE(review): both malloc() results are used without a NULL check, and
+ * d->devname is copied into dev->name with strcpy() without a length
+ * check - confirm the sizes involved.
+ */
+static struct mdinfo *container_content_ddf(struct supertype *st)
+{
+ /* Given a container loaded by load_super_ddf_all,
+ * extract information about all the arrays into
+ * an mdinfo tree.
+ *
+ * For each vcl in conflist: create an mdinfo, fill it in,
+ * then look for matching devices (phys_refnum) in dlist
+ * and create appropriate device mdinfo.
+ */
+ struct ddf_super *ddf = st->sb;
+ struct mdinfo *rest = NULL;
+ struct vcl *vc;
+
+ for (vc = ddf->conflist ; vc ; vc=vc->next)
+ {
+ int i;
+ int j;
+ struct mdinfo *this;
+ this = malloc(sizeof(*this));
+ memset(this, 0, sizeof(*this));
+ this->next = rest;
+ rest = this;
+
+ this->array.level = map_num1(ddf_level_num, vc->conf.prl);
+ this->array.raid_disks =
+ __be16_to_cpu(vc->conf.prim_elmnt_count);
+ this->array.layout = rlq_to_layout(vc->conf.rlq, vc->conf.prl,
+ this->array.raid_disks);
+ this->array.md_minor = -1;
+ /* same -1 / -2 version pair that load_super_ddf_all() checks for */
+ this->array.major_version = -1;
+ this->array.minor_version = -2;
+ /* creation time comes from a big-endian 32-bit value at byte 16 of the guid */
+ this->array.ctime = DECADE +
+ __be32_to_cpu(*(__u32*)(vc->conf.guid+16));
+ this->array.utime = DECADE +
+ __be32_to_cpu(vc->conf.timestamp);
+ this->array.chunk_size = 512 << vc->conf.chunk_shift;
+
+ i = vc->vcnum;
+ /* inconsistent or not fully initialised: resync from the start */
+ if ((ddf->virt->entries[i].state & DDF_state_inconsistent) ||
+ (ddf->virt->entries[i].init_state & DDF_initstate_mask) !=
+ DDF_init_full) {
+ this->array.state = 0;
+ this->resync_start = 0;
+ } else {
+ this->array.state = 1;
+ this->resync_start = ~0ULL;
+ }
+ /* copy the 16-byte name and turn every space into a NUL */
+ memcpy(this->name, ddf->virt->entries[i].name, 16);
+ this->name[16]=0;
+ for(j=0; j<16; j++)
+ if (this->name[j] == ' ')
+ this->name[j] = 0;
+
+ memset(this->uuid, 0, sizeof(this->uuid));
+ this->component_size = __be64_to_cpu(vc->conf.blocks);
+ this->array.size = this->component_size / 2;
+ this->container_member = i;
+
+ /* currentconf is set only around the uuid_from_super_ddf() call */
+ ddf->currentconf = vc;
+ uuid_from_super_ddf(st, this->uuid);
+ ddf->currentconf = NULL;
+
+ /* text_version has the form "/<container device name>/<member number>" */
+ sprintf(this->text_version, "/%s/%d",
+ devnum2devname(st->container_dev),
+ this->container_member);
+
+ for (i=0 ; i < ddf->mppe ; i++) {
+ struct mdinfo *dev;
+ struct dl *d;
+
+ /* slots holding 0xFFFFFFFF are skipped */
+ if (vc->conf.phys_refnum[i] == 0xFFFFFFFF)
+ continue;
+
+ /* counted even when the device is not found in dlist below */
+ this->array.working_disks++;
+
+ for (d = ddf->dlist; d ; d=d->next)
+ if (d->disk.refnum == vc->conf.phys_refnum[i])
+ break;
+ if (d == NULL)
+ /* Haven't found that one yet, maybe there are others */
+ continue;
+
+ dev = malloc(sizeof(*dev));
+ memset(dev, 0, sizeof(*dev));
+ dev->next = this->devs;
+ this->devs = dev;
+
+ dev->disk.number = __be32_to_cpu(d->disk.refnum);
+ dev->disk.major = d->major;
+ dev->disk.minor = d->minor;
+ dev->disk.raid_disk = i;
+ dev->disk.state = (1<<MD_DISK_SYNC)|(1<<MD_DISK_ACTIVE);
+
+ dev->events = __be32_to_cpu(ddf->primary.seq);
+ dev->data_offset = __be64_to_cpu(vc->lba_offset[i]);
+ dev->component_size = __be64_to_cpu(vc->conf.blocks);
+ if (d->devname)
+ strcpy(dev->name, d->devname);
+ }
+ }
+ return rest;
+}
+
+/*
+ * Overwrite the last 512 bytes of the device open on 'fd' with zeros.
+ *
+ * 'st' is not used.  Returns 0 on success and 1 on any failure: the
+ * device size cannot be determined or is below 512 bytes, the aligned
+ * buffer cannot be allocated, the seek fails, or fewer than 512 bytes
+ * are written.
+ */
+static int store_zero_ddf(struct supertype *st, int fd)
+{
+	unsigned long long dsize;
+	void *buf;
+	int rc;
+
+	if (!get_dev_size(fd, NULL, &dsize))
+		return 1;
+	/* dsize is unsigned: dsize-512 would wrap on a tiny device */
+	if (dsize < 512)
+		return 1;
+
+	if (posix_memalign(&buf, 512, 512) != 0)
+		return 1;
+	memset(buf, 0, 512);
+
+	/* do not write at whatever offset we happen to be at if the
+	 * seek fails */
+	if (lseek64(fd, dsize-512, SEEK_SET) < 0) {
+		free(buf);
+		return 1;
+	}
+	rc = write(fd, buf, 512);
+	free(buf);
+	/* a short write leaves part of the block in place, so it is a
+	 * failure too */
+	if (rc != 512)
+		return 1;
+	return 0;
+}
+
+/*
+ * Compare the DDF metadata held by 'tst' against that held by 'st'.
+ *
+ * Returns:
+ *   0  same, or 'st' held nothing and took over the metadata of 'tst'
+ *      (tst->sb is then cleared)
+ *   2  the anchor GUIDs differ
+ * The values 1 (wrong number) and 3 (wrong other info) belong to the
+ * same return convention but are not produced here.
+ */
+static int compare_super_ddf(struct supertype *st, struct supertype *tst)
+{
+	struct ddf_super *mine = st->sb;
+	struct ddf_super *theirs = tst->sb;
+
+	if (mine == NULL) {
+		/* nothing loaded yet: adopt the other superblock */
+		st->sb = theirs;
+		tst->sb = NULL;
+		return 0;
+	}
+
+	/* FIXME should I look at anything else? */
+	return memcmp(mine->anchor.guid, theirs->anchor.guid,
+		      DDF_GUID_LEN) != 0 ? 2 : 0;
+}
+
+#ifndef MDASSEMBLE
+/*
+ * A new array 'a' has been started which claims to be instance 'inst'
+ * within container 'c'.
+ * We need to confirm that the array matches the metadata in 'c' so
+ * that we don't corrupt any metadata.
+ */
+static int ddf_open_new(struct supertype *c, struct active_array *a, char *inst)
+{
+	int member;
+
+	/* No comparison against the metadata in 'c' is made here; the
+	 * instance string is simply parsed and recorded as the member
+	 * number of the array.  Always returns 0.
+	 */
+	dprintf("ddf: open_new %s\n", inst);
+	member = atoi(inst);
+	a->info.container_member = member;
+	return 0;
+}
+
+/*
+ * The array 'a' is to be marked clean in the metadata.
+ * If '->resync_start' is not ~(unsigned long long)0, then the array is only
+ * clean up to the point (in sectors). If that cannot be recorded in the
+ * metadata, then leave it as dirty.
+ *
+ * For DDF, we need to clear the DDF_state_inconsistent bit in the
+ * !global! virtual_disk.virtual_entry structure.
+ */
+static int ddf_set_array_state(struct active_array *a, int consistent)
+{
+	/* Records the consistent/inconsistent flag and the initialisation
+	 * state of array 'a' in its virtual-disk entry and flags the
+	 * metadata for writing when either value changed.
+	 * A 'consistent' value of 2 is resolved to 1 or 0 depending on
+	 * is_resync_complete().  Returns the resolved value.
+	 */
+	struct ddf_super *ddf = a->container->sb;
+	int inst = a->info.container_member;
+	struct virtual_entry *ve = &ddf->virt->entries[inst];
+	int before = ve->state;
+
+	if (consistent == 2)
+		/* Should check if a recovery should be started FIXME */
+		consistent = is_resync_complete(a) ? 1 : 0;
+
+	if (consistent)
+		ve->state &= ~DDF_state_inconsistent;
+	else
+		ve->state |= DDF_state_inconsistent;
+	if (ve->state != before)
+		ddf->updates_pending = 1;
+
+	/* full when the resync is complete, "not" when it has not
+	 * started, "quick" for anything in between */
+	before = ve->init_state;
+	ve->init_state &= ~DDF_initstate_mask;
+	if (is_resync_complete(a))
+		ve->init_state |= DDF_init_full;
+	else if (a->resync_start == 0)
+		ve->init_state |= DDF_init_not;
+	else
+		ve->init_state |= DDF_init_quick;
+	if (ve->init_state != before)
+		ddf->updates_pending = 1;
+
+	dprintf("ddf mark %d %s %llu\n", inst, consistent?"clean":"dirty",
+		a->resync_start);
+	return consistent;
+}
+
+/*
+ * The state of each disk is stored in the global phys_disk structure
+ * in phys_disk.entries[n].state.
+ * This makes various combinations awkward.
+ * - When a device fails in any array, it must be failed in all arrays
+ *   that include a part of this device.
+ * - When a component is rebuilding, we cannot include it officially in the
+ *   array unless this is the only array that uses the device.
+ *
+ * So: when transitioning:
+ *   Online -> failed,  just set failed flag.  monitor will propagate
+ *   spare -> online,   the device might need to be added to the array.
+ *   spare -> failed,   just set failed.  Don't worry if in array or not.
+ *
+ * a:     the array whose slot changed
+ * n:     slot number within the array (index into phys_refnum)
+ * state: DS_* flags describing the new state of that slot
+ *
+ * After updating the physical disk entry, the state of the virtual disk
+ * (optimal / degraded / partially optimal / failed) is recomputed from
+ * the number of working members.  ddf->updates_pending is set whenever
+ * something changed.  Nothing is done if the array's configuration
+ * record cannot be found.
+ */
+static void ddf_set_disk(struct active_array *a, int n, int state)
+{
+	struct ddf_super *ddf = a->container->sb;
+	int inst = a->info.container_member;
+	struct vd_config *vc = find_vdcr(ddf, inst);
+	int pd;
+	int i, st, working;
+
+	if (vc == NULL) {
+		dprintf("ddf: cannot find instance %d!!\n", inst);
+		return;
+	}
+	/* vc must be known to be valid before phys_refnum is read;
+	 * this lookup used to happen ahead of the NULL check above */
+	pd = find_phys(ddf, vc->phys_refnum[n]);
+	if (pd < 0) {
+		/* disk doesn't currently exist. If it is now in_sync,
+		 * insert it. */
+		if ((state & DS_INSYNC) && ! (state & DS_FAULTY)) {
+			/* Find dev 'n' in a->info->devs, determine the
+			 * ddf refnum, and set vc->phys_refnum and update
+			 * phys->entries[]
+			 */
+			/* FIXME */
+		}
+	} else {
+		int old = ddf->phys->entries[pd].state;
+		if (state & DS_FAULTY)
+			ddf->phys->entries[pd].state  |= __cpu_to_be16(DDF_Failed);
+		if (state & DS_INSYNC) {
+			ddf->phys->entries[pd].state  |= __cpu_to_be16(DDF_Online);
+			ddf->phys->entries[pd].state  &= __cpu_to_be16(~DDF_Rebuilding);
+		}
+		if (old != ddf->phys->entries[pd].state)
+			ddf->updates_pending = 1;
+	}
+
+	dprintf("ddf: set_disk %d to %x\n", n, state);
+
+	/* Now we need to check the state of the array and update
+	 * virtual_disk.entries[n].state.
+	 * It needs to be one of "optimal", "degraded", "failed".
+	 * I don't understand 'deleted' or 'missing'.
+	 */
+	working = 0;
+	for (i=0; i < a->info.array.raid_disks; i++) {
+		pd = find_phys(ddf, vc->phys_refnum[i]);
+		if (pd < 0)
+			continue;
+		st = __be16_to_cpu(ddf->phys->entries[pd].state);
+		/* working means Online with neither Failed nor Rebuilding */
+		if ((st & (DDF_Online|DDF_Failed|DDF_Rebuilding))
+		    == DDF_Online)
+			working++;
+	}
+	state = DDF_state_degraded;
+	if (working == a->info.array.raid_disks)
+		state = DDF_state_optimal;
+	else switch(vc->prl) {
+	case DDF_RAID0:
+	case DDF_CONCAT:
+	case DDF_JBOD:
+		/* no redundancy: any missing member is fatal */
+		state = DDF_state_failed;
+		break;
+	case DDF_RAID1:
+		if (working == 0)
+			state = DDF_state_failed;
+		break;
+	case DDF_RAID4:
+	case DDF_RAID5:
+		if (working < a->info.array.raid_disks-1)
+			state = DDF_state_failed;
+		break;
+	case DDF_RAID6:
+		if (working < a->info.array.raid_disks-2)
+			state = DDF_state_failed;
+		else if (working == a->info.array.raid_disks-1)
+			state = DDF_state_part_optimal;
+		break;
+	}
+
+	if (ddf->virt->entries[inst].state !=
+	    ((ddf->virt->entries[inst].state & ~DDF_state_mask)
+	     | state)) {
+
+		ddf->virt->entries[inst].state =
+			(ddf->virt->entries[inst].state & ~DDF_state_mask)
+			| state;
+		ddf->updates_pending = 1;
+	}
+
+}
+
+static void ddf_sync_metadata(struct supertype *st)
+{
+	/*
+	 * Write all data to all devices, but only when an update has been
+	 * flagged as pending.
+	 * Later, we might be able to track whether only local changes
+	 * have been made, or whether any global data has been changed,
+	 * but ddf is sufficiently weird that it probably always
+	 * changes global data ....
+	 */
+	struct ddf_super *ddf = st->sb;
+
+	if (ddf->updates_pending) {
+		/* the flag is cleared before the write is issued */
+		ddf->updates_pending = 0;
+		__write_init_super_ddf(st, 0);
+		dprintf("ddf: sync_metadata\n");
+	}
+}
+
+/*
+ * Apply one metadata update record to the in-memory DDF metadata in
+ * st->sb.  The record type is taken from the first 32-bit word of
+ * update->buf.  A record that fails a sanity check is dropped silently
+ * (plain 'return', leaving updates_pending untouched); an applied record
+ * sets ddf->updates_pending.
+ */
+static void ddf_process_update(struct supertype *st,
+ struct metadata_update *update)
+{
+ /* Apply this update to the metadata.
+ * The first 4 bytes are a DDF_*_MAGIC which guides
+ * our actions.
+ * Possible updates are:
+ * DDF_PHYS_RECORDS_MAGIC
+ * Add a new physical device. Changes to this record
+ * only happen implicitly.
+ * used_pdes is the device number.
+ * DDF_VIRT_RECORDS_MAGIC
+ * Add a new VD. Possibly also change the 'access' bits.
+ * populated_vdes is the entry number.
+ * DDF_VD_CONF_MAGIC
+ * New or updated VD. the VIRT_RECORD must already
+ * exist. For an update, phys_refnum and lba_offset
+ * (at least) are updated, and the VD_CONF must
+ * be written to precisely those devices listed with
+ * a phys_refnum.
+ * DDF_SPARE_ASSIGN_MAGIC
+ * replacement Spare Assignment Record... but for which device?
+ *
+ * So, e.g.:
+ * - to create a new array, we send a VIRT_RECORD and
+ * a VD_CONF. Then assemble and start the array.
+ * - to activate a spare we send a VD_CONF to add the phys_refnum
+ * and offset. This will also mark the spare as active with
+ * a spare-assignment record.
+ */
+ struct ddf_super *ddf = st->sb;
+ __u32 *magic = (__u32*)update->buf;
+ struct phys_disk *pd;
+ struct virtual_disk *vd;
+ struct vd_config *vc;
+ struct vcl *vcl;
+ struct dl *dl;
+ int mppe;
+ int ent;
+
+ dprintf("Process update %x\n", *magic);
+
+ switch (*magic) {
+ case DDF_PHYS_RECORDS_MAGIC:
+
+ /* the record must hold the header plus exactly one entry */
+ if (update->len != (sizeof(struct phys_disk) +
+ sizeof(struct phys_disk_entry)))
+ return;
+ pd = (struct phys_disk*)update->buf;
+
+ /* in an update record used_pdes names the slot to fill */
+ ent = __be16_to_cpu(pd->used_pdes);
+ if (ent >= __be16_to_cpu(ddf->phys->max_pdes))
+ return;
+ /* only a slot whose guid passes all_ff() may be filled */
+ if (!all_ff(ddf->phys->entries[ent].guid))
+ return;
+ ddf->phys->entries[ent] = pd->entries[0];
+ ddf->phys->used_pdes = __cpu_to_be16(1 +
+ __be16_to_cpu(ddf->phys->used_pdes));
+ ddf->updates_pending = 1;
+ if (ddf->add_list) {
+ struct active_array *a;
+ /* move the first device of add_list onto dlist */
+ struct dl *al = ddf->add_list;
+ ddf->add_list = al->next;
+
+ al->next = ddf->dlist;
+ ddf->dlist = al;
+
+ /* As a device has been added, we should check
+ * for any degraded devices that might make
+ * use of this spare */
+ for (a = st->arrays ; a; a=a->next)
+ a->check_degraded = 1;
+ }
+ break;
+
+ case DDF_VIRT_RECORDS_MAGIC:
+
+ /* same shape as above: header plus exactly one entry */
+ if (update->len != (sizeof(struct virtual_disk) +
+ sizeof(struct virtual_entry)))
+ return;
+ vd = (struct virtual_disk*)update->buf;
+
+ ent = __be16_to_cpu(vd->populated_vdes);
+ if (ent >= __be16_to_cpu(ddf->virt->max_vdes))
+ return;
+ if (!all_ff(ddf->virt->entries[ent].guid))
+ return;
+ ddf->virt->entries[ent] = vd->entries[0];
+ ddf->virt->populated_vdes = __cpu_to_be16(1 +
+ __be16_to_cpu(ddf->virt->populated_vdes));
+ ddf->updates_pending = 1;
+ break;
+
+ case DDF_VD_CONF_MAGIC:
+ dprintf("len %d %d\n", update->len, ddf->conf_rec_len);
+
+ mppe = __be16_to_cpu(ddf->anchor.max_primary_element_entries);
+ /* update->len is in bytes, conf_rec_len in 512-byte units */
+ if (update->len != ddf->conf_rec_len * 512)
+ return;
+ vc = (struct vd_config*)update->buf;
+ /* look for an existing configuration with the same guid */
+ for (vcl = ddf->conflist; vcl ; vcl = vcl->next)
+ if (memcmp(vcl->conf.guid, vc->guid, DDF_GUID_LEN) == 0)
+ break;
+ dprintf("vcl = %p\n", vcl);
+ if (vcl) {
+ /* An update, just copy the phys_refnum and lba_offset
+ * fields
+ */
+ /* one copy covers mppe 32-bit refnums plus mppe 64-bit offsets */
+ memcpy(vcl->conf.phys_refnum, vc->phys_refnum,
+ mppe * (sizeof(__u32) + sizeof(__u64)));
+ } else {
+ /* A new VD_CONF */
+ /* needs the buffer that ddf_prepare_update() allocates */
+ if (!update->space)
+ return;
+ vcl = update->space;
+ update->space = NULL;
+ vcl->next = ddf->conflist;
+ memcpy(&vcl->conf, vc, update->len);
+ /* the lba_offset array starts right after mppe refnums */
+ vcl->lba_offset = (__u64*)
+ &vcl->conf.phys_refnum[mppe];
+ ddf->conflist = vcl;
+ }
+ /* Now make sure vlist is correct for each dl. */
+ for (dl = ddf->dlist; dl; dl = dl->next) {
+ int dn;
+ int vn = 0;
+ /* NOTE(review): vn is not checked against ddf->max_part
+ * while vlist is being filled - confirm it cannot overrun */
+ for (vcl = ddf->conflist; vcl ; vcl = vcl->next)
+ for (dn=0; dn < ddf->mppe ; dn++)
+ if (vcl->conf.phys_refnum[dn] ==
+ dl->disk.refnum) {
+ dprintf("dev %d has %p at %d\n",
+ dl->pdnum, vcl, vn);
+ dl->vlist[vn++] = vcl;
+ break;
+ }
+ while (vn < ddf->max_part)
+ dl->vlist[vn++] = NULL;
+ /* used by at least one configuration: Active_in_VD */
+ if (dl->vlist[0]) {
+ ddf->phys->entries[dl->pdnum].type &=
+ ~__cpu_to_be16(DDF_Global_Spare);
+ ddf->phys->entries[dl->pdnum].type |=
+ __cpu_to_be16(DDF_Active_in_VD);
+ }
+ /* has a spare record: Spare rather than Global_Spare */
+ if (dl->spare) {
+ ddf->phys->entries[dl->pdnum].type &=
+ ~__cpu_to_be16(DDF_Global_Spare);
+ ddf->phys->entries[dl->pdnum].type |=
+ __cpu_to_be16(DDF_Spare);
+ }
+ /* neither: the device becomes a global spare */
+ if (!dl->vlist[0] && !dl->spare) {
+ ddf->phys->entries[dl->pdnum].type |=
+ __cpu_to_be16(DDF_Global_Spare);
+ ddf->phys->entries[dl->pdnum].type &=
+ ~__cpu_to_be16(DDF_Spare |
+ DDF_Active_in_VD);
+ }
+ }
+ ddf->updates_pending = 1;
+ break;
+ case DDF_SPARE_ASSIGN_MAGIC:
+ /* not handled yet: falls through to the no-op default */
+ default: break;
+ }
+}
+
+static void ddf_prepare_update(struct supertype *st,
+			       struct metadata_update *update)
+{
+	/* This update arrived at managemon.
+	 * We are about to pass it to monitor.
+	 * If a malloc is needed, do it here.
+	 *
+	 * Only a DDF_VD_CONF_MAGIC record needs memory: room for a
+	 * 'struct vcl' whose trailing configuration record is
+	 * conf_rec_len blocks of 512 bytes.  update->space is NULL when
+	 * the allocation fails.
+	 */
+	struct ddf_super *ddf = st->sb;
+	__u32 *magic = (__u32*)update->buf;
+
+	if (*magic != DDF_VD_CONF_MAGIC)
+		return;
+
+	if (posix_memalign(&update->space, 512,
+			   offsetof(struct vcl, conf)
+			   + ddf->conf_rec_len * 512) == 0)
+		return;
+	update->space = NULL;
+}
+
+/*
+ * Check if the array 'a' is degraded but not failed.
+ * If it is, find as many spares as are available and needed and
+ * arrange for their inclusion.
+ * We only choose devices which are not already in the array,
+ * and prefer those with a spare-assignment to this array.
+ * otherwise we choose global spares - assuming always that
+ * there is enough room.
+ * For each spare that we assign, we return an 'mdinfo' which
+ * describes the position for the device in the array.
+ * We also add to 'updates' a DDF_VD_CONF_MAGIC update with
+ * the new phys_refnum and lba_offset values.
+ *
+ * Only worry about BVDs at the moment.
+ *
+ * Returns the list of chosen spares (linked through ->next), or NULL
+ * when the array is not degraded, has failed, still has a faulty device
+ * waiting for removal, no suitable spare exists, the configuration
+ * record of the array cannot be found, or memory runs out.  In the NULL
+ * case '*updates' is left untouched.
+ */
+static struct mdinfo *ddf_activate_spare(struct active_array *a,
+					 struct metadata_update **updates)
+{
+	int working = 0;
+	struct mdinfo *d;
+	struct ddf_super *ddf = a->container->sb;
+	int global_ok = 0;
+	struct mdinfo *rv = NULL;
+	struct mdinfo *di;
+	struct metadata_update *mu;
+	struct dl *dl;
+	int i;
+	struct vd_config *vc;
+	__u64 *lba;
+
+	for (d = a->info.devs ; d ; d = d->next) {
+		if ((d->curr_state & DS_FAULTY) &&
+			d->state_fd >= 0)
+			/* wait for Removal to happen */
+			return NULL;
+		if (d->state_fd >= 0)
+			working ++;
+	}
+
+	dprintf("ddf_activate: working=%d (%d) level=%d\n", working, a->info.array.raid_disks,
+		a->info.array.level);
+	if (working == a->info.array.raid_disks)
+		return NULL; /* array not degraded */
+	switch (a->info.array.level) {
+	case 1:
+		if (working == 0)
+			return NULL; /* failed */
+		break;
+	case 4:
+	case 5:
+		if (working < a->info.array.raid_disks - 1)
+			return NULL; /* failed */
+		break;
+	case 6:
+		if (working < a->info.array.raid_disks - 2)
+			return NULL; /* failed */
+		break;
+	default: /* concat or stripe */
+		return NULL; /* failed */
+	}
+
+	/* For each slot, if it is not working, find a spare */
+	dl = ddf->dlist;
+	for (i = 0; i < a->info.array.raid_disks; i++) {
+		for (d = a->info.devs ; d ; d = d->next)
+			if (d->disk.raid_disk == i)
+				break;
+		dprintf("found %d: %p %x\n", i, d, d?d->curr_state:0);
+		if (d && (d->state_fd >= 0))
+			continue;
+
+		/* OK, this device needs recovery.  Find a spare */
+	again:
+		for ( ; dl ; dl = dl->next) {
+			unsigned long long esize;
+			unsigned long long pos;
+			struct mdinfo *d2;
+			int is_global = 0;
+			int is_dedicated = 0;
+			struct extent *ex;
+			int j;
+			/* If in this array, skip */
+			for (d2 = a->info.devs ; d2 ; d2 = d2->next)
+				if (d2->disk.major == dl->major &&
+				    d2->disk.minor == dl->minor) {
+					dprintf("%x:%x already in array\n", dl->major, dl->minor);
+					break;
+				}
+			if (d2)
+				continue;
+			/* If already chosen for an earlier slot during this
+			 * call, skip: one device cannot fill two slots */
+			for (d2 = rv ; d2 ; d2 = d2->next)
+				if (d2->disk.major == dl->major &&
+				    d2->disk.minor == dl->minor)
+					break;
+			if (d2)
+				continue;
+			if (ddf->phys->entries[dl->pdnum].type &
+			    __cpu_to_be16(DDF_Spare)) {
+				/* Check spare assign record */
+				if (dl->spare) {
+					if (dl->spare->type & DDF_spare_dedicated) {
+						/* check spare_ents for guid */
+						for (j = 0 ;
+						     j < __be16_to_cpu(dl->spare->populated);
+						     j++) {
+							if (memcmp(dl->spare->spare_ents[j].guid,
+								   ddf->virt->entries[a->info.container_member].guid,
+								   DDF_GUID_LEN) == 0)
+								is_dedicated = 1;
+						}
+					} else
+						is_global = 1;
+				}
+			} else if (ddf->phys->entries[dl->pdnum].type &
+				   __cpu_to_be16(DDF_Global_Spare)) {
+				is_global = 1;
+			}
+			if ( ! (is_dedicated ||
+				(is_global && global_ok))) {
+				dprintf("%x:%x not suitable: %d %d\n", dl->major, dl->minor,
+				       is_dedicated, is_global);
+				continue;
+			}
+
+			/* We are allowed to use this device - is there space?
+			 * We need a->info.component_size sectors */
+			ex = get_extents(ddf, dl);
+			if (!ex) {
+				dprintf("cannot get extents\n");
+				continue;
+			}
+			j = 0; pos = 0;
+			esize = 0;
+
+			/* Walk the extent list with its own index 'j'.
+			 * The walk used to advance 'i', the slot number of
+			 * the outer loop, which both read the wrong extents
+			 * and corrupted the slot being filled.
+			 */
+			do {
+				esize = ex[j].start - pos;
+				if (esize >= a->info.component_size)
+					break;
+				pos = ex[j].start + ex[j].size;
+				j++;
+			} while (ex[j-1].size);
+
+			free(ex);
+			if (esize < a->info.component_size) {
+				dprintf("%x:%x has no room: %llu %llu\n", dl->major, dl->minor,
+					esize, a->info.component_size);
+				/* No room */
+				continue;
+			}
+
+			/* Cool, we have a device with some space at pos */
+			di = malloc(sizeof(*di));
+			if (!di)
+				continue;
+			memset(di, 0, sizeof(*di));
+			di->disk.number = i;
+			di->disk.raid_disk = i;
+			di->disk.major = dl->major;
+			di->disk.minor = dl->minor;
+			di->disk.state = 0;
+			di->data_offset = pos;
+			di->component_size = a->info.component_size;
+			/* remember which physical disk entry this is; it is
+			 * needed below to look up the refnum */
+			di->container_member = dl->pdnum;
+			di->next = rv;
+			rv = di;
+			dprintf("%x:%x to be %d at %llu\n", dl->major, dl->minor,
+				i, pos);
+
+			break;
+		}
+		if (!dl && ! global_ok) {
+			/* not enough dedicated spares, try global */
+			global_ok = 1;
+			dl = ddf->dlist;
+			goto again;
+		}
+	}
+
+	if (!rv)
+		/* No spares found */
+		return rv;
+	/* Now 'rv' has a list of devices to return.
+	 * Create a metadata_update record to update the
+	 * phys_refnum and lba_offset values
+	 */
+	vc = find_vdcr(ddf, a->info.container_member);
+	mu = vc ? malloc(sizeof(*mu)) : NULL;
+	if (mu && posix_memalign(&mu->space, 512, sizeof(struct vcl)) != 0) {
+		free(mu);
+		mu = NULL;
+	}
+	if (mu) {
+		mu->buf = malloc(ddf->conf_rec_len * 512);
+		if (!mu->buf) {
+			free(mu->space);
+			free(mu);
+			mu = NULL;
+		}
+	}
+	if (!mu) {
+		while (rv) {
+			struct mdinfo *n = rv->next;
+
+			free(rv);
+			rv = n;
+		}
+		return NULL;
+	}
+
+	/* The length is in bytes: ddf_process_update() drops a VD_CONF
+	 * record whose len is not conf_rec_len * 512.
+	 */
+	mu->len = ddf->conf_rec_len * 512;
+	mu->next = *updates;
+	memcpy(mu->buf, vc, ddf->conf_rec_len * 512);
+
+	vc = (struct vd_config*)mu->buf;
+	lba = (__u64*)&vc->phys_refnum[ddf->mppe];
+	for (di = rv ; di ; di = di->next) {
+		/* Use the physical disk recorded for this spare.  'dl' is
+		 * only the cursor of the search above: it may be NULL and
+		 * never identifies more than the last device found.
+		 */
+		vc->phys_refnum[di->disk.raid_disk] =
+			ddf->phys->entries[di->container_member].refnum;
+		/* lba_offset is kept big-endian in the metadata (it is read
+		 * back with __be64_to_cpu elsewhere in this file) */
+		lba[di->disk.raid_disk] = __cpu_to_be64(di->data_offset);
+	}
+	*updates = mu;
+	return rv;
+}
+#endif /* MDASSEMBLE */
+
+/*
+ * Default layout for a given md RAID level: 0 for levels 0 and 1,
+ * left-symmetric for level 5, rotating-N-continue for level 6, 0x102
+ * for level 10 and UnSet for every other level.
+ */
+static int ddf_level_to_layout(int level)
+{
+	if (level == 0 || level == 1)
+		return 0;
+	if (level == 5)
+		return ALGORITHM_LEFT_SYMMETRIC;
+	if (level == 6)
+		return ALGORITHM_ROTATING_N_CONTINUE;
+	if (level == 10)
+		return 0x102;
+	return UnSet;
+}
+
+/*
+ * Method table that plugs the DDF handlers of this file into the
+ * 'superswitch' interface.  Entries inside '#ifndef MDASSEMBLE' are left
+ * out when MDASSEMBLE is defined.
+ */
+struct superswitch super_ddf = {
+#ifndef MDASSEMBLE
+ .examine_super = examine_super_ddf,
+ .brief_examine_super = brief_examine_super_ddf,
+ .export_examine_super = export_examine_super_ddf,
+ .detail_super = detail_super_ddf,
+ .brief_detail_super = brief_detail_super_ddf,
+ .validate_geometry = validate_geometry_ddf,
+ .write_init_super = write_init_super_ddf,
+ .add_to_super = add_to_super_ddf,
+#endif
+ .match_home = match_home_ddf,
+ .uuid_from_super= uuid_from_super_ddf,
+ .getinfo_super = getinfo_super_ddf,
+ .update_super = update_super_ddf,
+
+ .avail_size = avail_size_ddf,
+
+ .compare_super = compare_super_ddf,
+
+ .load_super = load_super_ddf,
+ .init_super = init_super_ddf,
+ /* store_super is served by the routine that zeroes the last 512 bytes */
+ .store_super = store_zero_ddf,
+ .free_super = free_super_ddf,
+ .match_metadata_desc = match_metadata_desc_ddf,
+ .container_content = container_content_ddf,
+ .default_layout = ddf_level_to_layout,
+
+ .external = 1,
+
+#ifndef MDASSEMBLE
+/* for mdmon */
+ .open_new = ddf_open_new,
+ .set_array_state= ddf_set_array_state,
+ .set_disk = ddf_set_disk,
+ .sync_metadata = ddf_sync_metadata,
+ .process_update = ddf_process_update,
+ .prepare_update = ddf_prepare_update,
+ .activate_spare = ddf_activate_spare,
+#endif
+ .name = "ddf",
+};
--- /dev/null
+/*
+ * mdadm - Intel(R) Matrix Storage Manager Support
+ *
+ * Copyright (C) 2002-2008 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#define HAVE_STDINT_H 1
+#include "mdadm.h"
+#include "mdmon.h"
+#include "sha1.h"
+#include "platform-intel.h"
+#include <values.h>
+#include <scsi/sg.h>
+#include <ctype.h>
+#include <dirent.h>
+
+/* MPB == Metadata Parameter Block */
+/* Signature string of an MPB, followed by the version strings below */
+#define MPB_SIGNATURE "Intel Raid ISM Cfg Sig. "
+/* NOTE: expands to a strlen() call at every use, not to a constant */
+#define MPB_SIG_LEN (strlen(MPB_SIGNATURE))
+/* Version strings, named after the feature associated with each version */
+#define MPB_VERSION_RAID0 "1.0.00"
+#define MPB_VERSION_RAID1 "1.1.00"
+#define MPB_VERSION_MANY_VOLUMES_PER_ARRAY "1.2.00"
+#define MPB_VERSION_3OR4_DISK_ARRAY "1.2.01"
+#define MPB_VERSION_RAID5 "1.2.02"
+#define MPB_VERSION_5OR6_DISK_ARRAY "1.2.04"
+#define MPB_VERSION_CNG "1.2.06"
+#define MPB_VERSION_ATTRIBS "1.3.00"
+/* Size in bytes of the 'sig' field of struct imsm_super */
+#define MAX_SIGNATURE_LENGTH 32
+/* Size in bytes of a disk serial number and of a volume name */
+#define MAX_RAID_SERIAL_LEN 16
+
+/* Attribute bits, already converted to little-endian byte order */
+#define MPB_ATTRIB_CHECKSUM_VERIFY __cpu_to_le32(0x80000000)
+#define MPB_ATTRIB_PM __cpu_to_le32(0x40000000)
+#define MPB_ATTRIB_2TB __cpu_to_le32(0x20000000)
+#define MPB_ATTRIB_RAID0 __cpu_to_le32(0x00000001)
+#define MPB_ATTRIB_RAID1 __cpu_to_le32(0x00000002)
+#define MPB_ATTRIB_RAID10 __cpu_to_le32(0x00000004)
+#define MPB_ATTRIB_RAID1E __cpu_to_le32(0x00000008)
+#define MPB_ATTRIB_RAID5 __cpu_to_le32(0x00000010)
+#define MPB_ATTRIB_RAIDCNG __cpu_to_le32(0x00000020)
+
+/* Sizes expressed in sectors; SECT_PER_MB_SHIFT is 11 because 2^11 sectors
+ * of 512 bytes make one megabyte */
+#define MPB_SECTOR_CNT 418
+#define IMSM_RESERVED_SECTORS 4096
+#define SECT_PER_MB_SHIFT 11
+
+/* Disk configuration info. */
+#define IMSM_MAX_DEVICES 255
+/*
+ * One entry of the disk table that ends struct imsm_super.  The byte
+ * offsets in the field comments are those of the first table entry,
+ * counted from the start of struct imsm_super.
+ */
+struct imsm_disk {
+ __u8 serial[MAX_RAID_SERIAL_LEN];/* 0xD8 - 0xE7 ascii serial number */
+ __u32 total_blocks; /* 0xE8 - 0xEB total blocks */
+ __u32 scsi_id; /* 0xEC - 0xEF scsi ID */
+/* Values for 'status' below, already in little-endian byte order */
+#define SPARE_DISK __cpu_to_le32(0x01) /* Spare */
+#define CONFIGURED_DISK __cpu_to_le32(0x02) /* Member of some RaidDev */
+#define FAILED_DISK __cpu_to_le32(0x04) /* Permanent failure */
+#define USABLE_DISK __cpu_to_le32(0x08) /* Fully usable unless FAILED_DISK is set */
+ __u32 status; /* 0xF0 - 0xF3 */
+ __u32 owner_cfg_num; /* 0xF4 - 0xF7 which config 0,1,2... owns this disk */
+#define IMSM_DISK_FILLERS 4
+ __u32 filler[IMSM_DISK_FILLERS]; /* 0xF8 - 0x107 MPB_DISK_FILLERS for future expansion */
+};
+
+/* RAID map configuration infos. */
+/*
+ * Packed record; the layout must not be changed.  disk_ord_tbl is
+ * declared with one element but really holds num_members entries, so
+ * the size of a map depends on its member count.
+ */
+struct imsm_map {
+ __u32 pba_of_lba0; /* start address of partition */
+ __u32 blocks_per_member;/* blocks per member */
+ __u32 num_data_stripes; /* number of data stripes */
+ __u16 blocks_per_strip;
+ __u8 map_state; /* Normal, Uninitialized, Degraded, Failed */
+/* Values for map_state */
+#define IMSM_T_STATE_NORMAL 0
+#define IMSM_T_STATE_UNINITIALIZED 1
+#define IMSM_T_STATE_DEGRADED 2
+#define IMSM_T_STATE_FAILED 3
+ __u8 raid_level;
+/* Values for raid_level */
+#define IMSM_T_RAID0 0
+#define IMSM_T_RAID1 1
+#define IMSM_T_RAID5 5 /* since metadata version 1.2.02 ? */
+ __u8 num_members; /* number of member disks */
+ __u8 num_domains; /* number of parity domains */
+ __u8 failed_disk_num; /* valid only when state is degraded */
+ __u8 ddf;
+ __u32 filler[7]; /* expansion area */
+/* Flag stored in the top byte of a disk_ord_tbl entry */
+#define IMSM_ORD_REBUILD (1 << 24)
+ __u32 disk_ord_tbl[1]; /* disk_ord_tbl[num_members],
+ * top byte contains some flags
+ */
+} __attribute__ ((packed));
+
+/*
+ * Volume state: migration progress followed by one map, or by two maps
+ * while migr_state is set.  Packed record; the layout must not be
+ * changed.
+ */
+struct imsm_vol {
+ __u32 curr_migr_unit;
+ __u32 checkpoint_id; /* id to access curr_migr_unit */
+ __u8 migr_state; /* Normal or Migrating */
+/* Values for migr_type */
+#define MIGR_INIT 0
+#define MIGR_REBUILD 1
+#define MIGR_VERIFY 2 /* analogous to echo check > sync_action */
+#define MIGR_GEN_MIGR 3
+#define MIGR_STATE_CHANGE 4
+#define MIGR_REPAIR 5
+ __u8 migr_type; /* Initializing, Rebuilding, ... */
+ __u8 dirty;
+ __u8 fs_state; /* fast-sync state for CnG (0xff == disabled) */
+ __u16 verify_errors; /* number of mismatches */
+ __u16 bad_blocks; /* number of bad blocks during verify */
+ __u32 filler[4];
+ struct imsm_map map[1];
+ /* here comes another one if migr_state */
+} __attribute__ ((packed));
+
+/*
+ * Description of one volume ("RaidDev").  Packed record; the layout must
+ * not be changed.  It ends with a struct imsm_vol, whose size varies
+ * with its maps, so the whole record is variable-length.
+ */
+struct imsm_dev {
+ __u8 volume[MAX_RAID_SERIAL_LEN];
+ /* volume size split into two 32-bit halves */
+ __u32 size_low;
+ __u32 size_high;
+/* Bits for 'status' below, already in little-endian byte order */
+#define DEV_BOOTABLE __cpu_to_le32(0x01)
+#define DEV_BOOT_DEVICE __cpu_to_le32(0x02)
+#define DEV_READ_COALESCING __cpu_to_le32(0x04)
+#define DEV_WRITE_COALESCING __cpu_to_le32(0x08)
+#define DEV_LAST_SHUTDOWN_DIRTY __cpu_to_le32(0x10)
+#define DEV_HIDDEN_AT_BOOT __cpu_to_le32(0x20)
+#define DEV_CURRENTLY_HIDDEN __cpu_to_le32(0x40)
+#define DEV_VERIFY_AND_FIX __cpu_to_le32(0x80)
+#define DEV_MAP_STATE_UNINIT __cpu_to_le32(0x100)
+#define DEV_NO_AUTO_RECOVERY __cpu_to_le32(0x200)
+#define DEV_CLONE_N_GO __cpu_to_le32(0x400)
+#define DEV_CLONE_MAN_SYNC __cpu_to_le32(0x800)
+#define DEV_CNG_MASTER_DISK_NUM __cpu_to_le32(0x1000)
+ __u32 status; /* Persistent RaidDev status */
+ __u32 reserved_blocks; /* Reserved blocks at beginning of volume */
+ __u8 migr_priority;
+ __u8 num_sub_vols;
+ __u8 tid;
+ __u8 cng_master_disk;
+ __u16 cache_policy;
+ __u8 cng_state;
+ __u8 cng_sub_state;
+#define IMSM_DEV_FILLERS 10
+ __u32 filler[IMSM_DEV_FILLERS];
+ struct imsm_vol vol;
+} __attribute__ ((packed));
+
+/*
+ * Fixed head of the MPB.  Packed record; the layout must not be changed.
+ * The comments give the byte range of each field from the start of the
+ * structure.  The disk table, the volume descriptions and the BBM log
+ * follow one another as variable-length data.
+ */
+struct imsm_super {
+ __u8 sig[MAX_SIGNATURE_LENGTH]; /* 0x00 - 0x1F */
+ __u32 check_sum; /* 0x20 - 0x23 MPB Checksum */
+ __u32 mpb_size; /* 0x24 - 0x27 Size of MPB */
+ __u32 family_num; /* 0x28 - 0x2B Checksum from first time this config was written */
+ __u32 generation_num; /* 0x2C - 0x2F Incremented each time this array's MPB is written */
+ __u32 error_log_size; /* 0x30 - 0x33 in bytes */
+ __u32 attributes; /* 0x34 - 0x37 */
+ __u8 num_disks; /* 0x38 Number of configured disks */
+ __u8 num_raid_devs; /* 0x39 Number of configured volumes */
+ __u8 error_log_pos; /* 0x3A */
+ __u8 fill[1]; /* 0x3B */
+ __u32 cache_size; /* 0x3C - 0x3F in mb */
+ __u32 orig_family_num; /* 0x40 - 0x43 original family num */
+ __u32 pwr_cycle_count; /* 0x44 - 0x47 simulated power cycle count for array */
+ __u32 bbm_log_size; /* 0x48 - 0x4B - size of bad Block Mgmt Log in bytes */
+#define IMSM_FILLERS 35
+ __u32 filler[IMSM_FILLERS]; /* 0x4C - 0xD7 RAID_MPB_FILLERS */
+ struct imsm_disk disk[1]; /* 0xD8 diskTbl[numDisks] */
+ /* here comes imsm_dev[num_raid_devs] */
+ /* here comes BBM logs */
+} __attribute__ ((packed));
+
+#define BBM_LOG_MAX_ENTRIES 254
+
+struct bbm_log_entry {
+ __u64 defective_block_start;
+#define UNREADABLE 0xFFFFFFFF
+ __u32 spare_block_offset;
+ __u16 remapped_marked_count;
+ __u16 disk_ordinal;
+} __attribute__ ((__packed__));
+
+struct bbm_log {
+ __u32 signature; /* 0xABADB10C */
+ __u32 entry_count;
+ __u32 reserved_spare_block_count; /* 0 */
+ __u32 reserved; /* 0xFFFF */
+ __u64 first_spare_lba;
+ struct bbm_log_entry mapped_block_entries[BBM_LOG_MAX_ENTRIES];
+} __attribute__ ((__packed__));
+
+
+#ifndef MDASSEMBLE
+static char *map_state_str[] = { "normal", "uninitialized", "degraded", "failed" };
+#endif
+
+/* Report the effective migration type of 'dev': a MIGR_VERIFY that carries
+ * the DEV_VERIFY_AND_FIX status flag is presented as MIGR_REPAIR.
+ */
+static __u8 migr_type(struct imsm_dev *dev)
+{
+	int verifying = dev->vol.migr_type == MIGR_VERIFY;
+	int fixing = (dev->status & DEV_VERIFY_AND_FIX) != 0;
+
+	return (verifying && fixing) ? MIGR_REPAIR : dev->vol.migr_type;
+}
+
+/* Record 'migr_type' in 'dev'. For compatibility with older oroms a
+ * MIGR_REPAIR is stored as MIGR_VERIFY plus the DEV_VERIFY_AND_FIX status
+ * flag; every other type is stored verbatim with that flag cleared.
+ */
+static void set_migr_type(struct imsm_dev *dev, __u8 migr_type)
+{
+	if (migr_type != MIGR_REPAIR) {
+		dev->vol.migr_type = migr_type;
+		dev->status &= ~DEV_VERIFY_AND_FIX;
+		return;
+	}
+
+	dev->vol.migr_type = MIGR_VERIFY;
+	dev->status |= DEV_VERIFY_AND_FIX;
+}
+
+/* number of 512-byte sectors needed to hold 'bytes', rounding up */
+static unsigned int sector_count(__u32 bytes)
+{
+	const unsigned int sector_size = 512;
+
+	return (bytes + (sector_size - 1)) / sector_size;
+}
+
+/* number of 512-byte sectors covered by the size recorded in the anchor */
+static unsigned int mpb_sectors(struct imsm_super *mpb)
+{
+	__u32 mpb_bytes = __le32_to_cpu(mpb->mpb_size);
+
+	return sector_count(mpb_bytes);
+}
+
+struct intel_dev {
+ struct imsm_dev *dev;
+ struct intel_dev *next;
+ int index;
+};
+
+/* internal representation of IMSM metadata */
+struct intel_super {
+ union {
+ void *buf; /* O_DIRECT buffer for reading/writing metadata */
+ struct imsm_super *anchor; /* immovable parameters */
+ };
+ size_t len; /* size of the 'buf' allocation */
+ void *next_buf; /* for realloc'ing buf from the manager */
+ size_t next_len;
+ int updates_pending; /* count of pending updates for mdmon */
+ int creating_imsm; /* flag to indicate container creation */
+ int current_vol; /* index of raid device undergoing creation */
+ __u32 create_offset; /* common start for 'current_vol' */
+ struct intel_dev *devlist;
+ struct dl {
+ struct dl *next;
+ int index;
+ __u8 serial[MAX_RAID_SERIAL_LEN];
+ int major, minor;
+ char *devname;
+ struct imsm_disk disk;
+ int fd;
+ int extent_cnt;
+ struct extent *e; /* for determining freespace @ create */
+ int raiddisk; /* slot to fill in autolayout */
+ } *disks;
+ struct dl *add; /* list of disks to add while mdmon active */
+ struct dl *missing; /* disks removed while we weren't looking */
+ struct bbm_log *bbm_log;
+ const char *hba; /* device path of the raid controller for this metadata */
+ const struct imsm_orom *orom; /* platform firmware support */
+};
+
+struct extent {
+ unsigned long long start, size;
+};
+
+/* definition of messages passed to imsm_process_update */
+enum imsm_update_type {
+ update_activate_spare,
+ update_create_array,
+ update_add_disk,
+};
+
+struct imsm_update_activate_spare {
+ enum imsm_update_type type;
+ struct dl *dl;
+ int slot;
+ int array;
+ struct imsm_update_activate_spare *next;
+};
+
+struct disk_info {
+ __u8 serial[MAX_RAID_SERIAL_LEN];
+};
+
+struct imsm_update_create_array {
+ enum imsm_update_type type;
+ int dev_idx;
+ struct imsm_dev dev;
+};
+
+struct imsm_update_add_disk {
+ enum imsm_update_type type;
+};
+
+/* Return a freshly allocated, zeroed supertype bound to super_imsm when
+ * 'arg' names this format ("imsm" or "default").
+ *
+ * Returns NULL when 'arg' does not match or when the allocation fails.
+ * The caller owns the returned object.
+ */
+static struct supertype *match_metadata_desc_imsm(char *arg)
+{
+	struct supertype *st;
+
+	if (strcmp(arg, "imsm") != 0 &&
+	    strcmp(arg, "default") != 0)
+		return NULL;
+
+	st = malloc(sizeof(*st));
+	if (!st)
+		return NULL; /* do not memset() through a NULL pointer */
+	memset(st, 0, sizeof(*st));
+	st->ss = &super_imsm;
+	st->max_devs = IMSM_MAX_DEVICES;
+	st->minor_version = 0;
+	st->sb = NULL;
+	return st;
+}
+
+#ifndef MDASSEMBLE
+static __u8 *get_imsm_version(struct imsm_super *mpb)
+{
+ return &mpb->sig[MPB_SIG_LEN];
+}
+#endif
+
+/* Fetch disk entry 'index' straight from the anchor's disk table; only
+ * valid while the anchor is known to be up-to-date (currently load time).
+ * Returns NULL when 'index' is not below num_disks.
+ */
+static struct imsm_disk *__get_imsm_disk(struct imsm_super *mpb, __u8 index)
+{
+	return index < mpb->num_disks ? &mpb->disk[index] : NULL;
+}
+
+#ifndef MDASSEMBLE
+/* look up the disk with the given index in the parsed disk list of 'super';
+ * returns NULL when no list entry carries that index
+ */
+static struct imsm_disk *get_imsm_disk(struct intel_super *super, __u8 index)
+{
+	struct dl *cur = super->disks;
+
+	while (cur && cur->index != index)
+		cur = cur->next;
+
+	return cur ? &cur->disk : NULL;
+}
+#endif
+
+/* Generate a checksum directly from the anchor when the anchor is known to
+ * be up-to-date, currently only at load or write_super after coalescing.
+ *
+ * The result is the sum of all little-endian 32-bit words covered by
+ * mpb_size, minus the stored check_sum word so that the checksum field
+ * does not contribute to its own value.
+ */
+static __u32 __gen_imsm_checksum(struct imsm_super *mpb)
+{
+	/* mpb_size is stored little-endian like every other anchor field
+	 * (see mpb_sectors()), so convert it before using it as a count
+	 */
+	__u32 end = __le32_to_cpu(mpb->mpb_size) / sizeof(end);
+	__u32 *p = (__u32 *) mpb;
+	__u32 sum = 0;
+
+	while (end--) {
+		sum += __le32_to_cpu(*p);
+		p++;
+	}
+
+	return sum - __le32_to_cpu(mpb->check_sum);
+}
+
+/* size in bytes of 'map' including its variable-length disk_ord_tbl; one
+ * table slot is already counted in sizeof(struct imsm_map)
+ */
+static size_t sizeof_imsm_map(struct imsm_map *map)
+{
+	size_t extra_slots = map->num_members - 1;
+
+	return sizeof(struct imsm_map) + sizeof(__u32) * extra_slots;
+}
+
+/* Return the first map of 'dev', or - when 'second_map' is non-zero - the
+ * map stored directly behind it. The second map only exists while
+ * migr_state is set; NULL is returned if it is requested otherwise.
+ */
+struct imsm_map *get_imsm_map(struct imsm_dev *dev, int second_map)
+{
+	struct imsm_map *first = &dev->vol.map[0];
+	void *base = first;
+
+	if (!second_map)
+		return first;
+	if (!dev->vol.migr_state)
+		return NULL;
+
+	return base + sizeof_imsm_map(first);
+}
+
+/* Size in bytes of the device record 'dev' with its variable-length maps.
+ * A device that is migrating carries a second map; passing a non-zero
+ * 'migr_state' for an idle device returns the size it would have if
+ * map[0] were duplicated.
+ */
+static size_t sizeof_imsm_dev(struct imsm_dev *dev, int migr_state)
+{
+	struct imsm_map *first = get_imsm_map(dev, 0);
+	size_t total = sizeof(*dev) - sizeof(struct imsm_map);
+
+	total += sizeof_imsm_map(first);
+
+	if (dev->vol.migr_state)
+		total += sizeof_imsm_map(get_imsm_map(dev, 1));
+	else if (migr_state)
+		total += sizeof_imsm_map(first);
+
+	return total;
+}
+
+#ifndef MDASSEMBLE
+/* retrieve disk serial number list from a metadata update */
+static struct disk_info *get_disk_info(struct imsm_update_create_array *update)
+{
+ void *u = update;
+ struct disk_info *inf;
+
+ inf = u + sizeof(*update) - sizeof(struct imsm_dev) +
+ sizeof_imsm_dev(&update->dev, 0);
+
+ return inf;
+}
+#endif
+
+/* Locate raid device record 'index' inside the raw anchor buffer. Records
+ * are variable-sized and start right behind the disk table, so every
+ * preceding record has to be stepped over. Returns NULL when 'index' is
+ * not below num_raid_devs.
+ */
+static struct imsm_dev *__get_imsm_dev(struct imsm_super *mpb, __u8 index)
+{
+	void *base = mpb;
+	int offset;
+	int i;
+
+	if (index >= mpb->num_raid_devs)
+		return NULL;
+
+	/* devices start after all disks */
+	offset = ((void *) &mpb->disk[mpb->num_disks]) - base;
+
+	/* skip the records in front of the requested one */
+	for (i = 0; i < index; i++)
+		offset += sizeof_imsm_dev(base + offset, 0);
+
+	return base + offset;
+}
+
+/* look up raid device 'index' in the parsed device list of 'super';
+ * returns NULL when the index is out of range or not in the list
+ */
+static struct imsm_dev *get_imsm_dev(struct intel_super *super, __u8 index)
+{
+	struct intel_dev *cur = super->devlist;
+
+	if (index >= super->anchor->num_raid_devs)
+		return NULL;
+
+	while (cur) {
+		if (cur->index == index)
+			return cur->dev;
+		cur = cur->next;
+	}
+
+	return NULL;
+}
+
+/* Return the raw disk_ord_tbl entry for 'slot', read from the second map
+ * while the device is migrating and from the first map otherwise.
+ */
+static __u32 get_imsm_ord_tbl_ent(struct imsm_dev *dev, int slot)
+{
+	int use_second = dev->vol.migr_state ? 1 : 0;
+	struct imsm_map *map = get_imsm_map(dev, use_second);
+
+	/* top byte identifies disk under rebuild */
+	return __le32_to_cpu(map->disk_ord_tbl[slot]);
+}
+
+#define ord_to_idx(ord) (((ord) << 8) >> 8)
+static __u32 get_imsm_disk_idx(struct imsm_dev *dev, int slot)
+{
+ __u32 ord = get_imsm_ord_tbl_ent(dev, slot);
+
+ return ord_to_idx(ord);
+}
+
+static void set_imsm_ord_tbl_ent(struct imsm_map *map, int slot, __u32 ord)
+{
+ map->disk_ord_tbl[slot] = __cpu_to_le32(ord);
+}
+
+/* find the slot in 'map' whose disk_ord_tbl entry refers to disk index
+ * 'idx'; returns the slot number, or -1 when the disk is not a member
+ */
+static int get_imsm_disk_slot(struct imsm_map *map, int idx)
+{
+	int slot = 0;
+
+	while (slot < map->num_members) {
+		__u32 ord = __le32_to_cpu(map->disk_ord_tbl[slot]);
+
+		if (ord_to_idx(ord) == idx)
+			return slot;
+		slot++;
+	}
+
+	return -1;
+}
+
+/* Translate the raid_level recorded in 'map' into the level reported to
+ * the caller: a level 1 map with two members is reported as 1, a level 1
+ * map with any other member count as 10; other levels pass through.
+ */
+static int get_imsm_raid_level(struct imsm_map *map)
+{
+	if (map->raid_level != 1)
+		return map->raid_level;
+
+	return map->num_members == 2 ? 1 : 10;
+}
+
+/* qsort() comparator ordering extents by ascending start */
+static int cmp_extent(const void *av, const void *bv)
+{
+	const struct extent *lhs = av;
+	const struct extent *rhs = bv;
+
+	return (lhs->start > rhs->start) - (lhs->start < rhs->start);
+}
+
+/* count the raid devices of 'super' whose first map has a slot for 'dl' */
+static int count_memberships(struct dl *dl, struct intel_super *super)
+{
+	int count = 0;
+	int vol;
+
+	for (vol = 0; vol < super->anchor->num_raid_devs; vol++) {
+		struct imsm_dev *dev = get_imsm_dev(super, vol);
+		struct imsm_map *map = get_imsm_map(dev, 0);
+
+		if (get_imsm_disk_slot(map, dl->index) < 0)
+			continue;
+		count++;
+	}
+
+	return count;
+}
+
+/* Build the list of extents in use on disk 'dl'.
+ *
+ * Returns a malloc()ed array with one entry per raid device that 'dl' is a
+ * member of, sorted by start, followed by a terminating entry with size 0
+ * whose start marks the beginning of the metadata reservation at the end
+ * of the disk. Returns NULL if the allocation fails. The caller frees the
+ * array.
+ */
+static struct extent *get_extents(struct intel_super *super, struct dl *dl)
+{
+ /* find a list of used extents on the given physical device */
+ struct extent *rv, *e;
+ int i;
+ int memberships = count_memberships(dl, super);
+ __u32 reservation = MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS;
+
+ /* one slot per membership plus the terminating entry */
+ rv = malloc(sizeof(struct extent) * (memberships + 1));
+ if (!rv)
+ return NULL;
+ e = rv;
+
+ /* NOTE(review): assumes get_imsm_dev() finds every index below
+ * num_raid_devs; a NULL result would be dereferenced here - confirm
+ */
+ for (i = 0; i < super->anchor->num_raid_devs; i++) {
+ struct imsm_dev *dev = get_imsm_dev(super, i);
+ struct imsm_map *map = get_imsm_map(dev, 0);
+
+ if (get_imsm_disk_slot(map, dl->index) >= 0) {
+ e->start = __le32_to_cpu(map->pba_of_lba0);
+ e->size = __le32_to_cpu(map->blocks_per_member);
+ e++;
+ }
+ }
+ /* only the membership entries are sorted; 'e' now points at the
+ * still unwritten terminating slot
+ */
+ qsort(rv, memberships, sizeof(*rv), cmp_extent);
+
+ /* determine the start of the metadata
+ * when no raid devices are defined use the default
+ * ...otherwise allow the metadata to truncate the value
+ * as is the case with older versions of imsm
+ */
+ if (memberships) {
+ struct extent *last = &rv[memberships - 1];
+ __u32 remainder;
+
+ /* space between the end of the last extent and the end of the disk */
+ remainder = __le32_to_cpu(dl->disk.total_blocks) -
+ (last->start + last->size);
+ /* round down to 1k block to satisfy precision of the kernel
+ * 'size' interface
+ */
+ remainder &= ~1UL;
+ /* make sure remainder is still sane */
+ if (remainder < ROUND_UP(super->len, 512) >> 9)
+ remainder = ROUND_UP(super->len, 512) >> 9;
+ if (reservation > remainder)
+ reservation = remainder;
+ }
+ /* terminating entry: size 0, start = first reserved sector */
+ e->start = __le32_to_cpu(dl->disk.total_blocks) - reservation;
+ e->size = 0;
+ return rv;
+}
+
+/* Work out how many sectors at the end of 'dl' are reserved for metadata,
+ * using the terminating entry of the get_extents() list; when that list
+ * cannot be built the default reservation is returned instead.
+ */
+static __u32 imsm_reserved_sectors(struct intel_super *super, struct dl *dl)
+{
+	struct extent *ext;
+	__u32 reserved;
+	int last = 0;
+
+	/* for spares just return a minimal reservation which will grow
+	 * once the spare is picked up by an array
+	 */
+	if (dl->index == -1)
+		return MPB_SECTOR_CNT;
+
+	ext = get_extents(super, dl);
+	if (!ext)
+		return MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS;
+
+	/* the terminating entry is the first one with a zero size */
+	while (ext[last].size)
+		last++;
+
+	reserved = __le32_to_cpu(dl->disk.total_blocks) - ext[last].start;
+	free(ext);
+
+	return reserved;
+}
+
+#ifndef MDASSEMBLE
+/* translate an on-disk map state into a printable name; values beyond the
+ * end of map_state_str (corrupt or unknown metadata) yield "<unknown>"
+ * instead of an out-of-bounds read
+ */
+static const char *imsm_map_state_name(__u8 state)
+{
+	if (state >= sizeof(map_state_str) / sizeof(map_state_str[0]))
+		return "<unknown>";
+	return map_state_str[state];
+}
+
+/* Print a human readable description of raid device 'dev' to stdout.
+ *
+ * uuid:     already formatted uuid string for this volume
+ * disk_idx: index of the disk being examined, used to report which slot
+ *           of the volume it occupies ('?' when it is not a member)
+ */
+static void print_imsm_dev(struct imsm_dev *dev, char *uuid, int disk_idx)
+{
+	__u64 sz;
+	int slot;
+	struct imsm_map *map = get_imsm_map(dev, 0);
+	__u32 ord;
+
+	printf("\n");
+	printf("[%.16s]:\n", dev->volume);
+	printf(" UUID : %s\n", uuid);
+	printf(" RAID Level : %d\n", get_imsm_raid_level(map));
+	printf(" Members : %d\n", map->num_members);
+	slot = get_imsm_disk_slot(map, disk_idx);
+	if (slot >= 0) {
+		ord = get_imsm_ord_tbl_ent(dev, slot);
+		printf(" This Slot : %d%s\n", slot,
+		       ord & IMSM_ORD_REBUILD ? " (out-of-sync)" : "");
+	} else
+		printf(" This Slot : ?\n");
+	/* the array size is split over two 32-bit little-endian fields */
+	sz = __le32_to_cpu(dev->size_high);
+	sz <<= 32;
+	sz += __le32_to_cpu(dev->size_low);
+	printf(" Array Size : %llu%s\n", (unsigned long long)sz,
+	       human_size(sz * 512));
+	sz = __le32_to_cpu(map->blocks_per_member);
+	printf(" Per Dev Size : %llu%s\n", (unsigned long long)sz,
+	       human_size(sz * 512));
+	printf(" Sector Offset : %u\n",
+	       __le32_to_cpu(map->pba_of_lba0));
+	printf(" Num Stripes : %u\n",
+	       __le32_to_cpu(map->num_data_stripes));
+	printf(" Chunk Size : %u KiB\n",
+	       __le16_to_cpu(map->blocks_per_strip) / 2);
+	/* reserved_blocks is unsigned, print it as such */
+	printf(" Reserved : %u\n", __le32_to_cpu(dev->reserved_blocks));
+	/* when migrating the line is completed by the type printed below */
+	printf(" Migrate State : %s", dev->vol.migr_state ? "migrating" : "idle\n");
+	if (dev->vol.migr_state) {
+		__u8 type = migr_type(dev);
+
+		if (type == MIGR_INIT)
+			printf(": initializing\n");
+		else if (type == MIGR_REBUILD)
+			printf(": rebuilding\n");
+		else if (type == MIGR_VERIFY)
+			printf(": check\n");
+		else if (type == MIGR_GEN_MIGR)
+			printf(": general migration\n");
+		else if (type == MIGR_STATE_CHANGE)
+			printf(": state change\n");
+		else if (type == MIGR_REPAIR)
+			printf(": repair\n");
+		else
+			printf(": <unknown:%d>\n", type);
+	}
+	printf(" Map State : %s", imsm_map_state_name(map->map_state));
+	if (dev->vol.migr_state) {
+		struct imsm_map *second = get_imsm_map(dev, 1);
+
+		printf(" <-- %s", imsm_map_state_name(second->map_state));
+	}
+	printf("\n");
+	printf(" Dirty State : %s\n", dev->vol.dirty ? "dirty" : "clean");
+}
+
+/* Print serial, state, id and usable size of disk 'index' from the anchor
+ * 'mpb' to stdout.
+ *
+ * index:    position in the anchor's disk table; negative values (e.g. a
+ *           spare's index) and values not below num_disks print nothing
+ * reserved: number of sectors to subtract from the disk's total_blocks
+ *           when reporting the usable size
+ */
+static void print_imsm_disk(struct imsm_super *mpb, int index, __u32 reserved)
+{
+	struct imsm_disk *disk;
+	char str[MAX_RAID_SERIAL_LEN + 1];
+	__u32 s;
+	__u64 sz;
+
+	/* validate before the lookup: the helper takes a __u8, so a negative
+	 * or oversized int would otherwise be truncated to a different index
+	 */
+	if (index < 0 || index >= mpb->num_disks)
+		return;
+
+	disk = __get_imsm_disk(mpb, index);
+	if (!disk)
+		return;
+
+	printf("\n");
+	/* bound the read as well as the write: the serial field is assumed
+	 * to hold up to MAX_RAID_SERIAL_LEN bytes without a terminating NUL
+	 */
+	snprintf(str, sizeof(str), "%.*s", (int) MAX_RAID_SERIAL_LEN,
+		 (const char *) disk->serial);
+	printf(" Disk%02d Serial : %s\n", index, str);
+	s = disk->status;
+	printf(" State :%s%s%s%s\n", s&SPARE_DISK ? " spare" : "",
+	       s&CONFIGURED_DISK ? " active" : "",
+	       s&FAILED_DISK ? " failed" : "",
+	       s&USABLE_DISK ? " usable" : "");
+	printf(" Id : %08x\n", __le32_to_cpu(disk->scsi_id));
+	sz = __le32_to_cpu(disk->total_blocks) - reserved;
+	printf(" Usable Size : %llu%s\n", (unsigned long long)sz,
+	       human_size(sz * 512));
+}
+
+static void getinfo_super_imsm(struct supertype *st, struct mdinfo *info);
+
+/* Print a full description of the loaded metadata to stdout: the anchor
+ * fields, the disk at the head of super->disks, the bad block log if one
+ * is loaded, every raid device and finally all remaining disks.
+ * 'homehost' is not used. super->current_vol is modified while the raid
+ * devices are printed and is not restored.
+ */
+static void examine_super_imsm(struct supertype *st, char *homehost)
+{
+ struct intel_super *super = st->sb;
+ struct imsm_super *mpb = super->anchor;
+ char str[MAX_SIGNATURE_LENGTH];
+ int i;
+ struct mdinfo info;
+ char nbuf[64];
+ __u32 sum;
+ /* NOTE(review): super->disks is dereferenced here and below without a
+ * NULL check - presumably a loaded super always has one entry; confirm
+ */
+ __u32 reserved = imsm_reserved_sectors(super, super->disks);
+
+
+ snprintf(str, MPB_SIG_LEN, "%s", mpb->sig);
+ printf(" Magic : %s\n", str);
+ /* NOTE(review): 'str' is filled with a truncated version here but the
+ * untruncated signature tail is what gets printed - confirm intent
+ */
+ snprintf(str, strlen(MPB_VERSION_RAID0), "%s", get_imsm_version(mpb));
+ printf(" Version : %s\n", get_imsm_version(mpb));
+ printf(" Family : %08x\n", __le32_to_cpu(mpb->family_num));
+ printf(" Generation : %08x\n", __le32_to_cpu(mpb->generation_num));
+ getinfo_super_imsm(st, &info);
+ fname_from_uuid(st, &info, nbuf, ':');
+ /* the first 5 characters of the formatted name are skipped */
+ printf(" UUID : %s\n", nbuf + 5);
+ sum = __le32_to_cpu(mpb->check_sum);
+ printf(" Checksum : %08x %s\n", sum,
+ __gen_imsm_checksum(mpb) == sum ? "correct" : "incorrect");
+ printf(" MPB Sectors : %d\n", mpb_sectors(mpb));
+ printf(" Disks : %d\n", mpb->num_disks);
+ printf(" RAID Devices : %d\n", mpb->num_raid_devs);
+ /* the examined disk is printed first, the others follow at the end */
+ print_imsm_disk(mpb, super->disks->index, reserved);
+ if (super->bbm_log) {
+ struct bbm_log *log = super->bbm_log;
+
+ printf("\n");
+ printf("Bad Block Management Log:\n");
+ printf(" Log Size : %d\n", __le32_to_cpu(mpb->bbm_log_size));
+ printf(" Signature : %x\n", __le32_to_cpu(log->signature));
+ printf(" Entry Count : %d\n", __le32_to_cpu(log->entry_count));
+ printf(" Spare Blocks : %d\n", __le32_to_cpu(log->reserved_spare_block_count));
+ printf(" First Spare : %llx\n",
+ (unsigned long long) __le64_to_cpu(log->first_spare_lba));
+ }
+ for (i = 0; i < mpb->num_raid_devs; i++) {
+ struct mdinfo info;
+ struct imsm_dev *dev = __get_imsm_dev(mpb, i);
+
+ /* select the volume so getinfo_super_imsm() reports on it */
+ super->current_vol = i;
+ getinfo_super_imsm(st, &info);
+ fname_from_uuid(st, &info, nbuf, ':');
+ print_imsm_dev(dev, nbuf + 5, super->disks->index);
+ }
+ for (i = 0; i < mpb->num_disks; i++) {
+ if (i == super->disks->index)
+ continue;
+ print_imsm_disk(mpb, i, reserved);
+ }
+}
+
+/* Print "ARRAY" lines for the container and for each of its raid devices
+ * to stdout; nothing is printed when the container has no raid devices.
+ * 'verbose' is not used. super->current_vol is modified and not restored.
+ */
+static void brief_examine_super_imsm(struct supertype *st, int verbose)
+{
+ /* We just write a generic IMSM ARRAY entry */
+ struct mdinfo info;
+ char nbuf[64];
+ char nbuf1[64];
+ struct intel_super *super = st->sb;
+ int i;
+
+ if (!super->anchor->num_raid_devs)
+ return;
+
+ /* nbuf keeps the first uuid for the container= field of each member */
+ getinfo_super_imsm(st, &info);
+ fname_from_uuid(st, &info, nbuf, ':');
+ printf("ARRAY metadata=imsm auto=md UUID=%s\n", nbuf + 5);
+ for (i = 0; i < super->anchor->num_raid_devs; i++) {
+ struct imsm_dev *dev = get_imsm_dev(super, i);
+
+ /* select the volume so getinfo_super_imsm() reports on it */
+ super->current_vol = i;
+ getinfo_super_imsm(st, &info);
+ fname_from_uuid(st, &info, nbuf1, ':');
+ printf("ARRAY /dev/md/%.16s container=%s\n"
+ " member=%d auto=mdp UUID=%s\n",
+ dev->volume, nbuf + 5, i, nbuf1 + 5);
+ }
+}
+
+static void export_examine_super_imsm(struct supertype *st)
+{
+ struct intel_super *super = st->sb;
+ struct imsm_super *mpb = super->anchor;
+ struct mdinfo info;
+ char nbuf[64];
+
+ getinfo_super_imsm(st, &info);
+ fname_from_uuid(st, &info, nbuf, ':');
+ printf("MD_METADATA=imsm\n");
+ printf("MD_LEVEL=container\n");
+ printf("MD_UUID=%s\n", nbuf+5);
+ printf("MD_DEVICES=%u\n", mpb->num_disks);
+}
+
+static void detail_super_imsm(struct supertype *st, char *homehost)
+{
+ struct mdinfo info;
+ char nbuf[64];
+
+ getinfo_super_imsm(st, &info);
+ fname_from_uuid(st, &info, nbuf, ':');
+ printf("\n UUID : %s\n", nbuf + 5);
+}
+
+static void brief_detail_super_imsm(struct supertype *st)
+{
+ struct mdinfo info;
+ char nbuf[64];
+ getinfo_super_imsm(st, &info);
+ fname_from_uuid(st, &info, nbuf, ':');
+ printf(" UUID=%s", nbuf + 5);
+}
+
+static int imsm_read_serial(int fd, char *devname, __u8 *serial);
+static void fd2devname(int fd, char *name);
+
+/* strip trailing whitespace from the NUL-terminated string 's' in place;
+ * an empty or all-whitespace string becomes empty without touching memory
+ * in front of the buffer
+ */
+static void imsm_strip_trailing_space(char *s)
+{
+	size_t len = strlen(s);
+
+	while (len > 0 && isspace((unsigned char) s[len - 1]))
+		s[--len] = '\0';
+}
+
+/* Print one line per port of the controller at 'hba_path' to stdout: the
+ * attached disk (with serial), a description of a non-disk device, or a
+ * note that nothing is attached.
+ *
+ * hba_path:   device path of the controller
+ * port_count: number of ports to report, at most the bit width of long
+ * host_base:  scsi host number that corresponds to port 0
+ * verbose:    when non-zero, failures are reported on stderr
+ *
+ * Returns 0 on success and 2 on failure.
+ */
+static int imsm_enumerate_ports(const char *hba_path, int port_count, int host_base, int verbose)
+{
+	/* dump an unsorted list of devices attached to ahci, as well as
+	 * non-connected ports
+	 */
+	int hba_len = strlen(hba_path) + 1;
+	const int mask_bits = sizeof(unsigned long) * 8;
+	struct dirent *ent;
+	DIR *dir;
+	char *path = NULL;
+	int err = 0;
+	unsigned long port_mask;
+
+	if (port_count < 0 || port_count > mask_bits) {
+		if (verbose)
+			fprintf(stderr, Name ": port_count %d out of range\n", port_count);
+		return 2;
+	}
+	/* build the mask only after validation and in unsigned long width;
+	 * shifting by the full width is undefined, so handle it explicitly
+	 */
+	if (port_count == mask_bits)
+		port_mask = ~0UL;
+	else
+		port_mask = (1UL << port_count) - 1;
+
+	/* scroll through /sys/dev/block looking for devices attached to
+	 * this hba
+	 */
+	dir = opendir("/sys/dev/block");
+	for (ent = dir ? readdir(dir) : NULL; ent; ent = readdir(dir)) {
+		int fd;
+		char model[64];
+		char vendor[64];
+		char buf[1024];
+		int major, minor;
+		char *device;
+		char *c = NULL;
+		int port = 0;
+		int have_port = 0;
+		int type;
+
+		if (sscanf(ent->d_name, "%d:%d", &major, &minor) != 2)
+			continue;
+		path = devt_to_devpath(makedev(major, minor));
+		if (!path)
+			continue;
+		if (!path_attached_to_hba(path, hba_path)) {
+			free(path);
+			path = NULL;
+			continue;
+		}
+
+		/* retrieve the scsi device type; the placeholder suffix makes
+		 * the allocation large enough for every attribute name below
+		 */
+		if (asprintf(&device, "/sys/dev/block/%d:%d/device/xxxxxxx", major, minor) < 0) {
+			if (verbose)
+				fprintf(stderr, Name ": failed to allocate 'device'\n");
+			err = 2;
+			break;
+		}
+		sprintf(device, "/sys/dev/block/%d:%d/device/type", major, minor);
+		if (load_sys(device, buf) != 0) {
+			if (verbose)
+				fprintf(stderr, Name ": failed to read device type for %s\n",
+					path);
+			err = 2;
+			free(device);
+			break;
+		}
+		type = strtoul(buf, NULL, 10);
+
+		/* if it's not a disk print the vendor and model */
+		if (!(type == 0 || type == 7 || type == 14)) {
+			vendor[0] = '\0';
+			model[0] = '\0';
+			sprintf(device, "/sys/dev/block/%d:%d/device/vendor", major, minor);
+			if (load_sys(device, buf) == 0) {
+				snprintf(vendor, sizeof(vendor), "%s", buf);
+				imsm_strip_trailing_space(vendor);
+			}
+			sprintf(device, "/sys/dev/block/%d:%d/device/model", major, minor);
+			if (load_sys(device, buf) == 0) {
+				snprintf(model, sizeof(model), "%s", buf);
+				imsm_strip_trailing_space(model);
+			}
+
+			if (vendor[0] && model[0])
+				snprintf(buf, sizeof(buf), "%.64s %.64s", vendor, model);
+			else
+				switch (type) { /* numbers from hald/linux/device.c */
+				case 1: sprintf(buf, "tape"); break;
+				case 2: sprintf(buf, "printer"); break;
+				case 3: sprintf(buf, "processor"); break;
+				case 4:
+				case 5: sprintf(buf, "cdrom"); break;
+				case 6: sprintf(buf, "scanner"); break;
+				case 8: sprintf(buf, "media_changer"); break;
+				case 9: sprintf(buf, "comm"); break;
+				case 12: sprintf(buf, "raid"); break;
+				default: sprintf(buf, "unknown");
+				}
+		} else
+			buf[0] = '\0';
+		free(device);
+
+		/* chop device path to 'host%d' and calculate the port number;
+		 * guard against a path that ends at the hba prefix or has no
+		 * further '/' instead of dereferencing past it
+		 */
+		if (strlen(path) >= (size_t) hba_len) {
+			c = strchr(&path[hba_len], '/');
+			if (c)
+				*c = '\0';
+			have_port = sscanf(&path[hba_len], "host%d", &port) == 1;
+		}
+		if (!have_port) {
+			if (verbose) {
+				if (c)
+					*c = '/'; /* repair the full string */
+				fprintf(stderr, Name ": failed to determine port number for %s\n",
+					path);
+			}
+			err = 2;
+			break;
+		}
+		port -= host_base;
+
+		/* mark this port as used; ports outside the mask cannot be
+		 * recorded and must not be used as a shift count
+		 */
+		if (port >= 0 && port < mask_bits)
+			port_mask &= ~(1UL << port);
+
+		/* print out the device information */
+		if (buf[0]) {
+			printf(" Port%d : - non-disk device (%s) -\n", port, buf);
+			free(path);
+			path = NULL;
+			continue;
+		}
+
+		fd = dev_open(ent->d_name, O_RDONLY);
+		if (fd < 0)
+			printf(" Port%d : - disk info unavailable -\n", port);
+		else {
+			fd2devname(fd, buf);
+			printf(" Port%d : %s", port, buf);
+			if (imsm_read_serial(fd, NULL, (__u8 *) buf) == 0)
+				printf(" (%s)\n", buf);
+			else
+				printf("()\n");
+			close(fd);
+		}
+		free(path);
+		path = NULL;
+	}
+	/* only set when the loop was left through an error 'break' */
+	if (path)
+		free(path);
+	if (dir)
+		closedir(dir);
+	if (err == 0) {
+		int i;
+
+		for (i = 0; i < port_count; i++)
+			if (port_mask & (1UL << i))
+				printf(" Port%d : - no device attached -\n", i);
+	}
+
+	return err;
+}
+
+/* Report the imsm platform capabilities (option-rom version, supported
+ * raid levels and chunk sizes, limits, controller path and per-port
+ * devices) on stdout.
+ *
+ * verbose:        when non-zero, progress and failures go to stderr
+ * enumerate_only: when non-zero, only test whether platform support is
+ *                 present and print nothing
+ *
+ * Returns 0 on success and 2 when the controller or option-rom cannot be
+ * found or the ports cannot be enumerated.
+ */
+static int detail_platform_imsm(int verbose, int enumerate_only)
+{
+	/* There are two components to imsm platform support, the ahci SATA
+	 * controller and the option-rom. To find the SATA controller we
+	 * simply look in /sys/bus/pci/drivers/ahci to see if an ahci
+	 * controller with the Intel vendor id is present. This approach
+	 * allows mdadm to leverage the kernel's ahci detection logic, with the
+	 * caveat that if ahci.ko is not loaded mdadm will not be able to
+	 * detect platform raid capabilities. The option-rom resides in a
+	 * platform "Adapter ROM". We scan for its signature to retrieve the
+	 * platform capabilities. If raid support is disabled in the BIOS the
+	 * option-rom capability structure will not be available.
+	 */
+	const struct imsm_orom *orom;
+	struct sys_dev *list, *hba;
+	DIR *dir;
+	struct dirent *ent;
+	const char *hba_path;
+	int host_base = 0;
+	int port_count = 0;
+	int rv = 2;
+
+	if (enumerate_only) {
+		if (check_env("IMSM_NO_PLATFORM") || find_imsm_orom())
+			return 0;
+		return 2;
+	}
+
+	list = find_driver_devices("pci", "ahci");
+	for (hba = list; hba; hba = hba->next)
+		if (devpath_to_vendor(hba->path) == 0x8086)
+			break;
+
+	if (!hba) {
+		if (verbose)
+			fprintf(stderr, Name ": unable to find active ahci controller\n");
+		free_sys_dev(&list);
+		return 2;
+	} else if (verbose)
+		fprintf(stderr, Name ": found Intel SATA AHCI Controller\n");
+	/* take the path over from the list entry so that it survives
+	 * free_sys_dev(); from here on this function has to release it
+	 */
+	hba_path = hba->path;
+	hba->path = NULL;
+	free_sys_dev(&list);
+
+	orom = find_imsm_orom();
+	if (!orom) {
+		if (verbose)
+			fprintf(stderr, Name ": imsm option-rom not found\n");
+		goto out;
+	}
+
+	printf(" Platform : Intel(R) Matrix Storage Manager\n");
+	printf(" Version : %d.%d.%d.%d\n", orom->major_ver, orom->minor_ver,
+	       orom->hotfix_ver, orom->build);
+	printf(" RAID Levels :%s%s%s%s%s\n",
+	       imsm_orom_has_raid0(orom) ? " raid0" : "",
+	       imsm_orom_has_raid1(orom) ? " raid1" : "",
+	       imsm_orom_has_raid1e(orom) ? " raid1e" : "",
+	       imsm_orom_has_raid10(orom) ? " raid10" : "",
+	       imsm_orom_has_raid5(orom) ? " raid5" : "");
+	printf(" Chunk Sizes :%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n",
+	       imsm_orom_has_chunk(orom, 2) ? " 2k" : "",
+	       imsm_orom_has_chunk(orom, 4) ? " 4k" : "",
+	       imsm_orom_has_chunk(orom, 8) ? " 8k" : "",
+	       imsm_orom_has_chunk(orom, 16) ? " 16k" : "",
+	       imsm_orom_has_chunk(orom, 32) ? " 32k" : "",
+	       imsm_orom_has_chunk(orom, 64) ? " 64k" : "",
+	       imsm_orom_has_chunk(orom, 128) ? " 128k" : "",
+	       imsm_orom_has_chunk(orom, 256) ? " 256k" : "",
+	       imsm_orom_has_chunk(orom, 512) ? " 512k" : "",
+	       imsm_orom_has_chunk(orom, 1024*1) ? " 1M" : "",
+	       imsm_orom_has_chunk(orom, 1024*2) ? " 2M" : "",
+	       imsm_orom_has_chunk(orom, 1024*4) ? " 4M" : "",
+	       imsm_orom_has_chunk(orom, 1024*8) ? " 8M" : "",
+	       imsm_orom_has_chunk(orom, 1024*16) ? " 16M" : "",
+	       imsm_orom_has_chunk(orom, 1024*32) ? " 32M" : "",
+	       imsm_orom_has_chunk(orom, 1024*64) ? " 64M" : "");
+	printf(" Max Disks : %d\n", orom->tds);
+	printf(" Max Volumes : %d\n", orom->vpa);
+	printf(" I/O Controller : %s\n", hba_path);
+
+	/* find the smallest scsi host number to determine a port number base */
+	dir = opendir(hba_path);
+	for (ent = dir ? readdir(dir) : NULL; ent; ent = readdir(dir)) {
+		int host;
+
+		if (sscanf(ent->d_name, "host%d", &host) != 1)
+			continue;
+		if (port_count == 0)
+			host_base = host;
+		else if (host < host_base)
+			host_base = host;
+
+		/* the port count spans from the lowest to the highest host */
+		if (host + 1 > port_count + host_base)
+			port_count = host + 1 - host_base;
+
+	}
+	if (dir)
+		closedir(dir);
+
+	if (!port_count || imsm_enumerate_ports(hba_path, port_count,
+						host_base, verbose) != 0) {
+		if (verbose)
+			fprintf(stderr, Name ": failed to enumerate ports\n");
+		goto out;
+	}
+
+	rv = 0;
+out:
+	/* the path was detached from the device list above and is owned here */
+	free((void *) hba_path);
+	return rv;
+}
+#endif
+
+static int match_home_imsm(struct supertype *st, char *homehost)
+{
+ /* the imsm metadata format does not specify any host
+ * identification information. We return -1 since we can never
+ * confirm nor deny whether a given array is "meant" for this
+ * host. We rely on compare_super and the 'family_num' field to
+ * exclude member disks that do not belong, and we rely on
+ * mdadm.conf to specify the arrays that should be assembled.
+ * Auto-assembly may still pick up "foreign" arrays.
+ */
+
+ return -1;
+}
+
+static void uuid_from_super_imsm(struct supertype *st, int uuid[4])
+{
+ /* The uuid returned here is used for:
+ * uuid to put into bitmap file (Create, Grow)
+ * uuid for backup header when saving critical section (Grow)
+ * comparing uuids when re-adding a device into an array
+ * In these cases the uuid required is that of the data-array,
+ * not the device-set.
+ * uuid to recognise same set when adding a missing device back
+ * to an array. This is a uuid for the device-set.
+ *
+ * For each of these we can make do with a truncated
+ * or hashed uuid rather than the original, as long as
+ * everyone agrees.
+ * In each case the uuid required is that of the data-array,
+ * not the device-set.
+ */
+ /* imsm does not track uuid's so we synthesise one using sha1 on
+ * - The signature (Which is constant for all imsm array, but no matter)
+ * - the family_num of the container
+ * - the index number of the volume
+ * - the 'serial' number of the volume.
+ * Hopefully these are all constant.
+ */
+ struct intel_super *super = st->sb;
+
+ char buf[20];
+ struct sha1_ctx ctx;
+ struct imsm_dev *dev = NULL;
+
+ sha1_init_ctx(&ctx);
+ sha1_process_bytes(super->anchor->sig, MPB_SIG_LEN, &ctx);
+ sha1_process_bytes(&super->anchor->family_num, sizeof(__u32), &ctx);
+ if (super->current_vol >= 0)
+ dev = get_imsm_dev(super, super->current_vol);
+ if (dev) {
+ __u32 vol = super->current_vol;
+ sha1_process_bytes(&vol, sizeof(vol), &ctx);
+ sha1_process_bytes(dev->volume, MAX_RAID_SERIAL_LEN, &ctx);
+ }
+ sha1_finish_ctx(&ctx, buf);
+ memcpy(uuid, buf, 4*4);
+}
+
+#if 0
+static void
+get_imsm_numerical_version(struct imsm_super *mpb, int *m, int *p)
+{
+ __u8 *v = get_imsm_version(mpb);
+ __u8 *end = mpb->sig + MAX_SIGNATURE_LENGTH;
+ char major[] = { 0, 0, 0 };
+ char minor[] = { 0 ,0, 0 };
+ char patch[] = { 0, 0, 0 };
+ char *ver_parse[] = { major, minor, patch };
+ int i, j;
+
+ i = j = 0;
+ while (*v != '\0' && v < end) {
+ if (*v != '.' && j < 2)
+ ver_parse[i][j++] = *v;
+ else {
+ i++;
+ j = 0;
+ }
+ v++;
+ }
+
+ *m = strtol(minor, NULL, 0);
+ *p = strtol(patch, NULL, 0);
+}
+#endif
+
+/* Map a raid level to the layout value reported for imsm volumes:
+ * 0 for levels 0 and 1, ALGORITHM_LEFT_ASYMMETRIC for levels 5 and 6,
+ * 0x102 for level 10 and UnSet for anything else.
+ */
+static int imsm_level_to_layout(int level)
+{
+	if (level == 0 || level == 1)
+		return 0;
+	if (level == 5 || level == 6)
+		return ALGORITHM_LEFT_ASYMMETRIC;
+	if (level == 10)
+		return 0x102;
+
+	return UnSet;
+}
+
+/* Fill 'info' with the description of the raid device selected by
+ * super->current_vol: geometry, size, resync state, name, text version
+ * and the synthesized uuid. info->disk.raid_disk is read as an input to
+ * pick the member disk whose major/minor are reported (0/0 if none
+ * matches).
+ */
+static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info)
+{
+ struct intel_super *super = st->sb;
+ /* NOTE(review): get_imsm_dev() can return NULL; 'dev' is used below
+ * without a check - presumably callers only select valid volumes
+ */
+ struct imsm_dev *dev = get_imsm_dev(super, super->current_vol);
+ struct imsm_map *map = get_imsm_map(dev, 0);
+ struct dl *dl;
+
+ /* NOTE(review): relies on the caller having set info->disk.raid_disk;
+ * some callers in this file pass a mdinfo that was not initialised
+ */
+ for (dl = super->disks; dl; dl = dl->next)
+ if (dl->raiddisk == info->disk.raid_disk)
+ break;
+ info->container_member = super->current_vol;
+ info->array.raid_disks = map->num_members;
+ info->array.level = get_imsm_raid_level(map);
+ info->array.layout = imsm_level_to_layout(info->array.level);
+ info->array.md_minor = -1;
+ info->array.ctime = 0;
+ info->array.utime = 0;
+ /* blocks_per_strip counts 512-byte sectors; shift converts to bytes */
+ info->array.chunk_size = __le16_to_cpu(map->blocks_per_strip) << 9;
+ info->array.state = !dev->vol.dirty;
+ /* the array size is split over two 32-bit little-endian fields */
+ info->custom_array_size = __le32_to_cpu(dev->size_high);
+ info->custom_array_size <<= 32;
+ info->custom_array_size |= __le32_to_cpu(dev->size_low);
+
+ info->disk.major = 0;
+ info->disk.minor = 0;
+ if (dl) {
+ info->disk.major = dl->major;
+ info->disk.minor = dl->minor;
+ }
+
+ info->data_offset = __le32_to_cpu(map->pba_of_lba0);
+ info->component_size = __le32_to_cpu(map->blocks_per_member);
+ memset(info->uuid, 0, sizeof(info->uuid));
+
+ /* resync_start is 0 for uninitialized, dirty or migrating volumes and
+ * ~0ULL otherwise
+ */
+ if (map->map_state == IMSM_T_STATE_UNINITIALIZED || dev->vol.dirty)
+ info->resync_start = 0;
+ else if (dev->vol.migr_state)
+ /* FIXME add curr_migr_unit to resync_start conversion */
+ info->resync_start = 0;
+ else
+ info->resync_start = ~0ULL;
+
+ /* the volume name may fill its field completely, so terminate it */
+ strncpy(info->name, (char *) dev->volume, MAX_RAID_SERIAL_LEN);
+ info->name[MAX_RAID_SERIAL_LEN] = 0;
+
+ info->array.major_version = -1;
+ info->array.minor_version = -2;
+ /* NOTE(review): if devnum2devname() returns allocated memory or NULL
+ * this leaks or crashes - confirm its contract
+ */
+ sprintf(info->text_version, "/%s/%d",
+ devnum2devname(st->container_dev),
+ info->container_member);
+ info->safe_mode_delay = 4000; /* 4 secs like the Matrix driver */
+ uuid_from_super_imsm(st, info->uuid);
+}
+
+/* Check the config file to see if we can return a real uuid for this spare.
+ *
+ * Only acts on a container-level 'inf' whose uuid is still uuid_match_any.
+ * The uuid of the first configured array that has a uuid set and whose
+ * metadata handler accepts inf->text_version is copied into inf->uuid;
+ * 'inf' is left untouched when no entry qualifies.
+ */
+static void fixup_container_spare_uuid(struct mdinfo *inf)
+{
+	struct mddev_ident_s *array_list;
+
+	if (inf->array.level != LEVEL_CONTAINER ||
+	    memcmp(inf->uuid, uuid_match_any, sizeof(int[4])) != 0)
+		return;
+
+	array_list = conf_get_ident(NULL);
+
+	for (; array_list; array_list = array_list->next) {
+		struct supertype *_sst; /* spare supertype */
+		struct supertype *_cst; /* container supertype */
+
+		if (!array_list->uuid_set)
+			continue;
+
+		/* an entry without a supertype cannot be matched against;
+		 * skip it rather than dereference a NULL pointer
+		 */
+		_cst = array_list->st;
+		if (!_cst)
+			continue;
+
+		_sst = _cst->ss->match_metadata_desc(inf->text_version);
+		if (_sst) {
+			memcpy(inf->uuid, array_list->uuid, sizeof(int[4]));
+			free(_sst);
+			break;
+		}
+	}
+}
+
+/* Fill 'info' from the imsm metadata loaded in st->sb.
+ *
+ * When a volume is selected (current_vol >= 0) the work is delegated to
+ * getinfo_super_imsm_volume().  Otherwise 'info' describes the container
+ * itself and the per-disk fields come from the first entry of
+ * super->disks, if there is one.
+ */
+static void getinfo_super_imsm(struct supertype *st, struct mdinfo *info)
+{
+	struct intel_super *super = st->sb;
+
+	if (super->current_vol >= 0) {
+		getinfo_super_imsm_volume(st, info);
+		return;
+	}
+
+	/* container identity */
+	info->array.level = LEVEL_CONTAINER;
+	info->array.major_version = -1;
+	info->array.minor_version = -2;
+	strcpy(info->text_version, "imsm");
+	info->name[0] = 0;
+
+	/* raid_disks is left at zero so that Assemble will always pull in
+	 * valid spares
+	 */
+	info->array.raid_disks = 0;
+	info->array.layout = 0;
+	info->array.md_minor = -1;
+	info->array.ctime = 0; /* N/A for imsm */
+	info->array.utime = 0;
+	info->array.chunk_size = 0;
+	info->reshape_active = 0;
+	info->safe_mode_delay = 0;
+
+	/* per-disk defaults, refined below when a disk is loaded */
+	info->disk.major = 0;
+	info->disk.minor = 0;
+	info->disk.number = -1;
+	info->disk.raid_disk = -1;
+	info->disk.state = 0;
+
+	if (super->disks) {
+		__u32 reserved = imsm_reserved_sectors(super, super->disks);
+		struct imsm_disk *first_disk = &super->disks->disk;
+		__u32 status = first_disk->status;
+		int state = 0;
+
+		info->data_offset = __le32_to_cpu(first_disk->total_blocks) - reserved;
+		info->component_size = reserved;
+
+		/* info->disk.raid_disk is deliberately not changed here: that
+		 * state is finalized in mdmon once the 'most fresh' version
+		 * of the metadata has been found
+		 */
+		if (status & CONFIGURED_DISK)
+			state |= 1 << MD_DISK_ACTIVE;
+		if (status & FAILED_DISK)
+			state |= 1 << MD_DISK_FAULTY;
+		if (!(status & SPARE_DISK))
+			state |= 1 << MD_DISK_SYNC;
+		info->disk.state = state;
+	}
+
+	/* uuid_from_super_imsm is only used when this disk is part of a
+	 * populated container; ->compare_super may have updated the
+	 * 'num_raid_devs' field for spares
+	 */
+	if (!(info->disk.state & (1 << MD_DISK_SYNC)) &&
+	    !super->anchor->num_raid_devs) {
+		memcpy(info->uuid, uuid_match_any, sizeof(int[4]));
+		fixup_container_spare_uuid(info);
+		return;
+	}
+	uuid_from_super_imsm(st, info->uuid);
+}
+
+/* Apply an 'update' request to the loaded imsm metadata.
+ *
+ * FIXME: this is a placeholder.  No update is implemented yet, so the
+ * metadata is never modified and the function always returns 0.
+ *
+ * For 'assemble' and 'force' the caller expects non-zero if any change
+ * was made.  For the other requests the return value is ignored.
+ *
+ * Update options are:
+ *  force-one : This device looks a bit old but needs to be included,
+ *              update age info appropriately.
+ *  assemble: clear any 'faulty' flag to allow this device to
+ *              be assembled.
+ *  force-array: Array is degraded but being forced, mark it clean
+ *              if that will be needed to assemble it.
+ *
+ *  newdev: not used ????
+ *  grow: Array has gained a new device - this is currently for
+ *              linear only
+ *  resync: mark as dirty so a resync will happen.
+ *  name: update the name - preserving the homehost
+ *
+ * The following are not relevant for imsm:
+ *  sparc2.2 : update from old dodgy metadata
+ *  super-minor: change the preferred_minor number
+ *  summaries: update redundant counters.
+ *  uuid: Change the uuid of the array to match what is given
+ *  homehost: update the recorded homehost
+ *  _reshape_progress: record new reshape_progress position.
+ */
+static int update_super_imsm(struct supertype *st, struct mdinfo *info,
+			     char *update, char *devname, int verbose,
+			     int uuid_set, char *homehost)
+{
+	int changed = 0;
+
+	if (strcmp(update, "grow") == 0) {
+		/* not implemented */
+	}
+	if (strcmp(update, "resync") == 0) {
+		/* not implemented; would set dev->vol.dirty = 1 */
+	}
+
+	/* IMSM has no concept of UUID or homehost */
+
+	return changed;
+}
+
+/* Size in bytes of an mpb sized for 'disks' disks and two raid devices,
+ * each of which may carry two maps.
+ */
+static size_t disks_to_mpb_size(int disks)
+{
+	/* struct imsm_super already accounts for one disk entry */
+	size_t total = sizeof(struct imsm_super) +
+		       (disks - 1) * sizeof(struct imsm_disk);
+
+	total += 2 * sizeof(struct imsm_dev);
+	/* up to 2 maps per raid device (-2 for the imsm_maps in imsm_dev) */
+	total += (4 - 2) * sizeof(struct imsm_map);
+	/* 4 possible disk_ord_tbl's */
+	total += 4 * (disks - 1) * sizeof(__u32);
+
+	return total;
+}
+
+/* Sectors of 'devsize' left once MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS
+ * have been set aside; 0 when the device is smaller than that.
+ */
+static __u64 avail_size_imsm(struct supertype *st, __u64 devsize)
+{
+	if (devsize >= (MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS))
+		return devsize - (MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS);
+
+	return 0;
+}
+
+/* Release every entry of super->devlist, including the imsm_dev each
+ * entry points to, and leave the list empty.
+ */
+static void free_devlist(struct intel_super *super)
+{
+	struct intel_dev *entry = super->devlist;
+
+	while (entry) {
+		struct intel_dev *following = entry->next;
+
+		free(entry->dev);
+		free(entry);
+		entry = following;
+	}
+	super->devlist = NULL;
+}
+
+/* Copy 'src' into 'dest', using the size reported by
+ * sizeof_imsm_dev(src, 0).
+ */
+static void imsm_copy_dev(struct imsm_dev *dest, struct imsm_dev *src)
+{
+	size_t nbytes = sizeof_imsm_dev(src, 0);
+
+	memcpy(dest, src, nbytes);
+}
+
+/* Compare the metadata loaded in 'tst' against the reference in 'st'.
+ *
+ * return:
+ *  0 same, or first was empty, and second was copied
+ *  1 second had wrong number
+ *  2 wrong uuid
+ *  3 wrong other info
+ *
+ * Only 0 and 3 are produced here.  When 'st' has no metadata yet it
+ * takes ownership of tst->sb.  When 'st' holds a spare (no raid devices)
+ * and 'tst' holds a populated mpb, the raid device records and family
+ * number of 'tst' are copied into 'st'.
+ */
+static int compare_super_imsm(struct supertype *st, struct supertype *tst)
+{
+	struct intel_super *first = st->sb;
+	struct intel_super *sec = tst->sb;
+
+	if (!first) {
+		st->sb = tst->sb;
+		tst->sb = NULL;
+		return 0;
+	}
+
+	if (memcmp(first->anchor->sig, sec->anchor->sig, MAX_SIGNATURE_LENGTH) != 0)
+		return 3;
+
+	/* if an anchor does not have num_raid_devs set then it is a free
+	 * floating spare
+	 */
+	if (first->anchor->num_raid_devs > 0 &&
+	    sec->anchor->num_raid_devs > 0) {
+		if (first->anchor->family_num != sec->anchor->family_num)
+			return 3;
+	}
+
+	/* if 'first' is a spare promote it to a populated mpb with sec's
+	 * family number
+	 */
+	if (first->anchor->num_raid_devs == 0 &&
+	    sec->anchor->num_raid_devs > 0) {
+		int i;
+		struct intel_dev *dv;
+		struct imsm_dev *dev;
+
+		/* we need to copy raid device info from sec; if an
+		 * allocation fails here we don't associate the spare
+		 */
+		for (i = 0; i < sec->anchor->num_raid_devs; i++) {
+			dv = malloc(sizeof(*dv));
+			if (!dv)
+				break;
+			dev = malloc(sizeof_imsm_dev(get_imsm_dev(sec, i), 1));
+			if (!dev) {
+				free(dv);
+				break;
+			}
+			dv->dev = dev;
+			dv->index = i;
+			dv->next = first->devlist;
+			first->devlist = dv;
+		}
+		/* the loop only stops short of num_raid_devs when an
+		 * allocation failed; on success i == num_raid_devs, so the
+		 * comparison must be strict
+		 */
+		if (i < sec->anchor->num_raid_devs) {
+			/* allocation failure */
+			free_devlist(first);
+			fprintf(stderr, "imsm: failed to associate spare\n");
+			return 3;
+		}
+		for (i = 0; i < sec->anchor->num_raid_devs; i++)
+			imsm_copy_dev(get_imsm_dev(first, i), get_imsm_dev(sec, i));
+
+		first->anchor->num_raid_devs = sec->anchor->num_raid_devs;
+		first->anchor->family_num = sec->anchor->family_num;
+	}
+
+	return 0;
+}
+
+/* Derive a "/dev/<name>" string for the block device open on 'fd'.
+ *
+ * The name is the last path component of the target of the
+ * /sys/dev/block/<major>:<minor> symlink.  'name' must have room for
+ * MAX_RAID_SERIAL_LEN bytes; the result is truncated to fit.  On any
+ * failure 'name' is left as the empty string.
+ */
+static void fd2devname(int fd, char *name)
+{
+	struct stat st;
+	char path[256];
+	char dname[100];
+	char *nm;
+	int rv;
+
+	name[0] = '\0';
+	if (fstat(fd, &st) != 0)
+		return;
+	snprintf(path, sizeof(path), "/sys/dev/block/%d:%d",
+		 major(st.st_rdev), minor(st.st_rdev));
+
+	/* readlink() does not NUL-terminate, so keep one byte spare for
+	 * the terminator written below
+	 */
+	rv = readlink(path, dname, sizeof(dname) - 1);
+	if (rv <= 0)
+		return;
+
+	dname[rv] = '\0';
+	nm = strrchr(dname, '/');
+	if (!nm)
+		return;
+	nm++;
+	snprintf(name, MAX_RAID_SERIAL_LEN, "/dev/%s", nm);
+}
+
+
+extern int scsi_get_serial(int fd, void *buf, size_t buf_len);
+
+/* Read the serial number of the device open on 'fd' into 'serial'.
+ *
+ * The serial comes from scsi_get_serial().  Bytes that are not greater
+ * than 0x20 are dropped, ':' is replaced by ';', and when the result is
+ * longer than MAX_RAID_SERIAL_LEN only the trailing MAX_RAID_SERIAL_LEN
+ * bytes are kept.  'serial' is zero padded and is not NUL-terminated
+ * when it is filled completely.
+ *
+ * If scsi_get_serial() fails and the IMSM_DEVNAME_AS_SERIAL environment
+ * check succeeds, the device name from fd2devname() is used instead.
+ *
+ * 'devname' is only used for error messages and may be NULL.
+ *
+ * Returns 0 on success, the scsi_get_serial() error code if that call
+ * failed, or 2 if the reported serial length is zero.
+ */
+static int imsm_read_serial(int fd, char *devname,
+			    __u8 serial[MAX_RAID_SERIAL_LEN])
+{
+	unsigned char scsi_serial[255];
+	int rv;
+	int rsp_len;
+	int len;
+	char *dest;
+	char *src;
+	char *rsp_buf;
+	int i;
+
+	memset(scsi_serial, 0, sizeof(scsi_serial));
+
+	rv = scsi_get_serial(fd, scsi_serial, sizeof(scsi_serial));
+
+	if (rv && check_env("IMSM_DEVNAME_AS_SERIAL")) {
+		memset(serial, 0, MAX_RAID_SERIAL_LEN);
+		fd2devname(fd, (char *) serial);
+		return 0;
+	}
+
+	if (rv != 0) {
+		if (devname)
+			fprintf(stderr,
+				Name ": Failed to retrieve serial for %s\n",
+				devname);
+		return rv;
+	}
+
+	rsp_len = scsi_serial[3];
+	if (!rsp_len) {
+		if (devname)
+			fprintf(stderr,
+				Name ": Failed to retrieve serial for %s\n",
+				devname);
+		return 2;
+	}
+	/* the payload starts after a 4 byte header, so never trust a
+	 * reported length that would run past the end of the buffer
+	 */
+	if (rsp_len > (int) sizeof(scsi_serial) - 4)
+		rsp_len = sizeof(scsi_serial) - 4;
+	rsp_buf = (char *) &scsi_serial[4];
+
+	/* trim all whitespace and non-printable characters and convert
+	 * ':' to ';'
+	 */
+	for (i = 0, dest = rsp_buf; i < rsp_len; i++) {
+		src = &rsp_buf[i];
+		if (*src > 0x20) {
+			/* ':' is reserved for use in placeholder serial
+			 * numbers for missing disks
+			 */
+			if (*src == ':')
+				*dest++ = ';';
+			else
+				*dest++ = *src;
+		}
+	}
+	len = dest - rsp_buf;
+	dest = rsp_buf;
+
+	/* truncate leading characters */
+	if (len > MAX_RAID_SERIAL_LEN) {
+		dest += len - MAX_RAID_SERIAL_LEN;
+		len = MAX_RAID_SERIAL_LEN;
+	}
+
+	memset(serial, 0, MAX_RAID_SERIAL_LEN);
+	memcpy(serial, dest, len);
+
+	return 0;
+}
+
+/* Compare two serial numbers over at most MAX_RAID_SERIAL_LEN bytes;
+ * the result has strncmp() semantics.
+ */
+static int serialcmp(__u8 *s1, __u8 *s2)
+{
+	char *lhs = (char *) s1;
+	char *rhs = (char *) s2;
+
+	return strncmp(lhs, rhs, MAX_RAID_SERIAL_LEN);
+}
+
+/* Copy a serial number into a MAX_RAID_SERIAL_LEN byte field.  This has
+ * strncpy() semantics: the field is zero padded and is left without a
+ * terminating NUL when the source fills it completely.
+ */
+static void serialcpy(__u8 *dest, __u8 *src)
+{
+	char *to = (char *) dest;
+	char *from = (char *) src;
+
+	strncpy(to, from, MAX_RAID_SERIAL_LEN);
+}
+
+/* Find the entry of super->disks whose serial matches 'serial'.
+ * Returns NULL when there is none.
+ */
+static struct dl *serial_to_dl(__u8 *serial, struct intel_super *super)
+{
+	struct dl *cur = super->disks;
+
+	while (cur) {
+		if (serialcmp(cur->serial, serial) == 0)
+			return cur;
+		cur = cur->next;
+	}
+
+	return NULL;
+}
+
+/* load_imsm_disk - record the disk open on 'fd' in super->disks
+ *
+ * The disk is identified by its serial number.  If no entry with that
+ * serial exists a new one is allocated and pushed onto super->disks;
+ * otherwise the existing entry is reused.  In both cases the entry's
+ * index and imsm_disk record are refreshed from the current anchor.
+ *
+ * 'devname' is used for error messages and is duplicated into a new
+ * entry; it may be NULL.  When 'keep_fd' is set the entry keeps 'fd'
+ * (an fd already held by a reused entry is closed first), otherwise a
+ * new entry gets fd -1.
+ *
+ * Returns 0 on success and 2 on failure.
+ */
+static int
+load_imsm_disk(int fd, struct intel_super *super, char *devname, int keep_fd)
+{
+	struct dl *dl;
+	struct stat stb;
+	int rv;
+	int i;
+	int alloc = 1;
+	__u8 serial[MAX_RAID_SERIAL_LEN];
+
+	rv = imsm_read_serial(fd, devname, serial);
+
+	if (rv != 0)
+		return 2;
+
+	/* check if this is a disk we have seen before.  it may be a spare in
+	 * super->disks while the current anchor believes it is a raid member,
+	 * check if we need to update dl->index
+	 */
+	dl = serial_to_dl(serial, super);
+	if (!dl)
+		dl = malloc(sizeof(*dl));
+	else
+		alloc = 0;
+
+	if (!dl) {
+		if (devname)
+			fprintf(stderr,
+				Name ": failed to allocate disk buffer for %s\n",
+				devname);
+		return 2;
+	}
+
+	if (alloc) {
+		if (fstat(fd, &stb) != 0) {
+			if (devname)
+				fprintf(stderr,
+					Name ": Cannot stat %s: %s\n",
+					devname, strerror(errno));
+			free(dl);
+			return 2;
+		}
+		/* start from a zeroed entry so that fields which are not
+		 * assigned below (dl->disk when the serial is absent from
+		 * the anchor, dl->raiddisk) are not left indeterminate
+		 */
+		memset(dl, 0, sizeof(*dl));
+		dl->major = major(stb.st_rdev);
+		dl->minor = minor(stb.st_rdev);
+		dl->next = super->disks;
+		dl->fd = keep_fd ? fd : -1;
+		dl->devname = devname ? strdup(devname) : NULL;
+		serialcpy(dl->serial, serial);
+		dl->index = -2;
+		dl->e = NULL;
+	} else if (keep_fd) {
+		if (dl->fd >= 0)
+			close(dl->fd);
+		dl->fd = fd;
+	}
+
+	/* look up this disk's index in the current anchor */
+	for (i = 0; i < super->anchor->num_disks; i++) {
+		struct imsm_disk *disk_iter;
+
+		disk_iter = __get_imsm_disk(super->anchor, i);
+
+		if (serialcmp(disk_iter->serial, dl->serial) == 0) {
+			dl->disk = *disk_iter;
+			/* only set index on disks that are a member of a
+			 * populated container, i.e. one with raid_devs
+			 */
+			if (dl->disk.status & FAILED_DISK)
+				dl->index = -2;
+			else if (dl->disk.status & SPARE_DISK)
+				dl->index = -1;
+			else
+				dl->index = i;
+
+			break;
+		}
+	}
+
+	/* no match, maybe a stale failed drive */
+	if (i == super->anchor->num_disks && dl->index >= 0) {
+		dl->disk = *__get_imsm_disk(super->anchor, dl->index);
+		if (dl->disk.status & FAILED_DISK)
+			dl->index = -2;
+	}
+
+	if (alloc)
+		super->disks = dl;
+
+	return 0;
+}
+
+#ifndef MDASSEMBLE
+/* When migrating, map0 contains the 'destination' state while map1
+ * contains the current state.  When not migrating, map0 contains the
+ * current state.  This routine assumes that map[0].map_state is set to
+ * the current array state before being called.
+ *
+ * Migration is indicated by one of the following states
+ * 1/ Idle (migr_state=0 map0state=normal||uninitialized||degraded||failed)
+ * 2/ Initialize (migr_state=1 migr_type=MIGR_INIT map0state=normal
+ *    map1state=uninitialized)
+ * 3/ Repair (Resync) (migr_state=1 migr_type=MIGR_REPAIR map0state=normal
+ *    map1state=normal)
+ * 4/ Rebuild (migr_state=1 migr_type=MIGR_REBUILD map0state=normal
+ *    map1state=degraded)
+ */
+static void migrate(struct imsm_dev *dev, __u8 to_state, int migr_type)
+{
+	struct imsm_map *target = get_imsm_map(dev, 0);
+	struct imsm_map *current;
+	int slot;
+
+	/* map1 is looked up only after the device is marked as migrating */
+	dev->vol.migr_state = 1;
+	set_migr_type(dev, migr_type);
+	dev->vol.curr_migr_unit = 0;
+	current = get_imsm_map(dev, 1);
+
+	/* duplicate map0 into map1, then turn map0 into the end state */
+	memcpy(current, target, sizeof_imsm_map(target));
+
+	if (migr_type == MIGR_REBUILD)
+		for (slot = 0; slot < target->num_members; slot++) {
+			__u32 ord = __le32_to_cpu(target->disk_ord_tbl[slot]);
+
+			set_imsm_ord_tbl_ent(target, slot, ord_to_idx(ord));
+		}
+
+	target->map_state = to_state;
+}
+
+/* Leave the migrating state: map0 becomes the only map and is given
+ * 'map_state'.
+ */
+static void end_migration(struct imsm_dev *dev, __u8 map_state)
+{
+	struct imsm_map *map0 = get_imsm_map(dev, 0);
+	struct imsm_map *old = get_imsm_map(dev, dev->vol.migr_state);
+	int slot = 0;
+
+	/* merge any IMSM_ORD_REBUILD bits that were not successfully
+	 * completed in the last migration.
+	 *
+	 * FIXME add support for online capacity expansion and
+	 * raid-level-migration
+	 */
+	while (slot < old->num_members) {
+		map0->disk_ord_tbl[slot] = map0->disk_ord_tbl[slot] |
+					   old->disk_ord_tbl[slot];
+		slot++;
+	}
+
+	map0->map_state = map_state;
+	dev->vol.curr_migr_unit = 0;
+	dev->vol.migr_state = 0;
+}
+#endif
+
+/* Copy every raid device described by the anchor into super->devlist.
+ *
+ * Each copy is allocated at its migrating size, sizeof_imsm_dev(dev, 1).
+ * Afterwards super->buf is grown, when needed, so that it can hold the
+ * mpb with all raid devices migrating.
+ *
+ * Returns 0 on success and 1 when an allocation fails; entries that
+ * were already linked into super->devlist are left in place.
+ */
+static int parse_raid_devices(struct intel_super *super)
+{
+	struct imsm_super *mpb = super->anchor;
+	size_t space_needed = 0;
+	size_t wanted;
+	void *grown;
+	int idx;
+
+	for (idx = 0; idx < super->anchor->num_raid_devs; idx++) {
+		struct imsm_dev *on_disk = __get_imsm_dev(super->anchor, idx);
+		size_t plain_len = sizeof_imsm_dev(on_disk, 0);
+		size_t migr_len = sizeof_imsm_dev(on_disk, 1);
+		struct intel_dev *entry;
+		struct imsm_dev *copy;
+
+		if (migr_len > plain_len)
+			space_needed += migr_len - plain_len;
+
+		entry = malloc(sizeof(*entry));
+		if (!entry)
+			return 1;
+		copy = malloc(migr_len);
+		if (!copy) {
+			free(entry);
+			return 1;
+		}
+		imsm_copy_dev(copy, on_disk);
+
+		entry->dev = copy;
+		entry->index = idx;
+		entry->next = super->devlist;
+		super->devlist = entry;
+	}
+
+	/* ensure that super->buf is large enough when all raid devices
+	 * are migrating
+	 */
+	wanted = __le32_to_cpu(mpb->mpb_size) + space_needed;
+	if (wanted <= super->len)
+		return 0;
+
+	wanted = ROUND_UP(wanted, 512);
+	if (posix_memalign(&grown, 512, wanted) != 0)
+		return 1;
+
+	memcpy(grown, super->buf, super->len);
+	memset((char *) grown + super->len, 0, wanted - super->len);
+	free(super->buf);
+	super->buf = grown;
+	super->len = wanted;
+
+	return 0;
+}
+
+/* Retrieve a pointer to the bbm log, which starts after all raid devices.
+ *
+ * The log occupies the last bbm_log_size bytes of the mpb.  Both
+ * mpb_size and bbm_log_size are stored little-endian and are converted
+ * before use.
+ *
+ * Returns NULL when bbm_log_size is zero.
+ */
+struct bbm_log *__get_imsm_bbm_log(struct imsm_super *mpb)
+{
+	void *ptr = NULL;
+	__u32 log_size = __le32_to_cpu(mpb->bbm_log_size);
+
+	if (log_size) {
+		ptr = mpb;
+		ptr += __le32_to_cpu(mpb->mpb_size) - log_size;
+	}
+
+	return ptr;
+}
+
+static void __free_imsm(struct intel_super *super, int free_disks);
+
+/* load_imsm_mpb - read matrix metadata
+ *
+ * Reads the anchor sector (second to last 512 byte sector of the
+ * device) and, when the mpb spans more than one sector, the extended
+ * mpb that precedes it.  Anything previously hanging off 'super', except
+ * its disk list, is released first.  The new super->buf is released by
+ * __free_imsm().
+ *
+ * 'devname' is only used for error messages and may be NULL.
+ *
+ * Returns 0 on success, otherwise:
+ *  1 seek failure, anchor allocation failure or anchor read failure
+ *  2 no IMSM signature, invalid mpb size, mpb allocation failure,
+ *    checksum mismatch on a single sector mpb, or short read of the
+ *    extended mpb
+ *  3 checksum mismatch on a multi-sector mpb
+ * or the result of load_imsm_disk() / parse_raid_devices().
+ */
+static int load_imsm_mpb(int fd, struct intel_super *super, char *devname)
+{
+	unsigned long long dsize;
+	unsigned long long sectors;
+	struct imsm_super *anchor;
+	__u32 mpb_size;
+	__u32 check_sum;
+	int rc;
+
+	get_dev_size(fd, NULL, &dsize);
+
+	if (lseek64(fd, dsize - (512 * 2), SEEK_SET) < 0) {
+		if (devname)
+			fprintf(stderr,
+				Name ": Cannot seek to anchor block on %s: %s\n",
+				devname, strerror(errno));
+		return 1;
+	}
+
+	if (posix_memalign((void**)&anchor, 512, 512) != 0) {
+		if (devname)
+			fprintf(stderr,
+				Name ": Failed to allocate imsm anchor buffer"
+				" on %s\n", devname);
+		return 1;
+	}
+	if (read(fd, anchor, 512) != 512) {
+		if (devname)
+			fprintf(stderr,
+				Name ": Cannot read anchor block on %s: %s\n",
+				devname, strerror(errno));
+		free(anchor);
+		return 1;
+	}
+
+	if (strncmp((char *) anchor->sig, MPB_SIGNATURE, MPB_SIG_LEN) != 0) {
+		if (devname)
+			fprintf(stderr,
+				Name ": no IMSM anchor on %s\n", devname);
+		free(anchor);
+		return 2;
+	}
+
+	/* mpb_size is stored little-endian.  A size of zero would make
+	 * the buffer below too small for the anchor sector copied into it.
+	 */
+	mpb_size = __le32_to_cpu(anchor->mpb_size);
+	if (mpb_size == 0) {
+		if (devname)
+			fprintf(stderr,
+				Name ": invalid IMSM mpb size on %s\n", devname);
+		free(anchor);
+		return 2;
+	}
+
+	__free_imsm(super, 0);
+	super->len = ROUND_UP(mpb_size, 512);
+	if (posix_memalign(&super->buf, 512, super->len) != 0) {
+		if (devname)
+			fprintf(stderr,
+				Name ": unable to allocate %zu byte mpb buffer\n",
+				super->len);
+		free(anchor);
+		return 2;
+	}
+	memcpy(super->buf, anchor, 512);
+
+	sectors = mpb_sectors(anchor) - 1;
+	free(anchor);
+	if (!sectors) {
+		/* the whole mpb fits in the anchor sector */
+		check_sum = __gen_imsm_checksum(super->anchor);
+		if (check_sum != __le32_to_cpu(super->anchor->check_sum)) {
+			if (devname)
+				fprintf(stderr,
+					Name ": IMSM checksum %x != %x on %s\n",
+					check_sum,
+					__le32_to_cpu(super->anchor->check_sum),
+					devname);
+			return 2;
+		}
+
+		rc = load_imsm_disk(fd, super, devname, 0);
+		if (rc == 0)
+			rc = parse_raid_devices(super);
+		return rc;
+	}
+
+	/* read the extended mpb */
+	if (lseek64(fd, dsize - (512 * (2 + sectors)), SEEK_SET) < 0) {
+		if (devname)
+			fprintf(stderr,
+				Name ": Cannot seek to extended mpb on %s: %s\n",
+				devname, strerror(errno));
+		return 1;
+	}
+
+	if (read(fd, super->buf + 512, super->len - 512) != super->len - 512) {
+		if (devname)
+			fprintf(stderr,
+				Name ": Cannot read extended mpb on %s: %s\n",
+				devname, strerror(errno));
+		return 2;
+	}
+
+	check_sum = __gen_imsm_checksum(super->anchor);
+	if (check_sum != __le32_to_cpu(super->anchor->check_sum)) {
+		if (devname)
+			fprintf(stderr,
+				Name ": IMSM checksum %x != %x on %s\n",
+				check_sum, __le32_to_cpu(super->anchor->check_sum),
+				devname);
+		return 3;
+	}
+
+	/* FIXME the BBM log is disk specific so we cannot use this global
+	 * buffer for all disks.  Ok for now since we only look at the global
+	 * bbm_log_size parameter to gate assembly
+	 */
+	super->bbm_log = __get_imsm_bbm_log(super->anchor);
+
+	rc = load_imsm_disk(fd, super, devname, 0);
+	if (rc == 0)
+		rc = parse_raid_devices(super);
+
+	return rc;
+}
+
+/* Release one disk list entry: close its fd when one is held, then free
+ * its device name, its 'e' allocation and the entry itself.
+ */
+static void __free_imsm_disk(struct dl *d)
+{
+	if (d->fd >= 0)
+		close(d->fd);
+	/* free(NULL) is a no-op, so no guards are needed */
+	free(d->devname);
+	free(d->e);
+	free(d);
+}
+/* Release every entry of super->disks and super->missing, leaving both
+ * lists empty.
+ */
+static void free_imsm_disks(struct intel_super *super)
+{
+	struct dl *entry;
+
+	for (entry = super->disks; entry; entry = super->disks) {
+		super->disks = entry->next;
+		__free_imsm_disk(entry);
+	}
+	for (entry = super->missing; entry; entry = super->missing) {
+		super->missing = entry->next;
+		__free_imsm_disk(entry);
+	}
+}
+
+/* Free all the pieces hanging off of a super pointer, but not the
+ * structure itself.  The disk lists are only released when 'free_disks'
+ * is non-zero.
+ */
+static void __free_imsm(struct intel_super *super, int free_disks)
+{
+	free(super->buf);
+	super->buf = NULL;
+
+	if (free_disks)
+		free_imsm_disks(super);
+	free_devlist(super);
+
+	free((void *) super->hba);
+	super->hba = NULL;
+}
+
+/* Release 'super' together with everything hanging off it, including
+ * its disk lists.
+ */
+static void free_imsm(struct intel_super *super)
+{
+	const int with_disks = 1;
+
+	__free_imsm(super, with_disks);
+	free(super);
+}
+
+/* Release the imsm metadata attached to 'st', if any, and clear st->sb. */
+static void free_super_imsm(struct supertype *st)
+{
+	struct intel_super *super = st->sb;
+
+	if (super) {
+		free_imsm(super);
+		st->sb = NULL;
+	}
+}
+
+/* Allocate a zeroed intel_super with no volume selected.
+ *
+ * Unless the IMSM_NO_PLATFORM environment check succeeds, the platform
+ * option-rom is looked up with find_imsm_orom().  When one is found and
+ * the IMSM_TEST_OROM check fails, super->hba takes over the path of the
+ * first ahci pci device whose vendor id is 0x8086.
+ *
+ * Returns NULL when the allocation fails.
+ */
+static struct intel_super *alloc_super(int creating_imsm)
+{
+	struct intel_super *super = malloc(sizeof(*super));
+	struct sys_dev *ahci_devs, *cand;
+
+	if (!super)
+		return NULL;
+
+	memset(super, 0, sizeof(*super));
+	super->creating_imsm = creating_imsm;
+	super->current_vol = -1;
+	super->create_offset = ~((__u32 ) 0);
+
+	if (!check_env("IMSM_NO_PLATFORM"))
+		super->orom = find_imsm_orom();
+	if (!super->orom || check_env("IMSM_TEST_OROM"))
+		return super;
+
+	/* find the first intel ahci controller */
+	ahci_devs = find_driver_devices("pci", "ahci");
+	cand = ahci_devs;
+	while (cand && devpath_to_vendor(cand->path) != 0x8086)
+		cand = cand->next;
+	if (cand) {
+		/* take over the path so free_sys_dev() does not see it */
+		super->hba = cand->path;
+		cand->path = NULL;
+	}
+	free_sys_dev(&ahci_devs);
+
+	return super;
+}
+
+#ifndef MDASSEMBLE
+/* find_missing - helper routine for load_super_imsm_all that identifies
+ * disks that have disappeared from the system.  This routine relies on
+ * the mpb being up to date, which it is at load time.
+ *
+ * Every disk recorded in the anchor whose serial is not found in
+ * super->disks gets a placeholder entry on super->missing.
+ *
+ * Returns 0 on success and 1 when an allocation fails.
+ */
+static int find_missing(struct intel_super *super)
+{
+	struct imsm_super *mpb = super->anchor;
+	int slot;
+
+	for (slot = 0; slot < mpb->num_disks; slot++) {
+		struct imsm_disk *record = __get_imsm_disk(mpb, slot);
+		struct dl *gone;
+
+		if (serial_to_dl(record->serial, super))
+			continue;
+
+		gone = malloc(sizeof(*gone));
+		if (!gone)
+			return 1;
+
+		/* no device node is associated with a missing disk */
+		gone->major = 0;
+		gone->minor = 0;
+		gone->fd = -1;
+		gone->devname = strdup("missing");
+		gone->e = NULL;
+
+		gone->index = slot;
+		serialcpy(gone->serial, record->serial);
+		gone->disk = *record;
+
+		gone->next = super->missing;
+		super->missing = gone;
+	}
+
+	return 0;
+}
+
+/* Load the imsm metadata of the whole container open on 'fd'.
+ *
+ * Every member device listed in sysfs is read once to find the anchor
+ * with the highest generation number (spares count as generation 0).
+ * That anchor is then reloaded and the disk list is re-parsed against
+ * it.  Disks named in the anchor but not present are recorded on
+ * super->missing.
+ *
+ * When 'keep_fd' is set, member devices are opened read-write and the
+ * fds are retained in the disk list; otherwise they are opened
+ * read-only and closed again.  'devname' is not used.
+ *
+ * On success the new intel_super is stored in '*sbp' and 0 is returned.
+ * A non-zero value is returned on failure.
+ *
+ * NOTE(review): 'sra' is never released in this function; the matching
+ * release helper is not visible from here - confirm and free it.
+ */
+static int load_super_imsm_all(struct supertype *st, int fd, void **sbp,
+			       char *devname, int keep_fd)
+{
+	struct mdinfo *sra;
+	struct intel_super *super;
+	struct mdinfo *sd, *best = NULL;
+	__u32 bestgen = 0;
+	__u32 gen;
+	char nm[20];
+	int dfd;
+	int rv;
+	int devnum = fd2devnum(fd);
+	int retry;
+	enum sysfs_read_flags flags;
+
+	flags = GET_LEVEL|GET_VERSION|GET_DEVS|GET_STATE;
+	if (mdmon_running(devnum))
+		flags |= SKIP_GONE_DEVS;
+
+	/* check if 'fd' an opened container */
+	sra = sysfs_read(fd, 0, flags);
+	if (!sra)
+		return 1;
+
+	if (sra->array.major_version != -1 ||
+	    sra->array.minor_version != -2 ||
+	    strcmp(sra->text_version, "imsm") != 0)
+		return 1;
+
+	super = alloc_super(0);
+	if (!super)
+		return 1;
+
+	/* find the most up to date disk in this array, skipping spares */
+	for (sd = sra->devs; sd; sd = sd->next) {
+		sprintf(nm, "%d:%d", sd->disk.major, sd->disk.minor);
+		dfd = dev_open(nm, keep_fd ? O_RDWR : O_RDONLY);
+		if (dfd < 0) {
+			free_imsm(super);
+			return 2;
+		}
+		rv = load_imsm_mpb(dfd, super, NULL);
+
+		/* retry the load if we might have raced against mdmon */
+		if (rv == 3 && mdmon_running(devnum))
+			for (retry = 0; retry < 3; retry++) {
+				usleep(3000);
+				rv = load_imsm_mpb(dfd, super, NULL);
+				if (rv != 3)
+					break;
+			}
+		/* load_imsm_mpb() never retains the fd (it loads the disk
+		 * with keep_fd == 0), so it must be closed here even when
+		 * the caller asked for fds to be kept; the fd to keep is
+		 * opened in the re-parse loop below
+		 */
+		close(dfd);
+		if (rv == 0) {
+			if (super->anchor->num_raid_devs == 0)
+				gen = 0;
+			else
+				gen = __le32_to_cpu(super->anchor->generation_num);
+			if (!best || gen > bestgen) {
+				bestgen = gen;
+				best = sd;
+			}
+		} else {
+			free_imsm(super);
+			return rv;
+		}
+	}
+
+	if (!best) {
+		free_imsm(super);
+		return 1;
+	}
+
+	/* load the most up to date anchor */
+	sprintf(nm, "%d:%d", best->disk.major, best->disk.minor);
+	dfd = dev_open(nm, O_RDONLY);
+	if (dfd < 0) {
+		free_imsm(super);
+		return 1;
+	}
+	rv = load_imsm_mpb(dfd, super, NULL);
+	close(dfd);
+	if (rv != 0) {
+		free_imsm(super);
+		return 2;
+	}
+
+	/* re-parse the disk list with the current anchor */
+	for (sd = sra->devs ; sd ; sd = sd->next) {
+		sprintf(nm, "%d:%d", sd->disk.major, sd->disk.minor);
+		dfd = dev_open(nm, keep_fd? O_RDWR : O_RDONLY);
+		if (dfd < 0) {
+			free_imsm(super);
+			return 2;
+		}
+		rv = load_imsm_disk(dfd, super, NULL, keep_fd);
+		/* a failed load_imsm_disk() has not stored the fd */
+		if (!keep_fd || rv != 0)
+			close(dfd);
+	}
+
+
+	if (find_missing(super) != 0) {
+		free_imsm(super);
+		return 2;
+	}
+
+	if (st->subarray[0]) {
+		/* valid volume indexes are 0 .. num_raid_devs - 1 */
+		if (atoi(st->subarray) < super->anchor->num_raid_devs)
+			super->current_vol = atoi(st->subarray);
+		else {
+			free_imsm(super);
+			return 1;
+		}
+	}
+
+	*sbp = super;
+	st->container_dev = devnum;
+	if (st->ss == NULL) {
+		st->ss = &super_imsm;
+		st->minor_version = 0;
+		st->max_devs = IMSM_MAX_DEVICES;
+	}
+	st->loaded_container = 1;
+
+	return 0;
+}
+#endif
+
+/* Load imsm metadata for 'fd' into st->sb.
+ *
+ * Unless built with MDASSEMBLE, 'fd' is first tried as an opened
+ * container via load_super_imsm_all().  If that does not succeed, 'fd'
+ * is read as a single member device; this path is refused when a
+ * subarray is selected.
+ *
+ * 'devname' is only used for error messages and may be NULL.
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+static int load_super_imsm(struct supertype *st, int fd, char *devname)
+{
+	struct intel_super *super;
+	int rv;
+
+#ifndef MDASSEMBLE
+	if (load_super_imsm_all(st, fd, &st->sb, devname, 1) == 0)
+		return 0;
+#endif
+	if (st->subarray[0])
+		return 1; /* FIXME */
+
+	super = alloc_super(0);
+	if (super == NULL) {
+		fprintf(stderr,
+			Name ": malloc of %zu failed.\n",
+			sizeof(*super));
+		return 1;
+	}
+
+	rv = load_imsm_mpb(fd, super, devname);
+	if (rv != 0) {
+		if (devname)
+			fprintf(stderr,
+				Name ": Failed to load all information "
+				"sections on %s\n", devname);
+		free_imsm(super);
+		return rv;
+	}
+
+	/* only fill in the handler details when none was chosen yet */
+	if (st->ss == NULL) {
+		st->ss = &super_imsm;
+		st->minor_version = 0;
+		st->max_devs = IMSM_MAX_DEVICES;
+	}
+	st->sb = super;
+	st->loaded_container = 0;
+
+	return 0;
+}
+
+/* Strip size in 512 byte blocks: fixed at 128 for level 1, otherwise
+ * chunk_size divided by 512.
+ */
+static __u16 info_to_blocks_per_strip(mdu_array_info_t *info)
+{
+	return info->level == 1 ? 128 : info->chunk_size >> 9;
+}
+
+/* Number of data stripes: (info->size * 2) divided by the strip size
+ * in blocks, then divided by 'num_domains'.
+ */
+static __u32 info_to_num_data_stripes(mdu_array_info_t *info, int num_domains)
+{
+	__u32 stripes = (info->size * 2) / info_to_blocks_per_strip(info);
+
+	return stripes / num_domains;
+}
+
+/* Blocks used on each member: info->size * 2, and for every level other
+ * than 1 the low bits selected by (strip size - 1) are cleared.
+ */
+static __u32 info_to_blocks_per_member(mdu_array_info_t *info)
+{
+	int blocks = info->size * 2;
+
+	if (info->level != 1)
+		blocks &= ~(info_to_blocks_per_strip(info) - 1);
+
+	return blocks;
+}
+
+/* Pick the version string used when the mpb holds at most one raid
+ * device and carries no attribute besides MPB_ATTRIB_CHECKSUM_VERIFY.
+ */
+static char *imsm_legacy_version(struct imsm_dev *dev, struct imsm_map *map)
+{
+	if (map->num_members >= 5)
+		return MPB_VERSION_5OR6_DISK_ARRAY;
+	if (dev->status == DEV_CLONE_N_GO)
+		return MPB_VERSION_CNG;
+	if (get_imsm_raid_level(map) == 5)
+		return MPB_VERSION_RAID5;
+	if (map->num_members >= 3)
+		return MPB_VERSION_3OR4_DISK_ARRAY;
+	if (get_imsm_raid_level(map) == 1)
+		return MPB_VERSION_RAID1;
+	return MPB_VERSION_RAID0;
+}
+
+/* Update the attributes and the version string that follows
+ * MPB_SIGNATURE in mpb->sig.  Each raid device is visited in turn and
+ * the version string is rewritten on every iteration.
+ */
+static void imsm_update_version_info(struct intel_super *super)
+{
+	struct imsm_super *mpb = super->anchor;
+	int vol;
+
+	for (vol = 0; vol < mpb->num_raid_devs; vol++) {
+		struct imsm_dev *dev = get_imsm_dev(super, vol);
+		struct imsm_map *map = get_imsm_map(dev, 0);
+		char *version;
+
+		if (__le32_to_cpu(dev->size_high) > 0)
+			mpb->attributes |= MPB_ATTRIB_2TB;
+
+		/* FIXME detect when an array spans a port multiplier
+		 * (MPB_ATTRIB_PM is not set yet)
+		 */
+
+		if (mpb->num_raid_devs <= 1 &&
+		    mpb->attributes == MPB_ATTRIB_CHECKSUM_VERIFY) {
+			version = imsm_legacy_version(dev, map);
+		} else {
+			version = MPB_VERSION_ATTRIBS;
+			switch (get_imsm_raid_level(map)) {
+			case 0:
+				mpb->attributes |= MPB_ATTRIB_RAID0;
+				break;
+			case 1:
+				mpb->attributes |= MPB_ATTRIB_RAID1;
+				break;
+			case 10:
+				mpb->attributes |= MPB_ATTRIB_RAID10;
+				break;
+			case 5:
+				mpb->attributes |= MPB_ATTRIB_RAID5;
+				break;
+			}
+		}
+		strcpy(((char *) mpb->sig) + strlen(MPB_SIGNATURE), version);
+	}
+}
+
+/* Create a new volume inside a pre-existing container, so st->sb is
+ * already set.
+ *
+ * The mpb is grown when it is too small for info->nr_disks disks, a new
+ * raid device record named 'name' is built from 'info' and linked into
+ * super->devlist, and the new volume becomes super->current_vol.  The
+ * 'size', 'homehost' and 'uuid' arguments are not used.
+ *
+ * Returns 1 on success and 0 on failure.  Unsupported geometry is
+ * rejected before anything is allocated or modified.
+ */
+static int init_super_imsm_volume(struct supertype *st, mdu_array_info_t *info,
+				  unsigned long long size, char *name,
+				  char *homehost, int *uuid)
+{
+	struct intel_super *super = st->sb;
+	struct imsm_super *mpb = super->anchor;
+	struct intel_dev *dv;
+	struct imsm_dev *dev;
+	struct imsm_vol *vol;
+	struct imsm_map *map;
+	int idx = mpb->num_raid_devs;
+	int i;
+	unsigned long long array_blocks;
+	size_t size_old, size_new;
+	size_t dev_size;
+	__u32 num_data_stripes;
+
+	if (super->orom && mpb->num_raid_devs >= super->orom->vpa) {
+		fprintf(stderr, Name": This imsm-container already has the "
+			"maximum of %d volumes\n", super->orom->vpa);
+		return 0;
+	}
+
+	/* checked up front so that a refusal neither leaks the device
+	 * records allocated below nor leaves 'super' half updated
+	 */
+	if (info->level == 1 && info->raid_disks > 2) {
+		fprintf(stderr, Name": imsm does not support more than 2 disks"
+			" in a raid1 volume\n");
+		return 0;
+	}
+
+	/* ensure the mpb is large enough for the new data */
+	size_old = __le32_to_cpu(mpb->mpb_size);
+	size_new = disks_to_mpb_size(info->nr_disks);
+	if (size_new > size_old) {
+		void *mpb_new;
+		size_t size_round = ROUND_UP(size_new, 512);
+
+		if (posix_memalign(&mpb_new, 512, size_round) != 0) {
+			fprintf(stderr, Name": could not allocate new mpb\n");
+			return 0;
+		}
+		memcpy(mpb_new, mpb, size_old);
+		free(mpb);
+		mpb = mpb_new;
+		super->anchor = mpb_new;
+		mpb->mpb_size = __cpu_to_le32(size_new);
+		memset((char *) mpb_new + size_old, 0, size_round - size_old);
+	}
+	super->current_vol = idx;
+	/* when creating the first raid device in this container set num_disks
+	 * to zero, i.e. delete this spare and add raid member devices in
+	 * add_to_super_imsm_volume()
+	 */
+	if (super->current_vol == 0)
+		mpb->num_disks = 0;
+
+	for (i = 0; i < super->current_vol; i++) {
+		dev = get_imsm_dev(super, i);
+		if (strncmp((char *) dev->volume, name,
+			    MAX_RAID_SERIAL_LEN) == 0) {
+			fprintf(stderr, Name": '%s' is already defined for this container\n",
+				name);
+			return 0;
+		}
+	}
+
+	sprintf(st->subarray, "%d", idx);
+	dv = malloc(sizeof(*dv));
+	if (!dv) {
+		fprintf(stderr, Name ": failed to allocate device list entry\n");
+		return 0;
+	}
+	dev_size = sizeof(*dev) + sizeof(__u32) * (info->raid_disks - 1);
+	dev = malloc(dev_size);
+	if (!dev) {
+		free(dv);
+		fprintf(stderr, Name": could not allocate raid device\n");
+		return 0;
+	}
+	/* this record ends up in the on-disk metadata; zero it so that any
+	 * field not assigned below is not left indeterminate
+	 */
+	memset(dev, 0, dev_size);
+	strncpy((char *) dev->volume, name, MAX_RAID_SERIAL_LEN);
+	if (info->level == 1)
+		array_blocks = info_to_blocks_per_member(info);
+	else
+		array_blocks = calc_array_size(info->level, info->raid_disks,
+					       info->layout, info->chunk_size,
+					       info->size*2);
+	/* round array size down to closest MB */
+	array_blocks = (array_blocks >> SECT_PER_MB_SHIFT) << SECT_PER_MB_SHIFT;
+
+	dev->size_low = __cpu_to_le32((__u32) array_blocks);
+	dev->size_high = __cpu_to_le32((__u32) (array_blocks >> 32));
+	dev->status = __cpu_to_le32(0);
+	dev->reserved_blocks = __cpu_to_le32(0);
+	vol = &dev->vol;
+	vol->migr_state = 0;
+	set_migr_type(dev, MIGR_INIT);
+	vol->dirty = 0;
+	vol->curr_migr_unit = 0;
+	map = get_imsm_map(dev, 0);
+	map->pba_of_lba0 = __cpu_to_le32(super->create_offset);
+	map->blocks_per_member = __cpu_to_le32(info_to_blocks_per_member(info));
+	map->blocks_per_strip = __cpu_to_le16(info_to_blocks_per_strip(info));
+	map->failed_disk_num = ~0;
+	map->map_state = info->level ? IMSM_T_STATE_UNINITIALIZED :
+				       IMSM_T_STATE_NORMAL;
+	map->ddf = 1;
+
+	/* raid10 is recorded as raid level 1 with raid_disks / 2 domains */
+	if (info->level == 10) {
+		map->raid_level = 1;
+		map->num_domains = info->raid_disks / 2;
+	} else {
+		map->raid_level = info->level;
+		map->num_domains = 1;
+	}
+	num_data_stripes = info_to_num_data_stripes(info, map->num_domains);
+	map->num_data_stripes = __cpu_to_le32(num_data_stripes);
+
+	map->num_members = info->raid_disks;
+	for (i = 0; i < map->num_members; i++) {
+		/* initialized in add_to_super */
+		set_imsm_ord_tbl_ent(map, i, 0);
+	}
+	mpb->num_raid_devs++;
+
+	dv->dev = dev;
+	dv->index = super->current_vol;
+	dv->next = super->devlist;
+	super->devlist = dv;
+
+	imsm_update_version_info(super);
+
+	return 1;
+}
+
+/* This is primarily called by Create when creating a new array.
+ * We will then get add_to_super called for each component, and then
+ * write_init_super called to write it out to each device.
+ * For IMSM, Create can create on fresh devices or on a pre-existing
+ * array.
+ * When st->sb is already set the request is for a volume inside that
+ * container and is handed to init_super_imsm_volume().  Otherwise a
+ * fresh, empty mpb sized for info->nr_disks disks is allocated.
+ *
+ * A NULL 'info' only clears st->sb and returns 0.
+ *
+ * Returns 1 on success and 0 on failure.
+ */
+static int init_super_imsm(struct supertype *st, mdu_array_info_t *info,
+			   unsigned long long size, char *name,
+			   char *homehost, int *uuid)
+{
+	struct intel_super *super;
+	struct imsm_super *mpb;
+	size_t mpb_size;
+	char *version;
+
+	if (!info) {
+		st->sb = NULL;
+		return 0;
+	}
+	if (st->sb)
+		return init_super_imsm_volume(st, info, size, name, homehost,
+					      uuid);
+
+	super = alloc_super(1);
+	if (!super)
+		return 0;
+	mpb_size = disks_to_mpb_size(info->nr_disks);
+	if (posix_memalign(&super->buf, 512, mpb_size) != 0) {
+		/* alloc_super() may have attached an hba path, so release
+		 * everything rather than just the structure
+		 */
+		super->buf = NULL;
+		free_imsm(super);
+		return 0;
+	}
+	mpb = super->buf;
+	memset(mpb, 0, mpb_size);
+
+	mpb->attributes = MPB_ATTRIB_CHECKSUM_VERIFY;
+
+	version = (char *) mpb->sig;
+	strcpy(version, MPB_SIGNATURE);
+	version += strlen(MPB_SIGNATURE);
+	strcpy(version, MPB_VERSION_RAID0);
+	/* mpb_size is kept little-endian, as everywhere else it is set */
+	mpb->mpb_size = __cpu_to_le32(mpb_size);
+
+	st->sb = super;
+	return 1;
+}
+
+#ifndef MDASSEMBLE
+/* Add the disk described by 'dk' to the volume being created
+ * (super->current_vol).
+ *
+ * The disk must already be on super->disks.  With fd == -1 it is looked
+ * up by the raid_disk slot pre-marked during autolayout, otherwise by
+ * major/minor number.  'devname' is only used for error messages.
+ *
+ * Returns 0 on success and 1 on failure.
+ */
+static int add_to_super_imsm_volume(struct supertype *st, mdu_disk_info_t *dk,
+				     int fd, char *devname)
+{
+	struct intel_super *super = st->sb;
+	struct imsm_super *mpb = super->anchor;
+	struct imsm_dev *dev = get_imsm_dev(super, super->current_vol);
+	struct imsm_map *map = get_imsm_map(dev, 0);
+	struct dl *dl;
+
+	if ((dk->state & (1<<MD_DISK_SYNC)) == 0) {
+		fprintf(stderr, Name ": %s: Cannot add spare devices to IMSM volume\n",
+			devname);
+		return 1;
+	}
+
+	for (dl = super->disks; dl; dl = dl->next) {
+		if (fd == -1) {
+			/* we're doing autolayout so grab the pre-marked
+			 * (in validate_geometry) raid_disk
+			 */
+			if (dl->raiddisk == dk->raid_disk)
+				break;
+		} else if (dl->major == dk->major && dl->minor == dk->minor) {
+			break;
+		}
+	}
+
+	if (dl == NULL) {
+		fprintf(stderr, Name ": %s is not a member of the same container\n", devname);
+		return 1;
+	}
+
+	/* add a pristine spare to the metadata */
+	if (dl->index < 0)
+		dl->index = super->anchor->num_disks++;
+
+	set_imsm_ord_tbl_ent(map, dk->number, dl->index);
+	dl->disk.status = CONFIGURED_DISK | USABLE_DISK;
+
+	if (super->current_vol != 0)
+		return 0;
+
+	/* creating the first raid device: the family number is the mpb
+	 * checksum taken with this device and disk copied in
+	 */
+	*__get_imsm_dev(mpb, 0) = *dev;
+	*__get_imsm_disk(mpb, dl->index) = dl->disk;
+	mpb->family_num = __cpu_to_le32(__gen_imsm_checksum(mpb));
+
+	return 0;
+}
+
+/* Add the device open on 'fd' to the metadata in st->sb.
+ *
+ * When a volume is selected the request is handed to
+ * add_to_super_imsm_volume().  Otherwise a new spare entry is built for
+ * the device and linked onto super->add when st->update_tail is set, or
+ * onto super->disks when it is not.  The entry keeps 'fd'.
+ *
+ * 'devname' is used for messages and duplicated into the entry; it may
+ * be NULL.
+ *
+ * Returns 0 on success and 1 on failure.  The process is aborted when
+ * the serial number cannot be read.
+ */
+static int add_to_super_imsm(struct supertype *st, mdu_disk_info_t *dk,
+			     int fd, char *devname)
+{
+	struct intel_super *super = st->sb;
+	struct dl *dd;
+	struct dl **head;
+	unsigned long long size;
+	__u32 id;
+	struct stat stb;
+
+	/* if we are on an RAID enabled platform check that the disk is
+	 * attached to the raid controller
+	 */
+	if (super->hba && !disk_attached_to_hba(fd, super->hba)) {
+		fprintf(stderr,
+			Name ": %s is not attached to the raid controller: %s\n",
+			devname ? devname : "disk", super->hba);
+		return 1;
+	}
+
+	if (super->current_vol >= 0)
+		return add_to_super_imsm_volume(st, dk, fd, devname);
+
+	fstat(fd, &stb);
+	dd = malloc(sizeof(*dd));
+	if (dd == NULL) {
+		fprintf(stderr,
+			Name ": malloc failed %s:%d.\n", __func__, __LINE__);
+		return 1;
+	}
+	memset(dd, 0, sizeof(*dd));
+	dd->fd = fd;
+	dd->index = -1;
+	dd->e = NULL;
+	dd->major = major(stb.st_rdev);
+	dd->minor = minor(stb.st_rdev);
+	dd->devname = devname ? strdup(devname) : NULL;
+
+	if (imsm_read_serial(fd, devname, dd->serial) != 0) {
+		fprintf(stderr,
+			Name ": failed to retrieve scsi serial, aborting\n");
+		free(dd);
+		abort();
+	}
+
+	/* fill in the on-disk record for a usable spare */
+	get_dev_size(fd, NULL, &size);
+	size /= 512;
+	serialcpy(dd->disk.serial, dd->serial);
+	dd->disk.total_blocks = __cpu_to_le32(size);
+	dd->disk.status = USABLE_DISK | SPARE_DISK;
+	if (sysfs_disk_to_scsi_id(fd, &id) != 0)
+		id = 0;
+	dd->disk.scsi_id = __cpu_to_le32(id);
+
+	head = st->update_tail ? &super->add : &super->disks;
+	dd->next = *head;
+	*head = dd;
+
+	return 0;
+}
+
+static int store_imsm_mpb(int fd, struct intel_super *super);
+
+/* Write a spare record to every spare (index == -1) on super->disks.
+ *
+ * Spare records have their own family number and do not have any
+ * defined raid devices.  The anchor is temporarily rewritten as a
+ * single-disk mpb with no raid devices for each spare, and restored
+ * before returning.
+ *
+ * When 'doclose' is set, the fd of each spare is closed after it has
+ * been written.
+ *
+ * Returns 0 on success and 1 as soon as one store fails.
+ */
+static int write_super_imsm_spares(struct intel_super *super, int doclose)
+{
+	struct imsm_super mpb_save;
+	struct imsm_super *mpb = super->anchor;
+	__u32 sum;
+	struct dl *d;
+
+	mpb_save = *mpb;
+	mpb->num_raid_devs = 0;
+	mpb->num_disks = 1;
+	/* mpb_size is kept little-endian, as everywhere else it is set */
+	mpb->mpb_size = __cpu_to_le32(sizeof(struct imsm_super));
+	mpb->generation_num = __cpu_to_le32(1UL);
+
+	for (d = super->disks; d; d = d->next) {
+		if (d->index != -1)
+			continue;
+
+		mpb->disk[0] = d->disk;
+		/* the family number is the checksum taken before check_sum
+		 * itself is refreshed
+		 */
+		sum = __gen_imsm_checksum(mpb);
+		mpb->family_num = __cpu_to_le32(sum);
+		sum = __gen_imsm_checksum(mpb);
+		mpb->check_sum = __cpu_to_le32(sum);
+
+		if (store_imsm_mpb(d->fd, super)) {
+			fprintf(stderr, "%s: failed for device %d:%d %s\n",
+				__func__, d->major, d->minor, strerror(errno));
+			*mpb = mpb_save;
+			return 1;
+		}
+		if (doclose) {
+			close(d->fd);
+			d->fd = -1;
+		}
+	}
+
+	*mpb = mpb_save;
+	return 0;
+}
+
+/* Regenerate the mpb from the in-memory state and write it to every member disk */
+static int write_super_imsm(struct intel_super *super, int doclose)
+{
+	struct imsm_super *mpb = super->anchor;
+	struct dl *d;
+	__u32 generation;
+	__u32 sum;
+	int spares = 0;
+	int i;
+	__u32 mpb_size = sizeof(struct imsm_super) - sizeof(struct imsm_disk);
+
+	/* 'generation' is incremented every time the metadata is written */
+	generation = __le32_to_cpu(mpb->generation_num);
+	generation++;
+	mpb->generation_num = __cpu_to_le32(generation);
+
+	mpb_size += sizeof(struct imsm_disk) * mpb->num_disks;
+	for (d = super->disks; d; d = d->next) {
+		if (d->index == -1)
+			spares++;	/* written separately at the end */
+		else
+			mpb->disk[d->index] = d->disk;
+	}
+	for (d = super->missing; d; d = d->next)
+		mpb->disk[d->index] = d->disk;
+
+	for (i = 0; i < mpb->num_raid_devs; i++) {
+		struct imsm_dev *dev = __get_imsm_dev(mpb, i);
+		imsm_copy_dev(dev, get_imsm_dev(super, i));
+		mpb_size += sizeof_imsm_dev(dev, 0);
+	}
+	mpb_size += __le32_to_cpu(mpb->bbm_log_size);
+	mpb->mpb_size = __cpu_to_le32(mpb_size);
+
+	/* recalculate checksum */
+	sum = __gen_imsm_checksum(mpb);
+	mpb->check_sum = __cpu_to_le32(sum);
+
+	/* write the mpb to member disks; NOTE(review): a failure is only logged */
+	for (d = super->disks; d ; d = d->next) {
+		if (d->index < 0)
+			continue;
+		if (store_imsm_mpb(d->fd, super))
+			fprintf(stderr, "%s: failed for device %d:%d %s\n",
+				__func__, d->major, d->minor, strerror(errno));
+		if (doclose) {
+			close(d->fd);
+			d->fd = -1;
+		}
+	}
+
+	if (spares)
+		return write_super_imsm_spares(super, doclose);
+
+	return 0;
+}
+
+
+/* Queue an update_create_array record describing the volume super->current_vol */
+static int create_array(struct supertype *st)
+{
+	struct intel_super *super = st->sb;
+	struct imsm_dev *dev = get_imsm_dev(super, super->current_vol);
+	struct imsm_map *map = get_imsm_map(dev, 0);
+	struct imsm_update_create_array *update;
+	struct disk_info *serials;
+	size_t update_len;
+	int slot;
+
+	/* room for the variable-sized imsm_dev plus one disk_info per member */
+	update_len = sizeof(*update) - sizeof(*dev) + sizeof_imsm_dev(dev, 0);
+	update_len += sizeof(*serials) * map->num_members;
+	update = malloc(update_len);
+	if (update == NULL) {
+		fprintf(stderr, "%s: failed to allocate update buffer\n",
+			__func__);
+		return 1;
+	}
+
+	update->type = update_create_array;
+	update->dev_idx = super->current_vol;
+	imsm_copy_dev(&update->dev, dev);
+	serials = get_disk_info(update);
+	for (slot = 0; slot < map->num_members; slot++) {
+		int disk_idx = get_imsm_disk_idx(dev, slot);
+		struct imsm_disk *disk = get_imsm_disk(super, disk_idx);
+
+		serialcpy(serials[slot].serial, disk->serial);
+	}
+	append_metadata_update(st, update, update_len);
+	return 0;
+}
+
+/* Queue an update_add_disk record if any disk has been staged on super->add */
+static int _add_disk(struct supertype *st)
+{
+	struct intel_super *super = st->sb;
+	struct imsm_update_add_disk *update;
+
+	/* nothing staged, nothing to announce */
+	if (super->add == NULL)
+		return 0;
+
+	update = malloc(sizeof(*update));
+	if (update == NULL) {
+		fprintf(stderr, "%s: failed to allocate update buffer\n",
+			__func__);
+		return 1;
+	}
+
+	/* only the type field is filled in here */
+	update->type = update_add_disk;
+	append_metadata_update(st, update, sizeof(*update));
+	return 0;
+}
+
+/* Commit a freshly initialised superblock: write it out directly, or queue it
+ * as a metadata update when st->update_tail is set.
+ */
+static int write_init_super_imsm(struct supertype *st)
+{
+	struct intel_super *super = st->sb;
+	struct dl *disk;
+	int rv;
+
+	/* no update queue: write the metadata now and close the disks */
+	if (!st->update_tail)
+		return write_super_imsm(super, 1);
+
+	/* queueing: a negative current_vol means a disk is being added rather
+	 * than a volume created.  In the add disk case we are running in mdmon
+	 * context, so don't close fd's
+	 */
+	if (super->current_vol < 0)
+		return _add_disk(st);
+
+	rv = create_array(st);
+	for (disk = super->disks; disk; disk = disk->next) {
+		close(disk->fd);
+		disk->fd = -1;
+	}
+	return rv;
+}
+#endif
+
+static int store_zero_imsm(struct supertype *st, int fd)
+{
+ unsigned long long dsize;
+ void *buf;
+
+ get_dev_size(fd, NULL, &dsize);
+
+ /* first block is stored on second to last sector of the disk */
+ if (lseek64(fd, dsize - (512 * 2), SEEK_SET) < 0)
+ return 1;
+
+ if (posix_memalign(&buf, 512, 512) != 0)
+ return 1;
+
+ memset(buf, 0, 512);
+ if (write(fd, buf, 512) != 512)
+ return 1;
+ return 0;
+}
+
+static int imsm_bbm_log_size(struct imsm_super *mpb)
+{
+	return __le32_to_cpu(mpb->bbm_log_size);	/* BBM log size in host byte order */
+}
+
+#ifndef MDASSEMBLE
+/* Check that 'dev' may join an imsm container and report its usable size in
+ * *freesize.  Returns 1 when acceptable, 0 otherwise.
+ */
+static int validate_geometry_imsm_container(struct supertype *st, int level,
+					    int layout, int raiddisks, int chunk,
+					    unsigned long long size, char *dev,
+					    unsigned long long *freesize,
+					    int verbose)
+{
+	const struct imsm_orom *orom;
+	unsigned long long ldsize;
+	int have_size;
+	int fd;
+
+	if (level != LEVEL_CONTAINER)
+		return 0;
+	if (dev == NULL)
+		return 1;
+
+	/* IMSM_NO_PLATFORM in the environment skips the option-rom lookup */
+	orom = check_env("IMSM_NO_PLATFORM") ? NULL : find_imsm_orom();
+	if (orom != NULL && raiddisks > orom->tds) {
+		if (verbose)
+			fprintf(stderr, Name ": %d exceeds maximum number of"
+				" platform supported disks: %d\n",
+				raiddisks, orom->tds);
+		return 0;
+	}
+
+	fd = open(dev, O_RDONLY|O_EXCL, 0);
+	if (fd < 0) {
+		if (verbose)
+			fprintf(stderr, Name ": imsm: Cannot open %s: %s\n",
+				dev, strerror(errno));
+		return 0;
+	}
+	have_size = get_dev_size(fd, dev, &ldsize);
+	close(fd);
+	if (!have_size)
+		return 0;
+
+	*freesize = avail_size_imsm(st, ldsize >> 9);
+	return 1;
+}
+
+/* Merge e[*idx] with the extents that overlap or touch it; return the merged length */
+static unsigned long long find_size(struct extent *e, int *idx, int num_extents)
+{
+	const unsigned long long base_start = e[*idx].start;
+	unsigned long long end = base_start + e[*idx].size;
+	int i;
+
+	if (base_start == end)
+		return 0;	/* zero-sized extent: nothing to merge */
+
+	*idx = *idx + 1;
+	for (i = *idx; i < num_extents; i++) {
+		/* extend overlapping extents */
+		if (e[i].start >= base_start &&
+		    e[i].start <= end) {
+			if (e[i].size == 0)
+				return 0;
+			if (e[i].start + e[i].size > end)
+				end = e[i].start + e[i].size;
+		} else if (e[i].start > end) {
+			*idx = i;	/* first extent beyond the merged range */
+			break;
+		}
+	}
+	return end - base_start;
+}
+
+static unsigned long long merge_extents(struct intel_super *super, int sum_extents)
+{
+	/* build a composite disk with all known extents and generate a new
+	 * 'maxsize' given the "all disks in an array must share a common start
+	 * offset" constraint.  Sets super->create_offset; returns ~0ULL on error
+	 */
+	struct extent *e = calloc(sum_extents, sizeof(*e));
+	struct dl *dl;
+	int i, j;
+	int start_extent;
+	unsigned long long pos;
+	unsigned long long start = 0;
+	unsigned long long maxsize;
+	unsigned long reserve;
+
+	if (!e)
+		return ~0ULL; /* error */
+
+	/* coalesce and sort all extents. also, check to see if we need to
+	 * reserve space between member arrays
+	 */
+	j = 0;
+	for (dl = super->disks; dl; dl = dl->next) {
+		if (!dl->e)
+			continue;
+		for (i = 0; i < dl->extent_cnt; i++)
+			e[j++] = dl->e[i];
+	}
+	qsort(e, sum_extents, sizeof(*e), cmp_extent);
+
+	/* merge extents in place; a zero-sized result ends the merged list */
+	i = 0;
+	j = 0;
+	while (i < sum_extents) {
+		e[j].start = e[i].start;
+		e[j].size = find_size(e, &i, sum_extents);
+		j++;
+		if (e[j-1].size == 0)
+			break;
+	}
+	/* find the largest gap in front of a merged extent and where it starts */
+	pos = 0;
+	maxsize = 0;
+	start_extent = 0;
+	i = 0;
+	do {
+		unsigned long long esize;
+
+		esize = e[i].start - pos;
+		if (esize >= maxsize) {
+			maxsize = esize;
+			start = pos;
+			start_extent = i;
+		}
+		pos = e[i].start + e[i].size;
+		i++;
+	} while (e[i-1].size);	/* stops after the zero-sized entry */
+	free(e);
+
+	if (start_extent > 0)
+		reserve = IMSM_RESERVED_SECTORS; /* gap between raid regions */
+	else
+		reserve = 0;
+
+	if (maxsize < reserve)
+		return ~0ULL;
+
+	super->create_offset = ~((__u32) 0);	/* largest offset that fits in 32 bits */
+	if (start + reserve > super->create_offset)
+		return ~0ULL; /* start overflows create_offset */
+	super->create_offset = start + reserve;
+
+	return maxsize - reserve;
+}
+
+/* Return non-zero when 'level' with 'raiddisks' members may be created here */
+static int is_raid_level_supported(const struct imsm_orom *orom, int level, int raiddisks)
+{
+	/* rejected with or without an option-rom */
+	if (level < 0 || level == 4 || level == 6)
+		return 0;
+
+	/* not on an Intel RAID platform so anything goes */
+	if (orom == NULL)
+		return 1;
+
+	if (level == 0)
+		return imsm_orom_has_raid0(orom);
+	if (level == 1 && raiddisks > 2)
+		return imsm_orom_has_raid1e(orom);
+	if (level == 1)
+		return imsm_orom_has_raid1(orom) && raiddisks == 2;
+	if (level == 10)
+		return imsm_orom_has_raid10(orom) && raiddisks == 4;
+	return level == 5 && imsm_orom_has_raid5(orom) && raiddisks > 2;
+}
+
+#define pr_vrb(fmt, arg...) (void) (verbose && fprintf(stderr, Name fmt, ##arg))
+/* validate_geometry_imsm_volume - lifted from validate_geometry_ddf_bvd.
+ * Returns 1 if the requested volume geometry is acceptable, 0 if it is not.
+ * FIX ME add ahci details
+ */
+static int validate_geometry_imsm_volume(struct supertype *st, int level,
+					 int layout, int raiddisks, int chunk,
+					 unsigned long long size, char *dev,
+					 unsigned long long *freesize,
+					 int verbose)
+{
+	struct stat stb;
+	struct intel_super *super = st->sb;
+	struct imsm_super *mpb;
+	struct dl *dl;
+	unsigned long long pos = 0;
+	unsigned long long maxsize;
+	struct extent *e;
+	int i;
+
+	/* We must have the container info already read in. */
+	if (!super)
+		return 0;
+	mpb = super->anchor;	/* only dereference 'super' after the check */
+
+	if (!is_raid_level_supported(super->orom, level, raiddisks)) {
+		pr_vrb(": platform does not support raid%d with %d disk%s\n",
+			level, raiddisks, raiddisks > 1 ? "s" : "");
+		return 0;
+	}
+	if (super->orom && level != 1 &&
+	    !imsm_orom_has_chunk(super->orom, chunk)) {
+		pr_vrb(": platform does not support a chunk size of: %d\n", chunk);
+		return 0;
+	}
+	if (layout != imsm_level_to_layout(level)) {
+		if (level == 5)
+			pr_vrb(": imsm raid 5 only supports the left-asymmetric layout\n");
+		else if (level == 10)
+			pr_vrb(": imsm raid 10 only supports the n2 layout\n");
+		else
+			pr_vrb(": imsm unknown layout %#x for this raid level %d\n",
+				layout, level);
+		return 0;
+	}
+
+	if (!dev) {
+		/* General test: make sure there is space for 'raiddisks'
+		 * device extents of size 'size' at a given offset
+		 */
+		unsigned long long minsize = size;
+		unsigned long long start_offset = ~0ULL;
+		int dcnt = 0;
+		if (minsize == 0)
+			minsize = MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS;
+		for (dl = super->disks; dl ; dl = dl->next) {
+			int found = 0;
+
+			pos = 0;
+			i = 0;
+			e = get_extents(super, dl);
+			if (!e) continue;
+			do {
+				unsigned long long esize;
+				esize = e[i].start - pos;
+				if (esize >= minsize)
+					found = 1;
+				if (found && start_offset == ~0ULL) {
+					start_offset = pos;
+					break;
+				} else if (found && pos != start_offset) {
+					found = 0;
+					break;
+				}
+				pos = e[i].start + e[i].size;
+				i++;
+			} while (e[i-1].size);
+			if (found)
+				dcnt++;
+			free(e);
+		}
+		if (dcnt < raiddisks) {
+			if (verbose)
+				fprintf(stderr, Name ": imsm: Not enough "
+					"devices with space for this array "
+					"(%d < %d)\n",
+					dcnt, raiddisks);
+			return 0;
+		}
+		return 1;
+	}
+
+	/* This device must be a member of the set */
+	if (stat(dev, &stb) < 0)
+		return 0;
+	if ((S_IFMT & stb.st_mode) != S_IFBLK)
+		return 0;
+	for (dl = super->disks ; dl ; dl = dl->next) {
+		if (dl->major == major(stb.st_rdev) &&
+		    dl->minor == minor(stb.st_rdev))
+			break;
+	}
+	if (!dl) {
+		if (verbose)
+			fprintf(stderr, Name ": %s is not in the "
+				"same imsm set\n", dev);
+		return 0;
+	} else if (super->orom && dl->index < 0 && mpb->num_raid_devs) {
+		/* With a volume already present new spares cannot be added:
+		 * the orom may not understand this configuration (all member
+		 * disks must be members of each array in the container).
+		 */
+		fprintf(stderr, Name ": %s is a spare and a volume"
+			" is already defined for this container\n", dev);
+		fprintf(stderr, Name ": The option-rom requires all member"
+			" disks to be a member of all volumes\n");
+		return 0;
+	}
+
+	/* retrieve the largest free space block */
+	e = get_extents(super, dl);
+	maxsize = 0;
+	i = 0;
+	if (e) {
+		do {
+			unsigned long long esize;
+
+			esize = e[i].start - pos;
+			if (esize >= maxsize)
+				maxsize = esize;
+			pos = e[i].start + e[i].size;
+			i++;
+		} while (e[i-1].size);
+		dl->e = e;
+		dl->extent_cnt = i;
+	} else {
+		if (verbose)
+			fprintf(stderr, Name ": unable to determine free space for: %s\n",
+				dev);
+		return 0;
+	}
+	if (maxsize < size) {
+		if (verbose)
+			fprintf(stderr, Name ": %s not enough space (%llu < %llu)\n",
+				dev, maxsize, size);
+		return 0;
+	}
+
+	/* count total number of extents for merge */
+	i = 0;
+	for (dl = super->disks; dl; dl = dl->next)
+		if (dl->e)
+			i += dl->extent_cnt;
+
+	maxsize = merge_extents(super, i);
+	if (maxsize < size) {
+		if (verbose)
+			fprintf(stderr, Name ": not enough space after merge (%llu < %llu)\n",
+				maxsize, size);
+		return 0;
+	} else if (maxsize == ~0ULL) {
+		if (verbose)
+			fprintf(stderr, Name ": failed to merge %d extents\n", i);
+		return 0;
+	}
+
+	*freesize = maxsize;
+
+	return 1;
+}
+
+/* Automatic layout: find the largest free region that starts at the same
+ * offset on the candidate disks, number those disks and return the volume
+ * size in *freesize.  Returns 1 on success, 0 when the request cannot be met.
+ */
+static int reserve_space(struct supertype *st, int raiddisks,
+			 unsigned long long size, int chunk,
+			 unsigned long long *freesize)
+{
+	struct intel_super *super = st->sb;
+	struct imsm_super *mpb = super->anchor;
+	struct dl *dl;
+	int i;
+	int extent_cnt = 0;
+	struct extent *e;
+	unsigned long long maxsize;
+	unsigned long long minsize;
+	int cnt = 0;
+	int used = 0;
+
+	/* find the largest common start free region of the possible disks */
+	for (dl = super->disks; dl; dl = dl->next) {
+		dl->raiddisk = -1;
+
+		if (dl->index >= 0)
+			used++;
+
+		/* don't activate new spares if we are orom constrained
+		 * and there is already a volume active in the container
+		 */
+		if (super->orom && dl->index < 0 && mpb->num_raid_devs)
+			continue;
+
+		e = get_extents(super, dl);
+		if (!e)
+			continue;
+		for (i = 1; e[i-1].size; i++)
+			;
+		dl->e = e;
+		dl->extent_cnt = i;
+		extent_cnt += i;
+		cnt++;
+	}
+
+	maxsize = merge_extents(super, extent_cnt);
+	minsize = size;
+	if (size == 0)
+		minsize = chunk;
+
+	if (cnt < raiddisks ||
+	    (super->orom && used && used != raiddisks) ||
+	    maxsize == ~0ULL || maxsize < minsize) { /* ~0ULL: merge_extents failed */
+		fprintf(stderr, Name ": not enough devices with space to create array.\n");
+		return 0; /* Not enough free regions that are large enough */
+	}
+
+	if (size == 0) {
+		size = maxsize;
+		if (chunk) {
+			size /= chunk;
+			size *= chunk;
+		}
+	}
+
+	cnt = 0;
+	for (dl = super->disks; dl; dl = dl->next)
+		if (dl->e)
+			dl->raiddisk = cnt++;
+
+	*freesize = size;
+	return 1;
+}
+
+/* Validate the geometry of a container or member volume about to be created.
+ * Returns 1 if the request is acceptable and 0 if it is not.
+ *
+ * if given unused devices create a container
+ * if given devices in a container create a member volume
+ * With a NULL 'dev' only the general request is checked; if a container is
+ * loaded and 'freesize' is given the volume is laid out automatically.
+ */
+static int validate_geometry_imsm(struct supertype *st, int level, int layout,
+				  int raiddisks, int chunk, unsigned long long size,
+				  char *dev, unsigned long long *freesize,
+				  int verbose)
+{
+	int fd, cfd;
+	struct mdinfo *sra;
+
+	if (level == LEVEL_CONTAINER) {
+		/* Must be a fresh device to add to a container */
+		return validate_geometry_imsm_container(st, level, layout,
+							raiddisks, chunk, size,
+							dev, freesize,
+							verbose);
+	}
+
+	if (!dev) {
+		if (st->sb && freesize) {
+			/* we are being asked to automatically layout a
+			 * new volume based on the current contents of
+			 * the container.  If the parameters can be
+			 * satisfied reserve_space will record the disks,
+			 * start offset, and size of the volume to be
+			 * created.  add_to_super and getinfo_super
+			 * detect when autolayout is in progress.
+			 */
+			return reserve_space(st, raiddisks, size, chunk, freesize);
+		}
+		return 1;
+	}
+	if (st->sb) {
+		/* creating in a given container */
+		return validate_geometry_imsm_volume(st, level, layout,
+						     raiddisks, chunk, size,
+						     dev, freesize, verbose);
+	}
+
+	/* This device needs to be a device in an 'imsm' container */
+	fd = open(dev, O_RDONLY|O_EXCL, 0);
+	if (fd >= 0) {
+		if (verbose)
+			fprintf(stderr,
+				Name ": Cannot create this array on device %s\n",
+				dev);
+		close(fd);
+		return 0;
+	}
+	if (errno != EBUSY || (fd = open(dev, O_RDONLY, 0)) < 0) {
+		if (verbose)
+			fprintf(stderr, Name ": Cannot open %s: %s\n",
+				dev, strerror(errno));
+		return 0;
+	}
+	/* Well, it is in use by someone, maybe an 'imsm' container. */
+	cfd = open_container(fd);
+	if (cfd < 0) {
+		close(fd);
+		if (verbose)
+			fprintf(stderr, Name ": Cannot use %s: It is busy\n",
+				dev);
+		return 0;
+	}
+	/* NOTE(review): 'sra' is not released on any path below; confirm how
+	 * the result of sysfs_read() is meant to be freed.
+	 */
+	sra = sysfs_read(cfd, 0, GET_VERSION);
+	close(fd);
+	if (sra && sra->array.major_version == -1 &&
+	    strcmp(sra->text_version, "imsm") == 0) {
+		/* This is a member of a imsm container.  Load the container
+		 * and try to create a volume
+		 */
+		struct intel_super *super;
+
+		if (load_super_imsm_all(st, cfd, (void **) &super, NULL, 1) == 0) {
+			st->sb = super;
+			st->container_dev = fd2devnum(cfd);
+			close(cfd);
+			return validate_geometry_imsm_volume(st, level, layout,
+							     raiddisks, chunk,
+							     size, dev,
+							     freesize, verbose);
+		}
+		close(cfd);
+	} else {
+		/* may belong to another container; close the container fd
+		 * here too so that it is not leaked */
+		close(cfd);
+		return 0;
+	}
+
+	return 1;
+}
+#endif /* MDASSEMBLE */
+
+static struct mdinfo *container_content_imsm(struct supertype *st)
+{
+	/* Given a container loaded by load_super_imsm_all, extract
+	 * information about all the arrays into an mdinfo list.  For each
+	 * imsm_dev create an mdinfo, fill it in, then look for matching
+	 * devices in super->disks and create appropriate device mdinfo.
+	 */
+	struct intel_super *super = st->sb;
+	struct imsm_super *mpb = super->anchor;
+	struct mdinfo *rest = NULL;
+	int i;
+
+	/* do not assemble arrays that might have bad blocks */
+	if (imsm_bbm_log_size(super->anchor)) {
+		fprintf(stderr, Name ": BBM log found in metadata. "
+				"Cannot activate array(s).\n");
+		return NULL;
+	}
+
+	for (i = 0; i < mpb->num_raid_devs; i++) {
+		struct imsm_dev *dev = get_imsm_dev(super, i);
+		struct imsm_map *map = get_imsm_map(dev, 0);
+		struct mdinfo *this;
+		int slot;
+
+		/* do not publish arrays that are in the middle of an
+		 * unsupported migration
+		 */
+		if (dev->vol.migr_state &&
+		    (migr_type(dev) == MIGR_GEN_MIGR ||
+		     migr_type(dev) == MIGR_STATE_CHANGE)) {
+			fprintf(stderr, Name ": cannot assemble volume '%.16s':"
+				" unsupported migration in progress\n",
+				dev->volume);
+			continue;
+		}
+
+		this = malloc(sizeof(*this));
+		if (!this) {
+			fprintf(stderr, Name ": failed to allocate"
+				" volume %.16s\n", dev->volume);
+			break;	/* return what was gathered so far */
+		}
+		memset(this, 0, sizeof(*this));
+		this->next = rest;
+
+		super->current_vol = i;
+		getinfo_super_imsm_volume(st, this);
+		for (slot = 0 ; slot < map->num_members; slot++) {
+			struct mdinfo *info_d;
+			struct dl *d;
+			int idx = get_imsm_disk_idx(dev, slot);
+			__u32 ord = get_imsm_ord_tbl_ent(dev, slot);
+			int skip = 0;
+			__u32 s;
+
+			for (d = super->disks; d ; d = d->next)
+				if (d->index == idx)
+					break;
+
+			if (d == NULL)
+				skip = 1;
+			s = d ? d->disk.status : 0;
+			if (s & FAILED_DISK)
+				skip = 1;
+			if (!(s & USABLE_DISK))
+				skip = 1;
+			if (ord & IMSM_ORD_REBUILD)
+				skip = 1;
+
+			/* if we skip some disks the array will be assembled
+			 * degraded; reset resync start to avoid a dirty-degraded
+			 * situation.  FIXME handle dirty degraded
+			 */
+			if (skip && !dev->vol.dirty)
+				this->resync_start = ~0ULL;
+			if (skip)
+				continue;
+
+			info_d = malloc(sizeof(*info_d));
+			if (!info_d) {
+				fprintf(stderr, Name ": failed to allocate disk"
+					" for volume %.16s\n", dev->volume);
+				while ((info_d = this->devs) != NULL) { /* free members too */
+					this->devs = info_d->next;
+					free(info_d);
+				}
+				free(this);
+				this = rest;
+				break;
+			}
+			memset(info_d, 0, sizeof(*info_d));
+			info_d->next = this->devs;
+			this->devs = info_d;
+
+			info_d->disk.number = d->index;
+			info_d->disk.major = d->major;
+			info_d->disk.minor = d->minor;
+			info_d->disk.raid_disk = slot;
+
+			this->array.working_disks++;
+
+			info_d->events = __le32_to_cpu(mpb->generation_num);
+			info_d->data_offset = __le32_to_cpu(map->pba_of_lba0);
+			info_d->component_size = __le32_to_cpu(map->blocks_per_member);
+			if (d->devname)	/* NOTE(review): unbounded copy, confirm name[] is large enough */
+				strcpy(info_d->name, d->devname);
+		}
+		rest = this;
+	}
+
+	return rest;
+}
+
+
+#ifndef MDASSEMBLE
+/* Bind 'a' to the subarray numbered by 'inst'; returns -ENODEV if out of range */
+static int imsm_open_new(struct supertype *c, struct active_array *a,
+			 char *inst)
+{
+	struct intel_super *super = c->sb;
+	struct imsm_super *mpb = super->anchor;
+	int member = atoi(inst);
+
+	if (member >= mpb->num_raid_devs) {
+		fprintf(stderr, "%s: subarry index %d, out of range\n", __func__, member);
+		return -ENODEV;
+	}
+	dprintf("imsm: open_new %s\n", inst);
+	a->info.container_member = member;
+	return 0;
+}
+
+/* Map a failed-member count onto an IMSM_T_STATE_* value for the volume's raid level */
+static __u8 imsm_check_degraded(struct intel_super *super, struct imsm_dev *dev, int failed)
+{
+	struct imsm_map *map = get_imsm_map(dev, 0);
+
+	if (!failed)
+		return map->map_state == IMSM_T_STATE_UNINITIALIZED ?
+			IMSM_T_STATE_UNINITIALIZED : IMSM_T_STATE_NORMAL;
+
+	switch (get_imsm_raid_level(map)) {
+	case 0:
+		return IMSM_T_STATE_FAILED;	/* any failure is fatal at this level */
+		break;
+	case 1:
+		if (failed < map->num_members)
+			return IMSM_T_STATE_DEGRADED;
+		else
+			return IMSM_T_STATE_FAILED;
+		break;
+	case 10:
+	{
+		/**
+		 * check to see if any mirrors have failed, otherwise we
+		 * are degraded.  Even numbered slots are mirrored on
+		 * slot+1
+		 */
+		int i;
+		/* self-initialised only to silence a gcc -Os warning */
+		int insync = insync;
+
+		for (i = 0; i < map->num_members; i++) {
+			__u32 ord = get_imsm_ord_tbl_ent(dev, i);
+			int idx = ord_to_idx(ord);
+			struct imsm_disk *disk;
+
+			/* reset the potential in-sync count on even-numbered
+			 * slots.  num_copies is always 2 for imsm raid10
+			 */
+			if ((i & 1) == 0)
+				insync = 2;
+
+			disk = get_imsm_disk(super, idx);
+			if (!disk || disk->status & FAILED_DISK ||
+			    ord & IMSM_ORD_REBUILD)
+				insync--;
+
+			/* no in-sync disks left in this mirror the
+			 * array has failed
+			 */
+			if (insync == 0)
+				return IMSM_T_STATE_FAILED;
+		}
+
+		return IMSM_T_STATE_DEGRADED;
+	}
+	case 5:
+		if (failed < 2)
+			return IMSM_T_STATE_DEGRADED;
+		else
+			return IMSM_T_STATE_FAILED;
+		break;
+	default:
+		break;
+	}
+	return map->map_state;	/* other levels: leave the recorded state alone */
+}
+
+/* Count the members that are absent, marked FAILED_DISK or flagged for rebuild */
+static int imsm_count_failed(struct intel_super *super, struct imsm_dev *dev)
+{
+	int i;
+	int failed = 0;
+	struct imsm_disk *disk;
+	struct imsm_map *map = get_imsm_map(dev, 0);
+	struct imsm_map *prev = get_imsm_map(dev, dev->vol.migr_state);
+	__u32 ord;
+	int idx;
+
+	/* at the beginning of migration we set IMSM_ORD_REBUILD on
+	 * disks that are being rebuilt.  New failures are recorded to
+	 * map[0].  So we look through all the disks we started with and
+	 * see if any failures are still present, or if any new ones
+	 * have arrived
+	 *
+	 * FIXME add support for online capacity expansion and
+	 * raid-level-migration
+	 */
+	for (i = 0; i < prev->num_members; i++) {
+		ord = __le32_to_cpu(prev->disk_ord_tbl[i]);
+		ord |= __le32_to_cpu(map->disk_ord_tbl[i]);	/* flags from both maps */
+		idx = ord_to_idx(ord);
+
+		disk = get_imsm_disk(super, idx);
+		if (!disk || disk->status & FAILED_DISK ||
+		    ord & IMSM_ORD_REBUILD)
+			failed++;
+	}
+	return failed;
+}
+
+/* True while the volume is migrating and that migration is an initialisation,
+ * a repair, or any other migration whose second map is in the NORMAL state.
+ */
+static int is_resyncing(struct imsm_dev *dev)
+{
+	int type;
+
+	/* nothing is in flight */
+	if (dev->vol.migr_state == 0)
+		return 0;
+
+	type = migr_type(dev);
+	if (type == MIGR_INIT || type == MIGR_REPAIR)
+		return 1;
+
+	/* otherwise the state of the second map decides */
+	return get_imsm_map(dev, 1)->map_state == IMSM_T_STATE_NORMAL;
+}
+
+/* True while a rebuild is in progress: the volume is migrating, the migration
+ * type is MIGR_REBUILD and the second map is in the DEGRADED state.
+ */
+static int is_rebuilding(struct imsm_dev *dev)
+{
+	struct imsm_map *second_map;
+
+	/* only a migration of type MIGR_REBUILD qualifies */
+	if (dev->vol.migr_state == 0 || migr_type(dev) != MIGR_REBUILD)
+		return 0;
+
+	second_map = get_imsm_map(dev, 1);
+	if (second_map->map_state != IMSM_T_STATE_DEGRADED)
+		return 0;
+
+	return 1;
+}
+
+/* return true if we recorded new information */
+static int mark_failure(struct imsm_dev *dev, struct imsm_disk *disk, int idx)
+{
+	__u32 ord;
+	int slot;
+	struct imsm_map *map;
+
+	/* new failures are always set in map[0] */
+	map = get_imsm_map(dev, 0);
+
+	slot = get_imsm_disk_slot(map, idx);
+	if (slot < 0)
+		return 0;
+
+	ord = __le32_to_cpu(map->disk_ord_tbl[slot]);
+	if ((disk->status & FAILED_DISK) && (ord & IMSM_ORD_REBUILD))
+		return 0;
+	disk->status |= FAILED_DISK;
+	set_imsm_ord_tbl_ent(map, slot, idx | IMSM_ORD_REBUILD);
+	/* compare in the field's own type: a narrow field promoted to int never equals ~0 */
+	if (map->failed_disk_num == (__typeof__(map->failed_disk_num))~0)
+		map->failed_disk_num = slot;
+	return 1;
+}
+
+/* Fail the disk and, the first time only, rewrite its scsi_id and serial */
+static void mark_missing(struct imsm_dev *dev, struct imsm_disk *disk, int idx)
+{
+	mark_failure(dev, disk, idx);
+	/* an all-ones scsi_id means the rewrite below was already done */
+	if (disk->scsi_id != __cpu_to_le32(~(__u32)0)) {
+		disk->scsi_id = __cpu_to_le32(~(__u32)0);
+		memmove(&disk->serial[0], &disk->serial[1], MAX_RAID_SERIAL_LEN - 1);
+	}
+}
+
+/* Handle dirty -> clean transitions and resync.  Degraded and rebuild
+ * states are handled in imsm_set_disk() with one exception, when a
+ * resync is stopped due to a new failure this routine will set the
+ * 'degraded' state for the array.  Returns the resulting 'consistent' value.
+ */
+static int imsm_set_array_state(struct active_array *a, int consistent)
+{
+	int inst = a->info.container_member;
+	struct intel_super *super = a->container->sb;
+	struct imsm_dev *dev = get_imsm_dev(super, inst);
+	struct imsm_map *map = get_imsm_map(dev, 0);
+	int failed = imsm_count_failed(super, dev);
+	__u8 map_state = imsm_check_degraded(super, dev, failed);
+
+	/* before we activate this array handle any missing disks */
+	if (consistent == 2 && super->missing) {
+		struct dl *dl;
+
+		dprintf("imsm: mark missing\n");
+		end_migration(dev, map_state);
+		for (dl = super->missing; dl; dl = dl->next)
+			mark_missing(dev, &dl->disk, dl->index);
+		super->updates_pending++;
+	}
+
+	/* 2 is only kept if resync is complete, the map is NORMAL and nothing migrates */
+	if (consistent == 2 &&
+	    (!is_resync_complete(a) ||
+	     map_state != IMSM_T_STATE_NORMAL ||
+	     dev->vol.migr_state))
+		consistent = 0;
+
+	if (is_resync_complete(a)) {
+		/* complete initialization / resync,
+		 * recovery and interrupted recovery is completed in
+		 * ->set_disk
+		 */
+		if (is_resyncing(dev)) {
+			dprintf("imsm: mark resync done\n");
+			end_migration(dev, map_state);
+			super->updates_pending++;
+		}
+	} else if (!is_resyncing(dev) && !failed) {
+		/* mark the start of the init process if nothing is failed */
+		dprintf("imsm: mark resync start (%llu)\n", a->resync_start);
+		if (map->map_state == IMSM_T_STATE_UNINITIALIZED)
+			migrate(dev, IMSM_T_STATE_NORMAL, MIGR_INIT);
+		else
+			migrate(dev, IMSM_T_STATE_NORMAL, MIGR_REPAIR);
+		super->updates_pending++;
+	}
+
+	/* FIXME check if we can update curr_migr_unit from resync_start */
+
+	/* mark dirty / clean; the flag is the inverse of 'consistent' */
+	if (dev->vol.dirty != !consistent) {
+		dprintf("imsm: mark '%s' (%llu)\n",
+			consistent ? "clean" : "dirty", a->resync_start);
+		if (consistent)
+			dev->vol.dirty = 0;
+		else
+			dev->vol.dirty = 1;
+		super->updates_pending++;
+	}
+	return consistent;
+}
+
+/* Fold the new state bits of slot 'n' of array 'a' into the in-memory metadata */
+static void imsm_set_disk(struct active_array *a, int n, int state)
+{
+	int inst = a->info.container_member;
+	struct intel_super *super = a->container->sb;
+	struct imsm_dev *dev = get_imsm_dev(super, inst);
+	struct imsm_map *map = get_imsm_map(dev, 0);
+	struct imsm_disk *disk;
+	int failed;
+	__u32 ord;
+	__u8 map_state;
+
+	if (n >= map->num_members)
+		fprintf(stderr, "imsm: set_disk %d out of range 0..%d\n",
+			n, map->num_members - 1);
+	if (n < 0 || n >= map->num_members)
+		return;	/* never index the ord table with an invalid slot */
+
+	dprintf("imsm: set_disk %d:%x\n", n, state);
+
+	ord = get_imsm_ord_tbl_ent(dev, n);
+	disk = get_imsm_disk(super, ord_to_idx(ord));
+
+	/* check for new failures */
+	if (state & DS_FAULTY) {
+		if (mark_failure(dev, disk, ord_to_idx(ord)))
+			super->updates_pending++;
+	}
+
+	/* check if in_sync */
+	if (state & DS_INSYNC && ord & IMSM_ORD_REBUILD && is_rebuilding(dev)) {
+		struct imsm_map *migr_map = get_imsm_map(dev, 1);
+
+		set_imsm_ord_tbl_ent(migr_map, n, ord_to_idx(ord));
+		super->updates_pending++;
+	}
+
+	failed = imsm_count_failed(super, dev);
+	map_state = imsm_check_degraded(super, dev, failed);
+
+	/* check if recovery complete, newly degraded, or failed */
+	if (map_state == IMSM_T_STATE_NORMAL && is_rebuilding(dev)) {
+		end_migration(dev, map_state);
+		map = get_imsm_map(dev, 0);
+		map->failed_disk_num = ~0;
+		super->updates_pending++;
+	} else if (map_state == IMSM_T_STATE_DEGRADED &&
+		   map->map_state != map_state &&
+		   !dev->vol.migr_state) {
+		dprintf("imsm: mark degraded\n");
+		map->map_state = map_state;
+		super->updates_pending++;
+	} else if (map_state == IMSM_T_STATE_FAILED &&
+		   map->map_state != map_state) {
+		dprintf("imsm: mark failed\n");
+		end_migration(dev, map_state);
+		super->updates_pending++;
+	}
+}
+
+static int store_imsm_mpb(int fd, struct intel_super *super)
+{
+ struct imsm_super *mpb = super->anchor;
+ __u32 mpb_size = __le32_to_cpu(mpb->mpb_size);
+ unsigned long long dsize;
+ unsigned long long sectors;
+
+ get_dev_size(fd, NULL, &dsize);
+
+ if (mpb_size > 512) {
+ /* -1 to account for anchor */
+ sectors = mpb_sectors(mpb) - 1;
+
+ /* write the extended mpb to the sectors preceeding the anchor */
+ if (lseek64(fd, dsize - (512 * (2 + sectors)), SEEK_SET) < 0)
+ return 1;
+
+ if (write(fd, super->buf + 512, 512 * sectors) != 512 * sectors)
+ return 1;
+ }
+
+ /* first block is stored on second to last sector of the disk */
+ if (lseek64(fd, dsize - (512 * 2), SEEK_SET) < 0)
+ return 1;
+
+ if (write(fd, super->buf, 512) != 512)
+ return 1;
+
+ return 0;
+}
+
+/* Write the metadata out if any change is pending, then clear the counter */
+static void imsm_sync_metadata(struct supertype *container)
+{
+	struct intel_super *super = container->sb;
+
+	if (super->updates_pending != 0) {
+		/* 0: leave the member disks open */
+		write_super_imsm(super, 0);
+		super->updates_pending = 0;
+	}
+}
+
+/* Find the disk recorded for slot 'idx' of array 'a', unless it is marked failed */
+static struct dl *imsm_readd(struct intel_super *super, int idx, struct active_array *a)
+{
+	struct imsm_dev *dev = get_imsm_dev(super, a->info.container_member);
+	int wanted = get_imsm_disk_idx(dev, idx);
+	struct dl *dl = super->disks;
+
+	while (dl && dl->index != wanted)
+		dl = dl->next;
+
+	/* absent, or present but failed: nothing to re-add */
+	if (!dl || (dl->disk.status & FAILED_DISK))
+		return NULL;
+
+	dprintf("%s: found %x:%x\n", __func__, dl->major, dl->minor);
+
+	return dl;
+}
+
+/* Pick a disk that can take over 'slot' of array 'a'; returns NULL if none fits */
+static struct dl *imsm_add_spare(struct intel_super *super, int slot,
+				 struct active_array *a, int activate_new)
+{
+	struct imsm_dev *dev = get_imsm_dev(super, a->info.container_member);
+	int idx = get_imsm_disk_idx(dev, slot);
+	struct imsm_super *mpb = super->anchor;
+	struct imsm_map *map;
+	unsigned long long esize;
+	unsigned long long pos;
+	struct mdinfo *d;
+	struct extent *ex;
+	int i, j;
+	int found;
+	__u32 array_start;
+	__u32 blocks;
+	struct dl *dl;
+
+	for (dl = super->disks; dl; dl = dl->next) {
+		/* If in this array, skip */
+		for (d = a->info.devs ; d ; d = d->next)
+			if (d->state_fd >= 0 &&
+			    d->disk.major == dl->major &&
+			    d->disk.minor == dl->minor) {
+				dprintf("%x:%x already in array\n", dl->major, dl->minor);
+				break;
+			}
+		if (d)
+			continue;
+
+		/* skip in use or failed drives */
+		if (dl->disk.status & FAILED_DISK || idx == dl->index ||
+		    dl->index == -2) {
+			dprintf("%x:%x status (failed: %d index: %d)\n",
+				dl->major, dl->minor,
+				(dl->disk.status & FAILED_DISK) == FAILED_DISK, idx);
+			continue;
+		}
+
+		/* skip pure spares when we are looking for partially
+		 * assimilated drives
+		 */
+		if (dl->index == -1 && !activate_new)
+			continue;
+
+		/* Does this unused device have the requisite free space?
+		 * It needs to be able to cover all member volumes
+		 */
+		ex = get_extents(super, dl);
+		if (!ex) {
+			dprintf("cannot get extents\n");
+			continue;
+		}
+		for (i = 0; i < mpb->num_raid_devs; i++) {
+			dev = get_imsm_dev(super, i);
+			map = get_imsm_map(dev, 0);
+
+			/* check if this disk is already a member of
+			 * this array
+			 */
+			if (get_imsm_disk_slot(map, dl->index) >= 0)
+				continue;
+
+			found = 0;
+			j = 0;
+			pos = 0;
+			array_start = __le32_to_cpu(map->pba_of_lba0);
+			blocks = __le32_to_cpu(map->blocks_per_member);
+
+			do {
+				/* check that we can start at pba_of_lba0 with
+				 * blocks_per_member of space
+				 */
+				esize = ex[j].start - pos;	/* NOTE(review): never read */
+				if (array_start >= pos &&
+				    array_start + blocks < ex[j].start) {
+					found = 1;
+					break;
+				}
+				pos = ex[j].start + ex[j].size;
+				j++;
+			} while (ex[j-1].size);
+
+			if (!found)
+				break;
+		}
+
+		free(ex);
+		if (i < mpb->num_raid_devs) {
+			dprintf("%x:%x does not have %u at %u\n",
+				dl->major, dl->minor,
+				blocks, array_start);
+			/* No room */
+			continue;
+		}
+		return dl;
+	}
+	return dl;	/* NULL here: the disk list was exhausted */
+}
+
+static struct mdinfo *imsm_activate_spare(struct active_array *a,
+ struct metadata_update **updates)
+{
+ /**
+ * Find a device with unused free space and use it to replace a
+ * failed/vacant region in an array. We replace failed regions one
+ * array at a time. The result is that a new spare disk will be added
+ * to the first failed array and after the monitor has finished
+ * propagating failures the remainder will be consumed.
+ *
+ * Returns a list of mdinfo records describing the device chosen for
+ * recovery and pushes a matching imsm_update_activate_spare record
+ * onto *updates. Returns NULL, leaving *updates untouched, when a
+ * faulty member still has an open state_fd, when
+ * imsm_check_degraded() does not report IMSM_T_STATE_DEGRADED, when
+ * no usable disk is found, or when allocating the update fails.
+ *
+ * FIXME add a capability for mdmon to request spares from another
+ * container.
+ */
+
+ struct intel_super *super = a->container->sb;
+ int inst = a->info.container_member;
+ struct imsm_dev *dev = get_imsm_dev(super, inst);
+ struct imsm_map *map = get_imsm_map(dev, 0);
+ int failed = a->info.array.raid_disks; /* start at "all failed", decremented per device with an open state_fd */
+ struct mdinfo *rv = NULL;
+ struct mdinfo *d;
+ struct mdinfo *di;
+ struct metadata_update *mu;
+ struct dl *dl;
+ struct imsm_update_activate_spare *u;
+ int num_spares = 0;
+ int i;
+
+ for (d = a->info.devs ; d ; d = d->next) {
+ if ((d->curr_state & DS_FAULTY) &&
+ d->state_fd >= 0)
+ /* wait for Removal to happen */
+ return NULL;
+ if (d->state_fd >= 0)
+ failed--;
+ }
+
+ dprintf("imsm: activate spare: inst=%d failed=%d (%d) level=%d\n",
+ inst, failed, a->info.array.raid_disks, a->info.array.level);
+ if (imsm_check_degraded(super, dev, failed) != IMSM_T_STATE_DEGRADED)
+ return NULL;
+
+ /* For each slot, if it is not working, find a spare */
+ for (i = 0; i < a->info.array.raid_disks; i++) {
+ for (d = a->info.devs ; d ; d = d->next)
+ if (d->disk.raid_disk == i)
+ break;
+ dprintf("found %d: %p %x\n", i, d, d?d->curr_state:0);
+ if (d && (d->state_fd >= 0))
+ continue;
+
+ /*
+ * OK, this device needs recovery. Try to re-add the
+ * previous occupant of this slot, if this fails see if
+ * we can continue the assimilation of a spare that was
+ * partially assimilated, finally try to activate a new
+ * spare.
+ */
+ dl = imsm_readd(super, i, a);
+ if (!dl)
+ dl = imsm_add_spare(super, i, a, 0);
+ if (!dl)
+ dl = imsm_add_spare(super, i, a, 1);
+ if (!dl)
+ continue;
+
+ /* found a usable disk with enough space */
+ di = malloc(sizeof(*di));
+ if (!di)
+ continue;
+ memset(di, 0, sizeof(*di));
+
+ /* dl->index will be -1 in the case we are activating a
+ * pristine spare. imsm_process_update() will create a
+ * new index in this case. Once a disk is found to be
+ * failed in all member arrays it is kicked from the
+ * metadata
+ */
+ di->disk.number = dl->index;
+
+ /* (ab)use di->devs to store a pointer to the device
+ * we chose; it is moved into the update record and
+ * reset to NULL further down
+ */
+ di->devs = (struct mdinfo *) dl;
+
+ di->disk.raid_disk = i;
+ di->disk.major = dl->major;
+ di->disk.minor = dl->minor;
+ di->disk.state = 0;
+ di->data_offset = __le32_to_cpu(map->pba_of_lba0);
+ di->component_size = a->info.component_size;
+ di->container_member = inst;
+ di->next = rv;
+ rv = di;
+ num_spares++;
+ dprintf("%x:%x to be %d at %llu\n", dl->major, dl->minor,
+ i, di->data_offset);
+
+ break; /* stop after the first slot that received a disk */
+ }
+
+ if (!rv)
+ /* No spares found */
+ return rv;
+ /* Now 'rv' has a list of devices to return.
+ * Create a metadata_update record to update the
+ * disk_ord_tbl for the array
+ */
+ mu = malloc(sizeof(*mu));
+ if (mu) {
+ mu->buf = malloc(sizeof(struct imsm_update_activate_spare) * num_spares);
+ if (mu->buf == NULL) {
+ free(mu);
+ mu = NULL;
+ }
+ }
+ if (!mu) {
+ while (rv) {
+ struct mdinfo *n = rv->next;
+
+ free(rv);
+ rv = n;
+ }
+ return NULL;
+ }
+
+ mu->space = NULL;
+ mu->len = sizeof(struct imsm_update_activate_spare) * num_spares;
+ mu->next = *updates;
+ u = (struct imsm_update_activate_spare *) mu->buf;
+
+ for (di = rv ; di ; di = di->next) {
+ u->type = update_activate_spare;
+ u->dl = (struct dl *) di->devs;
+ di->devs = NULL;
+ u->slot = di->disk.raid_disk;
+ u->array = inst;
+ u->next = u + 1; /* records are laid out back to back in mu->buf */
+ u++;
+ }
+ (u-1)->next = NULL; /* terminate the chain at the last record written */
+ *updates = mu;
+
+ return rv;
+}
+
+/* Report whether the array proposed by update 'u' shares a disk with the
+ * existing array number 'idx'.
+ *
+ * Disks are matched by serial number: each member of the existing array is
+ * compared against each disk_info entry carried in the update.
+ *
+ * Returns 1 if at least one serial matches, 0 otherwise.
+ */
+static int disks_overlap(struct intel_super *super, int idx, struct imsm_update_create_array *u)
+{
+	struct imsm_dev *dev = get_imsm_dev(super, idx);
+	struct imsm_map *map = get_imsm_map(dev, 0);
+	struct imsm_map *new_map = get_imsm_map(&u->dev, 0);
+	struct disk_info *inf = get_disk_info(u);
+	struct imsm_disk *disk;
+	int i;
+	int j;
+
+	for (i = 0; i < map->num_members; i++) {
+		disk = get_imsm_disk(super, get_imsm_disk_idx(dev, i));
+		/* get_imsm_disk() may return NULL (imsm_process_update()
+		 * checks for that too); a member without a disk record has
+		 * no serial to compare, so it cannot overlap
+		 */
+		if (!disk)
+			continue;
+		for (j = 0; j < new_map->num_members; j++)
+			if (serialcmp(disk->serial, inf[j].serial) == 0)
+				return 1;
+	}
+
+	return 0;
+}
+
+static void imsm_delete(struct intel_super *super, struct dl **dlp, int index);
+
+static void imsm_process_update(struct supertype *st,
+ struct metadata_update *update)
+{
+ /**
+ * crack open the metadata_update envelope to find the update record
+ * update can be one of:
+ * update_activate_spare - a spare device has replaced a failed
+ * device in an array: update the disk_ord_tbl, mark the disk
+ * CONFIGURED_DISK and clear its SPARE_DISK flag, start a
+ * rebuild migration, and delete the replaced disk from the
+ * metadata if no array references it any more
+ * update_create_array - validate and add a new raid device,
+ * converting any pristine spares it uses into members
+ * update_add_disk - move every disk queued on super->add onto
+ * super->disks and ask all arrays to re-check for degradation
+ *
+ * Before dispatching, a larger metadata buffer prepared in
+ * super->next_buf (if any) is swapped in; if one was required but
+ * could not be allocated the update is dropped.
+ */
+ struct intel_super *super = st->sb;
+ struct imsm_super *mpb;
+ enum imsm_update_type type = *(enum imsm_update_type *) update->buf; /* type is read from the start of the buffer */
+
+ /* update requires a larger buf but the allocation failed */
+ if (super->next_len && !super->next_buf) {
+ super->next_len = 0;
+ return;
+ }
+
+ if (super->next_buf) {
+ memcpy(super->next_buf, super->buf, super->len);
+ free(super->buf);
+ super->len = super->next_len;
+ super->buf = super->next_buf;
+
+ super->next_len = 0;
+ super->next_buf = NULL;
+ }
+
+ mpb = super->anchor;
+
+ switch (type) {
+ case update_activate_spare: {
+ struct imsm_update_activate_spare *u = (void *) update->buf;
+ struct imsm_dev *dev = get_imsm_dev(super, u->array);
+ struct imsm_map *map = get_imsm_map(dev, 0);
+ struct imsm_map *migr_map;
+ struct active_array *a;
+ struct imsm_disk *disk;
+ __u8 to_state;
+ struct dl *dl;
+ unsigned int found;
+ int failed;
+ int victim = get_imsm_disk_idx(dev, u->slot); /* disk index currently recorded for the slot being replaced */
+ int i;
+
+ for (dl = super->disks; dl; dl = dl->next)
+ if (dl == u->dl)
+ break;
+
+ if (!dl) {
+ fprintf(stderr, "error: imsm_activate_spare passed "
+ "an unknown disk (index: %d)\n",
+ u->dl->index);
+ return;
+ }
+
+ super->updates_pending++;
+
+ /* count failures (excluding rebuilds and the victim)
+ * to determine map[0] state
+ */
+ failed = 0;
+ for (i = 0; i < map->num_members; i++) {
+ if (i == u->slot)
+ continue;
+ disk = get_imsm_disk(super, get_imsm_disk_idx(dev, i));
+ if (!disk || disk->status & FAILED_DISK)
+ failed++;
+ }
+
+ /* adding a pristine spare, assign a new index */
+ if (dl->index < 0) {
+ dl->index = super->anchor->num_disks;
+ super->anchor->num_disks++;
+ }
+ disk = &dl->disk;
+ disk->status |= CONFIGURED_DISK;
+ disk->status &= ~SPARE_DISK;
+
+ /* mark rebuild */
+ to_state = imsm_check_degraded(super, dev, failed);
+ map->map_state = IMSM_T_STATE_DEGRADED;
+ migrate(dev, to_state, MIGR_REBUILD);
+ migr_map = get_imsm_map(dev, 1);
+ set_imsm_ord_tbl_ent(map, u->slot, dl->index);
+ set_imsm_ord_tbl_ent(migr_map, u->slot, dl->index | IMSM_ORD_REBUILD);
+
+ /* count arrays using the victim in the metadata */
+ found = 0;
+ for (a = st->arrays; a ; a = a->next) {
+ dev = get_imsm_dev(super, a->info.container_member);
+ map = get_imsm_map(dev, 0);
+
+ if (get_imsm_disk_slot(map, victim) >= 0)
+ found++;
+ }
+
+ /* delete the victim if it is no longer being
+ * utilized anywhere
+ */
+ if (!found) {
+ struct dl **dlp;
+
+ /* We know that 'manager' isn't touching anything,
+ * so it is safe to delete
+ */
+ for (dlp = &super->disks; *dlp; dlp = &(*dlp)->next)
+ if ((*dlp)->index == victim)
+ break;
+
+ /* victim may be on the missing list */
+ if (!*dlp)
+ for (dlp = &super->missing; *dlp; dlp = &(*dlp)->next)
+ if ((*dlp)->index == victim)
+ break;
+ imsm_delete(super, dlp, victim);
+ }
+ break;
+ }
+ case update_create_array: {
+ /* someone wants to create a new array, we need to be aware of
+ * a few races/collisions:
+ * 1/ 'Create' called by two separate instances of mdadm
+ * 2/ 'Create' versus 'activate_spare': mdadm has chosen
+ * devices that have since been assimilated via
+ * activate_spare.
+ * In the event this update can not be carried out mdadm will
+ * (FIX ME) notice that its update did not take hold.
+ */
+ struct imsm_update_create_array *u = (void *) update->buf;
+ struct intel_dev *dv;
+ struct imsm_dev *dev;
+ struct imsm_map *map, *new_map;
+ unsigned long long start, end;
+ unsigned long long new_start, new_end;
+ int i;
+ struct disk_info *inf;
+ struct dl *dl;
+
+ /* handle racing creates: first come first serve */
+ if (u->dev_idx < mpb->num_raid_devs) {
+ dprintf("%s: subarray %d already defined\n",
+ __func__, u->dev_idx);
+ goto create_error;
+ }
+
+ /* check update is next in sequence */
+ if (u->dev_idx != mpb->num_raid_devs) {
+ dprintf("%s: can not create array %d expected index %d\n",
+ __func__, u->dev_idx, mpb->num_raid_devs);
+ goto create_error;
+ }
+
+ new_map = get_imsm_map(&u->dev, 0);
+ new_start = __le32_to_cpu(new_map->pba_of_lba0);
+ new_end = new_start + __le32_to_cpu(new_map->blocks_per_member);
+ inf = get_disk_info(u);
+
+ /* handle activate_spare versus create race:
+ * check to make sure that overlapping arrays do not include
+ * overlapping disks
+ * NOTE(review): 'end' is start + blocks_per_member, so the '<='
+ * comparisons below also treat exactly-adjacent ranges as
+ * overlapping - confirm that this is intended
+ */
+ for (i = 0; i < mpb->num_raid_devs; i++) {
+ dev = get_imsm_dev(super, i);
+ map = get_imsm_map(dev, 0);
+ start = __le32_to_cpu(map->pba_of_lba0);
+ end = start + __le32_to_cpu(map->blocks_per_member);
+ if ((new_start >= start && new_start <= end) ||
+ (start >= new_start && start <= new_end))
+ /* overlap */;
+ else
+ continue;
+
+ if (disks_overlap(super, i, u)) {
+ dprintf("%s: arrays overlap\n", __func__);
+ goto create_error;
+ }
+ }
+
+ /* check that prepare update was successful */
+ if (!update->space) {
+ dprintf("%s: prepare update failed\n", __func__);
+ goto create_error;
+ }
+
+ /* check that all disks are still active before committing
+ * changes. FIXME: could we instead handle this by creating a
+ * degraded array? That's probably not what the user expects,
+ * so better to drop this update on the floor.
+ */
+ for (i = 0; i < new_map->num_members; i++) {
+ dl = serial_to_dl(inf[i].serial, super);
+ if (!dl) {
+ dprintf("%s: disk disappeared\n", __func__);
+ goto create_error;
+ }
+ }
+
+ super->updates_pending++;
+
+ /* convert spares to members and fixup ord_tbl */
+ for (i = 0; i < new_map->num_members; i++) {
+ dl = serial_to_dl(inf[i].serial, super);
+ if (dl->index == -1) {
+ dl->index = mpb->num_disks;
+ mpb->num_disks++;
+ dl->disk.status |= CONFIGURED_DISK;
+ dl->disk.status &= ~SPARE_DISK;
+ }
+ set_imsm_ord_tbl_ent(new_map, i, dl->index);
+ }
+
+ dv = update->space; /* intel_dev allocated by imsm_prepare_update() */
+ dev = dv->dev;
+ update->space = NULL; /* the entry is linked into super->devlist below */
+ imsm_copy_dev(dev, &u->dev);
+ dv->index = u->dev_idx;
+ dv->next = super->devlist;
+ super->devlist = dv;
+ mpb->num_raid_devs++;
+
+ imsm_update_version_info(super);
+ break;
+ create_error:
+ /* mdmon knows how to release update->space, but not
+ * ((struct intel_dev *) update->space)->dev
+ */
+ if (update->space) {
+ dv = update->space;
+ free(dv->dev);
+ }
+ break;
+ }
+ case update_add_disk:
+
+ /* we may be able to repair some arrays if disks are
+ * being added */
+ if (super->add) {
+ struct active_array *a;
+
+ super->updates_pending++;
+ for (a = st->arrays; a; a = a->next)
+ a->check_degraded = 1;
+ }
+ /* add some spares to the metadata */
+ while (super->add) {
+ struct dl *al;
+
+ al = super->add;
+ super->add = al->next;
+ al->next = super->disks;
+ super->disks = al;
+ dprintf("%s: added %x:%x\n",
+ __func__, al->major, al->minor);
+ }
+
+ break;
+ }
+}
+
+static void imsm_prepare_update(struct supertype *st,
+				struct metadata_update *update)
+{
+	/**
+	 * Allocate space to hold new disk entries, raid-device entries or a new
+	 * mpb if necessary.  The manager synchronously waits for updates to
+	 * complete in the monitor, so new mpb buffers allocated here can be
+	 * integrated by the monitor thread without worrying about live pointers
+	 * in the manager thread.
+	 *
+	 * Nothing is returned; results are handed over through side effects:
+	 *  - update->space: for update_create_array, a new intel_dev with its
+	 *    ->dev buffer allocated, or NULL if either allocation failed
+	 *    (imsm_process_update() treats NULL as "prepare failed").
+	 *  - super->next_buf / super->next_len: a zeroed, 512-byte aligned
+	 *    buffer when the current metadata buffer is too small.  next_len
+	 *    set with next_buf == NULL tells imsm_process_update() that the
+	 *    allocation failed.
+	 */
+	enum imsm_update_type type = *(enum imsm_update_type *) update->buf;
+	struct intel_super *super = st->sb;
+	struct imsm_super *mpb = super->anchor;
+	size_t buf_len;
+	size_t len = 0;	/* extra metadata bytes this update may need */
+
+	switch (type) {
+	case update_create_array: {
+		struct imsm_update_create_array *u = (void *) update->buf;
+		struct intel_dev *dv;
+		struct imsm_dev *dev = &u->dev;
+		struct imsm_map *map = get_imsm_map(dev, 0);
+		struct dl *dl;
+		struct disk_info *inf;
+		int i;
+		int activate = 0;
+
+		inf = get_disk_info(u);
+		len = sizeof_imsm_dev(dev, 1);
+
+		/* allocate a new super->devlist entry; ->space is only
+		 * non-NULL when both allocations succeeded, so a failure of
+		 * the first malloc can no longer leave a stale value behind
+		 */
+		update->space = NULL;
+		dv = malloc(sizeof(*dv));
+		if (dv) {
+			dv->dev = malloc(len);
+			if (dv->dev)
+				update->space = dv;
+			else
+				free(dv);
+		}
+
+		/* count how many spares will be converted to members */
+		for (i = 0; i < map->num_members; i++) {
+			dl = serial_to_dl(inf[i].serial, super);
+			if (!dl) {
+				/* hmm maybe it failed?, nothing we can do about
+				 * it here
+				 */
+				continue;
+			}
+			if (count_memberships(dl, super) == 0)
+				activate++;
+		}
+		len += activate * sizeof(struct imsm_disk);
+		break;
+	}
+	default:
+		break;
+	}
+
+	/* check if we need a larger metadata buffer */
+	if (super->next_buf)
+		buf_len = super->next_len;
+	else
+		buf_len = super->len;
+
+	if (__le32_to_cpu(mpb->mpb_size) + len > buf_len) {
+		/* ok we need a larger buf than what is currently allocated
+		 * if this allocation fails process_update will notice that
+		 * ->next_len is set and ->next_buf is NULL
+		 */
+		buf_len = ROUND_UP(__le32_to_cpu(mpb->mpb_size) + len, 512);
+		free(super->next_buf);	/* free(NULL) is a no-op */
+
+		super->next_len = buf_len;
+		if (posix_memalign(&super->next_buf, 512, buf_len) == 0)
+			memset(super->next_buf, 0, buf_len);
+		else
+			super->next_buf = NULL;
+	}
+}
+
+/* must be called while manager is quiesced
+ *
+ * Drop disk number 'index' from the metadata: every disk index above it
+ * (on both the disks and the missing lists, and in the ord tables of all
+ * raid devices) is shifted down by one, the disk count is decremented and,
+ * when *dlp is not NULL, that entry is unlinked from its list and freed.
+ */
+static void imsm_delete(struct intel_super *super, struct dl **dlp, int index)
+{
+	struct imsm_super *mpb = super->anchor;
+	struct dl *cur;
+	struct dl *gone;
+	int dev_nr;
+
+	dprintf("%s: deleting device[%d] from imsm_super\n",
+		__func__, index);
+
+	/* shift all indexes down one */
+	cur = super->disks;
+	while (cur) {
+		if (cur->index > index)
+			cur->index--;
+		cur = cur->next;
+	}
+	cur = super->missing;
+	while (cur) {
+		if (cur->index > index)
+			cur->index--;
+		cur = cur->next;
+	}
+
+	for (dev_nr = 0; dev_nr < mpb->num_raid_devs; dev_nr++) {
+		struct imsm_dev *dev = get_imsm_dev(super, dev_nr);
+		struct imsm_map *map = get_imsm_map(dev, 0);
+		int members = map->num_members;
+		int slot;
+
+		for (slot = 0; slot < members; slot++) {
+			/* update ord entries being careful not to propagate
+			 * ord-flags to the first map
+			 */
+			__u32 ord = get_imsm_ord_tbl_ent(dev, slot);
+
+			if (ord_to_idx(ord) > index) {
+				map = get_imsm_map(dev, 0);
+				set_imsm_ord_tbl_ent(map, slot, ord_to_idx(ord - 1));
+				map = get_imsm_map(dev, 1);
+				if (map)
+					set_imsm_ord_tbl_ent(map, slot, ord - 1);
+			}
+		}
+	}
+
+	mpb->num_disks--;
+	super->updates_pending++;
+
+	/* unlink and release the list entry, if the caller found one */
+	gone = *dlp;
+	if (gone) {
+		*dlp = gone->next;
+		__free_imsm_disk(gone);
+	}
+}
+#endif /* MDASSEMBLE */
+
+/* Method table for the "imsm" external metadata format.  Entries inside
+ * the MDASSEMBLE guards are left out of builds that define MDASSEMBLE.
+ */
+struct superswitch super_imsm = {
+#ifndef MDASSEMBLE
+	.examine_super = examine_super_imsm,
+	.brief_examine_super = brief_examine_super_imsm,
+	.export_examine_super = export_examine_super_imsm,
+	.detail_super = detail_super_imsm,
+	.brief_detail_super = brief_detail_super_imsm,
+	.write_init_super = write_init_super_imsm,
+	.validate_geometry = validate_geometry_imsm,
+	.add_to_super = add_to_super_imsm,
+	.detail_platform = detail_platform_imsm,
+#endif
+	.match_home = match_home_imsm,
+	.uuid_from_super = uuid_from_super_imsm,
+	.getinfo_super = getinfo_super_imsm,
+	.update_super = update_super_imsm,
+
+	.avail_size = avail_size_imsm,
+
+	.compare_super = compare_super_imsm,
+
+	.load_super = load_super_imsm,
+	.init_super = init_super_imsm,
+	.store_super = store_zero_imsm,
+	.free_super = free_super_imsm,
+	.match_metadata_desc = match_metadata_desc_imsm,
+	.container_content = container_content_imsm,
+	.default_layout = imsm_level_to_layout,
+
+	.external = 1,
+	.name = "imsm",
+
+#ifndef MDASSEMBLE
+/* for mdmon */
+	.open_new = imsm_open_new,
+	/* .load_super is already set unconditionally above; the second,
+	 * identical initializer that used to be here was redundant
+	 */
+	.set_array_state = imsm_set_array_state,
+	.set_disk = imsm_set_disk,
+	.sync_metadata = imsm_sync_metadata,
+	.activate_spare = imsm_activate_spare,
+	.process_update = imsm_process_update,
+	.prepare_update = imsm_prepare_update,
+#endif /* MDASSEMBLE */
+};
/*
* mdadm - manage Linux "md" devices aka RAID arrays.
*
- * Copyright (C) 2001-2006 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
*
*
* This program is free software; you can redistribute it and/or modify
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
* Author: Neil Brown
- * Email: <neilb@cse.unsw.edu.au>
- * Paper: Neil Brown
- * School of Computer Science and Engineering
- * The University of New South Wales
- * Sydney, 2052
- * Australia
+ * Email: <neilb@suse.de>
*/
#define HAVE_STDINT_H 1
}
-void super0_swap_endian(struct mdp_superblock_s *sb)
+static void super0_swap_endian(struct mdp_superblock_s *sb)
{
/* as super0 superblocks are host-endian, it is sometimes
* useful to be able to swap the endianness
}
}
-static void brief_examine_super0(struct supertype *st)
+static void brief_examine_super0(struct supertype *st, int verbose)
{
mdp_super_t *sb = st->sb;
char *c=map_num(pers, sb->level);
sprintf(devname, "/dev/md%d", sb->md_minor);
- printf("ARRAY %s level=%s num-devices=%d UUID=",
- devname,
- c?c:"-unknown-", sb->raid_disks);
+ if (verbose) {
+ printf("ARRAY %s level=%s num-devices=%d",
+ devname,
+ c?c:"-unknown-", sb->raid_disks);
+ } else
+ printf("ARRAY %s", devname);
+
if (sb->minor_version >= 90)
- printf("%08x:%08x:%08x:%08x", sb->set_uuid0, sb->set_uuid1,
+ printf(" UUID=%08x:%08x:%08x:%08x", sb->set_uuid0, sb->set_uuid1,
sb->set_uuid2, sb->set_uuid3);
else
- printf("%08x", sb->set_uuid0);
+ printf(" UUID=%08x", sb->set_uuid0);
printf("\n");
}
else
printf("%08x", sb->set_uuid0);
}
-
-static void export_detail_super0(struct supertype *st)
-{
- mdp_super_t *sb = st->sb;
- printf("MD_UUID=");
- if (sb->minor_version >= 90)
- printf("%08x:%08x:%08x:%08x", sb->set_uuid0, sb->set_uuid1,
- sb->set_uuid2, sb->set_uuid3);
- else
- printf("%08x", sb->set_uuid0);
- printf("\n");
-}
#endif
static int match_home0(struct supertype *st, char *homehost)
info->events = md_event(sb);
info->data_offset = 0;
+ sprintf(info->text_version, "0.%d", sb->minor_version);
+ info->safe_mode_delay = 200;
+
uuid_from_super0(st, info->uuid);
if (sb->minor_version > 90 && (sb->reshape_position+1) != 0) {
unsigned long long size, char *ignored_name, char *homehost,
int *uuid)
{
- mdp_super_t *sb = malloc(MD_SB_BYTES + sizeof(bitmap_super_t));
+ mdp_super_t *sb;
int spares;
+
+ if (posix_memalign((void**)&sb, 4096,
+ MD_SB_BYTES + ROUND_UP(sizeof(bitmap_super_t), 4096)) != 0) {
+ fprintf(stderr, Name ": %s could not allocate superblock\n", __func__);
+ return 0;
+ }
memset(sb, 0, MD_SB_BYTES + sizeof(bitmap_super_t));
st->sb = sb;
- if (info->major_version == -1) {
+ if (info == NULL) {
/* zeroing the superblock */
return 0;
}
return 1;
}
+struct devinfo { /* one device queued by add_to_super0(); the list hangs off st->info */
+ int fd; /* descriptor write_init_super0() writes the superblock to, then closes and sets to -1 */
+ char *devname; /* caller's pointer is stored as-is (not copied); used in error messages */
+ mdu_disk_info_t disk; /* copy of the disk info passed to add_to_super0() */
+ struct devinfo *next; /* next queued device, NULL at the tail */
+};
+
+#ifndef MDASSEMBLE
/* Add a device to the superblock being created */
-static void add_to_super0(struct supertype *st, mdu_disk_info_t *dinfo)
+static int add_to_super0(struct supertype *st, mdu_disk_info_t *dinfo,
+ int fd, char *devname)
{
mdp_super_t *sb = st->sb;
mdp_disk_t *dk = &sb->disks[dinfo->number];
+ struct devinfo *di, **dip;
dk->number = dinfo->number;
dk->major = dinfo->major;
dk->minor = dinfo->minor;
dk->raid_disk = dinfo->raid_disk;
dk->state = dinfo->state;
+
+ sb->this_disk = sb->disks[dinfo->number];
+ sb->sb_csum = calc_sb0_csum(sb);
+
+ dip = (struct devinfo **)&st->info;
+ while (*dip)
+ dip = &(*dip)->next;
+ di = malloc(sizeof(struct devinfo));
+ di->fd = fd;
+ di->devname = devname;
+ di->disk = *dinfo;
+ di->next = NULL;
+ *dip = di;
+
+ return 0;
}
+#endif
static int store_super0(struct supertype *st, int fd)
{
if (super->state & (1<<MD_SB_BITMAP_PRESENT)) {
struct bitmap_super_s * bm = (struct bitmap_super_s*)(super+1);
if (__le32_to_cpu(bm->magic) == BITMAP_MAGIC)
- if (write(fd, bm, sizeof(*bm)) != sizeof(*bm))
+ if (write(fd, bm, ROUND_UP(sizeof(*bm),4096)) !=
+ ROUND_UP(sizeof(*bm),4096))
return 5;
}
return 0;
}
-static int write_init_super0(struct supertype *st,
- mdu_disk_info_t *dinfo, char *devname)
+#ifndef MDASSEMBLE
+static int write_init_super0(struct supertype *st)
{
mdp_super_t *sb = st->sb;
- int fd = dev_open(devname, O_RDWR|O_EXCL);
- int rv;
+ int rv = 0;
+ struct devinfo *di;
- if (fd < 0) {
- fprintf(stderr, Name ": Failed to open %s to write superblock\n", devname);
- return -1;
- }
+ for (di = st->info ; di && ! rv ; di = di->next) {
- sb->disks[dinfo->number].state &= ~(1<<MD_DISK_FAULTY);
+ if (di->disk.state == 1)
+ continue;
+ if (di->fd == -1)
+ continue;
+ Kill(di->devname, 0, 1, 1);
+ Kill(di->devname, 0, 1, 1);
- sb->this_disk = sb->disks[dinfo->number];
- sb->sb_csum = calc_sb0_csum(sb);
- rv = store_super0(st, fd);
+ sb->disks[di->disk.number].state &= ~(1<<MD_DISK_FAULTY);
- if (rv == 0 && (sb->state & (1<<MD_SB_BITMAP_PRESENT)))
- rv = st->ss->write_bitmap(st, fd);
+ sb->this_disk = sb->disks[di->disk.number];
+ sb->sb_csum = calc_sb0_csum(sb);
+ rv = store_super0(st, di->fd);
- close(fd);
- if (rv)
- fprintf(stderr, Name ": failed to write superblock to %s\n", devname);
+ if (rv == 0 && (sb->state & (1<<MD_SB_BITMAP_PRESENT)))
+ rv = st->ss->write_bitmap(st, di->fd);
+
+ if (rv)
+ fprintf(stderr,
+ Name ": failed to write superblock to %s\n",
+ di->devname);
+ close(di->fd);
+ di->fd = -1;
+ }
return rv;
}
+#endif
static int compare_super0(struct supertype *st, struct supertype *tst)
{
if (second->md_magic != MD_SB_MAGIC)
return 1;
if (!first) {
- first = malloc(MD_SB_BYTES + sizeof(struct bitmap_super_s));
+ if (posix_memalign((void**)&first, 4096,
+ MD_SB_BYTES +
+ ROUND_UP(sizeof(struct bitmap_super_s), 4096)) != 0) {
+ fprintf(stderr, Name
+ ": %s could not allocate superblock\n", __func__);
+ return 1;
+ }
memcpy(first, second, MD_SB_BYTES + sizeof(struct bitmap_super_s));
st->sb = first;
return 0;
free_super0(st);
+ if (st->subarray[0])
+ return 1;
+
if (!get_dev_size(fd, devname, &dsize))
return 1;
return 1;
}
- super = malloc(MD_SB_BYTES + sizeof(bitmap_super_t));
+ if (posix_memalign((void**)&super, 4096,
+ MD_SB_BYTES +
+ ROUND_UP(sizeof(bitmap_super_t), 4096)) != 0) {
+ fprintf(stderr, Name
+ ": %s could not allocate superblock\n", __func__);
+ return 1;
+ }
if (read(fd, super, sizeof(*super)) != MD_SB_BYTES) {
if (devname)
st->ss = &super0;
st->minor_version = super->minor_version;
st->max_devs = MD_SB_DISKS;
+ st->info = NULL;
}
/* Now check on the bitmap superblock */
* valid. If it doesn't clear the bit. An --assemble --force
* should get that written out.
*/
- if (read(fd, super+1, sizeof(struct bitmap_super_s))
- != sizeof(struct bitmap_super_s))
+ if (read(fd, super+1, ROUND_UP(sizeof(struct bitmap_super_s),4096))
+ != ROUND_UP(sizeof(struct bitmap_super_s),4096))
goto no_bitmap;
uuid_from_super0(st, uuid);
struct supertype *st = malloc(sizeof(*st));
if (!st) return st;
+ memset(st, 0, sizeof(*st));
st->ss = &super0;
+ st->info = NULL;
st->minor_version = 90;
st->max_devs = MD_SB_DISKS;
st->sb = NULL;
- /* Eliminate pointless leading 0 from some versions of mdadm -D */
- if (strncmp(arg, "00.", 3) == 0)
+ /* we sometimes get 00.90 */
+ while (arg[0] == '0' && arg[1] == '0')
arg++;
if (strcmp(arg, "0") == 0 ||
strcmp(arg, "0.90") == 0 ||
}
-void locate_bitmap0(struct supertype *st, int fd)
+static void locate_bitmap0(struct supertype *st, int fd)
{
unsigned long long dsize;
unsigned long long offset;
lseek64(fd, offset, 0);
}
-int write_bitmap0(struct supertype *st, int fd)
+static int write_bitmap0(struct supertype *st, int fd)
{
unsigned long long dsize;
unsigned long long offset;
int rv = 0;
int towrite, n;
- char buf[4096];
+ char abuf[4096+4096];
+ char *buf = (char*)(((long)(abuf+4096))&~4095L);
if (!get_dev_size(fd, NULL, &dsize))
return 1;
if (lseek64(fd, offset + 4096, 0)< 0LL)
return 3;
-
- if (write(fd, ((char*)sb)+MD_SB_BYTES, sizeof(bitmap_super_t)) !=
- sizeof(bitmap_super_t))
- return -2;
- towrite = 64*1024 - MD_SB_BYTES - sizeof(bitmap_super_t);
- memset(buf, 0xff, sizeof(buf));
+ memset(buf, 0xff, 4096);
+ memcpy(buf, ((char*)sb)+MD_SB_BYTES, sizeof(bitmap_super_t));
+ towrite = 64*1024;
while (towrite > 0) {
n = towrite;
- if (n > sizeof(buf))
- n = sizeof(buf);
+ if (n > 4096)
+ n = 4096;
n = write(fd, buf, n);
if (n > 0)
towrite -= n;
else
break;
+ memset(buf, 0xff, 4096);
}
fsync(fd);
if (towrite)
st->sb = NULL;
}
+#ifndef MDASSEMBLE
+/* Check whether the requested geometry can be described by 0.90 metadata.
+ *
+ * Returns 0 when the request is rejected: level is LEVEL_CONTAINER,
+ * raiddisks exceeds MD_SB_DISKS, size exceeds 0x7fffffff<<9, or - when a
+ * subdev is given - it cannot be opened exclusively, its size cannot be
+ * read, or it is smaller than MD_RESERVED_SECTORS * 512.
+ * Returns 1 when the geometry is acceptable.  *freesize is only written
+ * when 'subdev' is non-NULL and passes all checks.
+ * st, layout and chunk are not used by this implementation.
+ */
+static int validate_geometry0(struct supertype *st, int level,
+			      int layout, int raiddisks,
+			      int chunk, unsigned long long size,
+			      char *subdev, unsigned long long *freesize,
+			      int verbose)
+{
+	unsigned long long ldsize;
+	int fd;
+
+	if (level == LEVEL_CONTAINER)
+		return 0;
+	if (raiddisks > MD_SB_DISKS)
+		return 0;
+	if (size > (0x7fffffffULL<<9))
+		return 0;
+	if (!subdev)
+		return 1;
+
+	fd = open(subdev, O_RDONLY|O_EXCL, 0);
+	if (fd < 0) {
+		if (verbose)
+			fprintf(stderr, Name ": super0.90 cannot open %s: %s\n",
+				subdev, strerror(errno));
+		return 0;
+	}
+
+	if (!get_dev_size(fd, subdev, &ldsize)) {
+		close(fd);
+		return 0;
+	}
+	close(fd);
+
+	if (ldsize < MD_RESERVED_SECTORS * 512)
+		return 0;
+	/* 'size' was already range-checked above and has not changed, so
+	 * the second identical check that used to sit here was dead code
+	 */
+	*freesize = MD_NEW_SIZE_SECTORS(ldsize >> 9);
+	return 1;
+}
+#endif /* MDASSEMBLE */
+
struct superswitch super0 = {
#ifndef MDASSEMBLE
.examine_super = examine_super0,
.export_examine_super = export_examine_super0,
.detail_super = detail_super0,
.brief_detail_super = brief_detail_super0,
- .export_detail_super = export_detail_super0,
+ .write_init_super = write_init_super0,
+ .validate_geometry = validate_geometry0,
+ .add_to_super = add_to_super0,
#endif
.match_home = match_home0,
.uuid_from_super = uuid_from_super0,
.getinfo_super = getinfo_super0,
.update_super = update_super0,
.init_super = init_super0,
- .add_to_super = add_to_super0,
.store_super = store_super0,
- .write_init_super = write_init_super0,
.compare_super = compare_super0,
.load_super = load_super0,
.match_metadata_desc = match_metadata_desc0,
.locate_bitmap = locate_bitmap0,
.write_bitmap = write_bitmap0,
.free_super = free_super0,
- .major = 0,
- .swapuuid = 0,
+ .name = "0.90",
};
/*
* mdadm - manage Linux "md" devices aka RAID arrays.
*
- * Copyright (C) 2001-2006 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
*
*
* This program is free software; you can redistribute it and/or modify
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
* Author: Neil Brown
- * Email: <neilb@cse.unsw.edu.au>
- * Paper: Neil Brown
- * School of Computer Science and Engineering
- * The University of New South Wales
- * Sydney, 2052
- * Australia
+ * Email: <neilb@suse.de>
*/
#include "mdadm.h"
return __cpu_to_le32(csum);
}
+static char abuf[4096+4096]; /* backing store from which aread()/awrite() carve a 4096-aligned bounce buffer */
+static int aread(int fd, void *buf, int len)
+{
+ /* aligned read.
+ * On devices with a 4K sector size, we need to read
+ * the full sector and copy relevant bits into
+ * the buffer
+ *
+ * If the BLKSSZGET ioctl fails, or the reported sector size is
+ * not larger than 'len', this is just read(fd, buf, len).
+ * Otherwise one whole sector is read into the bounce buffer and
+ * at most 'len' bytes of it are copied to 'buf'.
+ * Returns the number of bytes placed in 'buf', the read() result
+ * when that is <= 0, or -1 when the sector size exceeds 4096.
+ */
+ int bsize;
+ char *b;
+ int n;
+ if (ioctl(fd, BLKSSZGET, &bsize) != 0 ||
+ bsize <= len)
+ return read(fd, buf, len);
+ if (bsize > 4096)
+ return -1;
+ b = (char*)(((long)(abuf+4096))&~4095UL); /* round down to a 4096 boundary that still lies inside abuf */
+
+ n = read(fd, b, bsize);
+ if (n <= 0)
+ return n;
+ lseek(fd, len - n, 1); /* whence 1 is SEEK_CUR on Linux: leave the offset 'len' past where it started */
+ if (n > len)
+ n = len;
+ memcpy(buf, b, n);
+ return n;
+}
+
+static int awrite(int fd, void *buf, int len)
+{
+ /* aligned write.
+ * On devices with a 4K sector size, we need to write
+ * the full sector. We pre-read if the sector is larger
+ * than the write.
+ * The address must be sector-aligned.
+ *
+ * If the BLKSSZGET ioctl fails, or the reported sector size is
+ * not larger than 'len', this is just write(fd, buf, len).
+ * Otherwise the existing sector is read into the bounce buffer,
+ * its first 'len' bytes are replaced with 'buf', and the whole
+ * sector is written back.
+ * Returns 'len' on success, the failing read()/write() result
+ * when that is <= 0, or -1 when the sector size exceeds 4096.
+ */
+ int bsize;
+ char *b;
+ int n;
+ if (ioctl(fd, BLKSSZGET, &bsize) != 0 ||
+ bsize <= len)
+ return write(fd, buf, len);
+ if (bsize > 4096)
+ return -1;
+ b = (char*)(((long)(abuf+4096))&~4095UL); /* round down to a 4096 boundary that still lies inside abuf */
+
+ n = read(fd, b, bsize);
+ if (n <= 0)
+ return n;
+ lseek(fd, -n, 1); /* whence 1 is SEEK_CUR on Linux: step back over what was just read */
+ memcpy(b, buf, len);
+ n = write(fd, b, bsize);
+ if (n <= 0)
+ return n;
+ lseek(fd, len - n, 1); /* leave the offset 'len' past the start of the sector */
+ return len;
+}
+
#ifndef MDASSEMBLE
static void examine_super1(struct supertype *st, char *homehost)
{
struct mdp_superblock_1 *sb = st->sb;
time_t atime;
int d;
- int faulty;
+ int role;
int i;
char *c;
int l = homehost ? strlen(homehost) : 0;
default: break;
}
printf("\n");
+#if 0
+ /* This turns out to just be confusing */
printf(" Array Slot : %d (", __le32_to_cpu(sb->dev_number));
for (i= __le32_to_cpu(sb->max_dev); i> 0 ; i--)
if (__le16_to_cpu(sb->dev_roles[i-1]) != 0xffff)
else printf("%d", role);
}
printf(")\n");
+#endif
+ printf(" Device Role : ");
+ d = __le32_to_cpu(sb->dev_number);
+ if (d < sb->raid_disks)
+ role = __le16_to_cpu(sb->dev_roles[d]);
+ else
+ role = 0xFFFF;
+ if (role >= 0xFFFE)
+ printf("spare\n");
+ else
+ printf("Active device %d\n", role);
+
printf(" Array State : ");
for (d=0; d<__le32_to_cpu(sb->raid_disks); d++) {
int cnt = 0;
}
}
if (cnt > 1) printf("?");
- else if (cnt == 1 && me) printf("U");
- else if (cnt == 1) printf("u");
- else printf ("_");
+ else if (cnt == 1) printf("A");
+ else printf (".");
}
+#if 0
+ /* This is confusing too */
faulty = 0;
for (i=0; i< __le32_to_cpu(sb->max_dev); i++) {
int role = __le16_to_cpu(sb->dev_roles[i]);
faulty++;
}
if (faulty) printf(" %d failed", faulty);
+#endif
+ printf(" ('A' == active, '.' == missing)");
printf("\n");
}
-static void brief_examine_super1(struct supertype *st)
+static void brief_examine_super1(struct supertype *st, int verbose)
{
struct mdp_superblock_1 *sb = st->sb;
int i;
else if (sb->set_name[0])
nm = sb->set_name;
else
- nm = "??";
+ nm = NULL;
- printf("ARRAY /dev/md/%s level=%s ", nm, c?c:"-unknown-");
+ printf("ARRAY%s%s", nm ? " /dev/md/":"", nm);
+ if (verbose && c)
+ printf(" level=%s", c);
sb_offset = __le64_to_cpu(sb->super_offset);
if (sb_offset <= 4)
- printf("metadata=1.1 ");
+ printf(" metadata=1.1 ");
else if (sb_offset <= 8)
- printf("metadata=1.2 ");
+ printf(" metadata=1.2 ");
else
- printf("metadata=1.0 ");
- printf("num-devices=%d UUID=", __le32_to_cpu(sb->raid_disks));
+ printf(" metadata=1.0 ");
+ if (verbose)
+ printf("num-devices=%d ", __le32_to_cpu(sb->raid_disks));
+ printf("UUID=");
for (i=0; i<16; i++) {
if ((i&3)==0 && i != 0) printf(":");
printf("%02x", sb->set_uuid[i]);
}
if (len)
printf("MD_NAME=%.*s\n", len, sb->set_name);
- printf("MD_UUID=");
- for (i=0; i<16; i++) {
- if ((i&3)==0 && i != 0) printf(":");
- printf("%02x", sb->set_uuid[i]);
- }
- printf("\n");
}
#endif
int role;
info->array.major_version = 1;
- info->array.minor_version = __le32_to_cpu(sb->feature_map);
+ info->array.minor_version = st->minor_version;
info->array.patch_version = 0;
info->array.raid_disks = __le32_to_cpu(sb->raid_disks);
info->array.level = __le32_to_cpu(sb->level);
info->disk.raid_disk = role;
}
info->events = __le64_to_cpu(sb->events);
+ sprintf(info->text_version, "1.%d", st->minor_version);
+ info->safe_mode_delay = 200;
memcpy(info->uuid, sb->set_uuid, 16);
__le64_to_cpu(sb->data_offset)) {
/* set data_size to device size less data_offset */
struct misc_dev_info *misc = (struct misc_dev_info*)
- (st->sb + 1024 + sizeof(struct bitmap_super_s));
+ (st->sb + 1024 + 512);
printf("Size was %llu\n", (unsigned long long)
__le64_to_cpu(sb->data_size));
sb->data_size = __cpu_to_le64(
static int init_super1(struct supertype *st, mdu_array_info_t *info,
unsigned long long size, char *name, char *homehost, int *uuid)
{
- struct mdp_superblock_1 *sb = malloc(1024 + sizeof(bitmap_super_t) +
- sizeof(struct misc_dev_info));
+ struct mdp_superblock_1 *sb;
int spares;
int rfd;
char defname[10];
+
+ if (posix_memalign((void**)&sb, 512, (1024 + 512 +
+ sizeof(struct misc_dev_info))) != 0) {
+ fprintf(stderr, Name
+ ": %s could not allocate superblock\n", __func__);
+ return 0;
+ }
memset(sb, 0, 1024);
st->sb = sb;
- if (info->major_version == -1) {
+ if (info == NULL) {
/* zeroing superblock */
return 0;
}
return 1;
}
+struct devinfo { /* one device queued by add_to_super1(); the list hangs off st->info */
+ int fd; /* descriptor passed to add_to_super1(); write_init_super1() skips entries with fd < 0 */
+ char *devname; /* caller's pointer is stored as-is (not copied) */
+ mdu_disk_info_t disk; /* copy of the disk info passed to add_to_super1() */
+ struct devinfo *next; /* next queued device, NULL at the tail */
+};
+#ifndef MDASSEMBLE
/* Add a device to the superblock being created */
-static void add_to_super1(struct supertype *st, mdu_disk_info_t *dk)
+/* Records dk's role in sb->dev_roles, grows sb->max_dev if needed, and
+ * appends a struct devinfo (fd, devname, copy of *dk) to the list rooted
+ * at st->info so the device can be written out later.
+ * Returns 0 on success, 1 if the list entry cannot be allocated.
+ */
+static int add_to_super1(struct supertype *st, mdu_disk_info_t *dk,
+ int fd, char *devname)
{
struct mdp_superblock_1 *sb = st->sb;
__u16 *rp = sb->dev_roles + dk->number;
+ struct devinfo *di, **dip;
+
if ((dk->state & 6) == 6) /* active, sync */
*rp = __cpu_to_le16(dk->raid_disk);
else if ((dk->state & ~2) == 0) /* active or idle -> spare */
*rp = 0xffff;
else
*rp = 0xfffe;
+
if (dk->number >= __le32_to_cpu(sb->max_dev) &&
__le32_to_cpu(sb->max_dev) < 384)
sb->max_dev = __cpu_to_le32(dk->number+1);
+
+ sb->dev_number = __cpu_to_le32(dk->number);
+ sb->sb_csum = calc_sb_1_csum(sb);
+
+ dip = (struct devinfo **)&st->info;
+ while (*dip)
+ dip = &(*dip)->next;
+ di = malloc(sizeof(struct devinfo));
+	/* do not dereference a failed allocation */
+	if (!di) {
+		fprintf(stderr, Name
+			": %s could not allocate devinfo\n", __func__);
+		return 1;
+	}
+ di->fd = fd;
+ di->devname = devname;
+ di->disk = *dk;
+ di->next = NULL;
+ *dip = di;
+
+ return 0;
}
+#endif
static void locate_bitmap1(struct supertype *st, int fd);
return 3;
sbsize = sizeof(*sb) + 2 * __le32_to_cpu(sb->max_dev);
+ sbsize = (sbsize+511)&(~511UL);
- if (write(fd, sb, sbsize) != sbsize)
+ if (awrite(fd, sb, sbsize) != sbsize)
return 4;
if (sb->feature_map & __cpu_to_le32(MD_FEATURE_BITMAP_OFFSET)) {
(((char*)sb)+1024);
if (__le32_to_cpu(bm->magic) == BITMAP_MAGIC) {
locate_bitmap1(st, fd);
- if (write(fd, bm, sizeof(*bm)) != sizeof(*bm))
+ if (awrite(fd, bm, sizeof(*bm)) !=
+ sizeof(*bm))
return 5;
}
}
return 4*2;
}
-static int write_init_super1(struct supertype *st,
- mdu_disk_info_t *dinfo, char *devname)
+#ifndef MDASSEMBLE
+static int write_init_super1(struct supertype *st)
{
struct mdp_superblock_1 *sb = st->sb;
struct supertype refst;
- int fd = dev_open(devname, O_RDWR | O_EXCL);
int rfd;
- int rv;
+ int rv = 0;
int bm_space;
-
+ struct devinfo *di;
unsigned long long dsize, array_size;
long long sb_offset;
+ for (di = st->info; di && ! rv ; di = di->next) {
+ if (di->disk.state == 1)
+ continue;
+ if (di->fd < 0)
+ continue;
- if (fd < 0) {
- fprintf(stderr, Name ": Failed to open %s to write superblock\n",
- devname);
- return -1;
- }
+ Kill(di->devname, 0, 1, 1);
+ Kill(di->devname, 0, 1, 1);
- sb->dev_number = __cpu_to_le32(dinfo->number);
- if (dinfo->state & (1<<MD_DISK_WRITEMOSTLY))
- sb->devflags |= __cpu_to_le32(WriteMostly1);
+ sb->dev_number = __cpu_to_le32(di->disk.number);
+ if (di->disk.state & (1<<MD_DISK_WRITEMOSTLY))
+ sb->devflags |= __cpu_to_le32(WriteMostly1);
- if ((rfd = open("/dev/urandom", O_RDONLY)) < 0 ||
- read(rfd, sb->device_uuid, 16) != 16) {
- __u32 r[4] = {random(), random(), random(), random()};
- memcpy(sb->device_uuid, r, 16);
- }
-
- if (rfd >= 0) close(rfd);
- sb->events = 0;
-
- refst =*st;
- refst.sb = NULL;
- if (load_super1(&refst, fd, NULL)==0) {
- struct mdp_superblock_1 *refsb = refst.sb;
-
- memcpy(sb->device_uuid, refsb->device_uuid, 16);
- if (memcmp(sb->set_uuid, refsb->set_uuid, 16)==0) {
- /* same array, so preserve events and dev_number */
- sb->events = refsb->events;
- /* bugs in 2.6.17 and earlier mean the dev_number
- * chosen in Manage must be preserved
- */
- if (get_linux_version() >= 2006018)
- sb->dev_number = refsb->dev_number;
+ if ((rfd = open("/dev/urandom", O_RDONLY)) < 0 ||
+ read(rfd, sb->device_uuid, 16) != 16) {
+ __u32 r[4] = {random(), random(), random(), random()};
+ memcpy(sb->device_uuid, r, 16);
+ }
+ if (rfd >= 0)
+ close(rfd);
+
+ sb->events = 0;
+
+ refst =*st;
+ refst.sb = NULL;
+ if (load_super1(&refst, di->fd, NULL)==0) {
+ struct mdp_superblock_1 *refsb = refst.sb;
+
+ memcpy(sb->device_uuid, refsb->device_uuid, 16);
+ if (memcmp(sb->set_uuid, refsb->set_uuid, 16)==0) {
+ /* same array, so preserve events and
+ * dev_number */
+ sb->events = refsb->events;
+ /* bugs in 2.6.17 and earlier mean the
+ * dev_number chosen in Manage must be preserved
+ */
+ if (get_linux_version() >= 2006018)
+ sb->dev_number = refsb->dev_number;
+ }
+ free(refsb);
}
- free(refsb);
- }
- if (!get_dev_size(fd, NULL, &dsize))
- return 1;
- dsize >>= 9;
+ if (!get_dev_size(di->fd, NULL, &dsize))
+ return 1;
+ dsize >>= 9;
- if (dsize < 24) {
- close(fd);
- return 2;
- }
+ if (dsize < 24) {
+ close(di->fd);
+ return 2;
+ }
- /*
- * Calculate the position of the superblock.
- * It is always aligned to a 4K boundary and
- * depending on minor_version, it can be:
- * 0: At least 8K, but less than 12K, from end of device
- * 1: At start of device
- * 2: 4K from start of device.
- * Depending on the array size, we might leave extra space
- * for a bitmap.
- */
- array_size = __le64_to_cpu(sb->size);
- /* work out how much space we left for a bitmap */
- bm_space = choose_bm_space(array_size);
-
- switch(st->minor_version) {
- case 0:
- sb_offset = dsize;
- sb_offset -= 8*2;
- sb_offset &= ~(4*2-1);
- sb->super_offset = __cpu_to_le64(sb_offset);
- sb->data_offset = __cpu_to_le64(0);
+ /*
+ * Calculate the position of the superblock.
+ * It is always aligned to a 4K boundary and
+ * depending on minor_version, it can be:
+ * 0: At least 8K, but less than 12K, from end of device
+ * 1: At start of device
+ * 2: 4K from start of device.
+ * Depending on the array size, we might leave extra space
+ * for a bitmap.
+ */
+ array_size = __le64_to_cpu(sb->size);
+ /* work out how much space we left for a bitmap */
+ bm_space = choose_bm_space(array_size);
+
+ switch(st->minor_version) {
+ case 0:
+ sb_offset = dsize;
+ sb_offset -= 8*2;
+ sb_offset &= ~(4*2-1);
+ sb->super_offset = __cpu_to_le64(sb_offset);
+ sb->data_offset = __cpu_to_le64(0);
if (sb_offset - bm_space < array_size)
bm_space = sb_offset - array_size;
- sb->data_size = __cpu_to_le64(sb_offset - bm_space);
- break;
- case 1:
- sb->super_offset = __cpu_to_le64(0);
- if (4*2 + bm_space + __le64_to_cpu(sb->size) > dsize)
- bm_space = dsize - __le64_to_cpu(sb->size) - 4*2;
- sb->data_offset = __cpu_to_le64(bm_space + 4*2);
- sb->data_size = __cpu_to_le64(dsize - bm_space - 4*2);
- break;
- case 2:
- sb_offset = 4*2;
- sb->super_offset = __cpu_to_le64(4*2);
- if (4*2 + 4*2 + bm_space + __le64_to_cpu(sb->size) > dsize)
- bm_space = dsize - __le64_to_cpu(sb->size) - 4*2 - 4*2;
- sb->data_offset = __cpu_to_le64(4*2 + 4*2 + bm_space);
- sb->data_size = __cpu_to_le64(dsize - 4*2 - 4*2 - bm_space );
- break;
- default:
- return -EINVAL;
- }
+ sb->data_size = __cpu_to_le64(sb_offset - bm_space);
+ break;
+ case 1:
+ sb->super_offset = __cpu_to_le64(0);
+ if (4*2 + bm_space + __le64_to_cpu(sb->size) > dsize)
+ bm_space = dsize - __le64_to_cpu(sb->size) -4*2;
+ sb->data_offset = __cpu_to_le64(bm_space + 4*2);
+ sb->data_size = __cpu_to_le64(dsize - bm_space - 4*2);
+ break;
+ case 2:
+ sb_offset = 4*2;
+ sb->super_offset = __cpu_to_le64(4*2);
+ if (4*2 + 4*2 + bm_space + __le64_to_cpu(sb->size)
+ > dsize)
+ bm_space = dsize - __le64_to_cpu(sb->size)
+ - 4*2 - 4*2;
+ sb->data_offset = __cpu_to_le64(4*2 + 4*2 + bm_space);
+ sb->data_size = __cpu_to_le64(dsize - 4*2 - 4*2
+ - bm_space );
+ break;
+ default:
+ return -EINVAL;
+ }
- sb->sb_csum = calc_sb_1_csum(sb);
- rv = store_super1(st, fd);
- if (rv)
- fprintf(stderr, Name ": failed to write superblock to %s\n", devname);
+ sb->sb_csum = calc_sb_1_csum(sb);
+ rv = store_super1(st, di->fd);
+ if (rv)
+ fprintf(stderr,
+ Name ": failed to write superblock to %s\n",
+ di->devname);
- if (rv == 0 && (__le32_to_cpu(sb->feature_map) & 1))
- rv = st->ss->write_bitmap(st, fd);
- close(fd);
+ if (rv == 0 && (__le32_to_cpu(sb->feature_map) & 1))
+ rv = st->ss->write_bitmap(st, di->fd);
+ close(di->fd);
+ di->fd = -1;
+ }
return rv;
}
+#endif
static int compare_super1(struct supertype *st, struct supertype *tst)
{
return 1;
if (!first) {
- first = malloc(1024+sizeof(bitmap_super_t) +
- sizeof(struct misc_dev_info));
- memcpy(first, second, 1024+sizeof(bitmap_super_t) +
+ if (posix_memalign((void**)&first, 512,
+ 1024 + 512 +
+ sizeof(struct misc_dev_info)) != 0) {
+ fprintf(stderr, Name
+ ": %s could not allocate superblock\n", __func__);
+ return 1;
+ }
+ memcpy(first, second, 1024 + 512 +
sizeof(struct misc_dev_info));
st->sb = first;
return 0;
free_super1(st);
+ if (st->subarray[0])
+ return 1;
+
if (st->ss == NULL || st->minor_version == -1) {
int bestvers = -1;
struct supertype tst;
__u64 bestctime = 0;
/* guess... choose latest ctime */
+ memset(&tst, 0, sizeof(tst));
tst.ss = &super1;
- tst.sb = NULL;
for (tst.minor_version = 0; tst.minor_version <= 2 ; tst.minor_version++) {
switch(load_super1(&tst, fd, devname)) {
case 0: super = tst.sb;
return 1;
}
- super = malloc(1024 + sizeof(bitmap_super_t) +
- sizeof(struct misc_dev_info));
+ if (posix_memalign((void**)&super, 512,
+ 1024 + 512 +
+ sizeof(struct misc_dev_info)) != 0) {
+ fprintf(stderr, Name ": %s could not allocate superblock\n",
+ __func__);
+ return 1;
+ }
- if (read(fd, super, 1024) != 1024) {
+ if (aread(fd, super, 1024) != 1024) {
if (devname)
fprintf(stderr, Name ": Cannot read superblock on %s\n",
devname);
bsb = (struct bitmap_super_s *)(((char*)super)+1024);
- misc = (struct misc_dev_info*) (bsb+1);
+ misc = (struct misc_dev_info*) (((char*)super)+1024+512);
misc->device_size = dsize;
/* Now check on the bitmap superblock */
* should get that written out.
*/
locate_bitmap1(st, fd);
- if (read(fd, ((char*)super)+1024, sizeof(struct bitmap_super_s))
- != sizeof(struct bitmap_super_s))
+ if (aread(fd, ((char*)super)+1024, 512)
+ != 512)
goto no_bitmap;
uuid_from_super1(st, uuid);
struct supertype *st = malloc(sizeof(*st));
if (!st) return st;
+ memset(st, 0, sizeof(*st));
st->ss = &super1;
st->max_devs = 384;
st->sb = NULL;
- /* Eliminate pointless leading 0 from some versions of mdadm -D */
- if (strncmp(arg, "01.", 3) == 0)
+ /* leading zeros can be safely ignored. --detail generates them. */
+ while (*arg == '0')
arg++;
if (strcmp(arg, "1.0") == 0 ||
strcmp(arg, "1.00") == 0) {
return st;
}
if (strcmp(arg, "1") == 0 ||
- strcmp(arg, "default/large") == 0) {
+ strcmp(arg, "default") == 0) {
st->minor_version = -1;
return st;
}
int rv = 0;
int towrite, n;
- char buf[4096];
+ char *buf = (char*)(((long)(abuf+4096))&~4095UL);
locate_bitmap1(st, fd);
- if (write(fd, ((char*)sb)+1024, sizeof(bitmap_super_t)) !=
- sizeof(bitmap_super_t))
- return -2;
+ memset(buf, 0xff, 4096);
+ memcpy(buf, ((char*)sb)+1024, sizeof(bitmap_super_t));
+
towrite = __le64_to_cpu(bms->sync_size) / (__le32_to_cpu(bms->chunksize)>>9);
towrite = (towrite+7) >> 3; /* bits to bytes */
- memset(buf, 0xff, sizeof(buf));
+ towrite += sizeof(bitmap_super_t);
+ towrite = ROUND_UP(towrite, 512);
while (towrite > 0) {
n = towrite;
- if (n > sizeof(buf))
- n = sizeof(buf);
+ if (n > 4096)
+ n = 4096;
n = write(fd, buf, n);
if (n > 0)
towrite -= n;
else
break;
+ memset(buf, 0xff, 4096);
}
fsync(fd);
if (towrite)
st->sb = NULL;
}
+#ifndef MDASSEMBLE
+/* Geometry check for version-1 metadata.
+ *
+ * Returns 0 for LEVEL_CONTAINER, 1 when no subdev is given.  Otherwise
+ * subdev is opened exclusively to learn its size: on success *freesize
+ * is set to avail_size1() of the size in 512-byte sectors and 1 is
+ * returned; if the device cannot be opened or sized, 0 is returned
+ * (an open failure is reported on stderr when verbose is set).
+ * layout, raiddisks, chunk and size are not examined here.
+ */
+static int validate_geometry1(struct supertype *st, int level,
+			      int layout, int raiddisks,
+			      int chunk, unsigned long long size,
+			      char *subdev, unsigned long long *freesize,
+			      int verbose)
+{
+	unsigned long long bytes;
+	int devfd;
+	int sized;
+
+	if (level == LEVEL_CONTAINER)
+		return 0;
+	if (subdev == NULL)
+		return 1;
+
+	devfd = open(subdev, O_RDONLY|O_EXCL, 0);
+	if (devfd < 0) {
+		if (verbose)
+			fprintf(stderr, Name ": super1.x cannot open %s: %s\n",
+				subdev, strerror(errno));
+		return 0;
+	}
+
+	/* the descriptor is only needed for the size query */
+	sized = get_dev_size(devfd, subdev, &bytes);
+	close(devfd);
+	if (!sized)
+		return 0;
+
+	*freesize = avail_size1(st, bytes >> 9);
+	return 1;
+}
+#endif /* MDASSEMBLE */
+
struct superswitch super1 = {
#ifndef MDASSEMBLE
.examine_super = examine_super1,
.detail_super = detail_super1,
.brief_detail_super = brief_detail_super1,
.export_detail_super = export_detail_super1,
+ .write_init_super = write_init_super1,
+ .validate_geometry = validate_geometry1,
+ .add_to_super = add_to_super1,
#endif
.match_home = match_home1,
.uuid_from_super = uuid_from_super1,
.getinfo_super = getinfo_super1,
.update_super = update_super1,
.init_super = init_super1,
- .add_to_super = add_to_super1,
.store_super = store_super1,
- .write_init_super = write_init_super1,
.compare_super = compare_super1,
.load_super = load_super1,
.match_metadata_desc = match_metadata_desc1,
.locate_bitmap = locate_bitmap1,
.write_bitmap = write_bitmap1,
.free_super = free_super1,
- .major = 1,
#if __BYTE_ORDER == BIG_ENDIAN
.swapuuid = 0,
#else
.swapuuid = 1,
#endif
+ .name = "1.x",
};
* sysfs - extract md related information from sysfs. Part of:
* mdadm - manage Linux "md" devices aka RAID arrays.
*
- * Copyright (C) 2006 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2006-2009 Neil Brown <neilb@suse.de>
*
*
* This program is free software; you can redistribute it and/or modify
#include "mdadm.h"
#include <dirent.h>
+#include <ctype.h>
int load_sys(char *path, char *buf)
{
return -1;
n = read(fd, buf, 1024);
close(fd);
- if (n <=0 || n >= 1024)
+ if (n <0 || n >= 1024)
return -1;
buf[n] = 0;
- if (buf[n-1] == '\n')
+ if (n && buf[n-1] == '\n')
buf[n-1] = 0;
return 0;
}
}
}
+/* Open a sysfs attribute file under /sys/block/<array>/md/.
+ *
+ * devnum:  array number, converted to a name with devnum2devname().
+ * devname: optional sub-directory under md/ (NULL to address the array
+ *          itself).
+ * attr:    name of the attribute file.
+ *
+ * The file is opened read-write; if that fails with EACCES it is
+ * retried read-only.  Returns the descriptor, or -1 if the array name
+ * is unknown, the path does not fit, or open() fails.
+ */
+int sysfs_open(int devnum, char *devname, char *attr)
+{
+	char fname[256];
+	int fd;
+	int len;
+	char *mdname = devnum2devname(devnum);
+
+	if (!mdname)
+		return -1;
+
+	/* One bounded format instead of sprintf+strcat into 50 bytes,
+	 * which a long member or attribute name would overflow.
+	 */
+	len = snprintf(fname, sizeof(fname), "/sys/block/%s/md/%s%s%s",
+		       mdname, devname ? devname : "",
+		       devname ? "/" : "", attr);
+	/* release the name before open() so errno from open() survives */
+	free(mdname);
+	if (len < 0 || len >= (int)sizeof(fname))
+		return -1;
+
+	fd = open(fname, O_RDWR);
+	if (fd < 0 && errno == EACCES)
+		fd = open(fname, O_RDONLY);
+	return fd;
+}
+
+/* Fill in mdi->sys_name with the sysfs name of an md array.
+ *
+ * When fd is valid it must answer the RAID_VERSION ioctl, and the array
+ * number is then taken from fd2devnum(fd); otherwise the devnum argument
+ * is used.  A non-negative number yields "md<N>", a negative one
+ * "md_d<-1-N>".  On any failure (ioctl error, NoMdDev) sys_name is left
+ * as the empty string.
+ */
+void sysfs_init(struct mdinfo *mdi, int fd, int devnum)
+{
+	int num = devnum;
+
+	mdi->sys_name[0] = 0;
+
+	if (fd >= 0) {
+		mdu_version_t vers;
+
+		if (ioctl(fd, RAID_VERSION, &vers) != 0)
+			return;
+		num = fd2devnum(fd);
+	}
+
+	if (num == NoMdDev)
+		return;
+
+	if (num < 0)
+		sprintf(mdi->sys_name, "md_d%d", -1 - num);
+	else
+		sprintf(mdi->sys_name, "md%d", num);
+}
+
+
struct mdinfo *sysfs_read(int fd, int devnum, unsigned long options)
{
/* Longest possible name in sysfs, mounted at /sys, is
char *dbase;
struct mdinfo *sra;
struct mdinfo *dev;
- DIR *dir;
+ DIR *dir = NULL;
struct dirent *de;
sra = malloc(sizeof(*sra));
if (sra == NULL)
return sra;
- sra->next = NULL;
-
- if (fd >= 0) {
- struct stat stb;
- mdu_version_t vers;
- if (fstat(fd, &stb)) return NULL;
- if (ioctl(fd, RAID_VERSION, &vers) != 0)
- return NULL;
- if (major(stb.st_rdev) == MD_MAJOR)
- sprintf(sra->sys_name, "md%d", (int)minor(stb.st_rdev));
- else if (major(stb.st_rdev) == get_mdp_major())
- sprintf(sra->sys_name, "md_d%d",
- (int)minor(stb.st_rdev)>>MdpMinorShift);
- else {
- /* must be an extended-minor partition. Look at the
- * /sys/dev/block/%d:%d link which must look like
- * ../../block/mdXXX/mdXXXpYY
- */
- char path[30];
- char link[200];
- char *cp;
- int n;
- sprintf(path, "/sys/dev/block/%d:%d", major(stb.st_rdev),
- minor(stb.st_rdev));
- n = readlink(path, link, sizeof(link)-1);
- if (n <= 0)
- return NULL;
- link[n] = 0;
- cp = strrchr(link, '/');
- if (cp) *cp = 0;
- cp = strchr(link, '/');
- if (cp && strncmp(cp, "/md", 3) == 0)
- strcpy(sra->sys_name, cp+1);
- else
- return NULL;
- }
- } else {
- if (devnum >= 0)
- sprintf(sra->sys_name, "md%d", devnum);
- else
- sprintf(sra->sys_name, "md_d%d",
- -1-devnum);
+ memset(sra, 0, sizeof(*sra));
+ sysfs_init(sra, fd, devnum);
+ if (sra->sys_name[0] == 0) {
+ free(sra);
+ return NULL;
}
+
sprintf(fname, "/sys/block/%s/md/", sra->sys_name);
base = fname + strlen(fname);
sra->array.major_version = -1;
sra->array.minor_version = -2;
strcpy(sra->text_version, buf+9);
- } else
+ } else {
sscanf(buf, "%d.%d",
&sra->array.major_version,
&sra->array.minor_version);
+ strcpy(sra->text_version, buf);
+ }
}
if (options & GET_LEVEL) {
strcpy(base, "level");
goto abort;
sra->array.layout = strtoul(buf, NULL, 0);
}
+ if (options & GET_DISKS) {
+ strcpy(base, "raid_disks");
+ if (load_sys(fname, buf))
+ goto abort;
+ sra->array.raid_disks = strtoul(buf, NULL, 0);
+ }
+ if (options & GET_DEGRADED) {
+ strcpy(base, "degraded");
+ if (load_sys(fname, buf))
+ goto abort;
+ sra->array.failed_disks = strtoul(buf, NULL, 0);
+ }
if (options & GET_COMPONENT) {
strcpy(base, "component_size");
if (load_sys(fname, buf))
goto abort;
sra->mismatch_cnt = strtoul(buf, NULL, 0);
}
+ if (options & GET_SAFEMODE) {
+ int scale = 1;
+ int dot = 0;
+ int i;
+ unsigned long msec;
+ size_t len;
+
+ strcpy(base, "safe_mode_delay");
+ if (load_sys(fname, buf))
+ goto abort;
+
+ /* remove a period, and count digits after it */
+ len = strlen(buf);
+ for (i = 0; i < len; i++) {
+ if (dot) {
+ if (isdigit(buf[i])) {
+ buf[i-1] = buf[i];
+ scale *= 10;
+ }
+ buf[i] = 0;
+ } else if (buf[i] == '.') {
+ dot=1;
+ buf[i] = 0;
+ }
+ }
+ msec = strtoul(buf, NULL, 10);
+ msec = (msec * 1000) / scale;
+ sra->safe_mode_delay = msec;
+ }
if (! (options & GET_DEVS))
return sra;
dev = malloc(sizeof(*dev));
if (!dev)
goto abort;
- dev->next = sra->devs;
- sra->devs = dev;
- strcpy(dev->sys_name, de->d_name);
/* Always get slot, major, minor */
strcpy(dbase, "slot");
- if (load_sys(fname, buf))
- goto abort;
+ if (load_sys(fname, buf)) {
+ /* hmm... unable to read 'slot' maybe the device
+ * is going away?
+ */
+ strcpy(dbase, "block");
+ if (readlink(fname, buf, sizeof(buf)) < 0 &&
+ errno != ENAMETOOLONG) {
+ /* ...yup device is gone */
+ free(dev);
+ continue;
+ } else {
+ /* slot is unreadable but 'block' link
+ * still intact... something bad is happening
+ * so abort
+ */
+ free(dev);
+ goto abort;
+ }
+
+ }
+ strcpy(dev->sys_name, de->d_name);
dev->disk.raid_disk = strtoul(buf, &ep, 10);
if (*ep) dev->disk.raid_disk = -1;
strcpy(dbase, "block/dev");
- if (load_sys(fname, buf))
- goto abort;
+ if (load_sys(fname, buf)) {
+ free(dev);
+ if (options & SKIP_GONE_DEVS)
+ continue;
+ else
+ goto abort;
+ }
sscanf(buf, "%d:%d", &dev->disk.major, &dev->disk.minor);
+ /* special case check for block devices that can go 'offline' */
+ if (options & SKIP_GONE_DEVS) {
+ strcpy(dbase, "block/device/state");
+ if (load_sys(fname, buf) == 0 &&
+ strncmp(buf, "offline", 7) == 0) {
+ free(dev);
+ continue;
+ }
+ }
+
+ /* finally add this disk to the array */
+ dev->next = sra->devs;
+ sra->devs = dev;
+
if (options & GET_OFFSET) {
strcpy(dbase, "offset");
if (load_sys(fname, buf))
strcpy(dbase, "size");
if (load_sys(fname, buf))
goto abort;
- dev->component_size = strtoull(buf, NULL, 0);
+ dev->component_size = strtoull(buf, NULL, 0) * 2;
}
if (options & GET_STATE) {
dev->disk.state = 0;
dev->errors = strtoul(buf, NULL, 0);
}
}
+ closedir(dir);
return sra;
abort:
+ if (dir)
+ closedir(dir);
sysfs_free(sra);
return NULL;
}
+/* Test whether attr, as read from a sysfs file, matches str.
+ *
+ * Returns 1 when attr begins with all of str and the character that
+ * follows in attr is the end of string, a comma, or a newline;
+ * returns 0 otherwise.
+ */
+int sysfs_attr_match(const char *attr, const char *str)
+{
+	size_t i = 0;
+
+	/* walk the common prefix; stops at the end of str or first mismatch */
+	while (str[i] != '\0' && attr[i] == str[i])
+		i++;
+
+	/* str must have been consumed completely */
+	if (str[i] != '\0')
+		return 0;
+
+	return attr[i] == '\0' || attr[i] == ',' || attr[i] == '\n';
+}
+
+/* Look word up in the NULL-terminated array list using
+ * sysfs_attr_match().  Returns the index of the first matching entry,
+ * or the number of entries in list when nothing matches.
+ */
+int sysfs_match_word(const char *word, char **list)
+{
+	int idx = 0;
+
+	while (list[idx] != NULL && !sysfs_attr_match(word, list[idx]))
+		idx++;
+	return idx;
+}
+
unsigned long long get_component_size(int fd)
{
/* Find out the component size of the array.
char fname[50];
int n;
int fd;
+
sprintf(fname, "/sys/block/%s/md/%s/%s",
sra->sys_name, dev?dev->sys_name:"", name);
fd = open(fname, O_WRONLY);
return -1;
n = write(fd, val, strlen(val));
close(fd);
- if (n != strlen(val))
+ if (n != strlen(val)) {
+ dprintf(Name ": failed to write '%s' to '%s' (%s)\n",
+ val, fname, strerror(errno));
return -1;
+ }
return 0;
}
return sysfs_set_str(sra, dev, name, valstr);
}
+/* Write the string event to /sys/block/<sra->sys_name>/uevent.
+ * Returns 0 when the whole string was written, -1 if the file cannot
+ * be opened or the write fails or is short.
+ */
+int sysfs_uevent(struct mdinfo *sra, char *event)
+{
+	char fname[256];
+	size_t len = strlen(event);
+	int n;
+	int fd;
+
+	n = snprintf(fname, sizeof(fname), "/sys/block/%s/uevent",
+		     sra->sys_name);
+	if (n < 0 || n >= (int)sizeof(fname))
+		return -1;
+	fd = open(fname, O_WRONLY);
+	if (fd < 0)
+		return -1;
+	n = write(fd, event, len);
+	close(fd);
+	/* report a failed or short write instead of always returning 0 */
+	if (n < 0 || (size_t)n != len)
+		return -1;
+	return 0;
+}
+
int sysfs_get_ll(struct mdinfo *sra, struct mdinfo *dev,
char *name, unsigned long long *val)
{
return -1;
return 0;
}
+
+/* Read a sysfs attribute as a NUL-terminated string.
+ *
+ * The file is /sys/block/<sra->sys_name>/md/<dev->sys_name>/<name>,
+ * with an empty device component when dev is NULL.  Up to size bytes
+ * are read into val.
+ *
+ * Returns the number of bytes read (> 0) on success.  Returns -1 if
+ * the path does not fit, the file cannot be opened, nothing could be
+ * read, or the data filled the whole buffer leaving no room for the
+ * terminating NUL.
+ */
+int sysfs_get_str(struct mdinfo *sra, struct mdinfo *dev,
+		  char *name, char *val, int size)
+{
+	char fname[256];
+	int n;
+	int fd;
+
+	if (size <= 0)
+		return -1;
+	n = snprintf(fname, sizeof(fname), "/sys/block/%s/md/%s/%s",
+		     sra->sys_name, dev?dev->sys_name:"", name);
+	if (n < 0 || n >= (int)sizeof(fname))
+		return -1;
+	fd = open(fname, O_RDONLY);
+	if (fd < 0)
+		return -1;
+	n = read(fd, val, size);
+	close(fd);
+	/* n == size would put the terminator one byte past the buffer */
+	if (n <= 0 || n == size)
+		return -1;
+	val[n] = 0;
+	return n;
+}
+
+/* Write ms (milliseconds) to the array's "safe_mode_delay" attribute,
+ * formatted as seconds with three decimals ("<sec>.<msec>\n").
+ * Returns whatever sysfs_set_str() returns.
+ */
+int sysfs_set_safemode(struct mdinfo *sra, unsigned long ms)
+{
+	unsigned long sec = ms / 1000;
+	unsigned long msec = ms % 1000;
+	char delay[30];
+
+	/* %lu, not %ld: both values are unsigned long.
+	 * The trailing '\n' is needed for kernels older than 2.6.28.
+	 */
+	snprintf(delay, sizeof(delay), "%lu.%03lu\n", sec, msec);
+	return sysfs_set_str(sra, NULL, "safe_mode_delay", delay);
+}
+
+/* Push the array description held in info to the md sysfs attributes.
+ *
+ * For external metadata (major_version -1, minor_version -2) the
+ * "metadata_version" attribute is first set to "external:<text_version>";
+ * this needs (vers % 100) >= 2 -- presumably vers is the md driver
+ * version, TODO confirm against callers -- and returns 1 with a message
+ * when unsupported.  For a negative level nothing further is written
+ * and 0 is returned.  Otherwise level, raid_disks, chunk_size, layout
+ * and component_size are written, plus array_size when
+ * custom_array_size is set and resync_start when level > 0.
+ *
+ * Returns the bitwise OR of the individual results (0 on success).
+ */
+int sysfs_set_array(struct mdinfo *info, int vers)
+{
+	int status = 0;
+	char ver[100];
+	int external = (info->array.major_version == -1 &&
+			info->array.minor_version == -2);
+
+	ver[0] = 0;
+	if (external) {
+		strcpy(ver, "external:");
+		strcat(ver, info->text_version);
+
+		if ((vers % 100) < 2 ||
+		    sysfs_set_str(info, NULL, "metadata_version",
+				  ver) < 0) {
+			fprintf(stderr, Name ": This kernel does not "
+				"support external metadata.\n");
+			return 1;
+		}
+	}
+
+	if (info->array.level < 0)
+		return 0; /* FIXME */
+
+	status |= sysfs_set_str(info, NULL, "level",
+				map_num(pers, info->array.level));
+	status |= sysfs_set_num(info, NULL, "raid_disks",
+				info->array.raid_disks);
+	status |= sysfs_set_num(info, NULL, "chunk_size",
+				info->array.chunk_size);
+	status |= sysfs_set_num(info, NULL, "layout", info->array.layout);
+	status |= sysfs_set_num(info, NULL, "component_size",
+				info->component_size/2);
+
+	if (info->custom_array_size) {
+		int rc = sysfs_set_num(info, NULL, "array_size",
+				       info->custom_array_size/2);
+
+		/* a missing attribute is only worth a warning */
+		if (rc && errno == ENOENT) {
+			fprintf(stderr, Name ": This kernel does not "
+				"have the md/array_size attribute, "
+				"the array may be larger than expected\n");
+			rc = 0;
+		}
+		status |= rc;
+	}
+
+	if (info->array.level > 0)
+		status |= sysfs_set_num(info, NULL, "resync_start",
+					info->resync_start);
+	return status;
+}
+
+/* Add the device described by sd to the array sra through sysfs.
+ *
+ * "major:minor" is written to the array's "new_dev" attribute, then
+ * sd->sys_name is set to "dev-<name>", where <name> is the last path
+ * component of the /sys/dev/block/<major>:<minor> symlink.  "offset"
+ * and "size" are then written for the device and, unless the array
+ * level is LEVEL_CONTAINER, "slot" (and "state" = in_sync when in_sync
+ * is set; failure of that write is ignored).
+ *
+ * Returns 0 on success, the sysfs_set_str() result if "new_dev" fails,
+ * -1 if the symlink cannot be read, otherwise the OR of the attribute
+ * write results.
+ */
+int sysfs_add_disk(struct mdinfo *sra, struct mdinfo *sd, int in_sync)
+{
+	char dv[100];
+	char nm[100];
+	char *dname;
+	int rv;
+
+	sprintf(dv, "%d:%d", sd->disk.major, sd->disk.minor);
+	rv = sysfs_set_str(sra, NULL, "new_dev", dv);
+	if (rv)
+		return rv;
+
+	memset(nm, 0, sizeof(nm));
+	sprintf(dv, "/sys/dev/block/%d:%d", sd->disk.major, sd->disk.minor);
+	/* readlink() does not NUL-terminate: keep one byte for nm[rv] */
+	rv = readlink(dv, nm, sizeof(nm) - 1);
+	if (rv <= 0)
+		return -1;
+	nm[rv] = '\0';
+	/* last path component; the whole target if it contains no '/' */
+	dname = strrchr(nm, '/');
+	dname = dname ? dname + 1 : nm;
+	strcpy(sd->sys_name, "dev-");
+	strcpy(sd->sys_name+4, dname);
+
+	rv = sysfs_set_num(sra, sd, "offset", sd->data_offset);
+	rv |= sysfs_set_num(sra, sd, "size", (sd->component_size+1) / 2);
+	if (sra->array.level != LEVEL_CONTAINER) {
+		if (in_sync)
+			/* This can correctly fail if array isn't started,
+			 * yet, so just ignore status for now.
+			 */
+			sysfs_set_str(sra, sd, "state", "in_sync");
+		rv |= sysfs_set_num(sra, sd, "slot", sd->disk.raid_disk);
+	}
+	return rv;
+}
+
+#if 0
+int sysfs_disk_to_sg(int fd)
+{
+ /* from an open block device, try find and open its corresponding
+ * scsi_generic interface
+ */
+ struct stat st;
+ char path[256];
+ char sg_path[256];
+ char sg_major_minor[8];
+ char *c;
+ DIR *dir;
+ struct dirent *de;
+ int major, minor, rv;
+
+ if (fstat(fd, &st))
+ return -1;
+
+ snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/device",
+ major(st.st_rdev), minor(st.st_rdev));
+
+ dir = opendir(path);
+ if (!dir)
+ return -1;
+
+ de = readdir(dir);
+ while (de) {
+ if (strncmp("scsi_generic:", de->d_name,
+ strlen("scsi_generic:")) == 0)
+ break;
+ de = readdir(dir);
+ }
+ closedir(dir);
+
+ if (!de)
+ return -1;
+
+ snprintf(sg_path, sizeof(sg_path), "%s/%s/dev", path, de->d_name);
+ fd = open(sg_path, O_RDONLY);
+ if (fd < 0)
+ return fd;
+
+ rv = read(fd, sg_major_minor, sizeof(sg_major_minor));
+ close(fd);
+ if (rv < 0)
+ return -1;
+ else
+ sg_major_minor[rv - 1] = '\0';
+
+ c = strchr(sg_major_minor, ':');
+ *c = '\0';
+ c++;
+ major = strtol(sg_major_minor, NULL, 10);
+ minor = strtol(c, NULL, 10);
+ snprintf(path, sizeof(path), "/dev/.tmp.md.%d:%d:%d",
+ (int) getpid(), major, minor);
+ if (mknod(path, S_IFCHR|0600, makedev(major, minor))==0) {
+ fd = open(path, O_RDONLY);
+ unlink(path);
+ return fd;
+ }
+
+ return -1;
+}
+#endif
+
+/* From an open block device, try to retrieve its scsi_id.
+ *
+ * Looks in /sys/dev/block/<major>:<minor>/device for an entry named
+ * "scsi_disk:A:B:C:D" and packs the four numbers into *id as
+ * A<<24 | B<<16 | C<<8 | D.
+ * NOTE(review): the original code labels the fields host, channel,
+ * lun, id in that order -- confirm against the sysfs naming.
+ *
+ * Returns 0 on success, 1 on any failure; *id is only written on
+ * success.
+ */
+int sysfs_disk_to_scsi_id(int fd, __u32 *id)
+{
+	struct stat st;
+	char path[256];
+	char name[256];
+	char *field;
+	unsigned long part[4];
+	DIR *dir;
+	struct dirent *de;
+	int i;
+
+	if (fstat(fd, &st))
+		return 1;
+
+	snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/device",
+		 major(st.st_rdev), minor(st.st_rdev));
+
+	dir = opendir(path);
+	if (!dir)
+		return 1;
+
+	/* Copy the entry name: the dirent is not valid after closedir() */
+	name[0] = '\0';
+	while ((de = readdir(dir)) != NULL) {
+		if (strncmp("scsi_disk:", de->d_name,
+			    strlen("scsi_disk:")) == 0) {
+			snprintf(name, sizeof(name), "%s", de->d_name);
+			break;
+		}
+	}
+	closedir(dir);
+
+	if (name[0] == '\0')
+		return 1;
+
+	/* name starts with "scsi_disk:", so the first ':' always exists */
+	field = strchr(name, ':') + 1;
+	for (i = 0; i < 4; i++) {
+		/* strtol stops at the ':' that ends the field */
+		part[i] = strtol(field, NULL, 10);
+		if (i == 3)
+			break;
+		field = strchr(field, ':');
+		if (!field)
+			return 1;	/* fewer than four fields */
+		field++;
+	}
+
+	*id = (__u32)((part[0] << 24) | (part[1] << 16) |
+		      (part[2] << 8) | part[3]);
+
+	return 0;
+}
+
+
+/* Check that devnum is a holder of rdev, and is the only holder.
+ * We should be locked against races by an O_EXCL on devnum.
+ *
+ * Each entry of /sys/dev/block/<major>:<minor>/holders has its "dev"
+ * file read as "major:minor\n".  A holder whose major is not MD_MAJOR
+ * is mapped to the number -1-(minor>>6).
+ *
+ * Returns 1 when at least one holder exists and all of them are devnum.
+ * Returns 0 otherwise, with errno set to EEXIST when a different holder
+ * was found and to ENOENT when the directory or a "dev" file could not
+ * be opened, read or parsed.
+ */
+int sysfs_unique_holder(int devnum, long rdev)
+{
+	DIR *dir;
+	struct dirent *de;
+	char dirname[100];
+	char path[400];	/* dirname + '/' + d_name + "/dev" always fits */
+	int found = 0;
+	int failed = 0;
+	int err;
+
+	sprintf(dirname, "/sys/dev/block/%d:%d/holders",
+		major(rdev), minor(rdev));
+	dir = opendir(dirname);
+	errno = ENOENT;
+	if (!dir)
+		return 0;
+	while ((de = readdir(dir)) != NULL) {
+		char buf[32];
+		int n;
+		int mj, mn;
+		char c;
+		int fd;
+
+		if (de->d_ino == 0)
+			continue;
+		if (de->d_name[0] == '.')
+			continue;
+		/* bounded, instead of strcat() onto the 100-byte dirname */
+		n = snprintf(path, sizeof(path), "%s/%s/dev",
+			     dirname, de->d_name);
+		if (n < 0 || n >= (int)sizeof(path)) {
+			errno = ENOENT;
+			failed = 1;
+			break;
+		}
+		fd = open(path, O_RDONLY);
+		if (fd < 0) {
+			errno = ENOENT;
+			failed = 1;
+			break;
+		}
+		n = read(fd, buf, sizeof(buf)-1);
+		close(fd);
+		/* a failed read must not index buf[-1] */
+		if (n < 0) {
+			errno = ENOENT;
+			failed = 1;
+			break;
+		}
+		buf[n] = 0;
+		if (sscanf(buf, "%d:%d%c", &mj, &mn, &c) != 3 ||
+		    c != '\n') {
+			errno = ENOENT;
+			failed = 1;
+			break;
+		}
+		if (mj != MD_MAJOR)
+			mn = -1-(mn>>6);
+
+		if (devnum != mn) {
+			errno = EEXIST;
+			failed = 1;
+			break;
+		}
+		found = 1;
+	}
+	/* keep the errno chosen above across closedir() */
+	err = errno;
+	closedir(dir);
+	errno = err;
+	if (failed)
+		return 0;
+	else
+		return found;
+}
+
+#ifndef MDASSEMBLE
+
+static char *clean_states[] = {
+ "clear", "inactive", "readonly", "read-auto", "clean", NULL };
+
+/* Wait for the array behind dev to reach a clean state.
+ *
+ * Nothing is waited for (result 0) when the level is linear, multipath
+ * or 0, when the metadata is not a subarray according to
+ * is_subarray(), or when safe_mode_delay is 0.  Otherwise
+ * safe_mode_delay is temporarily set to 1ms, sync_action is set to
+ * "idle", and array_state is polled for up to 5 seconds until it
+ * matches one of clean_states; the monitor is then pinged and the
+ * original safe_mode_delay restored.
+ *
+ * Returns 0 on success (also when the sysfs attributes cannot be read),
+ * 1 if dev cannot be opened, the wait fails or times out, or the
+ * monitor ping fails.  Messages go to stderr when verbose is set.
+ */
+int WaitClean(char *dev, int verbose)
+{
+	int fd;
+	struct mdinfo *mdi;
+	int rv = 1;
+	int devnum;
+
+	fd = open(dev, O_RDONLY);
+	if (fd < 0) {
+		if (verbose)
+			fprintf(stderr, Name ": Couldn't open %s: %s\n", dev, strerror(errno));
+		return 1;
+	}
+
+	devnum = fd2devnum(fd);
+	mdi = sysfs_read(fd, devnum, GET_VERSION|GET_LEVEL|GET_SAFEMODE);
+	if (!mdi) {
+		if (verbose)
+			fprintf(stderr, Name ": Failed to read sysfs attributes for "
+				"%s\n", dev);
+		close(fd);
+		return 0;
+	}
+
+	switch(mdi->array.level) {
+	case LEVEL_LINEAR:
+	case LEVEL_MULTIPATH:
+	case 0:
+		/* safemode delay is irrelevant for these levels */
+		rv = 0;
+
+	}
+
+	/* for internal metadata the kernel handles the final clean
+	 * transition, containers can never be dirty
+	 */
+	if (!is_subarray(mdi->text_version))
+		rv = 0;
+
+	/* safemode disabled ? */
+	if (mdi->safe_mode_delay == 0)
+		rv = 0;
+
+	if (rv) {
+		int state_fd = sysfs_open(fd2devnum(fd), NULL, "array_state");
+		char buf[20];
+		fd_set fds;
+		struct timeval tm;
+
+		/* minimize the safe_mode_delay and prepare to wait up to 5s
+		 * for writes to quiesce
+		 */
+		sysfs_set_safemode(mdi, 1);
+		tm.tv_sec = 5;
+		tm.tv_usec = 0;
+
+		/* give mdmon a chance to checkpoint resync */
+		sysfs_set_str(mdi, NULL, "sync_action", "idle");
+
+		FD_ZERO(&fds);
+
+		/* wait for array_state to be clean */
+		while (1) {
+			/* leave room for a terminator so the string
+			 * comparison never runs past the data read
+			 */
+			rv = read(state_fd, buf, sizeof(buf) - 1);
+			if (rv < 0)
+				break;
+			buf[rv] = 0;
+			if (sysfs_match_word(buf, clean_states) <= 4)
+				break;
+			FD_SET(state_fd, &fds);
+			rv = select(state_fd + 1, NULL, NULL, &fds, &tm);
+			if (rv < 0 && errno != EINTR)
+				break;
+			if (rv == 0) {
+				/* Timed out.  Linux leaves tm at zero, so
+				 * looping again would spin; give up.
+				 */
+				rv = -1;
+				break;
+			}
+			lseek(state_fd, 0, SEEK_SET);
+		}
+		if (rv < 0)
+			rv = 1;
+		else if (ping_monitor(mdi->text_version) == 0) {
+			/* we need to ping to close the window between array
+			 * state transitioning to clean and the metadata being
+			 * marked clean
+			 */
+			rv = 0;
+		} else
+			rv = 1;
+		if (rv && verbose)
+			fprintf(stderr, Name ": Error waiting for %s to be clean\n",
+				dev);
+
+		/* restore the original safe_mode_delay */
+		sysfs_set_safemode(mdi, mdi->safe_mode_delay);
+		close(state_fd);
+	}
+
+	sysfs_free(mdi);
+	close(fd);
+
+	return rv;
+}
+#endif /* MDASSEMBLE */
mdsize11=19992
mdsize12=19988
+# ddf needs bigger devices as 32Meg is reserved!
+ddfsize=65536
+
cleanup() {
- $mdadm -Ss
- for d in 0 1 2 3 4 5 6 7
+ udevadm settle
+ $mdadm -Ssq
+ for d in 0 1 2 3 4 5 6 7 8 9 10 11 12
do
losetup -d /dev/loop$d ; # rm -f $targetdir/mdtest$d
done
trap cleanup 0 1 2 3 15
devlist=
-for d in 0 1 2 3 4 5 6 7
+for d in 0 1 2 3 4 5 6 7 8 9 10 11 12
do
- [ -f $targetdir/mdtest$d ] || dd if=/dev/zero of=$targetdir/mdtest$d count=$size bs=1K > /dev/null 2>&1
+ sz=$size
+ if [ $d -gt 7 ]; then sz=$ddfsize ; fi
+ [ -f $targetdir/mdtest$d ] || dd if=/dev/zero of=$targetdir/mdtest$d count=$sz bs=1K > /dev/null 2>&1
[ -b /dev/loop$d ] || mknod /dev/loop$d b 7 $d
if [ $d -eq 7 ]
then
eval dev$d=/dev/loop$d
eval file$d=$targetdir/mdtest$d
eval devlist=\"\$devlist \$dev$d\"
+ #" <-- add this quote to un-confuse vim syntax highlighting
done
path0=$dev6
path1=$dev7
+ulimit -c unlimited
+[ -f /proc/mdstat ] || modprobe md_mod
echo 2000 > /proc/sys/dev/raid/speed_limit_max
echo 0 > /sys/module/md_mod/parameters/start_ro
# mdadm always adds --quiet, and we want to see any unexpected messages
mdadm() {
rm -f $targetdir/stderr
+ case $* in
+ *-S* ) udevadm settle;;
+ esac
case $* in
*-C* ) $mdadm 2> $targetdir/stderr --quiet "$@" --auto=yes;;
* ) $mdadm 2> $targetdir/stderr --quiet "$@"
esac
+ rv=$?
cat >&2 $targetdir/stderr
+ return $rv
}
# check various things
dsize=$[dvsize/chunk]
dsize=$[dsize*chunk]
rasize=$[dsize*2*cnt]
+ if [ `/sbin/blockdev --getsize $dev` -eq 0 ]; then sleep 2 ; fi
if [ $rasize -ne `/sbin/blockdev --getsize $dev` ]
then
echo "ERROR: size is wrong for $dev: $cnt * $dvsize (chunk=$chunk) = $rasize, not `/sbin/blockdev --getsize $dev`"
fsck -fn $dev >&2
}
+# setup_environment FILE
+# If FILE exists, source it and call the setup_env function it is
+# expected to define.
+setup_environment() {
+	# source the file that was tested, not the global $environment
+	if [ -f "$1" ]; then
+		. "$1"
+		setup_env
+	fi
+}
+# reset_environment FILE
+# If FILE exists, call reset_env and then drop the setup_env/reset_env
+# definitions.
+reset_environment() {
+	# quoted so that an empty argument does not make the test succeed
+	if [ -f "$1" ]; then
+		reset_env
+		unset setup_env
+		unset reset_env
+	fi
+}
for script in tests/$prefix tests/$prefix*[^~]
do
if [ -f "$script" ]
then
rm -f $targetdir/stderr
+ # stop all arrays, just in case some script left an array active.
+ mdadm -Ssq
+ mdadm --zero $devlist 2> /dev/null
+ mdadm --zero $devlist 2> /dev/null
+ environment="tests/env-`basename $script`"
+ setup_environment $environment
# source script in a subshell, so it has access to our
# namespace, but cannot change it.
if ( set -ex ; . $script ) 2> $targetdir/log
then echo "$script succeeded"
else cat $targetdir/log ; cat $targetdir/stderr
echo "$script failed"
+ reset_environment $environment
exit 1
fi
+ reset_environment $environment
fi
done
exit 0
--- /dev/null
+
+# Check integrity of raid5 in degraded mode
+# Create a 4 disk raid5, write a tar archive to it and
+# sha1sum it with each device failed
+
+for layout in ls rs la ra
+do
+	mdadm -CR $md0 -l5 --layout $layout -n4 $dev0 $dev1 $dev2 $dev3
+	check wait
+	tar cf - /etc > $md0
+	sum=`sha1sum $md0`
+
+	for i in $dev0 $dev1 $dev2 $dev3
+	do
+		mdadm $md0 -f $i
+		mdadm $md0 -r $i
+		blockdev --flushbufs $md0
+		sum1=`sha1sum $md0`
+		# sha1sum prints "<hash>  <file>": quote both sides, otherwise
+		# [ sees too many words, errors out, and never reports a mismatch
+		if [ "$sum" != "$sum1" ]
+		then
+			echo $sum does not match $sum1 with $i missing
+			exit 1
+		fi
+		mdadm $md0 -a $i
+		check wait
+	done
+	mdadm -S $md0
+done
+
--- /dev/null
+
+# Check integrity of raid6 in degraded modes
+# Create a 5 disk raid6, dump some data to it, then
+# sha1sum it with different pairs of devices failed
+
+layouts='ls rs la ra'
+lv=`uname -r`
+# NOTE(review): expr compares non-integer operands as strings, so this is
+# not a true version comparison (e.g. 2.6.9 sorts after 2.6.30) -- confirm
+# that is acceptable for the kernels this test is run on.
+if expr $lv '>=' 2.6.30 > /dev/null
+then
+	layouts="$layouts parity-first dd-zero-restart ddf-N-restart ddf-N-continue \
+	left-asymmetric-6 right-asymmetric-6 left-symmetric-6 right-symmetric-6 parity-first-6"
+fi
+echo $layouts
+for layout in $layouts
+do
+	mdadm -CR $md0 -l6 --layout $layout -n5 $dev0 $dev1 $dev2 $dev3 $dev4
+	check wait
+	tar cf - /etc > $md0
+	sum=`sha1sum $md0`
+
+	totest=
+	for second in $dev0 $dev1 $dev2 $dev3 $dev4
+	do
+		mdadm $md0 -f $second
+		mdadm $md0 -r $second
+		blockdev --flushbufs $md0
+		sum1=`sha1sum $md0`
+		# sha1sum prints "<hash>  <file>": quote both sides, otherwise
+		# [ sees too many words, errors out, and never reports a mismatch
+		if [ "$sum" != "$sum1" ]
+		then
+			echo $sum does not match $sum1 with $second missing
+			exit 1
+		fi
+		for first in $totest
+		do
+			mdadm $md0 -f $first
+			mdadm $md0 -r $first
+			blockdev --flushbufs $md0
+			sum1=`sha1sum $md0`
+			if [ "$sum" != "$sum1" ]
+			then
+				echo $sum does not match $sum1 with $first and $second missing
+				exit 1
+			fi
+			mdadm $md0 -a $first
+			check wait
+		done
+		mdadm $md0 -a $second
+		check wait
+		totest="$totest $second"
+	done
+	mdadm -S $md0
+done
+
mdadm --assemble --scan --config=$conf $md2
$tst
mdadm -S $md2
+
+# Now use incremental assembly.
+mdadm -I --config=$conf $dev0
+mdadm -I --config=$conf $dev1
+mdadm -I --config=$conf $dev2
+$tst
+mdadm -S $md2
mdadm --assemble --scan --config=$conf $md1
check state U_U
eval $tst
+
+# And now assemble with -I
+mdadm -Ss
+mdadm -I -c $conf $dev0
+mdadm -I -c $conf $dev1
+mdadm -I -c $conf $dev2
+eval $tst
# create an array with a name
mdadm -CR $md0 -l0 -n2 --metadata=1 --name="Fred" $dev0 $dev1
-mdadm -E $dev0 | grep 'Name : Fred$' > /dev/null || exit 1
-mdadm -D $md0 | grep 'Name : Fred$' > /dev/null || exit 1
+mdadm -E $dev0 | grep 'Name : [^:]*:Fred ' > /dev/null || exit 1
+mdadm -D $md0 | grep 'Name : [^:]*:Fred ' > /dev/null || exit 1
mdadm -S $md0
mdadm -A $md0 --name="Fred" $devlist
--- /dev/null
+# create raid arrays with varying degrees of overlap
+mdadm -CR $container -e imsm -n 6 $dev0 $dev1 $dev2 $dev3 $dev4 $dev5
+imsm_check container 6
+
+size=1910
+level=1
+num_disks=2
+mdadm -CR $member0 $dev0 $dev1 -n $num_disks -l $level -z $size
+mdadm -CR $member1 $dev1 $dev2 -n $num_disks -l $level -z $size
+mdadm -CR $member2 $dev2 $dev3 -n $num_disks -l $level -z $size
+mdadm -CR $member3 $dev3 $dev4 -n $num_disks -l $level -z $size
+mdadm -CR $member4 $dev4 $dev5 -n $num_disks -l $level -z $size
+
+offset=0
+imsm_check member $member0 $num_disks $level $size $offset
+offset=$((offset+size+2048))
+imsm_check member $member1 $num_disks $level $size $offset
+offset=$((offset+size+2048))
+imsm_check member $member2 $num_disks $level $size $offset
+offset=$((offset+size+2048))
+imsm_check member $member3 $num_disks $level $size $offset
+# at this point there should be more freespace at the start of the disk
+# than the end
+offset=0
+imsm_check member $member4 $num_disks $level $size $offset
--- /dev/null
+# sanity check array creation
+
+num_disks=2
+mdadm -CR $container -e imsm -n $num_disks $dev0 $dev1
+imsm_check container $num_disks
+
+# RAID0 + RAID1
+size=10000
+level=0
+chunk=64
+offset=0
+mdadm -CR $member0 $dev0 $dev1 -n $num_disks -l $level -z $size -c $chunk
+imsm_check member $member0 $num_disks $level $size $offset $chunk
+testdev $member0 $num_disks $size $chunk
+
+offset=$(((size & ~(chunk - 1)) + 2048))
+size=5000
+level=1
+chunk=0
+mdadm -CR $member1 $dev0 $dev1 -n $num_disks -l $level -z $size
+imsm_check member $member1 $num_disks $level $size $offset $chunk
+testdev $member1 1 $size 1
+check wait
+
+mdadm -Ss
+
+# RAID10 + RAID5
+num_disks=4
+mdadm -CR $container -e imsm -n $num_disks $dev0 $dev1 $dev2 $dev3
+imsm_check container $num_disks
+
+size=10000
+level=10
+chunk=64
+offset=0
+mdadm -CR $member0 $dev0 $dev1 $dev2 $dev3 -n $num_disks -l $level -z $size -c $chunk
+imsm_check member $member0 $num_disks $level $size $offset $chunk
+testdev $member0 $((num_disks-2)) $size $chunk
+
+offset=$(((size & ~(chunk - 1)) + 2048))
+size=5000
+level=5
+mdadm -CR $member1 $dev0 $dev1 $dev2 $dev3 -n $num_disks -l $level -z $size -c $chunk
+imsm_check member $member1 $num_disks $level $size $offset $chunk
+testdev $member1 $((num_disks-1)) $size $chunk
+check wait
+
+# FAIL / REBUILD
+imsm_check_hold $container $dev0
+mdadm --fail $member0 $dev0
+mdadm --wait-clean --scan
+imsm_check_removal $container $dev0
+mdadm --add $container $dev4
+check wait
+imsm_check_hold $container $dev4
+
--- /dev/null
+#
+# Test basic DDF functionality.
+#
+# Create a container with 5 drives
+# create a small raid0 across them all, then a 2disk raid1
+# and a 3disk raid5 using the remaining space
+#
+# add some data, tear down the array, reassemble
+# and make sure it is still there.
+
+mdadm -CR /dev/md/ddf0 -e ddf -n 5 $dev8 $dev9 $dev10 $dev11 $dev12
+mdadm -CR r0 -l0 -n5 /dev/md/ddf0 -z 5000
+mdadm -CR r1 -l1 -n2 /dev/md/ddf0
+mdadm -CR r5 -l5 -n3 /dev/md/ddf0
+testdev /dev/md/r0 5 5000 64
+# r0 will use 4992 due to chunk size, so that leaves 27776 for the rest
+testdev /dev/md/r1 1 27776 1
+testdev /dev/md/r5 2 27776 64
+dd if=/dev/sda of=/dev/md/r0 || true
+dd if=/dev/sda of=/dev/md/r1 || true
+dd if=/dev/sda of=/dev/md/r5 || true
+
+s0=`sha1sum /dev/md/r0`
+s1=`sha1sum /dev/md/r1`
+s5=`sha1sum /dev/md/r5`
+
+
+mdadm -Ss
+mdadm -A /dev/md/ddf0 $dev8 $dev9 $dev10 $dev11 $dev12
+mdadm -I /dev/md/ddf0
+
+s0a=`sha1sum /dev/md/r0`
+s1a=`sha1sum /dev/md/r1`
+s5a=`sha1sum /dev/md/r5`
+
+if [ "$s0" != "$s0a" ]; then
+ echo r0 did not match ; exit 1;
+fi
+if [ "$s1" != "$s1a" ]; then
+ echo r1 did not match ; exit 1;
+fi
+if [ "$s5" != "$s5a" ]; then
+ echo r5 did not match ; exit 1;
+fi
+
+# failure status just means it has completed already, so ignore it.
+mdadm --wait /dev/md/r1 || true
+mdadm --wait /dev/md/r5 || true
+
+mdadm -Dbs > /var/tmp/mdadm.conf
+
+mdadm -Ss
+
+# Now try to assemble using mdadm.conf
+mdadm -Asc /var/tmp/mdadm.conf
+check nosync # This failed once. The raid5 was resyncing.
+
+mdadm -Dbs > /tmp/mdadm.conf
+diff /tmp/mdadm.conf /var/tmp/mdadm.conf
+mdadm -Ss
+
+# and now assemble fully incrementally.
+for i in $dev8 $dev9 $dev10 $dev11 $dev12
+do
+ #./mdadm -I $i -vv 2>&1 | wc -l > /tmp/cnt
+ ./mdadm -I $i 2> /tmp/thing
+ wc -l < /tmp/thing > /tmp/cnt
+ # should find container and 2 devices, so 3 lines.
+ [ `cat /tmp/cnt` -eq 3 ]
+done
+check nosync
+
+mdadm -Dbs > /tmp/mdadm.conf
+diff /tmp/mdadm.conf /var/tmp/mdadm.conf
+mdadm -Ss
+rm /tmp/mdadm.conf /var/tmp/mdadm.conf
--- /dev/null
+imsm_check() {
+	# Verify the state of an IMSM container or member array.
+	#   imsm_check container <num_disks>
+	#   imsm_check member <array> <num_disks> <level> <size> <offset>
+	# Any mismatch is reported and the test exits with status 1.
+	case $1 in
+	container )
+		# the expected block count is derived from the disk count
+		grep -s "$(((418 * $2)/2)) blocks super external:imsm" /proc/mdstat > /dev/null || {
+			echo >&2 "ERROR correctly formed container not found"; cat /proc/mdstat; exit 1;}
+		;;
+	member )
+		member=$2
+		num_disks=$3
+		level=$4
+		size=$5
+		offset=$6
+		err=0
+
+		# locate the array's sysfs directory through its device number
+		eval `stat -L -c "let major=0x%t; let minor=0x%T;" $member`
+		sysfs=/sys/dev/block/${major}:${minor}
+		if [ ! -f ${sysfs}/md/array_state ]; then
+			echo "member array $member not found" >&2
+			cat /proc/mdstat >&2
+			exit 1
+		fi
+		for i in `seq 0 $((num_disks-1))`
+		do
+			# the sysfs offset is halved before being compared
+			_offset=`cat ${sysfs}/md/rd${i}/offset`
+			if [ $offset -ne $((_offset/2)) ]; then
+				echo "offset mismatch expected $offset got $_offset" >&2
+				err=$((err+1))
+			fi
+			_size=`cat ${sysfs}/md/rd${i}/size`
+			if [ $size -ne $_size ]; then
+				# this used to be reported as an "offset mismatch"
+				echo "size mismatch expected $size got $_size" >&2
+				err=$((err+1))
+			fi
+		done
+
+		if [ $err -gt 0 ]; then
+			echo "$member failed check" >&2
+			cat /proc/mdstat >&2
+			mdadm -E /dev/loop0 >&2
+			exit 1
+		fi
+		;;
+	* ) echo >&2 ERROR unknown check $1 ; exit 1;
+	esac
+}
+
+setup_env() {
+	# Environment switches exported for the duration of the test.
+	export IMSM_DEVNAME_AS_SERIAL=1 IMSM_NO_PLATFORM=1
+
+	# Device names the imsm test scripts refer to.
+	container=/dev/md/container
+	member0=/dev/md/vol0 member1=/dev/md/vol1 member2=/dev/md/vol2
+	member3=/dev/md/vol3 member4=/dev/md/vol4
+}
+
+reset_env() {
+	# Drop everything this environment file introduced: the exported
+	# switches, the imsm_check helper and the device name variables.
+	unset IMSM_DEVNAME_AS_SERIAL IMSM_NO_PLATFORM
+	unset imsm_check
+	unset container member0 member1 member2 member3 member4
+}
--- /dev/null
+imsm_check_hold() {
+	# $1 = container, $2 = device.
+	# The removal attempt is expected to be refused; a refusal is the
+	# passing case and returns quietly.
+	mdadm --remove $1 $2 || return 0
+
+	# the removal went through: report and abort the test
+	echo "$2 removal from $1 should have been blocked" >&2
+	cat /proc/mdstat >&2
+	mdadm -E $2
+	exit 1
+}
+
+imsm_check_removal() {
+	# $1 = container, $2 = device.
+	# The removal is expected to succeed; success returns quietly.
+	mdadm --remove $1 $2 && return 0
+
+	# the removal was refused: report and abort the test
+	echo "$2 removal from $1 should have succeeded" >&2
+	cat /proc/mdstat >&2
+	mdadm -E $2
+	exit 1
+}
+
+# Verify the state of an IMSM container or member array.
+#   imsm_check container <num_disks>
+#   imsm_check member <array> <num_disks> <level> <size> <offset> <chunk>
+# Any mismatch is reported and the test exits with status 1.
+imsm_check() {
+ # presumably lets pending udev events finish before sysfs is inspected
+ udevadm settle
+ case $1 in
+ container )
+ # the expected block count is derived from the disk count in $2
+ grep -s "$(((418 * $2)/2)) blocks super external:imsm" /proc/mdstat > /dev/null || {
+ echo >&2 "ERROR correctly formed container not found"; cat /proc/mdstat; exit 1;}
+ ;;
+ member )
+ member=$2
+ num_disks=$3
+ level=$4
+ size=$5
+ offset=$6
+ chunk=$7
+ err=0
+
+ # For every level except 1 the expected size is masked down with
+ # ~(chunk - 1); for level 1 the expected chunk is forced to 64.
+ if [ $level -ne 1 ]; then
+ size=$((size & ~(chunk - 1)))
+ else
+ chunk=64
+ fi
+ # locate the array's sysfs directory through its device number
+ eval `stat -L -c "let major=0x%t; let minor=0x%T;" $member`
+ sysfs=/sys/dev/block/${major}:${minor}
+ if [ ! -f ${sysfs}/md/array_state ]; then
+ echo "member array $member not found" >&2
+ cat /proc/mdstat >&2
+ exit 1
+ fi
+ # the sysfs chunk_size is divided by 1024 before being compared
+ _chunk=`cat ${sysfs}/md/chunk_size`
+ if [ $chunk -ne $((_chunk/1024)) ]; then
+ echo "chunk mismatch expected $chunk got $_chunk" >&2
+ err=$((err+1))
+ fi
+ # check offset and size of every raid disk slot rd0..rd(num_disks-1)
+ for i in `seq 0 $((num_disks-1))`
+ do
+ # the sysfs offset is halved before being compared
+ _offset=`cat ${sysfs}/md/rd${i}/offset`
+ if [ $offset -ne $((_offset/2)) ]; then
+ echo "offset mismatch expected $offset got $_offset" >&2
+ err=$((err+1))
+ fi
+ _size=`cat ${sysfs}/md/rd${i}/size`
+ if [ $size -ne $_size ]; then
+ echo "size mismatch expected $size got $_size" >&2
+ err=$((err+1))
+ fi
+ done
+
+ # all mismatches have been printed above; now dump state and fail
+ if [ $err -gt 0 ]; then
+ echo "$member failed check" >&2
+ cat /proc/mdstat >&2
+ mdadm -E /dev/loop0 >&2
+ exit 1
+ fi
+ ;;
+ * ) echo >&2 ERROR unknown check $1 ; exit 1;
+ esac
+}
+
+setup_env() {
+	# Environment switches exported for the duration of the test.
+	export IMSM_DEVNAME_AS_SERIAL=1 IMSM_TEST_OROM=1
+
+	# Device names the imsm test scripts refer to.
+	container=/dev/md/container
+	member0=/dev/md/vol0 member1=/dev/md/vol1 member2=/dev/md/vol2
+	member3=/dev/md/vol3 member4=/dev/md/vol4
+}
+
+reset_env() {
+	# Drop everything this environment file introduced: the exported
+	# switches, the imsm_check helper and the device name variables.
+	unset IMSM_DEVNAME_AS_SERIAL IMSM_TEST_OROM
+	unset imsm_check
+	unset container member0 member1 member2 member3 member4
+}
--- /dev/null
+# do not edit this file, it will be overwritten on update
+
+SUBSYSTEM!="block", GOTO="md_end"
+ACTION!="add|change", GOTO="md_end"
+ACTION=="change", GOTO="md_no_incr"
+
+# import data from a raid member and activate it
+#ENV{ID_FS_TYPE}=="linux_raid_member", IMPORT{program}="/sbin/mdadm --examine --export $tempnode", RUN+="/sbin/mdadm --incremental $env{DEVNAME}"
+# import data from a raid set
+LABEL="md_no_incr"
+KERNEL!="md*", GOTO="md_end"
+
+# partitions have no md/{array_state,metadata_version}, but should not
+# for that reason be ignored.
+ENV{DEVTYPE}=="partition", GOTO="md_ignore_state"
+
+# container devices have a metadata version of e.g. 'external:ddf' and
+# never leave state 'inactive'
+ATTR{md/metadata_version}=="external:[A-Za-z]*", ATTR{md/array_state}=="inactive", GOTO="md_ignore_state"
+TEST!="md/array_state", GOTO="md_end"
+ATTR{md/array_state}=="|clear|inactive", GOTO="md_end"
+LABEL="md_ignore_state"
+
+IMPORT{program}="/sbin/mdadm --detail --export $tempnode"
+ENV{DEVTYPE}=="disk", ENV{MD_NAME}=="?*", SYMLINK+="disk/by-id/md-name-$env{MD_NAME}", OPTIONS+="string_escape=replace"
+ENV{DEVTYPE}=="disk", ENV{MD_UUID}=="?*", SYMLINK+="disk/by-id/md-uuid-$env{MD_UUID}"
+ENV{DEVTYPE}=="disk", ENV{MD_DEVNAME}=="?*", SYMLINK+="md/$env{MD_DEVNAME}"
+ENV{DEVTYPE}=="partition", ENV{MD_NAME}=="?*", SYMLINK+="disk/by-id/md-name-$env{MD_NAME}-part%n", OPTIONS+="string_escape=replace"
+ENV{DEVTYPE}=="partition", ENV{MD_UUID}=="?*", SYMLINK+="disk/by-id/md-uuid-$env{MD_UUID}-part%n"
+ENV{DEVTYPE}=="partition", ENV{MD_DEVNAME}=="*[^0-9]", SYMLINK+="md/$env{MD_DEVNAME}%n"
+ENV{DEVTYPE}=="partition", ENV{MD_DEVNAME}=="*[0-9]", SYMLINK+="md/$env{MD_DEVNAME}p%n"
+
+IMPORT{program}="vol_id --export $tempnode"
+OPTIONS+="link_priority=100"
+ENV{ID_FS_USAGE}=="filesystem|other|crypto", ENV{ID_FS_UUID_ENC}=="?*", SYMLINK+="disk/by-uuid/$env{ID_FS_UUID_ENC}"
+ENV{ID_FS_USAGE}=="filesystem|other", ENV{ID_FS_LABEL_ENC}=="?*", SYMLINK+="disk/by-label/$env{ID_FS_LABEL_ENC}"
+
+LABEL="md_end"
/*
* mdadm - manage Linux "md" devices aka RAID arrays.
*
- * Copyright (C) 2001-2006 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
*
*
* This program is free software; you can redistribute it and/or modify
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
* Author: Neil Brown
- * Email: <neilb@cse.unsw.edu.au>
- * Paper: Neil Brown
- * School of Computer Science and Engineering
- * The University of New South Wales
- * Sydney, 2052
- * Australia
+ * Email: <neilb@suse.de>
*/
#include "mdadm.h"
#include "md_p.h"
+#include <sys/socket.h>
#include <sys/utsname.h>
+#include <sys/wait.h>
+#include <sys/un.h>
#include <ctype.h>
+#include <dirent.h>
+#include <signal.h>
/*
* following taken from linux/blkpg.h because they aren't
}
}
+const int uuid_match_any[4] = { ~0, ~0, ~0, ~0 };
int same_uuid(int a[4], int b[4], int swapuuid)
{
+ if (memcmp(a, uuid_match_any, sizeof(int[4])) == 0 ||
+ memcmp(b, uuid_match_any, sizeof(int[4])) == 0)
+ return 1;
+
if (swapuuid) {
/* parse uuids are hostendian.
* uuid's from some superblocks are big-ending
memcpy(a, b, 16);
}
+/* Format the UUID held in 'info' into 'buf' as the text "UUID-" followed by
+ * four groups of eight lower-case hex digits, the groups separated by 'sep'.
+ * The UUID is first passed through copy_uuid() with the metadata handler's
+ * swapuuid setting.  'buf' must have room for 5 + 4*8 + 3 + 1 = 41 bytes.
+ * Returns 'buf'.
+ */
+char *fname_from_uuid(struct supertype *st, struct mdinfo *info, char *buf, char sep)
+{
+	int i, j;
+	char uuid[16];
+	char *c = buf;
+
+	strcpy(c, "UUID-");
+	c += strlen(c);
+	copy_uuid(uuid, info->uuid, st->ss->swapuuid);
+	for (i = 0; i < 4; i++) {
+		if (i)
+			*c++ = sep;
+		/* the bytes of each 4-byte group are printed from index 3
+		 * down to index 0
+		 */
+		for (j = 3; j >= 0; j--) {
+			sprintf(c,"%02x", (unsigned char) uuid[j+4*i]);
+			c+= 2;
+		}
+	}
+	return buf;
+}
+
#ifndef MDASSEMBLE
int check_ext2(int fd, char *name)
{
/* tests if dev is a "standard" md dev name.
* i.e if the last component is "/dNN" or "/mdNN",
* where NN is a string of digits
+ * Returns 1 if a partitionable standard,
+ * -1 if non-partitionable,
+ * 0 if not a standard name.
*/
char *d = strrchr(dev, '/');
int type=0;
/*
* Find a block device with the right major/minor number.
* If we find multiple names, choose the shortest.
- * If we find a non-standard name, it is probably there
- * deliberately so prefer it over a standard name.
+ * If we find a name in /dev/md/, we prefer that.
* This applies only to names for MD devices.
*/
char *map_dev(int major, int minor, int create)
{
struct devmap *p;
- char *std = NULL, *nonstd=NULL;
+ char *regular = NULL, *preferred=NULL;
int did_check = 0;
if (major == 0 && minor == 0)
for (p=devlist; p; p=p->next)
if (p->major == major &&
p->minor == minor) {
- if (is_standard(p->name, NULL)) {
- if (std == NULL ||
- strlen(p->name) < strlen(std))
- std = p->name;
+ if (strncmp(p->name, "/dev/md/",8) == 0) {
+ if (preferred == NULL ||
+ strlen(p->name) < strlen(preferred))
+ preferred = p->name;
} else {
- if (nonstd == NULL ||
- strlen(p->name) < strlen(nonstd))
- nonstd = p->name;
+ if (regular == NULL ||
+ strlen(p->name) < strlen(regular))
+ regular = p->name;
}
}
- if (!std && !nonstd && !did_check) {
+ if (!regular && !preferred && !did_check) {
devlist_ready = 0;
goto retry;
}
- if (create && !std && !nonstd) {
+ if (create && !regular && !preferred) {
static char buf[30];
snprintf(buf, sizeof(buf), "%d:%d", major, minor);
- nonstd = buf;
+ regular = buf;
}
- return nonstd ? nonstd : std;
+ return preferred ? preferred : regular;
}
unsigned long calc_csum(void *super, int bytes)
}
#endif
+/* Compute the usable size of an array from the size of one member device.
+ *
+ * level, raid_disks, layout: array geometry; for RAID10 the low byte of
+ *	'layout' and the next byte up are the two copy counts.
+ * chunksize: chunk size in bytes (0 when the level has no chunks, e.g. RAID1).
+ * devsize: per-device size; it is masked down to a multiple of
+ *	(chunksize >> 9) and the result is expressed in the same unit.
+ *
+ * Returns data_disks * rounded devsize; 0 for levels not listed below or
+ * for a RAID10 layout with a zero copy count.
+ */
+unsigned long long calc_array_size(int level, int raid_disks, int layout,
+				   int chunksize, unsigned long long devsize)
+{
+	int data_disks = 0;
+	int near_copies, far_copies;
+	unsigned long long chunk_units = 0;
+
+	switch (level) {
+	case 0: data_disks = raid_disks; break;
+	case 1: data_disks = 1; break;
+	case 4:
+	case 5: data_disks = raid_disks - 1; break;
+	case 6: data_disks = raid_disks - 2; break;
+	case 10:
+		near_copies = layout & 255;
+		far_copies = (layout>>8) & 255;
+		/* a malformed layout must not cause a division by zero */
+		if (near_copies && far_copies)
+			data_disks = raid_disks / near_copies / far_copies;
+		break;
+	default:
+		break;
+	}
+
+	/* Only round when there really is a chunk larger than one unit.
+	 * With chunksize == 0 the old mask ~((0>>9)-1) evaluated to 0 and
+	 * wiped devsize, so every RAID1 array was reported as size 0.
+	 */
+	if (chunksize > 0)
+		chunk_units = (unsigned long long)chunksize >> 9;
+	if (chunk_units > 1)
+		devsize &= ~(chunk_units - 1);
+	return data_disks * devsize;
+}
+
int get_mdp_major(void)
{
static int mdp_major = -1;
return mdp_major;
}
-
#if !defined(MDASSEMBLE) || defined(MDASSEMBLE) && defined(MDASSEMBLE_AUTO)
-
char *get_md_name(int dev)
{
/* find /dev/md%d or /dev/md/%d or make a device /dev/.tmp.md%d */
unlink(name);
}
-static int dev2major(int d)
-{
- if (d >= 0)
- return MD_MAJOR;
- else
- return get_mdp_major();
-}
-
-static int dev2minor(int d)
-{
- if (d >= 0)
- return d;
- return (-1-d) << MdpMinorShift;
-}
-
int find_free_devnum(int use_partitions)
{
int devnum;
for (devnum = 127; devnum != 128;
- devnum = devnum ? devnum-1 : (1<<22)-1) {
+ devnum = devnum ? devnum-1 : (1<<20)-1) {
char *dn;
int _devnum;
if (e > dev && *e == ':' && e[1] &&
(minor = strtoul(e+1, &e, 0)) >= 0 &&
*e == 0) {
- snprintf(devname, sizeof(devname), "/dev/.tmp.md.%d:%d", major, minor);
+ snprintf(devname, sizeof(devname), "/dev/.tmp.md.%d:%d:%d",
+ (int)getpid(), major, minor);
if (mknod(devname, S_IFBLK|0600, makedev(major, minor))==0) {
- fd = open(devname, flags);
+ fd = open(devname, flags|O_DIRECT);
unlink(devname);
}
} else
- fd = open(dev, flags);
+ fd = open(dev, flags|O_DIRECT);
return fd;
}
-struct superswitch *superlist[] = { &super0, &super1, NULL };
+/* Open the md device with number 'devnum' read-write.  The device is named
+ * to dev_open() as "major:minor"; dev_open()'s result is returned as is.
+ */
+int open_dev(int devnum)
+{
+	char devspec[20];
+	int maj = dev2major(devnum);
+	int min = dev2minor(devnum);
+
+	sprintf(devspec, "%d:%d", maj, min);
+	return dev_open(devspec, O_RDWR);
+}
+
+/* Open the md device with number 'devnum' read-write with O_EXCL.
+ * While the open fails with EBUSY it is retried, up to 25 attempts with a
+ * 200ms pause after each.  Any other outcome (success, or a different
+ * error) is returned immediately; -1 is returned once the attempts run out.
+ */
+int open_dev_excl(int devnum)
+{
+	char devspec[20];
+	int attempts_left = 25;
+
+	sprintf(devspec, "%d:%d", dev2major(devnum), dev2minor(devnum));
+	while (attempts_left-- > 0) {
+		int fd = dev_open(devspec, O_RDWR|O_EXCL);
+
+		if (fd >= 0 || errno != EBUSY)
+			return fd;
+		usleep(200000);
+	}
+	return -1;
+}
+
+/* Return non-zero when 'one' and 'two' both name block devices with the
+ * same device number; 0 if either cannot be stat()ed or is not a block
+ * device.
+ */
+int same_dev(char *one, char *two)
+{
+	struct stat first, second;
+
+	if (stat(one, &first) != 0 || stat(two, &second) != 0)
+		return 0;
+	if ((first.st_mode & S_IFMT) != S_IFBLK ||
+	    (second.st_mode & S_IFMT) != S_IFBLK)
+		return 0;
+	return first.st_rdev == second.st_rdev;
+}
+
+/* Wait until the path 'dev' names the same block device as the open
+ * descriptor 'fd'.  Returns at once if 'fd' cannot be fstat()ed or is not a
+ * block device; otherwise polls up to 25 times, sleeping 200ms after each
+ * miss, and gives up silently after that.
+ */
+void wait_for(char *dev, int fd)
+{
+	struct stat wanted;
+	int attempt;
+
+	if (fstat(fd, &wanted) != 0)
+		return;
+	if ((wanted.st_mode & S_IFMT) != S_IFBLK)
+		return;
+
+	for (attempt = 0; attempt < 25; attempt++) {
+		struct stat found;
+		int matches = stat(dev, &found) == 0 &&
+			(found.st_mode & S_IFMT) == S_IFBLK &&
+			found.st_rdev == wanted.st_rdev;
+
+		if (matches)
+			return;
+		usleep(200000);
+	}
+}
+
+struct superswitch *superlist[] = { &super0, &super1, &super_ddf, &super_imsm, NULL };
#if !defined(MDASSEMBLE) || defined(MDASSEMBLE) && defined(MDASSEMBLE_AUTO)
+
struct supertype *super_by_fd(int fd)
{
mdu_array_info_t array;
char *verstr;
char version[20];
int i;
+ char *subarray = NULL;
sra = sysfs_read(fd, 0, GET_VERSION);
sprintf(version, "%d.%d", vers, minor);
verstr = version;
}
+ if (minor == -2 && is_subarray(verstr)) {
+ char *dev = verstr+1;
+ subarray = strchr(dev, '/');
+ int devnum;
+ if (subarray)
+ *subarray++ = '\0';
+ devnum = devname2devnum(dev);
+ subarray = strdup(subarray);
+ if (sra)
+ sysfs_free(sra);
+ sra = sysfs_read(-1, devnum, GET_VERSION);
+ if (sra && sra->text_version[0])
+ verstr = sra->text_version;
+ else
+ verstr = "-no-metadata-";
+ }
+
for (i = 0; st == NULL && superlist[i] ; i++)
st = superlist[i]->match_metadata_desc(verstr);
if (sra)
sysfs_free(sra);
- if (st)
+ if (st) {
st->sb = NULL;
+ if (subarray) {
+ strncpy(st->subarray, subarray, 32);
+ st->subarray[31] = 0;
+ free(subarray);
+ } else
+ st->subarray[0] = 0;
+ }
return st;
}
#endif /* !defined(MDASSEMBLE) || defined(MDASSEMBLE) && defined(MDASSEMBLE_AUTO) */
-struct supertype *dup_super(struct supertype *st)
+struct supertype *dup_super(struct supertype *orig)
{
- struct supertype *stnew = NULL;
- char *verstr = NULL;
- char version[20];
- int i;
+ struct supertype *st;
+ if (!orig)
+ return orig;
+ st = malloc(sizeof(*st));
if (!st)
return st;
-
- if (st->minor_version == -1)
- sprintf(version, "%d", st->ss->major);
- else
- sprintf(version, "%d.%d", st->ss->major, st->minor_version);
- verstr = version;
-
- for (i = 0; stnew == NULL && superlist[i] ; i++)
- stnew = superlist[i]->match_metadata_desc(verstr);
-
- if (stnew)
- stnew->sb = NULL;
- return stnew;
+ memset(st, 0, sizeof(*st));
+ st->ss = orig->ss;
+ st->max_devs = orig->max_devs;
+ st->minor_version = orig->minor_version;
+ strcpy(st->subarray, orig->subarray);
+ st->sb = NULL;
+ st->info = NULL;
+ return st;
}
struct supertype *guess_super(int fd)
int i;
st = malloc(sizeof(*st));
- memset(st, 0, sizeof(*st));
for (i=0 ; superlist[i]; i++) {
int rv;
ss = superlist[i];
- st->ss = NULL;
+ memset(st, 0, sizeof(*st));
rv = ss->load_super(st, fd, NULL);
if (rv == 0) {
struct mdinfo info;
}
if (bestsuper != -1) {
int rv;
- st->ss = NULL;
+ memset(st, 0, sizeof(*st));
rv = superlist[bestsuper]->load_super(st, fd, NULL);
if (rv == 0) {
superlist[bestsuper]->free_super(st);
return;
}
+/* 'fd' is a block device.  Find out if it is in use by a container, and
+ * return an open fd on that container.
+ *
+ * The holders of the device are listed from
+ * /sys/dev/block/MAJOR:MINOR/holders; each holder's "dev" attribute is read
+ * to obtain its device number and the first holder that dev_open() can open
+ * read-only is returned.  Returns -1 if there is no such holder or on any
+ * error.
+ */
+int open_container(int fd)
+{
+	char path[256];
+	char *e;
+	size_t room;
+	DIR *dir;
+	struct dirent *de;
+	int dfd, n;
+	char buf[200];
+	int major, minor;
+	struct stat st;
+
+	if (fstat(fd, &st) != 0)
+		return -1;
+	sprintf(path, "/sys/dev/block/%d:%d/holders",
+		(int)major(st.st_rdev), (int)minor(st.st_rdev));
+	e = path + strlen(path);
+	room = sizeof(path) - (e - path);
+
+	dir = opendir(path);
+	if (!dir)
+		return -1;
+	while ((de = readdir(dir))) {
+		if (de->d_ino == 0)
+			continue;
+		if (de->d_name[0] == '.')
+			continue;
+		/* d_name can be long enough to overflow what is left of
+		 * 'path', so bound the write and skip entries that do not fit
+		 * (a negative return also lands here after the cast).
+		 */
+		if ((size_t)snprintf(e, room, "/%s/dev", de->d_name) >= room)
+			continue;
+		dfd = open(path, O_RDONLY);
+		if (dfd < 0)
+			continue;
+		n = read(dfd, buf, sizeof(buf));
+		close(dfd);
+		/* need room for the terminator written below */
+		if (n <= 0 || n >= sizeof(buf))
+			continue;
+		buf[n] = 0;
+		if (sscanf(buf, "%d:%d", &major, &minor) != 2)
+			continue;
+		/* normalise to the "major:minor" form handed to dev_open() */
+		sprintf(buf, "%d:%d", major, minor);
+		dfd = dev_open(buf, O_RDONLY);
+		if (dfd >= 0) {
+			closedir(dir);
+			return dfd;
+		}
+	}
+	closedir(dir);
+	return -1;
+}
+
+/* Add the device described by 'info' to an array.
+ * When built without MDASSEMBLE and the metadata handler is marked
+ * external, this goes through sysfs_add_disk() on 'sra'; otherwise the
+ * ADD_NEW_DISK ioctl is issued on 'mdfd'.
+ * Returns the result of whichever of the two calls was made.
+ */
+int add_disk(int mdfd, struct supertype *st,
+ struct mdinfo *sra, struct mdinfo *info)
+{
+ /* Add a device to an array, in one of 2 ways. */
+ int rv;
+#ifndef MDASSEMBLE
+ if (st->ss->external) {
+ rv = sysfs_add_disk(sra, info,
+ info->disk.state & (1<<MD_DISK_SYNC));
+ if (! rv) {
+ /* On success make sure 'info' is represented on sra->devs:
+  * if it is not already a member of that list, a copy of it
+  * is pushed onto the front.
+  */
+ struct mdinfo *sd2;
+ for (sd2 = sra->devs; sd2; sd2=sd2->next)
+ if (sd2 == info)
+ break;
+ if (sd2 == NULL) {
+ /* NOTE(review): the malloc result is dereferenced unchecked */
+ sd2 = malloc(sizeof(*sd2));
+ *sd2 = *info;
+ sd2->next = sra->devs;
+ sra->devs = sd2;
+ }
+ }
+ } else
+#endif
+ /* the body of the 'else' above, or the only statement when
+  * MDASSEMBLE is defined
+  */
+ rv = ioctl(mdfd, ADD_NEW_DISK, &info->disk);
+ return rv;
+}
+
+/* Tell the kernel about a new array.
+ * Three paths, chosen in this order:
+ *  - external metadata (not built for MDASSEMBLE): sysfs_set_array();
+ *  - md driver version with (vers % 100) >= 1: SET_ARRAY_INFO with only the
+ *    metadata major/minor version filled in;
+ *  - otherwise: SET_ARRAY_INFO with a NULL argument.
+ * Returns the result of the call that was made.
+ */
+int set_array_info(int mdfd, struct supertype *st, struct mdinfo *info)
+{
+ /* Initialise kernel's knowledge of array.
+ * This varies between externally managed arrays
+ * and older kernels
+ */
+ int vers = md_get_version(mdfd);
+ int rv;
+
+#ifndef MDASSEMBLE
+ if (st->ss->external)
+ rv = sysfs_set_array(info, vers);
+ else
+#endif
+ /* body of the 'else' above, or the first statement when MDASSEMBLE
+  * is defined
+  */
+ if ((vers % 100) >= 1) { /* can use different versions */
+ mdu_array_info_t inf;
+ /* every field other than the two version numbers is left zero */
+ memset(&inf, 0, sizeof(inf));
+ inf.major_version = info->array.major_version;
+ inf.minor_version = info->array.minor_version;
+ rv = ioctl(mdfd, SET_ARRAY_INFO, &inf);
+ } else
+ rv = ioctl(mdfd, SET_ARRAY_INFO, NULL);
+ return rv;
+}
+
+/* Convert an md device number to its kernel name.
+ * Numbers >= 0 map to "mdN"; negative numbers map to "md_dM" with
+ * M = -1-num.  This is the inverse of devname2devnum().
+ * Returns a strdup()ed string which the caller must free, or NULL if the
+ * allocation fails.
+ */
+char *devnum2devname(int num)
+{
+	char name[100];
+
+	/* 0 is a valid non-negative device number ("md0"); testing for
+	 * 'num > 0' sent it down the other branch and produced "md_d-1".
+	 */
+	if (num >= 0)
+		snprintf(name, sizeof(name), "md%d", num);
+	else
+		snprintf(name, sizeof(name), "md_d%d", -1-num);
+	return strdup(name);
+}
+
+/* Convert a kernel md device name to its device number.
+ * A name starting with "md_d" yields -1-M where M is the number following
+ * that prefix; for any other name the number is parsed starting at the
+ * third character (i.e. after an assumed "md" prefix).
+ */
+int devname2devnum(char *name)
+{
+	char *end;
+
+	if (strncmp(name, "md_d", 4) != 0)
+		return strtoul(name+2, &end, 10);
+	return -1-strtoul(name+4, &end, 10);
+}
+
+/* Work out the md device number for the block device described by 'st'.
+ * Returns the minor number for devices on MD_MAJOR, -1-(minor >>
+ * MdpMinorShift) for devices on the mdp major, the number of the owning
+ * array for an extended-minor partition, and NoMdDev for anything else.
+ */
+int stat2devnum(struct stat *st)
+{
+	char path[64];
+	char link[200];
+	char *cp;
+	int n;
+
+	if ((S_IFMT & st->st_mode) == S_IFBLK) {
+		if (major(st->st_rdev) == MD_MAJOR)
+			return minor(st->st_rdev);
+		else if (major(st->st_rdev) == get_mdp_major())
+			return -1- (minor(st->st_rdev)>>MdpMinorShift);
+
+		/* must be an extended-minor partition. Look at the
+		 * /sys/dev/block/%d:%d link which must look like
+		 * ../../block/mdXXX/mdXXXpYY
+		 */
+		/* sized and bounded for two full-width ints; the previous
+		 * 30-byte buffer with sprintf() was not
+		 */
+		snprintf(path, sizeof(path), "/sys/dev/block/%d:%d",
+			 (int)major(st->st_rdev), (int)minor(st->st_rdev));
+		n = readlink(path, link, sizeof(link)-1);
+		if (n <= 0)
+			return NoMdDev;
+		link[n] = 0;
+		/* drop the trailing "/mdXXXpYY" component ... */
+		cp = strrchr(link, '/');
+		if (cp) *cp = 0;
+		/* ... so that the last component is now "/mdXXX".  This must
+		 * search from the end: the first '/' in the link is the one
+		 * after the leading "..", which can never match "/md".
+		 */
+		cp = strrchr(link, '/');
+		if (cp && strncmp(cp, "/md", 3) == 0)
+			return devname2devnum(cp+1);
+	}
+	return NoMdDev;
+
+}
+
+/* Return the md device number of the open descriptor 'fd', as computed by
+ * stat2devnum(), or NoMdDev if 'fd' cannot be fstat()ed.
+ */
+int fd2devnum(int fd)
+{
+	struct stat sb;
+
+	if (fstat(fd, &sb) != 0)
+		return NoMdDev;
+	return stat2devnum(&sb);
+}
+
+/* Report whether an mdmon is running for md device 'devnum'.
+ * The pid is read from /var/run/mdadm/<devname>.pid and probed with signal
+ * 0.  Returns 1 if that probe succeeds, 0 in every other case (no pid
+ * file, unreadable or invalid contents, probe failed).
+ */
+int mdmon_running(int devnum)
+{
+	char path[100];
+	char pid[10];
+	char *name;
+	int fd;
+	int n;
+
+	/* devnum2devname() returns allocated memory; release it as soon as
+	 * the path has been built
+	 */
+	name = devnum2devname(devnum);
+	if (!name)
+		return 0;
+	snprintf(path, sizeof(path), "/var/run/mdadm/%s.pid", name);
+	free(name);
+	fd = open(path, O_RDONLY, 0);
+
+	if (fd < 0)
+		return 0;
+	n = read(fd, pid, sizeof(pid)-1);
+	close(fd);
+	if (n <= 0)
+		return 0;
+	/* read() does not terminate the buffer, atoi() needs it */
+	pid[n] = 0;
+	/* pid 0 or a negative pid would address process groups, not mdmon */
+	if (atoi(pid) <= 0)
+		return 0;
+	if (kill(atoi(pid), 0) == 0)
+		return 1;
+	return 0;
+}
+
+/* Send SIGUSR1 to the mdmon handling md device 'devnum'.
+ * The pid is read from /var/run/mdadm/<devname>.pid.  Returns 1 if the
+ * signal was delivered, 0 in every other case (no pid file, unreadable or
+ * invalid contents, kill() failed).
+ */
+int signal_mdmon(int devnum)
+{
+	char path[100];
+	char pid[10];
+	char *name;
+	int fd;
+	int n;
+
+	/* devnum2devname() returns allocated memory; release it as soon as
+	 * the path has been built
+	 */
+	name = devnum2devname(devnum);
+	if (!name)
+		return 0;
+	snprintf(path, sizeof(path), "/var/run/mdadm/%s.pid", name);
+	free(name);
+	fd = open(path, O_RDONLY, 0);
+
+	if (fd < 0)
+		return 0;
+	n = read(fd, pid, sizeof(pid)-1);
+	close(fd);
+	if (n <= 0)
+		return 0;
+	/* read() does not terminate the buffer, atoi() needs it */
+	pid[n] = 0;
+	/* never signal pid 0 (our own process group) or a negative pid
+	 * because of an empty or corrupt pid file
+	 */
+	if (atoi(pid) <= 0)
+		return 0;
+	if (kill(atoi(pid), SIGUSR1) == 0)
+		return 1;
+	return 0;
+}
+
+/* Start an mdmon for md device 'devnum'.
+ * Nothing is done (and 0 returned) when check_env("MDADM_NO_MDMON") is
+ * true.  Otherwise a child is forked which tries, in order, an "mdmon" in
+ * the same directory as the running executable, "/sbin/mdmon", and "mdmon"
+ * relative to the current directory, passing the device name as its only
+ * argument.
+ * Returns 0 if the child exited with status 0, -1 if fork or wait failed or
+ * the child reported failure.
+ */
+int start_mdmon(int devnum)
+{
+	int i;
+	int len;
+	pid_t pid;
+	int status;
+	char pathbuf[1024];
+	char *paths[4] = {
+		pathbuf,
+		"/sbin/mdmon",
+		"mdmon",
+		NULL
+	};
+
+	if (check_env("MDADM_NO_MDMON"))
+		return 0;
+
+	/* readlink() does not terminate its result: leave room for the NUL */
+	len = readlink("/proc/self/exe", pathbuf, sizeof(pathbuf) - 1);
+	if (len > 0) {
+		char *sl;
+		pathbuf[len] = 0;
+		sl = strrchr(pathbuf, '/');
+		if (sl)
+			sl++;
+		else
+			sl = pathbuf;
+		/* replace the last component, but only if "mdmon" fits */
+		if (sl + sizeof("mdmon") <= pathbuf + sizeof(pathbuf))
+			strcpy(sl, "mdmon");
+		else
+			pathbuf[0] = '\0';
+	} else
+		pathbuf[0] = '\0';
+
+	pid = fork();
+	switch(pid) {
+	case 0:
+		/* FIXME yuk. CLOSE_EXEC?? */
+		for (i=3; i < 100; i++)
+			close(i);
+		/* execl() only returns on failure, so fall through to the
+		 * next candidate path
+		 */
+		for (i=0; paths[i]; i++)
+			if (paths[i][0])
+				execl(paths[i], "mdmon",
+				      devnum2devname(devnum),
+				      NULL);
+		exit(1);
+	case -1: fprintf(stderr, Name ": cannot run mdmon. "
+			 "Array remains readonly\n");
+		return -1;
+	default: /* parent - good */
+		/* wait for this child specifically, not whichever child
+		 * happens to exit first
+		 */
+		if (waitpid(pid, &status, 0) < 0 || status != 0)
+			return -1;
+	}
+	return 0;
+}
+
+/* Return 1 when environment variable 'name' is set and atoi() of its value
+ * is exactly 1; return 0 otherwise.
+ */
+int check_env(char *name)
+{
+	const char *setting = getenv(name);
+
+	return setting != NULL && atoi(setting) == 1;
+}
+
+#ifndef MDASSEMBLE
+/* Send every queued metadata update in st->updates to the monitor of the
+ * container st->container_dev.  Each update is sent and its reply awaited,
+ * then the update and its buffer are freed; an ack is exchanged after the
+ * last one.
+ * Returns 0 once the queue has been sent, -1 if the queue was empty or the
+ * monitor could not be contacted (queued updates are then left in place).
+ * st->update_tail is reset to NULL on the empty and success paths.
+ */
+int flush_metadata_updates(struct supertype *st)
+{
+	int sfd;
+	char *name;
+
+	if (!st->updates) {
+		st->update_tail = NULL;
+		return -1;
+	}
+
+	/* devnum2devname() returns allocated memory: keep the pointer so it
+	 * can be released once the connection attempt is over
+	 */
+	name = devnum2devname(st->container_dev);
+	if (!name)
+		return -1;
+	sfd = connect_monitor(name);
+	free(name);
+	if (sfd < 0)
+		return -1;
+
+	while (st->updates) {
+		struct metadata_update *mu = st->updates;
+		st->updates = mu->next;
+
+		send_message(sfd, mu, 0);
+		wait_reply(sfd, 0);
+		free(mu->buf);
+		free(mu);
+	}
+	ack(sfd, 0);
+	wait_reply(sfd, 0);
+	close(sfd);
+	st->update_tail = NULL;
+	return 0;
+}
+
+/* Queue a metadata update of 'len' bytes held in 'buf' at the tail of the
+ * list addressed by st->update_tail.  The new node stores the 'buf'
+ * pointer itself, not a copy of the data.
+ */
+void append_metadata_update(struct supertype *st, void *buf, int len)
+{
+	struct metadata_update *node = malloc(sizeof(*node));
+
+	node->next = NULL;
+	node->space = NULL;
+	node->len = len;
+	node->buf = buf;
+
+	/* link the node in and move the tail pointer to its 'next' field */
+	*st->update_tail = node;
+	st->update_tail = &node->next;
+}
+#endif /* MDASSEMBLE */
+
#ifdef __TINYC__
/* tinyc doesn't optimize this check in ioctl.h out ... */
unsigned int __invalid_size_argument_for_IOC = 0;