git.ipfire.org Git - thirdparty/mdadm.git/blobdiff - Manage.c
Add a new clustered disk
[thirdparty/mdadm.git] / Manage.c
index 3d713a32a3520eb6e3259482b49836bc956e0576..e3bdfb3d837fa52533a16f10ed17ac42a385fb2e 100644 (file)
--- a/Manage.c
+++ b/Manage.c
@@ -1,7 +1,7 @@
 /*
  * mdadm - manage Linux "md" devices aka RAID arrays.
  *
- * Copyright (C) 2001-2012 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2001-2013 Neil Brown <neilb@suse.de>
  *
  *
  *    This program is free software; you can redistribute it and/or modify
@@ -27,9 +27,9 @@
 #include "md_p.h"
 #include <ctype.h>
 
-#define REGISTER_DEV           _IO (MD_MAJOR, 1)
-#define START_MD               _IO (MD_MAJOR, 2)
-#define STOP_MD                _IO (MD_MAJOR, 3)
+#define REGISTER_DEV           _IO (MD_MAJOR, 1)
+#define START_MD               _IO (MD_MAJOR, 2)
+#define STOP_MD                        _IO (MD_MAJOR, 3)
 
 int Manage_ro(char *devname, int fd, int readonly)
 {
@@ -54,7 +54,7 @@ int Manage_ro(char *devname, int fd, int readonly)
        /* If this is an externally-managed array, we need to modify the
         * metadata_version so that mdmon doesn't undo our change.
         */
-       mdi = sysfs_read(fd, -1, GET_LEVEL|GET_VERSION);
+       mdi = sysfs_read(fd, NULL, GET_LEVEL|GET_VERSION);
        if (mdi &&
            mdi->array.major_version == -1 &&
            is_subarray(mdi->text_version)) {
@@ -127,12 +127,12 @@ out:
 
 #ifndef MDASSEMBLE
 
-static void remove_devices(int devnum, char *path)
+static void remove_devices(char *devnm, char *path)
 {
        /*
         * Remove names at 'path' - possibly with
         * partition suffixes - which link to the 'standard'
-        * name for devnum.  These were probably created
+        * name for devnm.  These were probably created
         * by mdadm when the array was assembled.
         */
        char base[40];
@@ -146,10 +146,7 @@ static void remove_devices(int devnum, char *path)
        if (!path)
                return;
 
-       if (devnum >= 0)
-               sprintf(base, "/dev/md%d", devnum);
-       else
-               sprintf(base, "/dev/md_d%d", -1-devnum);
+       sprintf(base, "/dev/%s", devnm);
        be = base + strlen(base);
 
        path2 = xmalloc(strlen(path)+20);
@@ -173,192 +170,333 @@ static void remove_devices(int devnum, char *path)
        free(path2);
 }
 
-int Manage_runstop(char *devname, int fd, int runstop,
-                  int verbose, int will_retry)
+int Manage_run(char *devname, int fd, struct context *c)
+{
+       /* Run the array.  Array must already be configured
+        *  Requires >= 0.90.0
+        */
+       char nm[32], *nmp;
+
+       if (md_get_version(fd) < 9000) {
+               pr_err("need md driver version 0.90.0 or later\n");
+               return 1;
+       }
+       nmp = fd2devnm(fd);
+       if (!nmp) {
+               pr_err("Cannot find %s in sysfs!!\n", devname);
+               return 1;
+       }
+       strcpy(nm, nmp);
+       return IncrementalScan(c, nm);
+}
+
+int Manage_stop(char *devname, int fd, int verbose, int will_retry)
 {
-       /* Run or stop the array.  Array must already be configured
-        * 'Run' requires >= 0.90.0
-        * 'will_retry' is only relevant for 'stop', and means
-        * that error messages are not wanted.
+       /* Stop the array.  Array must already be configured
+        * 'will_retry' means that error messages are not wanted.
         */
-       mdu_param_t param; /* unused */
        int rv = 0;
+       struct map_ent *map = NULL;
+       struct mdinfo *mdi;
+       char devnm[32];
+       char container[32];
+       int err;
+       int count;
+       char buf[32];
+       unsigned long long rd1, rd2;
 
        if (will_retry && verbose == 0)
                verbose = -1;
 
-       if (runstop == -1 && md_get_version(fd) < 9000) {
+       if (md_get_version(fd) < 9000) {
                if (ioctl(fd, STOP_MD, 0) == 0)
                        return 0;
-               pr_err("stopping device %s "
-                      "failed: %s\n",
+               pr_err("stopping device %s failed: %s\n",
                       devname, strerror(errno));
                return 1;
        }
 
-       if (md_get_version(fd) < 9000) {
-               pr_err("need md driver version 0.90.0 or later\n");
+       /* If this is an mdmon managed array, just write 'inactive'
+        * to the array state and let mdmon clear up.
+        */
+       strcpy(devnm, fd2devnm(fd));
+       /* Get EXCL access first.  If this fails, then attempting
+        * to stop is probably a bad idea.
+        */
+       mdi = sysfs_read(fd, NULL, GET_LEVEL|GET_COMPONENT|GET_VERSION);
+       if (mdi && is_subarray(mdi->text_version)) {
+               char *sl;
+               strncpy(container, mdi->text_version+1, sizeof(container));
+               container[sizeof(container)-1] = 0;
+               sl = strchr(container, '/');
+               if (sl)
+                       *sl = 0;
+       } else
+               container[0] = 0;
+       close(fd);
+       count = 5;
+       while (((fd = ((devnm[0] == '/')
+                      ?open(devname, O_RDONLY|O_EXCL)
+                      :open_dev_flags(devnm, O_RDONLY|O_EXCL))) < 0
+               || strcmp(fd2devnm(fd), devnm) != 0)
+              && container[0]
+              && mdmon_running(container)
+              && count) {
+               if (fd >= 0)
+                       close(fd);
+               flush_mdmon(container);
+               count--;
+       }
+       if (fd < 0 || strcmp(fd2devnm(fd), devnm) != 0) {
+               if (fd >= 0)
+                       close(fd);
+               if (verbose >= 0)
+                       pr_err("Cannot get exclusive access to %s:Perhaps a running process, mounted filesystem or active volume group?\n",
+                              devname);
                return 1;
        }
+       if (mdi &&
+           mdi->array.level > 0 &&
+           is_subarray(mdi->text_version)) {
+               int err;
+               /* This is mdmon managed. */
+               close(fd);
 
-       if (runstop > 0) {
-               if (ioctl(fd, RUN_ARRAY, &param)) {
+               /* As we have an O_EXCL open, any use of the device
+                * which blocks STOP_ARRAY is probably a transient use,
+                * so it is reasonable to retry for a while - 5 seconds.
+                */
+               count = 25;
+               while (count &&
+                      (err = sysfs_set_str(mdi, NULL,
+                                           "array_state",
+                                           "inactive")) < 0
+                      && errno == EBUSY) {
+                       usleep(200000);
+                       count--;
+               }
+               if (err) {
                        if (verbose >= 0)
-                               pr_err("failed to run array %s: %s\n",
+                               pr_err("failed to stop array %s: %s\n",
                                       devname, strerror(errno));
-                       return 1;
+                       rv = 1;
+                       goto out;
                }
-               if (verbose >= 0)
-                       pr_err("started %s\n", devname);
-       } else if (runstop < 0){
-               struct map_ent *map = NULL;
-               struct stat stb;
-               struct mdinfo *mdi;
-               int devnum;
-               int err;
-               int count;
-               /* If this is an mdmon managed array, just write 'inactive'
-                * to the array state and let mdmon clear up.
-                */
-               devnum = fd2devnum(fd);
-               /* Get EXCL access first.  If this fails, then attempting
-                * to stop is probably a bad idea.
-                */
-               close(fd);
-               fd = open(devname, O_RDONLY|O_EXCL);
-               if (fd < 0 || fd2devnum(fd) != devnum) {
-                       if (fd >= 0)
-                               close(fd);
+
+               /* Give monitor a chance to act */
+               ping_monitor(mdi->text_version);
+
+               fd = open_dev_excl(devnm);
+               if (fd < 0) {
                        if (verbose >= 0)
-                               pr_err("Cannot get exclusive access to %s:"
-                                      "Perhaps a running "
-                                      "process, mounted filesystem "
-                                      "or active volume group?\n",
+                               pr_err("failed to completely stop %s: Device is busy\n",
                                       devname);
-                       return 1;
+                       rv = 1;
+                       goto out;
                }
-               mdi = sysfs_read(fd, -1, GET_LEVEL|GET_VERSION);
-               if (mdi &&
-                   mdi->array.level > 0 &&
-                   is_subarray(mdi->text_version)) {
-                       int err;
-                       /* This is mdmon managed. */
-                       close(fd);
-
-                       /* As we have an O_EXCL open, any use of the device
-                        * which blocks STOP_ARRAY is probably a transient use,
-                        * so it is reasonable to retry for a while - 5 seconds.
-                        */
-                       count = 25;
-                       while (count &&
-                              (err = sysfs_set_str(mdi, NULL,
-                                                   "array_state",
-                                                   "inactive")) < 0
-                              && errno == EBUSY) {
-                               usleep(200000);
-                               count--;
-                       }
-                       if (err) {
-                               if (verbose >= 0)
-                                       pr_err("failed to stop array %s: %s\n",
-                                              devname, strerror(errno));
-                               rv = 1;
-                               goto out;
-                       }
-
-                       /* Give monitor a chance to act */
-                       ping_monitor(mdi->text_version);
+       } else if (mdi &&
+                  mdi->array.major_version == -1 &&
+                  mdi->array.minor_version == -2 &&
+                  !is_subarray(mdi->text_version)) {
+               struct mdstat_ent *mds, *m;
+               /* container, possibly mdmon-managed.
+                * Make sure mdmon isn't opening it, which
+                * would interfere with the 'stop'
+                */
+               ping_monitor(mdi->sys_name);
 
-                       fd = open_dev_excl(devnum);
-                       if (fd < 0) {
+               /* now check that there are no existing arrays
+                * which are members of this array
+                */
+               mds = mdstat_read(0, 0);
+               for (m = mds; m; m = m->next)
+                       if (m->metadata_version &&
+                           strncmp(m->metadata_version, "external:", 9)==0 &&
+                           metadata_container_matches(m->metadata_version+9,
+                                                      devnm)) {
                                if (verbose >= 0)
-                                       pr_err("failed to completely stop %s"
-                                              ": Device is busy\n",
-                                              devname);
+                                       pr_err("Cannot stop container %s: member %s still active\n",
+                                              devname, m->dev);
+                               free_mdstat(mds);
                                rv = 1;
                                goto out;
                        }
-               } else if (mdi &&
-                          mdi->array.major_version == -1 &&
-                          mdi->array.minor_version == -2 &&
-                          !is_subarray(mdi->text_version)) {
-                       struct mdstat_ent *mds, *m;
-                       /* container, possibly mdmon-managed.
-                        * Make sure mdmon isn't opening it, which
-                        * would interfere with the 'stop'
-                        */
-                       ping_monitor(mdi->sys_name);
+       }
 
-                       /* now check that there are no existing arrays
-                        * which are members of this array
+       /* If the array is undergoing a reshape which changes the number
+        * of devices, then it would be nice to stop it at a point where
+        * it has completed a full number of stripes in both old and
+        * new layouts as this will allow the reshape to be reverted.
+        * So if 'sync_action' is "reshape" and 'raid_disks' shows two
+        * different numbers, then
+        *  - freeze reshape
+        *  - set sync_max to next multiple of both data_disks and
+        *    chunk sizes (or next but one)
+        *  - unfreeze reshape
+        *  - wait on 'sync_completed' for that point to be reached.
+        */
+       if (mdi && (mdi->array.level >= 4 && mdi->array.level <= 6) &&
+           sysfs_attribute_available(mdi, NULL, "sync_action") &&
+           sysfs_attribute_available(mdi, NULL, "reshape_direction") &&
+           sysfs_get_str(mdi, NULL, "sync_action", buf, 20) > 0 &&
+           strcmp(buf, "reshape\n") == 0 &&
+           sysfs_get_two(mdi, NULL, "raid_disks", &rd1, &rd2) == 2 &&
+           sysfs_set_str(mdi, NULL, "sync_action", "frozen") == 0) {
+               /* Array is frozen */
+               unsigned long long position, curr;
+               unsigned long long chunk1, chunk2;
+               unsigned long long rddiv, chunkdiv;
+               unsigned long long sectors;
+               unsigned long long sync_max, old_sync_max;
+               unsigned long long completed;
+               int backwards = 0;
+               int delay;
+               int scfd;
+
+               rd1 -= mdi->array.level == 6 ? 2 : 1;
+               rd2 -= mdi->array.level == 6 ? 2 : 1;
+               sysfs_get_str(mdi, NULL, "reshape_direction", buf, sizeof(buf));
+               if (strncmp(buf, "back", 4) == 0)
+                       backwards = 1;
+               sysfs_get_ll(mdi, NULL, "reshape_position", &position);
+               sysfs_get_two(mdi, NULL, "chunk_size", &chunk1, &chunk2);
+               chunk1 /= 512;
+               chunk2 /= 512;
+               rddiv = GCD(rd1, rd2);
+               chunkdiv = GCD(chunk1, chunk2);
+               sectors = (chunk1/chunkdiv) * chunk2 * (rd1/rddiv) * rd2;
+
+               if (backwards) {
+                       /* Need to subtract 'reshape_position' from
+                        * array size to get equivalent of sync_max.
+                        * Size calculation based on raid5_size in kernel.
                         */
-                       mds = mdstat_read(0, 0);
-                       for (m = mds; m; m = m->next)
-                               if (m->metadata_version &&
-                                   strncmp(m->metadata_version, "external:", 9)==0 &&
-                                   is_subarray(m->metadata_version+9) &&
-                                   devname2devnum(m->metadata_version+10) == devnum) {
-                                       if (verbose >= 0)
-                                               pr_err("Cannot stop container %s: "
-                                                      "member %s still active\n",
-                                                      devname, m->dev);
-                                       free_mdstat(mds);
-                                       rv = 1;
-                                       goto out;
-                               }
+                       unsigned long long size = mdi->component_size;
+                       size &= ~(chunk1-1);
+                       size &= ~(chunk2-1);
+                       /* rd1 must be smaller */
+                       position = (position / sectors - 1) * sectors;
+                       sync_max = size - position/rd1;
+               } else {
+                       position = (position / sectors + 2) * sectors;
+                       sync_max = position/rd1;
                }
-
-               /* As we have an O_EXCL open, any use of the device
-                * which blocks STOP_ARRAY is probably a transient use,
-                * so it is reasonable to retry for a while - 5 seconds.
+               if (sysfs_get_ll(mdi, NULL, "sync_max", &old_sync_max) < 0)
+                       old_sync_max = mdi->component_size;
+               /* Must not advance sync_max as that could confuse
+                * the reshape monitor */
+               if (sync_max < old_sync_max)
+                       sysfs_set_num(mdi, NULL, "sync_max", sync_max);
+               sysfs_set_str(mdi, NULL, "sync_action", "idle");
+
+               /* That should have set things going again.  Now we
+                * wait a little while (3 second max) for sync_completed
+                * to reach the target.
+                * The reshape process can block for 500msec if
+                * the sync speed limit is hit, so we need to wait
+                * a lot longer than that. 1 second is usually
+                * enough.  3 is safe.
                 */
-               count = 25; err = 0;
-               while (count && fd >= 0
-                      && (err = ioctl(fd, STOP_ARRAY, NULL)) < 0
-                      && errno == EBUSY) {
-                       usleep(200000);
-                       count --;
-               }
-               if (fd >= 0 && err) {
-                       if (verbose >= 0) {
-                               pr_err("failed to stop array %s: %s\n",
-                                      devname, strerror(errno));
-                               if (errno == EBUSY)
-                                       fprintf(stderr, "Perhaps a running "
-                                               "process, mounted filesystem "
-                                               "or active volume group?\n");
+               delay = 3000;
+               scfd = sysfs_open(mdi->sys_name, NULL, "sync_completed");
+               while (scfd >= 0 && delay > 0 && old_sync_max > 0) {
+                       sysfs_get_ll(mdi, NULL, "reshape_position", &curr);
+                       sysfs_fd_get_str(scfd, buf, sizeof(buf));
+                       if (strncmp(buf, "none", 4) == 0) {
+                               /* Either reshape has aborted, or hasn't
+                                * quite started yet.  Wait a bit and
+                                * check  'sync_action' to see.
+                                */
+                               usleep(10000);
+                               sysfs_get_str(mdi, NULL, "sync_action", buf, sizeof(buf));
+                               if (strncmp(buf, "reshape", 7) != 0)
+                                       break;
                        }
-                       rv = 1;
-                       goto out;
+
+                       if (sysfs_fd_get_ll(scfd, &completed) == 0 &&
+                           (completed > sync_max ||
+                            (completed == sync_max && curr != position))) {
+                               while (completed > sync_max) {
+                                       sync_max += sectors / rd1;
+                                       if (backwards)
+                                               position -= sectors;
+                                       else
+                                               position += sectors;
+                               }
+                               if (sync_max < old_sync_max)
+                                       sysfs_set_num(mdi, NULL, "sync_max", sync_max);
+                       }
+
+                       if (!backwards && curr >= position)
+                               break;
+                       if (backwards && curr <= position)
+                               break;
+                       sysfs_wait(scfd, &delay);
                }
-               /* prior to 2.6.28, KOBJ_CHANGE was not sent when an md array
-                * was stopped, so We'll do it here just to be sure.  Drop any
-                * partitions as well...
-                */
-               if (fd >= 0)
-                       ioctl(fd, BLKRRPART, 0);
-               if (mdi)
-                       sysfs_uevent(mdi, "change");
-
-               if (devnum != NoMdDev &&
-                   (stat("/dev/.udev", &stb) != 0 ||
-                    check_env("MDADM_NO_UDEV"))) {
-                       struct map_ent *mp = map_by_devnum(&map, devnum);
-                       remove_devices(devnum, mp ? mp->path : NULL);
+               if (scfd >= 0)
+                       close(scfd);
+
+       }
+
+       /* As we have an O_EXCL open, any use of the device
+        * which blocks STOP_ARRAY is probably a transient use,
+        * so it is reasonable to retry for a while - 5 seconds.
+        */
+       count = 25; err = 0;
+       while (count && fd >= 0
+              && (err = ioctl(fd, STOP_ARRAY, NULL)) < 0
+              && errno == EBUSY) {
+               usleep(200000);
+               count --;
+       }
+       if (fd >= 0 && err) {
+               if (verbose >= 0) {
+                       pr_err("failed to stop array %s: %s\n",
+                              devname, strerror(errno));
+                       if (errno == EBUSY)
+                               cont_err("Perhaps a running process, mounted filesystem or active volume group?\n");
                }
+               rv = 1;
+               goto out;
+       }
+       /* prior to 2.6.28, KOBJ_CHANGE was not sent when an md array
+        * was stopped, so We'll do it here just to be sure.  Drop any
+        * partitions as well...
+        */
+       if (fd >= 0)
+               ioctl(fd, BLKRRPART, 0);
+       if (mdi)
+               sysfs_uevent(mdi, "change");
 
-               if (verbose >= 0)
-                       pr_err("stopped %s\n", devname);
-               map_lock(&map);
-               map_remove(&map, devnum);
-               map_unlock(&map);
-       out:
-               if (mdi)
-                       sysfs_free(mdi);
+       if (devnm[0] && use_udev()) {
+               struct map_ent *mp = map_by_devnm(&map, devnm);
+               remove_devices(devnm, mp ? mp->path : NULL);
        }
+
+       if (verbose >= 0)
+               pr_err("stopped %s\n", devname);
+       map_lock(&map);
+       map_remove(&map, devnm);
+       map_unlock(&map);
+out:
+       if (mdi)
+               sysfs_free(mdi);
+
        return rv;
 }
 
+static struct mddev_dev *add_one(struct mddev_dev *dv, char *name, char disp)
+{
+       struct mddev_dev *new;
+       new = xmalloc(sizeof(*new));
+       memset(new, 0, sizeof(*new));
+       new->devname = xstrdup(name);
+       new->disposition = disp;
+       new->next = dv->next;
+       dv->next = new;
+       return new;
+}
+
 static void add_faulty(struct mddev_dev *dv, int fd, char disp)
 {
        mdu_array_info_t array;
@@ -371,7 +509,6 @@ static void add_faulty(struct mddev_dev *dv, int fd, char disp)
 
        remaining_disks = array.nr_disks;
        for (i = 0; i < MAX_DISKS && remaining_disks > 0; i++) {
-               struct mddev_dev *new;
                char buf[40];
                disk.number = i;
                if (ioctl(fd, GET_DISK_INFO, &disk) != 0)
@@ -382,12 +519,7 @@ static void add_faulty(struct mddev_dev *dv, int fd, char disp)
                if ((disk.state & 1) == 0) /* not faulty */
                        continue;
                sprintf(buf, "%d:%d", disk.major, disk.minor);
-               new = xmalloc(sizeof(*new));
-               new->devname = xstrdup(buf);
-               new->disposition = disp;
-               new->next = dv->next;
-               dv->next = new;
-               dv = new;
+               dv = add_one(dv, buf, disp);
        }
 }
 
@@ -403,7 +535,6 @@ static void add_detached(struct mddev_dev *dv, int fd, char disp)
 
        remaining_disks = array.nr_disks;
        for (i = 0; i < MAX_DISKS && remaining_disks > 0; i++) {
-               struct mddev_dev *new;
                char buf[40];
                int sfd;
                disk.number = i;
@@ -424,12 +555,41 @@ static void add_detached(struct mddev_dev *dv, int fd, char disp)
                if (errno != ENXIO)
                        /* Probably not detached */
                        continue;
-               new = xmalloc(sizeof(*new));
-               new->devname = xstrdup(buf);
-               new->disposition = disp;
-               new->next = dv->next;
-               dv->next = new;
-               dv = new;
+               dv = add_one(dv, buf, disp);
+       }
+}
+
+static void add_set(struct mddev_dev *dv, int fd, char set_char)
+{
+       mdu_array_info_t array;
+       mdu_disk_info_t disk;
+       int remaining_disks;
+       int copies, set;
+       int i;
+
+       if (ioctl(fd, GET_ARRAY_INFO, &array) != 0)
+               return;
+       if (array.level != 10)
+               return;
+       copies = ((array.layout & 0xff) *
+                 ((array.layout >> 8) & 0xff));
+       if (array.raid_disks % copies)
+               return;
+
+       remaining_disks = array.nr_disks;
+       for (i = 0; i < MAX_DISKS && remaining_disks > 0; i++) {
+               char buf[40];
+               disk.number = i;
+               if (ioctl(fd, GET_DISK_INFO, &disk) != 0)
+                       continue;
+               if (disk.major == 0 && disk.minor == 0)
+                       continue;
+               remaining_disks--;
+               set = disk.raid_disk % copies;
+               if (set_char != set + 'A')
+                       continue;
+               sprintf(buf, "%d:%d", disk.major, disk.minor);
+               dv = add_one(dv, buf, dv->disposition);
        }
 }
 
@@ -484,8 +644,7 @@ int attempt_re_add(int fd, int tfd, struct mddev_dev *dv,
                        int rv = -1;
                        tfd = dev_open(dv->devname, O_RDWR);
                        if (tfd < 0) {
-                               pr_err("failed to open %s for"
-                                      " superblock update during re-add\n", dv->devname);
+                               pr_err("failed to open %s for superblock update during re-add\n", dv->devname);
                                return -1;
                        }
 
@@ -505,8 +664,7 @@ int attempt_re_add(int fd, int tfd, struct mddev_dev *dv,
                                rv = dev_st->ss->store_super(dev_st, tfd);
                        close(tfd);
                        if (rv != 0) {
-                               pr_err("failed to update"
-                                      " superblock during re-add\n");
+                               pr_err("failed to update superblock during re-add\n");
                                return -1;
                        }
                }
@@ -532,7 +690,8 @@ skip_re_add:
 int Manage_add(int fd, int tfd, struct mddev_dev *dv,
               struct supertype *tst, mdu_array_info_t *array,
               int force, int verbose, char *devname,
-              char *update, unsigned long rdev, unsigned long long array_size)
+              char *update, unsigned long rdev, unsigned long long array_size,
+              int raid_slot)
 {
        unsigned long long ldsize;
        struct supertype *dev_st = NULL;
@@ -546,22 +705,16 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv,
                        return -1;
        }
 
-       if (tst->ss->validate_geometry(
-                   tst, array->level, array->layout,
-                   array->raid_disks, NULL,
-                   ldsize >> 9, INVALID_SECTORS, NULL, NULL, 0) == 0) {
+       if (tst->ss == &super0 && ldsize > 4ULL*1024*1024*1024*1024) {
+               /* More than 4TB is wasted on v0.90 */
                if (!force) {
-                       pr_err("%s is larger than %s can "
-                              "effectively use.\n"
-                              "       Add --force is you "
-                              "really want to add this device.\n",
+                       pr_err("%s is larger than %s can effectively use.\n"
+                              "       Add --force is you really want to add this device.\n",
                               dv->devname, devname);
                        return -1;
                }
-               pr_err("%s is larger than %s can "
-                      "effectively use.\n"
-                      "       Adding anyway as --force "
-                      "was given.\n",
+               pr_err("%s is larger than %s can effectively use.\n"
+                      "       Adding anyway as --force was given.\n",
                       dv->devname, devname);
        }
        if (!tst->ss->external &&
@@ -617,7 +770,8 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv,
                                break;
                        }
                /* FIXME this is a bad test to be using */
-               if (!tst->sb && dv->disposition != 'a') {
+               if (!tst->sb && (dv->disposition != 'a'
+                                && dv->disposition != 'S')) {
                        /* we are re-adding a device to a
                         * completely dead array - have to depend
                         * on kernel to check
@@ -628,7 +782,8 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv,
                }
 
                /* Make sure device is large enough */
-               if (tst->ss->avail_size(tst, ldsize/512, INVALID_SECTORS) <
+               if (tst->sb &&
+                   tst->ss->avail_size(tst, ldsize/512, INVALID_SECTORS) <
                    array_size) {
                        if (dv->disposition == 'M')
                                return 0;
@@ -647,7 +802,7 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv,
                        dev_st = dup_super(tst);
                        dev_st->ss->load_super(dev_st, tfd, NULL);
                }
-               if (dev_st && dev_st->sb) {
+               if (dev_st && dev_st->sb && dv->disposition != 'S') {
                        int rv = attempt_re_add(fd, tfd, dv,
                                                dev_st, tst,
                                                rdev,
@@ -674,19 +829,20 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv,
                        int d;
                        int found = 0;
 
-                       for (d = 0; d < MAX_DISKS && found < array->active_disks; d++) {
+                       for (d = 0; d < MAX_DISKS && found < array->nr_disks; d++) {
                                disc.number = d;
                                if (ioctl(fd, GET_DISK_INFO, &disc))
                                        continue;
                                if (disc.major == 0 && disc.minor == 0)
                                        continue;
+                               found++;
                                if (!(disc.state & (1<<MD_DISK_SYNC)))
                                        continue;
                                avail[disc.raid_disk] = 1;
-                               found++;
                        }
                        array_failed = !enough(array->level, array->raid_disks,
                                               array->layout, 1, avail);
+                       free(avail);
                } else
                        array_failed = 0;
                if (array_failed) {
@@ -725,7 +881,10 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv,
        }
        disc.major = major(rdev);
        disc.minor = minor(rdev);
-       disc.number =j;
+       if (raid_slot < 0)
+               disc.number = j;
+       else
+               disc.number = raid_slot;
        disc.state = 0;
        if (array->not_persistent==0) {
                int dfd;
@@ -766,6 +925,14 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv,
                        }
                free(used);
        }
+
+       if (array->state & (1 << MD_SB_CLUSTERED)) {
+               if (dv->disposition == 'c')
+                       disc.state |= (1 << MD_DISK_CANDIDATE);
+               else
+                       disc.state |= (1 << MD_DISK_CLUSTER_ADD);
+       }
+
        if (dv->writemostly == 1)
                disc.state |= (1 << MD_DISK_WRITEMOSTLY);
        if (tst->ss->external) {
@@ -774,13 +941,14 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv,
                struct mdinfo new_mdi;
                struct mdinfo *sra;
                int container_fd;
-               int devnum = fd2devnum(fd);
+               char devnm[32];
                int dfd;
 
-               container_fd = open_dev_excl(devnum);
+               strcpy(devnm, fd2devnm(fd));
+
+               container_fd = open_dev_excl(devnm);
                if (container_fd < 0) {
-                       pr_err("add failed for %s:"
-                              " could not get exclusive access to container\n",
+                       pr_err("add failed for %s: could not get exclusive access to container\n",
                               dv->devname);
                        tst->ss->free_super(tst);
                        return -1;
@@ -788,7 +956,7 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv,
 
                Kill(dv->devname, NULL, 0, -1, 0);
                dfd = dev_open(dv->devname, O_RDWR | O_EXCL|O_DIRECT);
-               if (mdmon_running(tst->container_dev))
+               if (mdmon_running(tst->container_devnm))
                        tst->update_tail = &tst->updates;
                if (tst->ss->add_to_super(tst, &disc, dfd,
                                          dv->devname, INVALID_SECTORS)) {
@@ -801,7 +969,7 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv,
                else
                        tst->ss->sync_metadata(tst);
 
-               sra = sysfs_read(container_fd, -1, 0);
+               sra = sysfs_read(container_fd, NULL, 0);
                if (!sra) {
                        pr_err("add failed for %s: sysfs_read failed\n",
                               dv->devname);
@@ -819,13 +987,12 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv,
                 * would block add_disk */
                tst->ss->free_super(tst);
                if (sysfs_add_disk(sra, &new_mdi, 0) != 0) {
-                       pr_err("add new device to external metadata"
-                              " failed for %s\n", dv->devname);
+                       pr_err("add new device to external metadata failed for %s\n", dv->devname);
                        close(container_fd);
                        sysfs_free(sra);
                        return -1;
                }
-               ping_monitor_by_id(devnum);
+               ping_monitor(devnm);
                sysfs_free(sra);
                close(container_fd);
        } else {
@@ -858,11 +1025,11 @@ int Manage_remove(struct supertype *tst, int fd, struct mddev_dev *dv,
                 * get an O_EXCL open on the container
                 */
                int ret;
-               int dnum = fd2devnum(fd);
-               lfd = open_dev_excl(dnum);
+               char devnm[32];
+               strcpy(devnm, fd2devnm(fd));
+               lfd = open_dev_excl(devnm);
                if (lfd < 0) {
-                       pr_err("Cannot get exclusive access "
-                              " to container - odd\n");
+                       pr_err("Cannot get exclusive access  to container - odd\n");
                        return -1;
                }
                /* We may not be able to check on holders in
@@ -875,7 +1042,7 @@ int Manage_remove(struct supertype *tst, int fd, struct mddev_dev *dv,
                if (rdev == 0)
                        ret = -1;
                else
-                       ret = sysfs_unique_holder(dnum, rdev);
+                       ret = sysfs_unique_holder(devnm, rdev);
                if (ret == 0) {
                        pr_err("%s is not a member, cannot remove.\n",
                               dv->devname);
@@ -904,7 +1071,7 @@ int Manage_remove(struct supertype *tst, int fd, struct mddev_dev *dv,
                if (err && errno == ENODEV) {
                        /* Old kernels rejected this if no personality
                         * is registered */
-                       struct mdinfo *sra = sysfs_read(fd, 0, GET_DEVS);
+                       struct mdinfo *sra = sysfs_read(fd, NULL, GET_DEVS);
                        struct mdinfo *dv = NULL;
                        if (sra)
                                dv = sra->devs;
@@ -922,8 +1089,7 @@ int Manage_remove(struct supertype *tst, int fd, struct mddev_dev *dv,
                }
        }
        if (err) {
-               pr_err("hot remove failed "
-                      "for %s: %s\n",  dv->devname,
+               pr_err("hot remove failed for %s: %s\n",        dv->devname,
                       strerror(errno));
                if (lfd >= 0)
                        close(lfd);
@@ -936,15 +1102,14 @@ int Manage_remove(struct supertype *tst, int fd, struct mddev_dev *dv,
                 * 'add' event before reconciling this 'remove'
                 * event.
                 */
-               char *name = devnum2devname(fd2devnum(fd));
+               char *devnm = fd2devnm(fd);
 
-               if (!name) {
+               if (!devnm) {
                        pr_err("unable to get container name\n");
                        return -1;
                }
 
-               ping_manager(name);
-               free(name);
+               ping_manager(devnm);
        }
        if (lfd >= 0)
                close(lfd);
@@ -965,7 +1130,7 @@ int Manage_replace(struct supertype *tst, int fd, struct mddev_dev *dv,
        /* Need to find the device in sysfs and add 'want_replacement' to the
         * status.
         */
-       mdi = sysfs_read(fd, -1, GET_DEVS);
+       mdi = sysfs_read(fd, NULL, GET_DEVS);
        if (!mdi || !mdi->devs) {
                pr_err("Cannot find status of %s to enable replacement - strange\n",
                       devname);
@@ -1016,7 +1181,7 @@ int Manage_with(struct supertype *tst, int fd, struct mddev_dev *dv,
 {
        struct mdinfo *mdi, *di;
        /* try to set 'slot' for 'rdev' in 'fd' to 'dv->used' */
-       mdi = sysfs_read(fd, -1, GET_DEVS|GET_STATE);
+       mdi = sysfs_read(fd, NULL, GET_DEVS|GET_STATE);
        if (!mdi || !mdi->devs) {
                pr_err("Cannot find status of %s to enable replacement - strange\n",
                       devname);
@@ -1044,7 +1209,7 @@ int Manage_with(struct supertype *tst, int fd, struct mddev_dev *dv,
                                   "slot", dv->used);
                if (rv) {
                        sysfs_free(mdi);
-                       pr_err("Failed to %s as preferred replacement.\n",
+                       pr_err("Failed to set %s as preferred replacement.\n",
                               dv->devname);
                        return -1;
                }
@@ -1068,6 +1233,7 @@ int Manage_subdevs(char *devname, int fd,
         *  'a' - add the device
         *         try HOT_ADD_DISK
         *         If that fails EINVAL, try ADD_NEW_DISK
+        *  'S' - add the device as a spare - don't try re-add
         *  'A' - re-add the device
         *  'r' - remove the device: HOT_REMOVE_DISK
         *        device can be 'faulty' or 'detached' in which case all
@@ -1084,6 +1250,9 @@ int Manage_subdevs(char *devname, int fd,
         *        it must be unpaired, and is an error.
         *  'M' - this is created by a 'missing' target.  It is a slight
         *        variant on 'A'
+        *  'F' - Another variant of 'A', where the device was faulty
+        *        so must be removed from the array first.
+        *  'c' - confirm the device as found (for clustered environments)
         *
         * For 'f' and 'r', the device can also be a kernel-internal
         * name such as 'sdb'.
@@ -1091,7 +1260,6 @@ int Manage_subdevs(char *devname, int fd,
        mdu_array_info_t array;
        unsigned long long array_size;
        struct mddev_dev *dv;
-       struct stat stb;
        int tfd = -1;
        struct supertype *tst;
        char *subarray = NULL;
@@ -1099,13 +1267,15 @@ int Manage_subdevs(char *devname, int fd,
        int count = 0; /* number of actions taken */
        struct mdinfo info;
        int frozen = 0;
+       int busy = 0;
+       int raid_slot = -1;
 
        if (ioctl(fd, GET_ARRAY_INFO, &array)) {
                pr_err("Cannot get array info for %s\n",
                        devname);
                goto abort;
        }
-       sysfs_init(&info, fd, 0);
+       sysfs_init(&info, fd, NULL);
 
        /* array.size is only 32 bits and may be truncated.
         * So read from sysfs if possible, and record number of sectors
@@ -1122,25 +1292,37 @@ int Manage_subdevs(char *devname, int fd,
                goto abort;
        }
 
-       stb.st_rdev = 0;
        for (dv = devlist; dv; dv = dv->next) {
+               unsigned long rdev = 0; /* device to add/remove etc */
                int rv;
+               int mj,mn;
+
+               raid_slot = -1;
+               if (dv->disposition == 'c') {
+                       rv = parse_cluster_confirm_arg(dv->devname,
+                                                      &dv->devname,
+                                                      &raid_slot);
+                       if (!rv) {
+                               pr_err("Could not get the devname of cluster\n");
+                               goto abort;
+                       }
+               }
 
                if (strcmp(dv->devname, "failed") == 0 ||
                    strcmp(dv->devname, "faulty") == 0) {
-                       if (dv->disposition != 'r') {
-                               pr_err("%s only meaningful "
-                                       "with -r, not -%c\n",
+                       if (dv->disposition != 'A'
+                           && dv->disposition != 'r') {
+                               pr_err("%s only meaningful with -r or --re-add, not -%c\n",
                                        dv->devname, dv->disposition);
                                goto abort;
                        }
-                       add_faulty(dv, fd, 'r');
+                       add_faulty(dv, fd, (dv->disposition == 'A'
+                                           ? 'F' : 'r'));
                        continue;
                }
                if (strcmp(dv->devname, "detached") == 0) {
                        if (dv->disposition != 'r' && dv->disposition != 'f') {
-                               pr_err("%s only meaningful "
-                                       "with -r of -f, not -%c\n",
+                               pr_err("%s only meaningful with -r of -f, not -%c\n",
                                        dv->devname, dv->disposition);
                                goto abort;
                        }
@@ -1151,9 +1333,13 @@ int Manage_subdevs(char *devname, int fd,
                if (strcmp(dv->devname, "missing") == 0) {
                        struct mddev_dev *add_devlist = NULL;
                        struct mddev_dev **dp;
+                       if (dv->disposition == 'c') {
+                               rv = ioctl(fd, CLUSTERED_DISK_NACK, NULL);
+                               break;
+                       }
+
                        if (dv->disposition != 'A') {
-                               pr_err("'missing' only meaningful "
-                                      "with --re-add\n");
+                               pr_err("'missing' only meaningful with --re-add\n");
                                goto abort;
                        }
                        add_devlist = conf_get_devs();
@@ -1169,6 +1355,35 @@ int Manage_subdevs(char *devname, int fd,
                        continue;
                }
 
+               if (strncmp(dv->devname, "set-", 4) == 0 &&
+                   strlen(dv->devname) == 5) {
+                       int copies;
+
+                       if (dv->disposition != 'r' &&
+                           dv->disposition != 'f') {
+                               pr_err("'%s' only meaningful with -r or -f\n",
+                                      dv->devname);
+                               goto abort;
+                       }
+                       if (array.level != 10) {
+                               pr_err("'%s' only meaningful with RAID10 arrays\n",
+                                      dv->devname);
+                               goto abort;
+                       }
+                       copies = ((array.layout & 0xff) *
+                                 ((array.layout >> 8) & 0xff));
+                       if (array.raid_disks % copies != 0 ||
+                           dv->devname[4] < 'A' ||
+                           dv->devname[4] >= 'A' + copies ||
+                           copies > 26) {
+                               pr_err("'%s' not meaningful with this array\n",
+                                      dv->devname);
+                               goto abort;
+                       }
+                       add_set(dv, fd, dv->devname[4]);
+                       continue;
+               }
+
                if (strchr(dv->devname, '/') == NULL &&
                    strchr(dv->devname, ':') == NULL &&
                    strlen(dv->devname) < 50) {
@@ -1176,35 +1391,39 @@ int Manage_subdevs(char *devname, int fd,
                        int found = 0;
                        char dname[55];
                        if (dv->disposition != 'r' && dv->disposition != 'f') {
-                               pr_err("%s only meaningful "
-                                       "with -r or -f, not -%c\n",
+                               pr_err("%s only meaningful with -r or -f, not -%c\n",
                                        dv->devname, dv->disposition);
                                goto abort;
                        }
 
                        sprintf(dname, "dev-%s", dv->devname);
-                       sysfd = sysfs_open(fd2devnum(fd), dname, "block/dev");
+                       sysfd = sysfs_open(fd2devnm(fd), dname, "block/dev");
                        if (sysfd >= 0) {
                                char dn[20];
-                               int mj,mn;
                                if (sysfs_fd_get_str(sysfd, dn, 20) > 0 &&
                                    sscanf(dn, "%d:%d", &mj,&mn) == 2) {
-                                       stb.st_rdev = makedev(mj,mn);
+                                       rdev = makedev(mj,mn);
                                        found = 1;
                                }
                                close(sysfd);
                                sysfd = -1;
                        }
                        if (!found) {
-                               sysfd = sysfs_open(fd2devnum(fd), dname, "state");
+                               sysfd = sysfs_open(fd2devnm(fd), dname, "state");
                                if (sysfd < 0) {
-                                       pr_err("%s does not appear "
-                                               "to be a component of %s\n",
+                                       pr_err("%s does not appear to be a component of %s\n",
                                                dv->devname, devname);
                                        goto abort;
                                }
                        }
+               } else if ((dv->disposition == 'r' || dv->disposition == 'f')
+                          && get_maj_min(dv->devname, &mj, &mn)) {
+                       /* for 'fail' and 'remove', the device might
+                        * not exist.
+                        */
+                       rdev = makedev(mj, mn);
                } else {
+                       struct stat stb;
                        tfd = dev_open(dv->devname, O_RDONLY);
                        if (tfd >= 0)
                                fstat(tfd, &stb);
@@ -1237,6 +1456,7 @@ int Manage_subdevs(char *devname, int fd,
                                        goto abort;
                                }
                        }
+                       rdev = stb.st_rdev;
                }
                switch(dv->disposition){
                default:
@@ -1244,15 +1464,19 @@ int Manage_subdevs(char *devname, int fd,
                                dv->devname, dv->disposition);
                        goto abort;
                case 'a':
+               case 'S': /* --add-spare */
                case 'A':
-               case 'M':
+               case 'M': /* --re-add missing */
+               case 'F': /* --re-add faulty  */
+               case 'c': /* --cluster-confirm */
                        /* add the device */
                        if (subarray) {
-                               pr_err("Cannot add disks to a"
-                                       " \'member\' array, perform this"
-                                       " operation on the parent container\n");
+                               pr_err("Cannot add disks to a \'member\' array, perform this operation on the parent container\n");
                                goto abort;
                        }
+                       if (dv->disposition == 'F')
+                               /* Need to remove first */
+                               ioctl(fd, HOT_REMOVE_DISK, rdev);
                        /* Make sure it isn't in use (in 2.6 or later) */
                        tfd = dev_open(dv->devname, O_RDONLY|O_EXCL);
                        if (tfd >= 0) {
@@ -1262,7 +1486,7 @@ int Manage_subdevs(char *devname, int fd,
                                 */
                                close(tfd);
                                tfd = dev_open(dv->devname, O_RDONLY);
-                       }                               
+                       }
                        if (tfd < 0) {
                                if (dv->disposition == 'M')
                                        continue;
@@ -1278,7 +1502,7 @@ int Manage_subdevs(char *devname, int fd,
                        }
                        rv = Manage_add(fd, tfd, dv, tst, &array,
                                        force, verbose, devname, update,
-                                       stb.st_rdev, array_size);
+                                       rdev, array_size, raid_slot);
                        close(tfd);
                        tfd = -1;
                        if (rv < 0)
@@ -1290,13 +1514,11 @@ int Manage_subdevs(char *devname, int fd,
                case 'r':
                        /* hot remove */
                        if (subarray) {
-                               pr_err("Cannot remove disks from a"
-                                       " \'member\' array, perform this"
-                                       " operation on the parent container\n");
+                               pr_err("Cannot remove disks from a \'member\' array, perform this operation on the parent container\n");
                                rv = -1;
                        } else
                                rv = Manage_remove(tst, fd, dv, sysfd,
-                                                  stb.st_rdev, verbose,
+                                                  rdev, verbose,
                                                   devname);
                        if (sysfd >= 0)
                                close(sysfd);
@@ -1311,7 +1533,9 @@ int Manage_subdevs(char *devname, int fd,
                        /* FIXME check current member */
                        if ((sysfd >= 0 && write(sysfd, "faulty", 6) != 6) ||
                            (sysfd < 0 && ioctl(fd, SET_DISK_FAULTY,
-                                               (unsigned long) stb.st_rdev))) {
+                                               rdev))) {
+                               if (errno == EBUSY)
+                                       busy = 1;
                                pr_err("set device faulty failed for %s:  %s\n",
                                        dv->devname, strerror(errno));
                                if (sysfd >= 0)
@@ -1328,9 +1552,7 @@ int Manage_subdevs(char *devname, int fd,
                        break;
                case 'R': /* Mark as replaceable */
                        if (subarray) {
-                               pr_err("Cannot replace disks in a"
-                                       " \'member\' array, perform this"
-                                       " operation on the parent container\n");
+                               pr_err("Cannot replace disks in a \'member\' array, perform this operation on the parent container\n");
                                rv = -1;
                        } else {
                                if (!frozen) {
@@ -1340,7 +1562,7 @@ int Manage_subdevs(char *devname, int fd,
                                                frozen = -1;
                                }
                                rv = Manage_replace(tst, fd, dv,
-                                                   stb.st_rdev, verbose,
+                                                   rdev, verbose,
                                                    devname);
                        }
                        if (rv < 0)
@@ -1354,7 +1576,7 @@ int Manage_subdevs(char *devname, int fd,
                        goto abort;
                case 'w': /* --with device which was matched */
                        rv = Manage_with(tst, fd, dv,
-                                        stb.st_rdev, verbose, devname);
+                                        rdev, verbose, devname);
                        if (rv < 0)
                                goto abort;
                        break;
@@ -1369,7 +1591,7 @@ int Manage_subdevs(char *devname, int fd,
 abort:
        if (frozen > 0)
                sysfs_set_str(&info, NULL, "sync_action","idle");
-       return 1;
+       return !test && busy ? 2 : 1;
 }
 
 int autodetect(void)
@@ -1403,7 +1625,7 @@ int Update_subarray(char *dev, char *subarray, char *update, struct mddev_ident
                goto free_super;
        }
 
-       if (mdmon_running(st->devnum))
+       if (mdmon_running(st->devnm))
                st->update_tail = &st->updates;
 
        rv = st->ss->update_subarray(st, subarray, update, ident);