/*
* mdadm - manage Linux "md" devices aka RAID arrays.
*
- * Copyright (C) 2001-2012 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2001-2013 Neil Brown <neilb@suse.de>
*
*
* This program is free software; you can redistribute it and/or modify
#include "md_p.h"
#include <ctype.h>
-#define REGISTER_DEV _IO (MD_MAJOR, 1)
-#define START_MD _IO (MD_MAJOR, 2)
-#define STOP_MD _IO (MD_MAJOR, 3)
+#define REGISTER_DEV _IO (MD_MAJOR, 1)
+#define START_MD _IO (MD_MAJOR, 2)
+#define STOP_MD _IO (MD_MAJOR, 3)
int Manage_ro(char *devname, int fd, int readonly)
{
/* If this is an externally-managed array, we need to modify the
* metadata_version so that mdmon doesn't undo our change.
*/
- mdi = sysfs_read(fd, -1, GET_LEVEL|GET_VERSION);
+ mdi = sysfs_read(fd, NULL, GET_LEVEL|GET_VERSION);
if (mdi &&
mdi->array.major_version == -1 &&
is_subarray(mdi->text_version)) {
}
out:
#ifndef MDASSEMBLE
- if (mdi)
- sysfs_free(mdi);
+ sysfs_free(mdi);
#endif
return rv;
}
#ifndef MDASSEMBLE
-static void remove_devices(int devnum, char *path)
+static void remove_devices(char *devnm, char *path)
{
/*
* Remove names at 'path' - possibly with
* partition suffixes - which link to the 'standard'
- * name for devnum. These were probably created
+ * name for devnm. These were probably created
* by mdadm when the array was assembled.
*/
char base[40];
if (!path)
return;
- if (devnum >= 0)
- sprintf(base, "/dev/md%d", devnum);
- else
- sprintf(base, "/dev/md_d%d", -1-devnum);
+ sprintf(base, "/dev/%s", devnm);
be = base + strlen(base);
path2 = xmalloc(strlen(path)+20);
free(path2);
}
-int Manage_runstop(char *devname, int fd, int runstop,
- int verbose, int will_retry)
+int Manage_run(char *devname, int fd, struct context *c)
+{
+ /* Run the array. Array must already be configured
+ * Requires >= 0.90.0
+ */
+ char nm[32], *nmp;
+
+ if (md_get_version(fd) < 9000) {
+ pr_err("need md driver version 0.90.0 or later\n");
+ return 1;
+ }
+ nmp = fd2devnm(fd);
+ if (!nmp) {
+ pr_err("Cannot find %s in sysfs!!\n", devname);
+ return 1;
+ }
+ strcpy(nm, nmp);
+ return IncrementalScan(c, nm);
+}
+
+int Manage_stop(char *devname, int fd, int verbose, int will_retry)
{
- /* Run or stop the array. Array must already be configured
- * 'Run' requires >= 0.90.0
- * 'will_retry' is only relevant for 'stop', and means
- * that error messages are not wanted.
+ /* Stop the array. Array must already be configured
+ * 'will_retry' means that error messages are not wanted.
*/
- mdu_param_t param; /* unused */
int rv = 0;
+ struct map_ent *map = NULL;
+ struct mdinfo *mdi;
+ char devnm[32];
+ char container[32];
+ int err;
+ int count;
+ char buf[32];
+ unsigned long long rd1, rd2;
if (will_retry && verbose == 0)
verbose = -1;
- if (runstop == -1 && md_get_version(fd) < 9000) {
+ if (md_get_version(fd) < 9000) {
if (ioctl(fd, STOP_MD, 0) == 0)
return 0;
- pr_err("stopping device %s "
- "failed: %s\n",
+ pr_err("stopping device %s failed: %s\n",
devname, strerror(errno));
return 1;
}
- if (md_get_version(fd) < 9000) {
- pr_err("need md driver version 0.90.0 or later\n");
+ strcpy(devnm, fd2devnm(fd));
+ /* Get EXCL access first. If this fails, then attempting
+ * to stop is probably a bad idea.
+ */
+ mdi = sysfs_read(fd, NULL, GET_LEVEL|GET_COMPONENT|GET_VERSION);
+ if (mdi && is_subarray(mdi->text_version)) {
+ char *sl;
+ strncpy(container, mdi->text_version+1, sizeof(container));
+ container[sizeof(container)-1] = 0;
+ sl = strchr(container, '/');
+ if (sl)
+ *sl = 0;
+ } else
+ container[0] = 0;
+ close(fd);
+ count = 5;
+ while (((fd = ((devname[0] == '/')
+ ?open(devname, O_RDONLY|O_EXCL)
+ :open_dev_flags(devnm, O_RDONLY|O_EXCL))) < 0
+ || strcmp(fd2devnm(fd), devnm) != 0)
+ && container[0]
+ && mdmon_running(container)
+ && count) {
+ /* Can't open, so something might be wrong. However it
+ * is a container, so we might be racing with mdmon, so
+ * retry for a bit.
+ */
+ if (fd >= 0)
+ close(fd);
+ flush_mdmon(container);
+ count--;
+ }
+ if (fd < 0 || strcmp(fd2devnm(fd), devnm) != 0) {
+ if (fd >= 0)
+ close(fd);
+ if (verbose >= 0)
+ pr_err("Cannot get exclusive access to %s:Perhaps a running process, mounted filesystem or active volume group?\n",
+ devname);
return 1;
}
+ /* If this is an mdmon managed array, just write 'inactive'
+ * to the array state and let mdmon clear up.
+ */
+ if (mdi &&
+ mdi->array.level > 0 &&
+ is_subarray(mdi->text_version)) {
+ int err;
+ /* This is mdmon managed. */
+ close(fd);
- if (runstop > 0) {
- if (ioctl(fd, RUN_ARRAY, ¶m)) {
+ /* As we had an O_EXCL open, any use of the device
+ * which blocks STOP_ARRAY is probably a transient use,
+ * so it is reasonable to retry for a while - 5 seconds.
+ */
+ count = 25;
+ while (count &&
+ (err = sysfs_set_str(mdi, NULL,
+ "array_state",
+ "inactive")) < 0
+ && errno == EBUSY) {
+ usleep(200000);
+ count--;
+ }
+ if (err) {
if (verbose >= 0)
- pr_err("failed to run array %s: %s\n",
+ pr_err("failed to stop array %s: %s\n",
devname, strerror(errno));
- return 1;
+ rv = 1;
+ goto out;
}
- if (verbose >= 0)
- pr_err("started %s\n", devname);
- } else if (runstop < 0){
- struct map_ent *map = NULL;
- struct stat stb;
- struct mdinfo *mdi;
- int devnum;
- int err;
- int count;
- /* If this is an mdmon managed array, just write 'inactive'
- * to the array state and let mdmon clear up.
- */
- devnum = fd2devnum(fd);
- /* Get EXCL access first. If this fails, then attempting
- * to stop is probably a bad idea.
- */
- close(fd);
- fd = open(devname, O_RDONLY|O_EXCL);
- if (fd < 0 || fd2devnum(fd) != devnum) {
- if (fd >= 0)
- close(fd);
+
+ /* Give monitor a chance to act */
+ ping_monitor(mdi->text_version);
+
+ fd = open_dev_excl(devnm);
+ if (fd < 0) {
if (verbose >= 0)
- pr_err("Cannot get exclusive access to %s:"
- "Perhaps a running "
- "process, mounted filesystem "
- "or active volume group?\n",
+ pr_err("failed to completely stop %s: Device is busy\n",
devname);
- return 1;
+ rv = 1;
+ goto out;
}
- mdi = sysfs_read(fd, -1, GET_LEVEL|GET_VERSION);
- if (mdi &&
- mdi->array.level > 0 &&
- is_subarray(mdi->text_version)) {
- int err;
- /* This is mdmon managed. */
- close(fd);
+ } else if (mdi &&
+ mdi->array.major_version == -1 &&
+ mdi->array.minor_version == -2 &&
+ !is_subarray(mdi->text_version)) {
+ struct mdstat_ent *mds, *m;
+ /* container, possibly mdmon-managed.
+ * Make sure mdmon isn't opening it, which
+ * would interfere with the 'stop'
+ */
+ ping_monitor(mdi->sys_name);
- /* As we have an O_EXCL open, any use of the device
- * which blocks STOP_ARRAY is probably a transient use,
- * so it is reasonable to retry for a while - 5 seconds.
- */
- count = 25;
- while (count &&
- (err = sysfs_set_str(mdi, NULL,
- "array_state",
- "inactive")) < 0
- && errno == EBUSY) {
- usleep(200000);
- count--;
- }
- if (err) {
+ /* now check that there are no existing arrays
+ * which are members of this array
+ */
+ mds = mdstat_read(0, 0);
+ for (m = mds; m; m = m->next)
+ if (m->metadata_version &&
+ strncmp(m->metadata_version, "external:", 9)==0 &&
+ metadata_container_matches(m->metadata_version+9,
+ devnm)) {
if (verbose >= 0)
- pr_err("failed to stop array %s: %s\n",
- devname, strerror(errno));
+ pr_err("Cannot stop container %s: member %s still active\n",
+ devname, m->devnm);
+ free_mdstat(mds);
rv = 1;
goto out;
}
+ }
- /* Give monitor a chance to act */
- ping_monitor(mdi->text_version);
+ /* If the array is undergoing a reshape which changes the number
+ * of devices, then it would be nice to stop it at a point where
+ * it has completed a full number of stripes in both old and
+ * new layouts as this will allow the reshape to be reverted.
+ * So if 'sync_action' is "reshape" and 'raid_disks' shows two
+ * different numbers, then
+ * - freeze reshape
+ * - set sync_max to next multiple of both data_disks and
+ * chunk sizes (or next but one)
+ * - unfreeze reshape
+ * - wait on 'sync_completed' for that point to be reached.
+ */
+ if (mdi && (mdi->array.level >= 4 && mdi->array.level <= 6) &&
+ sysfs_attribute_available(mdi, NULL, "sync_action") &&
+ sysfs_attribute_available(mdi, NULL, "reshape_direction") &&
+ sysfs_get_str(mdi, NULL, "sync_action", buf, 20) > 0 &&
+ strcmp(buf, "reshape\n") == 0 &&
+ sysfs_get_two(mdi, NULL, "raid_disks", &rd1, &rd2) == 2) {
+ unsigned long long position, curr;
+ unsigned long long chunk1, chunk2;
+ unsigned long long rddiv, chunkdiv;
+ unsigned long long sectors;
+ unsigned long long sync_max, old_sync_max;
+ unsigned long long completed;
+ int backwards = 0;
+ int delay;
+ int scfd;
+
+ delay = 40;
+ while (rd1 > rd2 && delay > 0 &&
+ sysfs_get_ll(mdi, NULL, "sync_max", &old_sync_max) == 0) {
+ /* must be in the critical section - wait a bit */
+ delay -= 1;
+ usleep(100000);
+ }
- fd = open_dev_excl(devnum);
- if (fd < 0) {
- if (verbose >= 0)
- pr_err("failed to completely stop %s"
- ": Device is busy\n",
- devname);
- rv = 1;
- goto out;
- }
- } else if (mdi &&
- mdi->array.major_version == -1 &&
- mdi->array.minor_version == -2 &&
- !is_subarray(mdi->text_version)) {
- struct mdstat_ent *mds, *m;
- /* container, possibly mdmon-managed.
- * Make sure mdmon isn't opening it, which
- * would interfere with the 'stop'
+ if (sysfs_set_str(mdi, NULL, "sync_action", "frozen") != 0)
+ goto done;
+ /* Array is frozen */
+
+ rd1 -= mdi->array.level == 6 ? 2 : 1;
+ rd2 -= mdi->array.level == 6 ? 2 : 1;
+ sysfs_get_str(mdi, NULL, "reshape_direction", buf, sizeof(buf));
+ if (strncmp(buf, "back", 4) == 0)
+ backwards = 1;
+ if (sysfs_get_ll(mdi, NULL, "reshape_position", &position) != 0) {
+ /* reshape must have finished now */
+ sysfs_set_str(mdi, NULL, "sync_action", "idle");
+ goto done;
+ }
+ sysfs_get_two(mdi, NULL, "chunk_size", &chunk1, &chunk2);
+ chunk1 /= 512;
+ chunk2 /= 512;
+ rddiv = GCD(rd1, rd2);
+ chunkdiv = GCD(chunk1, chunk2);
+ sectors = (chunk1/chunkdiv) * chunk2 * (rd1/rddiv) * rd2;
+
+ if (backwards) {
+ /* Need to subtract 'reshape_position' from
+ * array size to get equivalent of sync_max.
+ * Size calculation based on raid5_size in kernel.
*/
- ping_monitor(mdi->sys_name);
-
- /* now check that there are no existing arrays
- * which are members of this array
+ unsigned long long size = mdi->component_size;
+ size &= ~(chunk1-1);
+ size &= ~(chunk2-1);
+ /* rd1 must be smaller */
+ /* Reshape may have progressed further backwards than
+ * recorded, so target even further back (hence "-1")
*/
- mds = mdstat_read(0, 0);
- for (m = mds; m; m = m->next)
- if (m->metadata_version &&
- strncmp(m->metadata_version, "external:", 9)==0 &&
- is_subarray(m->metadata_version+9) &&
- devname2devnum(m->metadata_version+10) == devnum) {
- if (verbose >= 0)
- pr_err("Cannot stop container %s: "
- "member %s still active\n",
- devname, m->dev);
- free_mdstat(mds);
- rv = 1;
- goto out;
- }
+ position = (position / sectors - 1) * sectors;
+ /* rd1 is always the conversion factor between 'sync'
+ * position and 'reshape' position.
+ * We read 1 "new" stripe worth of data from where-ever,
+ * and when write out that full stripe.
+ */
+ sync_max = size - position/rd1;
+ } else {
+ /* Reshape will very likely be beyond position, and it may
+ * be too late to stop at '+1', so aim for '+2'
+ */
+ position = (position / sectors + 2) * sectors;
+ sync_max = position/rd1;
}
-
- /* As we have an O_EXCL open, any use of the device
- * which blocks STOP_ARRAY is probably a transient use,
- * so it is reasonable to retry for a while - 5 seconds.
+ if (sysfs_get_ll(mdi, NULL, "sync_max", &old_sync_max) < 0)
+ old_sync_max = mdi->component_size;
+ /* Must not advance sync_max as that could confuse
+ * the reshape monitor */
+ if (sync_max < old_sync_max)
+ sysfs_set_num(mdi, NULL, "sync_max", sync_max);
+ sysfs_set_str(mdi, NULL, "sync_action", "idle");
+
+ /* That should have set things going again. Now we
+ * wait a little while (3 second max) for sync_completed
+ * to reach the target.
+ * The reshape process can block for 500msec if
+ * the sync speed limit is hit, so we need to wait
+ * a lot longer than that. 1 second is usually
+ * enough. 3 is safe.
*/
- count = 25; err = 0;
- while (count && fd >= 0
- && (err = ioctl(fd, STOP_ARRAY, NULL)) < 0
- && errno == EBUSY) {
- usleep(200000);
- count --;
- }
- if (fd >= 0 && err) {
- if (verbose >= 0) {
- pr_err("failed to stop array %s: %s\n",
- devname, strerror(errno));
- if (errno == EBUSY)
- fprintf(stderr, "Perhaps a running "
- "process, mounted filesystem "
- "or active volume group?\n");
+ delay = 3000;
+ scfd = sysfs_open(mdi->sys_name, NULL, "sync_completed");
+ while (scfd >= 0 && delay > 0 && old_sync_max > 0) {
+ unsigned long long max_completed;
+ sysfs_get_ll(mdi, NULL, "reshape_position", &curr);
+ sysfs_fd_get_str(scfd, buf, sizeof(buf));
+ if (strncmp(buf, "none", 4) == 0) {
+ /* Either reshape has aborted, or hasn't
+ * quite started yet. Wait a bit and
+ * check 'sync_action' to see.
+ */
+ usleep(10000);
+ sysfs_get_str(mdi, NULL, "sync_action", buf, sizeof(buf));
+ if (strncmp(buf, "reshape", 7) != 0)
+ break;
}
- rv = 1;
- goto out;
+
+ if (sysfs_fd_get_two(scfd, &completed,
+ &max_completed) == 2 &&
+ /* 'completed' sometimes reads as max-uulong */
+ completed < max_completed &&
+ (completed > sync_max ||
+ (completed == sync_max && curr != position))) {
+ while (completed > sync_max) {
+ sync_max += sectors / rd1;
+ if (backwards)
+ position -= sectors;
+ else
+ position += sectors;
+ }
+ if (sync_max < old_sync_max)
+ sysfs_set_num(mdi, NULL, "sync_max", sync_max);
+ }
+
+ if (!backwards && curr >= position)
+ break;
+ if (backwards && curr <= position)
+ break;
+ sysfs_wait(scfd, &delay);
}
+ if (scfd >= 0)
+ close(scfd);
+
+ }
+done:
+
+ /* As we have an O_EXCL open, any use of the device
+ * which blocks STOP_ARRAY is probably a transient use,
+ * so it is reasonable to retry for a while - 5 seconds.
+ */
+ count = 25; err = 0;
+ while (count && fd >= 0
+ && (err = ioctl(fd, STOP_ARRAY, NULL)) < 0
+ && errno == EBUSY) {
+ usleep(200000);
+ count --;
+ }
+ if (fd >= 0 && err) {
+ if (verbose >= 0) {
+ pr_err("failed to stop array %s: %s\n",
+ devname, strerror(errno));
+ if (errno == EBUSY)
+ cont_err("Perhaps a running process, mounted filesystem or active volume group?\n");
+ }
+ rv = 1;
+ goto out;
+ }
+
+ if (get_linux_version() < 2006028) {
/* prior to 2.6.28, KOBJ_CHANGE was not sent when an md array
* was stopped, so We'll do it here just to be sure. Drop any
* partitions as well...
ioctl(fd, BLKRRPART, 0);
if (mdi)
sysfs_uevent(mdi, "change");
+ }
- if (devnum != NoMdDev &&
- (stat("/dev/.udev", &stb) != 0 ||
- check_env("MDADM_NO_UDEV"))) {
- struct map_ent *mp = map_by_devnum(&map, devnum);
- remove_devices(devnum, mp ? mp->path : NULL);
- }
-
- if (verbose >= 0)
- pr_err("stopped %s\n", devname);
- map_lock(&map);
- map_remove(&map, devnum);
- map_unlock(&map);
- out:
- if (mdi)
- sysfs_free(mdi);
+ if (devnm[0] && use_udev()) {
+ struct map_ent *mp = map_by_devnm(&map, devnm);
+ remove_devices(devnm, mp ? mp->path : NULL);
}
+
+ if (verbose >= 0)
+ pr_err("stopped %s\n", devname);
+ map_lock(&map);
+ map_remove(&map, devnm);
+ map_unlock(&map);
+out:
+ sysfs_free(mdi);
+
return rv;
}
+static struct mddev_dev *add_one(struct mddev_dev *dv, char *name, char disp)
+{
+ struct mddev_dev *new;
+ new = xmalloc(sizeof(*new));
+ memset(new, 0, sizeof(*new));
+ new->devname = xstrdup(name);
+ new->disposition = disp;
+ new->next = dv->next;
+ dv->next = new;
+ return new;
+}
+
static void add_faulty(struct mddev_dev *dv, int fd, char disp)
{
mdu_array_info_t array;
remaining_disks = array.nr_disks;
for (i = 0; i < MAX_DISKS && remaining_disks > 0; i++) {
- struct mddev_dev *new;
char buf[40];
disk.number = i;
if (ioctl(fd, GET_DISK_INFO, &disk) != 0)
if ((disk.state & 1) == 0) /* not faulty */
continue;
sprintf(buf, "%d:%d", disk.major, disk.minor);
- new = xmalloc(sizeof(*new));
- new->devname = xstrdup(buf);
- new->disposition = disp;
- new->next = dv->next;
- dv->next = new;
- dv = new;
+ dv = add_one(dv, buf, disp);
}
}
remaining_disks = array.nr_disks;
for (i = 0; i < MAX_DISKS && remaining_disks > 0; i++) {
- struct mddev_dev *new;
char buf[40];
int sfd;
disk.number = i;
if (errno != ENXIO)
/* Probably not detached */
continue;
- new = xmalloc(sizeof(*new));
- new->devname = xstrdup(buf);
- new->disposition = disp;
- new->next = dv->next;
- dv->next = new;
- dv = new;
+ dv = add_one(dv, buf, disp);
+ }
+}
+
+static void add_set(struct mddev_dev *dv, int fd, char set_char)
+{
+ mdu_array_info_t array;
+ mdu_disk_info_t disk;
+ int remaining_disks;
+ int copies, set;
+ int i;
+
+ if (ioctl(fd, GET_ARRAY_INFO, &array) != 0)
+ return;
+ if (array.level != 10)
+ return;
+ copies = ((array.layout & 0xff) *
+ ((array.layout >> 8) & 0xff));
+ if (array.raid_disks % copies)
+ return;
+
+ remaining_disks = array.nr_disks;
+ for (i = 0; i < MAX_DISKS && remaining_disks > 0; i++) {
+ char buf[40];
+ disk.number = i;
+ if (ioctl(fd, GET_DISK_INFO, &disk) != 0)
+ continue;
+ if (disk.major == 0 && disk.minor == 0)
+ continue;
+ remaining_disks--;
+ set = disk.raid_disk % copies;
+ if (set_char != set + 'A')
+ continue;
+ sprintf(buf, "%d:%d", disk.major, disk.minor);
+ dv = add_one(dv, buf, dv->disposition);
}
}
disc.number = mdi.disk.number;
disc.raid_disk = mdi.disk.raid_disk;
disc.state = mdi.disk.state;
- if (dv->writemostly == 1)
+ if (array->state & (1 << MD_SB_CLUSTERED)) {
+ /* extra flags are needed when adding to a cluster as
+ * there are two cases to distinguish
+ */
+ if (dv->disposition == 'c')
+ disc.state |= (1 << MD_DISK_CANDIDATE);
+ else
+ disc.state |= (1 << MD_DISK_CLUSTER_ADD);
+ }
+ if (dv->writemostly == FlagSet)
disc.state |= 1 << MD_DISK_WRITEMOSTLY;
- if (dv->writemostly == 2)
+ if (dv->writemostly == FlagClear)
disc.state &= ~(1 << MD_DISK_WRITEMOSTLY);
+ if (dv->failfast == FlagSet)
+ disc.state |= 1 << MD_DISK_FAILFAST;
+ if (dv->failfast == FlagClear)
+ disc.state &= ~(1 << MD_DISK_FAILFAST);
remove_partitions(tfd);
- if (update || dv->writemostly > 0) {
+ if (update || dv->writemostly != FlagDefault
+ || dv->failfast != FlagDefault) {
int rv = -1;
tfd = dev_open(dv->devname, O_RDWR);
if (tfd < 0) {
- pr_err("failed to open %s for"
- " superblock update during re-add\n", dv->devname);
+ pr_err("failed to open %s for superblock update during re-add\n", dv->devname);
return -1;
}
- if (dv->writemostly == 1)
+ if (dv->writemostly == FlagSet)
rv = dev_st->ss->update_super(
dev_st, NULL, "writemostly",
devname, verbose, 0, NULL);
- if (dv->writemostly == 2)
+ if (dv->writemostly == FlagClear)
rv = dev_st->ss->update_super(
dev_st, NULL, "readwrite",
devname, verbose, 0, NULL);
+ if (dv->failfast == FlagSet)
+ rv = dev_st->ss->update_super(
+ dev_st, NULL, "failfast",
+ devname, verbose, 0, NULL);
+ if (dv->failfast == FlagClear)
+ rv = dev_st->ss->update_super(
+ dev_st, NULL, "nofailfast",
+ devname, verbose, 0, NULL);
if (update)
rv = dev_st->ss->update_super(
dev_st, NULL, update,
rv = dev_st->ss->store_super(dev_st, tfd);
close(tfd);
if (rv != 0) {
- pr_err("failed to update"
- " superblock during re-add\n");
+ pr_err("failed to update superblock during re-add\n");
return -1;
}
}
int Manage_add(int fd, int tfd, struct mddev_dev *dv,
struct supertype *tst, mdu_array_info_t *array,
int force, int verbose, char *devname,
- char *update, unsigned long rdev, unsigned long long array_size)
+ char *update, unsigned long rdev, unsigned long long array_size,
+ int raid_slot)
{
unsigned long long ldsize;
- struct supertype *dev_st = NULL;
+ struct supertype *dev_st;
int j;
mdu_disk_info_t disc;
return -1;
}
- if (tst->ss->validate_geometry(
- tst, array->level, array->layout,
- array->raid_disks, NULL,
- ldsize >> 9, INVALID_SECTORS, NULL, NULL, 0) == 0) {
+ if (tst->ss == &super0 && ldsize > 4ULL*1024*1024*1024*1024) {
+ /* More than 4TB is wasted on v0.90 */
if (!force) {
- pr_err("%s is larger than %s can "
- "effectively use.\n"
- " Add --force is you "
- "really want to add this device.\n",
+ pr_err("%s is larger than %s can effectively use.\n"
+ " Add --force is you really want to add this device.\n",
dv->devname, devname);
return -1;
}
- pr_err("%s is larger than %s can "
- "effectively use.\n"
- " Adding anyway as --force "
- "was given.\n",
+ pr_err("%s is larger than %s can effectively use.\n"
+ " Adding anyway as --force was given.\n",
dv->devname, devname);
}
if (!tst->ss->external &&
break;
}
/* FIXME this is a bad test to be using */
- if (!tst->sb && dv->disposition != 'a') {
+ if (!tst->sb && (dv->disposition != 'a'
+ && dv->disposition != 'S')) {
/* we are re-adding a device to a
* completely dead array - have to depend
* on kernel to check
}
/* Make sure device is large enough */
- if (tst->ss->avail_size(tst, ldsize/512, INVALID_SECTORS) <
+ if (dv->disposition != 'j' && /* skip size check for Journal */
+ tst->sb &&
+ tst->ss->avail_size(tst, ldsize/512, INVALID_SECTORS) <
array_size) {
if (dv->disposition == 'M')
return 0;
* simply re-add it.
*/
- if (array->not_persistent==0) {
+ if (array->not_persistent == 0) {
dev_st = dup_super(tst);
dev_st->ss->load_super(dev_st, tfd, NULL);
- }
- if (dev_st && dev_st->sb) {
- int rv = attempt_re_add(fd, tfd, dv,
- dev_st, tst,
- rdev,
- update, devname,
- verbose,
- array);
- dev_st->ss->free_super(dev_st);
- if (rv)
- return rv;
+ if (dev_st->sb && dv->disposition != 'S') {
+ int rv;
+
+ rv = attempt_re_add(fd, tfd, dv, dev_st, tst,
+ rdev, update, devname,
+ verbose, array);
+ dev_st->ss->free_super(dev_st);
+ if (rv)
+ return rv;
+ }
}
if (dv->disposition == 'M') {
if (verbose > 0)
int d;
int found = 0;
- for (d = 0; d < MAX_DISKS && found < array->active_disks; d++) {
+ for (d = 0; d < MAX_DISKS && found < array->nr_disks; d++) {
disc.number = d;
if (ioctl(fd, GET_DISK_INFO, &disc))
continue;
}
array_failed = !enough(array->level, array->raid_disks,
array->layout, 1, avail);
+ free(avail);
} else
array_failed = 0;
if (array_failed) {
}
disc.major = major(rdev);
disc.minor = minor(rdev);
- disc.number =j;
+ if (raid_slot < 0)
+ disc.number = j;
+ else
+ disc.number = raid_slot;
disc.state = 0;
+
+ /* only add journal to array that supports journaling */
+ if (dv->disposition == 'j') {
+ struct mdinfo mdi;
+ struct mdinfo *mdp;
+
+ mdp = sysfs_read(fd, NULL, GET_ARRAY_STATE);
+ if (!mdp) {
+ pr_err("%s unable to read array state.\n", devname);
+ return -1;
+ }
+
+ if (strncmp(mdp->sysfs_array_state, "readonly", 8) != 0) {
+ sysfs_free(mdp);
+ pr_err("%s is not readonly, cannot add journal.\n", devname);
+ return -1;
+ }
+
+ sysfs_free(mdp);
+
+ tst->ss->getinfo_super(tst, &mdi, NULL);
+ if (mdi.journal_device_required == 0) {
+ pr_err("%s does not support journal device.\n", devname);
+ return -1;
+ }
+ disc.raid_disk = 0;
+ }
+
if (array->not_persistent==0) {
int dfd;
- if (dv->writemostly == 1)
+ if (dv->disposition == 'j')
+ disc.state |= (1 << MD_DISK_JOURNAL) | (1 << MD_DISK_SYNC);
+ if (dv->writemostly == FlagSet)
disc.state |= 1 << MD_DISK_WRITEMOSTLY;
+ if (dv->failfast == FlagSet)
+ disc.state |= 1 << MD_DISK_FAILFAST;
dfd = dev_open(dv->devname, O_RDWR | O_EXCL|O_DIRECT);
if (tst->ss->add_to_super(tst, &disc, dfd,
dv->devname, INVALID_SECTORS))
}
free(used);
}
- if (dv->writemostly == 1)
+
+ if (array->state & (1 << MD_SB_CLUSTERED)) {
+ if (dv->disposition == 'c')
+ disc.state |= (1 << MD_DISK_CANDIDATE);
+ else
+ disc.state |= (1 << MD_DISK_CLUSTER_ADD);
+ }
+
+ if (dv->writemostly == FlagSet)
disc.state |= (1 << MD_DISK_WRITEMOSTLY);
+ if (dv->failfast == FlagSet)
+ disc.state |= (1 << MD_DISK_FAILFAST);
if (tst->ss->external) {
/* add a disk
* to an external metadata container */
struct mdinfo new_mdi;
struct mdinfo *sra;
int container_fd;
- int devnum = fd2devnum(fd);
+ char devnm[32];
int dfd;
- container_fd = open_dev_excl(devnum);
+ strcpy(devnm, fd2devnm(fd));
+
+ container_fd = open_dev_excl(devnm);
if (container_fd < 0) {
- pr_err("add failed for %s:"
- " could not get exclusive access to container\n",
+ pr_err("add failed for %s: could not get exclusive access to container\n",
dv->devname);
tst->ss->free_super(tst);
return -1;
Kill(dv->devname, NULL, 0, -1, 0);
dfd = dev_open(dv->devname, O_RDWR | O_EXCL|O_DIRECT);
- if (mdmon_running(tst->container_dev))
+ if (mdmon_running(tst->container_devnm))
tst->update_tail = &tst->updates;
if (tst->ss->add_to_super(tst, &disc, dfd,
dv->devname, INVALID_SECTORS)) {
else
tst->ss->sync_metadata(tst);
- sra = sysfs_read(container_fd, -1, 0);
+ sra = sysfs_read(container_fd, NULL, 0);
if (!sra) {
pr_err("add failed for %s: sysfs_read failed\n",
dv->devname);
* would block add_disk */
tst->ss->free_super(tst);
if (sysfs_add_disk(sra, &new_mdi, 0) != 0) {
- pr_err("add new device to external metadata"
- " failed for %s\n", dv->devname);
+ pr_err("add new device to external metadata failed for %s\n", dv->devname);
close(container_fd);
sysfs_free(sra);
return -1;
}
- ping_monitor_by_id(devnum);
+ ping_monitor(devnm);
sysfs_free(sra);
close(container_fd);
} else {
tst->ss->free_super(tst);
if (ioctl(fd, ADD_NEW_DISK, &disc)) {
- pr_err("add new device failed for %s as %d: %s\n",
- dv->devname, j, strerror(errno));
+ if (dv->disposition == 'j')
+ pr_err("Failed to hot add %s as journal, "
+ "please try restart %s.\n", dv->devname, devname);
+ else
+ pr_err("add new device failed for %s as %d: %s\n",
+ dv->devname, j, strerror(errno));
return -1;
}
+ if (dv->disposition == 'j') {
+ pr_err("Journal added successfully, making %s read-write\n", devname);
+ if (Manage_ro(devname, fd, -1))
+ pr_err("Failed to make %s read-write\n", devname);
+ }
+
}
if (verbose >= 0)
pr_err("added %s\n", dv->devname);
* get an O_EXCL open on the container
*/
int ret;
- int dnum = fd2devnum(fd);
- lfd = open_dev_excl(dnum);
+ char devnm[32];
+ strcpy(devnm, fd2devnm(fd));
+ lfd = open_dev_excl(devnm);
if (lfd < 0) {
- pr_err("Cannot get exclusive access "
- " to container - odd\n");
+ pr_err("Cannot get exclusive access to container - odd\n");
return -1;
}
/* We may not be able to check on holders in
*/
if (rdev == 0)
ret = -1;
- else
- ret = sysfs_unique_holder(dnum, rdev);
- if (ret == 0) {
- pr_err("%s is not a member, cannot remove.\n",
- dv->devname);
- close(lfd);
- return -1;
- }
- if (ret >= 2) {
- pr_err("%s is still in use, cannot remove.\n",
- dv->devname);
- close(lfd);
- return -1;
+ else {
+ /*
+ * The drive has already been set to 'faulty', however
+ * monitor might not have had time to process it and the
+ * drive might still have an entry in the 'holders'
+ * directory. Try a few times to avoid a false error
+ */
+ int count = 20;
+
+ do {
+ ret = sysfs_unique_holder(devnm, rdev);
+ if (ret < 2)
+ break;
+ usleep(100 * 1000); /* 100ms */
+ } while (--count > 0);
+
+ if (ret == 0) {
+ pr_err("%s is not a member, cannot remove.\n",
+ dv->devname);
+ close(lfd);
+ return -1;
+ }
+ if (ret >= 2) {
+ pr_err("%s is still in use, cannot remove.\n",
+ dv->devname);
+ close(lfd);
+ return -1;
+ }
}
}
/* FIXME check that it is a current member */
if (err && errno == ENODEV) {
/* Old kernels rejected this if no personality
* is registered */
- struct mdinfo *sra = sysfs_read(fd, 0, GET_DEVS);
+ struct mdinfo *sra = sysfs_read(fd, NULL, GET_DEVS);
struct mdinfo *dv = NULL;
if (sra)
dv = sra->devs;
"state", "remove");
else
err = -1;
- if (sra)
- sysfs_free(sra);
+ sysfs_free(sra);
}
}
if (err) {
- pr_err("hot remove failed "
- "for %s: %s\n", dv->devname,
+ pr_err("hot remove failed for %s: %s\n", dv->devname,
strerror(errno));
if (lfd >= 0)
close(lfd);
* 'add' event before reconciling this 'remove'
* event.
*/
- char *name = devnum2devname(fd2devnum(fd));
+ char *devnm = fd2devnm(fd);
- if (!name) {
+ if (!devnm) {
pr_err("unable to get container name\n");
return -1;
}
- ping_manager(name);
- free(name);
+ ping_manager(devnm);
}
if (lfd >= 0)
close(lfd);
/* Need to find the device in sysfs and add 'want_replacement' to the
* status.
*/
- mdi = sysfs_read(fd, -1, GET_DEVS);
+ mdi = sysfs_read(fd, NULL, GET_DEVS);
if (!mdi || !mdi->devs) {
pr_err("Cannot find status of %s to enable replacement - strange\n",
devname);
{
struct mdinfo *mdi, *di;
/* try to set 'slot' for 'rdev' in 'fd' to 'dv->used' */
- mdi = sysfs_read(fd, -1, GET_DEVS|GET_STATE);
+ mdi = sysfs_read(fd, NULL, GET_DEVS|GET_STATE);
if (!mdi || !mdi->devs) {
pr_err("Cannot find status of %s to enable replacement - strange\n",
devname);
"slot", dv->used);
if (rv) {
sysfs_free(mdi);
- pr_err("Failed to %s as preferred replacement.\n",
+ pr_err("Failed to set %s as preferred replacement.\n",
dv->devname);
return -1;
}
* 'a' - add the device
* try HOT_ADD_DISK
* If that fails EINVAL, try ADD_NEW_DISK
+ * 'S' - add the device as a spare - don't try re-add
+ * 'j' - add the device as a journal device
* 'A' - re-add the device
* 'r' - remove the device: HOT_REMOVE_DISK
* device can be 'faulty' or 'detached' in which case all
* variant on 'A'
* 'F' - Another variant of 'A', where the device was faulty
* so must be removed from the array first.
+ * 'c' - confirm the device as found (for clustered environments)
*
* For 'f' and 'r', the device can also be a kernel-internal
* name such as 'sdb'.
mdu_array_info_t array;
unsigned long long array_size;
struct mddev_dev *dv;
- struct stat stb;
int tfd = -1;
struct supertype *tst;
char *subarray = NULL;
int sysfd = -1;
int count = 0; /* number of actions taken */
struct mdinfo info;
+ struct mdinfo devinfo;
int frozen = 0;
+ int busy = 0;
+ int raid_slot = -1;
if (ioctl(fd, GET_ARRAY_INFO, &array)) {
pr_err("Cannot get array info for %s\n",
devname);
goto abort;
}
- sysfs_init(&info, fd, 0);
+ sysfs_init(&info, fd, NULL);
/* array.size is only 32 bits and may be truncated.
* So read from sysfs if possible, and record number of sectors
goto abort;
}
- stb.st_rdev = 0;
for (dv = devlist; dv; dv = dv->next) {
+ unsigned long rdev = 0; /* device to add/remove etc */
int rv;
+ int mj,mn;
+
+ raid_slot = -1;
+ if (dv->disposition == 'c') {
+ rv = parse_cluster_confirm_arg(dv->devname,
+ &dv->devname,
+ &raid_slot);
+ if (rv) {
+ pr_err("Could not get the devname of cluster\n");
+ goto abort;
+ }
+ }
if (strcmp(dv->devname, "failed") == 0 ||
strcmp(dv->devname, "faulty") == 0) {
if (dv->disposition != 'A'
&& dv->disposition != 'r') {
- pr_err("%s only meaningful "
- "with -r or --re-add, not -%c\n",
+ pr_err("%s only meaningful with -r or --re-add, not -%c\n",
dv->devname, dv->disposition);
goto abort;
}
}
if (strcmp(dv->devname, "detached") == 0) {
if (dv->disposition != 'r' && dv->disposition != 'f') {
- pr_err("%s only meaningful "
- "with -r of -f, not -%c\n",
+ pr_err("%s only meaningful with -r of -f, not -%c\n",
dv->devname, dv->disposition);
goto abort;
}
}
if (strcmp(dv->devname, "missing") == 0) {
- struct mddev_dev *add_devlist = NULL;
+ struct mddev_dev *add_devlist;
struct mddev_dev **dp;
+ if (dv->disposition == 'c') {
+ rv = ioctl(fd, CLUSTERED_DISK_NACK, NULL);
+ break;
+ }
+
if (dv->disposition != 'A') {
- pr_err("'missing' only meaningful "
- "with --re-add\n");
+ pr_err("'missing' only meaningful with --re-add\n");
goto abort;
}
add_devlist = conf_get_devs();
continue;
}
+ if (strncmp(dv->devname, "set-", 4) == 0 &&
+ strlen(dv->devname) == 5) {
+ int copies;
+
+ if (dv->disposition != 'r' &&
+ dv->disposition != 'f') {
+ pr_err("'%s' only meaningful with -r or -f\n",
+ dv->devname);
+ goto abort;
+ }
+ if (array.level != 10) {
+ pr_err("'%s' only meaningful with RAID10 arrays\n",
+ dv->devname);
+ goto abort;
+ }
+ copies = ((array.layout & 0xff) *
+ ((array.layout >> 8) & 0xff));
+ if (array.raid_disks % copies != 0 ||
+ dv->devname[4] < 'A' ||
+ dv->devname[4] >= 'A' + copies ||
+ copies > 26) {
+ pr_err("'%s' not meaningful with this array\n",
+ dv->devname);
+ goto abort;
+ }
+ add_set(dv, fd, dv->devname[4]);
+ continue;
+ }
+
if (strchr(dv->devname, '/') == NULL &&
strchr(dv->devname, ':') == NULL &&
strlen(dv->devname) < 50) {
int found = 0;
char dname[55];
if (dv->disposition != 'r' && dv->disposition != 'f') {
- pr_err("%s only meaningful "
- "with -r or -f, not -%c\n",
+ pr_err("%s only meaningful with -r or -f, not -%c\n",
dv->devname, dv->disposition);
goto abort;
}
sprintf(dname, "dev-%s", dv->devname);
- sysfd = sysfs_open(fd2devnum(fd), dname, "block/dev");
+ sysfd = sysfs_open(fd2devnm(fd), dname, "block/dev");
if (sysfd >= 0) {
char dn[20];
- int mj,mn;
if (sysfs_fd_get_str(sysfd, dn, 20) > 0 &&
sscanf(dn, "%d:%d", &mj,&mn) == 2) {
- stb.st_rdev = makedev(mj,mn);
+ rdev = makedev(mj,mn);
found = 1;
}
close(sysfd);
sysfd = -1;
}
if (!found) {
- sysfd = sysfs_open(fd2devnum(fd), dname, "state");
+ sysfd = sysfs_open(fd2devnm(fd), dname, "state");
if (sysfd < 0) {
- pr_err("%s does not appear "
- "to be a component of %s\n",
+ pr_err("%s does not appear to be a component of %s\n",
dv->devname, devname);
goto abort;
}
}
+ } else if ((dv->disposition == 'r' || dv->disposition == 'f')
+ && get_maj_min(dv->devname, &mj, &mn)) {
+ /* for 'fail' and 'remove', the device might
+ * not exist.
+ */
+ rdev = makedev(mj, mn);
} else {
+ struct stat stb;
tfd = dev_open(dv->devname, O_RDONLY);
- if (tfd >= 0)
+ if (tfd >= 0) {
fstat(tfd, &stb);
- else {
+ close(tfd);
+ } else {
int open_err = errno;
if (stat(dv->devname, &stb) != 0) {
pr_err("Cannot find %s: %s\n",
goto abort;
}
}
+ rdev = stb.st_rdev;
}
switch(dv->disposition){
default:
dv->devname, dv->disposition);
goto abort;
case 'a':
+ case 'S': /* --add-spare */
+ case 'j': /* --add-journal */
case 'A':
case 'M': /* --re-add missing */
case 'F': /* --re-add faulty */
+ case 'c': /* --cluster-confirm */
/* add the device */
if (subarray) {
- pr_err("Cannot add disks to a"
- " \'member\' array, perform this"
- " operation on the parent container\n");
+ pr_err("Cannot add disks to a \'member\' array, perform this operation on the parent container\n");
goto abort;
}
+
+ /* Let's first try to write re-add to sysfs */
+ if (rdev != 0 &&
+ (dv->disposition == 'A' || dv->disposition == 'F')) {
+ sysfs_init_dev(&devinfo, rdev);
+ if (sysfs_set_str(&info, &devinfo, "state", "re-add") == 0) {
+ pr_err("re-add %s to %s succeed\n",
+ dv->devname, info.sys_name);
+ break;
+ }
+ }
+
if (dv->disposition == 'F')
/* Need to remove first */
- ioctl(fd, HOT_REMOVE_DISK,
- (unsigned long)stb.st_rdev);
+ ioctl(fd, HOT_REMOVE_DISK, rdev);
/* Make sure it isn't in use (in 2.6 or later) */
tfd = dev_open(dv->devname, O_RDONLY|O_EXCL);
if (tfd >= 0) {
*/
close(tfd);
tfd = dev_open(dv->devname, O_RDONLY);
- }
+ }
if (tfd < 0) {
if (dv->disposition == 'M')
continue;
}
rv = Manage_add(fd, tfd, dv, tst, &array,
force, verbose, devname, update,
- stb.st_rdev, array_size);
+ rdev, array_size, raid_slot);
close(tfd);
tfd = -1;
if (rv < 0)
case 'r':
/* hot remove */
if (subarray) {
- pr_err("Cannot remove disks from a"
- " \'member\' array, perform this"
- " operation on the parent container\n");
+ pr_err("Cannot remove disks from a \'member\' array, perform this operation on the parent container\n");
rv = -1;
} else
rv = Manage_remove(tst, fd, dv, sysfd,
- stb.st_rdev, verbose,
+ rdev, verbose,
devname);
if (sysfd >= 0)
close(sysfd);
/* FIXME check current member */
if ((sysfd >= 0 && write(sysfd, "faulty", 6) != 6) ||
(sysfd < 0 && ioctl(fd, SET_DISK_FAULTY,
- (unsigned long) stb.st_rdev))) {
+ rdev))) {
+ if (errno == EBUSY)
+ busy = 1;
pr_err("set device faulty failed for %s: %s\n",
dv->devname, strerror(errno));
if (sysfd >= 0)
break;
case 'R': /* Mark as replaceable */
if (subarray) {
- pr_err("Cannot replace disks in a"
- " \'member\' array, perform this"
- " operation on the parent container\n");
+ pr_err("Cannot replace disks in a \'member\' array, perform this operation on the parent container\n");
rv = -1;
} else {
if (!frozen) {
frozen = -1;
}
rv = Manage_replace(tst, fd, dv,
- stb.st_rdev, verbose,
+ rdev, verbose,
devname);
}
if (rv < 0)
goto abort;
case 'w': /* --with device which was matched */
rv = Manage_with(tst, fd, dv,
- stb.st_rdev, verbose, devname);
+ rdev, verbose, devname);
if (rv < 0)
goto abort;
break;
abort:
if (frozen > 0)
sysfs_set_str(&info, NULL, "sync_action","idle");
- return 1;
+ return !test && busy ? 2 : 1;
}
int autodetect(void)
goto free_super;
}
- if (mdmon_running(st->devnum))
+ if (mdmon_running(st->devnm))
st->update_tail = &st->updates;
rv = st->ss->update_subarray(st, subarray, update, ident);
devlist.next = NULL;
devlist.used = 0;
- devlist.writemostly = 0;
+ devlist.writemostly = FlagDefault;
+ devlist.failfast = FlagDefault;
devlist.devname = devname;
sprintf(devname, "%d:%d", major(devid), minor(devid));