/*
* mdadm - manage Linux "md" devices aka RAID arrays.
*
- * Copyright (C) 2001-2012 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2001-2013 Neil Brown <neilb@suse.de>
*
*
* This program is free software; you can redistribute it and/or modify
#include "md_p.h"
#include <ctype.h>
-#define REGISTER_DEV _IO (MD_MAJOR, 1)
-#define START_MD _IO (MD_MAJOR, 2)
-#define STOP_MD _IO (MD_MAJOR, 3)
+#define REGISTER_DEV _IO (MD_MAJOR, 1)
+#define START_MD _IO (MD_MAJOR, 2)
+#define STOP_MD _IO (MD_MAJOR, 3)
int Manage_ro(char *devname, int fd, int readonly)
{
free(path2);
}
-int Manage_runstop(char *devname, int fd, int runstop,
- int verbose, int will_retry)
+int Manage_run(char *devname, int fd, struct context *c)
{
- /* Run or stop the array. Array must already be configured
- * 'Run' requires >= 0.90.0
- * 'will_retry' is only relevant for 'stop', and means
- * that error messages are not wanted.
+ /* Run the array. Array must already be configured
+ * Requires >= 0.90.0
+ *
+ * devname: name of the array as used in error messages.
+ * fd: open descriptor for the array; used for the driver
+ * version check and to look up the array's device name.
+ * c: context handed unchanged to IncrementalScan().
+ *
+ * Returns 1 (after printing an error) if the md driver
+ * reports a version below 0.90.0 or if no device name can
+ * be found for fd; otherwise returns whatever
+ * IncrementalScan() returns.
+ *
+ * The device name is copied into the local buffer 'nm'
+ * before the scan - presumably because fd2devnm() returns
+ * storage that may be reused by later calls (TODO confirm).
+ * NOTE(review): the copy is unbounded; it relies on the
+ * name fitting in 32 bytes - confirm against fd2devnm().
+ */
+ char nm[32], *nmp;
+
+ if (md_get_version(fd) < 9000) {
+ pr_err("need md driver version 0.90.0 or later\n");
+ return 1;
+ }
+ nmp = fd2devnm(fd);
+ if (!nmp) {
+ pr_err("Cannot find %s in sysfs!!\n", devname);
+ return 1;
+ }
+ strcpy(nm, nmp);
+ return IncrementalScan(c, nm);
+}
+
+int Manage_stop(char *devname, int fd, int verbose, int will_retry)
+{
+ /* Stop the array. Array must already be configured
+ * 'will_retry' means that error messages are not wanted.
*/
- mdu_param_t param; /* unused */
int rv = 0;
+ struct map_ent *map = NULL;
+ struct mdinfo *mdi;
+ char devnm[32];
+ char container[32];
+ int err;
+ int count;
+ char buf[32];
+ unsigned long long rd1, rd2;
if (will_retry && verbose == 0)
verbose = -1;
- if (runstop == -1 && md_get_version(fd) < 9000) {
+ if (md_get_version(fd) < 9000) {
if (ioctl(fd, STOP_MD, 0) == 0)
return 0;
pr_err("stopping device %s "
return 1;
}
- if (md_get_version(fd) < 9000) {
- pr_err("need md driver version 0.90.0 or later\n");
+ /* If this is an mdmon managed array, just write 'inactive'
+ * to the array state and let mdmon clear up.
+ */
+ strcpy(devnm, fd2devnm(fd));
+ /* Get EXCL access first. If this fails, then attempting
+ * to stop is probably a bad idea.
+ */
+ mdi = sysfs_read(fd, NULL, GET_LEVEL|GET_COMPONENT|GET_VERSION);
+ if (mdi && is_subarray(mdi->text_version)) {
+ char *sl;
+ strncpy(container, mdi->text_version+1, sizeof(container));
+ container[sizeof(container)-1] = 0;
+ sl = strchr(container, '/');
+ if (sl)
+ *sl = 0;
+ } else
+ container[0] = 0;
+ close(fd);
+ count = 5;
+ while (((fd = ((devnm[0] == '/')
+ ?open(devname, O_RDONLY|O_EXCL)
+ :open_dev_flags(devnm, O_RDONLY|O_EXCL))) < 0
+ || strcmp(fd2devnm(fd), devnm) != 0)
+ && container[0]
+ && mdmon_running(container)
+ && count) {
+ if (fd >= 0)
+ close(fd);
+ flush_mdmon(container);
+ count--;
+ }
+ if (fd < 0 || strcmp(fd2devnm(fd), devnm) != 0) {
+ if (fd >= 0)
+ close(fd);
+ if (verbose >= 0)
+ pr_err("Cannot get exclusive access to %s:"
+ "Perhaps a running "
+ "process, mounted filesystem "
+ "or active volume group?\n",
+ devname);
return 1;
}
+ if (mdi &&
+ mdi->array.level > 0 &&
+ is_subarray(mdi->text_version)) {
+ int err;
+ /* This is mdmon managed. */
+ close(fd);
- if (runstop > 0) {
- if (ioctl(fd, RUN_ARRAY, ¶m)) {
+ /* As we have an O_EXCL open, any use of the device
+ * which blocks STOP_ARRAY is probably a transient use,
+ * so it is reasonable to retry for a while - 5 seconds.
+ */
+ count = 25;
+ while (count &&
+ (err = sysfs_set_str(mdi, NULL,
+ "array_state",
+ "inactive")) < 0
+ && errno == EBUSY) {
+ usleep(200000);
+ count--;
+ }
+ if (err) {
if (verbose >= 0)
- pr_err("failed to run array %s: %s\n",
+ pr_err("failed to stop array %s: %s\n",
devname, strerror(errno));
- return 1;
+ rv = 1;
+ goto out;
}
- if (verbose >= 0)
- pr_err("started %s\n", devname);
- } else if (runstop < 0){
- struct map_ent *map = NULL;
- struct stat stb;
- struct mdinfo *mdi;
- char devnm[32];
- int err;
- int count;
- /* If this is an mdmon managed array, just write 'inactive'
- * to the array state and let mdmon clear up.
- */
- strcpy(devnm, fd2devnm(fd));
- /* Get EXCL access first. If this fails, then attempting
- * to stop is probably a bad idea.
- */
- close(fd);
- if (devnm[0] == '/')
- fd = open(devname, O_RDONLY|O_EXCL);
- else
- fd = open_dev_flags(devnm, O_RDONLY|O_EXCL);
- if (fd < 0 || strcmp(fd2devnm(fd), devnm) != 0) {
- if (fd >= 0)
- close(fd);
+
+ /* Give monitor a chance to act */
+ ping_monitor(mdi->text_version);
+
+ fd = open_dev_excl(devnm);
+ if (fd < 0) {
if (verbose >= 0)
- pr_err("Cannot get exclusive access to %s:"
- "Perhaps a running "
- "process, mounted filesystem "
- "or active volume group?\n",
+ pr_err("failed to completely stop %s"
+ ": Device is busy\n",
devname);
- return 1;
+ rv = 1;
+ goto out;
}
- mdi = sysfs_read(fd, NULL, GET_LEVEL|GET_VERSION);
- if (mdi &&
- mdi->array.level > 0 &&
- is_subarray(mdi->text_version)) {
- int err;
- /* This is mdmon managed. */
- close(fd);
-
- /* As we have an O_EXCL open, any use of the device
- * which blocks STOP_ARRAY is probably a transient use,
- * so it is reasonable to retry for a while - 5 seconds.
- */
- count = 25;
- while (count &&
- (err = sysfs_set_str(mdi, NULL,
- "array_state",
- "inactive")) < 0
- && errno == EBUSY) {
- usleep(200000);
- count--;
- }
- if (err) {
- if (verbose >= 0)
- pr_err("failed to stop array %s: %s\n",
- devname, strerror(errno));
- rv = 1;
- goto out;
- }
-
- /* Give monitor a chance to act */
- ping_monitor(mdi->text_version);
+ } else if (mdi &&
+ mdi->array.major_version == -1 &&
+ mdi->array.minor_version == -2 &&
+ !is_subarray(mdi->text_version)) {
+ struct mdstat_ent *mds, *m;
+ /* container, possibly mdmon-managed.
+ * Make sure mdmon isn't opening it, which
+ * would interfere with the 'stop'
+ */
+ ping_monitor(mdi->sys_name);
- fd = open_dev_excl(devnm);
- if (fd < 0) {
+ /* now check that there are no existing arrays
+ * which are members of this array
+ */
+ mds = mdstat_read(0, 0);
+ for (m = mds; m; m = m->next)
+ if (m->metadata_version &&
+ strncmp(m->metadata_version, "external:", 9)==0 &&
+ metadata_container_matches(m->metadata_version+9,
+ devnm)) {
if (verbose >= 0)
- pr_err("failed to completely stop %s"
- ": Device is busy\n",
- devname);
+ pr_err("Cannot stop container %s: "
+ "member %s still active\n",
+ devname, m->dev);
+ free_mdstat(mds);
rv = 1;
goto out;
}
- } else if (mdi &&
- mdi->array.major_version == -1 &&
- mdi->array.minor_version == -2 &&
- !is_subarray(mdi->text_version)) {
- struct mdstat_ent *mds, *m;
- /* container, possibly mdmon-managed.
- * Make sure mdmon isn't opening it, which
- * would interfere with the 'stop'
- */
- ping_monitor(mdi->sys_name);
+ }
- /* now check that there are no existing arrays
- * which are members of this array
+ /* If the array is undergoing a reshape which changes the number
+ * of devices, then it would be nice to stop it at a point where
+ * it has completed a full number of stripes in both old and
+ * new layouts as this will allow the reshape to be reverted.
+ * So if 'sync_action' is "reshape" and 'raid_disks' shows two
+ * different numbers, then
+ * - freeze reshape
+ * - set sync_max to next multiple of both data_disks and
+ * chunk sizes (or next but one)
+ * - unfreeze reshape
+ * - wait on 'sync_completed' for that point to be reached.
+ */
+ if (mdi && (mdi->array.level >= 4 && mdi->array.level <= 6) &&
+ sysfs_attribute_available(mdi, NULL, "sync_action") &&
+ sysfs_attribute_available(mdi, NULL, "reshape_direction") &&
+ sysfs_get_str(mdi, NULL, "sync_action", buf, 20) > 0 &&
+ strcmp(buf, "reshape\n") == 0 &&
+ sysfs_get_two(mdi, NULL, "raid_disks", &rd1, &rd2) == 2 &&
+ sysfs_set_str(mdi, NULL, "sync_action", "frozen") == 0) {
+ /* Array is frozen */
+ unsigned long long position, curr;
+ unsigned long long chunk1, chunk2;
+ unsigned long long rddiv, chunkdiv;
+ unsigned long long sectors;
+ unsigned long long sync_max, old_sync_max;
+ unsigned long long completed;
+ int backwards = 0;
+ int delay;
+ int scfd;
+
+ rd1 -= mdi->array.level == 6 ? 2 : 1;
+ rd2 -= mdi->array.level == 6 ? 2 : 1;
+ sysfs_get_str(mdi, NULL, "reshape_direction", buf, sizeof(buf));
+ if (strncmp(buf, "back", 4) == 0)
+ backwards = 1;
+ sysfs_get_ll(mdi, NULL, "reshape_position", &position);
+ sysfs_get_two(mdi, NULL, "chunk_size", &chunk1, &chunk2);
+ chunk1 /= 512;
+ chunk2 /= 512;
+ rddiv = GCD(rd1, rd2);
+ chunkdiv = GCD(chunk1, chunk2);
+ sectors = (chunk1/chunkdiv) * chunk2 * (rd1/rddiv) * rd2;
+
+ if (backwards) {
+ /* Need to subtract 'reshape_position' from
+ * array size to get equivalent of sync_max.
+ * Size calculation based on raid5_size in kernel.
*/
- mds = mdstat_read(0, 0);
- for (m = mds; m; m = m->next)
- if (m->metadata_version &&
- strncmp(m->metadata_version, "external:", 9)==0 &&
- metadata_container_matches(m->metadata_version+9,
- devnm)) {
- if (verbose >= 0)
- pr_err("Cannot stop container %s: "
- "member %s still active\n",
- devname, m->dev);
- free_mdstat(mds);
- rv = 1;
- goto out;
- }
+ unsigned long long size = mdi->component_size;
+ size &= ~(chunk1-1);
+ size &= ~(chunk2-1);
+ /* rd1 must be smaller */
+ position = (position / sectors - 1) * sectors;
+ sync_max = size - position/rd1;
+ } else {
+ position = (position / sectors + 2) * sectors;
+ sync_max = position/rd1;
}
-
- /* As we have an O_EXCL open, any use of the device
- * which blocks STOP_ARRAY is probably a transient use,
- * so it is reasonable to retry for a while - 5 seconds.
+ if (sysfs_get_ll(mdi, NULL, "sync_max", &old_sync_max) < 0)
+ old_sync_max = mdi->component_size;
+ /* Must not advance sync_max as that could confuse
+ * the reshape monitor */
+ if (sync_max < old_sync_max)
+ sysfs_set_num(mdi, NULL, "sync_max", sync_max);
+ sysfs_set_str(mdi, NULL, "sync_action", "idle");
+
+ /* That should have set things going again. Now we
+ * wait a little while (3 seconds max) for sync_completed
+ * to reach the target.
+ * The reshape process can block for 500msec if
+ * the sync speed limit is hit, so we need to wait
+ * a lot longer than that. 1 second is usually
+ * enough. 3 is safe.
*/
- count = 25; err = 0;
- while (count && fd >= 0
- && (err = ioctl(fd, STOP_ARRAY, NULL)) < 0
- && errno == EBUSY) {
- usleep(200000);
- count --;
- }
- if (fd >= 0 && err) {
- if (verbose >= 0) {
- pr_err("failed to stop array %s: %s\n",
- devname, strerror(errno));
- if (errno == EBUSY)
- fprintf(stderr, "Perhaps a running "
- "process, mounted filesystem "
- "or active volume group?\n");
+ delay = 3000;
+ scfd = sysfs_open(mdi->sys_name, NULL, "sync_completed");
+ while (scfd >= 0 && delay > 0 && old_sync_max > 0) {
+ sysfs_get_ll(mdi, NULL, "reshape_position", &curr);
+ sysfs_fd_get_str(scfd, buf, sizeof(buf));
+ if (strncmp(buf, "none", 4) == 0) {
+ /* Either reshape has aborted, or hasn't
+ * quite started yet. Wait a bit and
+ * check 'sync_action' to see.
+ */
+ usleep(10000);
+ sysfs_get_str(mdi, NULL, "sync_action", buf, sizeof(buf));
+ if (strncmp(buf, "reshape", 7) != 0)
+ break;
}
- rv = 1;
- goto out;
+
+ if (sysfs_fd_get_ll(scfd, &completed) == 0 &&
+ (completed > sync_max ||
+ (completed == sync_max && curr != position))) {
+ while (completed > sync_max) {
+ sync_max += sectors / rd1;
+ if (backwards)
+ position -= sectors;
+ else
+ position += sectors;
+ }
+ if (sync_max < old_sync_max)
+ sysfs_set_num(mdi, NULL, "sync_max", sync_max);
+ }
+
+ if (!backwards && curr >= position)
+ break;
+ if (backwards && curr <= position)
+ break;
+ sysfs_wait(scfd, &delay);
}
- /* prior to 2.6.28, KOBJ_CHANGE was not sent when an md array
- * was stopped, so We'll do it here just to be sure. Drop any
- * partitions as well...
- */
- if (fd >= 0)
- ioctl(fd, BLKRRPART, 0);
- if (mdi)
- sysfs_uevent(mdi, "change");
-
- if (devnm[0] &&
- (stat("/dev/.udev", &stb) != 0 ||
- check_env("MDADM_NO_UDEV"))) {
- struct map_ent *mp = map_by_devnm(&map, devnm);
- remove_devices(devnm, mp ? mp->path : NULL);
+ if (scfd >= 0)
+ close(scfd);
+
+ }
+
+ /* As we have an O_EXCL open, any use of the device
+ * which blocks STOP_ARRAY is probably a transient use,
+ * so it is reasonable to retry for a while - 5 seconds.
+ */
+ count = 25; err = 0;
+ while (count && fd >= 0
+ && (err = ioctl(fd, STOP_ARRAY, NULL)) < 0
+ && errno == EBUSY) {
+ usleep(200000);
+ count --;
+ }
+ if (fd >= 0 && err) {
+ if (verbose >= 0) {
+ pr_err("failed to stop array %s: %s\n",
+ devname, strerror(errno));
+ if (errno == EBUSY)
+ cont_err("Perhaps a running "
+ "process, mounted filesystem "
+ "or active volume group?\n");
}
+ rv = 1;
+ goto out;
+ }
+ /* prior to 2.6.28, KOBJ_CHANGE was not sent when an md array
+ * was stopped, so we'll do it here just to be sure. Drop any
+ * partitions as well...
+ */
+ if (fd >= 0)
+ ioctl(fd, BLKRRPART, 0);
+ if (mdi)
+ sysfs_uevent(mdi, "change");
- if (verbose >= 0)
- pr_err("stopped %s\n", devname);
- map_lock(&map);
- map_remove(&map, devnm);
- map_unlock(&map);
- out:
- if (mdi)
- sysfs_free(mdi);
+ if (devnm[0] && use_udev()) {
+ struct map_ent *mp = map_by_devnm(&map, devnm);
+ remove_devices(devnm, mp ? mp->path : NULL);
}
+
+ if (verbose >= 0)
+ pr_err("stopped %s\n", devname);
+ map_lock(&map);
+ map_remove(&map, devnm);
+ map_unlock(&map);
+out:
+ if (mdi)
+ sysfs_free(mdi);
+
return rv;
}
}
}
+/*
+ * add_set - expand a "set-X" request into one device entry per member.
+ *
+ * dv:       list entry that named the set; new entries are created with
+ *           add_one() starting from this entry, each one using the
+ *           disposition of the entry it is chained from.
+ * fd:       open descriptor for the array, used for GET_ARRAY_INFO and
+ *           GET_DISK_INFO.
+ * set_char: letter of the wanted set; 'A' is set 0, 'B' is set 1, ...
+ *
+ * Does nothing if the array info cannot be read, if the array is not
+ * level 10, or if raid_disks is not a whole multiple of the number of
+ * copies.  The copy count is the product of the two low bytes of
+ * array.layout (the near- and far-copy counts in md's RAID10 layout
+ * encoding).
+ *
+ * Every present disk (non-zero major:minor) whose raid_disk slot,
+ * taken modulo the copy count, equals the requested set is added to
+ * the list as a "major:minor" name.
+ */
+static void add_set(struct mddev_dev *dv, int fd, char set_char)
+{
+	mdu_array_info_t array;
+	mdu_disk_info_t disk;
+	int remaining_disks;
+	int copies, set;
+	int i;
+
+	if (ioctl(fd, GET_ARRAY_INFO, &array) != 0)
+		return;
+	if (array.level != 10)
+		return;
+	copies = ((array.layout & 0xff) *
+		  ((array.layout >> 8) & 0xff));
+	/* A zero byte in the layout would make 'copies' zero; it is
+	 * used as a divisor below, so reject it rather than divide
+	 * by zero.
+	 */
+	if (copies == 0)
+		return;
+	if (array.raid_disks % copies)
+		return;
+
+	/* Slots may be sparse, so scan until every disk the array
+	 * reports has been seen rather than stopping at nr_disks.
+	 */
+	remaining_disks = array.nr_disks;
+	for (i = 0; i < MAX_DISKS && remaining_disks > 0; i++) {
+		char buf[40];
+		disk.number = i;
+		if (ioctl(fd, GET_DISK_INFO, &disk) != 0)
+			continue;
+		if (disk.major == 0 && disk.minor == 0)
+			continue;
+		remaining_disks--;
+		set = disk.raid_disk % copies;
+		if (set_char != set + 'A')
+			continue;
+		sprintf(buf, "%d:%d", disk.major, disk.minor);
+		dv = add_one(dv, buf, dv->disposition);
+	}
+}
+
int attempt_re_add(int fd, int tfd, struct mddev_dev *dv,
struct supertype *dev_st, struct supertype *tst,
unsigned long rdev,
return -1;
}
- if (tst->ss->validate_geometry(
- tst, array->level, array->layout,
- array->raid_disks, NULL,
- ldsize >> 9, INVALID_SECTORS, NULL, NULL, 0) == 0) {
+ if (tst->ss == &super0 && ldsize > 4ULL*1024*1024*1024*1024) {
+ /* More than 4TB is wasted on v0.90 */
if (!force) {
pr_err("%s is larger than %s can "
"effectively use.\n"
break;
}
/* FIXME this is a bad test to be using */
- if (!tst->sb && dv->disposition != 'a') {
+ if (!tst->sb && (dv->disposition != 'a'
+ && dv->disposition != 'S')) {
/* we are re-adding a device to a
* completely dead array - have to depend
* on kernel to check
dev_st = dup_super(tst);
dev_st->ss->load_super(dev_st, tfd, NULL);
}
- if (dev_st && dev_st->sb) {
+ if (dev_st && dev_st->sb && dv->disposition != 'S') {
int rv = attempt_re_add(fd, tfd, dv,
dev_st, tst,
rdev,
continue;
if (disc.major == 0 && disc.minor == 0)
continue;
+ found++;
if (!(disc.state & (1<<MD_DISK_SYNC)))
continue;
avail[disc.raid_disk] = 1;
- found++;
}
array_failed = !enough(array->level, array->raid_disks,
array->layout, 1, avail);
+ free(avail);
} else
array_failed = 0;
if (array_failed) {
"slot", dv->used);
if (rv) {
sysfs_free(mdi);
- pr_err("Failed to %s as preferred replacement.\n",
+ pr_err("Failed to set %s as preferred replacement.\n",
dv->devname);
return -1;
}
* 'a' - add the device
* try HOT_ADD_DISK
* If that fails EINVAL, try ADD_NEW_DISK
+ * 'S' - add the device as a spare - don't try re-add
* 'A' - re-add the device
* 'r' - remove the device: HOT_REMOVE_DISK
* device can be 'faulty' or 'detached' in which case all
mdu_array_info_t array;
unsigned long long array_size;
struct mddev_dev *dv;
- struct stat stb;
int tfd = -1;
struct supertype *tst;
char *subarray = NULL;
goto abort;
}
- stb.st_rdev = 0;
for (dv = devlist; dv; dv = dv->next) {
+ unsigned long rdev = 0; /* device to add/remove etc */
int rv;
+ int mj,mn;
if (strcmp(dv->devname, "failed") == 0 ||
strcmp(dv->devname, "faulty") == 0) {
continue;
}
+ if (strncmp(dv->devname, "set-", 4) == 0 &&
+ strlen(dv->devname) == 5) {
+ int copies;
+
+ if (dv->disposition != 'r' &&
+ dv->disposition != 'f') {
+ pr_err("'%s' only meaningful with -r or -f\n",
+ dv->devname);
+ goto abort;
+ }
+ if (array.level != 10) {
+ pr_err("'%s' only meaningful with RAID10 arrays\n",
+ dv->devname);
+ goto abort;
+ }
+ copies = ((array.layout & 0xff) *
+ ((array.layout >> 8) & 0xff));
+ if (array.raid_disks % copies != 0 ||
+ dv->devname[4] < 'A' ||
+ dv->devname[4] >= 'A' + copies ||
+ copies > 26) {
+ pr_err("'%s' not meaningful with this array\n",
+ dv->devname);
+ goto abort;
+ }
+ add_set(dv, fd, dv->devname[4]);
+ continue;
+ }
+
if (strchr(dv->devname, '/') == NULL &&
strchr(dv->devname, ':') == NULL &&
strlen(dv->devname) < 50) {
sysfd = sysfs_open(fd2devnm(fd), dname, "block/dev");
if (sysfd >= 0) {
char dn[20];
- int mj,mn;
if (sysfs_fd_get_str(sysfd, dn, 20) > 0 &&
sscanf(dn, "%d:%d", &mj,&mn) == 2) {
- stb.st_rdev = makedev(mj,mn);
+ rdev = makedev(mj,mn);
found = 1;
}
close(sysfd);
goto abort;
}
}
+ } else if ((dv->disposition == 'r' || dv->disposition == 'f')
+ && get_maj_min(dv->devname, &mj, &mn)) {
+ /* for 'fail' and 'remove', the device might
+ * not exist.
+ */
+ rdev = makedev(mj, mn);
} else {
+ struct stat stb;
tfd = dev_open(dv->devname, O_RDONLY);
if (tfd >= 0)
fstat(tfd, &stb);
goto abort;
}
}
+ rdev = stb.st_rdev;
}
switch(dv->disposition){
default:
dv->devname, dv->disposition);
goto abort;
case 'a':
+ case 'S': /* --add-spare */
case 'A':
case 'M': /* --re-add missing */
case 'F': /* --re-add faulty */
}
if (dv->disposition == 'F')
/* Need to remove first */
- ioctl(fd, HOT_REMOVE_DISK,
- (unsigned long)stb.st_rdev);
+ ioctl(fd, HOT_REMOVE_DISK, rdev);
/* Make sure it isn't in use (in 2.6 or later) */
tfd = dev_open(dv->devname, O_RDONLY|O_EXCL);
if (tfd >= 0) {
*/
close(tfd);
tfd = dev_open(dv->devname, O_RDONLY);
- }
+ }
if (tfd < 0) {
if (dv->disposition == 'M')
continue;
}
rv = Manage_add(fd, tfd, dv, tst, &array,
force, verbose, devname, update,
- stb.st_rdev, array_size);
+ rdev, array_size);
close(tfd);
tfd = -1;
if (rv < 0)
rv = -1;
} else
rv = Manage_remove(tst, fd, dv, sysfd,
- stb.st_rdev, verbose,
+ rdev, verbose,
devname);
if (sysfd >= 0)
close(sysfd);
/* FIXME check current member */
if ((sysfd >= 0 && write(sysfd, "faulty", 6) != 6) ||
(sysfd < 0 && ioctl(fd, SET_DISK_FAULTY,
- (unsigned long) stb.st_rdev))) {
+ rdev))) {
if (errno == EBUSY)
busy = 1;
pr_err("set device faulty failed for %s: %s\n",
frozen = -1;
}
rv = Manage_replace(tst, fd, dv,
- stb.st_rdev, verbose,
+ rdev, verbose,
devname);
}
if (rv < 0)
goto abort;
case 'w': /* --with device which was matched */
rv = Manage_with(tst, fd, dv,
- stb.st_rdev, verbose, devname);
+ rdev, verbose, devname);
if (rv < 0)
goto abort;
break;