* use RESTART_ARRAY_RW or STOP_ARRAY_RO
*
*/
- mdu_array_info_t array;
-#ifndef MDASSEMBLE
struct mdinfo *mdi;
-#endif
int rv = 0;
- if (md_get_version(fd) < 9000) {
- pr_err("need md driver version 0.90.0 or later\n");
- return 1;
- }
-#ifndef MDASSEMBLE
/* If this is an externally-managed array, we need to modify the
* metadata_version so that mdmon doesn't undo our change.
*/
}
goto out;
}
-#endif
- if (ioctl(fd, GET_ARRAY_INFO, &array)) {
- pr_err("%s does not appear to be active.\n",
- devname);
+
+ if (!md_array_active(fd)) {
+ pr_err("%s does not appear to be active.\n", devname);
rv = 1;
goto out;
}
}
}
out:
-#ifndef MDASSEMBLE
- if (mdi)
- sysfs_free(mdi);
-#endif
+ sysfs_free(mdi);
return rv;
}
-#ifndef MDASSEMBLE
-
static void remove_devices(char *devnm, char *path)
{
/*
*/
char nm[32], *nmp;
- if (md_get_version(fd) < 9000) {
- pr_err("need md driver version 0.90.0 or later\n");
- return 1;
- }
nmp = fd2devnm(fd);
if (!nmp) {
pr_err("Cannot find %s in sysfs!!\n", devname);
if (will_retry && verbose == 0)
verbose = -1;
- if (md_get_version(fd) < 9000) {
- if (ioctl(fd, STOP_MD, 0) == 0)
- return 0;
- pr_err("stopping device %s failed: %s\n",
- devname, strerror(errno));
- return 1;
- }
-
- /* If this is an mdmon managed array, just write 'inactive'
- * to the array state and let mdmon clear up.
- */
strcpy(devnm, fd2devnm(fd));
/* Get EXCL access first. If this fails, then attempting
* to stop is probably a bad idea.
&& container[0]
&& mdmon_running(container)
&& count) {
+ /* Can't open, so something might be wrong. However it
+ * is a container, so we might be racing with mdmon, so
+ * retry for a bit.
+ */
if (fd >= 0)
close(fd);
flush_mdmon(container);
devname);
return 1;
}
+ /* If this is an mdmon managed array, just write 'inactive'
+ * to the array state and let mdmon clear up.
+ */
if (mdi &&
mdi->array.level > 0 &&
is_subarray(mdi->text_version)) {
/* This is mdmon managed. */
close(fd);
- /* As we have an O_EXCL open, any use of the device
+ /* As we had an O_EXCL open, any use of the device
* which blocks STOP_ARRAY is probably a transient use,
* so it is reasonable to retry for a while - 5 seconds.
*/
sysfs_attribute_available(mdi, NULL, "reshape_direction") &&
sysfs_get_str(mdi, NULL, "sync_action", buf, 20) > 0 &&
strcmp(buf, "reshape\n") == 0 &&
- sysfs_get_two(mdi, NULL, "raid_disks", &rd1, &rd2) == 2 &&
- sysfs_set_str(mdi, NULL, "sync_action", "frozen") == 0) {
- /* Array is frozen */
+ sysfs_get_two(mdi, NULL, "raid_disks", &rd1, &rd2) == 2) {
unsigned long long position, curr;
unsigned long long chunk1, chunk2;
unsigned long long rddiv, chunkdiv;
int delay;
int scfd;
+ delay = 40;
+ while (rd1 > rd2 && delay > 0 &&
+ sysfs_get_ll(mdi, NULL, "sync_max", &old_sync_max) == 0) {
+ /* must be in the critical section - wait a bit */
+ delay -= 1;
+ usleep(100000);
+ }
+
+ if (sysfs_set_str(mdi, NULL, "sync_action", "frozen") != 0)
+ goto done;
+ /* Array is frozen */
+
rd1 -= mdi->array.level == 6 ? 2 : 1;
rd2 -= mdi->array.level == 6 ? 2 : 1;
sysfs_get_str(mdi, NULL, "reshape_direction", buf, sizeof(buf));
if (strncmp(buf, "back", 4) == 0)
backwards = 1;
- sysfs_get_ll(mdi, NULL, "reshape_position", &position);
+ if (sysfs_get_ll(mdi, NULL, "reshape_position", &position) != 0) {
+ /* reshape must have finished now */
+ sysfs_set_str(mdi, NULL, "sync_action", "idle");
+ goto done;
+ }
sysfs_get_two(mdi, NULL, "chunk_size", &chunk1, &chunk2);
chunk1 /= 512;
chunk2 /= 512;
size &= ~(chunk1-1);
size &= ~(chunk2-1);
/* rd1 must be smaller */
+ /* Reshape may have progressed further backwards than
+ * recorded, so target even further back (hence "-1")
+ */
position = (position / sectors - 1) * sectors;
+ /* rd1 is always the conversion factor between 'sync'
+ * position and 'reshape' position.
+ * We read 1 "new" stripe worth of data from where-ever,
+ * and when write out that full stripe.
+ */
sync_max = size - position/rd1;
} else {
+ /* Reshape will very likely be beyond position, and it may
+ * be too late to stop at '+1', so aim for '+2'
+ */
position = (position / sectors + 2) * sectors;
sync_max = position/rd1;
}
close(scfd);
}
+done:
/* As we have an O_EXCL open, any use of the device
* which blocks STOP_ARRAY is probably a transient use,
rv = 1;
goto out;
}
- /* prior to 2.6.28, KOBJ_CHANGE was not sent when an md array
- * was stopped, so We'll do it here just to be sure. Drop any
- * partitions as well...
- */
- if (fd >= 0)
- ioctl(fd, BLKRRPART, 0);
- if (mdi)
- sysfs_uevent(mdi, "change");
+
+ if (get_linux_version() < 2006028) {
+ /* prior to 2.6.28, KOBJ_CHANGE was not sent when an md array
+ * was stopped, so We'll do it here just to be sure. Drop any
+ * partitions as well...
+ */
+ if (fd >= 0)
+ ioctl(fd, BLKRRPART, 0);
+ if (mdi)
+ sysfs_uevent(mdi, "change");
+ }
if (devnm[0] && use_udev()) {
struct map_ent *mp = map_by_devnm(&map, devnm);
map_remove(&map, devnm);
map_unlock(&map);
out:
- if (mdi)
- sysfs_free(mdi);
+ sysfs_free(mdi);
return rv;
}
int remaining_disks;
int i;
- if (ioctl(fd, GET_ARRAY_INFO, &array) != 0)
+ if (md_get_array_info(fd, &array) != 0)
return;
remaining_disks = array.nr_disks;
for (i = 0; i < MAX_DISKS && remaining_disks > 0; i++) {
char buf[40];
disk.number = i;
- if (ioctl(fd, GET_DISK_INFO, &disk) != 0)
+ if (md_get_disk_info(fd, &disk) != 0)
continue;
if (disk.major == 0 && disk.minor == 0)
continue;
int remaining_disks;
int i;
- if (ioctl(fd, GET_ARRAY_INFO, &array) != 0)
+ if (md_get_array_info(fd, &array) != 0)
return;
remaining_disks = array.nr_disks;
char buf[40];
int sfd;
disk.number = i;
- if (ioctl(fd, GET_DISK_INFO, &disk) != 0)
+ if (md_get_disk_info(fd, &disk) != 0)
continue;
if (disk.major == 0 && disk.minor == 0)
continue;
int copies, set;
int i;
- if (ioctl(fd, GET_ARRAY_INFO, &array) != 0)
+ if (md_get_array_info(fd, &array) != 0)
return;
if (array.level != 10)
return;
for (i = 0; i < MAX_DISKS && remaining_disks > 0; i++) {
char buf[40];
disk.number = i;
- if (ioctl(fd, GET_DISK_INFO, &disk) != 0)
+ if (md_get_disk_info(fd, &disk) != 0)
continue;
if (disk.major == 0 && disk.minor == 0)
continue;
get_linux_version() <= 2006018)
goto skip_re_add;
disc.number = mdi.disk.number;
- if (ioctl(fd, GET_DISK_INFO, &disc) != 0
- || disc.major != 0 || disc.minor != 0
- )
+ if (md_get_disk_info(fd, &disc) != 0 ||
+ disc.major != 0 || disc.minor != 0)
goto skip_re_add;
disc.major = major(rdev);
disc.minor = minor(rdev);
disc.number = mdi.disk.number;
disc.raid_disk = mdi.disk.raid_disk;
disc.state = mdi.disk.state;
- if (dv->writemostly == 1)
+ if (array->state & (1 << MD_SB_CLUSTERED)) {
+ /* extra flags are needed when adding to a cluster as
+ * there are two cases to distinguish
+ */
+ if (dv->disposition == 'c')
+ disc.state |= (1 << MD_DISK_CANDIDATE);
+ else
+ disc.state |= (1 << MD_DISK_CLUSTER_ADD);
+ }
+ if (dv->writemostly == FlagSet)
disc.state |= 1 << MD_DISK_WRITEMOSTLY;
- if (dv->writemostly == 2)
+ if (dv->writemostly == FlagClear)
disc.state &= ~(1 << MD_DISK_WRITEMOSTLY);
+ if (dv->failfast == FlagSet)
+ disc.state |= 1 << MD_DISK_FAILFAST;
+ if (dv->failfast == FlagClear)
+ disc.state &= ~(1 << MD_DISK_FAILFAST);
remove_partitions(tfd);
- if (update || dv->writemostly > 0) {
+ if (update || dv->writemostly != FlagDefault
+ || dv->failfast != FlagDefault) {
int rv = -1;
tfd = dev_open(dv->devname, O_RDWR);
if (tfd < 0) {
return -1;
}
- if (dv->writemostly == 1)
+ if (dv->writemostly == FlagSet)
rv = dev_st->ss->update_super(
dev_st, NULL, "writemostly",
devname, verbose, 0, NULL);
- if (dv->writemostly == 2)
+ if (dv->writemostly == FlagClear)
rv = dev_st->ss->update_super(
dev_st, NULL, "readwrite",
devname, verbose, 0, NULL);
+ if (dv->failfast == FlagSet)
+ rv = dev_st->ss->update_super(
+ dev_st, NULL, "failfast",
+ devname, verbose, 0, NULL);
+ if (dv->failfast == FlagClear)
+ rv = dev_st->ss->update_super(
+ dev_st, NULL, "nofailfast",
+ devname, verbose, 0, NULL);
if (update)
rv = dev_st->ss->update_super(
dev_st, NULL, update,
int Manage_add(int fd, int tfd, struct mddev_dev *dv,
struct supertype *tst, mdu_array_info_t *array,
int force, int verbose, char *devname,
- char *update, unsigned long rdev, unsigned long long array_size)
+ char *update, unsigned long rdev, unsigned long long array_size,
+ int raid_slot)
{
unsigned long long ldsize;
- struct supertype *dev_st = NULL;
+ struct supertype *dev_st;
int j;
mdu_disk_info_t disc;
" Adding anyway as --force was given.\n",
dv->devname, devname);
}
- if (!tst->ss->external &&
- array->major_version == 0 &&
- md_get_version(fd)%100 < 2) {
+ if (!tst->ss->external && array->major_version == 0) {
if (ioctl(fd, HOT_ADD_DISK, rdev)==0) {
if (verbose >= 0)
pr_err("hot added %s\n",
char *dev;
int dfd;
disc.number = j;
- if (ioctl(fd, GET_DISK_INFO, &disc))
+ if (md_get_disk_info(fd, &disc))
continue;
if (disc.major==0 && disc.minor==0)
continue;
}
/* Make sure device is large enough */
- if (tst->sb &&
+ if (dv->disposition != 'j' && /* skip size check for Journal */
+ tst->sb &&
tst->ss->avail_size(tst, ldsize/512, INVALID_SECTORS) <
array_size) {
if (dv->disposition == 'M')
* simply re-add it.
*/
- if (array->not_persistent==0) {
+ if (array->not_persistent == 0) {
dev_st = dup_super(tst);
dev_st->ss->load_super(dev_st, tfd, NULL);
- }
- if (dev_st && dev_st->sb && dv->disposition != 'S') {
- int rv = attempt_re_add(fd, tfd, dv,
- dev_st, tst,
- rdev,
- update, devname,
- verbose,
- array);
- dev_st->ss->free_super(dev_st);
- if (rv)
- return rv;
+ if (dev_st->sb && dv->disposition != 'S') {
+ int rv;
+
+ rv = attempt_re_add(fd, tfd, dv, dev_st, tst,
+ rdev, update, devname,
+ verbose, array);
+ dev_st->ss->free_super(dev_st);
+ if (rv)
+ return rv;
+ }
}
if (dv->disposition == 'M') {
if (verbose > 0)
for (d = 0; d < MAX_DISKS && found < array->nr_disks; d++) {
disc.number = d;
- if (ioctl(fd, GET_DISK_INFO, &disc))
+ if (md_get_disk_info(fd, &disc))
continue;
if (disc.major == 0 && disc.minor == 0)
continue;
- found++;
if (!(disc.state & (1<<MD_DISK_SYNC)))
continue;
avail[disc.raid_disk] = 1;
+ found++;
}
array_failed = !enough(array->level, array->raid_disks,
array->layout, 1, avail);
*/
for (j = array->raid_disks; j < tst->max_devs; j++) {
disc.number = j;
- if (ioctl(fd, GET_DISK_INFO, &disc))
+ if (md_get_disk_info(fd, &disc))
break;
if (disc.major==0 && disc.minor==0)
break;
}
disc.major = major(rdev);
disc.minor = minor(rdev);
- disc.number =j;
+ if (raid_slot < 0)
+ disc.number = j;
+ else
+ disc.number = raid_slot;
disc.state = 0;
+
+ /* only add journal to array that supports journaling */
+ if (dv->disposition == 'j') {
+ struct mdinfo mdi;
+ struct mdinfo *mdp;
+
+ mdp = sysfs_read(fd, NULL, GET_ARRAY_STATE);
+ if (!mdp) {
+ pr_err("%s unable to read array state.\n", devname);
+ return -1;
+ }
+
+ if (mdp->array_state != ARRAY_READONLY) {
+ sysfs_free(mdp);
+ pr_err("%s is not readonly, cannot add journal.\n", devname);
+ return -1;
+ }
+
+ sysfs_free(mdp);
+
+ tst->ss->getinfo_super(tst, &mdi, NULL);
+ if (mdi.journal_device_required == 0) {
+ pr_err("%s does not support journal device.\n", devname);
+ return -1;
+ }
+ disc.raid_disk = 0;
+ }
+
if (array->not_persistent==0) {
int dfd;
- if (dv->writemostly == 1)
+ if (dv->disposition == 'j')
+ disc.state |= (1 << MD_DISK_JOURNAL) | (1 << MD_DISK_SYNC);
+ if (dv->writemostly == FlagSet)
disc.state |= 1 << MD_DISK_WRITEMOSTLY;
+ if (dv->failfast == FlagSet)
+ disc.state |= 1 << MD_DISK_FAILFAST;
dfd = dev_open(dv->devname, O_RDWR | O_EXCL|O_DIRECT);
if (tst->ss->add_to_super(tst, &disc, dfd,
dv->devname, INVALID_SECTORS))
for (j = 0; j < tst->max_devs; j++) {
mdu_disk_info_t disc2;
disc2.number = j;
- if (ioctl(fd, GET_DISK_INFO, &disc2))
+ if (md_get_disk_info(fd, &disc2))
continue;
if (disc2.major==0 && disc2.minor==0)
continue;
}
free(used);
}
- if (dv->writemostly == 1)
+
+ if (array->state & (1 << MD_SB_CLUSTERED)) {
+ if (dv->disposition == 'c')
+ disc.state |= (1 << MD_DISK_CANDIDATE);
+ else
+ disc.state |= (1 << MD_DISK_CLUSTER_ADD);
+ }
+
+ if (dv->writemostly == FlagSet)
disc.state |= (1 << MD_DISK_WRITEMOSTLY);
+ if (dv->failfast == FlagSet)
+ disc.state |= (1 << MD_DISK_FAILFAST);
if (tst->ss->external) {
/* add a disk
* to an external metadata container */
} else {
tst->ss->free_super(tst);
if (ioctl(fd, ADD_NEW_DISK, &disc)) {
- pr_err("add new device failed for %s as %d: %s\n",
- dv->devname, j, strerror(errno));
+ if (dv->disposition == 'j')
+ pr_err("Failed to hot add %s as journal, "
+ "please try restart %s.\n", dv->devname, devname);
+ else
+ pr_err("add new device failed for %s as %d: %s\n",
+ dv->devname, j, strerror(errno));
return -1;
}
+ if (dv->disposition == 'j') {
+ pr_err("Journal added successfully, making %s read-write\n", devname);
+ if (Manage_ro(devname, fd, -1))
+ pr_err("Failed to make %s read-write\n", devname);
+ }
+
}
if (verbose >= 0)
pr_err("added %s\n", dv->devname);
}
int Manage_remove(struct supertype *tst, int fd, struct mddev_dev *dv,
- int sysfd, unsigned long rdev, int verbose, char *devname)
+ int sysfd, unsigned long rdev, int force, int verbose, char *devname)
{
int lfd = -1;
int err;
*/
if (rdev == 0)
ret = -1;
- else
- ret = sysfs_unique_holder(devnm, rdev);
- if (ret == 0) {
- pr_err("%s is not a member, cannot remove.\n",
- dv->devname);
- close(lfd);
- return -1;
- }
- if (ret >= 2) {
- pr_err("%s is still in use, cannot remove.\n",
- dv->devname);
- close(lfd);
- return -1;
+ else {
+ /*
+ * The drive has already been set to 'faulty', however
+ * monitor might not have had time to process it and the
+ * drive might still have an entry in the 'holders'
+ * directory. Try a few times to avoid a false error
+ */
+ int count = 20;
+
+ do {
+ ret = sysfs_unique_holder(devnm, rdev);
+ if (ret < 2)
+ break;
+ usleep(100 * 1000); /* 100ms */
+ } while (--count > 0);
+
+ if (ret == 0) {
+ pr_err("%s is not a member, cannot remove.\n",
+ dv->devname);
+ close(lfd);
+ return -1;
+ }
+ if (ret >= 2) {
+ pr_err("%s is still in use, cannot remove.\n",
+ dv->devname);
+ close(lfd);
+ return -1;
+ }
}
}
/* FIXME check that it is a current member */
/* device has been removed and we don't know
* the major:minor number
*/
- int n = write(sysfd, "remove", 6);
- if (n != 6)
- err = -1;
- else
- err = 0;
+ err = sys_hot_remove_disk(sysfd, force);
} else {
- err = ioctl(fd, HOT_REMOVE_DISK, rdev);
+ err = hot_remove_disk(fd, rdev, force);
if (err && errno == ENODEV) {
/* Old kernels rejected this if no personality
* is registered */
"state", "remove");
else
err = -1;
- if (sra)
- sysfs_free(sra);
+ sysfs_free(sra);
}
}
if (err) {
* try HOT_ADD_DISK
* If that fails EINVAL, try ADD_NEW_DISK
* 'S' - add the device as a spare - don't try re-add
+ * 'j' - add the device as a journal device
* 'A' - re-add the device
* 'r' - remove the device: HOT_REMOVE_DISK
* device can be 'faulty' or 'detached' in which case all
* variant on 'A'
* 'F' - Another variant of 'A', where the device was faulty
* so must be removed from the array first.
+ * 'c' - confirm the device as found (for clustered environments)
*
* For 'f' and 'r', the device can also be a kernel-internal
* name such as 'sdb'.
int sysfd = -1;
int count = 0; /* number of actions taken */
struct mdinfo info;
+ struct mdinfo devinfo;
int frozen = 0;
int busy = 0;
+ int raid_slot = -1;
- if (ioctl(fd, GET_ARRAY_INFO, &array)) {
- pr_err("Cannot get array info for %s\n",
- devname);
+ if (sysfs_init(&info, fd, NULL)) {
+ pr_err("sysfs not availabile for %s\n", devname);
goto abort;
}
- sysfs_init(&info, fd, NULL);
+ if (md_get_array_info(fd, &array)) {
+ pr_err("Cannot get array info for %s\n", devname);
+ goto abort;
+ }
/* array.size is only 32 bits and may be truncated.
* So read from sysfs if possible, and record number of sectors
*/
int rv;
int mj,mn;
+ raid_slot = -1;
+ if (dv->disposition == 'c') {
+ rv = parse_cluster_confirm_arg(dv->devname,
+ &dv->devname,
+ &raid_slot);
+ if (rv) {
+ pr_err("Could not get the devname of cluster\n");
+ goto abort;
+ }
+ }
+
if (strcmp(dv->devname, "failed") == 0 ||
strcmp(dv->devname, "faulty") == 0) {
if (dv->disposition != 'A'
}
if (strcmp(dv->devname, "missing") == 0) {
- struct mddev_dev *add_devlist = NULL;
+ struct mddev_dev *add_devlist;
struct mddev_dev **dp;
+ if (dv->disposition == 'c') {
+ rv = ioctl(fd, CLUSTERED_DISK_NACK, NULL);
+ break;
+ }
+
if (dv->disposition != 'A') {
pr_err("'missing' only meaningful with --re-add\n");
goto abort;
} else {
struct stat stb;
tfd = dev_open(dv->devname, O_RDONLY);
- if (tfd >= 0)
+ if (tfd >= 0) {
fstat(tfd, &stb);
- else {
+ close(tfd);
+ } else {
int open_err = errno;
if (stat(dv->devname, &stb) != 0) {
pr_err("Cannot find %s: %s\n",
goto abort;
case 'a':
case 'S': /* --add-spare */
+ case 'j': /* --add-journal */
case 'A':
case 'M': /* --re-add missing */
case 'F': /* --re-add faulty */
+ case 'c': /* --cluster-confirm */
/* add the device */
if (subarray) {
pr_err("Cannot add disks to a \'member\' array, perform this operation on the parent container\n");
goto abort;
}
+
+ /* Let's first try to write re-add to sysfs */
+ if (rdev != 0 &&
+ (dv->disposition == 'A' || dv->disposition == 'F')) {
+ sysfs_init_dev(&devinfo, rdev);
+ if (sysfs_set_str(&info, &devinfo, "state", "re-add") == 0) {
+ pr_err("re-add %s to %s succeed\n",
+ dv->devname, info.sys_name);
+ break;
+ }
+ }
+
if (dv->disposition == 'F')
/* Need to remove first */
- ioctl(fd, HOT_REMOVE_DISK, rdev);
+ hot_remove_disk(fd, rdev, force);
/* Make sure it isn't in use (in 2.6 or later) */
tfd = dev_open(dv->devname, O_RDONLY|O_EXCL);
if (tfd >= 0) {
}
rv = Manage_add(fd, tfd, dv, tst, &array,
force, verbose, devname, update,
- rdev, array_size);
+ rdev, array_size, raid_slot);
close(tfd);
tfd = -1;
if (rv < 0)
rv = -1;
} else
rv = Manage_remove(tst, fd, dv, sysfd,
- rdev, verbose,
+ rdev, verbose, force,
devname);
if (sysfd >= 0)
close(sysfd);
devlist.next = NULL;
devlist.used = 0;
- devlist.writemostly = 0;
+ devlist.writemostly = FlagDefault;
+ devlist.failfast = FlagDefault;
devlist.devname = devname;
sprintf(devname, "%d:%d", major(devid), minor(devid));
close(fd2);
return 0;
}
-#endif