X-Git-Url: http://git.ipfire.org/?a=blobdiff_plain;f=Grow.c;h=d596c2de7a7df6bd9ebe0eafdb3e55c365f66f9b;hb=d0bc5190d77a60ac4971601469bbdad6803b8b56;hp=d36cb3c53404db82ef990e931be40c319dac397b;hpb=b6b951557d4e53257e1e4b3e3c0aa34445339980;p=thirdparty%2Fmdadm.git diff --git a/Grow.c b/Grow.c index d36cb3c5..d596c2de 100644 --- a/Grow.c +++ b/Grow.c @@ -530,12 +530,8 @@ static int freeze(struct supertype *st) } } -static void unfreeze(struct supertype *st, int frozen) +static void unfreeze(struct supertype *st) { - /* If 'frozen' is 1, unfreeze the array */ - if (frozen <= 0) - return; - if (st->ss->external) return unfreeze_container(st); else { @@ -569,7 +565,8 @@ static void wait_reshape(struct mdinfo *sra) static int reshape_super(struct supertype *st, long long size, int level, int layout, int chunksize, int raid_disks, - char *backup_file, char *dev, int verbose) + int delta_disks, char *backup_file, char *dev, + int verbose) { /* nothing extra to check in the native case */ if (!st->ss->external) @@ -582,7 +579,8 @@ static int reshape_super(struct supertype *st, long long size, int level, } return st->ss->reshape_super(st, size, level, layout, chunksize, - raid_disks, backup_file, dev, verbose); + raid_disks, delta_disks, backup_file, dev, + verbose); } static void sync_metadata(struct supertype *st) @@ -631,15 +629,17 @@ static int subarray_set_num(char *container, struct mdinfo *sra, char *name, int return rc; } -int start_reshape(struct mdinfo *sra) +int start_reshape(struct mdinfo *sra, int already_running) { int err; sysfs_set_num(sra, NULL, "suspend_lo", 0x7FFFFFFFFFFFFFFFULL); err = sysfs_set_num(sra, NULL, "suspend_hi", 0); err = err ?: sysfs_set_num(sra, NULL, "suspend_lo", 0); - err = err ?: sysfs_set_num(sra, NULL, "sync_min", 0); + if (!already_running) + sysfs_set_num(sra, NULL, "sync_min", 0); err = err ?: sysfs_set_num(sra, NULL, "sync_max", 0); - err = err ?: sysfs_set_str(sra, NULL, "sync_action", "reshape"); + if (!already_running) + err = err ?: sysfs_set_str(sra, NULL, "sync_action", "reshape"); return err; } @@ -654,15 +654,20 @@ void abort_reshape(struct mdinfo *sra) sysfs_set_str(sra, NULL, "sync_max", "max"); } -int remove_disks_on_raid10_to_raid0_takeover(struct supertype *st, - struct mdinfo *sra, - int layout) +int remove_disks_for_takeover(struct supertype *st, + struct mdinfo *sra, + int layout) { int nr_of_copies; struct mdinfo *remaining; int slot; - nr_of_copies = layout & 0xff; + if (sra->array.level == 10) + nr_of_copies = layout & 0xff; + else if (sra->array.level == 1) + nr_of_copies = sra->array.raid_disks; + else + return 1; remaining = sra->devs; sra->devs = NULL; @@ -715,7 +720,9 @@ int remove_disks_on_raid10_to_raid0_takeover(struct supertype *st, sysfs_set_str(sra, sd, "state", "faulty"); sysfs_set_str(sra, sd, "slot", "none"); - sysfs_set_str(sra, sd, "state", "remove"); + /* for external metadata disks should be removed in mdmon */ + if (!st->ss->external) + sysfs_set_str(sra, sd, "state", "remove"); sd->disk.state |= (1<disk.state &= ~(1<next = sra->devs; @@ -793,7 +800,8 @@ int reshape_open_backup_file(char *backup_file, char *devname, long blocks, int *fdlist, - unsigned long long *offsets) + unsigned long long *offsets, + int restart) { /* Return 1 on success, 0 on any form of failure */ /* need to check backup file is large enough */ @@ -802,7 +810,7 @@ int reshape_open_backup_file(char *backup_file, unsigned int dev; int i; - *fdlist = open(backup_file, O_RDWR|O_CREAT|O_EXCL, + *fdlist = open(backup_file, O_RDWR|O_CREAT|(restart ? O_TRUNC : O_EXCL), S_IRUSR | S_IWUSR); *offsets = 8 * 512; if (*fdlist < 0) { @@ -826,7 +834,7 @@ int reshape_open_backup_file(char *backup_file, } memset(buf, 0, 512); - for (i=0; i < blocks + 1 ; i++) { + for (i=0; i < blocks + 8 ; i++) { if (write(*fdlist, buf, 512) != 512) { fprintf(stderr, Name ": %s: cannot create" " backup file %s: %s\n", @@ -867,30 +875,6 @@ unsigned long compute_backup_blocks(int nchunk, int ochunk, return blocks; } -/* 'struct reshape' records the intermediate states - * a general reshape. - * The starting geometry is converted to the 'before' geometry - * by at most an atomic level change. They could be the same. - * Similarly the 'after' geometry is converted to the final - * geometry by at most a level change. - * Note that 'before' and 'after' must have the same level. - * 'blocks' is the minimum number of sectors for a reshape unit. - * This will be a multiple of the stripe size in each of the - * 'before' and 'after' geometries. - * If 'blocks' is 0, no restriping is necessary. - */ -struct reshape { - int level; - int parity; /* number of parity blocks/devices */ - struct { - int layout; - int data_disks; - } before, after; - unsigned long long blocks; - unsigned long long stripes; /* number of old stripes that comprise 'blocks'*/ - unsigned long long new_size; /* New size of array in sectors */ -}; - char *analyse_change(struct mdinfo *info, struct reshape *re) { /* Based on the current array state in info->array and @@ -909,6 +893,10 @@ char *analyse_change(struct mdinfo *info, struct reshape *re) * when assembling an array that is undergoing reshape. */ int new_disks; + /* delta_parity records change in number of devices + * caused by level change + */ + int delta_parity = 0; /* If a new level not explicitly given, we assume no-change */ if (info->new_level == UnSet) @@ -937,29 +925,47 @@ char *analyse_change(struct mdinfo *info, struct reshape *re) switch (info->array.level) { case 1: /* RAID1 can convert to RAID1 with different disks, or - * raid5 with 2 disks + * raid5 with 2 disks, or + * raid0 with 1 disk */ + if (info->new_level == 0) { + if (info->delta_disks != UnSet && + info->delta_disks != 0) + return "Cannot change number of disks " + "with RAID1->RAID0 conversion"; + re->level = 0; + re->before.data_disks = 1; + re->after.data_disks = 1; + re->before.layout = 0; + re->backup_blocks = 0; + re->parity = 0; + return NULL; + } if (info->new_level == 1) { if (info->delta_disks == UnSet) /* Don't know what to do */ return "no change requested for Growing RAID1"; re->level = 1; - re->before.data_disks = (info->array.raid_disks + - info->delta_disks); - re->before.layout = 0; - re->blocks = 0; + re->backup_blocks = 0; re->parity = 0; return NULL; } if (info->array.raid_disks == 2 && - info->array.raid_disks == 5) { - /* simple in-place conversion */ + info->new_level == 5) { + re->level = 5; - re->parity = 1; re->before.data_disks = 1; + if (info->delta_disks != UnSet && + info->delta_disks != 0) + re->after.data_disks = 1 + info->delta_disks; + else + re->after.data_disks = 1; + if (re->after.data_disks < 1) + return "Number of disks too small for RAID5"; + re->before.layout = ALGORITHM_LEFT_SYMMETRIC; - re->blocks = 0; - return NULL; + info->array.chunk_size = 65536; + break; } /* Could do some multi-stage conversions, but leave that to * later. @@ -980,10 +986,10 @@ char *analyse_change(struct mdinfo *info, struct reshape *re) return "RAID10 can only be changed to RAID0"; new_disks = (info->array.raid_disks / (info->array.layout & 0xff)); - if (info->delta_disks != UnSet) { + if (info->delta_disks == UnSet) info->delta_disks = (new_disks - info->array.raid_disks); - } + if (info->delta_disks != new_disks - info->array.raid_disks) return "New number of raid-devices impossible for RAID10"; if (info->new_chunk && @@ -994,8 +1000,9 @@ char *analyse_change(struct mdinfo *info, struct reshape *re) re->level = 0; re->parity = 0; re->before.data_disks = new_disks; + re->after.data_disks = re->before.data_disks; re->before.layout = 0; - re->blocks = 0; + re->backup_blocks = 0; return NULL; case 0: @@ -1031,8 +1038,9 @@ char *analyse_change(struct mdinfo *info, struct reshape *re) re->parity = 0; re->before.data_disks = (info->array.raid_disks + info->delta_disks); + re->after.data_disks = re->before.data_disks; re->before.layout = info->new_layout; - re->blocks = 0; + re->backup_blocks = 0; return NULL; } @@ -1040,16 +1048,19 @@ char *analyse_change(struct mdinfo *info, struct reshape *re) * a raid4 style layout of the final level. */ switch (info->new_level) { - case 0: case 4: + delta_parity = 1; + case 0: re->level = 4; re->before.layout = 0; break; case 5: + delta_parity = 1; re->level = 5; re->before.layout = ALGORITHM_PARITY_N; break; case 6: + delta_parity = 2; re->level = 6; re->before.layout = ALGORITHM_PARITY_N; break; @@ -1064,6 +1075,8 @@ char *analyse_change(struct mdinfo *info, struct reshape *re) info->array.layout = ALGORITHM_PARITY_N; case 5: switch (info->new_level) { + case 0: + delta_parity = -1; case 4: re->level = info->array.level; re->before.data_disks = info->array.raid_disks - 1; @@ -1075,6 +1088,7 @@ char *analyse_change(struct mdinfo *info, struct reshape *re) re->before.layout = info->array.layout; break; case 6: + delta_parity = 1; re->level = 6; re->before.data_disks = info->array.raid_disks - 1; switch (info->array.layout) { @@ -1103,9 +1117,11 @@ char *analyse_change(struct mdinfo *info, struct reshape *re) case 1: if (info->array.raid_disks != 2) return "Can only convert a 2-device array to RAID1"; + if (info->delta_disks != UnSet && + info->delta_disks != 0) + return "Cannot set raid_disk when " + "converting RAID5->RAID1"; re->level = 1; - re->before.data_disks = 2; - re->before.layout = 0; break; default: return "Impossible level change requested"; @@ -1115,6 +1131,7 @@ char *analyse_change(struct mdinfo *info, struct reshape *re) switch (info->new_level) { case 4: case 5: + delta_parity = -1; case 6: re->level = 6; re->before.data_disks = info->array.raid_disks - 2; @@ -1140,11 +1157,7 @@ char *analyse_change(struct mdinfo *info, struct reshape *re) */ if (re->level != 4 && re->level != 5) return "Cannot covert to RAID0 from this level"; - if (info->delta_disks == UnSet) - re->after.data_disks = re->before.data_disks; - else - re->after.data_disks = - info->array.raid_disks + info->delta_disks; + switch (re->level) { case 4: re->after.layout = 0 ; break; @@ -1157,11 +1170,7 @@ char *analyse_change(struct mdinfo *info, struct reshape *re) /* We can only get to RAID4 from RAID5 */ if (re->level != 4 && re->level != 5) return "Cannot convert to RAID4 from this level"; - if (info->delta_disks == UnSet) - re->after.data_disks = re->before.data_disks; - else - re->after.data_disks = - re->before.data_disks + info->delta_disks; + switch (re->level) { case 4: re->after.layout = 0 ; break; @@ -1174,14 +1183,7 @@ char *analyse_change(struct mdinfo *info, struct reshape *re) /* We get to RAID5 for RAID5 or RAID6 */ if (re->level != 5 && re->level != 6) return "Cannot convert to RAID5 from this level"; - if (info->delta_disks == UnSet) - re->after.data_disks = re->before.data_disks; - else if (re->level == 5) - re->after.data_disks = - re->before.data_disks + info->delta_disks; - else - re->after.data_disks = - info->array.raid_disks + info->delta_disks - 1; + switch (re->level) { case 5: if (info->new_layout == UnSet) @@ -1190,10 +1192,9 @@ char *analyse_change(struct mdinfo *info, struct reshape *re) re->after.layout = info->new_layout; break; case 6: - if (info->new_layout == UnSet) { - re->after.layout = re->before.layout; - break; - } + if (info->new_layout == UnSet) + info->new_layout = re->before.layout; + /* after.layout needs to be raid6 version of new_layout */ if (info->new_layout == ALGORITHM_PARITY_N) re->after.layout = ALGORITHM_PARITY_N; @@ -1215,19 +1216,20 @@ char *analyse_change(struct mdinfo *info, struct reshape *re) /* We must already be at level 6 */ if (re->level != 6) return "Impossible level change"; - if (info->delta_disks == UnSet) - re->after.data_disks = re->before.data_disks; - else - re->after.data_disks = (info->array.raid_disks + - info->delta_disks) - 2; if (info->new_layout == UnSet) - re->after.layout = re->before.layout; + re->after.layout = info->array.layout; else re->after.layout = info->new_layout; break; default: return "Impossible level change requested"; } + if (info->delta_disks == UnSet) + info->delta_disks = delta_parity; + + re->after.data_disks = (re->before.data_disks + + info->delta_disks + - delta_parity); switch (re->level) { case 6: re->parity = 2; break; case 4: @@ -1243,12 +1245,12 @@ char *analyse_change(struct mdinfo *info, struct reshape *re) re->after.layout == re->before.layout && info->new_chunk == info->array.chunk_size) { /* Nothing to change */ - re->blocks = 0; + re->backup_blocks = 0; return NULL; } if (re->after.data_disks == 1 && re->before.data_disks == 1) { - /* chunks can layout changes make no difference */ - re->blocks = 0; + /* chunk and layout changes make no difference */ + re->backup_blocks = 0; return NULL; } @@ -1260,7 +1262,7 @@ char *analyse_change(struct mdinfo *info, struct reshape *re) get_linux_version() < 2006030) return "reshape to fewer devices is not supported before 2.6.32 - sorry."; - re->blocks = compute_backup_blocks( + re->backup_blocks = compute_backup_blocks( info->new_chunk, info->array.chunk_size, re->after.data_disks, re->before.data_disks); @@ -1271,18 +1273,14 @@ char *analyse_change(struct mdinfo *info, struct reshape *re) static int reshape_array(char *container, int fd, char *devname, struct supertype *st, struct mdinfo *info, - int force, char *backup_file, int quiet, int forked); -static int reshape_container(char *container, int cfd, char *devname, + int force, char *backup_file, int quiet, int forked, + int restart); +static int reshape_container(char *container, char *devname, struct supertype *st, struct mdinfo *info, int force, char *backup_file, - int quiet); -static int child_monitor(int afd, struct mdinfo *sra, struct reshape *reshape, - unsigned long stripes, - int *fds, unsigned long long *offsets, - int dests, int *destfd, unsigned long long *destoffsets); - + int quiet, int restart); int Grow_reshape(char *devname, int fd, int quiet, char *backup_file, long long size, @@ -1381,10 +1379,8 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file, fmt_devname(container_buf, container_dev); container = container_buf; - if (subarray) - rv = st->ss->load_container(st, cfd, NULL); - else - rv = st->ss->load_super(st, cfd, NULL); + rv = st->ss->load_container(st, cfd, NULL); + if (rv) { fprintf(stderr, Name ": Cannot read superblock for %s\n", devname); @@ -1409,8 +1405,9 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file, return 1; } - sra = sysfs_read(fd, 0, GET_LEVEL | GET_DISKS | GET_DEVS | GET_STATE); - if (sra) { + sra = sysfs_read(fd, 0, GET_LEVEL | GET_DISKS | GET_DEVS + | GET_STATE | GET_VERSION); + if (sra) { if (st->ss->external && subarray == NULL) { array.level = LEVEL_CONTAINER; sra->array.level = LEVEL_CONTAINER; @@ -1434,7 +1431,8 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file, if (size >= 0 && (size == 0 || size != array.size)) { long long orig_size = array.size; - if (reshape_super(st, size, UnSet, UnSet, 0, 0, NULL, devname, !quiet)) { + if (reshape_super(st, size, UnSet, UnSet, 0, 0, UnSet, NULL, + devname, !quiet)) { rv = 1; goto release; } @@ -1456,7 +1454,7 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file, /* restore metadata */ if (reshape_super(st, orig_size, UnSet, UnSet, 0, 0, - NULL, devname, !quiet) == 0) + UnSet, NULL, devname, !quiet) == 0) sync_metadata(st); fprintf(stderr, Name ": Cannot set device size for %s: %s\n", devname, strerror(err)); @@ -1480,15 +1478,17 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file, size = array.size; } - /* ========= check for Raid10 -> Raid0 conversion =============== + /* ========= check for Raid10/Raid1 -> Raid0 conversion =============== * current implementation assumes that following conditions must be met: - * - far_copies == 1 - * - near_copies == 2 + * - RAID10: + * - far_copies == 1 + * - near_copies == 2 */ - if (level == 0 && array.level == 10 && sra && - array.layout == ((1 << 8) + 2) && !(array.raid_disks & 1)) { + if ((level == 0 && array.level == 10 && sra && + array.layout == ((1 << 8) + 2) && !(array.raid_disks & 1)) || + (level == 0 && array.level == 1 && sra)) { int err; - err = remove_disks_on_raid10_to_raid0_takeover(st, sra, array.layout); + err = remove_disks_for_takeover(st, sra, array.layout); if (err) { dprintf(Name": Array cannot be reshaped\n"); if (cfd > -1) @@ -1496,14 +1496,20 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file, rv = 1; goto release; } + /* FIXME this is added with no justification - why is it here */ + ping_monitor(container); } info.array = array; sysfs_init(&info, fd, NoMdDev); + strcpy(info.text_version, sra->text_version); info.component_size = size*2; info.new_level = level; info.new_chunk = chunksize * 1024; - if (raid_disks) + if (info.array.level == LEVEL_CONTAINER) { + info.delta_disks = UnSet; + info.array.raid_disks = raid_disks; + } else if (raid_disks) info.delta_disks = raid_disks - info.array.raid_disks; else info.delta_disks = UnSet; @@ -1579,32 +1585,49 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file, * number of devices (On-Line Capacity Expansion) must be * performed at the level of the container */ - rv = reshape_container(container, fd, devname, st, &info, - force, backup_file, quiet); + rv = reshape_container(container, devname, st, &info, + force, backup_file, quiet, 0); + frozen = 0; } else { + /* get spare devices from external metadata + */ + if (st->ss->external) { + struct mdinfo *info2; + + info2 = st->ss->container_content(st, subarray); + if (info2) { + info.array.spare_disks = + info2->array.spare_disks; + sysfs_free(info2); + } + } + /* Impose these changes on a single array. First * check that the metadata is OK with the change. */ if (reshape_super(st, info.component_size, info.new_level, info.new_layout, info.new_chunk, - info.array.raid_disks + info.delta_disks, + info.array.raid_disks, info.delta_disks, backup_file, devname, quiet)) { rv = 1; goto release; } sync_metadata(st); rv = reshape_array(container, fd, devname, st, &info, force, - backup_file, quiet, 0); + backup_file, quiet, 0, 0); + frozen = 0; } release: - unfreeze(st, frozen); + if (frozen > 0) + unfreeze(st); return rv; } static int reshape_array(char *container, int fd, char *devname, struct supertype *st, struct mdinfo *info, int force, - char *backup_file, int quiet, int forked) + char *backup_file, int quiet, int forked, + int restart) { struct reshape reshape; int spares_needed; @@ -1614,34 +1637,65 @@ static int reshape_array(char *container, int fd, char *devname, struct mdu_array_info_s array; char *c; - int rv = 0; int *fdlist; unsigned long long *offsets; int d; int nrdisks; int err; - int frozen; - unsigned long blocks, stripes; + unsigned long blocks; unsigned long cache; unsigned long long array_size; int done; - struct mdinfo *sra, *sd; - - msg = analyse_change(info, &reshape); + struct mdinfo *sra = NULL; + + if (info->reshape_active) { + int new_level = info->new_level; + info->new_level = UnSet; + msg = analyse_change(info, &reshape); + info->new_level = new_level; + if (!restart) + /* Make sure the array isn't read-only */ + ioctl(fd, RESTART_ARRAY_RW, 0); + } else + msg = analyse_change(info, &reshape); if (msg) { fprintf(stderr, Name ": %s\n", msg); - return 1; + goto release; + } + if (restart && + (reshape.level != info->array.level || + reshape.before.layout != info->array.layout || + reshape.before.data_disks + reshape.parity != info->array.raid_disks)) { + fprintf(stderr, Name ": reshape info is not in native format -" + " cannot continue.\n"); + goto release; } if (ioctl(fd, GET_ARRAY_INFO, &array) != 0) { - dprintf("Canot get array information.\n"); - return 1; + dprintf("Cannot get array information.\n"); + goto release; } + + if (restart) { + /* reshape already started. just skip to monitoring the reshape */ + if (reshape.backup_blocks == 0) + return 0; + goto started; + } + /* The container is frozen but the array may not be. + * So freeze the array so spares don't get put to the wrong use + * FIXME there should probably be a cleaner separation between + * freeze_array and freeze_container. + */ + sysfs_freeze_array(info); + /* Check we have enough spares to not be degraded */ spares_needed = max(reshape.before.data_disks, reshape.after.data_disks) + reshape.parity - array.raid_disks; - if (!force && spares_needed < info->array.spare_disks) { + if (!force && + info->new_level > 1 && + spares_needed > info->array.spare_disks) { fprintf(stderr, Name ": Need %d spare%s to avoid degraded array," " and only have %d.\n" @@ -1649,14 +1703,28 @@ static int reshape_array(char *container, int fd, char *devname, spares_needed, spares_needed == 1 ? "" : "s", info->array.spare_disks); - return 1; + goto release; + } + /* Check we have enough spares to not fail */ + spares_needed = max(reshape.before.data_disks, + reshape.after.data_disks) + - array.raid_disks; + if ((info->new_level > 1 || info->new_level == 0) && + spares_needed > info->array.spare_disks) { + fprintf(stderr, + Name ": Need %d spare%s to create working array," + " and only have %d.\n", + spares_needed, + spares_needed == 1 ? "" : "s", + info->array.spare_disks); + goto release; } - if (reshape.level != info->array.level) { + if (reshape.level != array.level) { char *c = map_num(pers, reshape.level); int err; if (c == NULL) - return 1; /* This should not be possible */ + goto release; err = sysfs_set_str(info, NULL, "level", c); if (err) { @@ -1667,20 +1735,21 @@ static int reshape_array(char *container, int fd, char *devname, (info->array.state & (1<array.level; - } + orig_level = array.level; + sysfs_freeze_array(info); - if (reshape.level > 0 && st->ss->external && - !mdmon_running(st->container_dev)) { - start_mdmon(st->container_dev); - ping_monitor(container); + if (reshape.level > 0 && st->ss->external) { + /* make sure mdmon is aware of the new level */ + if (!mdmon_running(st->container_dev)) + start_mdmon(st->container_dev); + ping_monitor(container); + } } - /* ->reshape_super might have chosen some spares from the * container that it wants to be part of the new array. * We can collect them with ->container_content and give @@ -1692,7 +1761,8 @@ static int reshape_array(char *container, int fd, char *devname, st->ss->container_content(st, subarray); struct mdinfo *d; - if (info2) + if (info2) { + sysfs_init(info2, fd, st->devnum); for (d = info2->devs; d; d = d->next) { if (d->disk.state == 0 && d->disk.raid_disk >= 0) { @@ -1702,45 +1772,55 @@ static int reshape_array(char *container, int fd, char *devname, add_disk(fd, st, info2, d); } } - sysfs_free(info2); + sysfs_free(info2); + } } - if (reshape.blocks == 0) { + if (reshape.backup_blocks == 0) { /* No restriping needed, but we might need to impose * some more changes: layout, raid_disks, chunk_size */ + /* read current array info */ + if (ioctl(fd, GET_ARRAY_INFO, &array) != 0) { + dprintf("Cannot get array information.\n"); + goto release; + } + /* compare current array info with new values and if + * it is different update them to new */ if (info->new_layout != UnSet && - info->new_layout != info->array.layout) { - info->array.layout = info->new_layout; - if (ioctl(fd, SET_ARRAY_INFO, &info->array) != 0) { + info->new_layout != array.layout) { + array.layout = info->new_layout; + if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) { fprintf(stderr, Name ": failed to set new layout\n"); - rv = 1; + goto release; } else if (!quiet) printf("layout for %s set to %d\n", - devname, info->array.layout); + devname, array.layout); } if (info->delta_disks != UnSet && - info->delta_disks != 0) { - info->array.raid_disks += info->delta_disks; - if (ioctl(fd, SET_ARRAY_INFO, &info->array) != 0) { + info->delta_disks != 0 && + array.raid_disks != (info->array.raid_disks + info->delta_disks)) { + array.raid_disks += info->delta_disks; + if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) { fprintf(stderr, Name ": failed to set raid disks\n"); - rv = 1; - } else if (!quiet) + goto release; + } else if (!quiet) { printf("raid_disks for %s set to %d\n", - devname, info->array.raid_disks); + devname, array.raid_disks); + } } if (info->new_chunk != 0 && - info->new_chunk != info->array.chunk_size) { + info->new_chunk != array.chunk_size) { if (sysfs_set_num(info, NULL, "chunk_size", info->new_chunk) != 0) { fprintf(stderr, Name ": failed to set chunk size\n"); - rv = 1; + goto release; } else if (!quiet) printf("chunk size for %s set to %d\n", - devname, info->array.chunk_size); + devname, array.chunk_size); } - - return rv; + unfreeze(st); + return 0; } /* @@ -1776,7 +1856,7 @@ static int reshape_array(char *container, int fd, char *devname, * - request the shape change. * - fork to handle backup etc. */ - +started: /* Check that we can hold all the data */ get_dev_size(fd, NULL, &array_size); if (reshape.new_size < (array_size/512)) { @@ -1785,25 +1865,22 @@ static int reshape_array(char *container, int fd, char *devname, " use --grow --array-size first to truncate array.\n" " e.g. mdadm --grow %s --array-size %llu\n", devname, reshape.new_size/2); - rv = 1; goto release; } sra = sysfs_read(fd, 0, GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE|GET_CHUNK| GET_CACHE); - if (!sra) { fprintf(stderr, Name ": %s: Cannot get array details from sysfs\n", devname); - rv = 1; goto release; } /* Decide how many blocks (sectors) for a reshape * unit. The number we have so far is just a minimum */ - blocks = reshape.blocks; + blocks = reshape.backup_blocks; if (reshape.before.data_disks == reshape.after.data_disks) { /* Make 'blocks' bigger for better throughput, but @@ -1821,47 +1898,44 @@ static int reshape_array(char *container, int fd, char *devname, fprintf(stderr, Name ": %s: Something wrong" " - reshape aborted\n", devname); - rv = 1; goto release; } /* Now we need to open all these devices so we can read/write. */ - nrdisks = array.raid_disks + sra->array.spare_disks; + nrdisks = max(reshape.before.data_disks, + reshape.after.data_disks) + reshape.parity + + sra->array.spare_disks; fdlist = malloc((1+nrdisks) * sizeof(int)); offsets = malloc((1+nrdisks) * sizeof(offsets[0])); if (!fdlist || !offsets) { fprintf(stderr, Name ": malloc failed: grow aborted\n"); - rv = 1; goto release; } - d = reshape_prepare_fdlist(devname, sra, array.raid_disks, + odisks = reshape.before.data_disks + reshape.parity; + d = reshape_prepare_fdlist(devname, sra, odisks, nrdisks, blocks, backup_file, fdlist, offsets); if (d < 0) { - rv = 1; goto release; } if (backup_file == NULL) { - if (reshape.after.data_disks <= reshape.before.data_disks) { + if (reshape.after.data_disks <= reshape.before.data_disks) { fprintf(stderr, Name ": %s: Cannot grow - need backup-file\n", devname); - rv = 1; goto release; } else if (sra->array.spare_disks == 0) { fprintf(stderr, Name ": %s: Cannot grow - need a spare or " "backup-file to backup critical section\n", devname); - rv = 1; goto release; } } else { if (!reshape_open_backup_file(backup_file, fd, devname, (signed)blocks, - fdlist+d, offsets+d)) { - rv = 1; + fdlist+d, offsets+d, restart)) { goto release; } d++; @@ -1891,13 +1965,28 @@ static int reshape_array(char *container, int fd, char *devname, */ sync_metadata(st); + sra->new_chunk = info->new_chunk; + + if (restart) + sra->reshape_progress = info->reshape_progress; + else { + sra->reshape_progress = 0; + if (reshape.after.data_disks < reshape.before.data_disks) + /* start from the end of the new array */ + sra->reshape_progress = (sra->component_size + * reshape.after.data_disks); + } + if (info->array.chunk_size == info->new_chunk && reshape.before.layout == reshape.after.layout && st->ss->external == 0) { + /* use SET_ARRAY_INFO but only if reshape hasn't started */ + ioctl(fd, GET_ARRAY_INFO, &array); array.raid_disks = reshape.after.data_disks + reshape.parity; - if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) { + if (!restart && + ioctl(fd, SET_ARRAY_INFO, &array) != 0) { int err = errno; - rv = 1; + fprintf(stderr, Name ": Cannot set device shape for %s: %s\n", devname, strerror(errno)); @@ -1910,24 +1999,21 @@ static int reshape_array(char *container, int fd, char *devname, goto release; } - } else { + } else if (!restart) { /* set them all just in case some old 'new_*' value - * persists from some earlier problem + * persists from some earlier problem. */ - int err = err; /* only used if rv==1, and always set if - * rv==1, so initialisation not needed, - * despite gcc warning - */ + int err = 0; if (sysfs_set_num(sra, NULL, "chunk_size", info->new_chunk) < 0) - rv = 1, err = errno; - if (!rv && sysfs_set_num(sra, NULL, "layout", + err = errno; + if (!err && sysfs_set_num(sra, NULL, "layout", reshape.after.layout) < 0) - rv = 1, err = errno; - if (!rv && subarray_set_num(container, sra, "raid_disks", + err = errno; + if (!err && subarray_set_num(container, sra, "raid_disks", reshape.after.data_disks + reshape.parity) < 0) - rv = 1, err = errno; - if (rv) { + err = errno; + if (err) { fprintf(stderr, Name ": Cannot set device shape for %s\n", devname); @@ -1940,193 +2026,167 @@ static int reshape_array(char *container, int fd, char *devname, } } - start_reshape(sra); - if (st->ss->external) { - /* metadata handler takes it from here */ - ping_manager(container); - st->ss->manage_reshape(st, backup_file); - frozen = 0; - goto release; - } - - /* set up the backup-super-block. This requires the - * uuid from the array. - */ - /* Find a superblock */ - for (sd = sra->devs; sd; sd = sd->next) { - char *dn; - int devfd; - int ok; - if (sd->disk.state & (1<disk.major, sd->disk.minor, 1); - devfd = dev_open(dn, O_RDONLY); - if (devfd < 0) - continue; - ok = st->ss->load_super(st, devfd, NULL); - close(devfd); - if (ok >= 0) - break; - } - if (!sd) { - fprintf(stderr, Name ": %s: Cannot find a superblock\n", + err = start_reshape(sra, restart); + if (err) { + fprintf(stderr, + Name ": Cannot %s reshape for %s\n", + restart ? "continue" : "start", devname); - rv = 1; - abort_reshape(sra); goto release; } - - memset(&bsb, 0, 512); - memcpy(bsb.magic, "md_backup_data-1", 16); - st->ss->uuid_from_super(st, (int*)&bsb.set_uuid); - bsb.mtime = __cpu_to_le64(time(0)); - bsb.devstart2 = blocks; - - stripes = reshape.blocks / (info->array.chunk_size/512) / - reshape.before.data_disks; + if (restart) + sysfs_set_str(sra, NULL, "array_state", "active"); /* Now we just need to kick off the reshape and watch, while * handling backups of the data... * This is all done by a forked background process. */ switch(forked ? 0 : fork()) { + case -1: + fprintf(stderr, Name ": Cannot run child to monitor reshape: %s\n", + strerror(errno)); + abort_reshape(sra); + goto release; + default: + return 0; case 0: - close(fd); - if (check_env("MDADM_GROW_VERIFY")) - fd = open(devname, O_RDONLY | O_DIRECT); - else - fd = -1; - mlockall(MCL_FUTURE); + break; + } - odisks = reshape.before.data_disks + reshape.parity; + close(fd); + if (check_env("MDADM_GROW_VERIFY")) + fd = open(devname, O_RDONLY | O_DIRECT); + else + fd = -1; + mlockall(MCL_FUTURE); - done = child_monitor(fd, sra, &reshape, stripes, - fdlist, offsets, - d - odisks, fdlist+odisks, offsets+odisks); + if (st->ss->external) { + /* metadata handler takes it from here */ + done = st->ss->manage_reshape( + fd, sra, &reshape, st, blocks, + fdlist, offsets, + d - odisks, fdlist+odisks, + offsets+odisks); + } else + done = child_monitor( + fd, sra, &reshape, st, blocks, + fdlist, offsets, + d - odisks, fdlist+odisks, + offsets+odisks); + + if (backup_file && done) + unlink(backup_file); + if (!done) { + abort_reshape(sra); + goto out; + } - if (backup_file && done) - unlink(backup_file); - if (!done) { - abort_reshape(sra); - goto out; - } - /* set new array size if required customer_array_size is used - * by this metadata. + if (!st->ss->external && + !(reshape.before.data_disks != reshape.after.data_disks + && info->custom_array_size) && + info->new_level == reshape.level && + !forked) { + /* no need to wait for the reshape to finish as + * there is nothing more to do. */ - if (reshape.before.data_disks != - reshape.after.data_disks && - info->custom_array_size) { - struct mdinfo *info2; - char *subarray = strchr(info->text_version+1, '/')+1; + exit(0); + } + wait_reshape(sra); - wait_reshape(sra); + if (st->ss->external) { + /* Re-load the metadata as much could have changed */ + int cfd = open_dev(st->container_dev); + if (cfd >= 0) { ping_monitor(container); - - info2 = st->ss->container_content(st, subarray); - if (info2) { - unsigned long long current_size = 0; - unsigned long long new_size = - info2->custom_array_size/2; - - if (sysfs_get_ll(sra, - NULL, - "array_size", - ¤t_size) == 0 && - new_size > current_size) { - if (sysfs_set_num(sra, NULL, - "array_size", new_size) - < 0) - dprintf("Error: Cannot" - " set array size"); - else - dprintf("Array size " - "changed"); - dprintf(" from %llu to %llu.\n", - current_size, new_size); - } - sysfs_free(info2); - } + st->ss->free_super(st); + st->ss->load_container(st, cfd, container); + close(cfd); } + } - if (info->new_level != reshape.level) { - /* We need to wait for the reshape to finish - * (which will have happened unless - * odata < ndata) and then set the level - */ - - if (reshape.before.data_disks < - reshape.after.data_disks) - wait_reshape(sra); + /* set new array size if required customer_array_size is used + * by this metadata. + */ + if (reshape.before.data_disks != + reshape.after.data_disks && + info->custom_array_size) { + struct mdinfo *info2; + char *subarray = strchr(info->text_version+1, '/')+1; - c = map_num(pers, info->new_level); - if (c == NULL) { - if (forked) - return 1; - exit(0);/* not possible */ + info2 = st->ss->container_content(st, subarray); + if (info2) { + unsigned long long current_size = 0; + unsigned long long new_size = + info2->custom_array_size/2; + + if (sysfs_get_ll(sra, + NULL, + "array_size", + ¤t_size) == 0 && + new_size > current_size) { + if (sysfs_set_num(sra, NULL, + "array_size", new_size) + < 0) + dprintf("Error: Cannot" + " set array size"); + else + dprintf("Array size " + "changed"); + dprintf(" from %llu to %llu.\n", + current_size, new_size); } + sysfs_free(info2); + } + } + + if (info->new_level != reshape.level) { + c = map_num(pers, info->new_level); + if (c) { err = sysfs_set_str(sra, NULL, "level", c); if (err) fprintf(stderr, Name\ ": %s: could not set level " "to %s\n", devname, c); } - out: - if (forked) - return 0; - exit(0); - case -1: - fprintf(stderr, Name ": Cannot run child to monitor reshape: %s\n", - strerror(errno)); - rv = 1; - abort_reshape(sra); - break; - default: - /* The child will take care of unfreezing the array */ - frozen = 0; - break; } +out: + if (forked) + return 0; + unfreeze(st); + exit(0); - - release: - if (rv) { - unfreeze(st, frozen); - return rv; - } - if (container) - ping_monitor(container); - if (st->ss->external) { - /* Re-load the metadata as much could have changed */ - int cfd = open_dev(st->container_dev); - if (cfd >= 0) { - st->ss->free_super(st); - st->ss->load_container(st, cfd, container); - close(cfd); - } - } - if (rv && orig_level != UnSet && sra) { +release: + if (orig_level != UnSet && sra) { c = map_num(pers, orig_level); if (c && sysfs_set_str(sra, NULL, "level", c) == 0) fprintf(stderr, Name ": aborting level change\n"); } - unfreeze(st, frozen); - return rv; + if (!forked) + unfreeze(st); + return 1; } -int reshape_container(char *container, int cfd, char *devname, +int reshape_container(char *container, char *devname, struct supertype *st, struct mdinfo *info, int force, char *backup_file, - int quiet) + int quiet, int restart) { struct mdinfo *cc = NULL; - if (reshape_super(st, info->component_size, info->new_level, + /* component_size is not meaningful for a container, + * so pass '-1' meaning 'no change' + */ + if (!restart && + reshape_super(st, -1, info->new_level, info->new_layout, info->new_chunk, - info->array.raid_disks + info->delta_disks, - backup_file, devname, quiet)) + info->array.raid_disks, info->delta_disks, + backup_file, devname, quiet)) { + unfreeze(st); return 1; + } sync_metadata(st); @@ -2137,6 +2197,7 @@ int reshape_container(char *container, int cfd, char *devname, switch (fork()) { case -1: /* error */ perror("Cannot fork to complete reshape\n"); + unfreeze(st); return 1; default: /* parent */ printf(Name ": multi-array reshape continues in background\n"); @@ -2152,6 +2213,10 @@ int reshape_container(char *container, int cfd, char *devname, * reshape it. reshape_array() will re-read the metadata * so the next time through a different array should be * ready for reshape. + * It is possible that the 'different' array will not + * be assembled yet. In that case we simple exit. + * When it is assembled, the mdadm which assembles it + * will take over the reshape. */ struct mdinfo *content; int rv; @@ -2178,7 +2243,7 @@ int reshape_container(char *container, int cfd, char *devname, if (!content) break; - fd = open_dev_excl(mdstat->devnum); + fd = open_dev(mdstat->devnum); if (fd < 0) break; adev = map_dev(dev2major(mdstat->devnum), @@ -2191,11 +2256,13 @@ int reshape_container(char *container, int cfd, char *devname, rv = reshape_array(container, fd, adev, st, content, force, - backup_file, quiet, 1); + backup_file, quiet, 1, restart); close(fd); + restart = 0; if (rv) break; } + unfreeze(st); sysfs_free(cc); exit(0); } @@ -2273,18 +2340,23 @@ int progress_reshape(struct mdinfo *info, struct reshape *reshape, * 1 if more data from backup_point - but only as far as suspend_point, * should be backed up * 0 if things are progressing smoothly - * -1 if the reshape is finished, either because it is all done, - * or due to an error. + * -1 if the reshape is finished because it is all done, + * -2 if the reshape is finished due to an error. */ int advancing = (reshape->after.data_disks >= reshape->before.data_disks); - int need_backup = (reshape->after.data_disks - == reshape->before.data_disks); + unsigned long long need_backup; /* All data between start of array and + * here will at some point need to + * be backed up. + */ unsigned long long read_offset, write_offset; - unsigned long long read_range, write_range; + unsigned long long write_range; unsigned long long max_progress, target, completed; + unsigned long long array_size = (info->component_size + * reshape->before.data_disks); int fd; + char buf[20]; /* First, we unsuspend any region that is now known to be safe. * If suspend_point is on the 'wrong' side of reshape_progress, then @@ -2292,14 +2364,14 @@ int progress_reshape(struct mdinfo *info, struct reshape *reshape, * native metadata when we don't need to back-up. */ if (advancing) { - if (info->reshape_progress < *suspend_point) + if (info->reshape_progress <= *suspend_point) sysfs_set_num(info, NULL, "suspend_lo", info->reshape_progress); } else { /* Note: this won't work in 2.6.37 and before. * Something somewhere should make sure we don't need it! */ - if (info->reshape_progress > *suspend_point) + if (info->reshape_progress >= *suspend_point) sysfs_set_num(info, NULL, "suspend_hi", info->reshape_progress); } @@ -2315,37 +2387,40 @@ int progress_reshape(struct mdinfo *info, struct reshape *reshape, * If we need to suspend more, we limit it to 128M per device, which is * rather arbitrary and should be some time-based calculation. */ - write_offset = info->reshape_progress / reshape->before.data_disks; - read_offset = info->reshape_progress / reshape->after.data_disks; - write_range = reshape->blocks / reshape->before.data_disks; - read_range = reshape->blocks / reshape->after.data_disks; + read_offset = info->reshape_progress / reshape->before.data_disks; + write_offset = info->reshape_progress / reshape->after.data_disks; + write_range = info->new_chunk/512; + if (reshape->before.data_disks == reshape->after.data_disks) + need_backup = array_size; + else + need_backup = reshape->backup_blocks; if (advancing) { - if (read_offset < write_offset + write_range) { + if (read_offset < write_offset + write_range) max_progress = backup_point; - if (max_progress <= info->reshape_progress) - need_backup = 1; - } else { + else max_progress = - (read_offset - write_range) * - reshape->before.data_disks; - } + read_offset * + reshape->after.data_disks; } else { - if (read_offset > write_offset - write_range) { + if (read_offset > write_offset - write_range) + /* Can only progress as far as has been backed up, + * which must be suspended */ max_progress = backup_point; - if (max_progress >= info->reshape_progress) - need_backup = 1; - } else { - max_progress = - (read_offset + write_range) * - reshape->before.data_disks; - /* If we are using internal metadata, then we can - * progress all the way to the suspend_point without - * worrying about backing-up/suspending along the - * way. - */ - if (max_progress < *suspend_point && - info->array.major_version >= 0) - max_progress = *suspend_point; + else if (info->reshape_progress <= need_backup) + max_progress = backup_point; + else { + if (info->array.major_version >= 0) + /* Can progress until backup is needed */ + max_progress = need_backup; + else { + /* Can progress until metadata update is required */ + max_progress = + read_offset * + reshape->after.data_disks; + /* but data must be suspended */ + if (max_progress < *suspend_point) + max_progress = *suspend_point; + } } } @@ -2354,13 +2429,15 @@ int progress_reshape(struct mdinfo *info, struct reshape *reshape, * Consider extending suspend_point 128M per device if it * is less than 64M per device beyond reshape_progress. * But always do a multiple of 'blocks' + * FIXME this is too big - it takes to long to complete + * this much. */ target = 64*1024*2 * min(reshape->before.data_disks, reshape->after.data_disks); - target /= reshape->blocks; + target /= reshape->backup_blocks; if (target < 2) target = 2; - target *= reshape->blocks; + target *= reshape->backup_blocks; /* For externally managed metadata we always need to suspend IO to * the area being reshaped so we regularly push suspend_point forward. @@ -2368,24 +2445,44 @@ int progress_reshape(struct mdinfo *info, struct reshape *reshape, * a backup. */ if (advancing) { - if ((need_backup || info->array.major_version < 0) && + if ((need_backup > info->reshape_progress + || info->array.major_version < 0) && *suspend_point < info->reshape_progress + target) { - if (max_progress < *suspend_point + 2 * target) - *suspend_point = max_progress; - else + if (need_backup < *suspend_point + 2 * target) + *suspend_point = need_backup; + else if (*suspend_point + 2 * target < array_size) *suspend_point += 2 * target; + else + *suspend_point = array_size; sysfs_set_num(info, NULL, "suspend_hi", *suspend_point); - max_progress = *suspend_point; + if (max_progress > *suspend_point) + max_progress = *suspend_point; } } else { - if ((need_backup || info->array.major_version < 0) && - *suspend_point > info->reshape_progress - target) { - if (max_progress > *suspend_point - 2 * target) - *suspend_point = max_progress; - else - *suspend_point -= 2 * target; - sysfs_set_num(info, NULL, "suspend_lo", *suspend_point); - max_progress = *suspend_point; + if (info->array.major_version >= 0) { + /* Only need to suspend when about to backup */ + if (info->reshape_progress < need_backup * 2 && + *suspend_point > 0) { + *suspend_point = 0; + sysfs_set_num(info, NULL, "suspend_lo", 0); + sysfs_set_num(info, NULL, "suspend_hi", need_backup); + } + } else { + /* Need to suspend continually */ + if (info->reshape_progress < *suspend_point) + *suspend_point = info->reshape_progress; + if (*suspend_point + target < info->reshape_progress) + /* No need to move suspend region yet */; + else { + if (*suspend_point >= 2 * target) + *suspend_point -= 2 * target; + else + *suspend_point = 0; + sysfs_set_num(info, NULL, "suspend_lo", + *suspend_point); + } + if (max_progress < *suspend_point) + max_progress = *suspend_point; } } @@ -2397,14 +2494,23 @@ int progress_reshape(struct mdinfo *info, struct reshape *reshape, * At the same time we convert wait_point to a similar number * for comparing against sync_completed. */ - if (!advancing) { - max_progress = info->component_size * reshape->after.data_disks - - max_progress; - wait_point = info->component_size * reshape->after.data_disks - - wait_point; - } + /* scale down max_progress to per_disk */ max_progress /= reshape->after.data_disks; + /* Round to chunk size as some kernels give an erroneously high number */ + max_progress /= info->new_chunk/512; + max_progress *= info->new_chunk/512; + /* And round to old chunk size as the kernel wants that */ + max_progress /= info->array.chunk_size/512; + max_progress *= info->array.chunk_size/512; + /* Limit progress to the whole device */ + if (max_progress > info->component_size) + max_progress = info->component_size; wait_point /= reshape->after.data_disks; + if (!advancing) { + /* switch from 'device offset' to 'processed block count' */ + max_progress = info->component_size - max_progress; + wait_point = info->component_size - wait_point; + } sysfs_set_num(info, NULL, "sync_max", max_progress); @@ -2416,11 +2522,11 @@ int progress_reshape(struct mdinfo *info, struct reshape *reshape, */ fd = sysfs_get_fd(info, NULL, "sync_completed"); if (fd < 0) - return -1; + goto check_progress; if (sysfs_fd_get_ll(fd, &completed) < 0) { close(fd); - return -1; + goto check_progress; } while (completed < max_progress && completed < wait_point) { /* Check that sync_action is still 'reshape' to avoid @@ -2432,14 +2538,34 @@ int progress_reshape(struct mdinfo *info, struct reshape *reshape, action, 20) <= 0 || strncmp(action, "reshape", 7) != 0) break; + /* Some kernels reset 'sync_completed' to zero + * before setting 'sync_action' to 'idle'. + * So we need these extra tests. + */ + if (completed == 0 && advancing + && info->reshape_progress > 0) + break; + if (completed == 0 && !advancing + && info->reshape_progress < (info->component_size + * reshape->after.data_disks)) + break; FD_ZERO(&rfds); FD_SET(fd, &rfds); select(fd+1, NULL, NULL, &rfds, NULL); if (sysfs_fd_get_ll(fd, &completed) < 0) { close(fd); - return -1; + goto check_progress; } } + /* Some kernels reset 'sync_completed' to zero, + * we need to have real point we are in md + */ + if (completed == 0) + completed = max_progress; + + /* some kernels can give an incorrectly high 'completed' number */ + completed /= (info->new_chunk/512); + completed *= (info->new_chunk/512); /* Convert 'completed' back in to a 'progress' number */ completed *= reshape->after.data_disks; if (!advancing) { @@ -2451,10 +2577,30 @@ int progress_reshape(struct mdinfo *info, struct reshape *reshape, close(fd); /* We return the need_backup flag. Caller will decide - * how much (a multiple of ->blocks) and will adjust - * suspend_{lo,hi} and suspend_point. + * how much - a multiple of ->backup_blocks up to *suspend_point + */ + if (advancing) + return need_backup > info->reshape_progress; + else + return need_backup >= info->reshape_progress; + +check_progress: + /* if we couldn't read a number from sync_completed, then + * either the reshape did complete, or it aborted. + * We can tell which by checking for 'none' in reshape_position. */ - return need_backup; + strcpy(buf, "hi"); + if (sysfs_get_str(info, NULL, "reshape_position", buf, sizeof(buf)) < 0 + || strncmp(buf, "none", 4) != 0) + return -2; /* abort */ + else { + /* Maybe racing with array shutdown - check state */ + if (sysfs_get_str(info, NULL, "array_state", buf, sizeof(buf)) < 0 + || strncmp(buf, "inactive", 8) == 0 + || strncmp(buf, "clear",5) == 0) + return -2; /* abort */ + return -1; /* complete */ + } } @@ -2486,7 +2632,8 @@ static int grow_backup(struct mdinfo *sra, odata--; /* Check that array hasn't become degraded, else we might backup the wrong data */ - sysfs_get_ll(sra, NULL, "degraded", &ll); + if (sysfs_get_ll(sra, NULL, "degraded", &ll) < 0) + return -1; /* FIXME this error is ignored */ new_degraded = (int)ll; if (new_degraded != *degraded) { /* check each device to ensure it is still working */ @@ -2703,10 +2850,10 @@ static void validate(int afd, int bfd, unsigned long long offset) } } -static int child_monitor(int afd, struct mdinfo *sra, struct reshape *reshape, - unsigned long stripes, - int *fds, unsigned long long *offsets, - int dests, int *destfd, unsigned long long *destoffsets) +int child_monitor(int afd, struct mdinfo *sra, struct reshape *reshape, + struct supertype *st, unsigned long blocks, + int *fds, unsigned long long *offsets, + int dests, int *destfd, unsigned long long *destoffsets) { /* Monitor a reshape where backup is being performed using * 'native' mechanism - either to a backup file, or @@ -2727,6 +2874,41 @@ static int child_monitor(int afd, struct mdinfo *sra, struct reshape *reshape, int data = reshape->before.data_disks; int disks = reshape->before.data_disks + reshape->parity; int chunk = sra->array.chunk_size; + struct mdinfo *sd; + unsigned long stripes; + + /* set up the backup-super-block. This requires the + * uuid from the array. + */ + /* Find a superblock */ + for (sd = sra->devs; sd; sd = sd->next) { + char *dn; + int devfd; + int ok; + if (sd->disk.state & (1<disk.major, sd->disk.minor, 1); + devfd = dev_open(dn, O_RDONLY); + if (devfd < 0) + continue; + ok = st->ss->load_super(st, devfd, NULL); + close(devfd); + if (ok >= 0) + break; + } + if (!sd) { + fprintf(stderr, Name ": Cannot find a superblock\n"); + return 0; + } + + memset(&bsb, 0, 512); + memcpy(bsb.magic, "md_backup_data-1", 16); + st->ss->uuid_from_super(st, (int*)&bsb.set_uuid); + bsb.mtime = __cpu_to_le64(time(0)); + bsb.devstart2 = blocks; + + stripes = blocks / (sra->array.chunk_size/512) / + reshape->before.data_disks; if (posix_memalign((void**)&buf, 4096, disks * chunk)) /* Don't start the 'reshape' */ @@ -2736,12 +2918,13 @@ static int child_monitor(int afd, struct mdinfo *sra, struct reshape *reshape, sysfs_set_num(sra, NULL, "sync_speed_min", 200000); } - array_size = sra->component_size * data; if (increasing) { + array_size = sra->component_size * reshape->after.data_disks; backup_point = sra->reshape_progress; suspend_point = 0; } else { - backup_point = array_size; + array_size = sra->component_size * reshape->before.data_disks; + backup_point = reshape->backup_blocks; suspend_point = array_size; } @@ -2772,11 +2955,6 @@ static int child_monitor(int afd, struct mdinfo *sra, struct reshape *reshape, rv = progress_reshape(sra, reshape, backup_point, wait_point, &suspend_point, &reshape_completed); - if (rv < 0) { - done = 1; - break; - } - /* external metadata would need to ping_monitor here */ sra->reshape_progress = reshape_completed; @@ -2799,19 +2977,46 @@ static int child_monitor(int afd, struct mdinfo *sra, struct reshape *reshape, destoffsets, 1); } - if (rv) { + if (rv < 0) { + if (rv == -1) + done = 1; + break; + } + + while (rv) { unsigned long long offset; - /* need to backup some space... */ + unsigned long actual_stripes; + /* Need to backup some data. + * If 'part' is not used and the desired + * backup size is suspended, do a backup, + * then consider the next part. + */ /* Check that 'part' is unused */ if (part == 0 && __le64_to_cpu(bsb.length) != 0) - abort(); /* BUG here */ + break; if (part == 1 && __le64_to_cpu(bsb.length2) != 0) - abort(); + break; offset = backup_point / data; - if (!increasing) - offset -= stripes * (chunk/512); - grow_backup(sra, offset, stripes, + actual_stripes = stripes; + if (increasing) { + if (offset + actual_stripes * (chunk/512) > + sra->component_size) + actual_stripes = ((sra->component_size - offset) + / (chunk/512)); + if (offset + actual_stripes * (chunk/512) > + suspend_point/data) + break; + } else { + if (offset < actual_stripes * (chunk/512)) + actual_stripes = offset / (chunk/512); + offset -= actual_stripes * (chunk/512); + if (offset < suspend_point/data) + break; + } + if (actual_stripes == 0) + break; + grow_backup(sra, offset, actual_stripes, fds, offsets, disks, chunk, level, layout, dests, destfd, destoffsets, @@ -2820,16 +3025,18 @@ static int child_monitor(int afd, struct mdinfo *sra, struct reshape *reshape, /* record where 'part' is up to */ part = !part; if (increasing) - backup_point += stripes * (chunk/512) * data; + backup_point += actual_stripes * (chunk/512) * data; else - backup_point -= stripes * (chunk/512) * data; + backup_point -= actual_stripes * (chunk/512) * data; } } + /* FIXME maybe call progress_reshape one more time instead */ + abort_reshape(sra); /* remove any remaining suspension */ if (reshape->before.data_disks == reshape->after.data_disks) sysfs_set_num(sra, NULL, "sync_speed_min", speed); free(buf); - return 1; /* FIXME what does this mean? */ + return done; } /* @@ -2866,6 +3073,7 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt int fd; int bsbsize; char *devname, namebuf[20]; + unsigned long long lo, hi; /* This was a spare and may have some saved data on it. * Load the superblock, find and load the @@ -2949,42 +3157,52 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt } if (bsb.magic[15] == '1') { - if (info->delta_disks >= 0) { - /* reshape_progress is increasing */ - if (__le64_to_cpu(bsb.arraystart) + __le64_to_cpu(bsb.length) < - info->reshape_progress) { - nonew: - if (verbose) - fprintf(stderr, Name ": backup-metadata found on %s but is not needed\n", devname); - continue; /* No new data here */ + if (bsb.length == 0) + continue; + if (info->delta_disks >= 0) { + /* reshape_progress is increasing */ + if (__le64_to_cpu(bsb.arraystart) + + __le64_to_cpu(bsb.length) + < info->reshape_progress) { + nonew: + if (verbose) + fprintf(stderr, Name + ": backup-metadata found on %s but is not needed\n", devname); + continue; /* No new data here */ + } + } else { + /* reshape_progress is decreasing */ + if (__le64_to_cpu(bsb.arraystart) >= + info->reshape_progress) + goto nonew; /* No new data here */ } } else { - /* reshape_progress is decreasing */ - if (__le64_to_cpu(bsb.arraystart) >= - info->reshape_progress) - goto nonew; /* No new data here */ - } - } else { - if (info->delta_disks >= 0) { - /* reshape_progress is increasing */ - if (__le64_to_cpu(bsb.arraystart) + __le64_to_cpu(bsb.length) < - info->reshape_progress && - __le64_to_cpu(bsb.arraystart2) + __le64_to_cpu(bsb.length2) < - info->reshape_progress) - goto nonew; /* No new data here */ - } else { - /* reshape_progress is decreasing */ - if (__le64_to_cpu(bsb.arraystart) >= - info->reshape_progress && - __le64_to_cpu(bsb.arraystart2) >= - info->reshape_progress) - goto nonew; /* No new data here */ - } + if (bsb.length == 0 && bsb.length2 == 0) + continue; + if (info->delta_disks >= 0) { + /* reshape_progress is increasing */ + if ((__le64_to_cpu(bsb.arraystart) + + __le64_to_cpu(bsb.length) + < info->reshape_progress) + && + (__le64_to_cpu(bsb.arraystart2) + + __le64_to_cpu(bsb.length2) + < info->reshape_progress)) + goto nonew; /* No new data here */ + } else { + /* reshape_progress is decreasing */ + if (__le64_to_cpu(bsb.arraystart) >= + info->reshape_progress && + __le64_to_cpu(bsb.arraystart2) >= + info->reshape_progress) + goto nonew; /* No new data here */ + } } if (lseek64(fd, __le64_to_cpu(bsb.devstart)*512, 0)< 0) { second_fail: if (verbose) - fprintf(stderr, Name ": Failed to verify secondary backup-metadata block on %s\n", + fprintf(stderr, Name + ": Failed to verify secondary backup-metadata block on %s\n", devname); continue; /* Cannot seek */ } @@ -3048,7 +3266,28 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt /* Ok, so the data is restored. Let's update those superblocks. */ - if (info->delta_disks >= 0) { + lo = hi = 0; + if (bsb.length) { + lo = __le64_to_cpu(bsb.arraystart); + hi = lo + __le64_to_cpu(bsb.length); + } + if (bsb.magic[15] == '2' && bsb.length2) { + unsigned long long lo1, hi1; + lo1 = __le64_to_cpu(bsb.arraystart2); + hi1 = lo1 + __le64_to_cpu(bsb.length2); + if (lo == hi) { + lo = lo1; + hi = hi1; + } else if (lo < lo1) + hi = hi1; + else + lo = lo1; + } + if (lo < hi && + (info->reshape_progress < lo || + info->reshape_progress > hi)) + /* backup does not affect reshape_progress*/ ; + else if (info->delta_disks >= 0) { info->reshape_progress = __le64_to_cpu(bsb.arraystart) + __le64_to_cpu(bsb.length); if (bsb.magic[15] == '2') { @@ -3123,10 +3362,34 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt int Grow_continue(int mdfd, struct supertype *st, struct mdinfo *info, char *backup_file) { - int err = sysfs_set_str(info, NULL, "array_state", "readonly"); + char buf[40]; + char *container = NULL; + int err; + + err = sysfs_set_str(info, NULL, "array_state", "readonly"); if (err) return err; - return reshape_array(NULL, mdfd, "array", st, info, 1, backup_file, 0, 0); -} + if (st->ss->external) { + fmt_devname(buf, st->container_dev); + container = buf; + freeze(st); + if (!mdmon_running(st->container_dev)) + start_mdmon(st->container_dev); + ping_monitor(devnum2devname(st->container_dev)); + + if (info->reshape_active == 2) { + int cfd = open_dev(st->container_dev); + if (cfd < 0) + return 1; + st->ss->load_container(st, cfd, container); + close(cfd); + return reshape_container(container, NULL, + st, info, 0, backup_file, + 0, 1); + } + } + return reshape_array(container, mdfd, "array", st, info, 1, + backup_file, 0, 0, 1); +}