int start_reshape(struct mdinfo *sra)
{
int err;
+ sysfs_set_num(sra, NULL, "suspend_lo", 0x7FFFFFFFFFFFFFFFULL);
err = sysfs_set_num(sra, NULL, "suspend_hi", 0);
err = err ?: sysfs_set_num(sra, NULL, "suspend_lo", 0);
err = err ?: sysfs_set_num(sra, NULL, "sync_min", 0);
return blocks;
}
-/* 'struct reshape' records the intermediate states
- * a general reshape.
- * The starting geometry is converted to the 'before' geometry
- * by at most an atomic level change. They could be the same.
- * Similarly the 'after' geometry is converted to the final
- * geometry by at most a level change.
- * Note that 'before' and 'after' must have the same level.
- * 'blocks' is the minimum number of sectors for a reshape unit.
- * This will be a multiple of the stripe size in each of the
- * 'before' and 'after' geometries.
- * If 'blocks' is 0, no restriping is necessary.
- */
-struct reshape {
- int level;
- int parity; /* number of parity blocks/devices */
- struct {
- int layout;
- int data_disks;
- } before, after;
- unsigned long long blocks;
- unsigned long long stripes; /* number of old stripes that comprise 'blocks'*/
- unsigned long long new_size; /* New size of array in sectors */
-};
-
char *analyse_change(struct mdinfo *info, struct reshape *re)
{
/* Based on the current array state in info->array and
re->before.data_disks = (info->array.raid_disks +
info->delta_disks);
re->before.layout = 0;
- re->blocks = 0;
+ re->backup_blocks = 0;
re->parity = 0;
return NULL;
}
re->parity = 1;
re->before.data_disks = 1;
re->before.layout = ALGORITHM_LEFT_SYMMETRIC;
- re->blocks = 0;
+ re->backup_blocks = 0;
return NULL;
}
/* Could do some multi-stage conversions, but leave that to
re->parity = 0;
re->before.data_disks = new_disks;
re->before.layout = 0;
- re->blocks = 0;
+ re->backup_blocks = 0;
return NULL;
case 0:
re->before.data_disks = (info->array.raid_disks +
info->delta_disks);
re->before.layout = info->new_layout;
- re->blocks = 0;
+ re->backup_blocks = 0;
return NULL;
}
re->after.layout = info->new_layout;
break;
case 6:
- if (info->new_layout == UnSet) {
- re->after.layout = re->before.layout;
- break;
- }
+ if (info->new_layout == UnSet)
+ info->new_layout = re->before.layout;
+
/* after.layout needs to be raid6 version of new_layout */
if (info->new_layout == ALGORITHM_PARITY_N)
re->after.layout = ALGORITHM_PARITY_N;
re->after.data_disks = (info->array.raid_disks +
info->delta_disks) - 2;
if (info->new_layout == UnSet)
- re->after.layout = re->before.layout;
+ re->after.layout = info->array.layout;
else
re->after.layout = info->new_layout;
break;
re->after.layout == re->before.layout &&
info->new_chunk == info->array.chunk_size) {
/* Nothing to change */
- re->blocks = 0;
+ re->backup_blocks = 0;
return NULL;
}
if (re->after.data_disks == 1 && re->before.data_disks == 1) {
/* chunks can layout changes make no difference */
- re->blocks = 0;
+ re->backup_blocks = 0;
return NULL;
}
get_linux_version() < 2006030)
return "reshape to fewer devices is not supported before 2.6.32 - sorry.";
- re->blocks = compute_backup_blocks(
+ re->backup_blocks = compute_backup_blocks(
info->new_chunk, info->array.chunk_size,
re->after.data_disks,
re->before.data_disks);
int force,
char *backup_file,
int quiet);
-static int child_monitor(int afd, struct mdinfo *sra, struct reshape *reshape,
- unsigned long stripes,
- int *fds, unsigned long long *offsets,
- int dests, int *destfd, unsigned long long *destoffsets);
-
int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
long long size,
rv = reshape_array(container, fd, devname, st, &info, force,
backup_file, quiet, 0);
}
+ /* reshape_* released the array */
+ return rv;
release:
unfreeze(st, frozen);
return rv;
int nrdisks;
int err;
int frozen;
- unsigned long blocks, stripes;
+ unsigned long blocks;
unsigned long cache;
unsigned long long array_size;
int done;
- struct mdinfo *sra, *sd;
+ struct mdinfo *sra;
msg = analyse_change(info, &reshape);
if (msg) {
reshape.after.data_disks)
+ reshape.parity - array.raid_disks;
- if (!force && spares_needed < info->array.spare_disks) {
+ if (!force &&
+ info->new_level > 0 &&
+ spares_needed > info->array.spare_disks) {
fprintf(stderr,
Name ": Need %d spare%s to avoid degraded array,"
" and only have %d.\n"
return 1;
}
if (!quiet)
- fprintf(stderr, Name " level of %s changed to %s\n",
+ fprintf(stderr, Name ": level of %s changed to %s\n",
devname, c);
orig_level = info->array.level;
}
sysfs_free(info2);
}
- if (reshape.blocks == 0) {
+ if (reshape.backup_blocks == 0) {
/* No restriping needed, but we might need to impose
* some more changes: layout, raid_disks, chunk_size
*/
}
sra = sysfs_read(fd, 0,
- GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE||GET_CHUNK|
+ GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE|GET_CHUNK|
GET_CACHE);
if (!sra) {
/* Decide how many blocks (sectors) for a reshape
* unit. The number we have so far is just a minimum
*/
- blocks = reshape.blocks;
+ blocks = reshape.backup_blocks;
if (reshape.before.data_disks ==
reshape.after.data_disks) {
/* Make 'blocks' bigger for better throughput, but
goto release;
}
if (backup_file == NULL) {
- if (reshape.after.data_disks <= reshape.before.data_disks) {
+ if (reshape.after.data_disks <= reshape.before.data_disks) {
fprintf(stderr,
Name ": %s: Cannot grow - need backup-file\n",
devname);
*/
sync_metadata(st);
+ sra->new_chunk = info->new_chunk;
+
if (info->array.chunk_size == info->new_chunk &&
reshape.before.layout == reshape.after.layout &&
st->ss->external == 0) {
}
start_reshape(sra);
- if (st->ss->external) {
- /* metadata handler takes it from here */
- ping_manager(container);
- st->ss->manage_reshape(st, backup_file);
- frozen = 0;
- goto release;
- }
-
- /* set up the backup-super-block. This requires the
- * uuid from the array.
- */
- /* Find a superblock */
- for (sd = sra->devs; sd; sd = sd->next) {
- char *dn;
- int devfd;
- int ok;
- if (sd->disk.state & (1<<MD_DISK_FAULTY))
- continue;
- dn = map_dev(sd->disk.major, sd->disk.minor, 1);
- devfd = dev_open(dn, O_RDONLY);
- if (devfd < 0)
- continue;
- ok = st->ss->load_super(st, devfd, NULL);
- close(devfd);
- if (ok >= 0)
- break;
- }
- if (!sd) {
- fprintf(stderr, Name ": %s: Cannot find a superblock\n",
- devname);
- rv = 1;
- abort_reshape(sra);
- goto release;
- }
-
- memset(&bsb, 0, 512);
- memcpy(bsb.magic, "md_backup_data-1", 16);
- st->ss->uuid_from_super(st, (int*)&bsb.set_uuid);
- bsb.mtime = __cpu_to_le64(time(0));
- bsb.devstart2 = blocks;
-
- stripes = reshape.blocks / (info->array.chunk_size/512) /
- reshape.before.data_disks;
/* Now we just need to kick off the reshape and watch, while
* handling backups of the data...
odisks = reshape.before.data_disks + reshape.parity;
- done = child_monitor(fd, sra, &reshape, stripes,
- fdlist, offsets,
- d - odisks, fdlist+odisks, offsets+odisks);
+ if (st->ss->external) {
+ /* metadata handler takes it from here */
+ done = st->ss->manage_reshape(
+ fd, sra, &reshape, st, blocks,
+ fdlist, offsets,
+ d - odisks, fdlist+odisks,
+ offsets+odisks);
+ } else
+ done = child_monitor(
+ fd, sra, &reshape, st, blocks,
+ fdlist, offsets,
+ d - odisks, fdlist+odisks,
+ offsets+odisks);
if (backup_file && done)
unlink(backup_file);
}
}
- if (info->new_level != info->array.level) {
+ if (info->new_level != reshape.level) {
/* We need to wait for the reshape to finish
* (which will have happened unless
* odata < ndata) and then set the level
wait_reshape(sra);
c = map_num(pers, info->new_level);
- if (c == NULL) {
- if (forked)
- return 1;
- exit(0);/* not possible */
- }
+ if (c == NULL)
+ goto out;/* not possible */
err = sysfs_set_str(sra, NULL, "level", c);
if (err)
fprintf(stderr, Name\
- ": %s: could not set level"
+ ": %s: could not set level "
"to %s\n", devname, c);
}
out:
release:
- if (rv) {
- unfreeze(st, frozen);
- return rv;
- }
- if (container)
- ping_monitor(container);
- if (st->ss->external) {
- /* Re-load the metadata as much could have changed */
- int cfd = open_dev(st->container_dev);
- if (cfd >= 0) {
- st->ss->free_super(st);
- st->ss->load_container(st, cfd, container);
- close(cfd);
+ if (!rv) {
+ if (container)
+ ping_monitor(container);
+ if (st->ss->external) {
+ /* Re-load the metadata as much could have changed */
+ int cfd = open_dev(st->container_dev);
+ if (cfd >= 0) {
+ st->ss->free_super(st);
+ st->ss->load_container(st, cfd, container);
+ close(cfd);
+ }
}
}
if (rv && orig_level != UnSet && sra) {
* - suspend_point is maintained by progress_reshape and the caller
* should not touch it except to initialise to zero.
* It is an array address and it only increases in 2.6.37 and earlier.
- * This makes it difficulty to handle reducing reshapes with
+ * This makes it difficult to handle reducing reshapes with
* external metadata.
* However: it is similar to backup_point in that it records the
* other end of a suspended region from reshape_progress.
int advancing = (reshape->after.data_disks
>= reshape->before.data_disks);
- int need_backup = (reshape->after.data_disks
- == reshape->before.data_disks);
+ unsigned long long need_backup; /* need to eventually backup all the way
+ * to here
+ */
unsigned long long read_offset, write_offset;
- unsigned long long read_range, write_range;
+ unsigned long long write_range;
unsigned long long max_progress, target, completed;
+ unsigned long long array_size = (info->component_size
+ * reshape->before.data_disks);
int fd;
/* First, we unsuspend any region that is now known to be safe.
* native metadata when we don't need to back-up.
*/
if (advancing) {
- if (info->reshape_progress < *suspend_point)
+ if (info->reshape_progress <= *suspend_point)
sysfs_set_num(info, NULL, "suspend_lo",
info->reshape_progress);
} else {
/* Note: this won't work in 2.6.37 and before.
* Something somewhere should make sure we don't need it!
*/
- if (info->reshape_progress > *suspend_point)
+ if (info->reshape_progress >= *suspend_point)
sysfs_set_num(info, NULL, "suspend_hi",
info->reshape_progress);
}
* If we need to suspend more, we limit it to 128M per device, which is
* rather arbitrary and should be some time-based calculation.
*/
- write_offset = info->reshape_progress / reshape->before.data_disks;
- read_offset = info->reshape_progress / reshape->after.data_disks;
- write_range = reshape->blocks / reshape->before.data_disks;
- read_range = reshape->blocks / reshape->after.data_disks;
+ read_offset = info->reshape_progress / reshape->before.data_disks;
+ write_offset = info->reshape_progress / reshape->after.data_disks;
+ write_range = info->new_chunk/512;
if (advancing) {
+ need_backup = 0;
if (read_offset < write_offset + write_range) {
max_progress = backup_point;
- if (max_progress <= info->reshape_progress)
- need_backup = 1;
+ if (reshape->before.data_disks == reshape->after.data_disks)
+ need_backup = array_size;
+ else
+ need_backup = reshape->backup_blocks;
} else {
max_progress =
- (read_offset - write_range) *
- reshape->before.data_disks;
+ read_offset *
+ reshape->after.data_disks;
}
} else {
+ need_backup = array_size;
if (read_offset > write_offset - write_range) {
max_progress = backup_point;
if (max_progress >= info->reshape_progress)
- need_backup = 1;
+ need_backup = 0;
} else {
max_progress =
- (read_offset + write_range) *
- reshape->before.data_disks;
+ read_offset *
+ reshape->after.data_disks;
/* If we are using internal metadata, then we can
* progress all the way to the suspend_point without
* worrying about backing-up/suspending along the
* Consider extending suspend_point 128M per device if it
* is less than 64M per device beyond reshape_progress.
* But always do a multiple of 'blocks'
+ * FIXME this is too big - it takes too long to complete
+ * this much.
*/
target = 64*1024*2 * min(reshape->before.data_disks,
reshape->after.data_disks);
- target /= reshape->blocks;
+ target /= reshape->backup_blocks;
if (target < 2)
target = 2;
- target *= reshape->blocks;
+ target *= reshape->backup_blocks;
/* For externally managed metadata we always need to suspend IO to
* the area being reshaped so we regularly push suspend_point forward.
* a backup.
*/
if (advancing) {
- if ((need_backup || info->array.major_version < 0) &&
+ if ((need_backup > info->reshape_progress
+ || info->array.major_version < 0) &&
*suspend_point < info->reshape_progress + target) {
- if (max_progress < *suspend_point + 2 * target)
- *suspend_point = max_progress;
- else
+ if (need_backup < *suspend_point + 2 * target)
+ *suspend_point = need_backup;
+ else if (*suspend_point + 2 * target < array_size)
*suspend_point += 2 * target;
+ else
+ *suspend_point = array_size;
sysfs_set_num(info, NULL, "suspend_hi", *suspend_point);
- max_progress = *suspend_point;
+ if (max_progress > *suspend_point)
+ max_progress = *suspend_point;
}
} else {
- if ((need_backup || info->array.major_version < 0) &&
+ if ((need_backup < info->reshape_progress
+ || info->array.major_version < 0) &&
*suspend_point > info->reshape_progress - target) {
- if (max_progress > *suspend_point - 2 * target)
- *suspend_point = max_progress;
- else
+ if (need_backup > *suspend_point - 2 * target)
+ *suspend_point = need_backup;
+ else if (*suspend_point >= 2 * target)
*suspend_point -= 2 * target;
+ else
+ *suspend_point = 0;
sysfs_set_num(info, NULL, "suspend_lo", *suspend_point);
- max_progress = *suspend_point;
+ if (max_progress < *suspend_point)
+ max_progress = *suspend_point;
}
}
* At the same time we convert wait_point to a similar number
* for comparing against sync_completed.
*/
- if (!advancing) {
- max_progress = info->component_size * reshape->after.data_disks
- - max_progress;
- wait_point = info->component_size * reshape->after.data_disks
- - wait_point;
- }
+ /* scale down max_progress to per_disk */
max_progress /= reshape->after.data_disks;
+ /* Round to chunk size as some kernels give an erroneously high number */
+ max_progress /= info->new_chunk/512;
+ max_progress *= info->new_chunk/512;
+ /* Limit progress to the whole device */
+ if (max_progress > info->component_size)
+ max_progress = info->component_size;
wait_point /= reshape->after.data_disks;
+ if (!advancing) {
+ /* switch from 'device offset' to 'processed block count' */
+ max_progress = info->component_size - max_progress;
+ wait_point = info->component_size - wait_point;
+ }
sysfs_set_num(info, NULL, "sync_max", max_progress);
return -1;
}
}
+ /* some kernels can give an incorrectly high 'completed' number */
+ completed /= (info->new_chunk/512);
+ completed *= (info->new_chunk/512);
/* Convert 'completed' back in to a 'progress' number */
completed *= reshape->after.data_disks;
if (!advancing) {
close(fd);
/* We return the need_backup flag. Caller will decide
- * how much (a multiple of ->blocks) and will adjust
- * suspend_{lo,hi} and suspend_point.
+ * how much - a multiple of ->backup_blocks up to *suspend_point
*/
- return need_backup;
+ return advancing
+ ? (need_backup > info->reshape_progress)
+ : (need_backup < info->reshape_progress);
}
}
}
-static int child_monitor(int afd, struct mdinfo *sra, struct reshape *reshape,
- unsigned long stripes,
- int *fds, unsigned long long *offsets,
- int dests, int *destfd, unsigned long long *destoffsets)
+int child_monitor(int afd, struct mdinfo *sra, struct reshape *reshape,
+ struct supertype *st, unsigned long blocks,
+ int *fds, unsigned long long *offsets,
+ int dests, int *destfd, unsigned long long *destoffsets)
{
/* Monitor a reshape where backup is being performed using
* 'native' mechanism - either to a backup file, or
int data = reshape->before.data_disks;
int disks = reshape->before.data_disks + reshape->parity;
int chunk = sra->array.chunk_size;
+ struct mdinfo *sd;
+ unsigned long stripes;
+
+ /* set up the backup-super-block. This requires the
+ * uuid from the array.
+ */
+ /* Find a superblock */
+ for (sd = sra->devs; sd; sd = sd->next) {
+ char *dn;
+ int devfd;
+ int ok;
+ if (sd->disk.state & (1<<MD_DISK_FAULTY))
+ continue;
+ dn = map_dev(sd->disk.major, sd->disk.minor, 1);
+ devfd = dev_open(dn, O_RDONLY);
+ if (devfd < 0)
+ continue;
+ ok = st->ss->load_super(st, devfd, NULL);
+ close(devfd);
+ if (ok >= 0)
+ break;
+ }
+ if (!sd) {
+ fprintf(stderr, Name ": Cannot find a superblock\n");
+ return 0;
+ }
+
+ memset(&bsb, 0, 512);
+ memcpy(bsb.magic, "md_backup_data-1", 16);
+ st->ss->uuid_from_super(st, (int*)&bsb.set_uuid);
+ bsb.mtime = __cpu_to_le64(time(0));
+ bsb.devstart2 = blocks;
+
+ stripes = blocks / (sra->array.chunk_size/512) /
+ reshape->before.data_disks;
if (posix_memalign((void**)&buf, 4096, disks * chunk))
/* Don't start the 'reshape' */
sysfs_set_num(sra, NULL, "sync_speed_min", 200000);
}
- array_size = sra->component_size * data;
if (increasing) {
+ array_size = sra->component_size * reshape->after.data_disks;
backup_point = sra->reshape_progress;
suspend_point = 0;
} else {
+ array_size = sra->component_size * reshape->before.data_disks;
backup_point = array_size;
suspend_point = array_size;
}
/* Want to return as soon the oldest backup slot can
* be released as that allows us to start backing up
* some more, providing suspend_point has been
- * advanced, which it should have
+ * advanced, which it should have.
*/
if (increasing) {
wait_point = array_size;
rv = progress_reshape(sra, reshape,
backup_point, wait_point,
&suspend_point, &reshape_completed);
- if (rv < 0) {
- done = 1;
- break;
- }
-
/* external metadata would need to ping_monitor here */
sra->reshape_progress = reshape_completed;
destoffsets, 1);
}
- if (rv) {
+ if (rv < 0) {
+ done = 1;
+ break;
+ }
+
+ while (rv) {
unsigned long long offset;
- /* need to backup some space... */
+ unsigned long actual_stripes;
+ /* Need to backup some data.
+ * If 'part' is not used and the desired
+ * backup size is suspended, do a backup,
+ * then consider the next part.
+ */
/* Check that 'part' is unused */
if (part == 0 && __le64_to_cpu(bsb.length) != 0)
- abort(); /* BUG here */
+ break;
if (part == 1 && __le64_to_cpu(bsb.length2) != 0)
- abort();
+ break;
offset = backup_point / data;
- if (!increasing)
- offset -= stripes * (chunk/512);
- grow_backup(sra, offset, stripes,
+ actual_stripes = stripes;
+ if (increasing) {
+ if (offset + actual_stripes * (chunk/512) >
+ sra->component_size)
+ actual_stripes = ((sra->component_size - offset)
+ / (chunk/512));
+ if (offset + actual_stripes * (chunk/512) >
+ suspend_point/data)
+ break;
+ } else {
+ if (offset < actual_stripes * (chunk/512))
+ actual_stripes = offset / (chunk/512);
+ offset -= actual_stripes * (chunk/512);
+ if (offset < suspend_point/data)
+ break;
+ }
+ grow_backup(sra, offset, actual_stripes,
fds, offsets,
disks, chunk, level, layout,
dests, destfd, destoffsets,
/* record where 'part' is up to */
part = !part;
if (increasing)
- backup_point += stripes * (chunk/512) * data;
+ backup_point += actual_stripes * (chunk/512) * data;
else
- backup_point -= stripes * (chunk/512) * data;
+ backup_point -= actual_stripes * (chunk/512) * data;
}
}
+ /* FIXME maybe call progress_reshape one more time instead */
+ abort_reshape(sra); /* remove any remaining suspension */
if (reshape->before.data_disks == reshape->after.data_disks)
sysfs_set_num(sra, NULL, "sync_speed_min", speed);
free(buf);