From e9e43ec36756c50a5dabf6db52d9bebbccaaa72f Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 13 Aug 2009 11:12:54 +1000 Subject: [PATCH] Grow: support restart of new migrations. --- Assemble.c | 15 ++- Grow.c | 279 +++++++++++++++++++++++++++++++++++++++++++++++------ mdadm.h | 3 +- restripe.c | 3 +- super0.c | 2 +- 5 files changed, 270 insertions(+), 32 deletions(-) diff --git a/Assemble.c b/Assemble.c index e75c7e5e..3bde9cea 100644 --- a/Assemble.c +++ b/Assemble.c @@ -985,6 +985,8 @@ int Assemble(struct supertype *st, char *mddev, } if (err) { fprintf(stderr, Name ": Failed to restore critical section for reshape, sorry.\n"); + if (backup_file == NULL) + fprintf(stderr," Possibly you needed to specify the --backup-file\n"); close(mdfd); return err; } @@ -1093,7 +1095,18 @@ int Assemble(struct supertype *st, char *mddev, content->array.layout, clean, avail, okcnt) && (okcnt >= req_cnt || start_partial_ok) ))) { - if (ioctl(mdfd, RUN_ARRAY, NULL)==0) { + /* This array is good-to-go. + * If a reshape is in progress then we might need to + * continue monitoring it. In that case we start + * it read-only and let the grow code make it writable. + */ + int rv; + if (content->reshape_active && + content->delta_disks <= 0) + rv = Grow_continue(mdfd, st, content, backup_file); + else + rv = ioctl(mdfd, RUN_ARRAY, NULL); + if (rv == 0) { if (verbose >= 0) { fprintf(stderr, Name ": %s has been started with %d drive%s", mddev, okcnt, okcnt==1?"":"s"); diff --git a/Grow.c b/Grow.c index 6bc00b88..803f5eb8 100644 --- a/Grow.c +++ b/Grow.c @@ -36,6 +36,10 @@ #include "md_u.h" #include "md_p.h" +#ifndef offsetof +#define offsetof(t,f) ((size_t)&(((t*)0)->f)) +#endif + int Grow_Add_device(char *devname, int fd, char *newdev) { /* Add a device to an active array. 
@@ -424,6 +428,7 @@ static int child_shrink(int afd, struct mdinfo *sra, unsigned long blocks, int dests, int *destfd, unsigned long long *destoffsets); static int child_same_size(int afd, struct mdinfo *sra, unsigned long blocks, int *fds, unsigned long long *offsets, + unsigned long long start, int disks, int chunk, int level, int layout, int data, int dests, int *destfd, unsigned long long *destoffsets); @@ -1115,6 +1120,7 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file, else done = child_same_size(fd, sra, stripes, fdlist, offsets, + 0, odisks, ochunk, array.level, olayout, odata, d - odisks, fdlist+odisks, offsets+odisks); if (backup_file && done) @@ -1466,10 +1472,11 @@ static int child_shrink(int afd, struct mdinfo *sra, unsigned long stripes, static int child_same_size(int afd, struct mdinfo *sra, unsigned long stripes, int *fds, unsigned long long *offsets, + unsigned long long start, int disks, int chunk, int level, int layout, int data, int dests, int *destfd, unsigned long long *destoffsets) { - unsigned long long start, size; + unsigned long long size; unsigned long tailstripes = stripes; int part; char *buf; @@ -1484,19 +1491,19 @@ static int child_same_size(int afd, struct mdinfo *sra, unsigned long stripes, sysfs_get_ll(sra, NULL, "sync_speed_min", &speed); sysfs_set_num(sra, NULL, "sync_speed_min", 200000); - grow_backup(sra, 0, stripes, + grow_backup(sra, start, stripes, fds, offsets, disks, chunk, level, layout, dests, destfd, destoffsets, 0, buf); - grow_backup(sra, stripes * chunk/512, stripes, + grow_backup(sra, (start + stripes) * chunk/512, stripes, fds, offsets, disks, chunk, level, layout, dests, destfd, destoffsets, 1, buf); validate(afd, destfd[0], destoffsets[0]); part = 0; - start = stripes * 2; /* where to read next */ + start += stripes * 2; /* where to read next */ size = sra->component_size / (chunk/512); while (start < size) { if (wait_backup(sra, (start-stripes*2)*chunk/512, @@ -1545,19 +1552,26 @@ int 
Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt unsigned long long nstripe, ostripe, last_block; int ndata, odata; - if (info->delta_disks < 0) - return 1; /* cannot handle a shrink */ - if (info->new_level != info->array.level || - info->new_layout != info->array.layout || - info->new_chunk != info->array.chunk_size) - return 1; /* Can only handle change in disks */ + if (info->new_level != info->array.level) + return 1; /* Cannot handle level changes (they are instantaneous) */ + + odata = info->array.raid_disks - info->delta_disks - 1; + if (info->array.level == 6) odata--; /* number of data disks */ + ndata = info->array.raid_disks - 1; + if (info->new_level == 6) ndata--; old_disks = info->array.raid_disks - info->delta_disks; + if (info->delta_disks <= 0) + /* Didn't grow, so the backup file must have + * been used + */ + old_disks = cnt; for (i=old_disks-(backup_file?1:0); iuuid, 16) != 0) continue; /* Wrong uuid */ @@ -1598,18 +1618,46 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt info->array.utime < __le64_to_cpu(bsb.mtime)) continue; /* time stamp is too bad */ - if (__le64_to_cpu(bsb.arraystart) != 0) - continue; /* Can only handle backup from start of array */ - if (__le64_to_cpu(bsb.length) < - info->reshape_progress) - continue; /* No new data here */ - + if (bsb.magic[15] == '1') { + if (info->delta_disks >= 0) { + /* reshape_progress is increasing */ + if (__le64_to_cpu(bsb.arraystart) + __le64_to_cpu(bsb.length) < + info->reshape_progress) + continue; /* No new data here */ + } else { + /* reshape_progress is decreasing */ + if (__le64_to_cpu(bsb.arraystart) >= + info->reshape_progress) + continue; /* No new data here */ + } + } else { + if (info->delta_disks >= 0) { + /* reshape_progress is increasing */ + if (__le64_to_cpu(bsb.arraystart) + __le64_to_cpu(bsb.length) < + info->reshape_progress && + __le64_to_cpu(bsb.arraystart2) + __le64_to_cpu(bsb.length2) < + 
info->reshape_progress) + continue; /* No new data here */ + } else { + /* reshape_progress is decreasing */ + if (__le64_to_cpu(bsb.arraystart) >= + info->reshape_progress && + __le64_to_cpu(bsb.arraystart2) >= + info->reshape_progress) + continue; /* No new data here */ + } + } if (lseek64(fd, __le64_to_cpu(bsb.devstart)*512, 0)< 0) continue; /* Cannot seek */ /* There should be a duplicate backup superblock 4k before here */ if (lseek64(fd, -4096, 1) < 0 || - read(fd, buf, 4096) != 4096 || - memcmp(buf, &bsb, sizeof(bsb)) != 0) + read(fd, buf, 4096) != 4096) + continue; /* Cannot find leading superblock */ + if (bsb.magic[15] == '1') + bsbsize = offsetof(struct mdp_backup_super, pad1); + else + bsbsize = offsetof(struct mdp_backup_super, pad); + if (memcmp(buf, &bsb, bsbsize) != 0) continue; /* Cannot find leading superblock */ /* Now need the data offsets for all devices. */ @@ -1632,37 +1680,67 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt info->new_level, info->new_layout, fd, __le64_to_cpu(bsb.devstart)*512, - 0, __le64_to_cpu(bsb.length)*512)) { + __le64_to_cpu(bsb.arraystart), + __le64_to_cpu(bsb.length)*512)) { + /* didn't succeed, so giveup */ + return 1; + } + + if (bsb.magic[15] == '2' && + restore_stripes(fdlist, offsets, + info->array.raid_disks, + info->new_chunk, + info->new_level, + info->new_layout, + fd, __le64_to_cpu(bsb.devstart)*512 + + __le64_to_cpu(bsb.devstart2)*512, + __le64_to_cpu(bsb.arraystart2), + __le64_to_cpu(bsb.length2)*512)) { /* didn't succeed, so giveup */ return 1; } + /* Ok, so the data is restored. Let's update those superblocks. 
*/ + if (info->delta_disks >= 0) { + info->reshape_progress = __le64_to_cpu(bsb.arraystart) + + __le64_to_cpu(bsb.length); + if (bsb.magic[15] == '2') { + unsigned long long p2 = __le64_to_cpu(bsb.arraystart2) + + __le64_to_cpu(bsb.length2); + if (p2 > info->reshape_progress) + info->reshape_progress = p2; + } + } else { + info->reshape_progress = __le64_to_cpu(bsb.arraystart); + if (bsb.magic[15] == '2') { + unsigned long long p2 = __le64_to_cpu(bsb.arraystart2); + if (p2 < info->reshape_progress) + info->reshape_progress = p2; + } + } for (j=0; jarray.raid_disks; j++) { if (fdlist[j] < 0) continue; if (st->ss->load_super(st, fdlist[j], NULL)) continue; st->ss->getinfo_super(st, &dinfo); - dinfo.reshape_progress = __le64_to_cpu(bsb.length); + dinfo.reshape_progress = info->reshape_progress; st->ss->update_super(st, &dinfo, "_reshape_progress", NULL,0, 0, NULL); st->ss->store_super(st, fdlist[j]); st->ss->free_super(st); } - - /* And we are done! */ return 0; } /* Didn't find any backup data, try to see if any * was needed. */ + if (info->delta_disks == 0) + /* Always need backup data when size doesn't change */ + return 1; nstripe = ostripe = 0; - odata = info->array.raid_disks - info->delta_disks - 1; - if (info->array.level == 6) odata--; /* number of data disks */ - ndata = info->array.raid_disks - 1; - if (info->new_level == 6) ndata--; last_block = 0; while (nstripe >= ostripe) { nstripe += info->new_chunk / 512; @@ -1676,3 +1754,148 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt /* needed to recover critical section! */ return 1; } + +int Grow_continue(int mdfd, struct supertype *st, struct mdinfo *info, + char *backup_file) +{ + /* Array is assembled and ready to be started, but + * monitoring is probably required. 
+ * So: + * - start read-only + * - set upper bound for resync + * - initialise the 'suspend' boundaries + * - switch to read-write + * - fork and continue monitoring + */ /* Returns the sysfs_set_str() error if the array cannot be set readonly, 1 if fork() fails, and 0 otherwise (including when a component device cannot be opened). */ + int err; + int backup_list[1]; + unsigned long long backup_offsets[1]; + int odisks, ndisks, ochunk, nchunk,odata,ndata; + unsigned long a,b,blocks,stripes; + int backup_fd; + int *fds; + unsigned long long *offsets; + int d; + struct mdinfo *sra, *sd; + int rv; + int done = 0; + + err = sysfs_set_str(info, NULL, "array_state", "readonly"); + if (err) + return err; + + /* make sure reshape doesn't progress until we are ready */ + sysfs_set_str(info, NULL, "sync_max", "0"); + sysfs_set_str(info, NULL, "array_state", "active"); /* FIXME or clean */ + + /* ndisks is not growing, so raid_disks is old and +delta is new */ + odisks = info->array.raid_disks; + ndisks = odisks + info->delta_disks; + odata = odisks - 1; + ndata = ndisks - 1; + if (info->array.level == 6) { + odata--; + ndata--; + } + ochunk = info->array.chunk_size; + nchunk = info->new_chunk; + + + a = ochunk/512 * odata; + b = nchunk/512 * ndata; + /* Find GCD by repeated subtraction (would not terminate if a or b were 0) */ + while (a != b) { + if (a < b) + b -= a; + if (b < a) + a -= b; + } + /* LCM == product / GCD */ + blocks = ochunk/512 * nchunk/512 * odata * ndata / a; + + if (ndata == odata) + blocks *= 16; + stripes = blocks / (info->array.chunk_size/512) / odata; + + + memset(&bsb, 0, 512); + memcpy(bsb.magic, "md_backup_data-1", 16); + memcpy(&bsb.set_uuid, info->uuid, 16); + bsb.mtime = __cpu_to_le64(time(0)); + bsb.devstart2 = blocks; /* NOTE(review): stored without __cpu_to_le64(), unlike mtime above, while Grow_restart reads it via __le64_to_cpu() - confirm byte order */ + + backup_fd = open(backup_file, O_RDWR|O_CREAT, S_IRUSR | S_IWUSR); /* NOTE(review): the open() 
result is not checked before use, and backup_file is only tested for NULL further down - confirm callers never pass NULL */ + backup_list[0] = backup_fd; + backup_offsets[0] = 8 * 512; + fds = malloc(odisks * sizeof(fds[0])); + offsets = malloc(odisks * sizeof(offsets[0])); /* NOTE(review): the loop header below and the call it runs into look truncated (text between angle brackets appears to be missing); restore from the upstream commit. Also, neither malloc() result above is checked. */ + for (d=0; dsys_name), + GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE| + GET_CACHE); + + for (sd = sra->devs; sd; sd = sd->next) { + if (sd->disk.state & (1<disk.state & (1<disk.major, + sd->disk.minor, 1); + fds[sd->disk.raid_disk] + = 
dev_open(dn, O_RDONLY); + offsets[sd->disk.raid_disk] = sd->data_offset*512; + if (fds[sd->disk.raid_disk] < 0) { + fprintf(stderr, Name ": %s: cannot open component %s\n", + info->sys_name, dn?dn:"-unknown-"); + rv = 1; + goto release; /* NOTE(review): rv is never returned (the release label returns 0) and dn is not freed on this path */ + } + free(dn); + } + } + + switch(fork()) { + case 0: + close(mdfd); + mlockall(MCL_FUTURE); + if (info->delta_disks < 0) + done = child_shrink(-1, info, stripes, + fds, offsets, + info->array.raid_disks, + info->array.chunk_size, + info->array.level, info->array.layout, + odata, + 1, backup_list, backup_offsets); + else if (info->delta_disks == 0) { + /* The 'start' is a per-device stripe number. + * reshape_progress is a per-array sector number. + * So divide by ndata * chunk_size + */ + unsigned long long start = info->reshape_progress / ndata; + start /= (info->array.chunk_size/512); + done = child_same_size(-1, info, stripes, + fds, offsets, + start, + info->array.raid_disks, + info->array.chunk_size, + info->array.level, info->array.layout, + odata, + 1, backup_list, backup_offsets); + } + if (backup_file && done) + unlink(backup_file); + /* FIXME should I intuit a level change */ + exit(0); + case -1: + fprintf(stderr, Name ": Cannot run child to continue monitoring reshape: %s\n", + strerror(errno)); + return 1; + default: + break; + } +release: + return 0; +} + + diff --git a/mdadm.h b/mdadm.h index 181de07c..b0ff7dc6 100644 --- a/mdadm.h +++ b/mdadm.h @@ -722,7 +722,8 @@ extern int Grow_reshape(char *devname, int fd, int quiet, char *backup_file, int level, char *layout_str, int chunksize, int raid_disks); extern int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt, char *backup_file); - +extern int Grow_continue(int mdfd, struct supertype *st, + struct mdinfo *info, char *backup_file); extern int Assemble(struct supertype *st, char *mddev, mddev_ident_t ident, diff --git a/restripe.c b/restripe.c index 
38b44122..9a70cd09 100644 --- a/restripe.c +++ b/restripe.c @@ -519,13 +519,14 @@ int 
restore_stripes(int *dest, unsigned long long *offsets, int source, unsigned long long read_offset, unsigned long long start, unsigned long long length) { - char *stripe_buf = malloc(raid_disks * chunk_size); + char *stripe_buf; char **stripes = malloc(raid_disks * sizeof(char*)); char **blocks = malloc(raid_disks * sizeof(char*)); int i; int data_disks = raid_disks - (level == 0 ? 0 : level <= 5 ? 1 : 2); + posix_memalign((void**)&stripe_buf, 4096, raid_disks * chunk_size); /* NOTE(review): return value ignored; stripe_buf has no initializer, so it may be indeterminate if the allocation fails */ if (zero == NULL) { zero = malloc(chunk_size); if (zero) diff --git a/super0.c b/super0.c index 60c51454..71b03240 100644 --- a/super0.c +++ b/super0.c @@ -140,7 +140,7 @@ static void examine_super0(struct supertype *st, char *homehost) printf(" Reshape pos'n : %llu%s\n", (unsigned long long)sb->reshape_position/2, human_size((long long)sb->reshape_position<<9)); if (sb->delta_disks) { printf(" Delta Devices : %d", sb->delta_disks); - if (sb->delta_disks) + if (sb->delta_disks > 0) printf(" (%d->%d)\n", sb->raid_disks-sb->delta_disks, sb->raid_disks); else printf(" (%d->%d)\n", sb->raid_disks, sb->raid_disks+sb->delta_disks); -- 2.39.2