git.ipfire.org Git - thirdparty/mdadm.git/commitdiff
Grow: support restart of new migrations.
author NeilBrown <neilb@suse.de>
Thu, 13 Aug 2009 01:12:54 +0000 (11:12 +1000)
committer NeilBrown <neilb@suse.de>
Thu, 13 Aug 2009 01:12:54 +0000 (11:12 +1000)
Assemble.c
Grow.c
mdadm.h
restripe.c
super0.c

index e75c7e5eaf9c7615f062f6d61bcf20dfa9538772..3bde9cea9279def7acb6da31baf0ed26336e0c63 100644 (file)
@@ -985,6 +985,8 @@ int Assemble(struct supertype *st, char *mddev,
                }
                if (err) {
                        fprintf(stderr, Name ": Failed to restore critical section for reshape, sorry.\n");
+                       if (backup_file == NULL)
+                               fprintf(stderr,"      Possibly you needed to specify the --backup-file\n");
                        close(mdfd);
                        return err;
                }
@@ -1093,7 +1095,18 @@ int Assemble(struct supertype *st, char *mddev,
                              content->array.layout, clean, avail, okcnt) &&
                       (okcnt >= req_cnt || start_partial_ok)
                             ))) {
-                       if (ioctl(mdfd, RUN_ARRAY, NULL)==0) {
+                       /* This array is good-to-go.
+                        * If a reshape is in progress then we might need to
+                        * continue monitoring it.  In that case we start
+                        * it read-only and let the grow code make it writable.
+                        */
+                       int rv;
+                       if (content->reshape_active &&
+                           content->delta_disks <= 0)
+                               rv = Grow_continue(mdfd, st, content, backup_file);
+                       else
+                               rv = ioctl(mdfd, RUN_ARRAY, NULL);
+                       if (rv == 0) {
                                if (verbose >= 0) {
                                        fprintf(stderr, Name ": %s has been started with %d drive%s",
                                                mddev, okcnt, okcnt==1?"":"s");
diff --git a/Grow.c b/Grow.c
index 6bc00b8837439d7b2c141ce9dfca91b67ab7954a..803f5eb8d267e58dd2281b287fc9e6dd900b2b28 100644 (file)
--- a/Grow.c
+++ b/Grow.c
 #include       "md_u.h"
 #include       "md_p.h"
 
+#ifndef offsetof /* fallback: only defined when no earlier header already supplied offsetof */
+#define offsetof(t,f) ((size_t)&(((t*)0)->f)) /* byte offset of member f inside type t, taken from a null-based member address */
+#endif
+
 int Grow_Add_device(char *devname, int fd, char *newdev)
 {
        /* Add a device to an active array.
@@ -424,6 +428,7 @@ static int child_shrink(int afd, struct mdinfo *sra, unsigned long blocks,
                        int dests, int *destfd, unsigned long long *destoffsets);
 static int child_same_size(int afd, struct mdinfo *sra, unsigned long blocks,
                           int *fds, unsigned long long *offsets,
+                          unsigned long long start,
                           int disks, int chunk, int level, int layout, int data,
                           int dests, int *destfd, unsigned long long *destoffsets);
 
@@ -1115,6 +1120,7 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
                        else
                                done = child_same_size(fd, sra, stripes,
                                                       fdlist, offsets,
+                                                      0,
                                                       odisks, ochunk, array.level, olayout, odata,
                                                       d - odisks, fdlist+odisks, offsets+odisks);
                        if (backup_file && done)
@@ -1466,10 +1472,11 @@ static int child_shrink(int afd, struct mdinfo *sra, unsigned long stripes,
 
 static int child_same_size(int afd, struct mdinfo *sra, unsigned long stripes,
                           int *fds, unsigned long long *offsets,
+                          unsigned long long start,
                           int disks, int chunk, int level, int layout, int data,
                           int dests, int *destfd, unsigned long long *destoffsets)
 {
-       unsigned long long start, size;
+       unsigned long long size;
        unsigned long tailstripes = stripes;
        int part;
        char *buf;
@@ -1484,19 +1491,19 @@ static int child_same_size(int afd, struct mdinfo *sra, unsigned long stripes,
        sysfs_get_ll(sra, NULL, "sync_speed_min", &speed);
        sysfs_set_num(sra, NULL, "sync_speed_min", 200000);
 
-       grow_backup(sra, 0, stripes,
+       grow_backup(sra, start, stripes,
                    fds, offsets,
                    disks, chunk, level, layout,
                    dests, destfd, destoffsets,
                    0, buf);
-       grow_backup(sra, stripes * chunk/512, stripes,
+       grow_backup(sra, (start + stripes) * chunk/512, stripes,
                    fds, offsets,
                    disks, chunk, level, layout,
                    dests, destfd, destoffsets,
                    1, buf);
        validate(afd, destfd[0], destoffsets[0]);
        part = 0;
-       start = stripes * 2; /* where to read next */
+       start += stripes * 2; /* where to read next */
        size = sra->component_size / (chunk/512);
        while (start < size) {
                if (wait_backup(sra, (start-stripes*2)*chunk/512,
@@ -1545,19 +1552,26 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt
        unsigned long long  nstripe, ostripe, last_block;
        int ndata, odata;
 
-       if (info->delta_disks < 0)
-               return 1; /* cannot handle a shrink */
-       if (info->new_level != info->array.level ||
-           info->new_layout != info->array.layout ||
-           info->new_chunk != info->array.chunk_size)
-               return 1; /* Can only handle change in disks */
+       if (info->new_level != info->array.level)
+               return 1; /* Cannot handle level changes (they are instantaneous) */
+
+       odata = info->array.raid_disks - info->delta_disks - 1;
+       if (info->array.level == 6) odata--; /* number of data disks */
+       ndata = info->array.raid_disks - 1;
+       if (info->new_level == 6) ndata--;
 
        old_disks = info->array.raid_disks - info->delta_disks;
 
+       if (info->delta_disks <= 0)
+               /* Didn't grow, so the backup file must have
+                * been used
+                */
+               old_disks = cnt;
        for (i=old_disks-(backup_file?1:0); i<cnt; i++) {
                struct mdinfo dinfo;
                char buf[4096];
                int fd;
+               int bsbsize;
 
                /* This was a spare and may have some saved data on it.
                 * Load the superblock, find and load the
@@ -1568,8 +1582,11 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt
                 */
                if (i == old_disks-1) {
                        fd = open(backup_file, O_RDONLY);
-                       if (fd<0)
+                       if (fd<0) {
+                               fprintf(stderr, Name ": backup file %s inaccessible: %s\n",
+                                       backup_file, strerror(errno));
                                continue;
+                       }
                } else {
                        fd = fdlist[i];
                        if (fd < 0)
@@ -1587,10 +1604,13 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt
                }
                if (read(fd, &bsb, sizeof(bsb)) != sizeof(bsb))
                        continue; /* Cannot read */
-               if (memcmp(bsb.magic, "md_backup_data-1", 16) != 0)
+               if (memcmp(bsb.magic, "md_backup_data-1", 16) != 0 &&
+                   memcmp(bsb.magic, "md_backup_data-2", 16) != 0)
                        continue;
                if (bsb.sb_csum != bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum)-((char*)&bsb)))
                        continue; /* bad checksum */
+               /* Version-2 backup records carry a second checksum (sb_csum2);
+                * reject the record when it does not match.  Without its own
+                * statement this test would swallow the uuid check below.
+                */
+               if (memcmp(bsb.magic, "md_backup_data-2", 16) == 0 &&
+                   bsb.sb_csum2 != bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum2)-((char*)&bsb)))
+                       continue; /* bad second checksum */
                if (memcmp(bsb.set_uuid,info->uuid, 16) != 0)
                        continue; /* Wrong uuid */
 
@@ -1598,18 +1618,46 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt
                    info->array.utime < __le64_to_cpu(bsb.mtime))
                        continue; /* time stamp is too bad */
 
-               if (__le64_to_cpu(bsb.arraystart) != 0)
-                       continue; /* Can only handle backup from start of array */
-               if (__le64_to_cpu(bsb.length) <
-                   info->reshape_progress)
-                       continue; /* No new data here */
-
+               if (bsb.magic[15] == '1') {
+               if (info->delta_disks >= 0) {
+                       /* reshape_progress is increasing */
+                       if (__le64_to_cpu(bsb.arraystart) + __le64_to_cpu(bsb.length) <
+                           info->reshape_progress)
+                               continue; /* No new data here */
+               } else {
+                       /* reshape_progress is decreasing */
+                       if (__le64_to_cpu(bsb.arraystart) >=
+                           info->reshape_progress)
+                               continue; /* No new data here */
+               }
+               } else {
+               if (info->delta_disks >= 0) {
+                       /* reshape_progress is increasing */
+                       if (__le64_to_cpu(bsb.arraystart) + __le64_to_cpu(bsb.length) <
+                           info->reshape_progress &&
+                           __le64_to_cpu(bsb.arraystart2) + __le64_to_cpu(bsb.length2) <
+                           info->reshape_progress)
+                               continue; /* No new data here */
+               } else {
+                       /* reshape_progress is decreasing */
+                       if (__le64_to_cpu(bsb.arraystart) >=
+                           info->reshape_progress &&
+                           __le64_to_cpu(bsb.arraystart2) >=
+                           info->reshape_progress)
+                               continue; /* No new data here */
+               }
+               }
                if (lseek64(fd, __le64_to_cpu(bsb.devstart)*512, 0)< 0)
                        continue; /* Cannot seek */
                /* There should be a duplicate backup superblock 4k before here */
                if (lseek64(fd, -4096, 1) < 0 ||
-                   read(fd, buf, 4096) != 4096 ||
-                   memcmp(buf, &bsb, sizeof(bsb)) != 0)
+                   read(fd, buf, 4096) != 4096)
+                       continue; /* Cannot find leading superblock */
+               if (bsb.magic[15] == '1')
+                       bsbsize = offsetof(struct mdp_backup_super, pad1);
+               else
+                       bsbsize = offsetof(struct mdp_backup_super, pad);
+               if (memcmp(buf, &bsb, bsbsize) != 0)
                        continue; /* Cannot find leading superblock */
 
                /* Now need the data offsets for all devices. */
@@ -1632,37 +1680,67 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt
                                    info->new_level,
                                    info->new_layout,
                                    fd, __le64_to_cpu(bsb.devstart)*512,
-                                   0, __le64_to_cpu(bsb.length)*512)) {
+                                   __le64_to_cpu(bsb.arraystart),
+                                   __le64_to_cpu(bsb.length)*512)) {
+                       /* didn't succeed, so giveup */
+                       return 1;
+               }
+               
+               if (bsb.magic[15] == '2' &&
+                   restore_stripes(fdlist, offsets,
+                                   info->array.raid_disks,
+                                   info->new_chunk,
+                                   info->new_level,
+                                   info->new_layout,
+                                   fd, __le64_to_cpu(bsb.devstart)*512 +
+                                   __le64_to_cpu(bsb.devstart2)*512,
+                                   __le64_to_cpu(bsb.arraystart2),
+                                   __le64_to_cpu(bsb.length2)*512)) {
                        /* didn't succeed, so giveup */
                        return 1;
                }
 
+
                /* Ok, so the data is restored. Let's update those superblocks. */
 
+               if (info->delta_disks >= 0) {
+                       info->reshape_progress = __le64_to_cpu(bsb.arraystart) +
+                               __le64_to_cpu(bsb.length);
+                       if (bsb.magic[15] == '2') {
+                               unsigned long long p2 = __le64_to_cpu(bsb.arraystart2) +
+                                       __le64_to_cpu(bsb.length2);
+                               if (p2 > info->reshape_progress)
+                                       info->reshape_progress = p2;
+                       }
+               } else {
+                       info->reshape_progress = __le64_to_cpu(bsb.arraystart);
+                       if (bsb.magic[15] == '2') {
+                               unsigned long long p2 = __le64_to_cpu(bsb.arraystart2);
+                               if (p2 < info->reshape_progress)
+                                       info->reshape_progress = p2;
+                       }
+               }
                for (j=0; j<info->array.raid_disks; j++) {
                        if (fdlist[j] < 0) continue;
                        if (st->ss->load_super(st, fdlist[j], NULL))
                                continue;
                        st->ss->getinfo_super(st, &dinfo);
-                       dinfo.reshape_progress = __le64_to_cpu(bsb.length);
+                       dinfo.reshape_progress = info->reshape_progress;
                        st->ss->update_super(st, &dinfo,
                                             "_reshape_progress",
                                             NULL,0, 0, NULL);
                        st->ss->store_super(st, fdlist[j]);
                        st->ss->free_super(st);
                }
-
-               /* And we are done! */
                return 0;
        }
        /* Didn't find any backup data, try to see if any
         * was needed.
         */
+       if (info->delta_disks == 0)
+               /* Always need backup data when size doesn't change */
+               return 1;
        nstripe = ostripe = 0;
-       odata = info->array.raid_disks - info->delta_disks - 1;
-       if (info->array.level == 6) odata--; /* number of data disks */
-       ndata = info->array.raid_disks - 1;
-       if (info->new_level == 6) ndata--;
        last_block = 0;
        while (nstripe >= ostripe) {
                nstripe += info->new_chunk / 512;
@@ -1676,3 +1754,148 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt
        /* needed to recover critical section! */
        return 1;
 }
+
+/*
+ * Grow_continue - resume monitoring of a reshape on a freshly assembled array.
+ *
+ * mdfd:        open descriptor on the md device; closed in the forked child.
+ * st:          superblock handler (not referenced by this function).
+ * info:        array description; supplies geometry, delta_disks,
+ *              reshape_progress and the sysfs name.
+ * backup_file: path of the backup file; it is opened (created if absent)
+ *              and handed to the child, which unlinks it once its worker
+ *              reports completion.
+ *
+ * Returns 0 once the monitoring child has been forked, the error from
+ * sysfs_set_str() if the array cannot be switched to "readonly", and 1
+ * for any other failure.
+ */
+int Grow_continue(int mdfd, struct supertype *st, struct mdinfo *info,
+                 char *backup_file)
+{
+       /* Array is assembled and ready to be started, but
+        * monitoring is probably required.
+        * So:
+        *   - start read-only
+        *   - set upper bound for resync
+        *   - initialise the 'suspend' boundaries
+        *   - switch to read-write
+        *   - fork and continue monitoring
+        */
+       int err;
+       int backup_list[1];
+       unsigned long long backup_offsets[1];
+       int odisks, ndisks, ochunk, nchunk,odata,ndata;
+       unsigned long a,b,blocks,stripes;
+       int backup_fd;
+       int *fds;
+       unsigned long long *offsets;
+       int d;
+       struct mdinfo *sra, *sd;
+       int rv = 0;
+       int done = 0;
+
+       err = sysfs_set_str(info, NULL, "array_state", "readonly");
+       if (err)
+               return err;
+
+       /* make sure reshape doesn't progress until we are ready */
+       sysfs_set_str(info, NULL, "sync_max", "0");
+       sysfs_set_str(info, NULL, "array_state", "active"); /* FIXME or clean */
+
+       /* ndisks is not growing, so raid_disks is old and +delta is new */
+       odisks = info->array.raid_disks;
+       ndisks = odisks + info->delta_disks;
+       odata = odisks - 1;
+       ndata = ndisks - 1;
+       if (info->array.level == 6) {
+               odata--;
+               ndata--;
+       }
+       ochunk = info->array.chunk_size;
+       nchunk = info->new_chunk;
+
+       /* Size of one old stripe and one new stripe, in sectors */
+       a = ochunk/512 * odata;
+       b = nchunk/512 * ndata;
+       /* Find GCD */
+       while (a != b) {
+               if (a < b)
+                       b -= a;
+               if (b < a)
+                       a -= b;
+       }
+       /* LCM == product / GCD */
+       blocks = ochunk/512 * nchunk/512 * odata * ndata / a;
+
+       if (ndata == odata)
+               blocks *= 16;
+       stripes = blocks / (info->array.chunk_size/512) / odata;
+
+       memset(&bsb, 0, 512);
+       memcpy(bsb.magic, "md_backup_data-1", 16);
+       memcpy(&bsb.set_uuid, info->uuid, 16);
+       bsb.mtime = __cpu_to_le64(time(0));
+       bsb.devstart2 = blocks;
+
+       /* NOTE(review): a failed open (e.g. backup_file == NULL) is still
+        * passed on to the child as descriptor -1, as before - confirm
+        * whether this should be a hard error.
+        */
+       backup_fd = open(backup_file, O_RDWR|O_CREAT, S_IRUSR | S_IWUSR);
+       backup_list[0] = backup_fd;
+       backup_offsets[0] = 8 * 512;
+       fds = malloc(odisks * sizeof(fds[0]));
+       offsets = malloc(odisks * sizeof(offsets[0]));
+       if (fds == NULL || offsets == NULL) {
+               fprintf(stderr, Name ": %s: out of memory\n", info->sys_name);
+               free(fds);
+               free(offsets);
+               if (backup_fd >= 0)
+                       close(backup_fd);
+               return 1;
+       }
+       for (d=0; d<odisks; d++)
+               fds[d] = -1;
+
+       sra = sysfs_read(-1, devname2devnum(info->sys_name),
+                        GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE|
+                        GET_CACHE);
+       if (sra == NULL) {
+               fprintf(stderr, Name ": %s: cannot read array details from sysfs\n",
+                       info->sys_name);
+               rv = 1;
+               goto release;
+       }
+
+       /* Open every in-sync, non-faulty component read-only and record
+        * where its data starts (in bytes).
+        */
+       for (sd = sra->devs; sd; sd = sd->next) {
+               if (sd->disk.state & (1<<MD_DISK_FAULTY))
+                       continue;
+               if (sd->disk.state & (1<<MD_DISK_SYNC)) {
+                       char *dn = map_dev(sd->disk.major,
+                                          sd->disk.minor, 1);
+                       fds[sd->disk.raid_disk]
+                               = dev_open(dn, O_RDONLY);
+                       offsets[sd->disk.raid_disk] = sd->data_offset*512;
+                       if (fds[sd->disk.raid_disk] < 0) {
+                               fprintf(stderr, Name ": %s: cannot open component %s\n",
+                                       info->sys_name, dn?dn:"-unknown-");
+                               free(dn);
+                               rv = 1;
+                               goto release;
+                       }
+                       free(dn);
+               }
+       }
+
+       switch(fork()) {
+       case 0:
+               close(mdfd);
+               mlockall(MCL_FUTURE);
+               if (info->delta_disks < 0)
+                       done = child_shrink(-1, info, stripes,
+                                           fds, offsets,
+                                           info->array.raid_disks,
+                                           info->array.chunk_size,
+                                           info->array.level, info->array.layout,
+                                           odata,
+                                           1, backup_list, backup_offsets);
+               else if (info->delta_disks == 0) {
+                       /* The 'start' is a per-device stripe number.
+                        * reshape_progress is a per-array sector number.
+                        * So divide by ndata * chunk_size
+                        */
+                       unsigned long long start = info->reshape_progress / ndata;
+                       start /= (info->array.chunk_size/512);
+                       done = child_same_size(-1, info, stripes,
+                                              fds, offsets,
+                                              start,
+                                              info->array.raid_disks,
+                                              info->array.chunk_size,
+                                              info->array.level, info->array.layout,
+                                              odata,
+                                              1, backup_list, backup_offsets);
+               }
+               if (backup_file && done)
+                       unlink(backup_file);
+               /* FIXME should I intuit a level change */
+               exit(0);
+       case -1:
+               fprintf(stderr, Name ": Cannot run child to continue monitoring reshape: %s\n",
+                       strerror(errno));
+               rv = 1;
+               break;
+       default:
+               break;
+       }
+release:
+       /* A forked child holds its own copies of these descriptors, so
+        * the parent can drop everything it opened or allocated here.
+        * 'sra' is left alone: no release helper for it is visible here.
+        */
+       for (d = 0; d < odisks; d++) {
+               if (fds[d] >= 0)
+                       close(fds[d]);
+       }
+       free(fds);
+       free(offsets);
+       if (backup_fd >= 0)
+               close(backup_fd);
+       /* Report failures to the caller instead of always claiming success */
+       return rv;
+}
+
+
+
+
diff --git a/mdadm.h b/mdadm.h
index 181de07cf64e8cf7104cfc745788a447b362d394..b0ff7dc6916cf28aae59aa737d1394d974b2bf7c 100644 (file)
--- a/mdadm.h
+++ b/mdadm.h
@@ -722,7 +722,8 @@ extern int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
                        int level, char *layout_str, int chunksize, int raid_disks);
 extern int Grow_restart(struct supertype *st, struct mdinfo *info,
                        int *fdlist, int cnt, char *backup_file);
-
+extern int Grow_continue(int mdfd, struct supertype *st,
+                        struct mdinfo *info, char *backup_file);
 
 extern int Assemble(struct supertype *st, char *mddev,
                    mddev_ident_t ident,
index 38b44122066cd9c62a66d04b50157f3ff0d89557..9a70cd092084e5d039fc979c823628a6cf077df7 100644 (file)
@@ -519,13 +519,14 @@ int restore_stripes(int *dest, unsigned long long *offsets,
                    int source, unsigned long long read_offset,
                    unsigned long long start, unsigned long long length)
 {
-       char *stripe_buf = malloc(raid_disks * chunk_size);
+       char *stripe_buf;
        char **stripes = malloc(raid_disks * sizeof(char*));
        char **blocks = malloc(raid_disks * sizeof(char*));
        int i;
 
        int data_disks = raid_disks - (level == 0 ? 0 : level <= 5 ? 1 : 2);
 
+       posix_memalign((void**)&stripe_buf, 4096, raid_disks * chunk_size);
        if (zero == NULL) {
                zero = malloc(chunk_size);
                if (zero)
index 60c51454bf80ae03a756491d19b6270ccaa229ef..71b03240458bbfa360564a49548a502eb37351db 100644 (file)
--- a/super0.c
+++ b/super0.c
@@ -140,7 +140,7 @@ static void examine_super0(struct supertype *st, char *homehost)
                printf("  Reshape pos'n : %llu%s\n", (unsigned long long)sb->reshape_position/2, human_size((long long)sb->reshape_position<<9));
                if (sb->delta_disks) {
                        printf("  Delta Devices : %d", sb->delta_disks);
-                       if (sb->delta_disks)
+                       if (sb->delta_disks > 0)
                                printf(" (%d->%d)\n", sb->raid_disks-sb->delta_disks, sb->raid_disks);
                        else
                                printf(" (%d->%d)\n", sb->raid_disks, sb->raid_disks+sb->delta_disks);