git.ipfire.org Git - thirdparty/mdadm.git/blobdiff - Grow.c
Create: cleanup after failed create in duplicated array member case
[thirdparty/mdadm.git] / Grow.c
diff --git a/Grow.c b/Grow.c
index 6bc00b8837439d7b2c141ce9dfca91b67ab7954a..053a372ac3b526f7a5b5cccce6955f02735914ea 100644 (file)
--- a/Grow.c
+++ b/Grow.c
@@ -1,7 +1,7 @@
 /*
  * mdadm - manage Linux "md" devices aka RAID arrays.
  *
- * Copyright (C) 2001-2006 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
  *
  *
  *    This program is free software; you can redistribute it and/or modify
  *    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  *
  *    Author: Neil Brown
- *    Email: <neilb@cse.unsw.edu.au>
- *    Paper: Neil Brown
- *           School of Computer Science and Engineering
- *           The University of New South Wales
- *           Sydney, 2052
- *           Australia
+ *    Email: <neilb@suse.de>
  */
 #include       "mdadm.h"
 #include       "dlink.h"
 #include       "md_u.h"
 #include       "md_p.h"
 
+#ifndef offsetof
+#define offsetof(t,f) ((size_t)&(((t*)0)->f))
+#endif
+
 int Grow_Add_device(char *devname, int fd, char *newdev)
 {
        /* Add a device to an active array.
@@ -289,6 +288,11 @@ int Grow_addbitmap(char *devname, int fd, char *file, int chunk, int delay, int
                return 1;
        } else if (strcmp(file, "internal") == 0) {
                int d;
+               if (st->ss->add_internal_bitmap == NULL) {
+                       fprintf(stderr, Name ": Internal bitmaps not supported "
+                               "with %s metadata\n", st->ss->name);
+                       return 1;
+               }
                for (d=0; d< st->max_devs; d++) {
                        mdu_disk_info_t disk;
                        char *dv;
@@ -382,13 +386,13 @@ int Grow_addbitmap(char *devname, int fd, char *file, int chunk, int delay, int
 /*
  * When reshaping an array we might need to backup some data.
  * This is written to all spares with a 'super_block' describing it.
- * The superblock goes 1K form the end of the used space on the
+ * The superblock goes 4K from the end of the used space on the
  * device.
  * It if written after the backup is complete.
  * It has the following structure.
  */
 
-struct mdp_backup_super {
+static struct mdp_backup_super {
        char    magic[16];  /* md_backup_data-1 or -2 */
        __u8    set_uuid[16];
        __u64   mtime;
@@ -403,7 +407,7 @@ struct mdp_backup_super {
        __u64   length2;
        __u32   sb_csum2;       /* csum of preceeding bytes. */
        __u8 pad[512-68-32];
-} __attribute__((aligned(512))) bsb;
+} __attribute__((aligned(512))) bsb, bsb2;
 
 int bsb_csum(char *buf, int len)
 {
@@ -424,6 +428,7 @@ static int child_shrink(int afd, struct mdinfo *sra, unsigned long blocks,
                        int dests, int *destfd, unsigned long long *destoffsets);
 static int child_same_size(int afd, struct mdinfo *sra, unsigned long blocks,
                           int *fds, unsigned long long *offsets,
+                          unsigned long long start,
                           int disks, int chunk, int level, int layout, int data,
                           int dests, int *destfd, unsigned long long *destoffsets);
 
@@ -523,6 +528,23 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
                        devname);
                return 1;
        }
+
+       if (size >= 0 &&
+           (chunksize || level!= UnSet || layout_str || raid_disks)) {
+               fprintf(stderr, Name ": cannot change component size at the same time "
+                       "as other changes.\n"
+                       "   Change size first, then check data is intact before "
+                       "making other changes.\n");
+               return 1;
+       }
+
+       if (raid_disks && raid_disks < array.raid_disks && array.level > 1 &&
+           get_linux_version() < 2006032 &&
+           !check_env("MDADM_FORCE_FEWER")) {
+               fprintf(stderr, Name ": reducing the number of devices is not safe before Linux 2.6.32\n"
+                       "       Please use a newer kernel\n");
+               return 1;
+       }
        sra = sysfs_read(fd, 0, GET_LEVEL);
        frozen = freeze_array(sra);
        if (frozen < 0) {
@@ -546,16 +568,27 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
                } else
                        rv = ioctl(fd, SET_ARRAY_INFO, &array);
                if (rv != 0) {
+                       int err = errno;
                        fprintf(stderr, Name ": Cannot set device size for %s: %s\n",
-                               devname, strerror(errno));
+                               devname, strerror(err));
+                       if (err == EBUSY && 
+                           (array.state & (1<<MD_SB_BITMAP_PRESENT)))
+                               fprintf(stderr, "       Bitmap must be removed before size can be changed\n");
                        rv = 1;
                        goto release;
                }
                ioctl(fd, GET_ARRAY_INFO, &array);
+               size = get_component_size(fd)/2;
+               if (size == 0)
+                       size = array.size;
                if (!quiet)
-                       fprintf(stderr, Name ": component size of %s has been set to %dK\n",
-                               devname, array.size);
+                       fprintf(stderr, Name ": component size of %s has been set to %lluK\n",
+                               devname, size);
                changed = 1;
+       } else {
+               size = get_component_size(fd)/2;
+               if (size == 0)
+                       size = array.size;
        }
 
        /* ======= set level =========== */
@@ -627,7 +660,7 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
                                        }
                                }
                                if (raid_disks)
-                                       /* The find raid6->raid5 conversion
+                                       /* The final raid6->raid5 conversion
                                         * will reduce the number of disks,
                                         * so now we need to aim higher
                                         */
@@ -636,12 +669,18 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
                                layout_str = "parity-last";
                } else {
                        c = map_num(pers, level);
-                       if (c == NULL)
-                               return 1;/* not possible */
+                       if (c == NULL) {
+                               rv = 1;/* not possible */
+                               goto release;
+                       }
                        err = sysfs_set_str(sra, NULL, "level", c);
                        if (err) {
+                               err = errno;
                                fprintf(stderr, Name ": %s: could not set level to %s\n",
                                        devname, c);
+                               if (err == EBUSY && 
+                                   (array.state & (1<<MD_SB_BITMAP_PRESENT)))
+                                       fprintf(stderr, "       Bitmap must be removed before level can be changed\n");
                                rv = 1;
                                goto release;
                        }
@@ -707,9 +746,14 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
                        c = map_num(pers, level);
                        if (c) {
                                rv = sysfs_set_str(sra, NULL, "level", c);
-                               if (rv)
+                               if (rv) {
+                                       int err = errno;
                                        fprintf(stderr, Name ": %s: could not set level to %s\n",
                                                devname, c);
+                                       if (err == EBUSY && 
+                                           (array.state & (1<<MD_SB_BITMAP_PRESENT)))
+                                               fprintf(stderr, "       Bitmap must be removed before level can be changed\n");
+                               }
                        }
                } else if (!changed && !quiet)
                        fprintf(stderr, Name ": %s: no change requested\n",
@@ -825,10 +869,10 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
 
                if (chunksize) {
                        nchunk = chunksize * 1024;
-                       if (array.size % chunksize) {
-                               fprintf(stderr, Name ": component size %dK is not"
+                       if (size % chunksize) {
+                               fprintf(stderr, Name ": component size %lluK is not"
                                        " a multiple of chunksize %dK\n",
-                                       array.size, chunksize);
+                                       size, chunksize);
                                break;
                        }
                }
@@ -841,7 +885,8 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
                                if (nlayout == UnSet) {
                                        fprintf(stderr, Name ": layout %s not understood for raid5.\n",
                                                layout_str);
-                                       return 1;
+                                       rv = 1;
+                                       goto release;
                                }
                                break;
 
@@ -850,7 +895,8 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
                                if (nlayout == UnSet) {
                                        fprintf(stderr, Name ": layout %s not understood for raid6.\n",
                                                layout_str);
-                                       return 1;
+                                       rv = 1;
+                                       goto release;
                                }
                                break;
                        }
@@ -863,14 +909,19 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
                        ndata--;
                }
 
+               if (odata == ndata &&
+                   get_linux_version() < 2006032) {
+                       fprintf(stderr, Name ": in-place reshape is not safe before 2.6.32, sorry.\n");
+                       break;
+               }
+
                /* Check that we can hold all the data */
-               size = ndata * array.size;
                get_dev_size(fd, NULL, &array_size);
-               if (size < (array_size/1024)) {
+               if (ndata * size < (array_size/1024)) {
                        fprintf(stderr, Name ": this change will reduce the size of the array.\n"
                                "       use --grow --array-size first to truncate array.\n"
                                "       e.g. mdadm --grow %s --array-size %llu\n",
-                               devname, size);
+                               devname, ndata * size);
                        rv = 1;
                        break;
                }
@@ -892,16 +943,23 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
                /* LCM == product / GCD */
                blocks = ochunk/512 * nchunk/512 * odata * ndata / a;
 
-               if (ndata == odata)
-                       blocks *= 16;
-               else
-                       fprintf(stderr, Name ": Need to backup %luK of critical "
-                               "section..\n", blocks/2);
-
                sysfs_free(sra);
                sra = sysfs_read(fd, 0,
                                 GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE|
                                 GET_CACHE);
+
+               if (ndata == odata) {
+                       /* Make 'blocks' bigger for better throughput, but
+                        * not so big that we reject it below.
+                        * Try for 16 megabytes
+                        */
+                       while (blocks * 32 < sra->component_size &&
+                              blocks < 16*1024*2)
+                              blocks *= 2;
+               } else
+                       fprintf(stderr, Name ": Need to backup %luK of critical "
+                               "section..\n", blocks/2);
+
                if (!sra) {
                        fprintf(stderr, Name ": %s: Cannot get array details from sysfs\n",
                                devname);
@@ -948,7 +1006,7 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
                                char *dn = map_dev(sd->disk.major,
                                                   sd->disk.minor, 1);
                                fdlist[d] = dev_open(dn, O_RDWR);
-                               offsets[d] = (sra->component_size - blocks - 8)*512;
+                               offsets[d] = (sd->data_offset + sra->component_size - blocks - 8)*512;
                                if (fdlist[d]<0) {
                                        fprintf(stderr, Name ": %s: cannot open component %s\n",
                                                devname, dn?dn:"-unknown");
@@ -1013,6 +1071,9 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
                
                cache = (nchunk < ochunk) ? ochunk : nchunk;
                cache = cache * 4 / 4096;
+               if (cache < blocks / 8 / odisks + 16)
+                       /* Make it big enough to hold 'blocks' */
+                       cache = blocks / 8 / odisks + 16;
                if (sra->cache_size < cache)
                        sysfs_set_num(sra, NULL, "stripe_cache_size",
                                      cache+1);
@@ -1023,12 +1084,16 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
                if (ochunk == nchunk && olayout == nlayout) {
                        array.raid_disks = ndisks;
                        if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) {
+                               int err = errno;
                                rv = 1;
                                fprintf(stderr, Name ": Cannot set device shape for %s: %s\n",
                                        devname, strerror(errno));
                                if (ndisks < odisks &&
                                    get_linux_version() < 2006030)
                                        fprintf(stderr, Name ": linux 2.6.30 or later required\n");
+                               if (err == EBUSY && 
+                                   (array.state & (1<<MD_SB_BITMAP_PRESENT)))
+                                       fprintf(stderr, "       Bitmap must be removed before shape can be changed\n");
 
                                break;
                        }
@@ -1036,17 +1101,24 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
                        /* set them all just in case some old 'new_*' value
                         * persists from some earlier problem
                         */
+                       int err = err; /* only used if rv==1, and always set if
+                                       * rv==1, so initialisation not needed,
+                                       * despite gcc warning
+                                       */
                        if (sysfs_set_num(sra, NULL, "chunk_size", nchunk) < 0)
-                               rv = 1;
-                       if (sysfs_set_num(sra, NULL, "layout", nlayout) < 0)
-                               rv = 1;
-                       if (sysfs_set_num(sra, NULL, "raid_disks", ndisks) < 0)
-                               rv = 1;
+                               rv = 1, err = errno;
+                       if (!rv && sysfs_set_num(sra, NULL, "layout", nlayout) < 0)
+                               rv = 1, err = errno;
+                       if (!rv && sysfs_set_num(sra, NULL, "raid_disks", ndisks) < 0)
+                               rv = 1, err = errno;
                        if (rv) {
                                fprintf(stderr, Name ": Cannot set device shape for %s\n",
                                        devname);
                                if (get_linux_version() < 2006030)
                                        fprintf(stderr, Name ": linux 2.6.30 or later required\n");
+                               if (err == EBUSY && 
+                                   (array.state & (1<<MD_SB_BITMAP_PRESENT)))
+                                       fprintf(stderr, "       Bitmap must be removed before shape can be changed\n");
                                break;
                        }
                }
@@ -1115,6 +1187,7 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
                        else
                                done = child_same_size(fd, sra, stripes,
                                                       fdlist, offsets,
+                                                      0,
                                                       odisks, ochunk, array.level, olayout, odata,
                                                       d - odisks, fdlist+odisks, offsets+odisks);
                        if (backup_file && done)
@@ -1186,13 +1259,14 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
  * 
  */
 
+/* FIXME return status is never checked */
 int grow_backup(struct mdinfo *sra,
                unsigned long long offset, /* per device */
                unsigned long stripes, /* per device */
                int *sources, unsigned long long *offsets,
                int disks, int chunk, int level, int layout,
                int dests, int *destfd, unsigned long long *destoffsets,
-               int part,
+               int part, int *degraded,
                char *buf)
 {
        /* Backup 'blocks' sectors at 'offset' on each device of the array,
@@ -1204,12 +1278,38 @@ int grow_backup(struct mdinfo *sra,
        int odata = disks;
        int rv = 0;
        int i;
+       unsigned long long new_degraded;
        //printf("offset %llu\n", offset);
        if (level >= 4)
                odata--;
        if (level == 6)
                odata--;
        sysfs_set_num(sra, NULL, "suspend_hi", (offset + stripes * chunk/512) * odata);
+       /* Check that array hasn't become degraded, else we might backup the wrong data */
+       sysfs_get_ll(sra, NULL, "degraded", &new_degraded);
+       if (new_degraded != *degraded) {
+               /* check each device to ensure it is still working */
+               struct mdinfo *sd;
+               for (sd = sra->devs ; sd ; sd = sd->next) {
+                       if (sd->disk.state & (1<<MD_DISK_FAULTY))
+                               continue;
+                       if (sd->disk.state & (1<<MD_DISK_SYNC)) {
+                               char sbuf[20];
+                               if (sysfs_get_str(sra, sd, "state", sbuf, 20) < 0 ||
+                                   strstr(sbuf, "faulty") ||
+                                   strstr(sbuf, "in_sync") == NULL) {
+                                       /* this device is dead */
+                                       sd->disk.state = (1<<MD_DISK_FAULTY);
+                                       if (sd->disk.raid_disk >= 0 &&
+                                           sources[sd->disk.raid_disk] >= 0) {
+                                               close(sources[sd->disk.raid_disk]);
+                                               sources[sd->disk.raid_disk] = -1;
+                                       }
+                               }
+                       }
+               }
+               *degraded = new_degraded;
+       }
        if (part) {
                bsb.arraystart2 = __cpu_to_le64(offset * odata);
                bsb.length2 = __cpu_to_le64(stripes * chunk/512 * odata);
@@ -1233,6 +1333,7 @@ int grow_backup(struct mdinfo *sra,
 
        if (rv)
                return rv;
+       bsb.mtime = __cpu_to_le64(time(0));
        for (i = 0; i < dests; i++) {
                bsb.devstart = __cpu_to_le64(destoffsets[i]/512);
 
@@ -1241,12 +1342,19 @@ int grow_backup(struct mdinfo *sra,
                        bsb.sb_csum2 = bsb_csum((char*)&bsb,
                                                ((char*)&bsb.sb_csum2)-((char*)&bsb));
 
-               lseek64(destfd[i], destoffsets[i] - 4096, 0);
-               write(destfd[i], &bsb, 512);
+               if (lseek64(destfd[i], destoffsets[i] - 4096, 0) != destoffsets[i] - 4096)
+                       rv = 1;
+               rv = rv ?: write(destfd[i], &bsb, 512);
+               if (destoffsets[i] > 4096) {
+                       if (lseek64(destfd[i], destoffsets[i]+stripes*chunk*odata, 0) !=
+                           destoffsets[i]+stripes*chunk*odata)
+                               rv = 1;
+                       rv = rv ?: write(destfd[i], &bsb, 512);
+               }
                fsync(destfd[i]);
        }
 
-       return 0;
+       return rv;
 }
 
 /* in 2.6.30, the value reported by sync_completed can be
@@ -1259,6 +1367,7 @@ int grow_backup(struct mdinfo *sra,
  * The various caller give appropriate values so that
  * every works.
  */
+/* FIXME return value is often ignored */
 int wait_backup(struct mdinfo *sra,
                unsigned long long offset, /* per device */
                unsigned long long blocks, /* per device */
@@ -1272,6 +1381,7 @@ int wait_backup(struct mdinfo *sra,
        int fd = sysfs_get_fd(sra, NULL, "sync_completed");
        unsigned long long completed;
        int i;
+       int rv;
 
        if (fd < 0)
                return -1;
@@ -1302,24 +1412,29 @@ int wait_backup(struct mdinfo *sra,
                bsb.arraystart = __cpu_to_le64(0);
                bsb.length = __cpu_to_le64(0);
        }
+       bsb.mtime = __cpu_to_le64(time(0));
+       rv = 0;
        for (i = 0; i < dests; i++) {
                bsb.devstart = __cpu_to_le64(destoffsets[i]/512);
                bsb.sb_csum = bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum)-((char*)&bsb));
                if (memcmp(bsb.magic, "md_backup_data-2", 16) == 0)
                        bsb.sb_csum2 = bsb_csum((char*)&bsb,
                                                ((char*)&bsb.sb_csum2)-((char*)&bsb));
-               lseek64(destfd[i], destoffsets[i]-4096, 0);
-               write(destfd[i], &bsb, 512);
+               if (lseek64(destfd[i], destoffsets[i]-4096, 0) !=
+                   destoffsets[i]-4096)
+                       rv = 1;
+               rv = rv ?: write(destfd[i], &bsb, 512);
                fsync(destfd[i]);
        }
-       return 0;
+       return rv;
 }
 
 static void fail(char *msg)
 {
-       write(2, msg, strlen(msg));
-       write(2, "\n", 1);
-       exit(1);
+       int rv;
+       rv = write(2, msg, strlen(msg));
+       rv |= write(2, "\n", 1);
+       exit(rv ? 1 : 2);
 }
 
 static char *abuf, *bbuf;
@@ -1330,7 +1445,6 @@ static void validate(int afd, int bfd, unsigned long long offset)
         * This is only used for regression testing and should not
         * be used while the array is active
         */
-       struct mdp_backup_super bsb2;
        if (afd < 0)
                return;
        lseek64(bfd, offset - 4096, 0);
@@ -1356,27 +1470,33 @@ static void validate(int afd, int bfd, unsigned long long offset)
                        free(abuf);
                        free(bbuf);
                        abuflen = len;
-                       posix_memalign((void**)&abuf, 4096, abuflen);
-                       posix_memalign((void**)&bbuf, 4096, abuflen);
+                       if (posix_memalign((void**)&abuf, 4096, abuflen) ||
+                           posix_memalign((void**)&bbuf, 4096, abuflen)) {
+                               abuflen = 0;
+                               /* just stop validating on mem-alloc failure */
+                               return;
+                       }
                }
 
                lseek64(bfd, offset, 0);
                if (read(bfd, bbuf, len) != len) {
-                       printf("len %llu\n", len);
+                       //printf("len %llu\n", len);
                        fail("read first backup failed");
                }
                lseek64(afd, __le64_to_cpu(bsb2.arraystart)*512, 0);
                if (read(afd, abuf, len) != len)
                        fail("read first from array failed");
                if (memcmp(bbuf, abuf, len) != 0) {
+                       #if 0
                        int i;
                        printf("offset=%llu len=%llu\n",
-                              __le64_to_cpu(bsb2.arraystart)*512, len);
+                              (unsigned long long)__le64_to_cpu(bsb2.arraystart)*512, len);
                        for (i=0; i<len; i++)
                                if (bbuf[i] != abuf[i]) {
                                        printf("first diff byte %d\n", i);
                                        break;
                                }
+                       #endif
                        fail("data1 compare failed");
                }
        }
@@ -1408,19 +1528,21 @@ static int child_grow(int afd, struct mdinfo *sra, unsigned long stripes,
                      int dests, int *destfd, unsigned long long *destoffsets)
 {
        char *buf;
+       int degraded = 0;
 
-       posix_memalign((void**)&buf, 4096, disks * chunk);
+       if (posix_memalign((void**)&buf, 4096, disks * chunk))
+               /* Don't start the 'reshape' */
+               return 0;
        sysfs_set_num(sra, NULL, "suspend_hi", 0);
        sysfs_set_num(sra, NULL, "suspend_lo", 0);
        grow_backup(sra, 0, stripes,
                    fds, offsets, disks, chunk, level, layout,
                    dests, destfd, destoffsets,
-                   0, buf);
+                   0, &degraded, buf);
        validate(afd, destfd[0], destoffsets[0]);
-       if (wait_backup(sra, 0, stripes * chunk / 512, stripes * chunk / 512,
-                       dests, destfd, destoffsets,
-                       0) < 0)
-               return 0;
+       wait_backup(sra, 0, stripes * chunk / 512, stripes * chunk / 512,
+                   dests, destfd, destoffsets,
+                   0);
        sysfs_set_num(sra, NULL, "suspend_lo", (stripes * chunk/512) * data);
        free(buf);
        /* FIXME this should probably be numeric */
@@ -1436,8 +1558,10 @@ static int child_shrink(int afd, struct mdinfo *sra, unsigned long stripes,
        char *buf;
        unsigned long long start;
        int rv;
+       int degraded = 0;
 
-       posix_memalign((void**)&buf, 4096, disks * chunk);
+       if (posix_memalign((void**)&buf, 4096, disks * chunk))
+               return 0;
        start = sra->component_size - stripes * chunk/512;
        sysfs_set_num(sra, NULL, "sync_max", start);
        sysfs_set_str(sra, NULL, "sync_action", "reshape");
@@ -1451,12 +1575,10 @@ static int child_shrink(int afd, struct mdinfo *sra, unsigned long stripes,
                    fds, offsets,
                    disks, chunk, level, layout,
                    dests, destfd, destoffsets,
-                   0, buf);
+                   0, &degraded, buf);
        validate(afd, destfd[0], destoffsets[0]);
-       rv = wait_backup(sra, start, stripes*chunk/512, 0,
-                        dests, destfd, destoffsets, 0);
-       if (rv < 0)
-               return 0;
+       wait_backup(sra, start, stripes*chunk/512, 0,
+                   dests, destfd, destoffsets, 0);
        sysfs_set_num(sra, NULL, "suspend_lo", (stripes * chunk/512) * data);
        free(buf);
        /* FIXME this should probably be numeric */
@@ -1466,17 +1588,20 @@ static int child_shrink(int afd, struct mdinfo *sra, unsigned long stripes,
 
 static int child_same_size(int afd, struct mdinfo *sra, unsigned long stripes,
                           int *fds, unsigned long long *offsets,
+                          unsigned long long start,
                           int disks, int chunk, int level, int layout, int data,
                           int dests, int *destfd, unsigned long long *destoffsets)
 {
-       unsigned long long start, size;
+       unsigned long long size;
        unsigned long tailstripes = stripes;
        int part;
        char *buf;
        unsigned long long speed;
+       int degraded = 0;
 
 
-       posix_memalign((void**)&buf, 4096, disks * chunk);
+       if (posix_memalign((void**)&buf, 4096, disks * chunk))
+               return 0;
 
        sysfs_set_num(sra, NULL, "suspend_lo", 0);
        sysfs_set_num(sra, NULL, "suspend_hi", 0);
@@ -1484,19 +1609,19 @@ static int child_same_size(int afd, struct mdinfo *sra, unsigned long stripes,
        sysfs_get_ll(sra, NULL, "sync_speed_min", &speed);
        sysfs_set_num(sra, NULL, "sync_speed_min", 200000);
 
-       grow_backup(sra, 0, stripes,
+       grow_backup(sra, start, stripes,
                    fds, offsets,
                    disks, chunk, level, layout,
                    dests, destfd, destoffsets,
-                   0, buf);
-       grow_backup(sra, stripes * chunk/512, stripes,
+                   0, &degraded, buf);
+       grow_backup(sra, (start + stripes) * chunk/512, stripes,
                    fds, offsets,
                    disks, chunk, level, layout,
                    dests, destfd, destoffsets,
-                   1, buf);
+                   1, &degraded, buf);
        validate(afd, destfd[0], destoffsets[0]);
        part = 0;
-       start = stripes * 2; /* where to read next */
+       start += stripes * 2; /* where to read next */
        size = sra->component_size / (chunk/512);
        while (start < size) {
                if (wait_backup(sra, (start-stripes*2)*chunk/512,
@@ -1512,7 +1637,7 @@ static int child_same_size(int afd, struct mdinfo *sra, unsigned long stripes,
                            fds, offsets,
                            disks, chunk, level, layout,
                            dests, destfd, destoffsets,
-                           part, buf);
+                           part, &degraded, buf);
                start += stripes;
                part = 1 - part;
                validate(afd, destfd[0], destoffsets[0]);
@@ -1522,10 +1647,9 @@ static int child_same_size(int afd, struct mdinfo *sra, unsigned long stripes,
                        part) < 0)
                return 0;
        sysfs_set_num(sra, NULL, "suspend_lo", ((start-stripes)*chunk/512) * data);
-       if (wait_backup(sra, (start-stripes) * chunk/512, tailstripes * chunk/512, 0,
-                       dests, destfd, destoffsets,
-                       1-part) < 0)
-               return 0;
+       wait_backup(sra, (start-stripes) * chunk/512, tailstripes * chunk/512, 0,
+                   dests, destfd, destoffsets,
+                   1-part);
        sysfs_set_num(sra, NULL, "suspend_lo", (size*chunk/512) * data);
        sysfs_set_num(sra, NULL, "sync_speed_min", speed);
        free(buf);
@@ -1537,27 +1661,35 @@ static int child_same_size(int afd, struct mdinfo *sra, unsigned long stripes,
  * write that data into the array and update the super blocks with
  * the new reshape_progress
  */
-int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt, char *backup_file)
+int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt,
+                char *backup_file, int verbose)
 {
        int i, j;
        int old_disks;
        unsigned long long *offsets;
-       unsigned long long  nstripe, ostripe, last_block;
+       unsigned long long  nstripe, ostripe;
        int ndata, odata;
 
-       if (info->delta_disks < 0)
-               return 1; /* cannot handle a shrink */
-       if (info->new_level != info->array.level ||
-           info->new_layout != info->array.layout ||
-           info->new_chunk != info->array.chunk_size)
-               return 1; /* Can only handle change in disks */
+       if (info->new_level != info->array.level)
+               return 1; /* Cannot handle level changes (they are instantaneous) */
+
+       odata = info->array.raid_disks - info->delta_disks - 1;
+       if (info->array.level == 6) odata--; /* number of data disks */
+       ndata = info->array.raid_disks - 1;
+       if (info->new_level == 6) ndata--;
 
        old_disks = info->array.raid_disks - info->delta_disks;
 
+       if (info->delta_disks <= 0)
+               /* Didn't grow, so the backup file must have
+                * been used
+                */
+               old_disks = cnt;
        for (i=old_disks-(backup_file?1:0); i<cnt; i++) {
                struct mdinfo dinfo;
-               char buf[4096];
                int fd;
+               int bsbsize;
+               char *devname, namebuf[20];
 
                /* This was a spare and may have some saved data on it.
                 * Load the superblock, find and load the
@@ -1568,8 +1700,12 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt
                 */
                if (i == old_disks-1) {
                        fd = open(backup_file, O_RDONLY);
-                       if (fd<0)
+                       if (fd<0) {
+                               fprintf(stderr, Name ": backup file %s inaccessible: %s\n",
+                                       backup_file, strerror(errno));
                                continue;
+                       }
+                       devname = backup_file;
                } else {
                        fd = fdlist[i];
                        if (fd < 0)
@@ -1582,35 +1718,110 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt
 
                        if (lseek64(fd,
                                    (dinfo.data_offset + dinfo.component_size - 8) <<9,
-                                   0) < 0)
+                                   0) < 0) {
+                               fprintf(stderr, Name ": Cannot seek on device %d\n", i);
                                continue; /* Cannot seek */
+                       }
+                       sprintf(namebuf, "device-%d", i);
+                       devname = namebuf;
                }
-               if (read(fd, &bsb, sizeof(bsb)) != sizeof(bsb))
+               if (read(fd, &bsb, sizeof(bsb)) != sizeof(bsb)) {
+                       if (verbose)
+                               fprintf(stderr, Name ": Cannot read from %s\n", devname);
                        continue; /* Cannot read */
-               if (memcmp(bsb.magic, "md_backup_data-1", 16) != 0)
+               }
+               if (memcmp(bsb.magic, "md_backup_data-1", 16) != 0 &&
+                   memcmp(bsb.magic, "md_backup_data-2", 16) != 0) {
+                       if (verbose)
+                               fprintf(stderr, Name ": No backup metadata on %s\n", devname);
                        continue;
-               if (bsb.sb_csum != bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum)-((char*)&bsb)))
+               }
+               if (bsb.sb_csum != bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum)-((char*)&bsb))) {
+                       if (verbose)
+                               fprintf(stderr, Name ": Bad backup-metadata checksum on %s\n", devname);
                        continue; /* bad checksum */
-               if (memcmp(bsb.set_uuid,info->uuid, 16) != 0)
+               }
+               if (memcmp(bsb.magic, "md_backup_data-2", 16) == 0 &&
+                   bsb.sb_csum2 != bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum2)-((char*)&bsb))) {
+                       if (verbose)
+                               fprintf(stderr, Name ": Bad backup-metadata checksum2 on %s\n", devname);
+                       continue; /* Bad second checksum */
+               }
+               if (memcmp(bsb.set_uuid,info->uuid, 16) != 0) {
+                       if (verbose)
+                               fprintf(stderr, Name ": Wrong uuid on backup-metadata on %s\n", devname);
                        continue; /* Wrong uuid */
+               }
 
-               if (info->array.utime > __le64_to_cpu(bsb.mtime) + 3600 ||
-                   info->array.utime < __le64_to_cpu(bsb.mtime))
-                       continue; /* time stamp is too bad */
-
-               if (__le64_to_cpu(bsb.arraystart) != 0)
-                       continue; /* Can only handle backup from start of array */
-               if (__le64_to_cpu(bsb.length) <
-                   info->reshape_progress)
-                       continue; /* No new data here */
+               /* array utime and backup-mtime should be updated at much the same time, but it seems that
+                * sometimes they aren't... So allow considerable flexibility in matching, and allow
+                * this test to be overridden by an environment variable.
+                */
+               if (info->array.utime > __le64_to_cpu(bsb.mtime) + 2*60*60 ||
+                   info->array.utime < __le64_to_cpu(bsb.mtime) - 10*60) {
+                       if (check_env("MDADM_GROW_ALLOW_OLD")) {
+                               fprintf(stderr, Name ": accepting backup with timestamp %lu "
+                                       "for array with timestamp %lu\n",
+                                       (unsigned long)__le64_to_cpu(bsb.mtime),
+                                       (unsigned long)info->array.utime);
+                       } else {
+                               if (verbose)
+                                       fprintf(stderr, Name ": too-old timestamp on "
+                                               "backup-metadata on %s\n", devname);
+                               continue; /* time stamp is too bad */
+                       }
+               }
 
-               if (lseek64(fd, __le64_to_cpu(bsb.devstart)*512, 0)< 0)
+               if (bsb.magic[15] == '1') {
+               if (info->delta_disks >= 0) {
+                       /* reshape_progress is increasing */
+                       if (__le64_to_cpu(bsb.arraystart) + __le64_to_cpu(bsb.length) <
+                           info->reshape_progress) {
+                       nonew:
+                               if (verbose)
+                                       fprintf(stderr, Name ": backup-metadata found on %s but is not needed\n", devname);
+                               continue; /* No new data here */
+                       }
+               } else {
+                       /* reshape_progress is decreasing */
+                       if (__le64_to_cpu(bsb.arraystart) >=
+                           info->reshape_progress)
+                               goto nonew; /* No new data here */
+               }
+               } else {
+               if (info->delta_disks >= 0) {
+                       /* reshape_progress is increasing */
+                       if (__le64_to_cpu(bsb.arraystart) + __le64_to_cpu(bsb.length) <
+                           info->reshape_progress &&
+                           __le64_to_cpu(bsb.arraystart2) + __le64_to_cpu(bsb.length2) <
+                           info->reshape_progress)
+                               goto nonew; /* No new data here */
+               } else {
+                       /* reshape_progress is decreasing */
+                       if (__le64_to_cpu(bsb.arraystart) >=
+                           info->reshape_progress &&
+                           __le64_to_cpu(bsb.arraystart2) >=
+                           info->reshape_progress)
+                               goto nonew; /* No new data here */
+               }
+               }
+               if (lseek64(fd, __le64_to_cpu(bsb.devstart)*512, 0)< 0) {
+               second_fail:
+                       if (verbose)
+                               fprintf(stderr, Name ": Failed to verify secondary backup-metadata block on %s\n",
+                                       devname);
                        continue; /* Cannot seek */
+               }
                /* There should be a duplicate backup superblock 4k before here */
                if (lseek64(fd, -4096, 1) < 0 ||
-                   read(fd, buf, 4096) != 4096 ||
-                   memcmp(buf, &bsb, sizeof(bsb)) != 0)
-                       continue; /* Cannot find leading superblock */
+                   read(fd, &bsb2, 4096) != 4096)
+                       goto second_fail; /* Cannot find leading superblock */
+               if (bsb.magic[15] == '1')
+                       bsbsize = offsetof(struct mdp_backup_super, pad1);
+               else
+                       bsbsize = offsetof(struct mdp_backup_super, pad);
+               if (memcmp(&bsb2, &bsb, bsbsize) != 0)
+                       goto second_fail; /* Cannot find leading superblock */
 
                /* Now need the data offsets for all devices. */
                offsets = malloc(sizeof(*offsets)*info->array.raid_disks);
@@ -1622,7 +1833,7 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt
                                continue;
                        st->ss->getinfo_super(st, &dinfo);
                        st->ss->free_super(st);
-                       offsets[j] = dinfo.data_offset;
+                       offsets[j] = dinfo.data_offset * 512;
                }
                printf(Name ": restoring critical section\n");
 
@@ -1632,47 +1843,263 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt
                                    info->new_level,
                                    info->new_layout,
                                    fd, __le64_to_cpu(bsb.devstart)*512,
-                                   0, __le64_to_cpu(bsb.length)*512)) {
+                                   __le64_to_cpu(bsb.arraystart)*512,
+                                   __le64_to_cpu(bsb.length)*512)) {
                        /* didn't succeed, so giveup */
+                       if (verbose)
+                               fprintf(stderr, Name ": Error restoring backup from %s\n",
+                                       devname);
+                       return 1;
+               }
+               
+               if (bsb.magic[15] == '2' &&
+                   restore_stripes(fdlist, offsets,
+                                   info->array.raid_disks,
+                                   info->new_chunk,
+                                   info->new_level,
+                                   info->new_layout,
+                                   fd, __le64_to_cpu(bsb.devstart)*512 +
+                                   __le64_to_cpu(bsb.devstart2)*512,
+                                   __le64_to_cpu(bsb.arraystart2)*512,
+                                   __le64_to_cpu(bsb.length2)*512)) {
+                       /* didn't succeed, so give up */
+                       if (verbose)
+                               fprintf(stderr, Name ": Error restoring second backup from %s\n",
+                                       devname);
                        return 1;
                }
 
+
                /* Ok, so the data is restored. Let's update those superblocks. */
 
+               if (info->delta_disks >= 0) {
+                       info->reshape_progress = __le64_to_cpu(bsb.arraystart) +
+                               __le64_to_cpu(bsb.length);
+                       if (bsb.magic[15] == '2') {
+                               unsigned long long p2 = __le64_to_cpu(bsb.arraystart2) +
+                                       __le64_to_cpu(bsb.length2);
+                               if (p2 > info->reshape_progress)
+                                       info->reshape_progress = p2;
+                       }
+               } else {
+                       info->reshape_progress = __le64_to_cpu(bsb.arraystart);
+                       if (bsb.magic[15] == '2') {
+                               unsigned long long p2 = __le64_to_cpu(bsb.arraystart2);
+                               if (p2 < info->reshape_progress)
+                                       info->reshape_progress = p2;
+                       }
+               }
                for (j=0; j<info->array.raid_disks; j++) {
                        if (fdlist[j] < 0) continue;
                        if (st->ss->load_super(st, fdlist[j], NULL))
                                continue;
                        st->ss->getinfo_super(st, &dinfo);
-                       dinfo.reshape_progress = __le64_to_cpu(bsb.length);
+                       dinfo.reshape_progress = info->reshape_progress;
                        st->ss->update_super(st, &dinfo,
                                             "_reshape_progress",
                                             NULL,0, 0, NULL);
                        st->ss->store_super(st, fdlist[j]);
                        st->ss->free_super(st);
                }
-
-               /* And we are done! */
                return 0;
        }
        /* Didn't find any backup data, try to see if any
         * was needed.
         */
-       nstripe = ostripe = 0;
-       odata = info->array.raid_disks - info->delta_disks - 1;
-       if (info->array.level == 6) odata--; /* number of data disks */
-       ndata = info->array.raid_disks - 1;
-       if (info->new_level == 6) ndata--;
-       last_block = 0;
-       while (nstripe >= ostripe) {
-               nstripe += info->new_chunk / 512;
-               last_block = nstripe * ndata;
-               ostripe = last_block / odata / (info->array.chunk_size/512) *
-                       (info->array.chunk_size/512);
+       if (info->delta_disks < 0) {
+               /* When shrinking, the critical section is at the end.
+                * So see if we are before the critical section.
+                */
+               unsigned long long first_block;
+               nstripe = ostripe = 0;
+               first_block = 0;
+               while (ostripe >= nstripe) {
+                       ostripe += info->array.chunk_size / 512;
+                       first_block = ostripe * odata;
+                       nstripe = first_block / ndata / (info->new_chunk/512) *
+                               (info->new_chunk/512);
+               }
+
+               if (info->reshape_progress >= first_block)
+                       return 0;
        }
+       if (info->delta_disks > 0) {
+               /* See if we are beyond the critical section. */
+               unsigned long long last_block;
+               nstripe = ostripe = 0;
+               last_block = 0;
+               while (nstripe >= ostripe) {
+                       nstripe += info->new_chunk / 512;
+                       last_block = nstripe * ndata;
+                       ostripe = last_block / odata / (info->array.chunk_size/512) *
+                               (info->array.chunk_size/512);
+               }
 
-       if (info->reshape_progress >= last_block)
-               return 0;
+               if (info->reshape_progress >= last_block)
+                       return 0;
+       }
        /* needed to recover critical section! */
+       if (verbose)
+               fprintf(stderr, Name ": Failed to find backup of critical section\n");
        return 1;
 }
+
+int Grow_continue(int mdfd, struct supertype *st, struct mdinfo *info,
+                 char *backup_file)
+{
+       /* Array is assembled and ready to be started, but
+        * monitoring is probably required.
+        * So:
+        *   - start read-only
+        *   - set upper bound for resync
+        *   - initialise the 'suspend' boundaries
+        *   - switch to read-write
+        *   - fork and continue monitoring
+        */
+       int err;
+       int backup_list[1];
+       unsigned long long backup_offsets[1];
+       int odisks, ndisks, ochunk, nchunk,odata,ndata;
+       unsigned long a,b,blocks,stripes;
+       int backup_fd;
+       int *fds;
+       unsigned long long *offsets;
+       int d;
+       struct mdinfo *sra, *sd;
+       int rv;
+       int cache;
+       int done = 0;
+
+       err = sysfs_set_str(info, NULL, "array_state", "readonly");
+       if (err)
+               return err;
+
+       /* make sure reshape doesn't progress until we are ready */
+       sysfs_set_str(info, NULL, "sync_max", "0");
+       sysfs_set_str(info, NULL, "array_state", "active"); /* FIXME or clean */
+       
+       /* disk count is not growing, so raid_disks is the old count and raid_disks + delta_disks the new */
+       odisks = info->array.raid_disks;
+       ndisks = odisks + info->delta_disks;
+       odata = odisks - 1;
+       ndata = ndisks - 1;
+       if (info->array.level == 6) {
+               odata--;
+               ndata--;
+       }
+       ochunk = info->array.chunk_size;
+       nchunk = info->new_chunk;
+
+
+       a = ochunk/512 * odata;
+       b = nchunk/512 * ndata;
+       /* Find GCD */
+       while (a != b) {
+               if (a < b)
+                       b -= a;
+               if (b < a)
+                       a -= b;
+       }
+       /* LCM == product / GCD */
+       blocks = ochunk/512 * nchunk/512 * odata * ndata / a;
+
+       sra = sysfs_read(-1, devname2devnum(info->sys_name),
+                        GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE|
+                        GET_CACHE);
+
+
+       if (ndata == odata)
+               while (blocks * 32 < sra->component_size &&
+                      blocks < 16*1024*2)
+                       blocks *= 2;
+       stripes = blocks / (info->array.chunk_size/512) / odata;
+
+       /* check that the internal stripe cache is
+        * large enough, or it won't work.
+        */
+       cache = (nchunk < ochunk) ? ochunk : nchunk;
+       cache = cache * 4 / 4096;
+       if (cache < blocks / 8 / odisks + 16)
+               /* Make it big enough to hold 'blocks' */
+               cache = blocks / 8 / odisks + 16;
+       if (sra->cache_size < cache)
+               sysfs_set_num(sra, NULL, "stripe_cache_size",
+                             cache+1);
+
+       memset(&bsb, 0, 512);
+       memcpy(bsb.magic, "md_backup_data-1", 16);
+       memcpy(&bsb.set_uuid, info->uuid, 16);
+       bsb.mtime = __cpu_to_le64(time(0));
+       bsb.devstart2 = blocks;
+
+       backup_fd = open(backup_file, O_RDWR|O_CREAT, S_IRUSR | S_IWUSR);
+       backup_list[0] = backup_fd;
+       backup_offsets[0] = 8 * 512;
+       fds = malloc(odisks * sizeof(fds[0]));
+       offsets = malloc(odisks * sizeof(offsets[0]));
+       for (d=0; d<odisks; d++)
+               fds[d] = -1;
+
+       for (sd = sra->devs; sd; sd = sd->next) {
+               if (sd->disk.state & (1<<MD_DISK_FAULTY))
+                       continue;
+               if (sd->disk.state & (1<<MD_DISK_SYNC)) {
+                       char *dn = map_dev(sd->disk.major,
+                                          sd->disk.minor, 1);
+                       fds[sd->disk.raid_disk]
+                               = dev_open(dn, O_RDONLY);
+                       offsets[sd->disk.raid_disk] = sd->data_offset*512;
+                       if (fds[sd->disk.raid_disk] < 0) {
+                               fprintf(stderr, Name ": %s: cannot open component %s\n",
+                                       info->sys_name, dn?dn:"-unknown-");
+                               rv = 1;
+                               goto release;
+                       }
+                       free(dn);
+               }
+       }
+
+       switch(fork()) {
+       case 0:
+               close(mdfd);
+               mlockall(MCL_FUTURE);
+               if (info->delta_disks < 0)
+                       done = child_shrink(-1, info, stripes,
+                                           fds, offsets,
+                                           info->array.raid_disks,
+                                           info->array.chunk_size,
+                                           info->array.level, info->array.layout,
+                                           odata,
+                                           1, backup_list, backup_offsets);
+               else if (info->delta_disks == 0) {
+                       /* The 'start' is a per-device stripe number.
+                        * reshape_progress is a per-array sector number.
+                        * So divide by ndata * chunk_size
+                        */
+                       unsigned long long start = info->reshape_progress / ndata;
+                       start /= (info->array.chunk_size/512);
+                       done = child_same_size(-1, info, stripes,
+                                              fds, offsets,
+                                              start,
+                                              info->array.raid_disks,
+                                              info->array.chunk_size,
+                                              info->array.level, info->array.layout,
+                                              odata,
+                                              1, backup_list, backup_offsets);
+               }
+               if (backup_file && done)
+                       unlink(backup_file);
+               /* FIXME should I intuit a level change */
+               exit(0);
+       case -1:
+               fprintf(stderr, Name ": Cannot run child to continue monitoring reshape: %s\n",
+                       strerror(errno));
+               return 1;
+       default:
+               break;
+       }
+release:
+       return 0;
+}
+
+