]> git.ipfire.org Git - thirdparty/mdadm.git/blobdiff - Grow.c
imsm: add support for checkpointing via 'curr_migr_unit'
[thirdparty/mdadm.git] / Grow.c
diff --git a/Grow.c b/Grow.c
index f36704191a73650b448169ba5655c4e80c8ce168..a654d4e84f2aba2571797e11698772bc53d50535 100644 (file)
--- a/Grow.c
+++ b/Grow.c
@@ -381,7 +381,7 @@ int Grow_addbitmap(char *devname, int fd, char *file, int chunk, int delay, int
 /*
  * When reshaping an array we might need to backup some data.
  * This is written to all spares with a 'super_block' describing it.
- * The superblock goes 1K form the end of the used space on the
+ * The superblock goes 4K from the end of the used space on the
  * device.
  * It is written after the backup is complete.
  * It has the following structure.
@@ -524,6 +524,15 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
                return 1;
        }
 
+       if (size >= 0 &&
+           (chunksize || level!= UnSet || layout_str || raid_disks)) {
+               fprintf(stderr, Name ": cannot change component size at the same time "
+                       "as other changes.\n"
+                       "   Change size first, then check data is intact before "
+                       "making other changes.\n");
+               return 1;
+       }
+
        if (raid_disks && raid_disks < array.raid_disks && array.level > 1 &&
            get_linux_version() < 2006032 &&
            !check_env("MDADM_FORCE_FEWER")) {
@@ -644,8 +653,10 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
                                layout_str = "parity-last";
                } else {
                        c = map_num(pers, level);
-                       if (c == NULL)
-                               return 1;/* not possible */
+                       if (c == NULL) {
+                               rv = 1;/* not possible */
+                               goto release;
+                       }
                        err = sysfs_set_str(sra, NULL, "level", c);
                        if (err) {
                                fprintf(stderr, Name ": %s: could not set level to %s\n",
@@ -849,7 +860,8 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
                                if (nlayout == UnSet) {
                                        fprintf(stderr, Name ": layout %s not understood for raid5.\n",
                                                layout_str);
-                                       return 1;
+                                       rv = 1;
+                                       goto release;
                                }
                                break;
 
@@ -858,7 +870,8 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
                                if (nlayout == UnSet) {
                                        fprintf(stderr, Name ": layout %s not understood for raid6.\n",
                                                layout_str);
-                                       return 1;
+                                       rv = 1;
+                                       goto release;
                                }
                                break;
                        }
@@ -871,8 +884,14 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
                        ndata--;
                }
 
+               if (odata == ndata &&
+                   get_linux_version() < 2006032) {
+                       fprintf(stderr, Name ": in-place reshape is not safe before 2.6.32, sorry.\n");
+                       break;
+               }
+
                /* Check that we can hold all the data */
-               size = ndata * array.size;
+               size = ndata * (long long)array.size;
                get_dev_size(fd, NULL, &array_size);
                if (size < (array_size/1024)) {
                        fprintf(stderr, Name ": this change will reduce the size of the array.\n"
@@ -900,20 +919,23 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
                /* LCM == product / GCD */
                blocks = ochunk/512 * nchunk/512 * odata * ndata / a;
 
+               sysfs_free(sra);
+               sra = sysfs_read(fd, 0,
+                                GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE|
+                                GET_CACHE);
+
                if (ndata == odata) {
                        /* Make 'blocks' bigger for better throughput, but
                         * not so big that we reject it below.
+                        * Try for 16 megabytes
                         */
-                       if (blocks * 32 < sra->component_size)
-                               blocks *= 16;
+                       while (blocks * 32 < sra->component_size &&
+                              blocks < 16*1024*2)
+                              blocks *= 2;
                } else
                        fprintf(stderr, Name ": Need to backup %luK of critical "
                                "section..\n", blocks/2);
 
-               sysfs_free(sra);
-               sra = sysfs_read(fd, 0,
-                                GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE|
-                                GET_CACHE);
                if (!sra) {
                        fprintf(stderr, Name ": %s: Cannot get array details from sysfs\n",
                                devname);
@@ -960,7 +982,7 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
                                char *dn = map_dev(sd->disk.major,
                                                   sd->disk.minor, 1);
                                fdlist[d] = dev_open(dn, O_RDWR);
-                               offsets[d] = (sra->component_size - blocks - 8)*512;
+                               offsets[d] = (sd->data_offset + sra->component_size - blocks - 8)*512;
                                if (fdlist[d]<0) {
                                        fprintf(stderr, Name ": %s: cannot open component %s\n",
                                                devname, dn?dn:"-unknown");
@@ -1025,6 +1047,9 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
                
                cache = (nchunk < ochunk) ? ochunk : nchunk;
                cache = cache * 4 / 4096;
+               if (cache < blocks / 8 / odisks + 16)
+                       /* Make it big enough to hold 'blocks' */
+                       cache = blocks / 8 / odisks + 16;
                if (sra->cache_size < cache)
                        sysfs_set_num(sra, NULL, "stripe_cache_size",
                                      cache+1);
@@ -1205,7 +1230,7 @@ int grow_backup(struct mdinfo *sra,
                int *sources, unsigned long long *offsets,
                int disks, int chunk, int level, int layout,
                int dests, int *destfd, unsigned long long *destoffsets,
-               int part,
+               int part, int *degraded,
                char *buf)
 {
        /* Backup 'blocks' sectors at 'offset' on each device of the array,
@@ -1217,12 +1242,38 @@ int grow_backup(struct mdinfo *sra,
        int odata = disks;
        int rv = 0;
        int i;
+       unsigned long long new_degraded;
        //printf("offset %llu\n", offset);
        if (level >= 4)
                odata--;
        if (level == 6)
                odata--;
        sysfs_set_num(sra, NULL, "suspend_hi", (offset + stripes * chunk/512) * odata);
+       /* Check that array hasn't become degraded, else we might backup the wrong data */
+       sysfs_get_ll(sra, NULL, "degraded", &new_degraded);
+       if (new_degraded != *degraded) {
+               /* check each device to ensure it is still working */
+               struct mdinfo *sd;
+               for (sd = sra->devs ; sd ; sd = sd->next) {
+                       if (sd->disk.state & (1<<MD_DISK_FAULTY))
+                               continue;
+                       if (sd->disk.state & (1<<MD_DISK_SYNC)) {
+                               char sbuf[20];
+                               if (sysfs_get_str(sra, sd, "state", sbuf, 20) < 0 ||
+                                   strstr(sbuf, "faulty") ||
+                                   strstr(sbuf, "in_sync") == NULL) {
+                                       /* this device is dead */
+                                       sd->disk.state = (1<<MD_DISK_FAULTY);
+                                       if (sd->disk.raid_disk >= 0 &&
+                                           sources[sd->disk.raid_disk] >= 0) {
+                                               close(sources[sd->disk.raid_disk]);
+                                               sources[sd->disk.raid_disk] = -1;
+                                       }
+                               }
+                       }
+               }
+               *degraded = new_degraded;
+       }
        if (part) {
                bsb.arraystart2 = __cpu_to_le64(offset * odata);
                bsb.length2 = __cpu_to_le64(stripes * chunk/512 * odata);
@@ -1257,6 +1308,10 @@ int grow_backup(struct mdinfo *sra,
 
                lseek64(destfd[i], destoffsets[i] - 4096, 0);
                write(destfd[i], &bsb, 512);
+               if (destoffsets[i] > 4096) {
+                       lseek64(destfd[i], destoffsets[i]+stripes*chunk*odata, 0);
+                       write(destfd[i], &bsb, 512);
+               }
                fsync(destfd[i]);
        }
 
@@ -1422,6 +1477,7 @@ static int child_grow(int afd, struct mdinfo *sra, unsigned long stripes,
                      int dests, int *destfd, unsigned long long *destoffsets)
 {
        char *buf;
+       int degraded = 0;
 
        posix_memalign((void**)&buf, 4096, disks * chunk);
        sysfs_set_num(sra, NULL, "suspend_hi", 0);
@@ -1429,7 +1485,7 @@ static int child_grow(int afd, struct mdinfo *sra, unsigned long stripes,
        grow_backup(sra, 0, stripes,
                    fds, offsets, disks, chunk, level, layout,
                    dests, destfd, destoffsets,
-                   0, buf);
+                   0, &degraded, buf);
        validate(afd, destfd[0], destoffsets[0]);
        wait_backup(sra, 0, stripes * chunk / 512, stripes * chunk / 512,
                    dests, destfd, destoffsets,
@@ -1449,6 +1505,7 @@ static int child_shrink(int afd, struct mdinfo *sra, unsigned long stripes,
        char *buf;
        unsigned long long start;
        int rv;
+       int degraded = 0;
 
        posix_memalign((void**)&buf, 4096, disks * chunk);
        start = sra->component_size - stripes * chunk/512;
@@ -1464,7 +1521,7 @@ static int child_shrink(int afd, struct mdinfo *sra, unsigned long stripes,
                    fds, offsets,
                    disks, chunk, level, layout,
                    dests, destfd, destoffsets,
-                   0, buf);
+                   0, &degraded, buf);
        validate(afd, destfd[0], destoffsets[0]);
        wait_backup(sra, start, stripes*chunk/512, 0,
                    dests, destfd, destoffsets, 0);
@@ -1486,6 +1543,7 @@ static int child_same_size(int afd, struct mdinfo *sra, unsigned long stripes,
        int part;
        char *buf;
        unsigned long long speed;
+       int degraded = 0;
 
 
        posix_memalign((void**)&buf, 4096, disks * chunk);
@@ -1500,12 +1558,12 @@ static int child_same_size(int afd, struct mdinfo *sra, unsigned long stripes,
                    fds, offsets,
                    disks, chunk, level, layout,
                    dests, destfd, destoffsets,
-                   0, buf);
+                   0, &degraded, buf);
        grow_backup(sra, (start + stripes) * chunk/512, stripes,
                    fds, offsets,
                    disks, chunk, level, layout,
                    dests, destfd, destoffsets,
-                   1, buf);
+                   1, &degraded, buf);
        validate(afd, destfd[0], destoffsets[0]);
        part = 0;
        start += stripes * 2; /* where to read next */
@@ -1524,7 +1582,7 @@ static int child_same_size(int afd, struct mdinfo *sra, unsigned long stripes,
                            fds, offsets,
                            disks, chunk, level, layout,
                            dests, destfd, destoffsets,
-                           part, buf);
+                           part, &degraded, buf);
                start += stripes;
                part = 1 - part;
                validate(afd, destfd[0], destoffsets[0]);
@@ -1574,7 +1632,6 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt
                old_disks = cnt;
        for (i=old_disks-(backup_file?1:0); i<cnt; i++) {
                struct mdinfo dinfo;
-               char buf[4096];
                int fd;
                int bsbsize;
                char *devname, namebuf[20];
@@ -1690,13 +1747,13 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt
                }
                /* There should be a duplicate backup superblock 4k before here */
                if (lseek64(fd, -4096, 1) < 0 ||
-                   read(fd, buf, 4096) != 4096)
+                   read(fd, &bsb2, 4096) != 4096)
                        goto second_fail; /* Cannot find leading superblock */
                if (bsb.magic[15] == '1')
                        bsbsize = offsetof(struct mdp_backup_super, pad1);
                else
                        bsbsize = offsetof(struct mdp_backup_super, pad);
-               if (memcmp(buf, &bsb, bsbsize) != 0)
+               if (memcmp(&bsb2, &bsb, bsbsize) != 0)
                        goto second_fail; /* Cannot find leading superblock */
 
                /* Now need the data offsets for all devices. */
@@ -1709,7 +1766,7 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt
                                continue;
                        st->ss->getinfo_super(st, &dinfo);
                        st->ss->free_super(st);
-                       offsets[j] = dinfo.data_offset;
+                       offsets[j] = dinfo.data_offset * 512;
                }
                printf(Name ": restoring critical section\n");
 
@@ -1719,7 +1776,7 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt
                                    info->new_level,
                                    info->new_layout,
                                    fd, __le64_to_cpu(bsb.devstart)*512,
-                                   __le64_to_cpu(bsb.arraystart),
+                                   __le64_to_cpu(bsb.arraystart)*512,
                                    __le64_to_cpu(bsb.length)*512)) {
                        /* didn't succeed, so giveup */
                        if (verbose)
@@ -1736,7 +1793,7 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt
                                    info->new_layout,
                                    fd, __le64_to_cpu(bsb.devstart)*512 +
                                    __le64_to_cpu(bsb.devstart2)*512,
-                                   __le64_to_cpu(bsb.arraystart2),
+                                   __le64_to_cpu(bsb.arraystart2)*512,
                                    __le64_to_cpu(bsb.length2)*512)) {
                        /* didn't succeed, so giveup */
                        if (verbose)
@@ -1843,6 +1900,7 @@ int Grow_continue(int mdfd, struct supertype *st, struct mdinfo *info,
        int d;
        struct mdinfo *sra, *sd;
        int rv;
+       int cache;
        int done = 0;
 
        err = sysfs_set_str(info, NULL, "array_state", "readonly");
@@ -1878,10 +1936,28 @@ int Grow_continue(int mdfd, struct supertype *st, struct mdinfo *info,
        /* LCM == product / GCD */
        blocks = ochunk/512 * nchunk/512 * odata * ndata / a;
 
+       sra = sysfs_read(-1, devname2devnum(info->sys_name),
+                        GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE|
+                        GET_CACHE);
+
+
        if (ndata == odata)
-               blocks *= 16;
+               while (blocks * 32 < sra->component_size &&
+                      blocks < 16*1024*2)
+                       blocks *= 2;
        stripes = blocks / (info->array.chunk_size/512) / odata;
 
+       /* check that the internal stripe cache is
+        * large enough, or it won't work.
+        */
+       cache = (nchunk < ochunk) ? ochunk : nchunk;
+       cache = cache * 4 / 4096;
+       if (cache < blocks / 8 / odisks + 16)
+               /* Make it big enough to hold 'blocks' */
+               cache = blocks / 8 / odisks + 16;
+       if (sra->cache_size < cache)
+               sysfs_set_num(sra, NULL, "stripe_cache_size",
+                             cache+1);
 
        memset(&bsb, 0, 512);
        memcpy(bsb.magic, "md_backup_data-1", 16);
@@ -1897,10 +1973,6 @@ int Grow_continue(int mdfd, struct supertype *st, struct mdinfo *info,
        for (d=0; d<odisks; d++)
                fds[d] = -1;
 
-       sra = sysfs_read(-1, devname2devnum(info->sys_name),
-                        GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE|
-                        GET_CACHE);
-
        for (sd = sra->devs; sd; sd = sd->next) {
                if (sd->disk.state & (1<<MD_DISK_FAULTY))
                        continue;