X-Git-Url: http://git.ipfire.org/?a=blobdiff_plain;f=Grow.c;h=0571f5b0e16d170e5bb9671d7f62df5f0cefa65c;hb=b9b004ebc7abd5a4d8ddafef1fbf08409f24b330;hp=f36704191a73650b448169ba5655c4e80c8ce168;hpb=d6d5656bd5a05b992188eaf154c2d76a25481c06;p=thirdparty%2Fmdadm.git diff --git a/Grow.c b/Grow.c index f3670419..0571f5b0 100644 --- a/Grow.c +++ b/Grow.c @@ -288,6 +288,11 @@ int Grow_addbitmap(char *devname, int fd, char *file, int chunk, int delay, int return 1; } else if (strcmp(file, "internal") == 0) { int d; + if (st->ss->add_internal_bitmap == NULL) { + fprintf(stderr, Name ": Internal bitmaps not supported " + "with %s metadata\n", st->ss->name); + return 1; + } for (d=0; d< st->max_devs; d++) { mdu_disk_info_t disk; char *dv; @@ -381,7 +386,7 @@ int Grow_addbitmap(char *devname, int fd, char *file, int chunk, int delay, int /* * When reshaping an array we might need to backup some data. * This is written to all spares with a 'super_block' describing it. - * The superblock goes 1K form the end of the used space on the + * The superblock goes 4K from the end of the used space on the * device. * It if written after the backup is complete. * It has the following structure. @@ -404,7 +409,7 @@ static struct mdp_backup_super { __u8 pad[512-68-32]; } __attribute__((aligned(512))) bsb, bsb2; -int bsb_csum(char *buf, int len) +__u32 bsb_csum(char *buf, int len) { int i; int csum = 0; @@ -500,7 +505,7 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file, int nchunk, ochunk; int nlayout, olayout; int ndisks, odisks; - int ndata, odata; + unsigned int ndata, odata; int orig_level = UnSet; char alt_layout[40]; int *fdlist; @@ -510,7 +515,7 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file, int err; int frozen; unsigned long a,b, blocks, stripes; - int cache; + unsigned long cache; unsigned long long array_size; int changed = 0; int done; @@ -524,6 +529,15 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file, return 1; } + if (size >= 0 && + (chunksize || level!= UnSet || layout_str || raid_disks)) { + fprintf(stderr, Name ": cannot change component size at the same time " + "as other changes.\n" + " Change size first, then check data is intact before " + "making other changes.\n"); + return 1; + } + if (raid_disks && raid_disks < array.raid_disks && array.level > 1 && get_linux_version() < 2006032 && !check_env("MDADM_FORCE_FEWER")) { @@ -532,7 +546,13 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file, return 1; } sra = sysfs_read(fd, 0, GET_LEVEL); - frozen = freeze_array(sra); + if (sra) + frozen = freeze_array(sra); + else { + fprintf(stderr, Name ": failed to read sysfs parameters for %s\n", + devname); + return 1; + } if (frozen < 0) { fprintf(stderr, Name ": %s is performing resync/recovery and cannot" " be reshaped\n", devname); @@ -554,16 +574,27 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file, } else rv = ioctl(fd, SET_ARRAY_INFO, &array); if (rv != 0) { + int err = errno; fprintf(stderr, Name ": Cannot set device size for %s: %s\n", - devname, strerror(errno)); + devname, strerror(err)); + if (err == EBUSY && + (array.state & (1<component_size) - blocks *= 16; - } else - fprintf(stderr, Name ": Need to backup %luK of critical " - "section..\n", blocks/2); + blocks = (ochunk/512) * (nchunk/512) * odata * ndata / a; sysfs_free(sra); sra = sysfs_read(fd, 0, GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE| GET_CACHE); + if (!sra) { fprintf(stderr, Name ": %s: Cannot get array details from sysfs\n", devname); @@ -921,13 +961,25 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file, break; } + if (ndata == odata) { + /* Make 'blocks' bigger for better throughput, but + * not so big that we reject it below. + * Try for 16 megabytes + */ + while (blocks * 32 < sra->component_size && + blocks < 16*1024*2) + blocks *= 2; + } else + fprintf(stderr, Name ": Need to backup %luK of critical " + "section..\n", blocks/2); + if (blocks >= sra->component_size/2) { fprintf(stderr, Name ": %s: Something wrong - reshape aborted\n", devname); rv = 1; break; } - nrdisks = array.nr_disks + sra->array.spare_disks; + nrdisks = array.raid_disks + sra->array.spare_disks; /* Now we need to open all these devices so we can read/write. */ fdlist = malloc((1+nrdisks) * sizeof(int)); @@ -960,7 +1012,7 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file, char *dn = map_dev(sd->disk.major, sd->disk.minor, 1); fdlist[d] = dev_open(dn, O_RDWR); - offsets[d] = (sra->component_size - blocks - 8)*512; + offsets[d] = (sd->data_offset + sra->component_size - blocks - 8)*512; if (fdlist[d]<0) { fprintf(stderr, Name ": %s: cannot open component %s\n", devname, dn?dn:"-unknown"); @@ -1002,7 +1054,7 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file, break; } memset(buf, 0, 512); - for (i=0; i < blocks + 1 ; i++) { + for (i=0; i < (signed)blocks + 1 ; i++) { if (write(fdlist[d], buf, 512) != 512) { fprintf(stderr, Name ": %s: cannot create backup file %s: %s\n", devname, backup_file, strerror(errno)); @@ -1025,6 +1077,9 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file, cache = (nchunk < ochunk) ? ochunk : nchunk; cache = cache * 4 / 4096; + if (cache < blocks / 8 / odisks + 16) + /* Make it big enough to hold 'blocks' */ + cache = blocks / 8 / odisks + 16; if (sra->cache_size < cache) sysfs_set_num(sra, NULL, "stripe_cache_size", cache+1); @@ -1035,12 +1090,16 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file, if (ochunk == nchunk && olayout == nlayout) { array.raid_disks = ndisks; if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) { + int err = errno; rv = 1; fprintf(stderr, Name ": Cannot set device shape for %s: %s\n", devname, strerror(errno)); if (ndisks < odisks && get_linux_version() < 2006030) fprintf(stderr, Name ": linux 2.6.30 or later required\n"); + if (err == EBUSY && + (array.state & (1<= 4) odata--; if (level == 6) odata--; - sysfs_set_num(sra, NULL, "suspend_hi", (offset + stripes * chunk/512) * odata); + sysfs_set_num(sra, NULL, "suspend_hi", (offset + stripes * (chunk/512)) * odata); + /* Check that array hasn't become degraded, else we might backup the wrong data */ + sysfs_get_ll(sra, NULL, "degraded", &ll); + new_degraded = (int)ll; + if (new_degraded != *degraded) { + /* check each device to ensure it is still working */ + struct mdinfo *sd; + for (sd = sra->devs ; sd ; sd = sd->next) { + if (sd->disk.state & (1<disk.state & (1<disk.state = (1<disk.raid_disk >= 0 && + sources[sd->disk.raid_disk] >= 0) { + close(sources[sd->disk.raid_disk]); + sources[sd->disk.raid_disk] = -1; + } + } + } + } + *degraded = new_degraded; + } if (part) { bsb.arraystart2 = __cpu_to_le64(offset * odata); - bsb.length2 = __cpu_to_le64(stripes * chunk/512 * odata); + bsb.length2 = __cpu_to_le64(stripes * (chunk/512) * odata); } else { bsb.arraystart = __cpu_to_le64(offset * odata); - bsb.length = __cpu_to_le64(stripes * chunk/512 * odata); + bsb.length = __cpu_to_le64(stripes * (chunk/512) * odata); } if (part) bsb.magic[15] = '2'; @@ -1255,12 +1350,24 @@ int grow_backup(struct mdinfo *sra, bsb.sb_csum2 = bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum2)-((char*)&bsb)); - lseek64(destfd[i], destoffsets[i] - 4096, 0); - write(destfd[i], &bsb, 512); + rv = -1; + if ((unsigned long long)lseek64(destfd[i], destoffsets[i] - 4096, 0) + != destoffsets[i] - 4096) + break; + if (write(destfd[i], &bsb, 512) != 512) + break; + if (destoffsets[i] > 4096) { + if ((unsigned long long)lseek64(destfd[i], destoffsets[i]+stripes*chunk*odata, 0) != + destoffsets[i]+stripes*chunk*odata) + break; + if (write(destfd[i], &bsb, 512) != 512) + break; + } fsync(destfd[i]); + rv = 0; } - return 0; + return rv; } /* in 2.6.30, the value reported by sync_completed can be @@ -1273,6 +1380,7 @@ int grow_backup(struct mdinfo *sra, * The various caller give appropriate values so that * every works. */ +/* FIXME return value is often ignored */ int wait_backup(struct mdinfo *sra, unsigned long long offset, /* per device */ unsigned long long blocks, /* per device */ @@ -1286,6 +1394,7 @@ int wait_backup(struct mdinfo *sra, int fd = sysfs_get_fd(sra, NULL, "sync_completed"); unsigned long long completed; int i; + int rv; if (fd < 0) return -1; @@ -1317,28 +1426,34 @@ int wait_backup(struct mdinfo *sra, bsb.length = __cpu_to_le64(0); } bsb.mtime = __cpu_to_le64(time(0)); + rv = 0; for (i = 0; i < dests; i++) { bsb.devstart = __cpu_to_le64(destoffsets[i]/512); bsb.sb_csum = bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum)-((char*)&bsb)); if (memcmp(bsb.magic, "md_backup_data-2", 16) == 0) bsb.sb_csum2 = bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum2)-((char*)&bsb)); - lseek64(destfd[i], destoffsets[i]-4096, 0); - write(destfd[i], &bsb, 512); + if ((unsigned long long)lseek64(destfd[i], destoffsets[i]-4096, 0) != + destoffsets[i]-4096) + rv = -1; + if (rv == 0 && + write(destfd[i], &bsb, 512) != 512) + rv = -1; fsync(destfd[i]); } - return 0; + return rv; } static void fail(char *msg) { - write(2, msg, strlen(msg)); - write(2, "\n", 1); - exit(1); + int rv; + rv = (write(2, msg, strlen(msg)) != (int)strlen(msg)); + rv |= (write(2, "\n", 1) != 1); + exit(rv ? 1 : 2); } static char *abuf, *bbuf; -static int abuflen; +static unsigned long long abuflen; static void validate(int afd, int bfd, unsigned long long offset) { /* check that the data in the backup against the array. @@ -1370,27 +1485,33 @@ static void validate(int afd, int bfd, unsigned long long offset) free(abuf); free(bbuf); abuflen = len; - posix_memalign((void**)&abuf, 4096, abuflen); - posix_memalign((void**)&bbuf, 4096, abuflen); + if (posix_memalign((void**)&abuf, 4096, abuflen) || + posix_memalign((void**)&bbuf, 4096, abuflen)) { + abuflen = 0; + /* just stop validating on mem-alloc failure */ + return; + } } lseek64(bfd, offset, 0); - if (read(bfd, bbuf, len) != len) { - printf("len %llu\n", len); + if ((unsigned long long)read(bfd, bbuf, len) != len) { + //printf("len %llu\n", len); fail("read first backup failed"); } lseek64(afd, __le64_to_cpu(bsb2.arraystart)*512, 0); - if (read(afd, abuf, len) != len) + if ((unsigned long long)read(afd, abuf, len) != len) fail("read first from array failed"); if (memcmp(bbuf, abuf, len) != 0) { + #if 0 int i; printf("offset=%llu len=%llu\n", - __le64_to_cpu(bsb2.arraystart)*512, len); + (unsigned long long)__le64_to_cpu(bsb2.arraystart)*512, len); for (i=0; icomponent_size - stripes * chunk/512; + if (posix_memalign((void**)&buf, 4096, disks * chunk)) + return 0; + start = sra->component_size - stripes * (chunk/512); sysfs_set_num(sra, NULL, "sync_max", start); sysfs_set_str(sra, NULL, "sync_action", "reshape"); sysfs_set_num(sra, NULL, "suspend_lo", 0); sysfs_set_num(sra, NULL, "suspend_hi", 0); - rv = wait_backup(sra, 0, start - stripes * chunk/512, stripes * chunk/512, + rv = wait_backup(sra, 0, start - stripes * (chunk/512), stripes * (chunk/512), dests, destfd, destoffsets, 0); if (rv < 0) return 0; @@ -1464,11 +1590,11 @@ static int child_shrink(int afd, struct mdinfo *sra, unsigned long stripes, fds, offsets, disks, chunk, level, layout, dests, destfd, destoffsets, - 0, buf); + 0, °raded, buf); validate(afd, destfd[0], destoffsets[0]); - wait_backup(sra, start, stripes*chunk/512, 0, + wait_backup(sra, start, stripes*(chunk/512), 0, dests, destfd, destoffsets, 0); - sysfs_set_num(sra, NULL, "suspend_lo", (stripes * chunk/512) * data); + sysfs_set_num(sra, NULL, "suspend_lo", (stripes * (chunk/512)) * data); free(buf); /* FIXME this should probably be numeric */ sysfs_set_str(sra, NULL, "sync_max", "max"); @@ -1486,9 +1612,11 @@ static int child_same_size(int afd, struct mdinfo *sra, unsigned long stripes, int part; char *buf; unsigned long long speed; + int degraded = 0; - posix_memalign((void**)&buf, 4096, disks * chunk); + if (posix_memalign((void**)&buf, 4096, disks * chunk)) + return 0; sysfs_set_num(sra, NULL, "suspend_lo", 0); sysfs_set_num(sra, NULL, "suspend_hi", 0); @@ -1500,44 +1628,44 @@ static int child_same_size(int afd, struct mdinfo *sra, unsigned long stripes, fds, offsets, disks, chunk, level, layout, dests, destfd, destoffsets, - 0, buf); - grow_backup(sra, (start + stripes) * chunk/512, stripes, + 0, °raded, buf); + grow_backup(sra, (start + stripes) * (chunk/512), stripes, fds, offsets, disks, chunk, level, layout, dests, destfd, destoffsets, - 1, buf); + 1, °raded, buf); validate(afd, destfd[0], destoffsets[0]); part = 0; start += stripes * 2; /* where to read next */ size = sra->component_size / (chunk/512); while (start < size) { - if (wait_backup(sra, (start-stripes*2)*chunk/512, - stripes*chunk/512, 0, + if (wait_backup(sra, (start-stripes*2)*(chunk/512), + stripes*(chunk/512), 0, dests, destfd, destoffsets, part) < 0) return 0; - sysfs_set_num(sra, NULL, "suspend_lo", start*chunk/512 * data); + sysfs_set_num(sra, NULL, "suspend_lo", start*(chunk/512) * data); if (start + stripes > size) tailstripes = (size - start); - grow_backup(sra, start*chunk/512, tailstripes, + grow_backup(sra, start*(chunk/512), tailstripes, fds, offsets, disks, chunk, level, layout, dests, destfd, destoffsets, - part, buf); + part, °raded, buf); start += stripes; part = 1 - part; validate(afd, destfd[0], destoffsets[0]); } - if (wait_backup(sra, (start-stripes*2) * chunk/512, stripes * chunk/512, 0, + if (wait_backup(sra, (start-stripes*2) * (chunk/512), stripes * (chunk/512), 0, dests, destfd, destoffsets, part) < 0) return 0; - sysfs_set_num(sra, NULL, "suspend_lo", ((start-stripes)*chunk/512) * data); - wait_backup(sra, (start-stripes) * chunk/512, tailstripes * chunk/512, 0, + sysfs_set_num(sra, NULL, "suspend_lo", ((start-stripes)*(chunk/512)) * data); + wait_backup(sra, (start-stripes) * (chunk/512), tailstripes * (chunk/512), 0, dests, destfd, destoffsets, 1-part); - sysfs_set_num(sra, NULL, "suspend_lo", (size*chunk/512) * data); + sysfs_set_num(sra, NULL, "suspend_lo", (size*(chunk/512)) * data); sysfs_set_num(sra, NULL, "sync_speed_min", speed); free(buf); return 1; @@ -1574,7 +1702,6 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt old_disks = cnt; for (i=old_disks-(backup_file?1:0); iarray.utime > __le64_to_cpu(bsb.mtime) + 10*60 || - info->array.utime < __le64_to_cpu(bsb.mtime) - 10*60) { - if (verbose) - fprintf(stderr, Name ": too-old timestamp on backup-metadata on %s\n", devname); - continue; /* time stamp is too bad */ + /* array utime and backup-mtime should be updated at much the same time, but it seems that + * sometimes they aren't... So allow considerable flexability in matching, and allow + * this test to be overridden by an environment variable. + */ + if (info->array.utime > (int)__le64_to_cpu(bsb.mtime) + 2*60*60 || + info->array.utime < (int)__le64_to_cpu(bsb.mtime) - 10*60) { + if (check_env("MDADM_GROW_ALLOW_OLD")) { + fprintf(stderr, Name ": accepting backup with timestamp %lu " + "for array with timestamp %lu\n", + (unsigned long)__le64_to_cpu(bsb.mtime), + (unsigned long)info->array.utime); + } else { + if (verbose) + fprintf(stderr, Name ": too-old timestamp on " + "backup-metadata on %s\n", devname); + continue; /* time stamp is too bad */ + } } if (bsb.magic[15] == '1') { @@ -1690,13 +1829,13 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt } /* There should be a duplicate backup superblock 4k before here */ if (lseek64(fd, -4096, 1) < 0 || - read(fd, buf, 4096) != 4096) + read(fd, &bsb2, sizeof(bsb2)) != sizeof(bsb2)) goto second_fail; /* Cannot find leading superblock */ if (bsb.magic[15] == '1') bsbsize = offsetof(struct mdp_backup_super, pad1); else bsbsize = offsetof(struct mdp_backup_super, pad); - if (memcmp(buf, &bsb, bsbsize) != 0) + if (memcmp(&bsb2, &bsb, bsbsize) != 0) goto second_fail; /* Cannot find leading superblock */ /* Now need the data offsets for all devices. */ @@ -1709,7 +1848,7 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt continue; st->ss->getinfo_super(st, &dinfo); st->ss->free_super(st); - offsets[j] = dinfo.data_offset; + offsets[j] = dinfo.data_offset * 512; } printf(Name ": restoring critical section\n"); @@ -1719,7 +1858,7 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt info->new_level, info->new_layout, fd, __le64_to_cpu(bsb.devstart)*512, - __le64_to_cpu(bsb.arraystart), + __le64_to_cpu(bsb.arraystart)*512, __le64_to_cpu(bsb.length)*512)) { /* didn't succeed, so giveup */ if (verbose) @@ -1736,7 +1875,7 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt info->new_layout, fd, __le64_to_cpu(bsb.devstart)*512 + __le64_to_cpu(bsb.devstart2)*512, - __le64_to_cpu(bsb.arraystart2), + __le64_to_cpu(bsb.arraystart2)*512, __le64_to_cpu(bsb.length2)*512)) { /* didn't succeed, so giveup */ if (verbose) @@ -1843,6 +1982,7 @@ int Grow_continue(int mdfd, struct supertype *st, struct mdinfo *info, int d; struct mdinfo *sra, *sd; int rv; + unsigned long cache; int done = 0; err = sysfs_set_str(info, NULL, "array_state", "readonly"); @@ -1852,7 +1992,13 @@ int Grow_continue(int mdfd, struct supertype *st, struct mdinfo *info, /* make sure reshape doesn't progress until we are ready */ sysfs_set_str(info, NULL, "sync_max", "0"); sysfs_set_str(info, NULL, "array_state", "active"); /* FIXME or clean */ - + + sra = sysfs_read(-1, devname2devnum(info->sys_name), + GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE| + GET_CACHE); + if (!sra) + return 1; + /* ndisks is not growing, so raid_disks is old and +delta is new */ odisks = info->array.raid_disks; ndisks = odisks + info->delta_disks; @@ -1865,9 +2011,8 @@ int Grow_continue(int mdfd, struct supertype *st, struct mdinfo *info, ochunk = info->array.chunk_size; nchunk = info->new_chunk; - - a = ochunk/512 * odata; - b = nchunk/512 * ndata; + a = (ochunk/512) * odata; + b = (nchunk/512) * ndata; /* Find GCD */ while (a != b) { if (a < b) @@ -1876,12 +2021,25 @@ int Grow_continue(int mdfd, struct supertype *st, struct mdinfo *info, a -= b; } /* LCM == product / GCD */ - blocks = ochunk/512 * nchunk/512 * odata * ndata / a; + blocks = (ochunk/512) * (nchunk/512) * odata * ndata / a; if (ndata == odata) - blocks *= 16; + while (blocks * 32 < sra->component_size && + blocks < 16*1024*2) + blocks *= 2; stripes = blocks / (info->array.chunk_size/512) / odata; + /* check that the internal stripe cache is + * large enough, or it won't work. + */ + cache = (nchunk < ochunk) ? ochunk : nchunk; + cache = cache * 4 / 4096; + if (cache < blocks / 8 / odisks + 16) + /* Make it big enough to hold 'blocks' */ + cache = blocks / 8 / odisks + 16; + if (sra->cache_size < cache) + sysfs_set_num(sra, NULL, "stripe_cache_size", + cache+1); memset(&bsb, 0, 512); memcpy(bsb.magic, "md_backup_data-1", 16); @@ -1897,10 +2055,6 @@ int Grow_continue(int mdfd, struct supertype *st, struct mdinfo *info, for (d=0; dsys_name), - GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE| - GET_CACHE); - for (sd = sra->devs; sd; sd = sd->next) { if (sd->disk.state & (1<