/*
* mdadm - manage Linux "md" devices aka RAID arrays.
*
- * Copyright (C) 2001-2006 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
*
*
* This program is free software; you can redistribute it and/or modify
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
* Author: Neil Brown
- * Email: <neilb@cse.unsw.edu.au>
- * Paper: Neil Brown
- * School of Computer Science and Engineering
- * The University of New South Wales
- * Sydney, 2052
- * Australia
+ * Email: <neilb@suse.de>
*/
#include "mdadm.h"
#include "dlink.h"
#include "md_u.h"
#include "md_p.h"
+#ifndef offsetof /* fallback: defined here only if offsetof is not already a macro */
+#define offsetof(t,f) ((size_t)&(((t*)0)->f))
+#endif
+
int Grow_Add_device(char *devname, int fd, char *newdev)
{
/* Add a device to an active array.
return 1;
} else if (strcmp(file, "internal") == 0) {
int d;
+ if (st->ss->add_internal_bitmap == NULL) {
+ fprintf(stderr, Name ": Internal bitmaps not supported "
+ "with %s metadata\n", st->ss->name);
+ return 1;
+ }
for (d=0; d< st->max_devs; d++) {
mdu_disk_info_t disk;
char *dv;
/*
* When reshaping an array we might need to backup some data.
* This is written to all spares with a 'super_block' describing it.
- * The superblock goes 1K form the end of the used space on the
+ * The superblock goes 4K from the end of the used space on the
* device.
* It if written after the backup is complete.
* It has the following structure.
*/
-struct mdp_backup_super {
+static struct mdp_backup_super {
char magic[16]; /* md_backup_data-1 or -2 */
__u8 set_uuid[16];
__u64 mtime;
__u64 length2;
__u32 sb_csum2; /* csum of preceeding bytes. */
__u8 pad[512-68-32];
-} __attribute__((aligned(512))) bsb;
+} __attribute__((aligned(512))) bsb, bsb2;
int bsb_csum(char *buf, int len)
{
int dests, int *destfd, unsigned long long *destoffsets);
static int child_same_size(int afd, struct mdinfo *sra, unsigned long blocks,
int *fds, unsigned long long *offsets,
+ unsigned long long start,
int disks, int chunk, int level, int layout, int data,
int dests, int *destfd, unsigned long long *destoffsets);
devname);
return 1;
}
+
+ if (size >= 0 &&
+ (chunksize || level!= UnSet || layout_str || raid_disks)) {
+ fprintf(stderr, Name ": cannot change component size at the same time "
+ "as other changes.\n"
+ " Change size first, then check data is intact before "
+ "making other changes.\n");
+ return 1;
+ }
+
+ if (raid_disks && raid_disks < array.raid_disks && array.level > 1 &&
+ get_linux_version() < 2006032 &&
+ !check_env("MDADM_FORCE_FEWER")) {
+ fprintf(stderr, Name ": reducing the number of devices is not safe before Linux 2.6.32\n"
+ " Please use a newer kernel\n");
+ return 1;
+ }
sra = sysfs_read(fd, 0, GET_LEVEL);
frozen = freeze_array(sra);
if (frozen < 0) {
} else
rv = ioctl(fd, SET_ARRAY_INFO, &array);
if (rv != 0) {
+ int err = errno;
fprintf(stderr, Name ": Cannot set device size for %s: %s\n",
- devname, strerror(errno));
+ devname, strerror(err));
+ if (err == EBUSY &&
+ (array.state & (1<<MD_SB_BITMAP_PRESENT)))
+ fprintf(stderr, " Bitmap must be removed before size can be changed\n");
rv = 1;
goto release;
}
ioctl(fd, GET_ARRAY_INFO, &array);
+ size = get_component_size(fd)/2;
+ if (size == 0)
+ size = array.size;
if (!quiet)
- fprintf(stderr, Name ": component size of %s has been set to %dK\n",
- devname, array.size);
+ fprintf(stderr, Name ": component size of %s has been set to %lluK\n",
+ devname, size);
changed = 1;
+ } else {
+ size = get_component_size(fd)/2;
+ if (size == 0)
+ size = array.size;
}
/* ======= set level =========== */
}
}
if (raid_disks)
- /* The find raid6->raid5 conversion
+ /* The final raid6->raid5 conversion
* will reduce the number of disks,
* so now we need to aim higher
*/
layout_str = "parity-last";
} else {
c = map_num(pers, level);
- if (c == NULL)
- return 1;/* not possible */
+ if (c == NULL) {
+ rv = 1;/* not possible */
+ goto release;
+ }
err = sysfs_set_str(sra, NULL, "level", c);
if (err) {
+ err = errno;
fprintf(stderr, Name ": %s: could not set level to %s\n",
devname, c);
+ if (err == EBUSY &&
+ (array.state & (1<<MD_SB_BITMAP_PRESENT)))
+ fprintf(stderr, " Bitmap must be removed before level can be changed\n");
rv = 1;
goto release;
}
c = map_num(pers, level);
if (c) {
rv = sysfs_set_str(sra, NULL, "level", c);
- if (rv)
+ if (rv) {
+ int err = errno;
fprintf(stderr, Name ": %s: could not set level to %s\n",
devname, c);
+ if (err == EBUSY &&
+ (array.state & (1<<MD_SB_BITMAP_PRESENT)))
+ fprintf(stderr, " Bitmap must be removed before level can be changed\n");
+ }
}
} else if (!changed && !quiet)
fprintf(stderr, Name ": %s: no change requested\n",
if (chunksize) {
nchunk = chunksize * 1024;
- if (array.size % chunksize) {
- fprintf(stderr, Name ": component size %dK is not"
+ if (size % chunksize) {
+ fprintf(stderr, Name ": component size %lluK is not"
" a multiple of chunksize %dK\n",
- array.size, chunksize);
+ size, chunksize);
break;
}
}
if (nlayout == UnSet) {
fprintf(stderr, Name ": layout %s not understood for raid5.\n",
layout_str);
- return 1;
+ rv = 1;
+ goto release;
}
break;
if (nlayout == UnSet) {
fprintf(stderr, Name ": layout %s not understood for raid6.\n",
layout_str);
- return 1;
+ rv = 1;
+ goto release;
}
break;
}
ndata--;
}
+ if (odata == ndata &&
+ get_linux_version() < 2006032) {
+ fprintf(stderr, Name ": in-place reshape is not safe before 2.6.32, sorry.\n");
+ break;
+ }
+
/* Check that we can hold all the data */
- size = ndata * array.size;
get_dev_size(fd, NULL, &array_size);
- if (size < (array_size/1024)) {
+ if (ndata * size < (array_size/1024)) {
fprintf(stderr, Name ": this change will reduce the size of the array.\n"
" use --grow --array-size first to truncate array.\n"
" e.g. mdadm --grow %s --array-size %llu\n",
- devname, size);
+ devname, ndata * size);
rv = 1;
break;
}
/* LCM == product / GCD */
blocks = ochunk/512 * nchunk/512 * odata * ndata / a;
- if (ndata == odata)
- blocks *= 16;
- else
- fprintf(stderr, Name ": Need to backup %luK of critical "
- "section..\n", blocks/2);
-
sysfs_free(sra);
sra = sysfs_read(fd, 0,
GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE|
GET_CACHE);
+
+ if (ndata == odata) {
+ /* Make 'blocks' bigger for better throughput, but
+ * not so big that we reject it below.
+ * Try for 16 megabytes
+ */
+ while (blocks * 32 < sra->component_size &&
+ blocks < 16*1024*2)
+ blocks *= 2;
+ } else
+ fprintf(stderr, Name ": Need to backup %luK of critical "
+ "section..\n", blocks/2);
+
if (!sra) {
fprintf(stderr, Name ": %s: Cannot get array details from sysfs\n",
devname);
char *dn = map_dev(sd->disk.major,
sd->disk.minor, 1);
fdlist[d] = dev_open(dn, O_RDWR);
- offsets[d] = (sra->component_size - blocks - 8)*512;
+ offsets[d] = (sd->data_offset + sra->component_size - blocks - 8)*512;
if (fdlist[d]<0) {
fprintf(stderr, Name ": %s: cannot open component %s\n",
devname, dn?dn:"-unknown");
cache = (nchunk < ochunk) ? ochunk : nchunk;
cache = cache * 4 / 4096;
+ if (cache < blocks / 8 / odisks + 16)
+ /* Make it big enough to hold 'blocks' */
+ cache = blocks / 8 / odisks + 16;
if (sra->cache_size < cache)
sysfs_set_num(sra, NULL, "stripe_cache_size",
cache+1);
if (ochunk == nchunk && olayout == nlayout) {
array.raid_disks = ndisks;
if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) {
+ int err = errno;
rv = 1;
fprintf(stderr, Name ": Cannot set device shape for %s: %s\n",
devname, strerror(errno));
if (ndisks < odisks &&
get_linux_version() < 2006030)
fprintf(stderr, Name ": linux 2.6.30 or later required\n");
+ if (err == EBUSY &&
+ (array.state & (1<<MD_SB_BITMAP_PRESENT)))
+ fprintf(stderr, " Bitmap must be removed before shape can be changed\n");
break;
}
/* set them all just in case some old 'new_*' value
* persists from some earlier problem
*/
+ int err = err; /* only used if rv==1, and always set if
+ * rv==1, so initialisation not needed,
+ * despite gcc warning
+ */
if (sysfs_set_num(sra, NULL, "chunk_size", nchunk) < 0)
- rv = 1;
- if (sysfs_set_num(sra, NULL, "layout", nlayout) < 0)
- rv = 1;
- if (sysfs_set_num(sra, NULL, "raid_disks", ndisks) < 0)
- rv = 1;
+ rv = 1, err = errno;
+ if (!rv && sysfs_set_num(sra, NULL, "layout", nlayout) < 0)
+ rv = 1, err = errno;
+ if (!rv && sysfs_set_num(sra, NULL, "raid_disks", ndisks) < 0)
+ rv = 1, err = errno;
if (rv) {
fprintf(stderr, Name ": Cannot set device shape for %s\n",
devname);
if (get_linux_version() < 2006030)
fprintf(stderr, Name ": linux 2.6.30 or later required\n");
+ if (err == EBUSY &&
+ (array.state & (1<<MD_SB_BITMAP_PRESENT)))
+ fprintf(stderr, " Bitmap must be removed before shape can be changed\n");
break;
}
}
else
done = child_same_size(fd, sra, stripes,
fdlist, offsets,
+ 0,
odisks, ochunk, array.level, olayout, odata,
d - odisks, fdlist+odisks, offsets+odisks);
if (backup_file && done)
*
*/
+/* FIXME return status is never checked */
int grow_backup(struct mdinfo *sra,
unsigned long long offset, /* per device */
unsigned long stripes, /* per device */
int *sources, unsigned long long *offsets,
int disks, int chunk, int level, int layout,
int dests, int *destfd, unsigned long long *destoffsets,
- int part,
+ int part, int *degraded,
char *buf)
{
/* Backup 'blocks' sectors at 'offset' on each device of the array,
int odata = disks;
int rv = 0;
int i;
+ unsigned long long new_degraded;
//printf("offset %llu\n", offset);
if (level >= 4)
odata--;
if (level == 6)
odata--;
sysfs_set_num(sra, NULL, "suspend_hi", (offset + stripes * chunk/512) * odata);
+ /* Check that array hasn't become degraded, else we might backup the wrong data */
+ sysfs_get_ll(sra, NULL, "degraded", &new_degraded);
+ if (new_degraded != *degraded) {
+ /* check each device to ensure it is still working */
+ struct mdinfo *sd;
+ for (sd = sra->devs ; sd ; sd = sd->next) {
+ if (sd->disk.state & (1<<MD_DISK_FAULTY))
+ continue;
+ if (sd->disk.state & (1<<MD_DISK_SYNC)) {
+ char sbuf[20];
+ if (sysfs_get_str(sra, sd, "state", sbuf, 20) < 0 ||
+ strstr(sbuf, "faulty") ||
+ strstr(sbuf, "in_sync") == NULL) {
+ /* this device is dead */
+ sd->disk.state = (1<<MD_DISK_FAULTY);
+ if (sd->disk.raid_disk >= 0 &&
+ sources[sd->disk.raid_disk] >= 0) {
+ close(sources[sd->disk.raid_disk]);
+ sources[sd->disk.raid_disk] = -1;
+ }
+ }
+ }
+ }
+ *degraded = new_degraded;
+ }
if (part) {
bsb.arraystart2 = __cpu_to_le64(offset * odata);
bsb.length2 = __cpu_to_le64(stripes * chunk/512 * odata);
if (rv)
return rv;
+ bsb.mtime = __cpu_to_le64(time(0));
for (i = 0; i < dests; i++) {
bsb.devstart = __cpu_to_le64(destoffsets[i]/512);
bsb.sb_csum2 = bsb_csum((char*)&bsb,
((char*)&bsb.sb_csum2)-((char*)&bsb));
- lseek64(destfd[i], destoffsets[i] - 4096, 0);
- write(destfd[i], &bsb, 512);
+ if (lseek64(destfd[i], destoffsets[i] - 4096, 0) != destoffsets[i] - 4096)
+ rv = 1;
+ rv = rv ?: write(destfd[i], &bsb, 512);
+ if (destoffsets[i] > 4096) {
+ if (lseek64(destfd[i], destoffsets[i]+stripes*chunk*odata, 0) !=
+ destoffsets[i]+stripes*chunk*odata)
+ rv = 1;
+ rv = rv ?: write(destfd[i], &bsb, 512);
+ }
fsync(destfd[i]);
}
- return 0;
+ return rv;
}
/* in 2.6.30, the value reported by sync_completed can be
* The various caller give appropriate values so that
* every works.
*/
+/* FIXME return value is often ignored */
int wait_backup(struct mdinfo *sra,
unsigned long long offset, /* per device */
unsigned long long blocks, /* per device */
int fd = sysfs_get_fd(sra, NULL, "sync_completed");
unsigned long long completed;
int i;
+ int rv;
if (fd < 0)
return -1;
bsb.arraystart = __cpu_to_le64(0);
bsb.length = __cpu_to_le64(0);
}
+ bsb.mtime = __cpu_to_le64(time(0));
+ rv = 0;
for (i = 0; i < dests; i++) {
bsb.devstart = __cpu_to_le64(destoffsets[i]/512);
bsb.sb_csum = bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum)-((char*)&bsb));
if (memcmp(bsb.magic, "md_backup_data-2", 16) == 0)
bsb.sb_csum2 = bsb_csum((char*)&bsb,
((char*)&bsb.sb_csum2)-((char*)&bsb));
- lseek64(destfd[i], destoffsets[i]-4096, 0);
- write(destfd[i], &bsb, 512);
+ if (lseek64(destfd[i], destoffsets[i]-4096, 0) !=
+ destoffsets[i]-4096)
+ rv = 1;
+ rv = rv ?: write(destfd[i], &bsb, 512);
fsync(destfd[i]);
}
- return 0;
+ return rv;
}
static void fail(char *msg)
{
- write(2, msg, strlen(msg));
- write(2, "\n", 1);
- exit(1);
+ int rv;
+ rv = write(2, msg, strlen(msg)); /* emit msg on file descriptor 2 */
+ rv |= write(2, "\n", 1); /* rv accumulates both write() results */
+ exit(rv ? 1 : 2); /* never returns; exit status is 1 if rv is non-zero, else 2 */
}
static char *abuf, *bbuf;
* This is only used for regression testing and should not
* be used while the array is active
*/
- struct mdp_backup_super bsb2;
if (afd < 0)
return;
lseek64(bfd, offset - 4096, 0);
free(abuf);
free(bbuf);
abuflen = len;
- posix_memalign((void**)&abuf, 4096, abuflen);
- posix_memalign((void**)&bbuf, 4096, abuflen);
+ if (posix_memalign((void**)&abuf, 4096, abuflen) ||
+ posix_memalign((void**)&bbuf, 4096, abuflen)) {
+ abuflen = 0;
+ /* just stop validating on mem-alloc failure */
+ return;
+ }
}
lseek64(bfd, offset, 0);
if (read(bfd, bbuf, len) != len) {
- printf("len %llu\n", len);
+ //printf("len %llu\n", len);
fail("read first backup failed");
}
lseek64(afd, __le64_to_cpu(bsb2.arraystart)*512, 0);
if (read(afd, abuf, len) != len)
fail("read first from array failed");
if (memcmp(bbuf, abuf, len) != 0) {
+ #if 0
int i;
printf("offset=%llu len=%llu\n",
- __le64_to_cpu(bsb2.arraystart)*512, len);
+ (unsigned long long)__le64_to_cpu(bsb2.arraystart)*512, len);
for (i=0; i<len; i++)
if (bbuf[i] != abuf[i]) {
printf("first diff byte %d\n", i);
break;
}
+ #endif
fail("data1 compare failed");
}
}
int dests, int *destfd, unsigned long long *destoffsets)
{
char *buf;
+ int degraded = 0;
- posix_memalign((void**)&buf, 4096, disks * chunk);
+ if (posix_memalign((void**)&buf, 4096, disks * chunk))
+ /* Don't start the 'reshape' */
+ return 0;
sysfs_set_num(sra, NULL, "suspend_hi", 0);
sysfs_set_num(sra, NULL, "suspend_lo", 0);
grow_backup(sra, 0, stripes,
fds, offsets, disks, chunk, level, layout,
dests, destfd, destoffsets,
- 0, buf);
+ 0, &degraded, buf);
validate(afd, destfd[0], destoffsets[0]);
- if (wait_backup(sra, 0, stripes * chunk / 512, stripes * chunk / 512,
- dests, destfd, destoffsets,
- 0) < 0)
- return 0;
+ wait_backup(sra, 0, stripes * chunk / 512, stripes * chunk / 512,
+ dests, destfd, destoffsets,
+ 0);
sysfs_set_num(sra, NULL, "suspend_lo", (stripes * chunk/512) * data);
free(buf);
/* FIXME this should probably be numeric */
char *buf;
unsigned long long start;
int rv;
+ int degraded = 0;
- posix_memalign((void**)&buf, 4096, disks * chunk);
+ if (posix_memalign((void**)&buf, 4096, disks * chunk))
+ return 0;
start = sra->component_size - stripes * chunk/512;
sysfs_set_num(sra, NULL, "sync_max", start);
sysfs_set_str(sra, NULL, "sync_action", "reshape");
fds, offsets,
disks, chunk, level, layout,
dests, destfd, destoffsets,
- 0, buf);
+ 0, &degraded, buf);
validate(afd, destfd[0], destoffsets[0]);
- rv = wait_backup(sra, start, stripes*chunk/512, 0,
- dests, destfd, destoffsets, 0);
- if (rv < 0)
- return 0;
+ wait_backup(sra, start, stripes*chunk/512, 0,
+ dests, destfd, destoffsets, 0);
sysfs_set_num(sra, NULL, "suspend_lo", (stripes * chunk/512) * data);
free(buf);
/* FIXME this should probably be numeric */
static int child_same_size(int afd, struct mdinfo *sra, unsigned long stripes,
int *fds, unsigned long long *offsets,
+ unsigned long long start,
int disks, int chunk, int level, int layout, int data,
int dests, int *destfd, unsigned long long *destoffsets)
{
- unsigned long long start, size;
+ unsigned long long size;
unsigned long tailstripes = stripes;
int part;
char *buf;
unsigned long long speed;
+ int degraded = 0;
- posix_memalign((void**)&buf, 4096, disks * chunk);
+ if (posix_memalign((void**)&buf, 4096, disks * chunk))
+ return 0;
sysfs_set_num(sra, NULL, "suspend_lo", 0);
sysfs_set_num(sra, NULL, "suspend_hi", 0);
sysfs_get_ll(sra, NULL, "sync_speed_min", &speed);
sysfs_set_num(sra, NULL, "sync_speed_min", 200000);
- grow_backup(sra, 0, stripes,
+ grow_backup(sra, start, stripes,
fds, offsets,
disks, chunk, level, layout,
dests, destfd, destoffsets,
- 0, buf);
- grow_backup(sra, stripes * chunk/512, stripes,
+ 0, &degraded, buf);
+ grow_backup(sra, (start + stripes) * chunk/512, stripes,
fds, offsets,
disks, chunk, level, layout,
dests, destfd, destoffsets,
- 1, buf);
+ 1, &degraded, buf);
validate(afd, destfd[0], destoffsets[0]);
part = 0;
- start = stripes * 2; /* where to read next */
+ start += stripes * 2; /* where to read next */
size = sra->component_size / (chunk/512);
while (start < size) {
if (wait_backup(sra, (start-stripes*2)*chunk/512,
fds, offsets,
disks, chunk, level, layout,
dests, destfd, destoffsets,
- part, buf);
+ part, &degraded, buf);
start += stripes;
part = 1 - part;
validate(afd, destfd[0], destoffsets[0]);
part) < 0)
return 0;
sysfs_set_num(sra, NULL, "suspend_lo", ((start-stripes)*chunk/512) * data);
- if (wait_backup(sra, (start-stripes) * chunk/512, tailstripes * chunk/512, 0,
- dests, destfd, destoffsets,
- 1-part) < 0)
- return 0;
+ wait_backup(sra, (start-stripes) * chunk/512, tailstripes * chunk/512, 0,
+ dests, destfd, destoffsets,
+ 1-part);
sysfs_set_num(sra, NULL, "suspend_lo", (size*chunk/512) * data);
sysfs_set_num(sra, NULL, "sync_speed_min", speed);
free(buf);
* write that data into the array and update the super blocks with
* the new reshape_progress
*/
-int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt, char *backup_file)
+int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt,
+ char *backup_file, int verbose)
{
int i, j;
int old_disks;
unsigned long long *offsets;
- unsigned long long nstripe, ostripe, last_block;
+ unsigned long long nstripe, ostripe;
int ndata, odata;
- if (info->delta_disks < 0)
- return 1; /* cannot handle a shrink */
- if (info->new_level != info->array.level ||
- info->new_layout != info->array.layout ||
- info->new_chunk != info->array.chunk_size)
- return 1; /* Can only handle change in disks */
+ if (info->new_level != info->array.level)
+ return 1; /* Cannot handle level changes (they are instantaneous) */
+
+ odata = info->array.raid_disks - info->delta_disks - 1;
+ if (info->array.level == 6) odata--; /* number of data disks */
+ ndata = info->array.raid_disks - 1;
+ if (info->new_level == 6) ndata--;
old_disks = info->array.raid_disks - info->delta_disks;
+ if (info->delta_disks <= 0)
+ /* Didn't grow, so the backup file must have
+ * been used
+ */
+ old_disks = cnt;
for (i=old_disks-(backup_file?1:0); i<cnt; i++) {
struct mdinfo dinfo;
- char buf[4096];
int fd;
+ int bsbsize;
+ char *devname, namebuf[20];
/* This was a spare and may have some saved data on it.
* Load the superblock, find and load the
*/
if (i == old_disks-1) {
fd = open(backup_file, O_RDONLY);
- if (fd<0)
+ if (fd<0) {
+ fprintf(stderr, Name ": backup file %s inaccessible: %s\n",
+ backup_file, strerror(errno));
continue;
+ }
+ devname = backup_file;
} else {
fd = fdlist[i];
if (fd < 0)
if (lseek64(fd,
(dinfo.data_offset + dinfo.component_size - 8) <<9,
- 0) < 0)
+ 0) < 0) {
+ fprintf(stderr, Name ": Cannot seek on device %d\n", i);
continue; /* Cannot seek */
+ }
+ sprintf(namebuf, "device-%d", i);
+ devname = namebuf;
}
- if (read(fd, &bsb, sizeof(bsb)) != sizeof(bsb))
+ if (read(fd, &bsb, sizeof(bsb)) != sizeof(bsb)) {
+ if (verbose)
+ fprintf(stderr, Name ": Cannot read from %s\n", devname);
continue; /* Cannot read */
- if (memcmp(bsb.magic, "md_backup_data-1", 16) != 0)
+ }
+ if (memcmp(bsb.magic, "md_backup_data-1", 16) != 0 &&
+ memcmp(bsb.magic, "md_backup_data-2", 16) != 0) {
+ if (verbose)
+ fprintf(stderr, Name ": No backup metadata on %s\n", devname);
continue;
- if (bsb.sb_csum != bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum)-((char*)&bsb)))
+ }
+ if (bsb.sb_csum != bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum)-((char*)&bsb))) {
+ if (verbose)
+ fprintf(stderr, Name ": Bad backup-metadata checksum on %s\n", devname);
continue; /* bad checksum */
- if (memcmp(bsb.set_uuid,info->uuid, 16) != 0)
+ }
+ if (memcmp(bsb.magic, "md_backup_data-2", 16) == 0 &&
+ bsb.sb_csum2 != bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum2)-((char*)&bsb))) {
+ if (verbose)
+ fprintf(stderr, Name ": Bad backup-metadata checksum2 on %s\n", devname);
+ continue; /* Bad second checksum */
+ }
+ if (memcmp(bsb.set_uuid,info->uuid, 16) != 0) {
+ if (verbose)
+ fprintf(stderr, Name ": Wrong uuid on backup-metadata on %s\n", devname);
continue; /* Wrong uuid */
+ }
- if (info->array.utime > __le64_to_cpu(bsb.mtime) + 3600 ||
- info->array.utime < __le64_to_cpu(bsb.mtime))
- continue; /* time stamp is too bad */
-
- if (__le64_to_cpu(bsb.arraystart) != 0)
- continue; /* Can only handle backup from start of array */
- if (__le64_to_cpu(bsb.length) <
- info->reshape_progress)
- continue; /* No new data here */
+ /* array utime and backup-mtime should be updated at much the same time, but it seems that
+ * sometimes they aren't... So allow considerable flexibility in matching, and allow
+ * this test to be overridden by an environment variable.
+ */
+ if (info->array.utime > __le64_to_cpu(bsb.mtime) + 2*60*60 ||
+ info->array.utime < __le64_to_cpu(bsb.mtime) - 10*60) {
+ if (check_env("MDADM_GROW_ALLOW_OLD")) {
+ fprintf(stderr, Name ": accepting backup with timestamp %lu "
+ "for array with timestamp %lu\n",
+ (unsigned long)__le64_to_cpu(bsb.mtime),
+ (unsigned long)info->array.utime);
+ } else {
+ if (verbose)
+ fprintf(stderr, Name ": too-old timestamp on "
+ "backup-metadata on %s\n", devname);
+ continue; /* time stamp is too bad */
+ }
+ }
- if (lseek64(fd, __le64_to_cpu(bsb.devstart)*512, 0)< 0)
+ if (bsb.magic[15] == '1') {
+ if (info->delta_disks >= 0) {
+ /* reshape_progress is increasing */
+ if (__le64_to_cpu(bsb.arraystart) + __le64_to_cpu(bsb.length) <
+ info->reshape_progress) {
+ nonew:
+ if (verbose)
+ fprintf(stderr, Name ": backup-metadata found on %s but is not needed\n", devname);
+ continue; /* No new data here */
+ }
+ } else {
+ /* reshape_progress is decreasing */
+ if (__le64_to_cpu(bsb.arraystart) >=
+ info->reshape_progress)
+ goto nonew; /* No new data here */
+ }
+ } else {
+ if (info->delta_disks >= 0) {
+ /* reshape_progress is increasing */
+ if (__le64_to_cpu(bsb.arraystart) + __le64_to_cpu(bsb.length) <
+ info->reshape_progress &&
+ __le64_to_cpu(bsb.arraystart2) + __le64_to_cpu(bsb.length2) <
+ info->reshape_progress)
+ goto nonew; /* No new data here */
+ } else {
+ /* reshape_progress is decreasing */
+ if (__le64_to_cpu(bsb.arraystart) >=
+ info->reshape_progress &&
+ __le64_to_cpu(bsb.arraystart2) >=
+ info->reshape_progress)
+ goto nonew; /* No new data here */
+ }
+ }
+ if (lseek64(fd, __le64_to_cpu(bsb.devstart)*512, 0)< 0) {
+ second_fail:
+ if (verbose)
+ fprintf(stderr, Name ": Failed to verify secondary backup-metadata block on %s\n",
+ devname);
continue; /* Cannot seek */
+ }
/* There should be a duplicate backup superblock 4k before here */
if (lseek64(fd, -4096, 1) < 0 ||
- read(fd, buf, 4096) != 4096 ||
- memcmp(buf, &bsb, sizeof(bsb)) != 0)
- continue; /* Cannot find leading superblock */
+ read(fd, &bsb2, 4096) != 4096)
+ goto second_fail; /* Cannot find leading superblock */
+ if (bsb.magic[15] == '1')
+ bsbsize = offsetof(struct mdp_backup_super, pad1);
+ else
+ bsbsize = offsetof(struct mdp_backup_super, pad);
+ if (memcmp(&bsb2, &bsb, bsbsize) != 0)
+ goto second_fail; /* Cannot find leading superblock */
/* Now need the data offsets for all devices. */
offsets = malloc(sizeof(*offsets)*info->array.raid_disks);
continue;
st->ss->getinfo_super(st, &dinfo);
st->ss->free_super(st);
- offsets[j] = dinfo.data_offset;
+ offsets[j] = dinfo.data_offset * 512;
}
printf(Name ": restoring critical section\n");
info->new_level,
info->new_layout,
fd, __le64_to_cpu(bsb.devstart)*512,
- 0, __le64_to_cpu(bsb.length)*512)) {
+ __le64_to_cpu(bsb.arraystart)*512,
+ __le64_to_cpu(bsb.length)*512)) {
/* didn't succeed, so giveup */
+ if (verbose)
+ fprintf(stderr, Name ": Error restoring backup from %s\n",
+ devname);
+ return 1;
+ }
+
+ if (bsb.magic[15] == '2' &&
+ restore_stripes(fdlist, offsets,
+ info->array.raid_disks,
+ info->new_chunk,
+ info->new_level,
+ info->new_layout,
+ fd, __le64_to_cpu(bsb.devstart)*512 +
+ __le64_to_cpu(bsb.devstart2)*512,
+ __le64_to_cpu(bsb.arraystart2)*512,
+ __le64_to_cpu(bsb.length2)*512)) {
+ /* didn't succeed, so giveup */
+ if (verbose)
+ fprintf(stderr, Name ": Error restoring second backup from %s\n",
+ devname);
return 1;
}
+
/* Ok, so the data is restored. Let's update those superblocks. */
+ if (info->delta_disks >= 0) {
+ info->reshape_progress = __le64_to_cpu(bsb.arraystart) +
+ __le64_to_cpu(bsb.length);
+ if (bsb.magic[15] == '2') {
+ unsigned long long p2 = __le64_to_cpu(bsb.arraystart2) +
+ __le64_to_cpu(bsb.length2);
+ if (p2 > info->reshape_progress)
+ info->reshape_progress = p2;
+ }
+ } else {
+ info->reshape_progress = __le64_to_cpu(bsb.arraystart);
+ if (bsb.magic[15] == '2') {
+ unsigned long long p2 = __le64_to_cpu(bsb.arraystart2);
+ if (p2 < info->reshape_progress)
+ info->reshape_progress = p2;
+ }
+ }
for (j=0; j<info->array.raid_disks; j++) {
if (fdlist[j] < 0) continue;
if (st->ss->load_super(st, fdlist[j], NULL))
continue;
st->ss->getinfo_super(st, &dinfo);
- dinfo.reshape_progress = __le64_to_cpu(bsb.length);
+ dinfo.reshape_progress = info->reshape_progress;
st->ss->update_super(st, &dinfo,
"_reshape_progress",
NULL,0, 0, NULL);
st->ss->store_super(st, fdlist[j]);
st->ss->free_super(st);
}
-
- /* And we are done! */
return 0;
}
/* Didn't find any backup data, try to see if any
* was needed.
*/
- nstripe = ostripe = 0;
- odata = info->array.raid_disks - info->delta_disks - 1;
- if (info->array.level == 6) odata--; /* number of data disks */
- ndata = info->array.raid_disks - 1;
- if (info->new_level == 6) ndata--;
- last_block = 0;
- while (nstripe >= ostripe) {
- nstripe += info->new_chunk / 512;
- last_block = nstripe * ndata;
- ostripe = last_block / odata / (info->array.chunk_size/512) *
- (info->array.chunk_size/512);
+ if (info->delta_disks < 0) {
+ /* When shrinking, the critical section is at the end.
+ * So see if we are before the critical section.
+ */
+ unsigned long long first_block;
+ nstripe = ostripe = 0;
+ first_block = 0;
+ while (ostripe >= nstripe) {
+ ostripe += info->array.chunk_size / 512;
+ first_block = ostripe * odata;
+ nstripe = first_block / ndata / (info->new_chunk/512) *
+ (info->new_chunk/512);
+ }
+
+ if (info->reshape_progress >= first_block)
+ return 0;
}
+ if (info->delta_disks > 0) {
+ /* See if we are beyond the critical section. */
+ unsigned long long last_block;
+ nstripe = ostripe = 0;
+ last_block = 0;
+ while (nstripe >= ostripe) {
+ nstripe += info->new_chunk / 512;
+ last_block = nstripe * ndata;
+ ostripe = last_block / odata / (info->array.chunk_size/512) *
+ (info->array.chunk_size/512);
+ }
- if (info->reshape_progress >= last_block)
- return 0;
+ if (info->reshape_progress >= last_block)
+ return 0;
+ }
/* needed to recover critical section! */
+ if (verbose)
+ fprintf(stderr, Name ": Failed to find backup of critical section\n");
return 1;
}
+
+/*
+ * Grow_continue - resume monitoring of an interrupted reshape.
+ *
+ * mdfd:        open descriptor on the array; closed in the forked child.
+ * st:          metadata handler (not used by this function).
+ * info:        array description; raid_disks, delta_disks, chunk sizes,
+ *              level, layout, uuid, sys_name and reshape_progress are read.
+ * backup_file: path of the backup file, or NULL.
+ *
+ * Returns the error from sysfs_set_str() if the array cannot be made
+ * read-only, 1 if setup fails (sysfs details unavailable, out of memory,
+ * a component cannot be opened, fork fails), and 0 otherwise.
+ */
+int Grow_continue(int mdfd, struct supertype *st, struct mdinfo *info,
+		  char *backup_file)
+{
+	/* Array is assembled and ready to be started, but
+	 * monitoring is probably required.
+	 * So:
+	 *   - start read-only
+	 *   - set upper bound for resync
+	 *   - initialise the 'suspend' boundaries
+	 *   - switch to read-write
+	 *   - fork and continue monitoring
+	 */
+	int err;
+	int backup_list[1];
+	unsigned long long backup_offsets[1];
+	int odisks, ndisks, ochunk, nchunk, odata, ndata;
+	unsigned long a, b, blocks, stripes;
+	int backup_fd = -1;
+	int *fds = NULL;
+	unsigned long long *offsets = NULL;
+	int d;
+	struct mdinfo *sra = NULL, *sd;
+	int rv = 0;
+	int cache;
+	int done = 0;
+
+	err = sysfs_set_str(info, NULL, "array_state", "readonly");
+	if (err)
+		return err;
+
+	/* make sure reshape doesn't progress until we are ready */
+	sysfs_set_str(info, NULL, "sync_max", "0");
+	sysfs_set_str(info, NULL, "array_state", "active"); /* FIXME or clean */
+
+	/* ndisks is not growing, so raid_disks is the old count and
+	 * raid_disks + delta_disks is the new count
+	 */
+	odisks = info->array.raid_disks;
+	ndisks = odisks + info->delta_disks;
+	odata = odisks - 1;
+	ndata = ndisks - 1;
+	if (info->array.level == 6) {
+		odata--;
+		ndata--;
+	}
+	ochunk = info->array.chunk_size;
+	nchunk = info->new_chunk;
+
+	a = ochunk/512 * odata;
+	b = nchunk/512 * ndata;
+	/* Find GCD */
+	while (a != b) {
+		if (a < b)
+			b -= a;
+		if (b < a)
+			a -= b;
+	}
+	/* LCM == product / GCD */
+	blocks = ochunk/512 * nchunk/512 * odata * ndata / a;
+
+	sra = sysfs_read(-1, devname2devnum(info->sys_name),
+			 GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE|
+			 GET_CACHE);
+	if (!sra) {
+		/* everything below dereferences sra, so give up cleanly */
+		fprintf(stderr, Name ": %s: Cannot get array details from sysfs\n",
+			info->sys_name);
+		rv = 1;
+		goto release;
+	}
+
+	if (ndata == odata)
+		while (blocks * 32 < sra->component_size &&
+		       blocks < 16*1024*2)
+			blocks *= 2;
+	stripes = blocks / (info->array.chunk_size/512) / odata;
+
+	/* check that the internal stripe cache is
+	 * large enough, or it won't work.
+	 */
+	cache = (nchunk < ochunk) ? ochunk : nchunk;
+	cache = cache * 4 / 4096;
+	if (cache < blocks / 8 / odisks + 16)
+		/* Make it big enough to hold 'blocks' */
+		cache = blocks / 8 / odisks + 16;
+	if (sra->cache_size < cache)
+		sysfs_set_num(sra, NULL, "stripe_cache_size",
+			      cache+1);
+
+	memset(&bsb, 0, 512);
+	memcpy(bsb.magic, "md_backup_data-1", 16);
+	memcpy(&bsb.set_uuid, info->uuid, 16);
+	bsb.mtime = __cpu_to_le64(time(0));
+	/* stored little-endian like the other fields; Grow_restart reads
+	 * it back with __le64_to_cpu()
+	 */
+	bsb.devstart2 = __cpu_to_le64(blocks);
+
+	/* backup_file may be NULL (it is tested again below), so only
+	 * call open() when there is a path.  Failure is not fatal here,
+	 * matching the previous best-effort behaviour, but is reported.
+	 */
+	if (backup_file) {
+		backup_fd = open(backup_file, O_RDWR|O_CREAT, S_IRUSR | S_IWUSR);
+		if (backup_fd < 0)
+			fprintf(stderr, Name ": cannot open backup file %s: %s\n",
+				backup_file, strerror(errno));
+	}
+	backup_list[0] = backup_fd;
+	backup_offsets[0] = 8 * 512;
+	fds = malloc(odisks * sizeof(fds[0]));
+	offsets = malloc(odisks * sizeof(offsets[0]));
+	if (!fds || !offsets) {
+		fprintf(stderr, Name ": %s: out of memory\n", info->sys_name);
+		rv = 1;
+		goto release;
+	}
+	for (d = 0; d < odisks; d++)
+		fds[d] = -1;
+
+	for (sd = sra->devs; sd; sd = sd->next) {
+		int slot;
+		char *dn;
+
+		if (sd->disk.state & (1<<MD_DISK_FAULTY))
+			continue;
+		if (!(sd->disk.state & (1<<MD_DISK_SYNC)))
+			continue;
+		slot = sd->disk.raid_disk;
+		if (slot < 0 || slot >= odisks)
+			/* would index outside fds[] / offsets[] */
+			continue;
+		dn = map_dev(sd->disk.major, sd->disk.minor, 1);
+		fds[slot] = dev_open(dn, O_RDONLY);
+		offsets[slot] = sd->data_offset*512;
+		if (fds[slot] < 0) {
+			fprintf(stderr, Name ": %s: cannot open component %s\n",
+				info->sys_name, dn?dn:"-unknown-");
+			free(dn);
+			rv = 1;
+			goto release;
+		}
+		free(dn);
+	}
+
+	switch(fork()) {
+	case 0:
+		close(mdfd);
+		mlockall(MCL_FUTURE);
+		if (info->delta_disks < 0)
+			done = child_shrink(-1, info, stripes,
+					    fds, offsets,
+					    info->array.raid_disks,
+					    info->array.chunk_size,
+					    info->array.level, info->array.layout,
+					    odata,
+					    1, backup_list, backup_offsets);
+		else if (info->delta_disks == 0) {
+			/* The 'start' is a per-device stripe number.
+			 * reshape_progress is a per-array sector number.
+			 * So divide by ndata * chunk_size
+			 */
+			unsigned long long start = info->reshape_progress / ndata;
+			start /= (info->array.chunk_size/512);
+			done = child_same_size(-1, info, stripes,
+					       fds, offsets,
+					       start,
+					       info->array.raid_disks,
+					       info->array.chunk_size,
+					       info->array.level, info->array.layout,
+					       odata,
+					       1, backup_list, backup_offsets);
+		}
+		if (backup_file && done)
+			unlink(backup_file);
+		/* FIXME should I intuit a level change */
+		exit(0);
+	case -1:
+		fprintf(stderr, Name ": Cannot run child to continue monitoring reshape: %s\n",
+			strerror(errno));
+		rv = 1;
+		break;
+	default:
+		break;
+	}
+release:
+	/* The forked child owns its own copies of these descriptors and
+	 * buffers, so the parent can drop its copies on every path.
+	 */
+	if (fds)
+		for (d = 0; d < odisks; d++)
+			if (fds[d] >= 0)
+				close(fds[d]);
+	free(fds);
+	free(offsets);
+	if (backup_fd >= 0)
+		close(backup_fd);
+	if (sra)
+		sysfs_free(sra);
+	/* report setup failures instead of unconditionally returning 0 */
+	return rv;
+}
+
+
+
+