]> git.ipfire.org Git - thirdparty/mdadm.git/commitdiff
Initial reshape support
authorNeil Brown <neilb@suse.de>
Mon, 13 Mar 2006 05:51:32 +0000 (05:51 +0000)
committerNeil Brown <neilb@suse.de>
Mon, 13 Mar 2006 05:51:32 +0000 (05:51 +0000)
Needs work for other levels etc.

Signed-off-by: Neil Brown <neilb@suse.de>
Grow.c
Makefile
mdadm.c
mdadm.h
restripe.c [new file with mode: 0644]
sysfs.c [new file with mode: 0644]
tests/07testreshape5 [new file with mode: 0644]
util.c

diff --git a/Grow.c b/Grow.c
index 9e8e217293f09b41b04259b5190b4edb2dd7a6c3..ece2bda384f5bc3c36012a78b3a9fc0597e43b95 100644 (file)
--- a/Grow.c
+++ b/Grow.c
@@ -306,7 +306,7 @@ int Grow_addbitmap(char *devname, int fd, char *file, int chunk, int delay, int
                                        continue;
                                if (st->ss->load_super(st, fd2, &super, NULL)==0) {
                                        if (st->ss->add_internal_bitmap(st, super,
-                                                                   chunk, delay, write_behind,
+                                                                       chunk, delay, write_behind,
                                                                        bitmapsize, 0, major))
                                                st->ss->write_bitmap(st, fd2, super);
                                        else {
@@ -378,4 +378,410 @@ int Grow_addbitmap(char *devname, int fd, char *file, int chunk, int delay, int
        return 0;
 }
 
+
+/*
+ * When reshaping an array we might need to back up some data.
+ * This is written to all spares with a 'super_block' describing it.
+ * The superblock goes 1K from the end of the used space on the
+ * device.
+ * NOTE(review): Grow_reshape currently places it 8 sectors before the
+ * end of the component (component_size - 8) - confirm which is intended.
+ * It is written after the backup is complete.
+ * It has the following structure.
+ */
+
+struct mdp_backup_super {
+       char    magic[16];  /* md_backup_data-1 */
+       __u8    set_uuid[16];
+       __u64   mtime;
+       /* start/sizes in 512byte sectors */
+       __u64   devstart;
+       __u64   arraystart;
+       __u64   length;
+       __u32   sb_csum;        /* csum of preceding bytes (see bsb_csum). */
+       /* NOTE(review): the struct is not packed, so sizeof() probably
+        * includes trailing padding after sb_csum - confirm before
+        * relying on the on-disk size.
+        */
+};
+
+/*
+ * Checksum the first 'len' bytes of 'buf' for a backup superblock.
+ *
+ * Every byte contributes: csum = (csum << 3) + byte, for each byte in
+ * order.  The accumulator is unsigned so that the shift cannot overflow
+ * a signed int.  The result is converted with __cpu_to_le32() so it can
+ * be stored directly in mdp_backup_super.sb_csum.
+ *
+ * Anything that validates a backup superblock must use this same routine.
+ */
+int bsb_csum(char *buf, int len)
+{
+       int i;
+       unsigned int csum = 0;
+       for (i=0; i<len; i++)
+               csum = (csum<<3) + buf[i];      /* was buf[0]: ignored all but the first byte */
+       return (int)__cpu_to_le32(csum);
+}
+
+/*
+ * Grow_reshape - change the size or shape of an active md array.
+ *
+ *  devname     name of the array; used only in messages
+ *  fd          open file descriptor on the array
+ *  quiet       suppress the confirmation message (only consulted for
+ *              'faulty' arrays)
+ *  size        new component size, or negative for "no change"
+ *  level       new RAID level, or UnSet for "no change"
+ *  layout      new layout, or UnSet for "no change"
+ *  chunksize   new chunk size, or 0 for "no change"
+ *  raid_disks  new number of raid disks, or 0 for "no change"
+ *
+ * Returns 0 on success and 1 on failure (after printing a message).
+ */
+int Grow_reshape(char *devname, int fd, int quiet,
+                long long size,
+                int level, int layout, int chunksize, int raid_disks)
+{
+       /* Make some changes in the shape of an array.
+        * The kernel must support the change.
+        * Different reshapes have subtly different meaning for different
+        * levels, so we need to check the current state of the array
+        * and go from there.
+        */
+       struct mdu_array_info_s array;
+       char *c;
+
+       struct mdp_backup_super bsb;
+       struct supertype *st;
+
+       int nlevel, olevel;
+       int nchunk, ochunk;
+       int nlayout, olayout;
+       int ndisks, odisks;
+       int ndata, odata;
+       unsigned long long nstripe, ostripe, last_block = 0;
+       int *fdlist;
+       unsigned long long *offsets;
+       int d, i, spares;
+       int nrdisks;
+       int err;
+       void *super = NULL;
+
+       struct sysarray *sra;
+       struct sysdev *sd;
+
+       if (ioctl(fd, GET_ARRAY_INFO, &array) < 0) {
+               fprintf(stderr, Name ": %s is not an active md array - aborting\n",
+                       devname);
+               return 1;
+       }
+       c = map_num(pers, array.level);
+       if (c == NULL) c = "-unknown-";
+       switch(array.level) {
+       default: /* raid0, linear, multipath cannot be reconfigured */
+               fprintf(stderr, Name ": %s array %s cannot be reshaped.\n",
+                       c, devname);
+               return 1;
+
+       case LEVEL_FAULTY: /* only 'layout' change is permitted */
+
+               if (size >= 0) {
+                       fprintf(stderr, Name ": %s: Cannot change size of a 'faulty' array\n",
+                               devname);
+                       return 1;
+               }
+               if (level != UnSet && level != LEVEL_FAULTY) {
+                       fprintf(stderr, Name ": %s: Cannot change RAID level of a 'faulty' array\n",
+                               devname);
+                       return 1;
+               }
+               if (chunksize  || raid_disks) {
+                       fprintf(stderr, Name ": %s: Cannot change chunksize or disks of a 'faulty' array\n",
+                               devname);
+                       return 1;
+               }
+               if (layout == UnSet)
+                       return 0; /* nothing to do.... */
+
+               array.layout = layout;
+               if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) {
+                       fprintf(stderr, Name ": Cannot set layout for %s: %s\n",
+                               devname, strerror(errno));
+                       return 1;
+               }
+               if (!quiet)
+                       printf("layout for %s set to %d\n", devname, array.layout);
+               return 0;
+
+       case 1: /* raid_disks and size can each be changed.  They are independent */
+
+               if (level != UnSet && level != 1) {
+                       fprintf(stderr, Name ": %s: Cannot change RAID level of a RAID1 array.\n",
+                               devname);
+                       return 1;
+               }
+               if (chunksize || layout != UnSet) {
+                       fprintf(stderr, Name ": %s: Cannot change chunk size or layout for a RAID1 array.\n",
+                               devname);
+                       return 1;
+               }
+
+               /* Each can trigger a resync/recovery which will block the
+                * other from happening.  Later we could block
+                * resync for the duration via 'sync_action'...
+                */
+               /* 0 means "not given" (as in the other cases), so a
+                * size-only change must not request zero raid disks.
+                */
+               if (raid_disks > 0)
+                       array.raid_disks = raid_disks;
+               if (size >= 0)
+                       array.size = size;
+               if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) {
+                       fprintf(stderr, Name ": Cannot set device size/shape for %s: %s\n",
+                               devname, strerror(errno));
+                       return 1;
+               }
+               return 0;
+
+       case 4:
+       case 5:
+       case 6:
+               st = super_by_version(array.major_version,
+                                     array.minor_version);
+               /* size can be changed independently.
+                * layout/chunksize/raid_disks/level can be changed
+                * though the kernel may not support it all.
+                * If 'suspend_lo' is not present in sysfs, then
+                * these cannot be changed.
+                */
+               if (size >= 0) {
+                       /* Cannot change other details as well.. */
+                       if (layout != UnSet ||
+                           chunksize != 0 ||
+                           raid_disks != 0 ||
+                           level != UnSet) {
+                               fprintf(stderr, Name ": %s: Cannot change shape as well as size of a %s array.\n",
+                                       devname, c);
+                               return 1;
+                       }
+                       array.size = size;
+                       if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) {
+                               fprintf(stderr, Name ": Cannot set device size/shape for %s: %s\n",
+                                       devname, strerror(errno));
+                               return 1;
+                       }
+                       return 0;
+               }
+               /* Ok, just change the shape. This can be awkward.
+                *  There are three possibilities.
+                * 1/ The array will shrink.  We don't support this
+                *    possibility.  Maybe one day...
+                * 2/ The array will not change size.  This is easy enough
+                *    to do, but not reliably.  If the process is aborted
+                *    the array *will* be corrupted.  So maybe we can allow
+                *    this but only if the user is really certain.  e.g.
+                *    --really-risk-everything
+                * 3/ The array will grow. This can be reliably achieved.
+                *    However the kernel's restripe routines will cheerfully
+                *    overwrite some early data before it is safe.  So we
+                *    need to make a backup of the early parts of the array
+                *    and be ready to restore it if rebuild aborts very early.
+                *
+                *    We backup data by writing it to all spares (there must be
+                *    at least 1, so even raid6->raid5 requires a spare to be
+                *    present).
+                *
+                *    So: we enumerate the devices in the array and
+                *    make sure we can open all of them.
+                *    Then we freeze the early part of the array and
+                *    backup to the various spares.
+                *    Then we request changes and start the reshape.
+                *    Monitor progress until it has passed the danger zone.
+                *    and finally invalidate the copied data and unfreeze the
+                *    start of the array.
+                *
+                *    Before we can do this we need to decide:
+                *     - will the array grow?  Just calculate size
+                *     - how much needs to be saved: count stripes.
+                *     - where to save data... good question.
+                *
+                */
+               nlevel = olevel = array.level;
+               nchunk = ochunk = array.chunk_size;
+               nlayout = olayout = array.layout;
+               ndisks = odisks = array.raid_disks;
+
+               if (level != UnSet) nlevel = level;
+               if (chunksize) nchunk = chunksize;
+               if (layout != UnSet) nlayout = layout;
+               if (raid_disks) ndisks = raid_disks;
+
+               odata = odisks-1;
+               if (olevel == 6) odata--; /* number of data disks */
+               ndata = ndisks-1;
+               if (nlevel == 6) ndata--;
+
+               if (ndata < odata) {
+                       fprintf(stderr, Name ": %s: Cannot reduce number of data disks (yet).\n",
+                               devname);
+                       return 1;
+               }
+               if (ndata == odata) {
+                       fprintf(stderr, Name ": %s: Cannot reshape array without increasing size (yet).\n",
+                               devname);
+                       return 1;
+               }
+               /* Well, it is growing... so how much do we need to backup.
+                * Need to backup a full number of new-stripes, such that the
+                * last one does not over-write any place that it would be read
+                * from
+                */
+               nstripe = ostripe = 0;
+               while (nstripe+ochunk/512 >= ostripe) {
+                       nstripe += nchunk/512;
+                       last_block = nstripe * ndata;
+                       ostripe = last_block / odata;
+               }
+               printf("Need to backup to stripe %llu sectors, %lluK\n", nstripe, last_block/2);
+
+               sra = sysfs_read(fd, 0,
+                                GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE);
+               if (!sra) {
+                       fprintf(stderr, Name ": %s: Cannot get array details from sysfs\n",
+                               devname);
+                       return 1;
+               }
+
+               if (last_block >= sra->component_size/2) {
+                       fprintf(stderr, Name ": %s: Something wrong - reshape aborted\n",
+                               devname);
+                       return 1;
+               }
+
+               nrdisks = array.nr_disks + sra->spares;
+               /* Slots 0..raid_disks-1 hold the active devices and the
+                * spares follow, with slot 'raid_disks' always inspected
+                * below.  Make sure the tables cover all of that even if
+                * the array is degraded (nr_disks < raid_disks).
+                */
+               if (nrdisks < array.raid_disks + sra->spares + 1)
+                       nrdisks = array.raid_disks + sra->spares + 1;
+               /* Now we need to open all these devices so we can read/write.
+                */
+               fdlist = malloc(nrdisks * sizeof(int));
+               offsets = malloc(nrdisks * sizeof(offsets[0]));
+               if (!fdlist || !offsets) {
+                       fprintf(stderr, Name ": malloc failed: grow aborted\n");
+                       /* one of the two may have succeeded */
+                       free(fdlist);
+                       free(offsets);
+                       return 1;
+               }
+               for (d=0; d< nrdisks; d++)
+                       fdlist[d] = -1;
+               d = array.raid_disks;
+               for (sd = sra->devs; sd; sd=sd->next) {
+                       if (sd->state & (1<<MD_DISK_FAULTY))
+                               continue;
+                       if (sd->state & (1<<MD_DISK_SYNC)) {
+                               char *dn = map_dev(sd->major, sd->minor);
+                               if (sd->role < 0 || sd->role >= array.raid_disks)
+                                       continue; /* would index outside the active slots */
+                               fdlist[sd->role] = open(dn, O_RDONLY);
+                               offsets[sd->role] = sd->offset;
+                               if (fdlist[sd->role] < 0) {
+                                       fprintf(stderr, Name ": %s: cannot open component %s\n",
+                                               devname, dn);
+                                       goto abort;
+                               }
+                       } else {
+                               /* spare */
+                               char *dn;
+                               if (d >= nrdisks)
+                                       continue; /* no slot left for this one */
+                               dn = map_dev(sd->major, sd->minor);
+                               fdlist[d] = open(dn, O_RDWR);
+                               offsets[d] = sd->offset;
+                               if (fdlist[d]<0) {
+                                       fprintf(stderr, Name ": %s: cannot open component %s\n",
+                                               devname, dn);
+                                       goto abort;
+                               }
+                               d++;
+                       }
+               }
+               for (i=0 ; i<array.raid_disks; i++)
+                       if (fdlist[i] < 0) {
+                               fprintf(stderr, Name ": %s: failed to find device %d. Array might be degraded.\n"
+                                       " --grow aborted\n", devname, i);
+                               goto abort;
+                       }
+               if (fdlist[array.raid_disks] < 0) {
+                       fprintf(stderr, Name ": %s: failed to find a spare - --grow aborted\n",
+                               devname);
+                       goto abort;
+               }
+
+               /* Find a superblock (st is NULL for an unrecognised metadata version) */
+               if (st == NULL ||
+                   st->ss->load_super(st, fdlist[0], &super, NULL)) {
+                       fprintf(stderr, Name ": %s: Cannot find a superblock\n",
+                               devname);
+                       goto abort;
+               }
+
+               spares = sra->spares;
+
+               /* Decide offset for the backup and llseek the spares */
+               for (i=array.raid_disks; i<d; i++) {
+                       offsets[i] += sra->component_size - last_block - 8;
+                       if (lseek64(fdlist[i], offsets[i]<<9, 0) != offsets[i]<<9) {
+                               fprintf(stderr, Name ": could not seek...\n");
+                               goto abort;
+                       }
+               }
+               array.level = nlevel;
+               array.raid_disks = ndisks;
+               array.chunk_size = nchunk;
+               array.layout = nlayout;
+               if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) {
+                       fprintf(stderr, Name ": Cannot set device size/shape for %s: %s\n",
+                               devname, strerror(errno));
+                       goto abort;
+               }
+
+               /* suspend the relevant region */
+               sysfs_set_num(sra, NULL, "suspend_hi", 0); /* just in case */
+               if (sysfs_set_num(sra, NULL, "suspend_lo", 0) < 0 ||
+                   sysfs_set_num(sra, NULL, "suspend_hi", last_block) < 0) {
+                       fprintf(stderr, Name ": %s: failed to suspend device.\n",
+                               devname);
+                       goto abort_resume;
+               }
+
+
+               /* NOTE(review): the length passed here is nstripe*512 while
+                * bsb.length below (and the suspended range) is last_block
+                * sectors = nstripe*ndata - confirm which amount should be saved.
+                * NOTE(review): offsets[] holds sector counts here (see the
+                * <<9 above) but save_stripes adds them to byte offsets - confirm.
+                */
+               err = save_stripes(fdlist, offsets,
+                                  odisks, ochunk, olevel, olayout,
+                                  spares, fdlist+odisks,
+                                  0ULL, nstripe*512);
+
+               /* abort if there was an error */
+               if (err < 0) {
+                       fprintf(stderr, Name ": %s: failed to save critical region\n",
+                               devname);
+                       goto abort_resume;
+               }
+               /* Write the backup superblocks.  Clear the structure first so
+                * that no uninitialised padding reaches the disk, and use the
+                * 16 byte magic documented with struct mdp_backup_super.
+                */
+               memset(&bsb, 0, sizeof(bsb));
+               memcpy(bsb.magic, "md_backup_data-1", 16);
+               st->ss->uuid_from_super((int*)&bsb.set_uuid, super);
+               bsb.mtime = time(0);
+               bsb.arraystart = 0;
+               bsb.length = last_block;
+               for (i=odisks; i<d ; i++) {
+                       bsb.devstart = offsets[i];
+                       bsb.sb_csum = bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum)-((char*)&bsb));
+                       /* Without a valid backup superblock the saved data is
+                        * useless, so do not start the reshape if this fails.
+                        */
+                       if (lseek64(fdlist[i], (offsets[i]+last_block)<<9, 0) < 0 ||
+                           write(fdlist[i], &bsb, sizeof(bsb)) != (ssize_t)sizeof(bsb)) {
+                               fprintf(stderr, Name ": %s: failed to write backup metadata\n",
+                                       devname);
+                               goto abort_resume;
+                       }
+               }
+
+               /* start the reshape happening */
+               if (sysfs_set_str(sra, NULL, "sync_action", "reshape") < 0) {
+                       fprintf(stderr, Name ": %s: failed to initiate reshape\n",
+                               devname);
+                       goto abort_resume;
+               }
+               /* wait for reshape to pass the critical region */
+               while(1) {
+                       unsigned long long comp;
+                       if (sysfs_get_ll(sra, NULL, "sync_completed", &comp)<0)
+                               break;
+                       if (comp >= nstripe)
+                               break;
+                       sleep(1);
+               }
+
+               /* invalidate superblocks (best effort: the data is no longer needed) */
+               memset(&bsb, 0, sizeof(bsb));
+               for (i=odisks; i<d ; i++) {
+                       lseek64(fdlist[i], (offsets[i]+last_block)<<9, 0);
+                       write(fdlist[i], &bsb, sizeof(bsb));
+               }
+
+               /* unsuspend. */
+               sysfs_set_num(sra, NULL, "suspend_lo", last_block);
+
+               for (i=0; i<d; i++)
+                       if (fdlist[i] >= 0)
+                               close(fdlist[i]);
+               free(fdlist);
+               free(offsets);
+
+               break;
+       }
+       return 0;
+
+
+ abort_resume:
+       sysfs_set_num(sra, NULL, "suspend_lo", last_block);
+ abort:
+       /* Every slot was initialised to -1, so walk the whole table;
+        * this also closes the spares, which live beyond the active slots.
+        */
+       for (i=0; i<nrdisks; i++)
+               if (fdlist[i] >= 0)
+                       close(fdlist[i]);
+       free(fdlist);
+       free(offsets);
+       return 1;
+
+}
index c310fca7bd1f5c5e06f831ed4b43de9be9e5c5e9..3e18f9d04ef84b8b7beac3fdf0ae9437920291b1 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -59,10 +59,10 @@ MAN8DIR = $(MANDIR)/man8
 
 OBJS =  mdadm.o config.o mdstat.o  ReadMe.o util.o Manage.o Assemble.o Build.o \
        Create.o Detail.o Examine.o Grow.o Monitor.o dlink.o Kill.o Query.o \
-       mdopen.o super0.o super1.o bitmap.o
+       mdopen.o super0.o super1.o bitmap.o restripe.o sysfs.o
 SRCS =  mdadm.c config.c mdstat.c  ReadMe.c util.c Manage.c Assemble.c Build.c \
        Create.c Detail.c Examine.c Grow.c Monitor.c dlink.c Kill.c Query.c \
-       mdopen.c super0.c super1.c bitmap.c
+       mdopen.c super0.c super1.c bitmap.c restripe.c sysfs.c
 
 ASSEMBLE_SRCS := mdassemble.c Assemble.c config.c dlink.c util.c super0.c super1.c
 ASSEMBLE_FLAGS:= -DMDASSEMBLE
@@ -73,7 +73,7 @@ endif
 
 all : mdadm mdadm.man md.man mdadm.conf.man
 
-everything: all mdadm.static mdadm.uclibc swap_super  mdassemble mdassemble.uclibc mdassemble.static mdassemble.man
+everything: all mdadm.static mdadm.uclibc swap_super test_stripe  mdassemble mdassemble.uclibc mdassemble.static mdassemble.man
 # mdadm.tcc doesn't work..
 
 mdadm : $(OBJS)
@@ -92,6 +92,9 @@ mdadm.klibc : $(SRCS) mdadm.h
        rm -f $(OBJS) 
        gcc -nostdinc -iwithprefix include -I$(KLIBC)/klibc/include -I$(KLIBC)/linux/include -I$(KLIBC)/klibc/arch/i386/include -I$(KLIBC)/klibc/include/bits32 $(CFLAGS) $(SRCS)
 
+test_stripe : restripe.c mdadm.h
+       $(CC) $(CXFLAGS) $(LDFLAGS) -o test_stripe -DMAIN restripe.c
+
 mdassemble : $(ASSEMBLE_SRCS) mdadm.h
        rm -f $(OBJS)
        $(DIET_GCC) $(ASSEMBLE_FLAGS) -o mdassemble $(ASSEMBLE_SRCS) 
diff --git a/mdadm.c b/mdadm.c
index 1c9388c2e2ae052bbdc6fdf10a176307c7a68ef3..5347999679631cc5a609a3aecd1f139d754018fb 100644 (file)
--- a/mdadm.c
+++ b/mdadm.c
@@ -1155,7 +1155,8 @@ int main(int argc, char *argv[])
                } else if (layout != UnSet)
                        rv = Manage_reconfig(devlist->devname, mdfd, layout);
                else if (size >= 0 || raiddisks)
-                       rv = Manage_resize(devlist->devname, mdfd, size, raiddisks);
+                       rv = Grow_reshape(devlist->devname, mdfd, quiet,
+                                         size, level, layout, chunk, raiddisks);
                else if (bitmap_file) {
                        if (delay == 0) delay = DEFAULT_BITMAP_DELAY;
                        rv = Grow_addbitmap(devlist->devname, mdfd, bitmap_file,
diff --git a/mdadm.h b/mdadm.h
index 00abefe03c0fc8c99db68117a5b6d68daea6d616..61d0469c456907b015cdd9d55e9ca0aac56bd21f 100644 (file)
--- a/mdadm.h
+++ b/mdadm.h
@@ -178,6 +178,54 @@ extern struct mdstat_ent *mdstat_read(int hold, int start);
 extern void free_mdstat(struct mdstat_ent *ms);
 extern void mdstat_wait(int seconds);
 
+/* Data structure for holding info read from sysfs.
+ * One 'struct sysdev' describes one component device; they are chained
+ * through 'next' from sysarray.devs.
+ */
+struct sysdev {
+       char    name[20];
+       int     role;           /* used by Grow_reshape as the slot index of an in-sync device */
+       int     major, minor;   /* device number, as passed to map_dev() */
+       unsigned long long offset, size; /* offset is treated as 512-byte sectors by
+                                         * Grow_reshape; unit of 'size' - TODO confirm */
+       int     state;          /* bit mask tested with (1<<MD_DISK_FAULTY), (1<<MD_DISK_SYNC) */
+       int     errors;
+       struct sysdev *next;    /* next device, NULL at end of list */
+};
+/* Per-array information, as returned by sysfs_read().
+ * Presumably only the fields selected by the GET_* options are filled in -
+ * verify against sysfs_read().
+ */
+struct sysarray {
+       char    name[20];
+       struct sysdev *devs;    /* head of the list of component devices */
+       int     chunk;
+       unsigned long long component_size; /* unit not visible here - TODO confirm
+                                           * (Grow_reshape mixes it with sector counts) */
+       int     layout;
+       int     level;
+       int     spares;         /* number of spares; Grow_reshape uses it to size its fd table */
+};
+/* various details can be requested */
+#define        GET_LEVEL       1
+#define        GET_LAYOUT      2
+#define        GET_COMPONENT   4
+#define        GET_CHUNK       8
+
+#define        GET_DEVS        1024 /* gets role, major, minor */
+#define        GET_OFFSET      2048
+#define        GET_SIZE        4096
+#define        GET_STATE       8192
+#define        GET_ERROR       16384
+
+/* If fd >= 0, get the array it is open on,
+ * else use devnum. >=0 -> major9. <0.....
+ */
+extern struct sysarray *sysfs_read(int fd, int devnum, unsigned long options);
+extern int sysfs_set_str(struct sysarray *sra, struct sysdev *dev,
+                        char *name, char *val);
+extern int sysfs_set_num(struct sysarray *sra, struct sysdev *dev,
+                        char *name, unsigned long long val);
+extern int sysfs_get_ll(struct sysarray *sra, struct sysdev *dev,
+                       char *name, unsigned long long *val);
+
+
+extern int save_stripes(int *source, unsigned long long *offsets,
+                       int raid_disks, int chunk_size, int level, int layout,
+                       int nwrites, int *dest,
+                       unsigned long long start, unsigned long long length);
+
 #ifndef Sendmail
 #define Sendmail "/usr/lib/sendmail -t"
 #endif
@@ -251,6 +299,9 @@ extern int Manage_subdevs(char *devname, int fd,
                          mddev_dev_t devlist, int verbose);
 extern int Grow_Add_device(char *devname, int fd, char *newdev);
 extern int Grow_addbitmap(char *devname, int fd, char *file, int chunk, int delay, int write_behind, int force);
+extern int Grow_reshape(char *devname, int fd, int quiet,
+                       long long size,
+                       int level, int layout, int chunksize, int raid_disks);
 
 
 extern int Assemble(struct supertype *st, char *mddev, int mdfd,
@@ -367,3 +418,8 @@ extern int open_mddev(char *dev, int autof);
 #define makedev(M,m) (((M)<<8) | (m))
 #endif
 
+/* for raid5 */
+#define ALGORITHM_LEFT_ASYMMETRIC      0
+#define ALGORITHM_RIGHT_ASYMMETRIC     1
+#define ALGORITHM_LEFT_SYMMETRIC       2
+#define ALGORITHM_RIGHT_SYMMETRIC      3
diff --git a/restripe.c b/restripe.c
new file mode 100644 (file)
index 0000000..94a0e3d
--- /dev/null
@@ -0,0 +1,324 @@
+/*
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2006 Neil Brown <neilb@suse.de>
+ *
+ *
+ *    This program is free software; you can redistribute it and/or modify
+ *    it under the terms of the GNU General Public License as published by
+ *    the Free Software Foundation; either version 2 of the License, or
+ *    (at your option) any later version.
+ *
+ *    This program is distributed in the hope that it will be useful,
+ *    but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *    GNU General Public License for more details.
+ *
+ *    You should have received a copy of the GNU General Public License
+ *    along with this program; if not, write to the Free Software
+ *    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ *    Author: Neil Brown
+ *    Email: <neilb@suse.de>
+ */
+
+#include "mdadm.h"
+
+/* To restripe, we read from the old geometry into a buffer, and
+ * write from the buffer to the new geometry.
+ * When reading we don't worry about parity. When writing we do.
+ *
+ */
+
+static int geo_map(int block, unsigned long long stripe, int raid_disks, int level, int layout)
+{
+       /* Work out which device of the array holds data block 'block'
+        * of the given stripe.  Passing block == -1 asks for the device
+        * holding the parity block instead.
+        * Returns -1 when the level/layout combination is not handled.
+        */
+       const int geometry = level*100 + layout;
+       const int want_parity = (block == -1);
+       int pd;
+
+       /* First find the parity device for this stripe */
+       switch(geometry) {
+       case 000:
+       case 400:
+               /* no rotation here: parity, if asked for, is the last device */
+               return want_parity ? raid_disks-1 : block;
+
+       case 500 + ALGORITHM_LEFT_ASYMMETRIC:
+       case 500 + ALGORITHM_LEFT_SYMMETRIC:
+       case 600 + ALGORITHM_LEFT_ASYMMETRIC:
+       case 600 + ALGORITHM_LEFT_SYMMETRIC:
+               /* parity walks from the last device towards the first */
+               pd = (raid_disks - 1) - stripe % raid_disks;
+               break;
+
+       case 500 + ALGORITHM_RIGHT_ASYMMETRIC:
+       case 500 + ALGORITHM_RIGHT_SYMMETRIC:
+       case 600 + ALGORITHM_RIGHT_ASYMMETRIC:
+       case 600 + ALGORITHM_RIGHT_SYMMETRIC:
+               /* parity walks from the first device towards the last */
+               pd = stripe % raid_disks;
+               break;
+
+       default:
+               return -1;
+       }
+       if (want_parity)
+               return pd;
+
+       /* Then place the data block relative to the parity device */
+       switch(geometry) {
+       case 500 + ALGORITHM_LEFT_ASYMMETRIC:
+       case 500 + ALGORITHM_RIGHT_ASYMMETRIC:
+               /* data keeps its order, stepping over the parity device */
+               return (block >= pd) ? block+1 : block;
+
+       case 500 + ALGORITHM_LEFT_SYMMETRIC:
+       case 500 + ALGORITHM_RIGHT_SYMMETRIC:
+               /* data starts just after parity and wraps around */
+               return (pd + 1 + block) % raid_disks;
+
+       case 600 + ALGORITHM_LEFT_ASYMMETRIC:
+       case 600 + ALGORITHM_RIGHT_ASYMMETRIC:
+               /* two devices are skipped, unless pd is the last device,
+                * in which case only one slot precedes the data
+                */
+               if (pd == raid_disks - 1)
+                       return block+1;
+               return (block >= pd) ? block+2 : block;
+
+       default:
+               /* only the raid6 symmetric layouts can reach this point */
+               return (pd + 2 + block) % raid_disks;
+       }
+}
+
+
+static void xor_blocks(char *target, char **sources, int disks, int size)
+{
+       /* Set each of the 'size' bytes of 'target' to the XOR of the
+        * corresponding byte of all 'disks' source buffers.
+        * One byte at a time: simple rather than fast.
+        */
+       int pos;
+
+       for (pos = 0; pos < size; pos++) {
+               char acc = 0;
+               int src = 0;
+
+               while (src < disks) {
+                       acc ^= sources[src][pos];
+                       src++;
+               }
+               target[pos] = acc;
+       }
+}
+
+/* Save data:
+ * We are given:
+ *  A list of 'fds' of the active disks.  For now we require all to be present.
+ *  A list of offsets, one per active disk, added to the position within
+ *    the device before seeking.
+ *    NOTE(review): the position is computed in bytes, so these must be
+ *    bytes too; Grow_reshape appears to pass sector counts - confirm.
+ *  A geometry: raid_disks, chunk_size, level, layout
+ *  A list of 'fds' for mirrored targets.  They are already seeked to
+ *    right (Write) location
+ *  A start and length (in bytes of array data)
+ *
+ * Returns 0 on success, -1 on an I/O error or if the geometry is not
+ * one that can be mapped.
+ */
+
+int save_stripes(int *source, unsigned long long *offsets,
+                int raid_disks, int chunk_size, int level, int layout,
+                int nwrites, int *dest,
+                unsigned long long start, unsigned long long length)
+{
+       char buf[8192];
+       int cpos; /* where in chunk we are up to */
+       int len;
+       int data_disks = raid_disks - (level == 0 ? 0 : level <=5 ? 1 : 2);
+       int disk;
+
+       /* Both are used as divisors below */
+       if (chunk_size <= 0 || data_disks <= 0)
+               return -1;
+       cpos = start % chunk_size;
+
+       while (length > 0) {
+               unsigned long long offset;
+               unsigned long long stripe = start/chunk_size/data_disks;
+               int i;
+               len = chunk_size - cpos;
+               if (len > (int)sizeof(buf)) len = sizeof(buf);
+               if ((unsigned long long)len > length) len = length;
+               /* len bytes to be moved from one device */
+
+               offset = stripe*chunk_size + cpos;
+               disk = start/chunk_size % data_disks;
+               disk = geo_map(disk, stripe,
+                              raid_disks, level, layout);
+               if (disk < 0)
+                       return -1; /* unsupported level/layout: no device to read */
+               if (lseek64(source[disk], offsets[disk]+offset, 0) < 0)
+                       return -1;
+               if (read(source[disk], buf, len) != len)
+                       return -1;
+               for (i=0; i<nwrites; i++)
+                       if (write(dest[i], buf, len) != len)
+                               return -1;
+               length -= len;
+               start += len;
+               cpos += len;
+               while (cpos >= chunk_size) cpos -= chunk_size;
+       }
+       return 0;
+}
+
+/* Restore data:
+ * We are given:
+ *  A list of 'fds' of the active disks. Some may be '-1' for not-available.
+ *  A list of offsets, one per disk, added to the position within the device.
+ *  A geometry: raid_disks, chunk_size, level, layout
+ *  An 'fd' to read from.  It is already seeked to the right (Read) location.
+ *  A start and length.
+ * The length must be a multiple of the stripe size.
+ *
+ * We build a full stripe in memory and then write it out.
+ * We assume that there are enough working devices.
+ *
+ * Returns 0 on success,
+ *        -1 on an I/O error or a geometry that cannot be mapped,
+ *        -2 if memory could not be allocated,
+ *        -3 if 'length' is not a whole number of stripes.
+ * The working buffers are released on every path.
+ */
+int restore_stripes(int *dest, unsigned long long *offsets,
+                   int raid_disks, int chunk_size, int level, int layout,
+                   int source,
+                   unsigned long long start, unsigned long long length)
+{
+       char *stripe_buf = NULL;
+       char **stripes = NULL;
+       char **blocks = NULL;
+       int i;
+       int rv = 0;
+
+       int data_disks = raid_disks - (level == 0 ? 0 : level <=5 ? 1 : 2);
+
+       /* Both are used as divisors and allocation sizes below */
+       if (chunk_size <= 0 || data_disks <= 0)
+               return -1;
+
+       stripe_buf = malloc(raid_disks * chunk_size);
+       stripes = malloc(raid_disks * sizeof(char*));
+       blocks = malloc(raid_disks * sizeof(char*));
+       if (stripe_buf == NULL || stripes == NULL || blocks == NULL) {
+               rv = -2;
+               goto out;
+       }
+       for (i=0; i<raid_disks; i++)
+               stripes[i] = stripe_buf + i * chunk_size;
+       while (length > 0) {
+               int len = data_disks * chunk_size;
+               unsigned long long offset;
+               if (length < len) {
+                       rv = -3;
+                       goto out;
+               }
+               for (i=0; i < data_disks; i++) {
+                       int disk = geo_map(i, start/chunk_size/data_disks,
+                                          raid_disks, level, layout);
+                       if (disk < 0) {
+                               /* unsupported level/layout */
+                               rv = -1;
+                               goto out;
+                       }
+                       blocks[i] = stripes[disk];
+                       if (read(source, stripes[disk], chunk_size) != chunk_size) {
+                               rv = -1;
+                               goto out;
+                       }
+               }
+               /* We have the data, now do the parity */
+               offset = (start/chunk_size/data_disks) * chunk_size;
+               if (level >= 4) {
+                       int disk = geo_map(-1, start/chunk_size/data_disks,
+                                          raid_disks, level, layout);
+                       if (disk < 0) {
+                               rv = -1;
+                               goto out;
+                       }
+                       xor_blocks(stripes[disk], blocks, data_disks, chunk_size);
+                       /* FIXME need to do raid6 Q as well */
+               }
+               for (i=0; i < raid_disks ; i++)
+                       if (dest[i] >= 0) {
+                               if (lseek64(dest[i], offsets[i]+offset, 0) < 0 ||
+                                   write(dest[i], stripes[i], chunk_size) != chunk_size) {
+                                       rv = -1;
+                                       goto out;
+                               }
+                       }
+               length -= len;
+               start += len;
+       }
+ out:
+       free(stripe_buf);
+       free(stripes);
+       free(blocks);
+       return rv;
+}
+
+#ifdef MAIN
+
+unsigned long long getnum(char *str, char **err)
+{
+       /* Parse 'str' as an unsigned decimal number.
+        * If it holds no digits, or anything follows the number,
+        * '*err' is pointed at 'str' and 0 is returned.
+        * '*err' is left untouched on success, so one 'err' can
+        * collect a failure across several calls.
+        */
+       char *end = NULL;
+       unsigned long long value = strtoull(str, &end, 10);
+
+       if (end != str && *end == '\0')
+               return value;
+       *err = str;
+       return 0;
+}
+
+int main(int argc, char *argv[])
+{
+       /* save/restore file raid_disks chunk_size level layout start length devices...
+        *
+        * 'save' copies the given range from the devices into 'file';
+        * 'restore' reads 'file' and writes it onto the devices.
+        * Exit status: 0 on success, 1 for too few arguments or a failed
+        * save/restore, 2 for a bad argument, 3 if a file cannot be
+        * opened or memory cannot be allocated.
+        */
+       int save;
+       int *fds;
+       char *file;
+       int storefd;
+       unsigned long long *offsets;
+       int raid_disks, chunk_size, level, layout;
+       unsigned long long start, length;
+       int i;
+
+       char *err = NULL;
+       if (argc < 10) {
+               fprintf(stderr, "Usage: test_stripe save/restore file raid_disks"
+                       " chunk_size level layout start length devices...\n");
+               exit(1);
+       }
+       if (strcmp(argv[1], "save")==0)
+               save = 1;
+       else if (strcmp(argv[1], "restore") == 0)
+               save = 0;
+       else {
+               fprintf(stderr, "test_stripe: must give 'save' or 'restore'.\n");
+               exit(2);
+       }
+
+       /* getnum() only ever sets 'err', so one check after all the
+        * conversions reports a bad argument.
+        */
+       file = argv[2];
+       raid_disks = getnum(argv[3], &err);
+       chunk_size = getnum(argv[4], &err);
+       level = getnum(argv[5], &err);
+       layout = getnum(argv[6], &err);
+       start = getnum(argv[7], &err);
+       length = getnum(argv[8], &err);
+       if (err) {
+               fprintf(stderr, "test_stripe: Bad number: %s\n", err);
+               exit(2);
+       }
+       /* argc >= 10 was checked above, so this also forces raid_disks >= 1 */
+       if (argc != raid_disks + 9) {
+               fprintf(stderr, "test_stripe: wrong number of devices: want %d found %d\n",
+                       raid_disks, argc-9);
+               exit(2);
+       }
+       /* the stripe code divides by chunk_size */
+       if (chunk_size <= 0) {
+               fprintf(stderr, "test_stripe: chunk_size must be positive.\n");
+               exit(2);
+       }
+       fds = malloc(raid_disks * sizeof(*fds));
+       offsets = malloc(raid_disks * sizeof(*offsets));
+       if (fds == NULL || offsets == NULL) {
+               fprintf(stderr, "test_stripe: out of memory.\n");
+               exit(3);
+       }
+       /* every device is used from offset 0 */
+       memset(offsets, 0, raid_disks * sizeof(*offsets));
+
+       storefd = open(file, O_RDWR);
+       if (storefd < 0) {
+               perror(file);
+               fprintf(stderr, "test_stripe: could not open %s.\n", file);
+               exit(3);
+       }
+       for (i=0; i<raid_disks; i++) {
+               fds[i] = open(argv[9+i], O_RDWR);
+               if (fds[i] < 0) {
+                       perror(argv[9+i]);
+                       fprintf(stderr,"test_stripe: cannot open %s.\n", argv[9+i]);
+                       exit(3);
+               }
+       }
+
+       if (save) {
+               int rv = save_stripes(fds, offsets,
+                                     raid_disks, chunk_size, level, layout,
+                                     1, &storefd,
+                                     start, length);
+               if (rv != 0) {
+                       fprintf(stderr, "test_stripe: save_stripes returned %d\n", rv);
+                       exit(1);
+               }
+       } else {
+               int rv = restore_stripes(fds, offsets,
+                                        raid_disks, chunk_size, level, layout,
+                                        storefd,
+                                        start, length);
+               if (rv != 0) {
+                       fprintf(stderr, "test_stripe: restore_stripes returned %d\n", rv);
+                       exit(1);
+               }
+       }
+       exit(0);
+}
+
+#endif /* MAIN */
diff --git a/sysfs.c b/sysfs.c
new file mode 100644 (file)
index 0000000..9894760
--- /dev/null
+++ b/sysfs.c
@@ -0,0 +1,265 @@
+/*
+ * sysfs - extract md related information from sysfs.  Part of:
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2006 Neil Brown <neilb@suse.de>
+ *
+ *
+ *    This program is free software; you can redistribute it and/or modify
+ *    it under the terms of the GNU General Public License as published by
+ *    the Free Software Foundation; either version 2 of the License, or
+ *    (at your option) any later version.
+ *
+ *    This program is distributed in the hope that it will be useful,
+ *    but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *    GNU General Public License for more details.
+ *
+ *    You should have received a copy of the GNU General Public License
+ *    along with this program; if not, write to the Free Software
+ *    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ *    Author: Neil Brown
+ *    Email: <neilb@suse.de>
+ */
+
+#include       "mdadm.h"
+#include       <dirent.h>
+
+int load_sys(char *path, char *buf)
+{
+       /* Read the file 'path' into 'buf' as a NUL-terminated string,
+        * dropping one trailing newline if there is one.
+        * Up to 1024 bytes are read, so 'buf' must have room for that.
+        * Returns 0 on success; -1 if the file cannot be opened, the
+        * read fails or returns nothing, or the read fills all 1024
+        * bytes (leaving no room for the terminator).
+        */
+       int count;
+       int handle;
+
+       handle = open(path, O_RDONLY);
+       if (handle < 0)
+               return -1;
+       count = read(handle, buf, 1024);
+       close(handle);
+
+       if (count > 0 && count < 1024) {
+               buf[count] = 0;
+               if (buf[count-1] == '\n')
+                       buf[count-1] = 0;
+               return 0;
+       }
+       return -1;
+}
+
+struct sysarray *sysfs_read(int fd, int devnum, unsigned long options)
+{
+       /* Collect information about an md array from sysfs.
+        *
+        * The array is named from 'fd' when fd >= 0: major 9 gives
+        * "md<minor>", anything else "md_d<minor/16>".  Otherwise it is
+        * named from 'devnum': >= 0 gives "md<devnum>", negative gives
+        * "md_d<-1-devnum>".
+        * 'options' is a mask of GET_* flags selecting what is read;
+        * fields that were not asked for are not filled in.
+        * Returns a malloc()ed sysarray (with a malloc()ed list of
+        * devices when GET_DEVS is set), or NULL on any failure.
+        *
+        * Longest possible name in sysfs, mounted at /sys, is
+        *  /sys/block/md_dXXX/md/dev-XXXXX/block/dev
+        *  /sys/block/md_dXXX/md/metadata_version
+        * which is about 41 characters.  50 should do for now
+        */
+       char fname[50];
+       char buf[1024];
+       char *base;
+       char *dbase;
+       struct sysarray *sra;
+       struct sysdev *dev;
+       DIR *dir = NULL;
+       struct dirent *de;
+
+       sra = malloc(sizeof(*sra));
+       if (sra == NULL)
+               return sra;
+
+       if (fd >= 0) {
+               struct stat stb;
+               if (fstat(fd, &stb)) {
+                       /* nothing but 'sra' has been allocated yet */
+                       free(sra);
+                       return NULL;
+               }
+               if (major(stb.st_rdev)==9)
+                       sprintf(sra->name, "md%d", minor(stb.st_rdev));
+               else
+                       sprintf(sra->name, "md_d%d",
+                               minor(stb.st_rdev)/16);
+       } else {
+               if (devnum >= 0)
+                       sprintf(sra->name, "md%d", devnum);
+               else
+                       sprintf(sra->name, "md_d%d",
+                               -1-devnum);
+       }
+       /* 'base' is where attribute names get appended to the path */
+       sprintf(fname, "/sys/block/%s/md/", sra->name);
+       base = fname + strlen(fname);
+
+       sra->devs = NULL;
+       if (options & GET_LEVEL) {
+               strcpy(base, "level");
+               if (load_sys(fname, buf))
+                       goto abort;
+               sra->level = map_name(pers, buf);
+       }
+       if (options & GET_LAYOUT) {
+               strcpy(base, "layout");
+               if (load_sys(fname, buf))
+                       goto abort;
+               sra->layout = strtoul(buf, NULL, 0);
+       }
+       if (options & GET_COMPONENT) {
+               strcpy(base, "component_size");
+               if (load_sys(fname, buf))
+                       goto abort;
+               sra->component_size = strtoull(buf, NULL, 0);
+       }
+       if (options & GET_CHUNK) {
+               strcpy(base, "chunk_size");
+               if (load_sys(fname, buf))
+                       goto abort;
+               sra->chunk = strtoul(buf, NULL, 0);
+       }
+
+       if (! (options & GET_DEVS))
+               return sra;
+
+       /* Get all the devices as well */
+       *base = 0;
+       dir = opendir(fname);
+       if (!dir)
+               goto abort;
+       sra->spares = 0;
+
+       while ((de = readdir(dir)) != NULL) {
+               char *ep;
+               if (de->d_ino == 0 ||
+                   strncmp(de->d_name, "dev-", 4) != 0)
+                       continue;
+               /* "/block/dev" is the longest suffix appended below;
+                * give up rather than write past the end of fname.
+                */
+               if ((size_t)(base - fname) + strlen(de->d_name) +
+                   sizeof("/block/dev") > sizeof(fname))
+                       goto abort;
+               strcpy(base, de->d_name);
+               dbase = base + strlen(base);
+               *dbase++ = '/';
+
+               /* Link the device in straight away so that the abort
+                * path frees it.
+                */
+               dev = malloc(sizeof(*dev));
+               if (!dev)
+                       goto abort;
+               dev->next = sra->devs;
+               sra->devs = dev;
+
+               /* Always get slot, major, minor */
+               strcpy(dbase, "slot");
+               if (load_sys(fname, buf))
+                       goto abort;
+               /* a slot that is not a plain number gives role -1 */
+               dev->role = strtoul(buf, &ep, 10);
+               if (*ep) dev->role = -1;
+
+               strcpy(dbase, "block/dev");
+               if (load_sys(fname, buf))
+                       goto abort;
+               sscanf(buf, "%d:%d", &dev->major, &dev->minor);
+
+               if (options & GET_OFFSET) {
+                       strcpy(dbase, "offset");
+                       if (load_sys(fname, buf))
+                               goto abort;
+                       dev->offset = strtoull(buf, NULL, 0);
+               }
+               if (options & GET_SIZE) {
+                       strcpy(dbase, "size");
+                       if (load_sys(fname, buf))
+                               goto abort;
+                       dev->size = strtoull(buf, NULL, 0);
+               }
+               if (options & GET_STATE) {
+                       dev->state = 0;
+                       strcpy(dbase, "state");
+                       if (load_sys(fname, buf))
+                               goto abort;
+                       if (strstr(buf, "in_sync"))
+                               dev->state |= (1<<MD_DISK_SYNC);
+                       if (strstr(buf, "faulty"))
+                               dev->state |= (1<<MD_DISK_FAULTY);
+                       /* neither in_sync nor faulty: count it as a spare */
+                       if (dev->state == 0)
+                               sra->spares++;
+               }
+               if (options & GET_ERROR) {
+                       /* the attribute name belongs in the path, not
+                        * in the data buffer
+                        */
+                       strcpy(dbase, "errors");
+                       if (load_sys(fname, buf))
+                               goto abort;
+                       dev->errors = strtoul(buf, NULL, 0);
+               }
+       }
+       closedir(dir);
+       return sra;
+
+ abort:
+       /* 'dir' is still NULL if we failed before opendir() */
+       if (dir)
+               closedir(dir);
+       while (sra->devs) {
+               dev = sra->devs;
+               sra->devs = dev->next;
+               free(dev);
+       }
+       free(sra);
+       return NULL;
+}
+
+unsigned long long get_component_size(int fd)
+{
+       /* Find out the component size of the array.
+        * We cannot trust GET_ARRAY_INFO ioctl as its
+        * size field is only 32bits.
+        * So look in /sys/block/mdXXX/md/component_size
+        *
+        * Returns the number found there, or 0 if 'fd' cannot be
+        * stat()ed or the sysfs file cannot be opened or read.
+        */
+       struct stat stb;
+       char fname[50];
+       int n;
+       if (fstat(fd, &stb)) return 0;
+       if (major(stb.st_rdev) == 9)
+               sprintf(fname, "/sys/block/md%d/md/component_size",
+                       minor(stb.st_rdev));
+       else
+               sprintf(fname, "/sys/block/md_d%d/md/component_size",
+                       minor(stb.st_rdev)/16);
+       fd = open(fname, O_RDONLY);
+       if (fd < 0)
+               return 0;
+       /* fname is reused as the read buffer from here on */
+       n = read(fd, fname, sizeof(fname));
+       close(fd);
+       /* A failed read returns -1, which must not be used as an
+        * index; a full buffer leaves no room for the terminator.
+        */
+       if (n <= 0 || n >= (int)sizeof(fname))
+               return 0;
+       fname[n] = 0;
+       return strtoull(fname, NULL, 10);
+}
+
+int sysfs_set_str(struct sysarray *sra, struct sysdev *dev,
+                 char *name, char *val)
+{
+       /* Write the string 'val' to the sysfs attribute 'name' of the
+        * array 'sra', or of its component 'dev' when 'dev' is not NULL.
+        * Returns 0 when the whole string was written; -1 if the path
+        * does not fit in the buffer, the file cannot be opened, or
+        * the write fails or is short.
+        */
+       char fname[50];
+       int n;
+       int fd;
+       int len;
+       /* with no 'dev' the middle component is empty ("md//name") */
+       len = snprintf(fname, sizeof(fname), "/sys/block/%s/md/%s/%s",
+                      sra->name, dev?dev->name:"", name);
+       if (len < 0 || len >= (int)sizeof(fname))
+               return -1;
+       fd = open(fname, O_WRONLY);
+       if (fd < 0)
+               return -1;
+       n = write(fd, val, strlen(val));
+       close(fd);
+       if (n < 0 || (size_t)n != strlen(val))
+               return -1;
+       return 0;
+}
+
+int sysfs_set_num(struct sysarray *sra, struct sysdev *dev,
+                 char *name, unsigned long long val)
+{
+       /* Format 'val' as an unsigned decimal string and pass it to
+        * sysfs_set_str(); the result of that call is returned as-is.
+        */
+       char digits[50];
+       int status;
+
+       sprintf(digits, "%llu", val);
+       status = sysfs_set_str(sra, dev, name, digits);
+       return status;
+}
+
+int sysfs_get_ll(struct sysarray *sra, struct sysdev *dev,
+                      char *name, unsigned long long *val)
+{
+       /* Read the sysfs attribute 'name' of the array 'sra', or of
+        * its component 'dev' when 'dev' is not NULL, and store it in
+        * '*val'.  The number is parsed with base 0, so "0x" and "0"
+        * prefixes are honoured.
+        * Returns 0 on success; -1 if the path does not fit, the file
+        * cannot be opened or read, the content fills the buffer, or
+        * it does not start with a number followed by end of string,
+        * a newline or a space.
+        * NOTE: '*val' is written before the content is validated, so
+        * it may be changed even when -1 is returned.
+        */
+       char fname[50];
+       char buf[50];
+       int n;
+       int fd;
+       int len;
+       char *ep;
+       len = snprintf(fname, sizeof(fname), "/sys/block/%s/md/%s/%s",
+                      sra->name, dev?dev->name:"", name);
+       if (len < 0 || len >= (int)sizeof(fname))
+               return -1;
+       fd = open(fname, O_RDONLY);
+       if (fd < 0)
+               return -1;
+       n = read(fd, buf, sizeof(buf));
+       close(fd);
+       /* a completely full buffer leaves no room for the terminator */
+       if (n <= 0 || n >= (int)sizeof(buf))
+               return -1;
+       buf[n] = 0;
+       *val = strtoull(buf, &ep, 0);
+       if (ep == buf || (*ep != 0 && *ep != '\n' && *ep != ' '))
+               return -1;
+       return 0;
+}
diff --git a/tests/07testreshape5 b/tests/07testreshape5
new file mode 100644 (file)
index 0000000..8f56a72
--- /dev/null
@@ -0,0 +1,38 @@
+
+#
+# test the reshape code by using test_stripe and the
+# kernel md code to move data into and out of variously
+# shaped md arrays.
+set -x
+# layout names as given to 'mdadm -p', indexed by the numeric layout
+# that is passed to test_stripe
+layouts=(la ra ls rs)
+for chunk in 4 8 16 32 64 128
+do
+  devs="$dev1"
+  for disks in 2 3 4 5 6
+  do
+    # append the next device ($dev2, $dev3, ...) to the list
+    eval devs=\"$devs \$dev$disks\"
+    for nlayout in 0 1 2 3
+    do
+      layout=${layouts[$nlayout]}
+
+      # amount of data in KiB: 'disks' stripes of (disks-1) chunks each
+      size=$[chunk*(disks-1)*disks]
+
+      # test restore: make a raid5 from a file, then do a compare
+      dd if=/dev/urandom of=/tmp/RandFile bs=1024 count=$size
+      $dir/test_stripe restore /tmp/RandFile $disks $[chunk*1024] 5 $nlayout 0 $[size*1024] $devs
+      $mdadm -CR $md0 -amd -l5 -n$disks --assume-clean -c $chunk -p $layout $devs
+      cmp -s -n $[size*1024] $md0 /tmp/RandFile || { echo cmp failed ; exit 2; }
+
+      # FIXME check parity
+
+      # test save
+      dd if=/dev/urandom of=$md0 bs=1024 count=$size
+      # test_stripe opens its file without creating it, so make sure
+      # an empty one exists first
+      > /tmp/NewRand
+      $dir/test_stripe save /tmp/NewRand $disks $[chunk*1024] 5 $nlayout 0 $[size*1024] $devs
+      cmp -s -n $[size*1024] $md0 /tmp/NewRand || { echo cmp failed ; exit 2; }
+      $mdadm -S $md0
+    done
+  done
+done
+exit 0
+
diff --git a/util.c b/util.c
index 26254c086881fa21c6e9d930fde04f11f68c242b..0af433130bb723dcf4943a865cebda0f7f8e5553 100644 (file)
--- a/util.c
+++ b/util.c
@@ -676,34 +676,6 @@ struct supertype *guess_super(int fd)
        return NULL;
 }
 
-unsigned long long get_component_size(int fd)
-{
-       /* Find out the component size of the array.
-        * We cannot trust GET_ARRAY_INFO ioctl as it's
-        * size field is only 32bits.
-        * So look in /sys/block/mdXXX/md/component_size
-        */
-       struct stat stb;
-       char fname[50];
-       int n;
-       if (fstat(fd, &stb)) return 0;
-       if (major(stb.st_rdev) == 9)
-               sprintf(fname, "/sys/block/md%d/md/component_size",
-                       minor(stb.st_rdev));
-       else
-               sprintf(fname, "/sys/block/md_d%d/md/component_size",
-                       minor(stb.st_rdev)/16);
-       fd = open(fname, O_RDONLY);
-       if (fd < 0)
-               return 0;
-       n = read(fd, fname, sizeof(fname));
-       close(fd);
-       if (n == sizeof(fname))
-               return 0;
-       fname[n] = 0;
-       return strtoull(fname, NULL, 10);
-}
-
 
 #ifdef __TINYC__
 /* tinyc doesn't optimize this check in ioctl.h out ... */