From dfd4d8ee426fb71a369f494f95fe95b114a33c7c Mon Sep 17 00:00:00 2001 From: Neil Brown Date: Tue, 9 Aug 2005 04:25:47 +0000 Subject: [PATCH] Add write-behind support Currently this includes --write-behind to set level of write-behind supported --write-mostly to flag devices as write-mostly. Signed-off-by: Neil Brown --- Build.c | 8 +++++--- ChangeLog | 1 + Create.c | 15 ++++++++++----- Detail.c | 3 +++ Grow.c | 4 ++-- Manage.c | 2 ++ ReadMe.c | 4 ++++ bitmap.c | 8 ++++++++ bitmap.h | 12 ++++++++++-- md_p.h | 5 +++++ mdadm.8 | 33 ++++++++++++++++++++++++++++++--- mdadm.c | 45 ++++++++++++++++++++++++++++++++++++++++----- mdadm.h | 11 +++++++---- super0.c | 26 ++++++++++++++++++-------- super1.c | 14 ++++++++++++-- tests/06wrmostly | 15 +++++++++++++++ 16 files changed, 172 insertions(+), 34 deletions(-) create mode 100644 tests/06wrmostly diff --git a/Build.c b/Build.c index 5537b46d..6489d845 100644 --- a/Build.c +++ b/Build.c @@ -36,7 +36,7 @@ int Build(char *mddev, int mdfd, int chunk, int level, int layout, int raiddisks, mddev_dev_t devlist, int assume_clean, - char *bitmap_file, int bitmap_chunk, int delay) + char *bitmap_file, int bitmap_chunk, int write_behind, int delay) { /* Build a linear or raid0 arrays without superblocks * We cannot really do any checks, we just do it. @@ -164,7 +164,9 @@ int Build(char *mddev, int mdfd, int chunk, int level, int layout, mdu_disk_info_t disk; disk.number = i; disk.raid_disk = i; - disk.state = 6; + disk.state = (1<writemostly) + disk.state |= 1<ss->add_internal_bitmap(super, bitmap_chunk, delay, + if (!st->ss->add_internal_bitmap(super, bitmap_chunk, delay, write_behind, size ? size : maxsize)) { fprintf(stderr, Name ": Given bitmap chunk size not supported.\n"); return 1; @@ -382,7 +382,8 @@ int Create(struct supertype *st, char *mddev, int mdfd, bitmap_chunk = DEFAULT_BITMAP_CHUNK; st->ss->uuid_from_super(uuid, super); - if (CreateBitmap(bitmap_file, force, (char*)uuid, bitmap_chunk, delay, + if (CreateBitmap(bitmap_file, force, (char*)uuid, bitmap_chunk, + delay, write_behind, array.size*2ULL /* FIXME wrong for raid10 */)) { return 1; } @@ -416,14 +417,18 @@ int Create(struct supertype *st, char *mddev, int mdfd, } disk.raid_disk = disk.number; if (disk.raid_disk < raiddisks) - disk.state = 6; /* active and in sync */ + disk.state = (1<writemostly) + disk.state |= (1<devname, "missing")==0) { disk.major = 0; disk.minor = 0; - disk.state = 1; /* faulty */ + disk.state = (1<devname, O_RDONLY|O_EXCL, 0); if (fd < 0) { diff --git a/Detail.c b/Detail.c index 46b483c4..4c93eff9 100644 --- a/Detail.c +++ b/Detail.c @@ -216,6 +216,8 @@ int Detail(char *dev, int brief, int test) for (d= 0; d < max_disks; d++) { mdu_disk_info_t disk; char *dv; + int wonly = disk.state & (1<ss->load_super(st, fd2, &super, NULL)==0) { st->ss->add_internal_bitmap(super, - chunk, delay, + chunk, delay, write_behind, array.size); st->ss->write_bitmap(st, fd2, super); } diff --git a/Manage.c b/Manage.c index 53accd7e..a91e467c 100644 --- a/Manage.c +++ b/Manage.c @@ -266,6 +266,8 @@ int Manage_subdevs(char *devname, int fd, disc.minor = minor(stb.st_rdev); disc.number =j; disc.state = 0; + if (dv->writemostly) + disc.state |= 1 << MD_DISK_WRITEMOSTLY; st->ss->add_to_super(dsuper, &disc); if (st->ss->write_init_super(st, dsuper, &disc, dv->devname)) return 1; diff --git a/ReadMe.c b/ReadMe.c index 1d28adf5..1ba7301b 100644 --- a/ReadMe.c +++ b/ReadMe.c @@ -131,6 +131,8 @@ struct option long_options[] = { {"metadata", 1, 0, 'e'}, /* superblock format */ {"bitmap", 1, 0, 'b'}, {"bitmap-chunk", 1, 0, 4}, + {"write-behind", 2, 0, 5}, + {"write-mostly",0, 0, 'W'}, /* For assemble */ {"uuid", 1, 0, 'u'}, @@ -139,6 +141,7 @@ struct option long_options[] = { {"scan", 0, 0, 's'}, {"force", 0, 0, 'f'}, {"update", 1, 0, 'U'}, + /* Management */ {"add", 0, 0, 'a'}, {"remove", 0, 0, 'r'}, @@ -232,6 +235,7 @@ char OptionHelp[] = " --assume-clean : Assume the array is already in-sync. This is dangerous.\n" " --bitmap-chunk= : chunksize of bitmap in bitmap file (Kilobytes)\n" " --delay= -d : seconds between bitmap updates\n" +" --write-behind= : number of simultaneous write-behind requests to allow (requires bitmap)\n" "\n" " For assemble:\n" " --bitmap= -b : File to find bitmap information in\n" diff --git a/bitmap.c b/bitmap.c index 96a26f9a..0a2ed5d6 100644 --- a/bitmap.c +++ b/bitmap.c @@ -215,6 +215,7 @@ int ExamineBitmap(char *filename, int brief, struct supertype *st) bitmap_super_t *sb; bitmap_info_t *info; int rv = 1; + char buf[64]; info = bitmap_file_read(filename, brief, st); if (!info) @@ -243,6 +244,11 @@ int ExamineBitmap(char *filename, int brief, struct supertype *st) printf(" State : %s\n", bitmap_state(sb->state)); printf(" Chunksize : %s\n", human_chunksize(sb->chunksize)); printf(" Daemon : %ds flush period\n", sb->daemon_sleep); + if (sb->write_behind) + sprintf(buf, "Allow write behind, max %d", sb->write_behind); + else + sprintf(buf, "Normal"); + printf(" Write Mode : %s\n", buf); printf(" Sync Size : %llu%s\n", sb->sync_size/2, human_size(sb->sync_size * 512)); if (brief) @@ -257,6 +263,7 @@ free_info: int CreateBitmap(char *filename, int force, char uuid[16], unsigned long chunksize, unsigned long daemon_sleep, + unsigned long write_behind, unsigned long long array_size) { /* @@ -288,6 +295,7 @@ int CreateBitmap(char *filename, int force, char uuid[16], memcpy(sb.uuid, uuid, 16); sb.chunksize = chunksize; sb.daemon_sleep = daemon_sleep; + sb.write_behind = write_behind; sb.sync_size = array_size; sb_cpu_to_le(&sb); /* convert to on-disk byte ordering */ diff --git a/bitmap.h b/bitmap.h index 811485d1..02a4e97a 100644 --- a/bitmap.h +++ b/bitmap.h @@ -7,7 +7,7 @@ #define BITMAP_H 1 #define BITMAP_MAJOR 3 -#define BITMAP_MINOR 38 +#define BITMAP_MINOR 39 /* * in-memory bitmap: @@ -43,6 +43,13 @@ * When we set a bit, or in the counter (to start a write), if the fields is * 0, we first set the disk bit and set the counter to 1. * + * If the counter is 0, the on-disk bit is clear and the stipe is clean + * Anything that dirties the stipe pushes the counter to 2 (at least) + * and sets the on-disk bit (lazily). + * If a periodic sweep find the counter at 2, it is decremented to 1. + * If the sweep find the counter at 1, the on-disk bit is cleared and the + * counter goes to zero. + * * Also, we'll hijack the "map" pointer itself and use it as two 16 bit block * counters as a fallback when "page" memory cannot be allocated: * @@ -140,8 +147,9 @@ typedef struct bitmap_super_s { __u32 state; /* 48 bitmap state information */ __u32 chunksize; /* 52 the bitmap chunk size in bytes */ __u32 daemon_sleep; /* 56 seconds between disk flushes */ + __u32 write_behind; /* 60 number of outstanding write-behind writes */ - __u8 pad[256 - 60]; /* set to zero */ + __u8 pad[256 - 64]; /* set to zero */ } bitmap_super_t; /* notes: diff --git a/md_p.h b/md_p.h index 31eaafd2..0a0b3815 100644 --- a/md_p.h +++ b/md_p.h @@ -79,6 +79,11 @@ #define MD_DISK_SYNC 2 /* disk is in sync with the raid set */ #define MD_DISK_REMOVED 3 /* disk is in sync with the raid set */ +#define MD_DISK_WRITEMOSTLY 9 /* disk is "write-mostly" is RAID1 config. + * read requests will only be sent here in + * dire need + */ + typedef struct mdp_device_descriptor_s { __u32 number; /* 0 Device number in the entire set */ __u32 major; /* 1 Device major number */ diff --git a/mdadm.8 b/mdadm.8 index 1ecdac29..48d6c53a 100644 --- a/mdadm.8 +++ b/mdadm.8 @@ -204,6 +204,18 @@ with .B --verbose gives an intermediate level of verbosity. +.TP +.BR -W ", " --write-mostly +subsequent devices lists in a +.BR --build , +.BR --create , +or +.B --add +command will be flagged as 'write-mostly'. This is valid for RAID1 +only and means that the 'md' driver will avoid reading from these +devices if at all possible. This can be useful if mirroring over a +slow link. + .TP .BR -b ", " --bitmap= Give the name of a bitmap file to use with this array. Can be used @@ -215,6 +227,15 @@ exist). Set the Chunksize of the bitmap. Each bit corresponds to that many Kilobytes of storage. Default is 4. +.TP +.BR --write-behind= +Specify that write-behind mode should be enabled (valid for RAID1 +only). If an argument is specified, it will set the maximum number +of outstanding writes allowed. The default value is 256. +A write-intent bitmap is required in order to use write-behind +mode, and write-behind is only attempted on drives marked as +.IR write-mostly . + .TP .BR -f ", " --force @@ -1218,9 +1239,15 @@ For this to work, the kernel must support the necessary change. Various types of growth may be added during 2.6 development, possibly including restructuring a raid5 array to have more active devices. -Currently the only support available is to change the "size" attribute -for arrays with redundancy, and the raid-disks attribute of RAID1 -arrays. +Currently the only support available is to +.IP \(bu 4 +change the "size" attribute +for RAID1, RAID5 and RAID6. +.IP \(bu 4 +change the "raid-disks" attribute of RAID1. +.IP \(bu 4 +add a write-intent bitmap to a RAID1 array. +.PP Normally when an array is build the "size" it taken from the smallest of the drives. If all the small drives in an arrays are, one at a diff --git a/mdadm.c b/mdadm.c index 2b2b9be7..4dd65245 100644 --- a/mdadm.c +++ b/mdadm.c @@ -26,7 +26,7 @@ * Sydney, 2052 * Australia * - * Additions for bitmap and async RAID options, Copyright (C) 2003-2004, + * Additions for bitmap and write-behind RAID options, Copyright (C) 2003-2004, * Paul Clements, SteelEye Technology, Inc. */ @@ -60,6 +60,7 @@ int main(int argc, char *argv[]) char devmode = 0; int runstop = 0; int readonly = 0; + int write_behind = 0; int bitmap_fd = -1; char *bitmap_file = NULL; int bitmap_chunk = UnSet; @@ -89,6 +90,7 @@ int main(int argc, char *argv[]) char *pidfile = NULL; int oneshot = 0; struct supertype *ss = NULL; + int writemostly = 0; int copies; @@ -214,6 +216,7 @@ int main(int argc, char *argv[]) } dv->devname = optarg; dv->disposition = devmode; + dv->writemostly = writemostly; dv->next = NULL; *devlistend = dv; devlistend = &dv->next; @@ -262,6 +265,7 @@ int main(int argc, char *argv[]) } dv->devname = optarg; dv->disposition = devmode; + dv->writemostly = writemostly; dv->next = NULL; *devlistend = dv; devlistend = &dv->next; @@ -306,6 +310,13 @@ int main(int argc, char *argv[]) max_disks = ss->max_devs; continue; + case O(MANAGE,'W'): + case O(BUILD,'W'): + case O(CREATE,'W'): + /* set write-mostly for following devices */ + writemostly = 1; + continue; + case O(GROW,'z'): case O(CREATE,'z'): /* size */ if (size >= 0) { @@ -741,6 +752,19 @@ int main(int argc, char *argv[]) /* convert K to B, chunk of 0K means 512B */ bitmap_chunk = bitmap_chunk ? bitmap_chunk * 1024 : 512; continue; + + case O(BUILD, 5): + case O(CREATE, 5): /* write-behind mode */ + write_behind = DEFAULT_MAX_WRITE_BEHIND; + if (optarg) { + write_behind = strtol(optarg, &c, 10); + if (write_behind < 0 || *c || + write_behind > 16383) { + fprintf(stderr, Name ": Invalid value for maximum outstanding write-behind writes: %s.\n\tMust be between 0 and 16383.\n", optarg); + exit(2); + } + } + continue; } /* We have now processed all the valid options. Anything else is * an error @@ -904,6 +928,12 @@ int main(int argc, char *argv[]) case BUILD: if (bitmap_chunk == UnSet) bitmap_chunk = DEFAULT_BITMAP_CHUNK; if (delay == 0) delay = DEFAULT_BITMAP_DELAY; + if (write_behind && !bitmap_file) { + fprintf(stderr, Name ": write-behind mode requires a bitmap.\n"); + rv = 1; + break; + } + if (bitmap_file) { if (strcmp(bitmap_file, "internal")==0) { fprintf(stderr, Name ": 'internal' bitmaps not supported with --build\n"); @@ -918,15 +948,20 @@ int main(int argc, char *argv[]) } if (bitmap_fd < 0) { bitmap_fd = CreateBitmap(bitmap_file, force, NULL, - bitmap_chunk, delay, size); + bitmap_chunk, delay, write_behind, size); } } rv = Build(devlist->devname, mdfd, chunk, level, layout, raiddisks, devlist->next, assume_clean, - bitmap_file, bitmap_chunk, delay); + bitmap_file, bitmap_chunk, write_behind, delay); break; case CREATE: if (delay == 0) delay = DEFAULT_BITMAP_DELAY; + if (write_behind && !bitmap_file) { + fprintf(stderr, Name ": write-behind mode requires a bitmap.\n"); + rv = 1; + break; + } if (ss == NULL) { for(i=0; !ss && superlist[i]; i++) ss = superlist[i]->match_metadata_desc("default"); @@ -939,7 +974,7 @@ int main(int argc, char *argv[]) rv = Create(ss, devlist->devname, mdfd, chunk, level, layout, size<0 ? 0 : size, raiddisks, sparedisks, devs_found-1, devlist->next, runstop, verbose, force, - bitmap_file, bitmap_chunk, delay); + bitmap_file, bitmap_chunk, write_behind, delay); break; case MISC: @@ -1078,7 +1113,7 @@ int main(int argc, char *argv[]) else if (bitmap_file) { if (delay == 0) delay = DEFAULT_BITMAP_DELAY; rv = Grow_addbitmap(devlist->devname, mdfd, bitmap_file, - bitmap_chunk, delay); + bitmap_chunk, delay, write_behind); } else fprintf(stderr, Name ": no changes to --grow\n"); break; diff --git a/mdadm.h b/mdadm.h index fa8ea69b..8b58afc0 100644 --- a/mdadm.h +++ b/mdadm.h @@ -64,6 +64,7 @@ char *strncpy(char *dest, const char *src, size_t n) __THROW; #define DEFAULT_BITMAP_CHUNK 4096 #define DEFAULT_BITMAP_DELAY 5 +#define DEFAULT_MAX_WRITE_BEHIND 256 #include "md_u.h" #include "md_p.h" @@ -134,6 +135,7 @@ typedef struct mddev_dev_s { char disposition; /* 'a' for add, 'r' for remove, 'f' for fail. * Not set for names read from .config */ + char writemostly; struct mddev_dev_s *next; } *mddev_dev_t; @@ -186,7 +188,7 @@ extern struct superswitch { int (*load_super)(struct supertype *st, int fd, void **sbp, char *devname); struct supertype * (*match_metadata_desc)(char *arg); __u64 (*avail_size)(__u64 size); - int (*add_internal_bitmap)(void *sbv, int chunk, int delay, unsigned long long size); + int (*add_internal_bitmap)(void *sbv, int chunk, int delay, int write_behind, unsigned long long size); void (*locate_bitmap)(struct supertype *st, int fd); int (*write_bitmap)(struct supertype *st, int fd, void *sbv); int major; @@ -223,7 +225,7 @@ extern int Manage_reconfig(char *devname, int fd, int layout); extern int Manage_subdevs(char *devname, int fd, mddev_dev_t devlist); extern int Grow_Add_device(char *devname, int fd, char *newdev); -extern int Grow_addbitmap(char *devname, int fd, char *file, int chunk, int delay); +extern int Grow_addbitmap(char *devname, int fd, char *file, int chunk, int delay, int write_behind); extern int Assemble(struct supertype *st, char *mddev, int mdfd, @@ -237,14 +239,14 @@ extern int Assemble(struct supertype *st, char *mddev, int mdfd, extern int Build(char *mddev, int mdfd, int chunk, int level, int layout, int raiddisks, mddev_dev_t devlist, int assume_clean, - char *bitmap_file, int bitmap_chunk, int delay); + char *bitmap_file, int bitmap_chunk, int write_behind, int delay); extern int Create(struct supertype *st, char *mddev, int mdfd, int chunk, int level, int layout, unsigned long size, int raiddisks, int sparedisks, int subdevs, mddev_dev_t devlist, int runstop, int verbose, int force, - char *bitmap_file, int bitmap_chunk, int delay); + char *bitmap_file, int bitmap_chunk, int write_behind, int delay); extern int Detail(char *dev, int brief, int test); extern int Query(char *dev); @@ -259,6 +261,7 @@ extern int Kill(char *dev, int force); extern int CreateBitmap(char *filename, int force, char uuid[16], unsigned long chunksize, unsigned long daemon_sleep, + unsigned long write_behind, unsigned long long array_size); extern int ExamineBitmap(char *filename, int brief, struct supertype *st); diff --git a/super0.c b/super0.c index 7a306b52..e3364390 100644 --- a/super0.c +++ b/super0.c @@ -148,15 +148,19 @@ static void examine_super0(void *sbv) mdp_disk_t *dp; char *dv; char nb[5]; + int wonly; if (d>=0) dp = &sb->disks[d]; else dp = &sb->this_disk; snprintf(nb, sizeof(nb), "%4d", d); printf("%4s %5d %5d %5d %5d ", d < 0 ? "this" : nb, dp->number, dp->major, dp->minor, dp->raid_disk); + wonly = dp->state & (1<state &= ~(1<state & (1<state & (1<state & (1<state & (1<state == 0) printf(" spare"); if ((dv=map_dev(dp->major, dp->minor))) printf(" %s", dv); @@ -312,8 +316,10 @@ static int update_super0(struct mdinfo *info, void *sbv, char *update, char *dev } if (strcmp(update, "assemble")==0) { int d = info->disk.number; + int wonly = sb->disks[d].state & (1<disks[d].state &= ~(1<disks[d].state != info->disk.state) { - sb->disks[d].state = info->disk.state; + sb->disks[d].state = info->disk.state & wonly; rv = 1; } } @@ -467,7 +473,7 @@ static int store_super0(struct supertype *st, int fd, void *sbv) static int write_init_super0(struct supertype *st, void *sbv, mdu_disk_info_t *dinfo, char *devname) { mdp_super_t *sb = sbv; - int fd = open(devname, O_RDWR, O_EXCL); + int fd = open(devname, O_RDWR|O_EXCL); int rv; if (fd < 0) { @@ -485,6 +491,7 @@ static int write_init_super0(struct supertype *st, void *sbv, mdu_disk_info_t *d if (sb->state & (1<state |= (1<magic = __le32_to_cpu(BITMAP_MAGIC); - bms->version = __le32_to_cpu(BITMAP_MAJOR); + bms->magic = __cpu_to_le32(BITMAP_MAGIC); + bms->version = __cpu_to_le32(BITMAP_MAJOR); uuid_from_super0((int*)bms->uuid, sb); - bms->chunksize = __le32_to_cpu(chunk); - bms->daemon_sleep = __le32_to_cpu(delay); - bms->sync_size = __le64_to_cpu(size); + bms->chunksize = __cpu_to_le32(chunk); + bms->daemon_sleep = __cpu_to_le32(delay); + bms->sync_size = __cpu_to_le64(size); + bms->write_behind = __cpu_to_le32(write_behind); @@ -776,6 +785,7 @@ int write_bitmap0(struct supertype *st, int fd, void *sbv) else break; } + fsync(fd); if (towrite) rv = -2; diff --git a/super1.c b/super1.c index 3c24f348..f59eff06 100644 --- a/super1.c +++ b/super1.c @@ -64,7 +64,9 @@ struct mdp_superblock_1 { __u32 dev_number; /* permanent identifier of this device - not role in raid */ __u32 cnt_corrected_read; /* number of read errors that were corrected by re-writing */ __u8 device_uuid[16]; /* user-space setable, ignored by kernel */ - __u8 pad2[64-56]; /* set to 0 when writing */ + __u8 devflags; /* per-device flags. Only one defined...*/ +#define WriteMostly1 1 /* mask for writemostly flag in above */ + __u8 pad2[64-57]; /* set to 0 when writing */ /* array state information - 64 bytes */ __u64 utime; /* 40 bits second, 24 btes microseconds */ @@ -153,6 +155,12 @@ static void examine_super1(void *sbv) if ((i&3)==0 && i != 0) printf(":"); } printf("\n"); + if (sb->devflags) { + printf(" Flags :"); + if (sb->devflags & WriteMostly1) + printf(" write-mostly"); + printf("\n"); + } atime = __le64_to_cpu(sb->utime) & 0xFFFFFFFFFFULL; printf(" Update Time : %.24s\n", ctime(&atime)); @@ -429,7 +437,7 @@ static void add_to_super1(void *sbv, mdu_disk_info_t *dk) { struct mdp_superblock_1 *sb = sbv; __u16 *rp = sb->dev_roles + dk->number; - if (dk->state == 6) /* active, sync */ + if ((dk->state & 6) == 6) /* active, sync */ *rp = __cpu_to_le16(dk->raid_disk); else if ((dk->state & ~2) == 0) /* active or idle -> spare */ *rp = 0xffff; @@ -517,6 +525,8 @@ static int write_init_super1(struct supertype *st, void *sbv, mdu_disk_info_t *d } sb->dev_number = __cpu_to_le32(dinfo->number); + if (dinfo->state & (1<devflags |= WriteMostly1; if ((rfd = open("/dev/urandom", O_RDONLY)) < 0 || read(rfd, sb->device_uuid, 16) != 16) { diff --git a/tests/06wrmostly b/tests/06wrmostly new file mode 100644 index 00000000..51fff60d --- /dev/null +++ b/tests/06wrmostly @@ -0,0 +1,15 @@ +set -e + +# create a raid1 array with a wrmostly device + +$mdadm -CR $md0 -l1 -n3 $dev0 $dev1 --write-mostly $dev2 +sh tests/testdev $md0 1 $mdsize0 64 + +# unfortunately, we cannot measure if any read requests are going to $dev2 + +$mdadm -S $md0 + +$mdadm -CR $md0 -l1 -n3 --write-behind --bitmap=internal $dev0 $dev1 --write-mostly $dev2 +sh tests/testdev $md0 1 $mdsize0 64 +$mdadm -S $md0 + -- 2.39.2