X-Git-Url: http://git.ipfire.org/?a=blobdiff_plain;f=super-ddf.c;h=acfb491655ed7ab37b6fc12b162180fc3683f49b;hb=c42ec1ed43a811ad866be58f8f7fa460e22a3109;hp=b494647e04dab00f9c28dccf094010a6f2076b40;hpb=598f0d58ac3544e25f90f2d34337e15764f4d877;p=thirdparty%2Fmdadm.git diff --git a/super-ddf.c b/super-ddf.c index b494647e..acfb4916 100644 --- a/super-ddf.c +++ b/super-ddf.c @@ -27,6 +27,7 @@ #define HAVE_STDINT_H 1 #include "mdadm.h" +#include "mdmon.h" #include "sha1.h" #include @@ -110,7 +111,7 @@ unsigned long crc32( #define DDF_REVISION "01.00.00" struct ddf_header { - __u32 magic; + __u32 magic; /* DDF_HEADER_MAGIC */ __u32 crc; char guid[DDF_GUID_LEN]; char revision[8]; /* 01.00.00 */ @@ -166,7 +167,7 @@ struct ddf_header { /* The content of the 'controller section' - global scope */ struct ddf_controller_data { - __u32 magic; + __u32 magic; /* DDF_CONTROLLER_MAGIC */ __u32 crc; char guid[DDF_GUID_LEN]; struct controller_type { @@ -182,7 +183,7 @@ struct ddf_controller_data { /* The content of phys_section - global scope */ struct phys_disk { - __u32 magic; + __u32 magic; /* DDF_PHYS_RECORDS_MAGIC */ __u32 crc; __u16 used_pdes; __u16 max_pdes; @@ -201,7 +202,7 @@ struct phys_disk { /* phys_disk_entry.type is a bitmap - bigendian remember */ #define DDF_Forced_PD_GUID 1 #define DDF_Active_in_VD 2 -#define DDF_Global_Spare 4 +#define DDF_Global_Spare 4 /* VD_CONF records are ignored */ #define DDF_Spare 8 /* overrides Global_spare */ #define DDF_Foreign 16 #define DDF_Legacy 32 /* no DDF on this device */ @@ -223,7 +224,7 @@ struct phys_disk { /* The content of the virt_section global scope */ struct virtual_disk { - __u32 magic; + __u32 magic; /* DDF_VIRT_RECORDS_MAGIC */ __u32 crc; __u16 populated_vdes; __u16 max_vdes; @@ -254,6 +255,7 @@ struct virtual_disk { #define DDF_state_deleted 0x2 #define DDF_state_missing 0x3 #define DDF_state_failed 0x4 +#define DDF_state_part_optimal 0x5 #define DDF_state_morphing 0x8 #define DDF_state_inconsistent 0x10 @@ -261,7 +263,8 @@ struct virtual_disk { /* virtual_entry.init_state is a bigendian bitmap */ #define DDF_initstate_mask 0x03 #define DDF_init_not 0x00 -#define DDF_init_quick 0x01 +#define DDF_init_quick 0x01 /* initialisation is progress. + * i.e. 'state_inconsistent' */ #define DDF_init_full 0x02 #define DDF_access_mask 0xc0 @@ -275,7 +278,7 @@ struct virtual_disk { */ struct vd_config { - __u32 magic; + __u32 magic; /* DDF_VD_CONF_MAGIC */ __u32 crc; char guid[DDF_GUID_LEN]; __u32 timestamp; @@ -319,7 +322,7 @@ struct vd_config { #define DDF_cache_rallowed 64 /* enable read caching */ struct spare_assign { - __u32 magic; + __u32 magic; /* DDF_SPARE_ASSIGN_MAGIC */ __u32 crc; __u32 timestamp; __u8 reserved[7]; @@ -341,7 +344,7 @@ struct spare_assign { /* The data_section contents - local scope */ struct disk_data { - __u32 magic; + __u32 magic; /* DDF_PHYS_DATA_MAGIC */ __u32 crc; char guid[DDF_GUID_LEN]; __u32 refnum; /* crc of some magic drive data ... */ @@ -395,20 +398,23 @@ struct ddf_super { struct phys_disk *phys; struct virtual_disk *virt; int pdsize, vdsize; - int max_part; + int max_part, mppe, conf_rec_len; struct vcl { struct vcl *next; __u64 *lba_offset; /* location in 'conf' of * the lba table */ struct vd_config conf; } *conflist, *newconf; + int conf_num; /* Index into 'virt' of entry matching 'newconf' */ struct dl { struct dl *next; struct disk_data disk; int major, minor; char *devname; int fd; - struct vcl *vlist[0]; /* max_part+1 in size */ + int pdnum; /* index in ->phys */ + struct spare_assign *spare; + struct vcl *vlist[0]; /* max_part in size */ } *dlist; }; @@ -416,7 +422,7 @@ struct ddf_super { #define offsetof(t,f) ((size_t)&(((t*)0)->f)) #endif -struct superswitch super_ddf_container, super_ddf_bvd; +extern struct superswitch super_ddf_container, super_ddf_bvd, super_ddf; static int calc_crc(void *buf, int len) { @@ -596,10 +602,16 @@ static int load_ddf_global(int fd, struct ddf_super *super, char *devname) !super->virt) { free(super->phys); free(super->virt); + super->phys = NULL; + super->virt = NULL; return 2; } super->conflist = NULL; super->dlist = NULL; + + super->max_part = __be16_to_cpu(super->active->max_partitions); + super->mppe = __be16_to_cpu(super->active->max_primary_element_entries); + super->conf_rec_len = __be16_to_cpu(super->active->config_record_len); return 0; } @@ -610,13 +622,11 @@ static int load_ddf_local(int fd, struct ddf_super *super, struct stat stb; char *conf; int i; - int conflen; - int mppe; + int vnum; /* First the local disk info */ - super->max_part = __be16_to_cpu(super->active->max_partitions); dl = malloc(sizeof(*dl) + - (super->max_part+1) * sizeof(dl->vlist[0])); + (super->max_part) * sizeof(dl->vlist[0])); load_section(fd, super, &dl->disk, super->active->data_section_offset, @@ -629,29 +639,43 @@ static int load_ddf_local(int fd, struct ddf_super *super, dl->minor = minor(stb.st_rdev); dl->next = super->dlist; dl->fd = keep ? fd : -1; - for (i=0 ; i < super->max_part + 1 ; i++) + dl->spare = NULL; + for (i=0 ; i < super->max_part ; i++) dl->vlist[i] = NULL; super->dlist = dl; + dl->pdnum = 0; + for (i=0; i < __be16_to_cpu(super->active->max_pd_entries); i++) + if (memcmp(super->phys->entries[i].guid, + dl->disk.guid, DDF_GUID_LEN) == 0) + dl->pdnum = i; + /* Now the config list. */ /* 'conf' is an array of config entries, some of which are * probably invalid. Those which are good need to be copied into * the conflist */ - conflen = __be16_to_cpu(super->active->config_record_len); conf = load_section(fd, super, NULL, super->active->config_section_offset, super->active->config_section_length, 0); + vnum = 0; for (i = 0; i < __be32_to_cpu(super->active->config_section_length); - i += conflen) { + i += super->conf_rec_len) { struct vd_config *vd = (struct vd_config *)((char*)conf + i*512); struct vcl *vcl; + if (vd->magic == DDF_SPARE_ASSIGN_MAGIC) { + if (dl->spare) + continue; + dl->spare = malloc(super->conf_rec_len*512); + memcpy(dl->spare, vd, super->conf_rec_len*512); + continue; + } if (vd->magic != DDF_VD_CONF_MAGIC) continue; for (vcl = super->conflist; vcl; vcl = vcl->next) { @@ -661,20 +685,20 @@ static int load_ddf_local(int fd, struct ddf_super *super, } if (vcl) { - dl->vlist[i/conflen] = vcl; + dl->vlist[vnum++] = vcl; if (__be32_to_cpu(vd->seqnum) <= __be32_to_cpu(vcl->conf.seqnum)) continue; } else { - vcl = malloc(conflen*512 + offsetof(struct vcl, conf)); + vcl = malloc(super->conf_rec_len*512 + + offsetof(struct vcl, conf)); vcl->next = super->conflist; super->conflist = vcl; + dl->vlist[vnum++] = vcl; } - memcpy(&vcl->conf, vd, conflen*512); - mppe = __be16_to_cpu(super->anchor.max_primary_element_entries); + memcpy(&vcl->conf, vd, super->conf_rec_len*512); vcl->lba_offset = (__u64*) - &vcl->conf.phys_refnum[mppe]; - dl->vlist[i/conflen] = vcl; + &vcl->conf.phys_refnum[super->mppe]; } free(conf); @@ -693,7 +717,7 @@ static int load_super_ddf(struct supertype *st, int fd, int rv; #ifndef MDASSEMBLE - if (load_super_ddf_all(st, fd, &st->sb, devname, 0) == 0) + if (load_super_ddf_all(st, fd, &st->sb, devname, 1) == 0) return 0; #endif @@ -726,6 +750,7 @@ static int load_super_ddf(struct supertype *st, int fd, sizeof(*super)); return 1; } + memset(super, 0, sizeof(*super)); rv = load_ddf_headers(fd, super, devname); if (rv) { @@ -777,6 +802,8 @@ static void free_super_ddf(struct supertype *st) ddf->dlist = d->next; if (d->fd >= 0) close(d->fd); + if (d->spare) + free(d->spare); free(d); } free(ddf); @@ -892,7 +919,8 @@ static struct num_mapping ddf_level_num[] = { { DDF_RAID0, 0 }, { DDF_RAID1, 1 }, { DDF_RAID3, LEVEL_UNSUPPORTED }, - { DDF_RAID5, 4 }, + { DDF_RAID4, 4 }, + { DDF_RAID5, 5 }, { DDF_RAID1E, LEVEL_UNSUPPORTED }, { DDF_JBOD, LEVEL_UNSUPPORTED }, { DDF_CONCAT, LEVEL_LINEAR }, @@ -943,7 +971,7 @@ static void print_guid(char *guid, int tstamp) static void examine_vd(int n, struct ddf_super *sb, char *guid) { - int crl = __be16_to_cpu(sb->anchor.config_record_len); + int crl = sb->conf_rec_len; struct vcl *vcl; for (vcl = sb->conflist ; vcl ; vcl = vcl->next) { @@ -1112,10 +1140,29 @@ static int match_home_ddf(struct supertype *st, char *homehost) ddf->controller.vendor_data[len] == 0); } -static struct vd_config *find_vdcr(struct ddf_super *ddf) +static struct vd_config *find_vdcr(struct ddf_super *ddf, int inst) +{ + struct vcl *v; + if (inst < 0 || inst > __be16_to_cpu(ddf->virt->populated_vdes)) + return NULL; + for (v = ddf->conflist; v; v = v->next) + if (memcmp(v->conf.guid, + ddf->virt->entries[inst].guid, + DDF_GUID_LEN) == 0) + return &v->conf; + return NULL; +} + +static int find_phys(struct ddf_super *ddf, __u32 phys_refnum) { - /* FIXME this just picks off the first one */ - return &ddf->conflist->conf; + /* Find the entry in phys_disk which has the given refnum + * and return it's index + */ + int i; + for (i=0; i < __be16_to_cpu(ddf->phys->max_pdes); i++) + if (ddf->phys->entries[i].refnum == phys_refnum) + return i; + return -1; } static void uuid_from_super_ddf(struct supertype *st, int uuid[4]) @@ -1137,7 +1184,7 @@ static void uuid_from_super_ddf(struct supertype *st, int uuid[4]) * The first 16 bytes of the sha1 of these is used. */ struct ddf_super *ddf = st->sb; - struct vd_config *vd = find_vdcr(ddf); + struct vd_config *vd = find_vdcr(ddf, ddf->conf_num); if (!vd) memset(uuid, 0, sizeof (uuid)); @@ -1175,18 +1222,25 @@ static void getinfo_super_ddf(struct supertype *st, struct mdinfo *info) info->disk.major = 0; info->disk.minor = 0; - info->disk.number = __be32_to_cpu(ddf->dlist->disk.refnum); -// info->disk.raid_disk = find refnum in the table and use index; - info->disk.raid_disk = -1; - for (i = 0; i < __be16_to_cpu(ddf->phys->max_pdes) ; i++) - if (ddf->phys->entries[i].refnum == ddf->dlist->disk.refnum) { - info->disk.raid_disk = i; - break; - } + if (ddf->dlist) { + info->disk.number = __be32_to_cpu(ddf->dlist->disk.refnum); + info->disk.raid_disk = -1; + for (i = 0; i < __be16_to_cpu(ddf->phys->max_pdes) ; i++) + if (ddf->phys->entries[i].refnum == + ddf->dlist->disk.refnum) { + info->disk.raid_disk = i; + break; + } + } else { + info->disk.number = -1; +// info->disk.raid_disk = find refnum in the table and use index; + } info->disk.state = (1 << MD_DISK_SYNC); info->reshape_active = 0; + strcpy(info->text_version, "ddf"); + // uuid_from_super_ddf(info->uuid, sbv); // info->name[] ?? ; @@ -1207,7 +1261,7 @@ static int rlq_to_layout(int rlq, int prl, int raiddisks); static void getinfo_super_ddf_bvd(struct supertype *st, struct mdinfo *info) { struct ddf_super *ddf = st->sb; - struct vd_config *vd = find_vdcr(ddf); + struct vd_config *vd = find_vdcr(ddf, info->container_member); /* FIXME this returns BVD info - what if we want SVD ?? */ @@ -1232,8 +1286,20 @@ static void getinfo_super_ddf_bvd(struct supertype *st, struct mdinfo *info) // info->disk.raid_disk = find refnum in the table and use index; // info->disk.state = ???; + info->resync_start = 0; + if (!(ddf->virt->entries[info->container_member].state + & DDF_state_inconsistent) && + (ddf->virt->entries[info->container_member].init_state + & DDF_initstate_mask) + == DDF_init_full) + info->resync_start = ~0ULL; + uuid_from_super_ddf(st, info->uuid); + sprintf(info->text_version, "/%s/%d", + devnum2devname(st->container_dev), + info->container_member); + // info->name[] ?? ; } @@ -1280,7 +1346,7 @@ static int update_super_ddf(struct supertype *st, struct mdinfo *info, */ int rv = 0; // struct ddf_super *ddf = st->sb; -// struct vd_config *vd = find_vdcr(ddf); +// struct vd_config *vd = find_vdcr(ddf, info->container_member); // struct virtual_entry *ve = find_ve(ddf); @@ -1420,8 +1486,10 @@ static int init_super_ddf(struct supertype *st, ddf->anchor.max_vd_entries = __cpu_to_be16(max_virt_disks); /* ?? */ ddf->anchor.max_partitions = __cpu_to_be16(64); /* ?? */ ddf->max_part = 64; - ddf->anchor.config_record_len = __cpu_to_be16(1 + 256*12/512); + ddf->conf_rec_len = 1 + 256 * 12 / 512; + ddf->anchor.config_record_len = __cpu_to_be16(ddf->conf_rec_len); ddf->anchor.max_primary_element_entries = __cpu_to_be16(256); + ddf->mppe = 256; memset(ddf->anchor.pad3, 0xff, 54); /* controller sections is one sector long immediately @@ -1493,8 +1561,8 @@ static int init_super_ddf(struct supertype *st, * Remaining 16 are serial number.... maybe a hostname would do? */ memcpy(ddf->controller.guid, T10, sizeof(T10)); - gethostname(hostname, 17); - hostname[17] = 0; + gethostname(hostname, sizeof(hostname)); + hostname[sizeof(hostname) - 1] = 0; hostlen = strlen(hostname); memcpy(ddf->controller.guid + 24 - hostlen, hostname, hostlen); for (i = strlen(T10) ; i+hostlen < 24; i++) @@ -1637,8 +1705,6 @@ static int init_super_ddf_bvd(struct supertype *st, struct virtual_entry *ve; struct vcl *vcl; struct vd_config *vc; - int mppe; - int conflen; if (__be16_to_cpu(ddf->virt->populated_vdes) >= __be16_to_cpu(ddf->virt->max_vdes)) { @@ -1657,6 +1723,7 @@ static int init_super_ddf_bvd(struct supertype *st, return 0; } ve = &ddf->virt->entries[venum]; + ddf->conf_num = venum; /* A Virtual Disk GUID contains the T10 Vendor ID, controller type, * timestamp, random number @@ -1666,10 +1733,12 @@ static int init_super_ddf_bvd(struct supertype *st, ve->pad0 = 0xFFFF; ve->guid_crc = crc32(0, (unsigned char*)ddf->anchor.guid, DDF_GUID_LEN); ve->type = 0; - ve->state = 0; - ve->init_state = 0; - if (!(info->state & 1)) - ve->init_state = DDF_state_inconsistent; + ve->state = DDF_state_degraded; /* Will be modified as devices are added */ + if (info->state & 1) /* clean */ + ve->init_state = DDF_init_full; + else + ve->init_state = DDF_init_not; + memset(ve->pad1, 0xff, 14); memset(ve->name, ' ', 16); if (name) @@ -1678,10 +1747,8 @@ static int init_super_ddf_bvd(struct supertype *st, __cpu_to_be16(__be16_to_cpu(ddf->virt->populated_vdes)+1); /* Now create a new vd_config */ - conflen = __be16_to_cpu(ddf->active->config_record_len); - vcl = malloc(offsetof(struct vcl, conf) + conflen * 512); - mppe = __be16_to_cpu(ddf->anchor.max_primary_element_entries); - vcl->lba_offset = (__u64*) &vcl->conf.phys_refnum[mppe]; + vcl = malloc(offsetof(struct vcl, conf) + ddf->conf_rec_len * 512); + vcl->lba_offset = (__u64*) &vcl->conf.phys_refnum[ddf->mppe]; vc = &vcl->conf; @@ -1721,8 +1788,8 @@ static int init_super_ddf_bvd(struct supertype *st, memset(vc->v3, 0xff, 16); memset(vc->vendor, 0xff, 32); - memset(vc->phys_refnum, 0xff, 4*mppe); - memset(vc->phys_refnum+mppe, 0x00, 8*mppe); + memset(vc->phys_refnum, 0xff, 4*ddf->mppe); + memset(vc->phys_refnum+(ddf->mppe * 4), 0x00, 8*ddf->mppe); vcl->next = ddf->conflist; ddf->conflist = vcl; @@ -1738,13 +1805,15 @@ static void add_to_super_ddf_bvd(struct supertype *st, * We need to find suitable free space in that device and update * the phys_refnum and lba_offset for the newly created vd_config. * We might also want to update the type in the phys_disk - * section. FIXME + * section. */ struct dl *dl; struct ddf_super *ddf = st->sb; struct vd_config *vc; __u64 *lba_offset; - int mppe; + int working; + int i; + int max_virt_disks; for (dl = ddf->dlist; dl ; dl = dl->next) if (dl->major == dk->major && @@ -1754,15 +1823,49 @@ static void add_to_super_ddf_bvd(struct supertype *st, return; vc = &ddf->newconf->conf; + lba_offset = ddf->newconf->lba_offset; vc->phys_refnum[dk->raid_disk] = dl->disk.refnum; - mppe = __be16_to_cpu(ddf->anchor.max_primary_element_entries); - lba_offset = (__u64*)(vc->phys_refnum + mppe); lba_offset[dk->raid_disk] = 0; /* FIXME */ - dl->vlist[0] =ddf->newconf; /* FIXME */ + for (i=0; i < ddf->max_part ; i++) + if (dl->vlist[i] == NULL) + break; + if (i == ddf->max_part) + return; + dl->vlist[i] = ddf->newconf; dl->fd = fd; dl->devname = devname; + + /* Check how many working raid_disks, and if we can mark + * array as optimal yet + */ + working = 0; + + for (i=0; i < __be16_to_cpu(vc->prim_elmnt_count); i++) + if (vc->phys_refnum[i] != 0xffffffff) + working++; + /* Find which virtual_entry */ + max_virt_disks = __be16_to_cpu(ddf->active->max_vd_entries); + for (i=0; i < max_virt_disks ; i++) + if (memcmp(ddf->virt->entries[i].guid, + vc->guid, DDF_GUID_LEN)==0) + break; + if (i == max_virt_disks) + return; + if (working == __be16_to_cpu(vc->prim_elmnt_count)) + ddf->virt->entries[i].state = + (ddf->virt->entries[i].state & ~DDF_state_mask) + | DDF_state_optimal; + + if (vc->prl == DDF_RAID6 && + working+1 == __be16_to_cpu(vc->prim_elmnt_count)) + ddf->virt->entries[i].state = + (ddf->virt->entries[i].state & ~DDF_state_mask) + | DDF_state_part_optimal; + + ddf->phys->entries[dl->pdnum].type &= ~__cpu_to_be16(DDF_Global_Spare); + ddf->phys->entries[dl->pdnum].type |= __cpu_to_be16(DDF_Active_in_VD); } /* add a device to a container, either while creating it or while @@ -1784,12 +1887,13 @@ static void add_to_super_ddf(struct supertype *st, * a phys_disk entry and a more detailed disk_data entry. */ fstat(fd, &stb); - dd = malloc(sizeof(*dd) + sizeof(dd->vlist[0]) * (ddf->max_part+1)); + dd = malloc(sizeof(*dd) + sizeof(dd->vlist[0]) * ddf->max_part); dd->major = major(stb.st_rdev); dd->minor = minor(stb.st_rdev); dd->devname = devname; dd->next = ddf->dlist; dd->fd = fd; + dd->spare = NULL; dd->disk.magic = DDF_PHYS_DATA_MAGIC; now = time(0); @@ -1805,17 +1909,19 @@ static void add_to_super_ddf(struct supertype *st, memset(dd->disk.vendor, ' ', 32); memcpy(dd->disk.vendor, "Linux", 5); memset(dd->disk.pad, 0xff, 442); - for (i = 0; i < ddf->max_part+1 ; i++) + for (i = 0; i < ddf->max_part ; i++) dd->vlist[i] = NULL; n = __be16_to_cpu(ddf->phys->used_pdes); pde = &ddf->phys->entries[n]; + dd->pdnum = n; + n++; ddf->phys->used_pdes = __cpu_to_be16(n); memcpy(pde->guid, dd->disk.guid, DDF_GUID_LEN); pde->refnum = dd->disk.refnum; - pde->type = __cpu_to_be16(DDF_Forced_PD_GUID |DDF_Global_Spare); + pde->type = __cpu_to_be16(DDF_Forced_PD_GUID | DDF_Global_Spare); pde->state = __cpu_to_be16(DDF_Online); get_dev_size(fd, NULL, &size); /* We are required to reserve 32Meg, and record the size in sectors */ @@ -1833,7 +1939,7 @@ static void add_to_super_ddf(struct supertype *st, */ #ifndef MDASSEMBLE -static int write_init_super_ddf(struct supertype *st) +static int __write_init_super_ddf(struct supertype *st, int do_close) { struct ddf_super *ddf = st->sb; @@ -1892,10 +1998,12 @@ static int write_init_super_ddf(struct supertype *st) write(fd, ddf->virt, ddf->vdsize); /* Now write lots of config records. */ - n_config = __be16_to_cpu(ddf->active->max_partitions); - conf_size = __be16_to_cpu(ddf->active->config_record_len) * 512; + n_config = ddf->max_part; + conf_size = ddf->conf_rec_len * 512; for (i = 0 ; i <= n_config ; i++) { struct vcl *c = d->vlist[i]; + if (i == n_config) + c = (struct vcl*)d->spare; if (c) { c->conf.crc = calc_crc(&c->conf, conf_size); @@ -1913,10 +2021,19 @@ static int write_init_super_ddf(struct supertype *st) lseek64(fd, (size-1)*512, SEEK_SET); write(fd, &ddf->anchor, 512); - close(fd); + if (do_close) { + close(fd); + d->fd = -1; + } } return 1; } + +static int write_init_super_ddf(struct supertype *st) +{ + return __write_init_super_ddf(st, 1); +} + #endif static __u64 avail_size_ddf(struct supertype *st, __u64 devsize) @@ -2023,15 +2140,16 @@ int validate_geometry_ddf(struct supertype *st, st->ss = &super_ddf_bvd; if (load_super_ddf_all(st, cfd, (void **)&ddf, NULL, 1) == 0) { st->sb = ddf; + st->container_dev = fd2devnum(cfd); close(cfd); return st->ss->validate_geometry(st, level, layout, raiddisks, chunk, size, dev, freesize); } close(cfd); - } - fprintf(stderr, Name ": Cannot use %s: Already in use\n", - dev); + } else /* device may belong to a different container */ + return 0; + return 1; } @@ -2060,7 +2178,7 @@ int validate_geometry_ddf_container(struct supertype *st, } close(fd); - *freesize = avail_size_ddf(st, ldsize); + *freesize = avail_size_ddf(st, ldsize >> 9); return 1; } @@ -2082,7 +2200,7 @@ int cmp_extent(const void *av, const void *bv) struct extent *get_extents(struct ddf_super *ddf, struct dl *dl) { /* find a list of used extents on the give physical device - * (dnum) or the given ddf. + * (dnum) of the given ddf. * Return a malloced array of 'struct extent' FIXME ignore DDF_Legacy devices? @@ -2093,6 +2211,7 @@ FIXME ignore DDF_Legacy devices? int dnum; int i, j; + /* FIXME this is dl->pdnum */ for (dnum = 0; dnum < ddf->phys->used_pdes; dnum++) if (memcmp(dl->disk.guid, ddf->phys->entries[dnum].guid, @@ -2106,7 +2225,7 @@ FIXME ignore DDF_Legacy devices? if (!rv) return NULL; - for (i = 0; i < ddf->max_part+1; i++) { + for (i = 0; i < ddf->max_part; i++) { struct vcl *v = dl->vlist[i]; if (v == NULL) continue; @@ -2156,6 +2275,7 @@ int validate_geometry_ddf_bvd(struct supertype *st, for (dl = ddf->dlist; dl ; dl = dl->next) { int found = 0; + pos = 0; i = 0; e = get_extents(ddf, dl); @@ -2247,16 +2367,17 @@ static int load_super_ddf_all(struct supertype *st, int fd, super = malloc(sizeof(*super)); if (!super) return 1; + memset(super, 0, sizeof(*super)); /* first, try each device, and choose the best ddf */ for (sd = sra->devs ; sd ; sd = sd->next) { int rv; sprintf(nm, "%d:%d", sd->disk.major, sd->disk.minor); - dfd = dev_open(nm, keep_fd? O_RDWR : O_RDONLY); - if (!dfd) + dfd = dev_open(nm, O_RDONLY); + if (dfd < 0) return 2; rv = load_ddf_headers(dfd, super, NULL); - if (!keep_fd) close(dfd); + close(dfd); if (rv == 0) { seq = __be32_to_cpu(super->active->seq); if (super->active->openflag) @@ -2272,7 +2393,7 @@ static int load_super_ddf_all(struct supertype *st, int fd, /* OK, load this ddf */ sprintf(nm, "%d:%d", best->disk.major, best->disk.minor); dfd = dev_open(nm, O_RDONLY); - if (!dfd) + if (dfd < 0) return 1; load_ddf_headers(dfd, super, NULL); load_ddf_global(dfd, super, NULL); @@ -2281,7 +2402,7 @@ static int load_super_ddf_all(struct supertype *st, int fd, for (sd = sra->devs ; sd ; sd = sd->next) { sprintf(nm, "%d:%d", sd->disk.major, sd->disk.minor); dfd = dev_open(nm, keep_fd? O_RDWR : O_RDONLY); - if (!dfd) + if (dfd < 0) return 2; seq = load_ddf_local(dfd, super, NULL, keep_fd); if (!keep_fd) close(dfd); @@ -2291,6 +2412,7 @@ static int load_super_ddf_all(struct supertype *st, int fd, st->ss = &super_ddf_container; st->minor_version = 0; st->max_devs = 512; + st->container_dev = fd2devnum(fd); } return 0; } @@ -2314,7 +2436,6 @@ static struct mdinfo *container_content_ddf(struct supertype *st) for (vc = ddf->conflist ; vc ; vc=vc->next) { - int mppe; int i; struct mdinfo *this; this = malloc(sizeof(*this)); @@ -2328,8 +2449,8 @@ static struct mdinfo *container_content_ddf(struct supertype *st) this->array.level = map_num1(ddf_level_num, vc->conf.prl); this->array.raid_disks = __be16_to_cpu(vc->conf.prim_elmnt_count); - /* FIXME this should be mapped */ - this->array.layout = vc->conf.rlq; + this->array.layout = rlq_to_layout(vc->conf.rlq, vc->conf.prl, + this->array.raid_disks); this->array.md_minor = -1; this->array.ctime = DECADE + __be32_to_cpu(*(__u32*)(vc->conf.guid+16)); @@ -2341,19 +2462,29 @@ static struct mdinfo *container_content_ddf(struct supertype *st) if (memcmp(ddf->virt->entries[i].guid, vc->conf.guid, DDF_GUID_LEN) == 0) break; - if (ddf->virt->entries[i].state & DDF_state_inconsistent) + if ((ddf->virt->entries[i].state & DDF_state_inconsistent) || + (ddf->virt->entries[i].init_state & DDF_initstate_mask) != + DDF_init_full) { this->array.state = 0; - else + this->resync_start = 0; + } else { this->array.state = 1; + this->resync_start = ~0ULL; + } memcpy(this->name, ddf->virt->entries[i].name, 32); this->name[33]=0; memset(this->uuid, 0, sizeof(this->uuid)); this->component_size = __be64_to_cpu(vc->conf.blocks); this->array.size = this->component_size / 2; + this->container_member = i; - mppe = __be16_to_cpu(ddf->anchor.max_primary_element_entries); - for (i=0 ; i < mppe ; i++) { + sprintf(this->text_version, "/%s/%d", + devnum2devname(st->container_dev), + this->container_member); + + + for (i=0 ; i < ddf->mppe ; i++) { struct mdinfo *dev; struct dl *d; @@ -2438,6 +2569,501 @@ static int compare_super_ddf(struct supertype *st, struct supertype *tst) return 0; } +/* + * A new array 'a' has been started which claims to be instance 'inst' + * within container 'c'. + * We need to confirm that the array matches the metadata in 'c' so + * that we don't corrupt any metadata. + */ +static int ddf_open_new(struct supertype *c, struct active_array *a, char *inst) +{ + fprintf(stderr, "ddf: open_new %s\n", inst); + a->info.container_member = atoi(inst); + return 0; +} + +/* + * The array 'a' is to be marked clean in the metadata. + * If '->resync_start' is not ~(unsigned long long)0, then the array is only + * clean up to the point (in sectors). If that cannot be recorded in the + * metadata, then leave it as dirty. + * + * For DDF, we need to clear the DDF_state_inconsistent bit in the + * !global! virtual_disk.virtual_entry structure. + */ +static void ddf_set_array_state(struct active_array *a, int consistent) +{ + struct ddf_super *ddf = a->container->sb; + int inst = a->info.container_member; + if (consistent) + ddf->virt->entries[inst].state &= ~DDF_state_inconsistent; + else + ddf->virt->entries[inst].state |= DDF_state_inconsistent; + ddf->virt->entries[inst].init_state &= ~DDF_initstate_mask; + if (a->resync_start == ~0ULL) + ddf->virt->entries[inst].init_state |= DDF_init_full; + else if (a->resync_start == 0) + ddf->virt->entries[inst].init_state |= DDF_init_not; + else + ddf->virt->entries[inst].init_state |= DDF_init_quick; + + printf("ddf mark %s %llu\n", consistent?"clean":"dirty", + a->resync_start); +} + +/* + * The state of each disk is stored in the global phys_disk structure + * in phys_disk.entries[n].state. + * This makes various combinations awkward. + * - When a device fails in any array, it must be failed in all arrays + * that include a part of this device. + * - When a component is rebuilding, we cannot include it officially in the + * array unless this is the only array that uses the device. + * + * So: when transitioning: + * Online -> failed, just set failed flag. monitor will propagate + * spare -> online, the device might need to be added to the array. + * spare -> failed, just set failed. Don't worry if in array or not. + */ +static void ddf_set_disk(struct active_array *a, int n, int state) +{ + struct ddf_super *ddf = a->container->sb; + int inst = a->info.container_member; + struct vd_config *vc = find_vdcr(ddf, inst); + int pd = find_phys(ddf, vc->phys_refnum[n]); + int i, st, working; + + if (vc == NULL) { + fprintf(stderr, "ddf: cannot find instance %d!!\n", inst); + return; + } + if (pd < 0) { + /* disk doesn't currently exist. If it is now in_sync, + * insert it. */ + if ((state & DS_INSYNC) && ! (state & DS_FAULTY)) { + /* Find dev 'n' in a->info->devs, determine the + * ddf refnum, and set vc->phys_refnum and update + * phys->entries[] + */ + /* FIXME */ + } + } else { + if (state & DS_FAULTY) + ddf->phys->entries[pd].state |= __cpu_to_be16(DDF_Failed); + if (state & DS_INSYNC) { + ddf->phys->entries[pd].state |= __cpu_to_be16(DDF_Online); + ddf->phys->entries[pd].state &= __cpu_to_be16(~DDF_Rebuilding); + } + } + + fprintf(stderr, "ddf: set_disk %d to %x\n", n, state); + + /* Now we need to check the state of the array and update + * virtual_disk.entries[n].state. + * It needs to be one of "optimal", "degraded", "failed". + * I don't understand 'deleted' or 'missing'. + */ + working = 0; + for (i=0; i < a->info.array.raid_disks; i++) { + pd = find_phys(ddf, vc->phys_refnum[i]); + if (pd < 0) + continue; + st = __be16_to_cpu(ddf->phys->entries[pd].state); + if ((st & (DDF_Online|DDF_Failed|DDF_Rebuilding)) + == DDF_Online) + working++; + } + state = DDF_state_degraded; + if (working == a->info.array.raid_disks) + state = DDF_state_optimal; + else switch(vc->prl) { + case DDF_RAID0: + case DDF_CONCAT: + case DDF_JBOD: + state = DDF_state_failed; + break; + case DDF_RAID1: + if (working == 0) + state = DDF_state_failed; + break; + case DDF_RAID4: + case DDF_RAID5: + if (working < a->info.array.raid_disks-1) + state = DDF_state_failed; + break; + case DDF_RAID6: + if (working < a->info.array.raid_disks-2) + state = DDF_state_failed; + else if (working == a->info.array.raid_disks-1) + state = DDF_state_part_optimal; + break; + } + + ddf->virt->entries[inst].state = + (ddf->virt->entries[inst].state & ~DDF_state_mask) + | state; + +} + +static void ddf_sync_metadata(struct supertype *st) +{ + + /* + * Write all data to all devices. + * Later, we might be able to track whether only local changes + * have been made, or whether any global data has been changed, + * but ddf is sufficiently weird that it probably always + * changes global data .... + */ + __write_init_super_ddf(st, 0); + fprintf(stderr, "ddf: sync_metadata\n"); +} + +static void ddf_process_update(struct supertype *st, + struct metadata_update *update) +{ + /* Apply this update to the metadata. + * The first 4 bytes are a DDF_*_MAGIC which guides + * our actions. + * Possible update are: + * DDF_PHYS_RECORDS_MAGIC + * Add a new physical device. Changes to this record + * only happen implicitly. + * used_pdes is the device number. + * DDF_VIRT_RECORDS_MAGIC + * Add a new VD. Possibly also change the 'access' bits. + * populated_vdes is the entry number. + * DDF_VD_CONF_MAGIC + * New or updated VD. the VIRT_RECORD must already + * exist. For an update, phys_refnum and lba_offset + * (at least) are updated, and the VD_CONF must + * be written to precisely those devices listed with + * a phys_refnum. + * DDF_SPARE_ASSIGN_MAGIC + * replacement Spare Assignment Record... but for which device? + * + * So, e.g.: + * - to create a new array, we send a VIRT_RECORD and + * a VD_CONF. Then assemble and start the array. + * - to activate a spare we send a VD_CONF to add the phys_refnum + * and offset. This will also mark the spare as active with + * a spare-assignment record. + */ + struct ddf_super *ddf = st->sb; + __u32 *magic = (__u32*)update->buf; + struct phys_disk *pd; + struct virtual_disk *vd; + struct vd_config *vc; + struct vcl *vcl; + struct dl *dl; + int mppe; + int ent; + + printf("Process update %x\n", *magic); + + switch (*magic) { + case DDF_PHYS_RECORDS_MAGIC: + + if (update->len != (sizeof(struct phys_disk) + + sizeof(struct phys_disk_entry))) + return; + pd = (struct phys_disk*)update->buf; + + ent = __be16_to_cpu(pd->used_pdes); + if (ent >= __be16_to_cpu(ddf->phys->max_pdes)) + return; + if (!all_ff(ddf->phys->entries[ent].guid)) + return; + ddf->phys->entries[ent] = pd->entries[0]; + ddf->phys->used_pdes = __cpu_to_be16(1 + + __be16_to_cpu(ddf->phys->used_pdes)); + break; + + case DDF_VIRT_RECORDS_MAGIC: + + if (update->len != (sizeof(struct virtual_disk) + + sizeof(struct virtual_entry))) + return; + vd = (struct virtual_disk*)update->buf; + + ent = __be16_to_cpu(vd->populated_vdes); + if (ent >= __be16_to_cpu(ddf->virt->max_vdes)) + return; + if (!all_ff(ddf->virt->entries[ent].guid)) + return; + ddf->virt->entries[ent] = vd->entries[0]; + ddf->virt->populated_vdes = __cpu_to_be16(1 + + __be16_to_cpu(ddf->virt->populated_vdes)); + break; + + case DDF_VD_CONF_MAGIC: + printf("len %d %d\n", update->len, ddf->conf_rec_len); + + mppe = __be16_to_cpu(ddf->anchor.max_primary_element_entries); + if (update->len != ddf->conf_rec_len) + return; + vc = (struct vd_config*)update->buf; + for (vcl = ddf->conflist; vcl ; vcl = vcl->next) + if (memcmp(vcl->conf.guid, vc->guid, DDF_GUID_LEN) == 0) + break; + printf("vcl = %p\n", vcl); + if (vcl) { + /* An update, just copy the phys_refnum and lba_offset + * fields + */ + memcpy(vcl->conf.phys_refnum, vc->phys_refnum, + mppe * (sizeof(__u32) + sizeof(__u64))); + } else { + /* A new VD_CONF */ + vcl = update->space; + update->space = NULL; + vcl->next = ddf->conflist; + vcl->conf = *vc; + vcl->lba_offset = (__u64*) + &vcl->conf.phys_refnum[mppe]; + ddf->conflist = vcl; + } + /* Now make sure vlist is correct for each dl. */ + for (dl = ddf->dlist; dl; dl = dl->next) { + int dn; + int vn = 0; + for (vcl = ddf->conflist; vcl ; vcl = vcl->next) + for (dn=0; dn < ddf->mppe ; dn++) + if (vcl->conf.phys_refnum[dn] == + dl->disk.refnum) { + printf("dev %d has %p at %d\n", + dl->pdnum, vcl, vn); + dl->vlist[vn++] = vcl; + break; + } + while (vn < ddf->max_part) + dl->vlist[vn++] = NULL; + if (dl->vlist[0]) { + ddf->phys->entries[dl->pdnum].type &= + ~__cpu_to_be16(DDF_Global_Spare); + ddf->phys->entries[dl->pdnum].type |= + __cpu_to_be16(DDF_Active_in_VD); + } + if (dl->spare) { + ddf->phys->entries[dl->pdnum].type &= + ~__cpu_to_be16(DDF_Global_Spare); + ddf->phys->entries[dl->pdnum].type |= + __cpu_to_be16(DDF_Spare); + } + if (!dl->vlist[0] && !dl->spare) { + ddf->phys->entries[dl->pdnum].type |= + __cpu_to_be16(DDF_Global_Spare); + ddf->phys->entries[dl->pdnum].type &= + ~__cpu_to_be16(DDF_Spare | + DDF_Active_in_VD); + } + } + break; + case DDF_SPARE_ASSIGN_MAGIC: + default: break; + } +} + +/* + * Check if the array 'a' is degraded but not failed. + * If it is, find as many spares as are available and needed and + * arrange for their inclusion. + * We only choose devices which are not already in the array, + * and prefer those with a spare-assignment to this array. + * otherwise we choose global spares - assuming always that + * there is enough room. + * For each spare that we assign, we return an 'mdinfo' which + * describes the position for the device in the array. + * We also add to 'updates' a DDF_VD_CONF_MAGIC update with + * the new phys_refnum and lba_offset values. + * + * Only worry about BVDs at the moment. + */ +static struct mdinfo *ddf_activate_spare(struct active_array *a, + struct metadata_update **updates) +{ + int working = 0; + struct mdinfo *d; + struct ddf_super *ddf = a->container->sb; + int global_ok = 0; + struct mdinfo *rv = NULL; + struct mdinfo *di; + struct metadata_update *mu; + struct dl *dl; + int i; + struct vd_config *vc; + __u64 *lba; + +/* FIXME, If there is a DS_FAULTY, we want to wait for it to be + * removed. Then only look at DS_REMOVE devices. + * What about !DS_INSYNC - how can that happen? + */ + for (d = a->info.devs ; d ; d = d->next) { + if ((d->curr_state & DS_FAULTY) && + d->state_fd >= 0) + /* wait for Removal to happen */ + return NULL; + if (d->state_fd >= 0) + working ++; + } + + printf("ddf_activate: working=%d (%d) level=%d\n", working, a->info.array.raid_disks, + a->info.array.level); + if (working == a->info.array.raid_disks) + return NULL; /* array not degraded */ + switch (a->info.array.level) { + case 1: + if (working == 0) + return NULL; /* failed */ + break; + case 4: + case 5: + if (working < a->info.array.raid_disks - 1) + return NULL; /* failed */ + break; + case 6: + if (working < a->info.array.raid_disks - 2) + return NULL; /* failed */ + break; + default: /* concat or stripe */ + return NULL; /* failed */ + } + + /* For each slot, if it is not working, find a spare */ + dl = ddf->dlist; + for (i = 0; i < a->info.array.raid_disks; i++) { + for (d = a->info.devs ; d ; d = d->next) + if (d->disk.raid_disk == i) + break; + printf("found %d: %p %x\n", i, d, d?d->curr_state:0); + if (d && (d->state_fd >= 0)) + continue; + + /* OK, this device needs recovery. Find a spare */ + again: + for ( ; dl ; dl = dl->next) { + unsigned long long esize; + unsigned long long pos; + struct mdinfo *d2; + int is_global = 0; + int is_dedicated = 0; + struct extent *ex; + int j; + /* If in this array, skip */ + for (d2 = a->info.devs ; d2 ; d2 = d2->next) + if (d2->disk.major == dl->major && + d2->disk.minor == dl->minor) { + printf("%x:%x already in array\n", dl->major, dl->minor); + break; + } + if (d2) + continue; + if (ddf->phys->entries[dl->pdnum].type & + __cpu_to_be16(DDF_Spare)) { + /* Check spare assign record */ + if (dl->spare) { + if (dl->spare->type & DDF_spare_dedicated) { + /* check spare_ents for guid */ + for (j = 0 ; + j < __be16_to_cpu(dl->spare->populated); + j++) { + if (memcmp(dl->spare->spare_ents[j].guid, + ddf->virt->entries[a->info.container_member].guid, + DDF_GUID_LEN) == 0) + is_dedicated = 1; + } + } else + is_global = 1; + } + } else if (ddf->phys->entries[dl->pdnum].type & + __cpu_to_be16(DDF_Global_Spare)) { + is_global = 1; + } + if ( ! (is_dedicated || + (is_global && global_ok))) { + printf("%x:%x not suitable: %d %d\n", dl->major, dl->minor, + is_dedicated, is_global); + continue; + } + + /* We are allowed to use this device - is there space? + * We need a->info.component_size sectors */ + ex = get_extents(ddf, dl); + if (!ex) { + printf("cannot get extents\n"); + continue; + } + j = 0; pos = 0; + esize = 0; + + do { + esize = ex[j].start - pos; + if (esize >= a->info.component_size) + break; + pos = ex[i].start + ex[i].size; + i++; + } while (ex[i-1].size); + + free(ex); + if (esize < a->info.component_size) { + printf("%x:%x has no room: %llu %llu\n", dl->major, dl->minor, + esize, a->info.component_size); + /* No room */ + continue; + } + + /* Cool, we have a device with some space at pos */ + di = malloc(sizeof(*di)); + memset(di, 0, sizeof(*di)); + di->disk.number = i; + di->disk.raid_disk = i; + di->disk.major = dl->major; + di->disk.minor = dl->minor; + di->disk.state = 0; + di->data_offset = pos; + di->component_size = a->info.component_size; + di->container_member = dl->pdnum; + di->next = rv; + rv = di; + printf("%x:%x to be %d at %llu\n", dl->major, dl->minor, + i, pos); + + break; + } + if (!dl && ! global_ok) { + /* not enough dedicated spares, try global */ + global_ok = 1; + dl = ddf->dlist; + goto again; + } + } + + if (!rv) + /* No spares found */ + return rv; + /* Now 'rv' has a list of devices to return. + * Create a metadata_update record to update the + * phys_refnum and lba_offset values + */ + mu = malloc(sizeof(*mu) + ddf->conf_rec_len * 512); + mu->buf = (char*)(mu+1); + mu->space = malloc(sizeof(struct vcl)); + mu->len = ddf->conf_rec_len; + mu->next = *updates; + vc = find_vdcr(ddf, a->info.container_member); + memcpy(mu->buf, vc, ddf->conf_rec_len * 512); + + vc = (struct vd_config*)mu->buf; + lba = (__u64*)&vc->phys_refnum[ddf->mppe]; + for (di = rv ; di ; di = di->next) { + vc->phys_refnum[di->disk.raid_disk] = + ddf->phys->entries[dl->pdnum].refnum; + lba[di->disk.raid_disk] = di->data_offset; + } + *updates = mu; + return rv; +} + struct superswitch super_ddf = { #ifndef MDASSEMBLE .examine_super = examine_super_ddf, @@ -2466,7 +3092,15 @@ struct superswitch super_ddf = { .major = 1000, .swapuuid = 0, .external = 1, - .text_version = "ddf", + +/* for mdmon */ + .open_new = ddf_open_new, + .set_array_state= ddf_set_array_state, + .set_disk = ddf_set_disk, + .sync_metadata = ddf_sync_metadata, + .process_update = ddf_process_update, + .activate_spare = ddf_activate_spare, + }; /* Super_ddf_container is set by validate_geometry_ddf when given a @@ -2478,17 +3112,19 @@ struct superswitch super_ddf_container = { .write_init_super = write_init_super_ddf, #endif + .load_super = load_super_ddf, .init_super = init_super_ddf, .add_to_super = add_to_super_ddf, + .getinfo_super = getinfo_super_ddf, .free_super = free_super_ddf, .container_content = container_content_ddf, + .getinfo_super_n = getinfo_super_n_container, .major = 1000, .swapuuid = 0, .external = 1, - .text_version = "ddf", }; struct superswitch super_ddf_bvd = { @@ -2512,7 +3148,6 @@ struct superswitch super_ddf_bvd = { .major = 1001, .swapuuid = 0, .external = 2, - .text_version = "ddf", }; struct superswitch super_ddf_svd = { @@ -2531,5 +3166,4 @@ struct superswitch super_ddf_svd = { .major = 1002, .swapuuid = 0, .external = 2, - .text_version = "ddf", };