X-Git-Url: http://git.ipfire.org/?p=thirdparty%2Fmdadm.git;a=blobdiff_plain;f=super-intel.c;h=f5c6b09b93a5b464fa93319316130e3eaa2baf21;hp=618baf68800aa6f6dab39f47d4a120906b507185;hb=f35f25259279573c6274e2783536c0b0a399bdd4;hpb=5802a8118e447833749cbf0fe7b909f3c7d8349d diff --git a/super-intel.c b/super-intel.c index 618baf68..f5c6b09b 100644 --- a/super-intel.c +++ b/super-intel.c @@ -68,8 +68,10 @@ struct imsm_map { __u8 num_members; /* number of member disks */ __u8 reserved[3]; __u32 filler[7]; /* expansion area */ +#define IMSM_ORD_REBUILD (1 << 24) __u32 disk_ord_tbl[1]; /* disk_ord_tbl[num_members], - top byte special */ + * top byte contains some flags + */ } __attribute__ ((packed)); struct imsm_vol { @@ -100,16 +102,43 @@ struct imsm_super { __u32 mpb_size; /* 0x24 - 0x27 Size of MPB */ __u32 family_num; /* 0x28 - 0x2B Checksum from first time this config was written */ __u32 generation_num; /* 0x2C - 0x2F Incremented each time this array's MPB is written */ - __u32 reserved[2]; /* 0x30 - 0x37 */ + __u32 error_log_size; /* 0x30 - 0x33 in bytes */ + __u32 attributes; /* 0x34 - 0x37 */ __u8 num_disks; /* 0x38 Number of configured disks */ __u8 num_raid_devs; /* 0x39 Number of configured volumes */ - __u8 fill[2]; /* 0x3A - 0x3B */ -#define IMSM_FILLERS 39 - __u32 filler[IMSM_FILLERS]; /* 0x3C - 0xD7 RAID_MPB_FILLERS */ + __u8 error_log_pos; /* 0x3A */ + __u8 fill[1]; /* 0x3B */ + __u32 cache_size; /* 0x3c - 0x40 in mb */ + __u32 orig_family_num; /* 0x40 - 0x43 original family num */ + __u32 pwr_cycle_count; /* 0x44 - 0x47 simulated power cycle count for array */ + __u32 bbm_log_size; /* 0x48 - 0x4B - size of bad Block Mgmt Log in bytes */ +#define IMSM_FILLERS 35 + __u32 filler[IMSM_FILLERS]; /* 0x4C - 0xD7 RAID_MPB_FILLERS */ struct imsm_disk disk[1]; /* 0xD8 diskTbl[numDisks] */ /* here comes imsm_dev[num_raid_devs] */ + /* here comes BBM logs */ } __attribute__ ((packed)); +#define BBM_LOG_MAX_ENTRIES 254 + +struct bbm_log_entry { + __u64 defective_block_start; +#define UNREADABLE 0xFFFFFFFF + __u32 spare_block_offset; + __u16 remapped_marked_count; + __u16 disk_ordinal; +} __attribute__ ((__packed__)); + +struct bbm_log { + __u32 signature; /* 0xABADB10C */ + __u32 entry_count; + __u32 reserved_spare_block_count; /* 0 */ + __u32 reserved; /* 0xFFFF */ + __u64 first_spare_lba; + struct bbm_log_entry mapped_block_entries[BBM_LOG_MAX_ENTRIES]; +} __attribute__ ((__packed__)); + + #ifndef MDASSEMBLE static char *map_state_str[] = { "normal", "uninitialized", "degraded", "failed" }; #endif @@ -127,20 +156,28 @@ static unsigned int mpb_sectors(struct imsm_super *mpb) /* internal representation of IMSM metadata */ struct intel_super { union { - struct imsm_super *mpb; - void *buf; + void *buf; /* O_DIRECT buffer for reading/writing metadata */ + struct imsm_super *anchor; /* immovable parameters */ }; + size_t len; /* size of the 'buf' allocation */ + void *next_buf; /* for realloc'ing buf from the manager */ + size_t next_len; int updates_pending; /* count of pending updates for mdmon */ int creating_imsm; /* flag to indicate container creation */ int current_vol; /* index of raid device undergoing creation */ + #define IMSM_MAX_RAID_DEVS 2 + struct imsm_dev *dev_tbl[IMSM_MAX_RAID_DEVS]; struct dl { struct dl *next; int index; __u8 serial[MAX_RAID_SERIAL_LEN]; int major, minor; char *devname; + struct imsm_disk disk; int fd; } *disks; + struct dl *add; /* list of disks to add while mdmon active */ + struct bbm_log *bbm_log; }; struct extent { @@ -151,11 +188,12 @@ struct extent { enum 
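/* The top byte of each disk_ord_tbl entry (see IMSM_ORD_REBUILD above)
 * carries per-slot flags; the low 24 bits are the index into the mpb disk
 * table.  A minimal sketch of splitting an entry, matching what the
 * ord_to_idx() helper added later in this patch computes:
 *
 *	__u32 ord     = __le32_to_cpu(map->disk_ord_tbl[slot]);
 *	__u32 idx     = ord & 0x00ffffff;        (what ord_to_idx(ord) returns)
 *	int   rebuild = (ord & IMSM_ORD_REBUILD) != 0;
 *
 * For example, an entry of 0x01000003 names disk index 3 and flags the
 * slot as rebuilding.
 */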
imsm_update_type { update_activate_spare, update_create_array, + update_add_disk, }; struct imsm_update_activate_spare { enum imsm_update_type type; - int disk_idx; + struct dl *dl; int slot; int array; struct imsm_update_activate_spare *next; @@ -163,10 +201,25 @@ struct imsm_update_activate_spare { struct imsm_update_create_array { enum imsm_update_type type; - struct imsm_dev dev; int dev_idx; + struct imsm_dev dev; +}; + +struct imsm_update_add_disk { + enum imsm_update_type type; }; +static int imsm_env_devname_as_serial(void) +{ + char *val = getenv("IMSM_DEVNAME_AS_SERIAL"); + + if (val && atoi(val) == 1) + return 1; + + return 0; +} + + static struct supertype *match_metadata_desc_imsm(char *arg) { struct supertype *st; @@ -185,19 +238,41 @@ static struct supertype *match_metadata_desc_imsm(char *arg) return st; } +#ifndef MDASSEMBLE static __u8 *get_imsm_version(struct imsm_super *mpb) { return &mpb->sig[MPB_SIG_LEN]; } +#endif -static struct imsm_disk *get_imsm_disk(struct imsm_super *mpb, __u8 index) +/* retrieve a disk directly from the anchor when the anchor is known to be + * up-to-date, currently only at load time + */ +static struct imsm_disk *__get_imsm_disk(struct imsm_super *mpb, __u8 index) { - if (index > mpb->num_disks - 1) + if (index >= mpb->num_disks) return NULL; return &mpb->disk[index]; } -static __u32 gen_imsm_checksum(struct imsm_super *mpb) +#ifndef MDASSEMBLE +/* retrieve a disk from the parsed metadata */ +static struct imsm_disk *get_imsm_disk(struct intel_super *super, __u8 index) +{ + struct dl *d; + + for (d = super->disks; d; d = d->next) + if (d->index == index) + return &d->disk; + + return NULL; +} +#endif + +/* generate a checksum directly from the anchor when the anchor is known to be + * up-to-date, currently only at load or write_super after coalescing + */ +static __u32 __gen_imsm_checksum(struct imsm_super *mpb) { __u32 end = mpb->mpb_size / sizeof(end); __u32 *p = (__u32 *) mpb; @@ -209,29 +284,50 @@ static __u32 gen_imsm_checksum(struct imsm_super *mpb) return sum - __le32_to_cpu(mpb->check_sum); } -static size_t sizeof_imsm_dev(struct imsm_dev *dev) +static size_t sizeof_imsm_map(struct imsm_map *map) { - size_t size = sizeof(*dev); + return sizeof(struct imsm_map) + sizeof(__u32) * (map->num_members - 1); +} + +struct imsm_map *get_imsm_map(struct imsm_dev *dev, int second_map) +{ + struct imsm_map *map = &dev->vol.map[0]; + + if (second_map && !dev->vol.migr_state) + return NULL; + else if (second_map) { + void *ptr = map; + + return ptr + sizeof_imsm_map(map); + } else + return map; + +} - /* each map has disk_ord_tbl[num_members - 1] additional space */ - size += sizeof(__u32) * (dev->vol.map[0].num_members - 1); +/* return the size of the device. 
+ * migr_state increases the returned size if map[0] were to be duplicated + */ +static size_t sizeof_imsm_dev(struct imsm_dev *dev, int migr_state) +{ + size_t size = sizeof(*dev) - sizeof(struct imsm_map) + + sizeof_imsm_map(get_imsm_map(dev, 0)); /* migrating means an additional map */ - if (dev->vol.migr_state) { - size += sizeof(struct imsm_map); - size += sizeof(__u32) * (dev->vol.map[1].num_members - 1); - } + if (dev->vol.migr_state) + size += sizeof_imsm_map(get_imsm_map(dev, 1)); + else if (migr_state) + size += sizeof_imsm_map(get_imsm_map(dev, 0)); return size; } -static struct imsm_dev *get_imsm_dev(struct imsm_super *mpb, __u8 index) +static struct imsm_dev *__get_imsm_dev(struct imsm_super *mpb, __u8 index) { int offset; int i; void *_mpb = mpb; - if (index > mpb->num_raid_devs - 1) + if (index >= mpb->num_raid_devs) return NULL; /* devices start after all disks */ @@ -241,17 +337,42 @@ static struct imsm_dev *get_imsm_dev(struct imsm_super *mpb, __u8 index) if (i == index) return _mpb + offset; else - offset += sizeof_imsm_dev(_mpb + offset); + offset += sizeof_imsm_dev(_mpb + offset, 0); return NULL; } -static __u32 get_imsm_disk_idx(struct imsm_map *map, int slot) +static struct imsm_dev *get_imsm_dev(struct intel_super *super, __u8 index) +{ + if (index >= super->anchor->num_raid_devs) + return NULL; + return super->dev_tbl[index]; +} + +static __u32 get_imsm_ord_tbl_ent(struct imsm_dev *dev, int slot) +{ + struct imsm_map *map; + + if (dev->vol.migr_state) + map = get_imsm_map(dev, 1); + else + map = get_imsm_map(dev, 0); + + /* top byte identifies disk under rebuild */ + return __le32_to_cpu(map->disk_ord_tbl[slot]); +} + +#define ord_to_idx(ord) (((ord) << 8) >> 8) +static __u32 get_imsm_disk_idx(struct imsm_dev *dev, int slot) { - __u32 *ord_tbl = &map->disk_ord_tbl[slot]; + __u32 ord = get_imsm_ord_tbl_ent(dev, slot); - /* top byte is 'special' */ - return __le32_to_cpu(*ord_tbl & ~(0xff << 24)); + return ord_to_idx(ord); +} + +static void set_imsm_ord_tbl_ent(struct imsm_map *map, int slot, __u32 ord) +{ + map->disk_ord_tbl[slot] = __cpu_to_le32(ord); } static int get_imsm_raid_level(struct imsm_map *map) @@ -266,6 +387,7 @@ static int get_imsm_raid_level(struct imsm_map *map) return map->raid_level; } +#ifndef MDASSEMBLE static int cmp_extent(const void *av, const void *bv) { const struct extent *a = av; @@ -280,22 +402,16 @@ static int cmp_extent(const void *av, const void *bv) static struct extent *get_extents(struct intel_super *super, struct dl *dl) { /* find a list of used extents on the given physical device */ - struct imsm_super *mpb = super->mpb; - struct imsm_disk *disk; struct extent *rv, *e; int i, j; int memberships = 0; - disk = get_imsm_disk(mpb, dl->index); - if (!disk) - return NULL; - - for (i = 0; i < mpb->num_raid_devs; i++) { - struct imsm_dev *dev = get_imsm_dev(mpb, i); - struct imsm_map *map = dev->vol.map; + for (i = 0; i < super->anchor->num_raid_devs; i++) { + struct imsm_dev *dev = get_imsm_dev(super, i); + struct imsm_map *map = get_imsm_map(dev, 0); for (j = 0; j < map->num_members; j++) { - __u32 index = get_imsm_disk_idx(map, j); + __u32 index = get_imsm_disk_idx(dev, j); if (index == dl->index) memberships++; @@ -306,12 +422,12 @@ static struct extent *get_extents(struct intel_super *super, struct dl *dl) return NULL; e = rv; - for (i = 0; i < mpb->num_raid_devs; i++) { - struct imsm_dev *dev = get_imsm_dev(mpb, i); - struct imsm_map *map = dev->vol.map; + for (i = 0; i < super->anchor->num_raid_devs; i++) { + struct imsm_dev *dev 
= get_imsm_dev(super, i); + struct imsm_map *map = get_imsm_map(dev, 0); for (j = 0; j < map->num_members; j++) { - __u32 index = get_imsm_disk_idx(map, j); + __u32 index = get_imsm_disk_idx(dev, j); if (index == dl->index) { e->start = __le32_to_cpu(map->pba_of_lba0); @@ -322,29 +438,31 @@ static struct extent *get_extents(struct intel_super *super, struct dl *dl) } qsort(rv, memberships, sizeof(*rv), cmp_extent); - e->start = __le32_to_cpu(disk->total_blocks) - + e->start = __le32_to_cpu(dl->disk.total_blocks) - (MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS); e->size = 0; return rv; } -#ifndef MDASSEMBLE static void print_imsm_dev(struct imsm_dev *dev, int index) { __u64 sz; int slot; - struct imsm_map *map = dev->vol.map; + struct imsm_map *map = get_imsm_map(dev, 0); + __u32 ord; printf("\n"); printf("[%s]:\n", dev->volume); printf(" RAID Level : %d\n", get_imsm_raid_level(map)); printf(" Members : %d\n", map->num_members); for (slot = 0; slot < map->num_members; slot++) - if (index == get_imsm_disk_idx(map, slot)) + if (index == get_imsm_disk_idx(dev, slot)) break; - if (slot < map->num_members) - printf(" This Slot : %d\n", slot); - else + if (slot < map->num_members) { + ord = get_imsm_ord_tbl_ent(dev, slot); + printf(" This Slot : %d%s\n", slot, + ord & IMSM_ORD_REBUILD ? " (out-of-sync)" : ""); + } else printf(" This Slot : ?\n"); sz = __le32_to_cpu(dev->size_high); sz <<= 32; @@ -361,20 +479,31 @@ static void print_imsm_dev(struct imsm_dev *dev, int index) printf(" Chunk Size : %u KiB\n", __le16_to_cpu(map->blocks_per_strip) / 2); printf(" Reserved : %d\n", __le32_to_cpu(dev->reserved_blocks)); - printf(" Migrate State : %s\n", dev->vol.migr_state ? "migrating" : "idle"); + printf(" Migrate State : %s", dev->vol.migr_state ? "migrating" : "idle"); + if (dev->vol.migr_state) + printf(": %s", dev->vol.migr_type ? "rebuilding" : "initializing"); + printf("\n"); + printf(" Map State : %s", map_state_str[map->map_state]); + if (dev->vol.migr_state) { + struct imsm_map *map = get_imsm_map(dev, 1); + printf(" <-- %s", map_state_str[map->map_state]); + } + printf("\n"); printf(" Dirty State : %s\n", dev->vol.dirty ? "dirty" : "clean"); - printf(" Map State : %s\n", map_state_str[map->map_state]); } static void print_imsm_disk(struct imsm_super *mpb, int index) { - struct imsm_disk *disk = get_imsm_disk(mpb, index); - char str[MAX_RAID_SERIAL_LEN]; + struct imsm_disk *disk = __get_imsm_disk(mpb, index); + char str[MAX_RAID_SERIAL_LEN + 1]; __u32 s; __u64 sz; + if (index < 0) + return; + printf("\n"); - snprintf(str, MAX_RAID_SERIAL_LEN, "%s", disk->serial); + snprintf(str, MAX_RAID_SERIAL_LEN + 1, "%s", disk->serial); printf(" Disk%02d Serial : %s\n", index, str); s = __le32_to_cpu(disk->status); printf(" State :%s%s%s%s\n", s&SPARE_DISK ? " spare" : "", @@ -391,7 +520,7 @@ static void print_imsm_disk(struct imsm_super *mpb, int index) static void examine_super_imsm(struct supertype *st, char *homehost) { struct intel_super *super = st->sb; - struct imsm_super *mpb = super->mpb; + struct imsm_super *mpb = super->anchor; char str[MAX_SIGNATURE_LENGTH]; int i; __u32 sum; @@ -404,13 +533,24 @@ static void examine_super_imsm(struct supertype *st, char *homehost) printf(" Generation : %08x\n", __le32_to_cpu(mpb->generation_num)); sum = __le32_to_cpu(mpb->check_sum); printf(" Checksum : %08x %s\n", sum, - gen_imsm_checksum(mpb) == sum ? "correct" : "incorrect"); + __gen_imsm_checksum(mpb) == sum ? 
"correct" : "incorrect"); printf(" MPB Sectors : %d\n", mpb_sectors(mpb)); printf(" Disks : %d\n", mpb->num_disks); printf(" RAID Devices : %d\n", mpb->num_raid_devs); print_imsm_disk(mpb, super->disks->index); + if (super->bbm_log) { + struct bbm_log *log = super->bbm_log; + + printf("\n"); + printf("Bad Block Management Log:\n"); + printf(" Log Size : %d\n", __le32_to_cpu(mpb->bbm_log_size)); + printf(" Signature : %x\n", __le32_to_cpu(log->signature)); + printf(" Entry Count : %d\n", __le32_to_cpu(log->entry_count)); + printf(" Spare Blocks : %d\n", __le32_to_cpu(log->reserved_spare_block_count)); + printf(" First Spare : %llx\n", __le64_to_cpu(log->first_spare_lba)); + } for (i = 0; i < mpb->num_raid_devs; i++) - print_imsm_dev(get_imsm_dev(mpb, i), super->disks->index); + print_imsm_dev(__get_imsm_dev(mpb, i), super->disks->index); for (i = 0; i < mpb->num_disks; i++) { if (i == super->disks->index) continue; @@ -420,11 +560,7 @@ static void examine_super_imsm(struct supertype *st, char *homehost) static void brief_examine_super_imsm(struct supertype *st) { - struct intel_super *super = st->sb; - struct imsm_super *mpb = super->mpb; - - printf("ARRAY /dev/imsm family=%08x metadata=external:imsm\n", - __le32_to_cpu(mpb->family_num)); + printf("ARRAY /dev/imsm metadata=imsm\n"); } static void detail_super_imsm(struct supertype *st, char *homehost) @@ -447,7 +583,13 @@ static int match_home_imsm(struct supertype *st, char *homehost) static void uuid_from_super_imsm(struct supertype *st, int uuid[4]) { - printf("%s\n", __FUNCTION__); + /* imsm does not track uuid's so just make sure we never return + * the same value twice to break uuid matching in Manage_subdevs + * FIXME what about the use of uuid's with bitmap's? + */ + static int dummy_id = 0; + + uuid[0] = dummy_id++; } #if 0 @@ -486,7 +628,7 @@ static int imsm_level_to_layout(int level) return 0; case 5: case 6: - return ALGORITHM_LEFT_SYMMETRIC; + return ALGORITHM_LEFT_ASYMMETRIC; case 10: return 0x102; //FIXME is this correct? 
} @@ -496,9 +638,8 @@ static int imsm_level_to_layout(int level) static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info) { struct intel_super *super = st->sb; - struct imsm_super *mpb = super->mpb; - struct imsm_dev *dev = get_imsm_dev(mpb, super->current_vol); - struct imsm_map *map = &dev->vol.map[0]; + struct imsm_dev *dev = get_imsm_dev(super, super->current_vol); + struct imsm_map *map = get_imsm_map(dev, 0); info->container_member = super->current_vol; info->array.raid_disks = map->num_members; @@ -507,24 +648,37 @@ static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info) info->array.md_minor = -1; info->array.ctime = 0; info->array.utime = 0; - info->array.chunk_size = __le16_to_cpu(map->blocks_per_strip * 512); + info->array.chunk_size = __le16_to_cpu(map->blocks_per_strip) << 9; + info->array.state = !dev->vol.dirty; + + info->disk.major = 0; + info->disk.minor = 0; info->data_offset = __le32_to_cpu(map->pba_of_lba0); info->component_size = __le32_to_cpu(map->blocks_per_member); + memset(info->uuid, 0, sizeof(info->uuid)); - info->disk.major = 0; - info->disk.minor = 0; + if (map->map_state == IMSM_T_STATE_UNINITIALIZED || + dev->vol.dirty || dev->vol.migr_state) + info->resync_start = 0; + else + info->resync_start = ~0ULL; + strncpy(info->name, (char *) dev->volume, MAX_RAID_SERIAL_LEN); + info->name[MAX_RAID_SERIAL_LEN] = 0; + + info->array.major_version = -1; + info->array.minor_version = -2; sprintf(info->text_version, "/%s/%d", devnum2devname(st->container_dev), info->container_member); + info->safe_mode_delay = 4000; /* 4 secs like the Matrix driver */ } static void getinfo_super_imsm(struct supertype *st, struct mdinfo *info) { struct intel_super *super = st->sb; - struct imsm_super *mpb = super->mpb; struct imsm_disk *disk; __u32 s; @@ -532,7 +686,11 @@ static void getinfo_super_imsm(struct supertype *st, struct mdinfo *info) getinfo_super_imsm_volume(st, info); return; } - info->array.raid_disks = mpb->num_disks; + + /* Set raid_disks to zero so that Assemble will always pull in valid + * spares + */ + info->array.raid_disks = 0; info->array.level = LEVEL_CONTAINER; info->array.layout = 0; info->array.md_minor = -1; @@ -544,12 +702,15 @@ static void getinfo_super_imsm(struct supertype *st, struct mdinfo *info) info->disk.minor = 0; info->disk.raid_disk = -1; info->reshape_active = 0; + info->array.major_version = -1; + info->array.minor_version = -2; strcpy(info->text_version, "imsm"); + info->safe_mode_delay = 0; info->disk.number = -1; info->disk.state = 0; if (super->disks) { - disk = get_imsm_disk(mpb, super->disks->index); + disk = &super->disks->disk; info->disk.number = super->disks->index; info->disk.raid_disk = super->disks->index; info->data_offset = __le32_to_cpu(disk->total_blocks) - @@ -648,42 +809,75 @@ static int compare_super_imsm(struct supertype *st, struct supertype *tst) return 0; } - if (memcmp(first->mpb->sig, sec->mpb->sig, MAX_SIGNATURE_LENGTH) != 0) - return 3; - if (first->mpb->family_num != sec->mpb->family_num) - return 3; - if (first->mpb->mpb_size != sec->mpb->mpb_size) - return 3; - if (first->mpb->check_sum != sec->mpb->check_sum) + if (memcmp(first->anchor->sig, sec->anchor->sig, MAX_SIGNATURE_LENGTH) != 0) return 3; + /* if an anchor does not have num_raid_devs set then it is a free + * floating spare + */ + if (first->anchor->num_raid_devs > 0 && + sec->anchor->num_raid_devs > 0) { + if (first->anchor->family_num != sec->anchor->family_num) + return 3; + } + + /* if 'first' is a spare 
promote it to a populated mpb with sec's + * family number + */ + if (first->anchor->num_raid_devs == 0 && + sec->anchor->num_raid_devs > 0) { + first->anchor->num_raid_devs = sec->anchor->num_raid_devs; + first->anchor->family_num = sec->anchor->family_num; + } + return 0; } +static void fd2devname(int fd, char *name) +{ + struct stat st; + char path[256]; + char dname[100]; + char *nm; + int rv; + + name[0] = '\0'; + if (fstat(fd, &st) != 0) + return; + sprintf(path, "/sys/dev/block/%d:%d", + major(st.st_rdev), minor(st.st_rdev)); + + rv = readlink(path, dname, sizeof(dname)); + if (rv <= 0) + return; + + dname[rv] = '\0'; + nm = strrchr(dname, '/'); + nm++; + snprintf(name, MAX_RAID_SERIAL_LEN, "/dev/%s", nm); +} + + extern int scsi_get_serial(int fd, void *buf, size_t buf_len); static int imsm_read_serial(int fd, char *devname, __u8 serial[MAX_RAID_SERIAL_LEN]) { unsigned char scsi_serial[255]; - int sg_fd; int rv; int rsp_len; - int i, cnt; + int len; + char *c, *rsp_buf; memset(scsi_serial, 0, sizeof(scsi_serial)); - sg_fd = sysfs_disk_to_sg(fd); - if (sg_fd < 0) { - if (devname) - fprintf(stderr, - Name ": Failed to open sg interface for %s: %s\n", - devname, strerror(errno)); - return 1; - } + rv = scsi_get_serial(fd, scsi_serial, sizeof(scsi_serial)); - rv = scsi_get_serial(sg_fd, scsi_serial, sizeof(scsi_serial)); - close(sg_fd); + if (rv && imsm_env_devname_as_serial()) { + memset(serial, 0, MAX_RAID_SERIAL_LEN); + fd2devname(fd, (char *) serial); + return 0; + } if (rv != 0) { if (devname) @@ -693,30 +887,62 @@ static int imsm_read_serial(int fd, char *devname, return rv; } + /* trim whitespace */ rsp_len = scsi_serial[3]; - for (i = 0, cnt = 0; i < rsp_len; i++) { - if (!isspace(scsi_serial[4 + i])) - serial[cnt++] = scsi_serial[4 + i]; - if (cnt == MAX_RAID_SERIAL_LEN) - break; - } - - serial[MAX_RAID_SERIAL_LEN - 1] = '\0'; + rsp_buf = (char *) &scsi_serial[4]; + c = rsp_buf; + while (isspace(*c)) + c++; + if (c + MAX_RAID_SERIAL_LEN > rsp_buf + rsp_len) + len = rsp_len - (c - rsp_buf); + else + len = MAX_RAID_SERIAL_LEN; + memcpy(serial, c, len); + c = (char *) &serial[len - 1]; + while (isspace(*c) || *c == '\0') + *c-- = '\0'; return 0; } +static int serialcmp(__u8 *s1, __u8 *s2) +{ + return strncmp((char *) s1, (char *) s2, MAX_RAID_SERIAL_LEN); +} + +static void serialcpy(__u8 *dest, __u8 *src) +{ + strncpy((char *) dest, (char *) src, MAX_RAID_SERIAL_LEN); +} + static int load_imsm_disk(int fd, struct intel_super *super, char *devname, int keep_fd) { - struct imsm_super *mpb = super->mpb; struct dl *dl; struct stat stb; - struct imsm_disk *disk; int rv; int i; + int alloc = 1; + __u8 serial[MAX_RAID_SERIAL_LEN]; + + rv = imsm_read_serial(fd, devname, serial); + + if (rv != 0) + return 2; + + /* check if this is a disk we have seen before. it may be a spare in + * super->disks while the current anchor believes it is a raid member, + * check if we need to update dl->index + */ + for (dl = super->disks; dl; dl = dl->next) + if (serialcmp(dl->serial, serial) == 0) + break; + + if (!dl) + dl = malloc(sizeof(*dl)); + else + alloc = 0; - dl = malloc(sizeof(*dl)); if (!dl) { if (devname) fprintf(stderr, @@ -724,48 +950,154 @@ load_imsm_disk(int fd, struct intel_super *super, char *devname, int keep_fd) devname); return 2; } - memset(dl, 0, sizeof(*dl)); - fstat(fd, &stb); - dl->major = major(stb.st_rdev); - dl->minor = minor(stb.st_rdev); - dl->next = super->disks; - dl->fd = keep_fd ? fd : -1; - dl->devname = devname ? 
strdup(devname) : NULL; - dl->index = -1; - super->disks = dl; - rv = imsm_read_serial(fd, devname, dl->serial); + if (alloc) { + fstat(fd, &stb); + dl->major = major(stb.st_rdev); + dl->minor = minor(stb.st_rdev); + dl->next = super->disks; + dl->fd = keep_fd ? fd : -1; + dl->devname = devname ? strdup(devname) : NULL; + serialcpy(dl->serial, serial); + dl->index = -2; + } else if (keep_fd) { + close(dl->fd); + dl->fd = fd; + } - if (rv != 0) - return 2; + /* look up this disk's index in the current anchor */ + for (i = 0; i < super->anchor->num_disks; i++) { + struct imsm_disk *disk_iter; - /* look up this disk's index */ - for (i = 0; i < mpb->num_disks; i++) { - disk = get_imsm_disk(mpb, i); + disk_iter = __get_imsm_disk(super->anchor, i); + + if (serialcmp(disk_iter->serial, dl->serial) == 0) { + __u32 status; + + dl->disk = *disk_iter; + status = __le32_to_cpu(dl->disk.status); + /* only set index on disks that are a member of a + * populated contianer, i.e. one with raid_devs + */ + if (status & FAILED_DISK) + dl->index = -2; + else if (status & SPARE_DISK) + dl->index = -1; + else + dl->index = i; - if (memcmp(disk->serial, dl->serial, MAX_RAID_SERIAL_LEN) == 0) break; + } } - if (i > mpb->num_disks) - return 2; + if (alloc) + super->disks = dl; + + return 0; +} + +static void imsm_copy_dev(struct imsm_dev *dest, struct imsm_dev *src) +{ + memcpy(dest, src, sizeof_imsm_dev(src, 0)); +} + +#ifndef MDASSEMBLE +/* When migrating map0 contains the 'destination' state while map1 + * contains the current state. When not migrating map0 contains the + * current state. This routine assumes that map[0].map_state is set to + * the current array state before being called. + * + * Migration is indicated by one of the following states + * 1/ Idle (migr_state=0 map0state=normal||unitialized||degraded||failed) + * 2/ Initialize (migr_state=1 migr_type=0 map0state=normal + * map1state=unitialized) + * 3/ Verify (Resync) (migr_state=1 migr_type=1 map0state=normal + * map1state=normal) + * 4/ Rebuild (migr_state=1 migr_type=1 map0state=normal + * map1state=degraded) + */ +static void migrate(struct imsm_dev *dev, __u8 to_state, int rebuild_resync) +{ + struct imsm_map *dest; + struct imsm_map *src = get_imsm_map(dev, 0); + + dev->vol.migr_state = 1; + dev->vol.migr_type = rebuild_resync; + dest = get_imsm_map(dev, 1); - dl->index = i; + memcpy(dest, src, sizeof_imsm_map(src)); + src->map_state = to_state; +} +#endif + +static int parse_raid_devices(struct intel_super *super) +{ + int i; + struct imsm_dev *dev_new; + size_t len, len_migr; + size_t space_needed = 0; + struct imsm_super *mpb = super->anchor; + + for (i = 0; i < super->anchor->num_raid_devs; i++) { + struct imsm_dev *dev_iter = __get_imsm_dev(super->anchor, i); + + len = sizeof_imsm_dev(dev_iter, 0); + len_migr = sizeof_imsm_dev(dev_iter, 1); + if (len_migr > len) + space_needed += len_migr - len; + + dev_new = malloc(len_migr); + if (!dev_new) + return 1; + imsm_copy_dev(dev_new, dev_iter); + super->dev_tbl[i] = dev_new; + } + + /* ensure that super->buf is large enough when all raid devices + * are migrating + */ + if (__le32_to_cpu(mpb->mpb_size) + space_needed > super->len) { + void *buf; + + len = ROUND_UP(__le32_to_cpu(mpb->mpb_size) + space_needed, 512); + if (posix_memalign(&buf, 512, len) != 0) + return 1; + memcpy(buf, super->buf, len); + free(super->buf); + super->buf = buf; + super->len = len; + } + return 0; } +/* retrieve a pointer to the bbm log which starts after all raid devices */ +struct bbm_log 
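/* The BBM log is carved out of the tail of the mpb itself.  As a worked
 * example (sizes illustrative, in bytes): with mpb_size = 0x800 and
 * bbm_log_size = 0x100, the log occupies the final 0x100 bytes, i.e. it
 * starts at byte offset 0x700 from the anchor, which is exactly the
 * pointer arithmetic performed just below:
 *
 *	void *log = (void *) mpb + mpb_size - bbm_log_size;
 *
 * Later in this patch container_content_imsm() declines to assemble any
 * array while bbm_log_size is non-zero.
 */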
*__get_imsm_bbm_log(struct imsm_super *mpb) +{ + void *ptr = NULL; + + if (__le32_to_cpu(mpb->bbm_log_size)) { + ptr = mpb; + ptr += mpb->mpb_size - __le32_to_cpu(mpb->bbm_log_size); + } + + return ptr; +} + +static void __free_imsm(struct intel_super *super, int free_disks); + /* load_imsm_mpb - read matrix metadata * allocates super->mpb to be freed by free_super */ static int load_imsm_mpb(int fd, struct intel_super *super, char *devname) { unsigned long long dsize; - size_t len, mpb_size; unsigned long long sectors; struct stat; struct imsm_super *anchor; __u32 check_sum; + int rc; get_dev_size(fd, NULL, &dsize); @@ -777,15 +1109,14 @@ static int load_imsm_mpb(int fd, struct intel_super *super, char *devname) return 1; } - len = 512; - if (posix_memalign((void**)&anchor, 512, len) != 0) { + if (posix_memalign((void**)&anchor, 512, 512) != 0) { if (devname) fprintf(stderr, Name ": Failed to allocate imsm anchor buffer" " on %s\n", devname); return 1; } - if (read(fd, anchor, len) != len) { + if (read(fd, anchor, 512) != 512) { if (devname) fprintf(stderr, Name ": Cannot read anchor block on %s: %s\n", @@ -802,22 +1133,26 @@ static int load_imsm_mpb(int fd, struct intel_super *super, char *devname) return 2; } - mpb_size = __le32_to_cpu(anchor->mpb_size); - mpb_size = ROUND_UP(mpb_size, 512); - if (posix_memalign(&super->buf, 512, mpb_size) != 0) { + __free_imsm(super, 0); + super->len = ROUND_UP(anchor->mpb_size, 512); + if (posix_memalign(&super->buf, 512, super->len) != 0) { if (devname) fprintf(stderr, Name ": unable to allocate %zu byte mpb buffer\n", - mpb_size); + super->len); free(anchor); return 2; } - memcpy(super->buf, anchor, len); + memcpy(super->buf, anchor, 512); sectors = mpb_sectors(anchor) - 1; free(anchor); - if (!sectors) - return load_imsm_disk(fd, super, devname, 0); + if (!sectors) { + rc = load_imsm_disk(fd, super, devname, 0); + if (rc == 0) + rc = parse_raid_devices(super); + return rc; + } /* read the extended mpb */ if (lseek64(fd, dsize - (512 * (2 + sectors)), SEEK_SET) < 0) { @@ -828,8 +1163,7 @@ static int load_imsm_mpb(int fd, struct intel_super *super, char *devname) return 1; } - len = mpb_size - 512; - if (read(fd, super->buf + 512, len) != len) { + if (read(fd, super->buf + 512, super->len - 512) != super->len - 512) { if (devname) fprintf(stderr, Name ": Cannot read extended mpb on %s: %s\n", @@ -837,42 +1171,72 @@ static int load_imsm_mpb(int fd, struct intel_super *super, char *devname) return 2; } - check_sum = gen_imsm_checksum(super->mpb); - if (check_sum != __le32_to_cpu(super->mpb->check_sum)) { + check_sum = __gen_imsm_checksum(super->anchor); + if (check_sum != __le32_to_cpu(super->anchor->check_sum)) { if (devname) fprintf(stderr, Name ": IMSM checksum %x != %x on %s\n", - check_sum, __le32_to_cpu(super->mpb->check_sum), + check_sum, __le32_to_cpu(super->anchor->check_sum), devname); return 2; } - return load_imsm_disk(fd, super, devname, 0); + /* FIXME the BBM log is disk specific so we cannot use this global + * buffer for all disks. 
Ok for now since we only look at the global + * bbm_log_size parameter to gate assembly + */ + super->bbm_log = __get_imsm_bbm_log(super->anchor); + + rc = load_imsm_disk(fd, super, devname, 0); + if (rc == 0) + rc = parse_raid_devices(super); + + return rc; } +static void __free_imsm_disk(struct dl *d) +{ + if (d->fd >= 0) + close(d->fd); + if (d->devname) + free(d->devname); + free(d); + +} static void free_imsm_disks(struct intel_super *super) { while (super->disks) { struct dl *d = super->disks; super->disks = d->next; - if (d->fd >= 0) - close(d->fd); - if (d->devname) - free(d->devname); - free(d); + __free_imsm_disk(d); } } +/* free all the pieces hanging off of a super pointer */ +static void __free_imsm(struct intel_super *super, int free_disks) +{ + int i; + + if (super->buf) { + free(super->buf); + super->buf = NULL; + } + if (free_disks) + free_imsm_disks(super); + for (i = 0; i < IMSM_MAX_RAID_DEVS; i++) + if (super->dev_tbl[i]) { + free(super->dev_tbl[i]); + super->dev_tbl[i] = NULL; + } +} + static void free_imsm(struct intel_super *super) { - if (super->mpb) - free(super->mpb); - free_imsm_disks(super); + __free_imsm(super, 1); free(super); } - static void free_super_imsm(struct supertype *st) { struct intel_super *super = st->sb; @@ -924,7 +1288,7 @@ static int load_super_imsm_all(struct supertype *st, int fd, void **sbp, if (!super) return 1; - /* find the most up to date disk in this array */ + /* find the most up to date disk in this array, skipping spares */ for (sd = sra->devs; sd; sd = sd->next) { sprintf(nm, "%d:%d", sd->disk.major, sd->disk.minor); dfd = dev_open(nm, keep_fd ? O_RDWR : O_RDONLY); @@ -936,7 +1300,10 @@ static int load_super_imsm_all(struct supertype *st, int fd, void **sbp, if (!keep_fd) close(dfd); if (rv == 0) { - gen = __le32_to_cpu(super->mpb->generation_num); + if (super->anchor->num_raid_devs == 0) + gen = 0; + else + gen = __le32_to_cpu(super->anchor->generation_num); if (!best || gen > bestgen) { bestgen = gen; best = sd; @@ -966,10 +1333,7 @@ static int load_super_imsm_all(struct supertype *st, int fd, void **sbp, return 2; } - /* reset the disk list */ - free_imsm_disks(super); - - /* populate disk list */ + /* re-parse the disk list with the current anchor */ for (sd = sra->devs ; sd ; sd = sd->next) { sprintf(nm, "%d:%d", sd->disk.major, sd->disk.minor); dfd = dev_open(nm, keep_fd? O_RDWR : O_RDONLY); @@ -983,18 +1347,18 @@ static int load_super_imsm_all(struct supertype *st, int fd, void **sbp, } if (st->subarray[0]) { - if (atoi(st->subarray) <= super->mpb->num_raid_devs) + if (atoi(st->subarray) <= super->anchor->num_raid_devs) super->current_vol = atoi(st->subarray); else return 1; } *sbp = super; + st->container_dev = fd2devnum(fd); if (st->ss == NULL) { st->ss = &super_imsm; st->minor_version = 0; st->max_devs = IMSM_MAX_DEVICES; - st->container_dev = fd2devnum(fd); } return 0; @@ -1060,6 +1424,11 @@ static __u32 info_to_num_data_stripes(mdu_array_info_t *info) return num_stripes; } +static __u32 info_to_blocks_per_member(mdu_array_info_t *info) +{ + return (info->size * 2) & ~(info_to_blocks_per_strip(info) - 1); +} + static int init_super_imsm_volume(struct supertype *st, mdu_array_info_t *info, unsigned long long size, char *name, char *homehost, int *uuid) @@ -1068,7 +1437,7 @@ static int init_super_imsm_volume(struct supertype *st, mdu_array_info_t *info, * so st->sb is already set. 
*/ struct intel_super *super = st->sb; - struct imsm_super *mpb = super->mpb; + struct imsm_super *mpb = super->anchor; struct imsm_dev *dev; struct imsm_vol *vol; struct imsm_map *map; @@ -1098,14 +1467,23 @@ static int init_super_imsm_volume(struct supertype *st, mdu_array_info_t *info, memcpy(mpb_new, mpb, size_old); free(mpb); mpb = mpb_new; - super->mpb = mpb_new; + super->anchor = mpb_new; mpb->mpb_size = __cpu_to_le32(size_new); memset(mpb_new + size_old, 0, size_round - size_old); } super->current_vol = idx; + /* when creating the first raid device in this container set num_disks + * to zero, i.e. delete this spare and add raid member devices in + * add_to_super_imsm_volume() + */ + if (super->current_vol == 0) + mpb->num_disks = 0; sprintf(st->subarray, "%d", idx); - mpb->num_raid_devs++; - dev = get_imsm_dev(mpb, idx); + dev = malloc(sizeof(*dev) + sizeof(__u32) * (info->raid_disks - 1)); + if (!dev) { + fprintf(stderr, Name": could not allocate raid device\n"); + return 0; + } strncpy((char *) dev->volume, name, MAX_RAID_SERIAL_LEN); array_blocks = calc_array_size(info->level, info->raid_disks, info->layout, info->chunk_size, @@ -1119,15 +1497,15 @@ static int init_super_imsm_volume(struct supertype *st, mdu_array_info_t *info, vol->migr_type = 0; vol->dirty = 0; for (i = 0; i < idx; i++) { - struct imsm_dev *prev = get_imsm_dev(mpb, i); - struct imsm_map *pmap = &prev->vol.map[0]; + struct imsm_dev *prev = get_imsm_dev(super, i); + struct imsm_map *pmap = get_imsm_map(prev, 0); offset += __le32_to_cpu(pmap->blocks_per_member); offset += IMSM_RESERVED_SECTORS; } - map = &vol->map[0]; + map = get_imsm_map(dev, 0); map->pba_of_lba0 = __cpu_to_le32(offset); - map->blocks_per_member = __cpu_to_le32(info->size * 2); + map->blocks_per_member = __cpu_to_le32(info_to_blocks_per_member(info)); map->blocks_per_strip = __cpu_to_le16(info_to_blocks_per_strip(info)); map->num_data_stripes = __cpu_to_le32(info_to_num_data_stripes(info)); map->map_state = info->level ? IMSM_T_STATE_UNINITIALIZED : @@ -1146,8 +1524,10 @@ static int init_super_imsm_volume(struct supertype *st, mdu_array_info_t *info, map->num_members = info->raid_disks; for (i = 0; i < map->num_members; i++) { /* initialized in add_to_super */ - map->disk_ord_tbl[i] = __cpu_to_le32(0); + set_imsm_ord_tbl_ent(map, i, 0); } + mpb->num_raid_devs++; + super->dev_tbl[super->current_vol] = dev; return 1; } @@ -1196,40 +1576,54 @@ static int init_super_imsm(struct supertype *st, mdu_array_info_t *info, return 1; } +#ifndef MDASSEMBLE static void add_to_super_imsm_volume(struct supertype *st, mdu_disk_info_t *dk, int fd, char *devname) { struct intel_super *super = st->sb; - struct imsm_super *mpb = super->mpb; + struct imsm_super *mpb = super->anchor; struct dl *dl; struct imsm_dev *dev; struct imsm_map *map; - struct imsm_disk *disk; __u32 status; - dev = get_imsm_dev(mpb, super->current_vol); - map = &dev->vol.map[0]; + dev = get_imsm_dev(super, super->current_vol); + map = get_imsm_map(dev, 0); for (dl = super->disks; dl ; dl = dl->next) if (dl->major == dk->major && dl->minor == dk->minor) break; + if (!dl || ! 
(dk->state & (1<disk_ord_tbl[dk->number] = __cpu_to_le32(dl->index); - - disk = get_imsm_disk(mpb, dl->index); + /* add a pristine spare to the metadata */ + if (dl->index < 0) { + dl->index = super->anchor->num_disks; + super->anchor->num_disks++; + } + set_imsm_ord_tbl_ent(map, dk->number, dl->index); status = CONFIGURED_DISK | USABLE_DISK; - disk->status = __cpu_to_le32(status); + dl->disk.status = __cpu_to_le32(status); + + /* if we are creating the first raid device update the family number */ + if (super->current_vol == 0) { + __u32 sum; + struct imsm_dev *_dev = __get_imsm_dev(mpb, 0); + struct imsm_disk *_disk = __get_imsm_disk(mpb, dl->index); + + *_dev = *dev; + *_disk = dl->disk; + sum = __gen_imsm_checksum(mpb); + mpb->family_num = __cpu_to_le32(sum); + } } static void add_to_super_imsm(struct supertype *st, mdu_disk_info_t *dk, int fd, char *devname) { struct intel_super *super = st->sb; - struct imsm_super *mpb = super->mpb; - struct imsm_disk *disk; struct dl *dd; unsigned long long size; __u32 status, id; @@ -1251,118 +1645,212 @@ static void add_to_super_imsm(struct supertype *st, mdu_disk_info_t *dk, memset(dd, 0, sizeof(*dd)); dd->major = major(stb.st_rdev); dd->minor = minor(stb.st_rdev); - dd->index = dk->number; + dd->index = -1; dd->devname = devname ? strdup(devname) : NULL; - dd->next = super->disks; dd->fd = fd; rv = imsm_read_serial(fd, devname, dd->serial); if (rv) { fprintf(stderr, - Name ": failed to retrieve scsi serial " - "using \'%s\' instead\n", devname); - strcpy((char *) dd->serial, devname); + Name ": failed to retrieve scsi serial, aborting\n"); + free(dd); + abort(); } - if (mpb->num_disks <= dk->number) - mpb->num_disks = dk->number + 1; - - disk = get_imsm_disk(mpb, dk->number); get_dev_size(fd, NULL, &size); size /= 512; status = USABLE_DISK | SPARE_DISK; - strcpy((char *) disk->serial, (char *) dd->serial); - disk->total_blocks = __cpu_to_le32(size); - disk->status = __cpu_to_le32(status); + serialcpy(dd->disk.serial, dd->serial); + dd->disk.total_blocks = __cpu_to_le32(size); + dd->disk.status = __cpu_to_le32(status); if (sysfs_disk_to_scsi_id(fd, &id) == 0) - disk->scsi_id = __cpu_to_le32(id); + dd->disk.scsi_id = __cpu_to_le32(id); else - disk->scsi_id = __cpu_to_le32(0); + dd->disk.scsi_id = __cpu_to_le32(0); - /* update the family number if we are creating a container */ - if (super->creating_imsm) - mpb->family_num = __cpu_to_le32(gen_imsm_checksum(mpb)); - - super->disks = dd; + if (st->update_tail) { + dd->next = super->add; + super->add = dd; + } else { + dd->next = super->disks; + super->disks = dd; + } } static int store_imsm_mpb(int fd, struct intel_super *super); +/* spare records have their own family number and do not have any defined raid + * devices + */ +static int write_super_imsm_spares(struct intel_super *super, int doclose) +{ + struct imsm_super mpb_save; + struct imsm_super *mpb = super->anchor; + __u32 sum; + struct dl *d; + + mpb_save = *mpb; + mpb->num_raid_devs = 0; + mpb->num_disks = 1; + mpb->mpb_size = sizeof(struct imsm_super); + mpb->generation_num = __cpu_to_le32(1UL); + + for (d = super->disks; d; d = d->next) { + if (d->index != -1) + continue; + + mpb->disk[0] = d->disk; + sum = __gen_imsm_checksum(mpb); + mpb->family_num = __cpu_to_le32(sum); + sum = __gen_imsm_checksum(mpb); + mpb->check_sum = __cpu_to_le32(sum); + + if (store_imsm_mpb(d->fd, super)) { + fprintf(stderr, "%s: failed for device %d:%d %s\n", + __func__, d->major, d->minor, strerror(errno)); + *mpb = mpb_save; + return 1; + } + if (doclose) { + 
close(d->fd); + d->fd = -1; + } + } + + *mpb = mpb_save; + return 0; +} + static int write_super_imsm(struct intel_super *super, int doclose) { - struct imsm_super *mpb = super->mpb; + struct imsm_super *mpb = super->anchor; struct dl *d; __u32 generation; __u32 sum; + int spares = 0; + int i; + __u32 mpb_size = sizeof(struct imsm_super) - sizeof(struct imsm_disk); /* 'generation' is incremented everytime the metadata is written */ generation = __le32_to_cpu(mpb->generation_num); generation++; mpb->generation_num = __cpu_to_le32(generation); + for (d = super->disks; d; d = d->next) { + if (d->index == -1) + spares++; + else { + mpb->disk[d->index] = d->disk; + mpb_size += sizeof(struct imsm_disk); + } + } + + for (i = 0; i < mpb->num_raid_devs; i++) { + struct imsm_dev *dev = __get_imsm_dev(mpb, i); + + imsm_copy_dev(dev, super->dev_tbl[i]); + mpb_size += sizeof_imsm_dev(dev, 0); + } + mpb_size += __le32_to_cpu(mpb->bbm_log_size); + mpb->mpb_size = __cpu_to_le32(mpb_size); + /* recalculate checksum */ - sum = gen_imsm_checksum(mpb); + sum = __gen_imsm_checksum(mpb); mpb->check_sum = __cpu_to_le32(sum); + /* write the mpb for disks that compose raid devices */ for (d = super->disks; d ; d = d->next) { - if (store_imsm_mpb(d->fd, super)) { + if (d->index < 0) + continue; + if (store_imsm_mpb(d->fd, super)) fprintf(stderr, "%s: failed for device %d:%d %s\n", __func__, d->major, d->minor, strerror(errno)); - return 0; - } if (doclose) { close(d->fd); d->fd = -1; } } - return 1; + if (spares) + return write_super_imsm_spares(super, doclose); + + return 0; +} + + +static int create_array(struct supertype *st) +{ + size_t len; + struct imsm_update_create_array *u; + struct intel_super *super = st->sb; + struct imsm_dev *dev = get_imsm_dev(super, super->current_vol); + + len = sizeof(*u) - sizeof(*dev) + sizeof_imsm_dev(dev, 0); + u = malloc(len); + if (!u) { + fprintf(stderr, "%s: failed to allocate update buffer\n", + __func__); + return 1; + } + + u->type = update_create_array; + u->dev_idx = super->current_vol; + imsm_copy_dev(&u->dev, dev); + append_metadata_update(st, u, len); + + return 0; +} + +static int _add_disk(struct supertype *st) +{ + struct intel_super *super = st->sb; + size_t len; + struct imsm_update_add_disk *u; + + if (!super->add) + return 0; + + len = sizeof(*u); + u = malloc(len); + if (!u) { + fprintf(stderr, "%s: failed to allocate update buffer\n", + __func__); + return 1; + } + + u->type = update_add_disk; + append_metadata_update(st, u, len); + + return 0; } static int write_init_super_imsm(struct supertype *st) { if (st->update_tail) { - /* queue the recently created array as a metadata update */ - size_t len; - struct imsm_update_create_array *u; + /* queue the recently created array / added disk + * as a metadata update */ struct intel_super *super = st->sb; - struct imsm_super *mpb = super->mpb; - struct imsm_dev *dev; - struct imsm_map *map; struct dl *d; + int rv; - if (super->current_vol < 0 || - !(dev = get_imsm_dev(mpb, super->current_vol))) { - fprintf(stderr, "%s: could not determine sub-array\n", - __func__); - return 1; - } - - - map = &dev->vol.map[0]; - len = sizeof(*u) + sizeof(__u32) * (map->num_members - 1); - u = malloc(len); - if (!u) { - fprintf(stderr, "%s: failed to allocate update buffer\n", - __func__); - return 1; - } - - u->type = update_create_array; - u->dev_idx = super->current_vol; - memcpy(&u->dev, dev, sizeof(*dev)); - memcpy(u->dev.vol.map[0].disk_ord_tbl, map->disk_ord_tbl, - sizeof(__u32) * map->num_members); - 
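/* For orientation, the mpb_size that write_super_imsm() above recomputes
 * before writing can be pictured as follows ("indexed_disks" is a made-up
 * name for the count of members whose index is not -1):
 *
 *	mpb_size  = sizeof(struct imsm_super) - sizeof(struct imsm_disk);
 *	mpb_size += indexed_disks * sizeof(struct imsm_disk);
 *	for each raid device i:
 *		mpb_size += sizeof_imsm_dev(__get_imsm_dev(mpb, i), 0);
 *	mpb_size += __le32_to_cpu(mpb->bbm_log_size);
 *
 * i.e. the fixed header minus its single embedded disk slot, one
 * imsm_disk per indexed member, each raid device record, then the BBM log.
 */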
append_metadata_update(st, u, len); + /* determine if we are creating a volume or adding a disk */ + if (super->current_vol < 0) { + /* in the add disk case we are running in mdmon + * context, so don't close fd's + */ + return _add_disk(st); + } else + rv = create_array(st); for (d = super->disks; d ; d = d->next) { close(d->fd); d->fd = -1; } - return 0; + return rv; } else return write_super_imsm(st->sb, 1); } +#endif static int store_zero_imsm(struct supertype *st, int fd) { @@ -1378,12 +1866,18 @@ static int store_zero_imsm(struct supertype *st, int fd) if (posix_memalign(&buf, 512, 512) != 0) return 1; - memset(buf, 0, sizeof(buf)); - if (write(fd, buf, sizeof(buf)) != sizeof(buf)) + memset(buf, 0, 512); + if (write(fd, buf, 512) != 512) return 1; return 0; } +static int imsm_bbm_log_size(struct imsm_super *mpb) +{ + return __le32_to_cpu(mpb->bbm_log_size); +} + +#ifndef MDASSEMBLE static int validate_geometry_imsm_container(struct supertype *st, int level, int layout, int raiddisks, int chunk, unsigned long long size, char *dev, @@ -1612,6 +2106,7 @@ static int validate_geometry_imsm(struct supertype *st, int level, int layout, return 1; } +#endif /* MDASSEMBLE */ static struct mdinfo *container_content_imsm(struct supertype *st) { @@ -1624,84 +2119,82 @@ static struct mdinfo *container_content_imsm(struct supertype *st) * and create appropriate device mdinfo. */ struct intel_super *super = st->sb; - struct imsm_super *mpb = super->mpb; + struct imsm_super *mpb = super->anchor; struct mdinfo *rest = NULL; int i; + /* do not assemble arrays that might have bad blocks */ + if (imsm_bbm_log_size(super->anchor)) { + fprintf(stderr, Name ": BBM log found in metadata. " + "Cannot activate array(s).\n"); + return NULL; + } + for (i = 0; i < mpb->num_raid_devs; i++) { - struct imsm_dev *dev = get_imsm_dev(mpb, i); - struct imsm_vol *vol = &dev->vol; - struct imsm_map *map = vol->map; + struct imsm_dev *dev = get_imsm_dev(super, i); + struct imsm_map *map = get_imsm_map(dev, 0); struct mdinfo *this; - __u64 sz; int slot; this = malloc(sizeof(*this)); memset(this, 0, sizeof(*this)); this->next = rest; - rest = this; - - this->array.level = get_imsm_raid_level(map); - this->array.raid_disks = map->num_members; - this->array.layout = imsm_level_to_layout(this->array.level); - this->array.md_minor = -1; - this->array.ctime = 0; - this->array.utime = 0; - this->array.chunk_size = __le16_to_cpu(map->blocks_per_strip) << 9; - this->array.state = !vol->dirty; - this->container_member = i; - if (map->map_state == IMSM_T_STATE_UNINITIALIZED || dev->vol.dirty) - this->resync_start = 0; - else - this->resync_start = ~0ULL; - - strncpy(this->name, (char *) dev->volume, MAX_RAID_SERIAL_LEN); - this->name[MAX_RAID_SERIAL_LEN] = 0; - - sprintf(this->text_version, "/%s/%d", - devnum2devname(st->container_dev), - this->container_member); - - memset(this->uuid, 0, sizeof(this->uuid)); - - sz = __le32_to_cpu(dev->size_high); - sz <<= 32; - sz += __le32_to_cpu(dev->size_low); - this->component_size = sz; - this->array.size = this->component_size / 2; + super->current_vol = i; + getinfo_super_imsm_volume(st, this); for (slot = 0 ; slot < map->num_members; slot++) { - struct imsm_disk *disk; struct mdinfo *info_d; struct dl *d; int idx; + int skip; __u32 s; + __u32 ord; - idx = __le32_to_cpu(map->disk_ord_tbl[slot] & ~(0xff << 24)); + skip = 0; + idx = get_imsm_disk_idx(dev, slot); + ord = get_imsm_ord_tbl_ent(dev, slot); for (d = super->disks; d ; d = d->next) if (d->index == idx) break; if (d == NULL) - break; 
/* shouldn't this be continue ?? */ + skip = 1; + + s = d ? __le32_to_cpu(d->disk.status) : 0; + if (s & FAILED_DISK) + skip = 1; + if (!(s & USABLE_DISK)) + skip = 1; + if (ord & IMSM_ORD_REBUILD) + skip = 1; + + /* + * if we skip some disks the array will be assmebled degraded; + * reset resync start to avoid a dirty-degraded situation + * + * FIXME handle dirty degraded + */ + if (skip && !dev->vol.dirty) + this->resync_start = ~0ULL; + if (skip) + continue; info_d = malloc(sizeof(*info_d)); - if (!info_d) - break; /* ditto ?? */ + if (!info_d) { + fprintf(stderr, Name ": failed to allocate disk" + " for volume %s\n", (char *) dev->volume); + free(this); + this = rest; + break; + } memset(info_d, 0, sizeof(*info_d)); info_d->next = this->devs; this->devs = info_d; - disk = get_imsm_disk(mpb, idx); - s = __le32_to_cpu(disk->status); - info_d->disk.number = d->index; info_d->disk.major = d->major; info_d->disk.minor = d->minor; info_d->disk.raid_disk = slot; - info_d->disk.state = s & CONFIGURED_DISK ? (1 << MD_DISK_ACTIVE) : 0; - info_d->disk.state |= s & FAILED_DISK ? (1 << MD_DISK_FAULTY) : 0; - info_d->disk.state |= s & USABLE_DISK ? (1 << MD_DISK_SYNC) : 0; this->array.working_disks++; @@ -1711,19 +2204,21 @@ static struct mdinfo *container_content_imsm(struct supertype *st) if (d->devname) strcpy(info_d->name, d->devname); } + rest = this; } return rest; } +#ifndef MDASSEMBLE static int imsm_open_new(struct supertype *c, struct active_array *a, char *inst) { struct intel_super *super = c->sb; - struct imsm_super *mpb = super->mpb; + struct imsm_super *mpb = super->anchor; - if (atoi(inst) + 1 > mpb->num_raid_devs) { + if (atoi(inst) >= mpb->num_raid_devs) { fprintf(stderr, "%s: subarry index %d, out of range\n", __func__, atoi(inst)); return -ENODEV; @@ -1734,13 +2229,13 @@ static int imsm_open_new(struct supertype *c, struct active_array *a, return 0; } -static __u8 imsm_check_degraded(struct imsm_super *mpb, int n, int failed) +static __u8 imsm_check_degraded(struct intel_super *super, struct imsm_dev *dev, int failed) { - struct imsm_dev *dev = get_imsm_dev(mpb, n); - struct imsm_map *map = dev->vol.map; + struct imsm_map *map = get_imsm_map(dev, 0); if (!failed) - return map->map_state; + return map->map_state == IMSM_T_STATE_UNINITIALIZED ? + IMSM_T_STATE_UNINITIALIZED : IMSM_T_STATE_NORMAL; switch (get_imsm_raid_level(map)) { case 0: @@ -1761,22 +2256,24 @@ static __u8 imsm_check_degraded(struct imsm_super *mpb, int n, int failed) int device_per_mirror = 2; /* FIXME is this always the case? * and are they always adjacent? 
*/ - int failed = 0; + int r10fail = 0; int i; for (i = 0; i < map->num_members; i++) { - int idx = get_imsm_disk_idx(map, i); - struct imsm_disk *disk = get_imsm_disk(mpb, idx); + int idx = get_imsm_disk_idx(dev, i); + struct imsm_disk *disk = get_imsm_disk(super, idx); - if (__le32_to_cpu(disk->status) & FAILED_DISK) - failed++; + if (!disk) + r10fail++; + else if (__le32_to_cpu(disk->status) & FAILED_DISK) + r10fail++; - if (failed >= device_per_mirror) + if (r10fail >= device_per_mirror) return IMSM_T_STATE_FAILED; - /* reset 'failed' for next mirror set */ + /* reset 'r10fail' for next mirror set */ if (!((i + 1) % device_per_mirror)) - failed = 0; + r10fail = 0; } return IMSM_T_STATE_DEGRADED; @@ -1794,65 +2291,126 @@ static __u8 imsm_check_degraded(struct imsm_super *mpb, int n, int failed) return map->map_state; } -static int imsm_count_failed(struct imsm_super *mpb, struct imsm_map *map) +static int imsm_count_failed(struct intel_super *super, struct imsm_dev *dev) { int i; int failed = 0; struct imsm_disk *disk; + struct imsm_map *map = get_imsm_map(dev, 0); for (i = 0; i < map->num_members; i++) { - int idx = get_imsm_disk_idx(map, i); + __u32 ord = get_imsm_ord_tbl_ent(dev, i); + int idx = ord_to_idx(ord); - disk = get_imsm_disk(mpb, idx); - if (__le32_to_cpu(disk->status) & FAILED_DISK) + disk = get_imsm_disk(super, idx); + if (!disk || + __le32_to_cpu(disk->status) & FAILED_DISK || + ord & IMSM_ORD_REBUILD) failed++; } return failed; } -static void imsm_set_array_state(struct active_array *a, int consistent) +static int is_resyncing(struct imsm_dev *dev) +{ + struct imsm_map *migr_map; + + if (!dev->vol.migr_state) + return 0; + + if (dev->vol.migr_type == 0) + return 1; + + migr_map = get_imsm_map(dev, 1); + + if (migr_map->map_state == IMSM_T_STATE_NORMAL) + return 1; + else + return 0; +} + +static int is_rebuilding(struct imsm_dev *dev) +{ + struct imsm_map *migr_map; + + if (!dev->vol.migr_state) + return 0; + + if (dev->vol.migr_type == 0) + return 0; + + migr_map = get_imsm_map(dev, 1); + + if (migr_map->map_state == IMSM_T_STATE_DEGRADED) + return 1; + else + return 0; +} + +/* Handle dirty -> clean transititions and resync. Degraded and rebuild + * states are handled in imsm_set_disk() with one exception, when a + * resync is stopped due to a new failure this routine will set the + * 'degraded' state for the array. 
+ */ +static int imsm_set_array_state(struct active_array *a, int consistent) { int inst = a->info.container_member; struct intel_super *super = a->container->sb; - struct imsm_dev *dev = get_imsm_dev(super->mpb, inst); - struct imsm_map *map = &dev->vol.map[0]; - int dirty = !consistent; - int failed; - __u8 map_state; + struct imsm_dev *dev = get_imsm_dev(super, inst); + struct imsm_map *map = get_imsm_map(dev, 0); + int failed = imsm_count_failed(super, dev); + __u8 map_state = imsm_check_degraded(super, dev, failed); + + if (consistent == 2 && + (a->resync_start != ~0ULL || + map_state != IMSM_T_STATE_NORMAL || + dev->vol.migr_state)) + consistent = 0; if (a->resync_start == ~0ULL) { - failed = imsm_count_failed(super->mpb, map); - map_state = imsm_check_degraded(super->mpb, inst, failed); - if (!failed) - map_state = IMSM_T_STATE_NORMAL; - if (map->map_state != map_state) { - dprintf("imsm: map_state %d: %d\n", - inst, map_state); + /* complete intialization / resync, + * recovery is completed in ->set_disk + */ + if (is_resyncing(dev)) { + dprintf("imsm: mark resync done\n"); + dev->vol.migr_state = 0; map->map_state = map_state; super->updates_pending++; } + } else if (!is_resyncing(dev) && !failed) { + /* mark the start of the init process if nothing is failed */ + dprintf("imsm: mark resync start (%llu)\n", a->resync_start); + map->map_state = map_state; + migrate(dev, IMSM_T_STATE_NORMAL, + map->map_state == IMSM_T_STATE_NORMAL); + super->updates_pending++; } - if (dev->vol.dirty != dirty) { + /* mark dirty / clean */ + if (dev->vol.dirty != !consistent) { dprintf("imsm: mark '%s' (%llu)\n", - dirty?"dirty":"clean", a->resync_start); - - dev->vol.dirty = dirty; + consistent ? "clean" : "dirty", a->resync_start); + if (consistent) + dev->vol.dirty = 0; + else + dev->vol.dirty = 1; super->updates_pending++; } + return consistent; } static void imsm_set_disk(struct active_array *a, int n, int state) { int inst = a->info.container_member; struct intel_super *super = a->container->sb; - struct imsm_dev *dev = get_imsm_dev(super->mpb, inst); - struct imsm_map *map = dev->vol.map; + struct imsm_dev *dev = get_imsm_dev(super, inst); + struct imsm_map *map = get_imsm_map(dev, 0); struct imsm_disk *disk; + int failed; __u32 status; - int failed = 0; - int new_failure = 0; + __u32 ord; + __u8 map_state; if (n > map->num_members) fprintf(stderr, "imsm: set_disk %d out of range 0..%d\n", @@ -1863,44 +2421,52 @@ static void imsm_set_disk(struct active_array *a, int n, int state) dprintf("imsm: set_disk %d:%x\n", n, state); - disk = get_imsm_disk(super->mpb, get_imsm_disk_idx(map, n)); + ord = get_imsm_ord_tbl_ent(dev, n); + disk = get_imsm_disk(super, ord_to_idx(ord)); /* check for new failures */ status = __le32_to_cpu(disk->status); if ((state & DS_FAULTY) && !(status & FAILED_DISK)) { status |= FAILED_DISK; disk->status = __cpu_to_le32(status); - new_failure = 1; + disk->scsi_id = __cpu_to_le32(~(__u32)0); + memmove(&disk->serial[0], &disk->serial[1], MAX_RAID_SERIAL_LEN - 1); super->updates_pending++; } + /* check if in_sync */ + if (state & DS_INSYNC && ord & IMSM_ORD_REBUILD) { + struct imsm_map *migr_map = get_imsm_map(dev, 1); - /* the number of failures have changed, count up 'failed' to determine - * degraded / failed status - */ - if (new_failure && map->map_state != IMSM_T_STATE_FAILED) - failed = imsm_count_failed(super->mpb, map); - - /* determine map_state based on failed or in_sync count */ - if (failed) - map->map_state = imsm_check_degraded(super->mpb, inst, failed); - else 
if (map->map_state == IMSM_T_STATE_DEGRADED) { - struct mdinfo *d; - int working = 0; + set_imsm_ord_tbl_ent(migr_map, n, ord_to_idx(ord)); + super->updates_pending++; + } - for (d = a->info.devs ; d ; d = d->next) - if (d->curr_state & DS_INSYNC) - working++; + failed = imsm_count_failed(super, dev); + map_state = imsm_check_degraded(super, dev, failed); - if (working == a->info.array.raid_disks) { - map->map_state = IMSM_T_STATE_NORMAL; - super->updates_pending++; - } + /* check if recovery complete, newly degraded, or failed */ + if (map_state == IMSM_T_STATE_NORMAL && is_rebuilding(dev)) { + map->map_state = map_state; + dev->vol.migr_state = 0; + super->updates_pending++; + } else if (map_state == IMSM_T_STATE_DEGRADED && + map->map_state != map_state && + !dev->vol.migr_state) { + dprintf("imsm: mark degraded\n"); + map->map_state = map_state; + super->updates_pending++; + } else if (map_state == IMSM_T_STATE_FAILED && + map->map_state != map_state) { + dprintf("imsm: mark failed\n"); + dev->vol.migr_state = 0; + map->map_state = map_state; + super->updates_pending++; } } static int store_imsm_mpb(int fd, struct intel_super *super) { - struct imsm_super *mpb = super->mpb; + struct imsm_super *mpb = super->anchor; __u32 mpb_size = __le32_to_cpu(mpb->mpb_size); unsigned long long dsize; unsigned long long sectors; @@ -1941,25 +2507,123 @@ static void imsm_sync_metadata(struct supertype *container) super->updates_pending = 0; } +static struct dl *imsm_readd(struct intel_super *super, int idx, struct active_array *a) +{ + struct imsm_dev *dev = get_imsm_dev(super, a->info.container_member); + int i = get_imsm_disk_idx(dev, idx); + struct dl *dl; + + for (dl = super->disks; dl; dl = dl->next) + if (dl->index == i) + break; + + if (dl && __le32_to_cpu(dl->disk.status) & FAILED_DISK) + dl = NULL; + + if (dl) + dprintf("%s: found %x:%x\n", __func__, dl->major, dl->minor); + + return dl; +} + +static struct dl *imsm_add_spare(struct intel_super *super, int slot, struct active_array *a) +{ + struct imsm_dev *dev = get_imsm_dev(super, a->info.container_member); + int idx = get_imsm_disk_idx(dev, slot); + struct imsm_map *map = get_imsm_map(dev, 0); + unsigned long long esize; + unsigned long long pos; + struct mdinfo *d; + struct extent *ex; + int j; + int found; + __u32 array_start; + __u32 status; + struct dl *dl; + + for (dl = super->disks; dl; dl = dl->next) { + /* If in this array, skip */ + for (d = a->info.devs ; d ; d = d->next) + if (d->state_fd >= 0 && + d->disk.major == dl->major && + d->disk.minor == dl->minor) { + dprintf("%x:%x already in array\n", dl->major, dl->minor); + break; + } + if (d) + continue; + + /* skip in use or failed drives */ + status = __le32_to_cpu(dl->disk.status); + if (status & FAILED_DISK || idx == dl->index) { + dprintf("%x:%x status ( %s%s)\n", + dl->major, dl->minor, + status & FAILED_DISK ? "failed " : "", + idx == dl->index ? "in use " : ""); + continue; + } + + /* Does this unused device have the requisite free space? 
+		 * We need a->info.component_size sectors
+		 */
+		ex = get_extents(super, dl);
+		if (!ex) {
+			dprintf("cannot get extents\n");
+			continue;
+		}
+		found = 0;
+		j = 0;
+		pos = 0;
+		array_start = __le32_to_cpu(map->pba_of_lba0);
+
+		do {
+			/* check that we can start at pba_of_lba0 with
+			 * a->info.component_size of space
+			 */
+			esize = ex[j].start - pos;
+			if (array_start >= pos &&
+			    array_start + a->info.component_size < ex[j].start) {
+				found = 1;
+				break;
+			}
+			pos = ex[j].start + ex[j].size;
+			j++;
+
+		} while (ex[j-1].size);
+
+		free(ex);
+		if (!found) {
+			dprintf("%x:%x does not have %llu at %d\n",
+				dl->major, dl->minor,
+				a->info.component_size,
+				__le32_to_cpu(map->pba_of_lba0));
+			/* No room */
+			continue;
+		} else
+			break;
+	}
+
+	return dl;
+}
+
 static struct mdinfo *imsm_activate_spare(struct active_array *a,
 					  struct metadata_update **updates)
 {
 	/**
-	 * Take a device that is marked spare in the metadata and use it to
-	 * replace a failed/vacant slot in an array. There may be a case where
-	 * a device is failed in one array but active in a second.
-	 * imsm_process_update catches this case and does not clear the SPARE_DISK
-	 * flag, allowing the second array to start using the device on failure.
-	 * SPARE_DISK is cleared when all arrays are using a device.
+	 * Find a device with unused free space and use it to replace a
+	 * failed/vacant region in an array. We replace failed regions one a
+	 * array at a time. The result is that a new spare disk will be added
+	 * to the first failed array and after the monitor has finished
+	 * propagating failures the remainder will be consumed.
 	 *
-	 * FIXME: is this a valid use of SPARE_DISK?
+	 * FIXME add a capability for mdmon to request spares from another
+	 * container.
 	 */
 	struct intel_super *super = a->container->sb;
-	struct imsm_super *mpb = super->mpb;
 	int inst = a->info.container_member;
-	struct imsm_dev *dev = get_imsm_dev(mpb, inst);
-	struct imsm_map *map = dev->vol.map;
+	struct imsm_dev *dev = get_imsm_dev(super, inst);
+	struct imsm_map *map = get_imsm_map(dev, 0);
 	int failed = a->info.array.raid_disks;
 	struct mdinfo *rv = NULL;
 	struct mdinfo *d;
@@ -1981,11 +2645,10 @@ static struct mdinfo *imsm_activate_spare(struct active_array *a,
 	dprintf("imsm: activate spare: inst=%d failed=%d (%d) level=%d\n",
 		inst, failed, a->info.array.raid_disks, a->info.array.level);
 
-	if (imsm_check_degraded(mpb, inst, failed) != IMSM_T_STATE_DEGRADED)
+	if (imsm_check_degraded(super, dev, failed) != IMSM_T_STATE_DEGRADED)
 		return NULL;
 
 	/* For each slot, if it is not working, find a spare */
-	dl = super->disks;
 	for (i = 0; i < a->info.array.raid_disks; i++) {
 		for (d = a->info.devs ; d ; d = d->next)
 			if (d->disk.raid_disk == i)
@@ -1994,88 +2657,47 @@ static struct mdinfo *imsm_activate_spare(struct active_array *a,
 		if (d && (d->state_fd >= 0))
 			continue;
 
-		/* OK, this device needs recovery. Find a spare */
-		for ( ; dl ; dl = dl->next) {
-			unsigned long long esize;
-			unsigned long long pos;
-			struct mdinfo *d2;
-			struct extent *ex;
-			struct imsm_disk *disk;
-			int j;
-			int found;
-			__u32 array_start;
-
-			/* If in this array, skip */
-			for (d2 = a->info.devs ; d2 ; d2 = d2->next)
-				if (d2->disk.major == dl->major &&
-				    d2->disk.minor == dl->minor) {
-					dprintf("%x:%x already in array\n", dl->major, dl->minor);
-					break;
-				}
-			if (d2)
-				continue;
-
-			/* is this unused device marked as a spare? */
-			disk = get_imsm_disk(mpb, dl->index);
-			if (!(__le32_to_cpu(disk->status) & SPARE_DISK))
-				continue;
-
-			/* We are allowed to use this device - is there space?
-			 * We need a->info.component_size sectors */
-			ex = get_extents(super, dl);
-			if (!ex) {
-				dprintf("cannot get extents\n");
-				continue;
-			}
-			found = 0;
-			j = 0;
-			pos = 0;
-			array_start = __le32_to_cpu(map->pba_of_lba0);
-
-			do {
-				/* check that we can start at pba_of_lba0 with
-				 * a->info.component_size of space
-				 */
-				esize = ex[j].start - pos;
-				if (array_start >= pos &&
-				    array_start + a->info.component_size < ex[j].start) {
-					found = 1;
-					break;
-				}
-				pos = ex[j].start + ex[j].size;
-				j++;
-
-			} while (ex[j-1].size);
-
-			free(ex);
-			if (!found) {
-				dprintf("%x:%x does not have %llu at %d\n",
-					dl->major, dl->minor,
-					a->info.component_size,
-					__le32_to_cpu(map->pba_of_lba0));
-				/* No room */
-				continue;
-			}
+		/*
+		 * OK, this device needs recovery. Try to re-add the previous
+		 * occupant of this slot, if this fails add a new spare
+		 */
+		dl = imsm_readd(super, i, a);
+		if (!dl)
+			dl = imsm_add_spare(super, i, a);
+		if (!dl)
+			continue;
+
+		/* found a usable disk with enough space */
+		di = malloc(sizeof(*di));
+		memset(di, 0, sizeof(*di));
+
+		/* dl->index will be -1 in the case we are activating a
+		 * pristine spare. imsm_process_update() will create a
+		 * new index in this case. Once a disk is found to be
+		 * failed in all member arrays it is kicked from the
+		 * metadata
+		 */
+		di->disk.number = dl->index;
-			/* found a usable disk with enough space */
-			di = malloc(sizeof(*di));
-			memset(di, 0, sizeof(*di));
-			di->disk.number = dl->index;
-			di->disk.raid_disk = i;
-			di->disk.major = dl->major;
-			di->disk.minor = dl->minor;
-			di->disk.state = 0;
-			di->data_offset = array_start;
-			di->component_size = a->info.component_size;
-			di->container_member = inst;
-			di->next = rv;
-			rv = di;
-			num_spares++;
-			dprintf("%x:%x to be %d at %llu\n", dl->major, dl->minor,
-				i, pos);
+		/* (ab)use di->devs to store a pointer to the device
+		 * we chose
+		 */
+		di->devs = (struct mdinfo *) dl;
+
+		di->disk.raid_disk = i;
+		di->disk.major = dl->major;
+		di->disk.minor = dl->minor;
+		di->disk.state = 0;
+		di->data_offset = __le32_to_cpu(map->pba_of_lba0);
+		di->component_size = a->info.component_size;
+		di->container_member = inst;
+		di->next = rv;
+		rv = di;
+		num_spares++;
+		dprintf("%x:%x to be %d at %llu\n", dl->major, dl->minor,
+			i, di->data_offset);
 
-			break;
-		}
+		break;
 	}
 
 	if (!rv)
@@ -2094,7 +2716,8 @@ static struct mdinfo *imsm_activate_spare(struct active_array *a,
 	for (di = rv ; di ; di = di->next) {
 		u->type = update_activate_spare;
-		u->disk_idx = di->disk.number;
+		u->dl = (struct dl *) di->devs;
+		di->devs = NULL;
 		u->slot = di->disk.raid_disk;
 		u->array = inst;
 		u->next = u + 1;
@@ -2106,32 +2729,26 @@ static struct mdinfo *imsm_activate_spare(struct active_array *a,
 	return rv;
 }
 
-static int weight(unsigned int field)
-{
-	int weight;
-
-	for (weight = 0; field; weight++)
-		field &= field - 1;
-
-	return weight;
-}
-
-static int disks_overlap(struct imsm_map *m1, struct imsm_map *m2)
+static int disks_overlap(struct imsm_dev *d1, struct imsm_dev *d2)
 {
+	struct imsm_map *m1 = get_imsm_map(d1, 0);
+	struct imsm_map *m2 = get_imsm_map(d2, 0);
 	int i;
 	int j;
 	int idx;
 
 	for (i = 0; i < m1->num_members; i++) {
-		idx = get_imsm_disk_idx(m1, i);
+		idx = get_imsm_disk_idx(d1, i);
 		for (j = 0; j < m2->num_members; j++)
-			if (idx == get_imsm_disk_idx(m2, j))
+			if (idx == get_imsm_disk_idx(d2, j))
 				return 1;
 	}
 
 	return 0;
 }
 
+static void imsm_delete(struct intel_super *super, struct dl **dlp, int index);
+
 static void imsm_process_update(struct supertype *st,
 				struct metadata_update *update)
 {
@@ -2144,86 +2761,110 @@ static void imsm_process_update(struct supertype *st,
 	 * flag
 	 */
 	struct intel_super *super = st->sb;
-	struct imsm_super *mpb = super->mpb;
+	struct imsm_super *mpb;
 	enum imsm_update_type type = *(enum imsm_update_type *) update->buf;
 
+	/* update requires a larger buf but the allocation failed */
+	if (super->next_len && !super->next_buf) {
+		super->next_len = 0;
+		return;
+	}
+
+	if (super->next_buf) {
+		memcpy(super->next_buf, super->buf, super->len);
+		free(super->buf);
+		super->len = super->next_len;
+		super->buf = super->next_buf;
+
+		super->next_len = 0;
+		super->next_buf = NULL;
+	}
+
+	mpb = super->anchor;
+
 	switch (type) {
 	case update_activate_spare: {
 		struct imsm_update_activate_spare *u = (void *) update->buf;
-		struct imsm_dev *dev = get_imsm_dev(mpb, u->array);
-		struct imsm_map *map = &dev->vol.map[0];
+		struct imsm_dev *dev = get_imsm_dev(super, u->array);
+		struct imsm_map *map = get_imsm_map(dev, 0);
+		struct imsm_map *migr_map;
 		struct active_array *a;
 		struct imsm_disk *disk;
 		__u32 status;
+		__u8 to_state;
 		struct dl *dl;
-		struct mdinfo *d;
-		unsigned int members;
 		unsigned int found;
-		int victim;
+		int failed;
+		int victim = get_imsm_disk_idx(dev, u->slot);
 		int i;
 
 		for (dl = super->disks; dl; dl = dl->next)
-			if (dl->index == u->disk_idx)
+			if (dl == u->dl)
 				break;
 
 		if (!dl) {
 			fprintf(stderr, "error: imsm_activate_spare passed "
-				"an unknown disk_idx: %d\n", u->disk_idx);
+				"an unknown disk (index: %d)\n",
+				u->dl->index);
 			return;
 		}
 
 		super->updates_pending++;
 
-		victim = get_imsm_disk_idx(map, u->slot);
-		map->disk_ord_tbl[u->slot] = __cpu_to_le32(u->disk_idx);
-		disk = get_imsm_disk(mpb, u->disk_idx);
+		/* count failures (excluding rebuilds and the victim)
+		 * to determine map[0] state
+		 */
+		failed = 0;
+		for (i = 0; i < map->num_members; i++) {
+			if (i == u->slot)
+				continue;
+			disk = get_imsm_disk(super, get_imsm_disk_idx(dev, i));
+			if (!disk ||
+			    __le32_to_cpu(disk->status) & FAILED_DISK)
+				failed++;
+		}
+
+		/* adding a pristine spare, assign a new index */
+		if (dl->index < 0) {
+			dl->index = super->anchor->num_disks;
+			super->anchor->num_disks++;
+		}
+		disk = &dl->disk;
 		status = __le32_to_cpu(disk->status);
 		status |= CONFIGURED_DISK;
+		status &= ~SPARE_DISK;
 		disk->status = __cpu_to_le32(status);
 
-		/* map unique/live arrays using the spare */
-		members = 0;
-		found = 0;
-		for (a = st->arrays; a; a = a->next) {
-			int inst = a->info.container_member;
-
-			dev = get_imsm_dev(mpb, inst);
-			map = &dev->vol.map[0];
-			if (map->raid_level > 0)
-				members |= 1 << inst;
-			for (d = a->info.devs; d; d = d->next)
-				if (d->disk.major == dl->major &&
-				    d->disk.minor == dl->minor)
-					found |= 1 << inst;
-		}
-
-		/* until all arrays that can absorb this disk have absorbed
-		 * this disk it can still be considered a spare
-		 */
-		if (weight(found) >= weight(members)) {
-			status = __le32_to_cpu(disk->status);
-			status &= ~SPARE_DISK;
-			disk->status = __cpu_to_le32(status);
-		}
+		/* mark rebuild */
+		to_state = imsm_check_degraded(super, dev, failed);
+		map->map_state = IMSM_T_STATE_DEGRADED;
+		migrate(dev, to_state, 1);
+		migr_map = get_imsm_map(dev, 1);
+		set_imsm_ord_tbl_ent(map, u->slot, dl->index);
+		set_imsm_ord_tbl_ent(migr_map, u->slot, dl->index | IMSM_ORD_REBUILD);
 
 		/* count arrays using the victim in the metadata */
 		found = 0;
 		for (a = st->arrays; a ; a = a->next) {
-			dev = get_imsm_dev(mpb, a->info.container_member);
-			map = &dev->vol.map[0];
+			dev = get_imsm_dev(super, a->info.container_member);
 			for (i = 0; i < map->num_members; i++)
-				if (victim == get_imsm_disk_idx(map, i))
+				if (victim == get_imsm_disk_idx(dev, i))
 					found++;
 		}
 
-		/* clear some flags if the victim is no longer being
+		/* delete the victim if it is no longer being
 		 * utilized anywhere
 		 */
-		disk = get_imsm_disk(mpb, victim);
 		if (!found) {
-			status = __le32_to_cpu(disk->status);
-			status &= ~(CONFIGURED_DISK | USABLE_DISK);
-			disk->status = __cpu_to_le32(status);
+			struct dl **dlp;
+
+			for (dlp = &super->disks; *dlp; dlp = &(*dlp)->next)
+				if ((*dlp)->index == victim)
+					break;
+			/* We know that 'manager' isn't touching anything,
+			 * so it is safe to:
+			 */
+			imsm_delete(super, dlp, victim);
 		}
 
 		break;
 	}
@@ -2254,12 +2895,12 @@ static void imsm_process_update(struct supertype *st,
 		/* check update is next in sequence */
 		if (u->dev_idx != mpb->num_raid_devs) {
-			dprintf("%s: can not create arrays out of sequence\n",
-				__func__);
+			dprintf("%s: can not create array %d expected index %d\n",
+				__func__, u->dev_idx, mpb->num_raid_devs);
 			return;
 		}
 
-		new_map = &u->dev.vol.map[0];
+		new_map = get_imsm_map(&u->dev, 0);
 		new_start = __le32_to_cpu(new_map->pba_of_lba0);
 		new_end = new_start + __le32_to_cpu(new_map->blocks_per_member);
 
@@ -2268,14 +2909,14 @@ static void imsm_process_update(struct supertype *st,
 		 * overalpping disks */
 		for (i = 0; i < mpb->num_raid_devs; i++) {
-			dev = get_imsm_dev(mpb, i);
-			map = &dev->vol.map[0];
+			dev = get_imsm_dev(super, i);
+			map = get_imsm_map(dev, 0);
 			start = __le32_to_cpu(map->pba_of_lba0);
 			end = start + __le32_to_cpu(map->blocks_per_member);
 			if ((new_start >= start && new_start <= end) ||
 			    (start >= new_start && start <= new_end))
 				overlap = 1;
-			if (overlap && disks_overlap(map, new_map)) {
+			if (overlap && disks_overlap(dev, &u->dev)) {
 				dprintf("%s: arrays overlap\n", __func__);
 				return;
 			}
@@ -2286,51 +2927,158 @@ static void imsm_process_update(struct supertype *st,
 			return;
 		}
 
+		/* check that prepare update was successful */
+		if (!update->space) {
+			dprintf("%s: prepare update failed\n", __func__);
+			return;
+		}
+
 		super->updates_pending++;
+		dev = update->space;
+		map = get_imsm_map(dev, 0);
+		update->space = NULL;
+		imsm_copy_dev(dev, &u->dev);
+		map = get_imsm_map(dev, 0);
+		super->dev_tbl[u->dev_idx] = dev;
 		mpb->num_raid_devs++;
-		dev = get_imsm_dev(mpb, u->dev_idx);
-		memcpy(dev, &u->dev, sizeof(*dev));
-		map = &dev->vol.map[0];
-		memcpy(map->disk_ord_tbl, new_map->disk_ord_tbl,
-		       sizeof(__u32) * new_map->num_members);
-
-		/* fix up flags, if arrays overlap then the drives can not be
-		 * spares
-		 */
+
+		/* fix up flags */
 		for (i = 0; i < map->num_members; i++) {
 			struct imsm_disk *disk;
 			__u32 status;
 
-			disk = get_imsm_disk(mpb, get_imsm_disk_idx(map, i));
+			disk = get_imsm_disk(super, get_imsm_disk_idx(dev, i));
 			status = __le32_to_cpu(disk->status);
 			status |= CONFIGURED_DISK;
-			if (overlap)
-				status &= ~SPARE_DISK;
+			status &= ~SPARE_DISK;
 			disk->status = __cpu_to_le32(status);
 		}
 		break;
 	}
+	case update_add_disk:
+
+		/* we may be able to repair some arrays if disks are
+		 * being added */
+		if (super->add) {
+			struct active_array *a;
+			for (a = st->arrays; a; a = a->next)
+				a->check_degraded = 1;
+		}
+		/* add some spares to the metadata */
+		while (super->add) {
+			struct dl *al;
+
+			al = super->add;
+			super->add = al->next;
+			al->next = super->disks;
+			super->disks = al;
+			dprintf("%s: added %x:%x\n",
+				__func__, al->major, al->minor);
+		}
+
+		break;
 	}
 }
 
 static void imsm_prepare_update(struct supertype *st,
 				struct metadata_update *update)
 {
-	/* Allocate space to hold a new mpb if necessary. We currently
-	 * allocate enough to hold 2 subarrays for the given number of disks.
-	 * This may not be sufficient iff reshaping.
-	 *
-	 * FIX ME handle the reshape case.
-	 *
-	 * The monitor will be able to safely change super->mpb by arranging
-	 * for it to be freed in check_update_queue(). I.e. the monitor thread
-	 * will start using the new pointer and the manager can continue to use
-	 * the old value until check_update_queue() runs.
+	/**
+	 * Allocate space to hold new disk entries, raid-device entries or a new
+	 * mpb if necessary. The manager synchronously waits for updates to
+	 * complete in the monitor, so new mpb buffers allocated here can be
+	 * integrated by the monitor thread without worrying about live pointers
+	 * in the manager thread.
 	 */
+	enum imsm_update_type type = *(enum imsm_update_type *) update->buf;
+	struct intel_super *super = st->sb;
+	struct imsm_super *mpb = super->anchor;
+	size_t buf_len;
+	size_t len = 0;
+
+	switch (type) {
+	case update_create_array: {
+		struct imsm_update_create_array *u = (void *) update->buf;
 
-	return;
+		len = sizeof_imsm_dev(&u->dev, 1);
+		update->space = malloc(len);
+		break;
+	default:
+		break;
+	}
+	}
+
+	/* check if we need a larger metadata buffer */
+	if (super->next_buf)
+		buf_len = super->next_len;
+	else
+		buf_len = super->len;
+
+	if (__le32_to_cpu(mpb->mpb_size) + len > buf_len) {
+		/* ok we need a larger buf than what is currently allocated
+		 * if this allocation fails process_update will notice that
+		 * ->next_len is set and ->next_buf is NULL
+		 */
+		buf_len = ROUND_UP(__le32_to_cpu(mpb->mpb_size) + len, 512);
+		if (super->next_buf)
+			free(super->next_buf);
+
+		super->next_len = buf_len;
+		if (posix_memalign(&super->next_buf, buf_len, 512) != 0)
+			super->next_buf = NULL;
+	}
 }
 
+/* must be called while manager is quiesced */
+static void imsm_delete(struct intel_super *super, struct dl **dlp, int index)
+{
+	struct imsm_super *mpb = super->anchor;
+	struct dl *iter;
+	struct imsm_dev *dev;
+	struct imsm_map *map;
+	int i, j, num_members;
+	__u32 ord;
+
+	dprintf("%s: deleting device[%d] from imsm_super\n",
+		__func__, index);
+
+	/* shift all indexes down one */
+	for (iter = super->disks; iter; iter = iter->next)
+		if (iter->index > index)
+			iter->index--;
+
+	for (i = 0; i < mpb->num_raid_devs; i++) {
+		dev = get_imsm_dev(super, i);
+		map = get_imsm_map(dev, 0);
+		num_members = map->num_members;
+		for (j = 0; j < num_members; j++) {
+			/* update ord entries being careful not to propagate
+			 * ord-flags to the first map
			 */
+			ord = get_imsm_ord_tbl_ent(dev, j);
+
+			if (ord_to_idx(ord) <= index)
+				continue;
+
+			map = get_imsm_map(dev, 0);
+			set_imsm_ord_tbl_ent(map, j, ord_to_idx(ord - 1));
+			map = get_imsm_map(dev, 1);
+			if (map)
+				set_imsm_ord_tbl_ent(map, j, ord - 1);
+		}
+	}
+
+	mpb->num_disks--;
+	super->updates_pending++;
+	if (*dlp) {
+		struct dl *dl = *dlp;
+
+		*dlp = (*dlp)->next;
+		__free_imsm_disk(dl);
+	}
+}
+#endif /* MDASSEMBLE */
+
 struct superswitch super_imsm = {
 #ifndef MDASSEMBLE
 	.examine_super = examine_super_imsm,
@@ -2338,6 +3086,8 @@ struct superswitch super_imsm = {
 	.detail_super = detail_super_imsm,
 	.brief_detail_super = brief_detail_super_imsm,
 	.write_init_super = write_init_super_imsm,
+	.validate_geometry = validate_geometry_imsm,
+	.add_to_super = add_to_super_imsm,
 #endif
 	.match_home = match_home_imsm,
 	.uuid_from_super= uuid_from_super_imsm,
@@ -2350,15 +3100,14 @@ struct superswitch super_imsm = {
 	.load_super = load_super_imsm,
 	.init_super = init_super_imsm,
-	.add_to_super = add_to_super_imsm,
 	.store_super = store_zero_imsm,
 	.free_super = free_super_imsm,
 	.match_metadata_desc = match_metadata_desc_imsm,
 	.container_content = container_content_imsm,
-	.validate_geometry = validate_geometry_imsm,
 
 	.external = 1,
 
+#ifndef MDASSEMBLE
 /* for mdmon */
 	.open_new = imsm_open_new,
 	.load_super = load_super_imsm,
@@ -2368,4 +3117,5 @@
 	.activate_spare = imsm_activate_spare,
 	.process_update = imsm_process_update,
 	.prepare_update = imsm_prepare_update,
+#endif /* MDASSEMBLE */
 };
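
A minimal standalone sketch (not part of the patch above) of how the disk_ord_tbl ordinals handled by imsm_set_disk(), imsm_activate_spare() and imsm_delete() encode a rebuild target: the low bits index the disk table, and IMSM_ORD_REBUILD in the top byte flags a slot whose member is being rebuilt. The local ord_to_idx() below is assumed to behave like the helper of the same name in super-intel.c, i.e. it simply masks the flag off.

#include <stdio.h>

#define IMSM_ORD_REBUILD (1 << 24)

/* assumed equivalent of super-intel.c's ord_to_idx(): strip the flag bits */
static unsigned int ord_to_idx(unsigned int ord)
{
	return ord & ~IMSM_ORD_REBUILD;
}

int main(void)
{
	/* slot entry pointing at disk-table index 3, currently rebuilding */
	unsigned int ord = 3 | IMSM_ORD_REBUILD;

	printf("disk index %u, rebuilding: %s\n", ord_to_idx(ord),
	       (ord & IMSM_ORD_REBUILD) ? "yes" : "no");
	return 0;
}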