X-Git-Url: http://git.ipfire.org/?p=thirdparty%2Fmdadm.git;a=blobdiff_plain;f=super-intel.c;h=609aaf51a04dcf28fb595ce82a9c522b0031999f;hp=8e00d742748a74b95a8e21edfc9eb6b4839eda63;hb=1e5c69836d4d0b6dcaef8fc187e6bf2841eb57f6;hpb=d665cc31e72f722f5c649a7029a85f3ad51e6a4d diff --git a/super-intel.c b/super-intel.c index 8e00d742..609aaf51 100644 --- a/super-intel.c +++ b/super-intel.c @@ -53,6 +53,7 @@ #define MPB_SECTOR_CNT 418 #define IMSM_RESERVED_SECTORS 4096 +#define SECT_PER_MB_SHIFT 11 /* Disk configuration info. */ #define IMSM_MAX_DEVICES 255 @@ -63,7 +64,6 @@ struct imsm_disk { #define SPARE_DISK __cpu_to_le32(0x01) /* Spare */ #define CONFIGURED_DISK __cpu_to_le32(0x02) /* Member of some RaidDev */ #define FAILED_DISK __cpu_to_le32(0x04) /* Permanent failure */ -#define USABLE_DISK __cpu_to_le32(0x08) /* Fully usable unless FAILED_DISK is set */ __u32 status; /* 0xF0 - 0xF3 */ __u32 owner_cfg_num; /* which config 0,1,2... owns this disk */ #define IMSM_DISK_FILLERS 4 @@ -88,7 +88,7 @@ struct imsm_map { __u8 num_members; /* number of member disks */ __u8 num_domains; /* number of parity domains */ __u8 failed_disk_num; /* valid only when state is degraded */ - __u8 reserved[1]; + __u8 ddf; __u32 filler[7]; /* expansion area */ #define IMSM_ORD_REBUILD (1 << 24) __u32 disk_ord_tbl[1]; /* disk_ord_tbl[num_members], @@ -105,6 +105,7 @@ struct imsm_vol { #define MIGR_VERIFY 2 /* analagous to echo check > sync_action */ #define MIGR_GEN_MIGR 3 #define MIGR_STATE_CHANGE 4 +#define MIGR_REPAIR 5 __u8 migr_type; /* Initializing, Rebuilding, ... 
*/ __u8 dirty; __u8 fs_state; /* fast-sync state for CnG (0xff == disabled) */ @@ -193,6 +194,29 @@ struct bbm_log { static char *map_state_str[] = { "normal", "uninitialized", "degraded", "failed" }; #endif +static __u8 migr_type(struct imsm_dev *dev) +{ + if (dev->vol.migr_type == MIGR_VERIFY && + dev->status & DEV_VERIFY_AND_FIX) + return MIGR_REPAIR; + else + return dev->vol.migr_type; +} + +static void set_migr_type(struct imsm_dev *dev, __u8 migr_type) +{ + /* for compatibility with older oroms convert MIGR_REPAIR, into + * MIGR_VERIFY w/ DEV_VERIFY_AND_FIX status + */ + if (migr_type == MIGR_REPAIR) { + dev->vol.migr_type = MIGR_VERIFY; + dev->status |= DEV_VERIFY_AND_FIX; + } else { + dev->vol.migr_type = migr_type; + dev->status &= ~DEV_VERIFY_AND_FIX; + } +} + static unsigned int sector_count(__u32 bytes) { return ((bytes + (512-1)) & (~(512-1))) / 512; @@ -203,6 +227,12 @@ static unsigned int mpb_sectors(struct imsm_super *mpb) return sector_count(__le32_to_cpu(mpb->mpb_size)); } +struct intel_dev { + struct imsm_dev *dev; + struct intel_dev *next; + int index; +}; + /* internal representation of IMSM metadata */ struct intel_super { union { @@ -216,8 +246,8 @@ struct intel_super { int creating_imsm; /* flag to indicate container creation */ int current_vol; /* index of raid device undergoing creation */ __u32 create_offset; /* common start for 'current_vol' */ - #define IMSM_MAX_RAID_DEVS 2 - struct imsm_dev *dev_tbl[IMSM_MAX_RAID_DEVS]; + __u32 random; /* random data for seeding new family numbers */ + struct intel_dev *devlist; struct dl { struct dl *next; int index; @@ -228,12 +258,21 @@ struct intel_super { int fd; int extent_cnt; struct extent *e; /* for determining freespace @ create */ + int raiddisk; /* slot to fill in autolayout */ } *disks; struct dl *add; /* list of disks to add while mdmon active */ struct dl *missing; /* disks removed while we weren't looking */ struct bbm_log *bbm_log; const char *hba; /* device path of the raid controller 
for this metadata */ const struct imsm_orom *orom; /* platform firmware support */ + struct intel_super *next; /* (temp) list for disambiguating family_num */ +}; + +struct intel_disk { + struct imsm_disk disk; + #define IMSM_UNKNOWN_OWNER (-1) + int owner; + struct intel_disk *next; }; struct extent { @@ -409,9 +448,14 @@ static struct imsm_dev *__get_imsm_dev(struct imsm_super *mpb, __u8 index) static struct imsm_dev *get_imsm_dev(struct intel_super *super, __u8 index) { + struct intel_dev *dv; + if (index >= super->anchor->num_raid_devs) return NULL; - return super->dev_tbl[index]; + for (dv = super->devlist; dv; dv = dv->next) + if (dv->index == index) + return dv->dev; + return NULL; } static __u32 get_imsm_ord_tbl_ent(struct imsm_dev *dev, int slot) @@ -440,6 +484,20 @@ static void set_imsm_ord_tbl_ent(struct imsm_map *map, int slot, __u32 ord) map->disk_ord_tbl[slot] = __cpu_to_le32(ord); } +static int get_imsm_disk_slot(struct imsm_map *map, int idx) +{ + int slot; + __u32 ord; + + for (slot = 0; slot < map->num_members; slot++) { + ord = __le32_to_cpu(map->disk_ord_tbl[slot]); + if (ord_to_idx(ord) == idx) + return slot; + } + + return -1; +} + static int get_imsm_raid_level(struct imsm_map *map) { if (map->raid_level == 1) { @@ -466,18 +524,14 @@ static int cmp_extent(const void *av, const void *bv) static int count_memberships(struct dl *dl, struct intel_super *super) { int memberships = 0; - int i, j; + int i; for (i = 0; i < super->anchor->num_raid_devs; i++) { struct imsm_dev *dev = get_imsm_dev(super, i); struct imsm_map *map = get_imsm_map(dev, 0); - for (j = 0; j < map->num_members; j++) { - __u32 index = get_imsm_disk_idx(dev, j); - - if (index == dl->index) - memberships++; - } + if (get_imsm_disk_slot(map, dl->index) >= 0) + memberships++; } return memberships; @@ -487,7 +541,7 @@ static struct extent *get_extents(struct intel_super *super, struct dl *dl) { /* find a list of used extents on the given physical device */ struct extent *rv, *e; - 
int i, j; + int i; int memberships = count_memberships(dl, super); __u32 reservation = MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS; @@ -500,14 +554,10 @@ static struct extent *get_extents(struct intel_super *super, struct dl *dl) struct imsm_dev *dev = get_imsm_dev(super, i); struct imsm_map *map = get_imsm_map(dev, 0); - for (j = 0; j < map->num_members; j++) { - __u32 index = get_imsm_disk_idx(dev, j); - - if (index == dl->index) { - e->start = __le32_to_cpu(map->pba_of_lba0); - e->size = __le32_to_cpu(map->blocks_per_member); - e++; - } + if (get_imsm_disk_slot(map, dl->index) >= 0) { + e->start = __le32_to_cpu(map->pba_of_lba0); + e->size = __le32_to_cpu(map->blocks_per_member); + e++; } } qsort(rv, memberships, sizeof(*rv), cmp_extent); @@ -569,7 +619,24 @@ static __u32 imsm_reserved_sectors(struct intel_super *super, struct dl *dl) return rv; } +static int is_spare(struct imsm_disk *disk) +{ + return (disk->status & SPARE_DISK) == SPARE_DISK; +} + +static int is_configured(struct imsm_disk *disk) +{ + return (disk->status & CONFIGURED_DISK) == CONFIGURED_DISK; +} + +static int is_failed(struct imsm_disk *disk) +{ + return (disk->status & FAILED_DISK) == FAILED_DISK; +} + #ifndef MDASSEMBLE +static __u64 blocks_per_migr_unit(struct imsm_dev *dev); + static void print_imsm_dev(struct imsm_dev *dev, char *uuid, int disk_idx) { __u64 sz; @@ -582,10 +649,8 @@ static void print_imsm_dev(struct imsm_dev *dev, char *uuid, int disk_idx) printf(" UUID : %s\n", uuid); printf(" RAID Level : %d\n", get_imsm_raid_level(map)); printf(" Members : %d\n", map->num_members); - for (slot = 0; slot < map->num_members; slot++) - if (disk_idx== get_imsm_disk_idx(dev, slot)) - break; - if (slot < map->num_members) { + slot = get_imsm_disk_slot(map, disk_idx); + if (slot >= 0) { ord = get_imsm_ord_tbl_ent(dev, slot); printf(" This Slot : %d%s\n", slot, ord & IMSM_ORD_REBUILD ? 
" (out-of-sync)" : ""); @@ -606,14 +671,32 @@ static void print_imsm_dev(struct imsm_dev *dev, char *uuid, int disk_idx) printf(" Chunk Size : %u KiB\n", __le16_to_cpu(map->blocks_per_strip) / 2); printf(" Reserved : %d\n", __le32_to_cpu(dev->reserved_blocks)); - printf(" Migrate State : %s", dev->vol.migr_state ? "migrating" : "idle"); - if (dev->vol.migr_state) - printf(": %s", dev->vol.migr_type ? "rebuilding" : "initializing"); - printf("\n"); + printf(" Migrate State : "); + if (dev->vol.migr_state) { + if (migr_type(dev) == MIGR_INIT) + printf("initialize\n"); + else if (migr_type(dev) == MIGR_REBUILD) + printf("rebuild\n"); + else if (migr_type(dev) == MIGR_VERIFY) + printf("check\n"); + else if (migr_type(dev) == MIGR_GEN_MIGR) + printf("general migration\n"); + else if (migr_type(dev) == MIGR_STATE_CHANGE) + printf("state change\n"); + else if (migr_type(dev) == MIGR_REPAIR) + printf("repair\n"); + else + printf("<unknown:%d>\n", migr_type(dev)); + } else + printf("idle\n"); printf(" Map State : %s", map_state_str[map->map_state]); if (dev->vol.migr_state) { struct imsm_map *map = get_imsm_map(dev, 1); + printf(" <-- %s", map_state_str[map->map_state]); + printf("\n Checkpoint : %u (%llu)", + __le32_to_cpu(dev->vol.curr_migr_unit), + blocks_per_migr_unit(dev)); } printf("\n"); printf(" Dirty State : %s\n", dev->vol.dirty ? "dirty" : "clean"); @@ -623,7 +706,6 @@ static void print_imsm_disk(struct imsm_super *mpb, int index, __u32 reserved) { struct imsm_disk *disk = __get_imsm_disk(mpb, index); char str[MAX_RAID_SERIAL_LEN + 1]; - __u32 s; __u64 sz; if (index < 0) @@ -632,11 +714,9 @@ static void print_imsm_disk(struct imsm_super *mpb, int index, __u32 reserved) printf("\n"); snprintf(str, MAX_RAID_SERIAL_LEN + 1, "%s", disk->serial); printf(" Disk%02d Serial : %s\n", index, str); - s = disk->status; - printf(" State :%s%s%s%s\n", s&SPARE_DISK ? " spare" : "", - s&CONFIGURED_DISK ? " active" : "", - s&FAILED_DISK ? " failed" : "", - s&USABLE_DISK ? 
" usable" : ""); + printf(" State :%s%s%s\n", is_spare(disk) ? " spare" : "", + is_configured(disk) ? " active" : "", + is_failed(disk) ? " failed" : ""); printf(" Id : %08x\n", __le32_to_cpu(disk->scsi_id)); sz = __le32_to_cpu(disk->total_blocks) - reserved; printf(" Usable Size : %llu%s\n", (unsigned long long)sz, @@ -661,10 +741,11 @@ static void examine_super_imsm(struct supertype *st, char *homehost) printf(" Magic : %s\n", str); snprintf(str, strlen(MPB_VERSION_RAID0), "%s", get_imsm_version(mpb)); printf(" Version : %s\n", get_imsm_version(mpb)); + printf(" Orig Family : %08x\n", __le32_to_cpu(mpb->orig_family_num)); printf(" Family : %08x\n", __le32_to_cpu(mpb->family_num)); printf(" Generation : %08x\n", __le32_to_cpu(mpb->generation_num)); getinfo_super_imsm(st, &info); - fname_from_uuid(st, &info, nbuf,'-'); + fname_from_uuid(st, &info, nbuf, ':'); printf(" UUID : %s\n", nbuf + 5); sum = __le32_to_cpu(mpb->check_sum); printf(" Checksum : %08x %s\n", sum, @@ -682,7 +763,8 @@ static void examine_super_imsm(struct supertype *st, char *homehost) printf(" Signature : %x\n", __le32_to_cpu(log->signature)); printf(" Entry Count : %d\n", __le32_to_cpu(log->entry_count)); printf(" Spare Blocks : %d\n", __le32_to_cpu(log->reserved_spare_block_count)); - printf(" First Spare : %llx\n", __le64_to_cpu(log->first_spare_lba)); + printf(" First Spare : %llx\n", + (unsigned long long) __le64_to_cpu(log->first_spare_lba)); } for (i = 0; i < mpb->num_raid_devs; i++) { struct mdinfo info; @@ -690,7 +772,7 @@ static void examine_super_imsm(struct supertype *st, char *homehost) super->current_vol = i; getinfo_super_imsm(st, &info); - fname_from_uuid(st, &info, nbuf, '-'); + fname_from_uuid(st, &info, nbuf, ':'); print_imsm_dev(dev, nbuf + 5, super->disks->index); } for (i = 0; i < mpb->num_disks; i++) { @@ -700,7 +782,24 @@ static void examine_super_imsm(struct supertype *st, char *homehost) } } -static void brief_examine_super_imsm(struct supertype *st) +static void 
brief_examine_super_imsm(struct supertype *st, int verbose) +{ + /* We just write a generic IMSM ARRAY entry */ + struct mdinfo info; + char nbuf[64]; + struct intel_super *super = st->sb; + + if (!super->anchor->num_raid_devs) { + printf("ARRAY metadata=imsm\n"); + return; + } + + getinfo_super_imsm(st, &info); + fname_from_uuid(st, &info, nbuf, ':'); + printf("ARRAY metadata=imsm UUID=%s\n", nbuf + 5); +} + +static void brief_examine_subarrays_imsm(struct supertype *st, int verbose) { /* We just write a generic IMSM ARRAY entry */ struct mdinfo info; @@ -713,27 +812,40 @@ static void brief_examine_super_imsm(struct supertype *st) return; getinfo_super_imsm(st, &info); - fname_from_uuid(st, &info, nbuf,'-'); - printf("ARRAY metadata=imsm auto=md UUID=%s\n", nbuf + 5); + fname_from_uuid(st, &info, nbuf, ':'); for (i = 0; i < super->anchor->num_raid_devs; i++) { struct imsm_dev *dev = get_imsm_dev(super, i); super->current_vol = i; getinfo_super_imsm(st, &info); - fname_from_uuid(st, &info, nbuf1,'-'); - printf("ARRAY /dev/md/%.16s container=%s\n" - " member=%d auto=mdp UUID=%s\n", + fname_from_uuid(st, &info, nbuf1, ':'); + printf("ARRAY /dev/md/%.16s container=%s member=%d UUID=%s\n", dev->volume, nbuf + 5, i, nbuf1 + 5); } } +static void export_examine_super_imsm(struct supertype *st) +{ + struct intel_super *super = st->sb; + struct imsm_super *mpb = super->anchor; + struct mdinfo info; + char nbuf[64]; + + getinfo_super_imsm(st, &info); + fname_from_uuid(st, &info, nbuf, ':'); + printf("MD_METADATA=imsm\n"); + printf("MD_LEVEL=container\n"); + printf("MD_UUID=%s\n", nbuf+5); + printf("MD_DEVICES=%u\n", mpb->num_disks); +} + static void detail_super_imsm(struct supertype *st, char *homehost) { struct mdinfo info; char nbuf[64]; getinfo_super_imsm(st, &info); - fname_from_uuid(st, &info, nbuf,'-'); + fname_from_uuid(st, &info, nbuf, ':'); printf("\n UUID : %s\n", nbuf + 5); } @@ -742,7 +854,7 @@ static void brief_detail_super_imsm(struct supertype *st) struct 
mdinfo info; char nbuf[64]; getinfo_super_imsm(st, &info); - fname_from_uuid(st, &info, nbuf,'-'); + fname_from_uuid(st, &info, nbuf, ':'); printf(" UUID=%s", nbuf + 5); } @@ -906,7 +1018,7 @@ static int imsm_enumerate_ports(const char *hba_path, int port_count, int host_b return err; } -static int detail_platform_imsm(int verbose) +static int detail_platform_imsm(int verbose, int enumerate_only) { /* There are two components to imsm platform support, the ahci SATA * controller and the option-rom. To find the SATA controller we @@ -927,6 +1039,12 @@ static int detail_platform_imsm(int verbose) int host_base = 0; int port_count = 0; + if (enumerate_only) { + if (check_env("IMSM_NO_PLATFORM") || find_imsm_orom()) + return 0; + return 2; + } + list = find_driver_devices("pci", "ahci"); for (hba = list; hba; hba = hba->next) if (devpath_to_vendor(hba->path) == 0x8086) @@ -959,6 +1077,23 @@ static int detail_platform_imsm(int verbose) imsm_orom_has_raid1e(orom) ? " raid1e" : "", imsm_orom_has_raid10(orom) ? " raid10" : "", imsm_orom_has_raid5(orom) ? " raid5" : ""); + printf(" Chunk Sizes :%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", + imsm_orom_has_chunk(orom, 2) ? " 2k" : "", + imsm_orom_has_chunk(orom, 4) ? " 4k" : "", + imsm_orom_has_chunk(orom, 8) ? " 8k" : "", + imsm_orom_has_chunk(orom, 16) ? " 16k" : "", + imsm_orom_has_chunk(orom, 32) ? " 32k" : "", + imsm_orom_has_chunk(orom, 64) ? " 64k" : "", + imsm_orom_has_chunk(orom, 128) ? " 128k" : "", + imsm_orom_has_chunk(orom, 256) ? " 256k" : "", + imsm_orom_has_chunk(orom, 512) ? " 512k" : "", + imsm_orom_has_chunk(orom, 1024*1) ? " 1M" : "", + imsm_orom_has_chunk(orom, 1024*2) ? " 2M" : "", + imsm_orom_has_chunk(orom, 1024*4) ? " 4M" : "", + imsm_orom_has_chunk(orom, 1024*8) ? " 8M" : "", + imsm_orom_has_chunk(orom, 1024*16) ? " 16M" : "", + imsm_orom_has_chunk(orom, 1024*32) ? " 32M" : "", + imsm_orom_has_chunk(orom, 1024*64) ? 
" 64M" : ""); printf(" Max Disks : %d\n", orom->tds); printf(" Max Volumes : %d\n", orom->vpa); printf(" I/O Controller : %s\n", hba_path); @@ -998,7 +1133,7 @@ static int match_home_imsm(struct supertype *st, char *homehost) /* the imsm metadata format does not specify any host * identification information. We return -1 since we can never * confirm nor deny whether a given array is "meant" for this - * host. We rely on compare_super and the 'family_num' field to + * host. We rely on compare_super and the 'family_num' fields to * exclude member disks that do not belong, and we rely on * mdadm.conf to specify the arrays that should be assembled. * Auto-assembly may still pick up "foreign" arrays. @@ -1026,7 +1161,7 @@ static void uuid_from_super_imsm(struct supertype *st, int uuid[4]) */ /* imsm does not track uuid's so we synthesis one using sha1 on * - The signature (Which is constant for all imsm array, but no matter) - * - the family_num of the container + * - the orig_family_num of the container * - the index number of the volume * - the 'serial' number of the volume. * Hopefully these are all constant. @@ -1036,10 +1171,18 @@ static void uuid_from_super_imsm(struct supertype *st, int uuid[4]) char buf[20]; struct sha1_ctx ctx; struct imsm_dev *dev = NULL; + __u32 family_num; + /* some mdadm versions failed to set ->orig_family_num, in which + * case fall back to ->family_num. orig_family_num will be + * fixed up with the first metadata update. 
+ */ + family_num = super->anchor->orig_family_num; + if (family_num == 0) + family_num = super->anchor->family_num; sha1_init_ctx(&ctx); sha1_process_bytes(super->anchor->sig, MPB_SIG_LEN, &ctx); - sha1_process_bytes(&super->anchor->family_num, sizeof(__u32), &ctx); + sha1_process_bytes(&family_num, sizeof(__u32), &ctx); if (super->current_vol >= 0) dev = get_imsm_dev(super, super->current_vol); if (dev) { @@ -1079,6 +1222,179 @@ get_imsm_numerical_version(struct imsm_super *mpb, int *m, int *p) } #endif +static __u32 migr_strip_blocks_resync(struct imsm_dev *dev) +{ + /* migr_strip_size when repairing or initializing parity */ + struct imsm_map *map = get_imsm_map(dev, 0); + __u32 chunk = __le32_to_cpu(map->blocks_per_strip); + + switch (get_imsm_raid_level(map)) { + case 5: + case 10: + return chunk; + default: + return 128*1024 >> 9; + } +} + +static __u32 migr_strip_blocks_rebuild(struct imsm_dev *dev) +{ + /* migr_strip_size when rebuilding a degraded disk, no idea why + * this is different than migr_strip_size_resync(), but it's good + * to be compatible + */ + struct imsm_map *map = get_imsm_map(dev, 1); + __u32 chunk = __le32_to_cpu(map->blocks_per_strip); + + switch (get_imsm_raid_level(map)) { + case 1: + case 10: + if (map->num_members % map->num_domains == 0) + return 128*1024 >> 9; + else + return chunk; + case 5: + return max((__u32) 64*1024 >> 9, chunk); + default: + return 128*1024 >> 9; + } +} + +static __u32 num_stripes_per_unit_resync(struct imsm_dev *dev) +{ + struct imsm_map *lo = get_imsm_map(dev, 0); + struct imsm_map *hi = get_imsm_map(dev, 1); + __u32 lo_chunk = __le32_to_cpu(lo->blocks_per_strip); + __u32 hi_chunk = __le32_to_cpu(hi->blocks_per_strip); + + return max((__u32) 1, hi_chunk / lo_chunk); +} + +static __u32 num_stripes_per_unit_rebuild(struct imsm_dev *dev) +{ + struct imsm_map *lo = get_imsm_map(dev, 0); + int level = get_imsm_raid_level(lo); + + if (level == 1 || level == 10) { + struct imsm_map *hi = get_imsm_map(dev, 1); + 
+ return hi->num_domains; + } else + return num_stripes_per_unit_resync(dev); +} + +static __u8 imsm_num_data_members(struct imsm_dev *dev) +{ + /* named 'imsm_' because raid0, raid1 and raid10 + * counter-intuitively have the same number of data disks + */ + struct imsm_map *map = get_imsm_map(dev, 0); + + switch (get_imsm_raid_level(map)) { + case 0: + case 1: + case 10: + return map->num_members; + case 5: + return map->num_members - 1; + default: + dprintf("%s: unsupported raid level\n", __func__); + return 0; + } +} + +static __u32 parity_segment_depth(struct imsm_dev *dev) +{ + struct imsm_map *map = get_imsm_map(dev, 0); + __u32 chunk = __le32_to_cpu(map->blocks_per_strip); + + switch(get_imsm_raid_level(map)) { + case 1: + case 10: + return chunk * map->num_domains; + case 5: + return chunk * map->num_members; + default: + return chunk; + } +} + +static __u32 map_migr_block(struct imsm_dev *dev, __u32 block) +{ + struct imsm_map *map = get_imsm_map(dev, 1); + __u32 chunk = __le32_to_cpu(map->blocks_per_strip); + __u32 strip = block / chunk; + + switch (get_imsm_raid_level(map)) { + case 1: + case 10: { + __u32 vol_strip = (strip * map->num_domains) + 1; + __u32 vol_stripe = vol_strip / map->num_members; + + return vol_stripe * chunk + block % chunk; + } case 5: { + __u32 stripe = strip / (map->num_members - 1); + + return stripe * chunk + block % chunk; + } + default: + return 0; + } +} + +static __u64 blocks_per_migr_unit(struct imsm_dev *dev) +{ + /* calculate the conversion factor between per member 'blocks' + * (md/{resync,rebuild}_start) and imsm migration units, return + * 0 for the 'not migrating' and 'unsupported migration' cases + */ + if (!dev->vol.migr_state) + return 0; + + switch (migr_type(dev)) { + case MIGR_VERIFY: + case MIGR_REPAIR: + case MIGR_INIT: { + struct imsm_map *map = get_imsm_map(dev, 0); + __u32 stripes_per_unit; + __u32 blocks_per_unit; + __u32 parity_depth; + __u32 migr_chunk; + __u32 block_map; + __u32 block_rel; + __u32 
segment; + __u32 stripe; + __u8 disks; + + /* yes, this is really the translation of migr_units to + * per-member blocks in the 'resync' case + */ + stripes_per_unit = num_stripes_per_unit_resync(dev); + migr_chunk = migr_strip_blocks_resync(dev); + disks = imsm_num_data_members(dev); + blocks_per_unit = stripes_per_unit * migr_chunk * disks; + stripe = __le32_to_cpu(map->blocks_per_strip) * disks; + segment = blocks_per_unit / stripe; + block_rel = blocks_per_unit - segment * stripe; + parity_depth = parity_segment_depth(dev); + block_map = map_migr_block(dev, block_rel); + return block_map + parity_depth * segment; + } + case MIGR_REBUILD: { + __u32 stripes_per_unit; + __u32 migr_chunk; + + stripes_per_unit = num_stripes_per_unit_rebuild(dev); + migr_chunk = migr_strip_blocks_rebuild(dev); + return migr_chunk * stripes_per_unit; + } + case MIGR_GEN_MIGR: + case MIGR_STATE_CHANGE: + default: + return 0; + } +} + static int imsm_level_to_layout(int level) { switch (level) { @@ -1091,7 +1407,7 @@ static int imsm_level_to_layout(int level) case 10: return 0x102; } - return -1; + return UnSet; } static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info) @@ -1099,7 +1415,11 @@ static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info) struct intel_super *super = st->sb; struct imsm_dev *dev = get_imsm_dev(super, super->current_vol); struct imsm_map *map = get_imsm_map(dev, 0); + struct dl *dl; + for (dl = super->disks; dl; dl = dl->next) + if (dl->raiddisk == info->disk.raid_disk) + break; info->container_member = super->current_vol; info->array.raid_disks = map->num_members; info->array.level = get_imsm_raid_level(map); @@ -1109,20 +1429,49 @@ static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info) info->array.utime = 0; info->array.chunk_size = __le16_to_cpu(map->blocks_per_strip) << 9; info->array.state = !dev->vol.dirty; + info->custom_array_size = __le32_to_cpu(dev->size_high); + 
info->custom_array_size <<= 32; + info->custom_array_size |= __le32_to_cpu(dev->size_low); info->disk.major = 0; info->disk.minor = 0; + if (dl) { + info->disk.major = dl->major; + info->disk.minor = dl->minor; + } info->data_offset = __le32_to_cpu(map->pba_of_lba0); info->component_size = __le32_to_cpu(map->blocks_per_member); memset(info->uuid, 0, sizeof(info->uuid)); - if (map->map_state == IMSM_T_STATE_UNINITIALIZED || dev->vol.dirty) + if (map->map_state == IMSM_T_STATE_UNINITIALIZED || dev->vol.dirty) { info->resync_start = 0; - else if (dev->vol.migr_state) - info->resync_start = __le32_to_cpu(dev->vol.curr_migr_unit); - else - info->resync_start = ~0ULL; + } else if (dev->vol.migr_state) { + switch (migr_type(dev)) { + case MIGR_REPAIR: + case MIGR_INIT: { + __u64 blocks_per_unit = blocks_per_migr_unit(dev); + __u64 units = __le32_to_cpu(dev->vol.curr_migr_unit); + + info->resync_start = blocks_per_unit * units; + break; + } + case MIGR_VERIFY: + /* we could emulate the checkpointing of + * 'sync_action=check' migrations, but for now + * we just immediately complete them + */ + case MIGR_REBUILD: + /* this is handled by container_content_imsm() */ + case MIGR_GEN_MIGR: + case MIGR_STATE_CHANGE: + /* FIXME handle other migrations */ + default: + /* we are not dirty, so... 
*/ + info->resync_start = MaxSector; + } + } else + info->resync_start = MaxSector; strncpy(info->name, (char *) dev->volume, MAX_RAID_SERIAL_LEN); info->name[MAX_RAID_SERIAL_LEN] = 0; @@ -1136,12 +1485,41 @@ static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info) uuid_from_super_imsm(st, info->uuid); } +/* check the config file to see if we can return a real uuid for this spare */ +static void fixup_container_spare_uuid(struct mdinfo *inf) +{ + struct mddev_ident_s *array_list; + + if (inf->array.level != LEVEL_CONTAINER || + memcmp(inf->uuid, uuid_match_any, sizeof(int[4])) != 0) + return; + + array_list = conf_get_ident(NULL); + + for (; array_list; array_list = array_list->next) { + if (array_list->uuid_set) { + struct supertype *_sst; /* spare supertype */ + struct supertype *_cst; /* container supertype */ + + _cst = array_list->st; + if (_cst) + _sst = _cst->ss->match_metadata_desc(inf->text_version); + else + _sst = NULL; + + if (_sst) { + memcpy(inf->uuid, array_list->uuid, sizeof(int[4])); + free(_sst); + break; + } + } + } +} static void getinfo_super_imsm(struct supertype *st, struct mdinfo *info) { struct intel_super *super = st->sb; struct imsm_disk *disk; - __u32 s; if (super->current_vol >= 0) { getinfo_super_imsm_volume(st, info); @@ -1177,10 +1555,13 @@ static void getinfo_super_imsm(struct supertype *st, struct mdinfo *info) disk = &super->disks->disk; info->data_offset = __le32_to_cpu(disk->total_blocks) - reserved; info->component_size = reserved; - s = disk->status; - info->disk.state = s & CONFIGURED_DISK ? (1 << MD_DISK_ACTIVE) : 0; - info->disk.state |= s & FAILED_DISK ? (1 << MD_DISK_FAULTY) : 0; - info->disk.state |= s & SPARE_DISK ? 0 : (1 << MD_DISK_SYNC); + info->disk.state = is_configured(disk) ? 
(1 << MD_DISK_ACTIVE) : 0; + /* we don't change info->disk.raid_disk here because + * this state will be finalized in mdmon after we have + * found the 'most fresh' version of the metadata + */ + info->disk.state |= is_failed(disk) ? (1 << MD_DISK_FAULTY) : 0; + info->disk.state |= is_spare(disk) ? 0 : (1 << MD_DISK_SYNC); } /* only call uuid_from_super_imsm when this disk is part of a populated container, @@ -1188,16 +1569,16 @@ static void getinfo_super_imsm(struct supertype *st, struct mdinfo *info) */ if (info->disk.state & (1 << MD_DISK_SYNC) || super->anchor->num_raid_devs) uuid_from_super_imsm(st, info->uuid); - else + else { memcpy(info->uuid, uuid_match_any, sizeof(int[4])); + fixup_container_spare_uuid(info); + } } static int update_super_imsm(struct supertype *st, struct mdinfo *info, char *update, char *devname, int verbose, int uuid_set, char *homehost) { - /* FIXME */ - /* For 'assemble' and 'force' we need to return non-zero if any * change was made. For others, the return value is ignored. * Update options are: @@ -1213,26 +1594,55 @@ static int update_super_imsm(struct supertype *st, struct mdinfo *info, * linear only * resync: mark as dirty so a resync will happen. * name: update the name - preserving the homehost + * uuid: Change the uuid of the array to match watch is given * * Following are not relevant for this imsm: * sparc2.2 : update from old dodgey metadata * super-minor: change the preferred_minor number * summaries: update redundant counters. - * uuid: Change the uuid of the array to match watch is given * homehost: update the recorded homehost * _reshape_progress: record new reshape_progress position. 
*/ - int rv = 0; - //struct intel_super *super = st->sb; - //struct imsm_super *mpb = super->mpb; + int rv = 1; + struct intel_super *super = st->sb; + struct imsm_super *mpb; - if (strcmp(update, "grow") == 0) { - } - if (strcmp(update, "resync") == 0) { - /* dev->vol.dirty = 1; */ - } + /* we can only update container info */ + if (!super || super->current_vol >= 0 || !super->anchor) + return 1; - /* IMSM has no concept of UUID or homehost */ + mpb = super->anchor; + + if (strcmp(update, "uuid") == 0 && uuid_set && !info->update_private) + fprintf(stderr, + Name ": '--uuid' not supported for imsm metadata\n"); + else if (strcmp(update, "uuid") == 0 && uuid_set && info->update_private) { + mpb->orig_family_num = *((__u32 *) info->update_private); + rv = 0; + } else if (strcmp(update, "uuid") == 0) { + __u32 *new_family = malloc(sizeof(*new_family)); + + /* update orig_family_number with the incoming random + * data, report the new effective uuid, and store the + * new orig_family_num for future updates. + */ + if (new_family) { + memcpy(&mpb->orig_family_num, info->uuid, sizeof(__u32)); + uuid_from_super_imsm(st, info->uuid); + *new_family = mpb->orig_family_num; + info->update_private = new_family; + rv = 0; + } + } else if (strcmp(update, "assemble") == 0) + rv = 0; + else + fprintf(stderr, + Name ": '--update=%s' not supported for imsm metadata\n", + update); + + /* successful update? 
recompute checksum */ + if (rv == 0) + mpb->check_sum = __le32_to_cpu(__gen_imsm_checksum(mpb)); return rv; } @@ -1260,6 +1670,23 @@ static __u64 avail_size_imsm(struct supertype *st, __u64 devsize) return devsize - (MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS); } +static void free_devlist(struct intel_super *super) +{ + struct intel_dev *dv; + + while (super->devlist) { + dv = super->devlist->next; + free(super->devlist->dev); + free(super->devlist); + super->devlist = dv; + } +} + +static void imsm_copy_dev(struct imsm_dev *dest, struct imsm_dev *src) +{ + memcpy(dest, src, sizeof_imsm_dev(src, 0)); +} + static int compare_super_imsm(struct supertype *st, struct supertype *tst) { /* @@ -1278,43 +1705,71 @@ static int compare_super_imsm(struct supertype *st, struct supertype *tst) return 0; } - if (memcmp(first->anchor->sig, sec->anchor->sig, MAX_SIGNATURE_LENGTH) != 0) - return 3; - /* if an anchor does not have num_raid_devs set then it is a free * floating spare */ if (first->anchor->num_raid_devs > 0 && sec->anchor->num_raid_devs > 0) { - if (first->anchor->family_num != sec->anchor->family_num) + /* Determine if these disks might ever have been + * related. 
Further disambiguation can only take place + * in load_super_imsm_all + */ + __u32 first_family = first->anchor->orig_family_num; + __u32 sec_family = sec->anchor->orig_family_num; + + if (memcmp(first->anchor->sig, sec->anchor->sig, + MAX_SIGNATURE_LENGTH) != 0) + return 3; + + if (first_family == 0) + first_family = first->anchor->family_num; + if (sec_family == 0) + sec_family = sec->anchor->family_num; + + if (first_family != sec_family) return 3; + } + /* if 'first' is a spare promote it to a populated mpb with sec's * family number */ if (first->anchor->num_raid_devs == 0 && sec->anchor->num_raid_devs > 0) { int i; + struct intel_dev *dv; + struct imsm_dev *dev; /* we need to copy raid device info from sec if an allocation * fails here we don't associate the spare */ for (i = 0; i < sec->anchor->num_raid_devs; i++) { - first->dev_tbl[i] = malloc(sizeof(struct imsm_dev)); - if (!first->dev_tbl) { - while (--i >= 0) { - free(first->dev_tbl[i]); - first->dev_tbl[i] = NULL; - } - fprintf(stderr, "imsm: failed to associate spare\n"); - return 3; + dv = malloc(sizeof(*dv)); + if (!dv) + break; + dev = malloc(sizeof_imsm_dev(get_imsm_dev(sec, i), 1)); + if (!dev) { + free(dv); + break; } - *first->dev_tbl[i] = *sec->dev_tbl[i]; + dv->dev = dev; + dv->index = i; + dv->next = first->devlist; + first->devlist = dv; + } + if (i < sec->anchor->num_raid_devs) { + /* allocation failure */ + free_devlist(first); + fprintf(stderr, "imsm: failed to associate spare\n"); + return 3; } - first->anchor->num_raid_devs = sec->anchor->num_raid_devs; + first->anchor->orig_family_num = sec->anchor->orig_family_num; first->anchor->family_num = sec->anchor->family_num; + memcpy(first->anchor->sig, sec->anchor->sig, MAX_SIGNATURE_LENGTH); + for (i = 0; i < sec->anchor->num_raid_devs; i++) + imsm_copy_dev(get_imsm_dev(first, i), get_imsm_dev(sec, i)); } return 0; @@ -1344,7 +1799,6 @@ static void fd2devname(int fd, char *name) snprintf(name, MAX_RAID_SERIAL_LEN, "/dev/%s", nm); } - extern 
int scsi_get_serial(int fd, void *buf, size_t buf_len); static int imsm_read_serial(int fd, char *devname, @@ -1354,7 +1808,10 @@ static int imsm_read_serial(int fd, char *devname, int rv; int rsp_len; int len; - char *c, *rsp_buf; + char *dest; + char *src; + char *rsp_buf; + int i; memset(scsi_serial, 0, sizeof(scsi_serial)); @@ -1374,27 +1831,42 @@ static int imsm_read_serial(int fd, char *devname, return rv; } - /* trim leading whitespace */ rsp_len = scsi_serial[3]; + if (!rsp_len) { + if (devname) + fprintf(stderr, + Name ": Failed to retrieve serial for %s\n", + devname); + return 2; + } rsp_buf = (char *) &scsi_serial[4]; - c = rsp_buf; - while (isspace(*c)) - c++; - /* truncate len to the end of rsp_buf if necessary */ - if (c + MAX_RAID_SERIAL_LEN > rsp_buf + rsp_len) - len = rsp_len - (c - rsp_buf); - else + /* trim all whitespace and non-printable characters and convert + * ':' to ';' + */ + for (i = 0, dest = rsp_buf; i < rsp_len; i++) { + src = &rsp_buf[i]; + if (*src > 0x20) { + /* ':' is reserved for use in placeholder serial + * numbers for missing disks + */ + if (*src == ':') + *dest++ = ';'; + else + *dest++ = *src; + } + } + len = dest - rsp_buf; + dest = rsp_buf; + + /* truncate leading characters */ + if (len > MAX_RAID_SERIAL_LEN) { + dest += len - MAX_RAID_SERIAL_LEN; len = MAX_RAID_SERIAL_LEN; + } - /* initialize the buffer and copy rsp_buf characters */ memset(serial, 0, MAX_RAID_SERIAL_LEN); - memcpy(serial, c, len); - - /* trim trailing whitespace starting with the last character copied */ - c = (char *) &serial[len - 1]; - while (isspace(*c) || *c == '\0') - *c-- = '\0'; + memcpy(serial, dest, len); return 0; } @@ -1409,6 +1881,7 @@ static void serialcpy(__u8 *dest, __u8 *src) strncpy((char *) dest, (char *) src, MAX_RAID_SERIAL_LEN); } +#ifndef MDASSEMBLE static struct dl *serial_to_dl(__u8 *serial, struct intel_super *super) { struct dl *dl; @@ -1419,15 +1892,34 @@ static struct dl *serial_to_dl(__u8 *serial, struct intel_super 
*super) return dl; } +#endif + +static struct imsm_disk * +__serial_to_disk(__u8 *serial, struct imsm_super *mpb, int *idx) +{ + int i; + + for (i = 0; i < mpb->num_disks; i++) { + struct imsm_disk *disk = __get_imsm_disk(mpb, i); + + if (serialcmp(disk->serial, serial) == 0) { + if (idx) + *idx = i; + return disk; + } + } + + return NULL; +} static int load_imsm_disk(int fd, struct intel_super *super, char *devname, int keep_fd) { + struct imsm_disk *disk; struct dl *dl; struct stat stb; int rv; - int i; - int alloc = 1; + char name[40]; __u8 serial[MAX_RAID_SERIAL_LEN]; rv = imsm_read_serial(fd, devname, serial); @@ -1435,16 +1927,7 @@ load_imsm_disk(int fd, struct intel_super *super, char *devname, int keep_fd) if (rv != 0) return 2; - /* check if this is a disk we have seen before. it may be a spare in - * super->disks while the current anchor believes it is a raid member, - * check if we need to update dl->index - */ - dl = serial_to_dl(serial, super); - if (!dl) - dl = malloc(sizeof(*dl)); - else - alloc = 0; - + dl = calloc(1, sizeof(*dl)); if (!dl) { if (devname) fprintf(stderr, @@ -1453,61 +1936,38 @@ load_imsm_disk(int fd, struct intel_super *super, char *devname, int keep_fd) return 2; } - if (alloc) { - fstat(fd, &stb); - dl->major = major(stb.st_rdev); - dl->minor = minor(stb.st_rdev); - dl->next = super->disks; - dl->fd = keep_fd ? fd : -1; - dl->devname = devname ? strdup(devname) : NULL; - serialcpy(dl->serial, serial); - dl->index = -2; - dl->e = NULL; - } else if (keep_fd) { - close(dl->fd); - dl->fd = fd; - } + fstat(fd, &stb); + dl->major = major(stb.st_rdev); + dl->minor = minor(stb.st_rdev); + dl->next = super->disks; + dl->fd = keep_fd ? 
fd : -1; + assert(super->disks == NULL); + super->disks = dl; + serialcpy(dl->serial, serial); + dl->index = -2; + dl->e = NULL; + fd2devname(fd, name); + if (devname) + dl->devname = strdup(devname); + else + dl->devname = strdup(name); /* look up this disk's index in the current anchor */ - for (i = 0; i < super->anchor->num_disks; i++) { - struct imsm_disk *disk_iter; - - disk_iter = __get_imsm_disk(super->anchor, i); - - if (serialcmp(disk_iter->serial, dl->serial) == 0) { - dl->disk = *disk_iter; - /* only set index on disks that are a member of a - * populated contianer, i.e. one with raid_devs - */ - if (dl->disk.status & FAILED_DISK) - dl->index = -2; - else if (dl->disk.status & SPARE_DISK) - dl->index = -1; - else - dl->index = i; - - break; - } - } - - /* no match, maybe a stale failed drive */ - if (i == super->anchor->num_disks && dl->index >= 0) { - dl->disk = *__get_imsm_disk(super->anchor, dl->index); - if (dl->disk.status & FAILED_DISK) + disk = __serial_to_disk(dl->serial, super->anchor, &dl->index); + if (disk) { + dl->disk = *disk; + /* only set index on disks that are a member of a + * populated contianer, i.e. one with raid_devs + */ + if (is_failed(&dl->disk)) dl->index = -2; + else if (is_spare(&dl->disk)) + dl->index = -1; } - if (alloc) - super->disks = dl; - return 0; } -static void imsm_copy_dev(struct imsm_dev *dest, struct imsm_dev *src) -{ - memcpy(dest, src, sizeof_imsm_dev(src, 0)); -} - #ifndef MDASSEMBLE /* When migrating map0 contains the 'destination' state while map1 * contains the current state. 
When not migrating map0 contains the @@ -1518,28 +1978,50 @@ static void imsm_copy_dev(struct imsm_dev *dest, struct imsm_dev *src) * 1/ Idle (migr_state=0 map0state=normal||unitialized||degraded||failed) * 2/ Initialize (migr_state=1 migr_type=MIGR_INIT map0state=normal * map1state=unitialized) - * 3/ Verify (Resync) (migr_state=1 migr_type=MIGR_REBUILD map0state=normal + * 3/ Repair (Resync) (migr_state=1 migr_type=MIGR_REPAIR map0state=normal * map1state=normal) * 4/ Rebuild (migr_state=1 migr_type=MIGR_REBUILD map0state=normal * map1state=degraded) */ -static void migrate(struct imsm_dev *dev, __u8 to_state, int rebuild_resync) +static void migrate(struct imsm_dev *dev, __u8 to_state, int migr_type) { struct imsm_map *dest; struct imsm_map *src = get_imsm_map(dev, 0); dev->vol.migr_state = 1; - dev->vol.migr_type = rebuild_resync; + set_migr_type(dev, migr_type); dev->vol.curr_migr_unit = 0; dest = get_imsm_map(dev, 1); + /* duplicate and then set the target end state in map[0] */ memcpy(dest, src, sizeof_imsm_map(src)); + if (migr_type == MIGR_REBUILD) { + __u32 ord; + int i; + + for (i = 0; i < src->num_members; i++) { + ord = __le32_to_cpu(src->disk_ord_tbl[i]); + set_imsm_ord_tbl_ent(src, i, ord_to_idx(ord)); + } + } + src->map_state = to_state; } static void end_migration(struct imsm_dev *dev, __u8 map_state) { struct imsm_map *map = get_imsm_map(dev, 0); + struct imsm_map *prev = get_imsm_map(dev, dev->vol.migr_state); + int i; + + /* merge any IMSM_ORD_REBUILD bits that were not successfully + * completed in the last migration. 
+ * + * FIXME add support for online capacity expansion and + * raid-level-migration + */ + for (i = 0; i < prev->num_members; i++) + map->disk_ord_tbl[i] |= prev->disk_ord_tbl[i]; dev->vol.migr_state = 0; dev->vol.curr_migr_unit = 0; @@ -1557,17 +2039,26 @@ static int parse_raid_devices(struct intel_super *super) for (i = 0; i < super->anchor->num_raid_devs; i++) { struct imsm_dev *dev_iter = __get_imsm_dev(super->anchor, i); + struct intel_dev *dv; len = sizeof_imsm_dev(dev_iter, 0); len_migr = sizeof_imsm_dev(dev_iter, 1); if (len_migr > len) space_needed += len_migr - len; + dv = malloc(sizeof(*dv)); + if (!dv) + return 1; dev_new = malloc(len_migr); - if (!dev_new) + if (!dev_new) { + free(dv); return 1; + } imsm_copy_dev(dev_new, dev_iter); - super->dev_tbl[i] = dev_new; + dv->dev = dev_new; + dv->index = i; + dv->next = super->devlist; + super->devlist = dv; } /* ensure that super->buf is large enough when all raid devices @@ -1580,7 +2071,8 @@ static int parse_raid_devices(struct intel_super *super) if (posix_memalign(&buf, 512, len) != 0) return 1; - memcpy(buf, super->buf, len); + memcpy(buf, super->buf, super->len); + memset(buf + super->len, 0, len - super->len); free(super->buf); super->buf = buf; super->len = len; @@ -1614,7 +2106,6 @@ static int load_imsm_mpb(int fd, struct intel_super *super, char *devname) struct stat; struct imsm_super *anchor; __u32 check_sum; - int rc; get_dev_size(fd, NULL, &dsize); @@ -1665,10 +2156,18 @@ static int load_imsm_mpb(int fd, struct intel_super *super, char *devname) sectors = mpb_sectors(anchor) - 1; free(anchor); if (!sectors) { - rc = load_imsm_disk(fd, super, devname, 0); - if (rc == 0) - rc = parse_raid_devices(super); - return rc; + check_sum = __gen_imsm_checksum(super->anchor); + if (check_sum != __le32_to_cpu(super->anchor->check_sum)) { + if (devname) + fprintf(stderr, + Name ": IMSM checksum %x != %x on %s\n", + check_sum, + __le32_to_cpu(super->anchor->check_sum), + devname); + return 2; + } + + return 
0; } /* read the extended mpb */ @@ -1695,7 +2194,7 @@ static int load_imsm_mpb(int fd, struct intel_super *super, char *devname) Name ": IMSM checksum %x != %x on %s\n", check_sum, __le32_to_cpu(super->anchor->check_sum), devname); - return 2; + return 3; } /* FIXME the BBM log is disk specific so we cannot use this global @@ -1704,11 +2203,23 @@ static int load_imsm_mpb(int fd, struct intel_super *super, char *devname) */ super->bbm_log = __get_imsm_bbm_log(super->anchor); - rc = load_imsm_disk(fd, super, devname, 0); - if (rc == 0) - rc = parse_raid_devices(super); + return 0; +} + +static int +load_and_parse_mpb(int fd, struct intel_super *super, char *devname, int keep_fd) +{ + int err; + + err = load_imsm_mpb(fd, super, devname); + if (err) + return err; + err = load_imsm_disk(fd, super, devname, keep_fd); + if (err) + return err; + err = parse_raid_devices(super); - return rc; + return err; } static void __free_imsm_disk(struct dl *d) @@ -1742,19 +2253,13 @@ static void free_imsm_disks(struct intel_super *super) /* free all the pieces hanging off of a super pointer */ static void __free_imsm(struct intel_super *super, int free_disks) { - int i; - if (super->buf) { free(super->buf); super->buf = NULL; } if (free_disks) free_imsm_disks(super); - for (i = 0; i < IMSM_MAX_RAID_DEVS; i++) - if (super->dev_tbl[i]) { - free(super->dev_tbl[i]); - super->dev_tbl[i] = NULL; - } + free_devlist(super); if (super->hba) { free((void *) super->hba); super->hba = NULL; @@ -1789,7 +2294,7 @@ static struct intel_super *alloc_super(int creating_imsm) super->create_offset = ~((__u32 ) 0); if (!check_env("IMSM_NO_PLATFORM")) super->orom = find_imsm_orom(); - if (super->orom) { + if (super->orom && !check_env("IMSM_TEST_OROM")) { struct sys_dev *list, *ent; /* find the first intel ahci controller */ @@ -1825,11 +2330,6 @@ static int find_missing(struct intel_super *super) dl = serial_to_dl(disk->serial, super); if (dl) continue; - /* ok we have a 'disk' without a live entry in - 
* super->disks - */ - if (disk->status & FAILED_DISK || !(disk->status & USABLE_DISK)) - continue; /* never mind, already marked */ dl = malloc(sizeof(*dl)); if (!dl) @@ -1841,6 +2341,7 @@ static int find_missing(struct intel_super *super) dl->index = i; serialcpy(dl->serial, disk->serial); dl->disk = *disk; + dl->e = NULL; dl->next = super->missing; super->missing = dl; } @@ -1848,20 +2349,341 @@ static int find_missing(struct intel_super *super) return 0; } +static struct intel_disk *disk_list_get(__u8 *serial, struct intel_disk *disk_list) +{ + struct intel_disk *idisk = disk_list; + + while (idisk) { + if (serialcmp(idisk->disk.serial, serial) == 0) + break; + idisk = idisk->next; + } + + return idisk; +} + +static int __prep_thunderdome(struct intel_super **table, int tbl_size, + struct intel_super *super, + struct intel_disk **disk_list) +{ + struct imsm_disk *d = &super->disks->disk; + struct imsm_super *mpb = super->anchor; + int i, j; + + for (i = 0; i < tbl_size; i++) { + struct imsm_super *tbl_mpb = table[i]->anchor; + struct imsm_disk *tbl_d = &table[i]->disks->disk; + + if (tbl_mpb->family_num == mpb->family_num) { + if (tbl_mpb->check_sum == mpb->check_sum) { + dprintf("%s: mpb from %d:%d matches %d:%d\n", + __func__, super->disks->major, + super->disks->minor, + table[i]->disks->major, + table[i]->disks->minor); + break; + } + + if (((is_configured(d) && !is_configured(tbl_d)) || + is_configured(d) == is_configured(tbl_d)) && + tbl_mpb->generation_num < mpb->generation_num) { + /* current version of the mpb is a + * better candidate than the one in + * super_table, but copy over "cross + * generational" status + */ + struct intel_disk *idisk; + + dprintf("%s: mpb from %d:%d replaces %d:%d\n", + __func__, super->disks->major, + super->disks->minor, + table[i]->disks->major, + table[i]->disks->minor); + + idisk = disk_list_get(tbl_d->serial, *disk_list); + if (idisk && is_failed(&idisk->disk)) + tbl_d->status |= FAILED_DISK; + break; + } else { + 
struct intel_disk *idisk; + struct imsm_disk *disk; + + /* tbl_mpb is more up to date, but copy + * over cross generational status before + * returning + */ + disk = __serial_to_disk(d->serial, mpb, NULL); + if (disk && is_failed(disk)) + d->status |= FAILED_DISK; + + idisk = disk_list_get(d->serial, *disk_list); + if (idisk) { + idisk->owner = i; + if (disk && is_configured(disk)) + idisk->disk.status |= CONFIGURED_DISK; + } + + dprintf("%s: mpb from %d:%d prefer %d:%d\n", + __func__, super->disks->major, + super->disks->minor, + table[i]->disks->major, + table[i]->disks->minor); + + return tbl_size; + } + } + } + + if (i >= tbl_size) + table[tbl_size++] = super; + else + table[i] = super; + + /* update/extend the merged list of imsm_disk records */ + for (j = 0; j < mpb->num_disks; j++) { + struct imsm_disk *disk = __get_imsm_disk(mpb, j); + struct intel_disk *idisk; + + idisk = disk_list_get(disk->serial, *disk_list); + if (idisk) { + idisk->disk.status |= disk->status; + if (is_configured(&idisk->disk) || + is_failed(&idisk->disk)) + idisk->disk.status &= ~(SPARE_DISK); + } else { + idisk = calloc(1, sizeof(*idisk)); + if (!idisk) + return -1; + idisk->owner = IMSM_UNKNOWN_OWNER; + idisk->disk = *disk; + idisk->next = *disk_list; + *disk_list = idisk; + } + + if (serialcmp(idisk->disk.serial, d->serial) == 0) + idisk->owner = i; + } + + return tbl_size; +} + +static struct intel_super * +validate_members(struct intel_super *super, struct intel_disk *disk_list, + const int owner) +{ + struct imsm_super *mpb = super->anchor; + int ok_count = 0; + int i; + + for (i = 0; i < mpb->num_disks; i++) { + struct imsm_disk *disk = __get_imsm_disk(mpb, i); + struct intel_disk *idisk; + + idisk = disk_list_get(disk->serial, disk_list); + if (idisk) { + if (idisk->owner == owner || + idisk->owner == IMSM_UNKNOWN_OWNER) + ok_count++; + else + dprintf("%s: '%.16s' owner %d != %d\n", + __func__, disk->serial, idisk->owner, + owner); + } else { + dprintf("%s: unknown disk %x 
[%d]: %.16s\n", + __func__, __le32_to_cpu(mpb->family_num), i, + disk->serial); + break; + } + } + + if (ok_count == mpb->num_disks) + return super; + return NULL; +} + +static void show_conflicts(__u32 family_num, struct intel_super *super_list) +{ + struct intel_super *s; + + for (s = super_list; s; s = s->next) { + if (family_num != s->anchor->family_num) + continue; + fprintf(stderr, "Conflict, offlining family %#x on '%s'\n", + __le32_to_cpu(family_num), s->disks->devname); + } +} + +static struct intel_super * +imsm_thunderdome(struct intel_super **super_list, int len) +{ + struct intel_super *super_table[len]; + struct intel_disk *disk_list = NULL; + struct intel_super *champion, *spare; + struct intel_super *s, **del; + int tbl_size = 0; + int conflict; + int i; + + memset(super_table, 0, sizeof(super_table)); + for (s = *super_list; s; s = s->next) + tbl_size = __prep_thunderdome(super_table, tbl_size, s, &disk_list); + + for (i = 0; i < tbl_size; i++) { + struct imsm_disk *d; + struct intel_disk *idisk; + struct imsm_super *mpb = super_table[i]->anchor; + + s = super_table[i]; + d = &s->disks->disk; + + /* 'd' must appear in merged disk list for its + * configuration to be valid + */ + idisk = disk_list_get(d->serial, disk_list); + if (idisk && idisk->owner == i) + s = validate_members(s, disk_list, i); + else + s = NULL; + + if (!s) + dprintf("%s: marking family: %#x from %d:%d offline\n", + __func__, mpb->family_num, + super_table[i]->disks->major, + super_table[i]->disks->minor); + super_table[i] = s; + } + + /* This is where the mdadm implementation differs from the Windows + * driver which has no strict concept of a container. We can only + * assemble one family from a container, so when returning a prodigal + * array member to this system the code will not be able to disambiguate + * the container contents that should be assembled ("foreign" versus + * "local"). 
It requires user intervention to set the orig_family_num + * to a new value to establish a new container. The Windows driver in + * this situation fixes up the volume name in place and manages the + * foreign array as an independent entity. + */ + s = NULL; + spare = NULL; + conflict = 0; + for (i = 0; i < tbl_size; i++) { + struct intel_super *tbl_ent = super_table[i]; + int is_spare = 0; + + if (!tbl_ent) + continue; + + if (tbl_ent->anchor->num_raid_devs == 0) { + spare = tbl_ent; + is_spare = 1; + } + + if (s && !is_spare) { + show_conflicts(tbl_ent->anchor->family_num, *super_list); + conflict++; + } else if (!s && !is_spare) + s = tbl_ent; + } + + if (!s) + s = spare; + if (!s) { + champion = NULL; + goto out; + } + champion = s; + + if (conflict) + fprintf(stderr, "Chose family %#x on '%s', " + "assemble conflicts to new container with '--update=uuid'\n", + __le32_to_cpu(s->anchor->family_num), s->disks->devname); + + /* collect all dl's onto 'champion', and update them to + * champion's version of the status + */ + for (s = *super_list; s; s = s->next) { + struct imsm_super *mpb = champion->anchor; + struct dl *dl = s->disks; + + if (s == champion) + continue; + + for (i = 0; i < mpb->num_disks; i++) { + struct imsm_disk *disk; + + disk = __serial_to_disk(dl->serial, mpb, &dl->index); + if (disk) { + dl->disk = *disk; + /* only set index on disks that are a member of + * a populated contianer, i.e. 
one with + * raid_devs + */ + if (is_failed(&dl->disk)) + dl->index = -2; + else if (is_spare(&dl->disk)) + dl->index = -1; + break; + } + } + + if (i >= mpb->num_disks) { + struct intel_disk *idisk; + + idisk = disk_list_get(dl->serial, disk_list); + if (idisk && is_spare(&idisk->disk) && + !is_failed(&idisk->disk) && !is_configured(&idisk->disk)) + dl->index = -1; + else { + dl->index = -2; + continue; + } + } + + dl->next = champion->disks; + champion->disks = dl; + s->disks = NULL; + } + + /* delete 'champion' from super_list */ + for (del = super_list; *del; ) { + if (*del == champion) { + *del = (*del)->next; + break; + } else + del = &(*del)->next; + } + champion->next = NULL; + + out: + while (disk_list) { + struct intel_disk *idisk = disk_list; + + disk_list = disk_list->next; + free(idisk); + } + + return champion; +} + static int load_super_imsm_all(struct supertype *st, int fd, void **sbp, char *devname, int keep_fd) { struct mdinfo *sra; - struct intel_super *super; - struct mdinfo *sd, *best = NULL; - __u32 bestgen = 0; - __u32 gen; - char nm[20]; - int dfd; - int rv; + struct intel_super *super_list = NULL; + struct intel_super *super = NULL; + int devnum = fd2devnum(fd); + struct mdinfo *sd; + int retry; + int err = 0; + int i; + enum sysfs_read_flags flags; + + flags = GET_LEVEL|GET_VERSION|GET_DEVS|GET_STATE; + if (mdmon_running(devnum)) + flags |= SKIP_GONE_DEVS; - /* check if this disk is a member of an active array */ - sra = sysfs_read(fd, 0, GET_LEVEL|GET_VERSION|GET_DEVS|GET_STATE); + /* check if 'fd' an opened container */ + sra = sysfs_read(fd, 0, flags); if (!sra) return 1; @@ -1870,84 +2692,78 @@ static int load_super_imsm_all(struct supertype *st, int fd, void **sbp, strcmp(sra->text_version, "imsm") != 0) return 1; - super = alloc_super(0); - if (!super) - return 1; + /* load all mpbs */ + for (sd = sra->devs, i = 0; sd; sd = sd->next, i++) { + struct intel_super *s = alloc_super(0); + char nm[20]; + int dfd; + + err = 1; + if (!s) + 
goto error; + s->next = super_list; + super_list = s; - /* find the most up to date disk in this array, skipping spares */ - for (sd = sra->devs; sd; sd = sd->next) { + err = 2; sprintf(nm, "%d:%d", sd->disk.major, sd->disk.minor); dfd = dev_open(nm, keep_fd ? O_RDWR : O_RDONLY); - if (!dfd) { - free_imsm(super); - return 2; - } - rv = load_imsm_mpb(dfd, super, NULL); + if (dfd < 0) + goto error; + + err = load_and_parse_mpb(dfd, s, NULL, keep_fd); + + /* retry the load if we might have raced against mdmon */ + if (err == 3 && mdmon_running(devnum)) + for (retry = 0; retry < 3; retry++) { + usleep(3000); + err = load_and_parse_mpb(dfd, s, NULL, keep_fd); + if (err != 3) + break; + } if (!keep_fd) close(dfd); - if (rv == 0) { - if (super->anchor->num_raid_devs == 0) - gen = 0; - else - gen = __le32_to_cpu(super->anchor->generation_num); - if (!best || gen > bestgen) { - bestgen = gen; - best = sd; - } - } else { - free_imsm(super); - return 2; - } + if (err) + goto error; } - if (!best) { - free_imsm(super); - return 1; + /* all mpbs enter, maybe one leaves */ + super = imsm_thunderdome(&super_list, i); + if (!super) { + err = 1; + goto error; } - /* load the most up to date anchor */ - sprintf(nm, "%d:%d", best->disk.major, best->disk.minor); - dfd = dev_open(nm, O_RDONLY); - if (!dfd) { - free_imsm(super); - return 1; - } - rv = load_imsm_mpb(dfd, super, NULL); - close(dfd); - if (rv != 0) { + if (find_missing(super) != 0) { free_imsm(super); - return 2; + err = 2; + goto error; } - /* re-parse the disk list with the current anchor */ - for (sd = sra->devs ; sd ; sd = sd->next) { - sprintf(nm, "%d:%d", sd->disk.major, sd->disk.minor); - dfd = dev_open(nm, keep_fd? 
O_RDWR : O_RDONLY); - if (!dfd) { + if (st->subarray[0]) { + if (atoi(st->subarray) <= super->anchor->num_raid_devs) + super->current_vol = atoi(st->subarray); + else { free_imsm(super); - return 2; + err = 1; + goto error; } - load_imsm_disk(dfd, super, NULL, keep_fd); - if (!keep_fd) - close(dfd); } + err = 0; + error: + while (super_list) { + struct intel_super *s = super_list; - if (find_missing(super) != 0) { - free_imsm(super); - return 2; + super_list = super_list->next; + free_imsm(s); } - if (st->subarray[0]) { - if (atoi(st->subarray) <= super->anchor->num_raid_devs) - super->current_vol = atoi(st->subarray); - else - return 1; - } + if (err) + return err; *sbp = super; - st->container_dev = fd2devnum(fd); - if (st->ss == NULL) { + st->container_dev = devnum; + if (err == 0 && st->ss == NULL) { st->ss = &super_imsm; st->minor_version = 0; st->max_devs = IMSM_MAX_DEVICES; @@ -1967,8 +2783,8 @@ static int load_super_imsm(struct supertype *st, int fd, char *devname) if (load_super_imsm_all(st, fd, &st->sb, devname, 1) == 0) return 0; #endif - if (st->subarray[0]) - return 1; /* FIXME */ + + free_super_imsm(st); super = alloc_super(0); if (!super) { @@ -1978,7 +2794,7 @@ static int load_super_imsm(struct supertype *st, int fd, char *devname) return 1; } - rv = load_imsm_mpb(fd, super, devname); + rv = load_and_parse_mpb(fd, super, devname, 0); if (rv) { if (devname) @@ -1989,6 +2805,15 @@ static int load_super_imsm(struct supertype *st, int fd, char *devname) return rv; } + if (st->subarray[0]) { + if (atoi(st->subarray) <= super->anchor->num_raid_devs) + super->current_vol = atoi(st->subarray); + else { + free_imsm(super); + return 1; + } + } + st->sb = super; if (st->ss == NULL) { st->ss = &super_imsm; @@ -2007,20 +2832,22 @@ static __u16 info_to_blocks_per_strip(mdu_array_info_t *info) return info->chunk_size >> 9; } -static __u32 info_to_num_data_stripes(mdu_array_info_t *info) +static __u32 info_to_num_data_stripes(mdu_array_info_t *info, int 
num_domains) { __u32 num_stripes; num_stripes = (info->size * 2) / info_to_blocks_per_strip(info); - if (info->level == 1) - num_stripes /= 2; + num_stripes /= num_domains; return num_stripes; } static __u32 info_to_blocks_per_member(mdu_array_info_t *info) { - return (info->size * 2) & ~(info_to_blocks_per_strip(info) - 1); + if (info->level == 1) + return info->size * 2; + else + return (info->size * 2) & ~(info_to_blocks_per_strip(info) - 1); } static void imsm_update_version_info(struct intel_super *super) @@ -2079,6 +2906,7 @@ static int init_super_imsm_volume(struct supertype *st, mdu_array_info_t *info, */ struct intel_super *super = st->sb; struct imsm_super *mpb = super->anchor; + struct intel_dev *dv; struct imsm_dev *dev; struct imsm_vol *vol; struct imsm_map *map; @@ -2086,6 +2914,7 @@ static int init_super_imsm_volume(struct supertype *st, mdu_array_info_t *info, int i; unsigned long long array_blocks; size_t size_old, size_new; + __u32 num_data_stripes; if (super->orom && mpb->num_raid_devs >= super->orom->vpa) { fprintf(stderr, Name": This imsm-container already has the " @@ -2118,9 +2947,26 @@ static int init_super_imsm_volume(struct supertype *st, mdu_array_info_t *info, */ if (super->current_vol == 0) mpb->num_disks = 0; + + for (i = 0; i < super->current_vol; i++) { + dev = get_imsm_dev(super, i); + if (strncmp((char *) dev->volume, name, + MAX_RAID_SERIAL_LEN) == 0) { + fprintf(stderr, Name": '%s' is already defined for this container\n", + name); + return 0; + } + } + sprintf(st->subarray, "%d", idx); + dv = malloc(sizeof(*dv)); + if (!dv) { + fprintf(stderr, Name ": failed to allocate device list entry\n"); + return 0; + } dev = malloc(sizeof(*dev) + sizeof(__u32) * (info->raid_disks - 1)); if (!dev) { + free(dv); fprintf(stderr, Name": could not allocate raid device\n"); return 0; } @@ -2131,35 +2977,44 @@ static int init_super_imsm_volume(struct supertype *st, mdu_array_info_t *info, array_blocks = calc_array_size(info->level, 
info->raid_disks, info->layout, info->chunk_size, info->size*2); + /* round array size down to closest MB */ + array_blocks = (array_blocks >> SECT_PER_MB_SHIFT) << SECT_PER_MB_SHIFT; + dev->size_low = __cpu_to_le32((__u32) array_blocks); dev->size_high = __cpu_to_le32((__u32) (array_blocks >> 32)); dev->status = __cpu_to_le32(0); dev->reserved_blocks = __cpu_to_le32(0); vol = &dev->vol; vol->migr_state = 0; - vol->migr_type = MIGR_INIT; + set_migr_type(dev, MIGR_INIT); vol->dirty = 0; vol->curr_migr_unit = 0; map = get_imsm_map(dev, 0); map->pba_of_lba0 = __cpu_to_le32(super->create_offset); map->blocks_per_member = __cpu_to_le32(info_to_blocks_per_member(info)); map->blocks_per_strip = __cpu_to_le16(info_to_blocks_per_strip(info)); - map->num_data_stripes = __cpu_to_le32(info_to_num_data_stripes(info)); + map->failed_disk_num = ~0; map->map_state = info->level ? IMSM_T_STATE_UNINITIALIZED : IMSM_T_STATE_NORMAL; + map->ddf = 1; if (info->level == 1 && info->raid_disks > 2) { fprintf(stderr, Name": imsm does not support more than 2 disks" "in a raid1 volume\n"); return 0; } + + map->raid_level = info->level; if (info->level == 10) { map->raid_level = 1; map->num_domains = info->raid_disks / 2; - } else { - map->raid_level = info->level; - map->num_domains = !!map->raid_level; - } + } else if (info->level == 1) + map->num_domains = info->raid_disks; + else + map->num_domains = 1; + + num_data_stripes = info_to_num_data_stripes(info, map->num_domains); + map->num_data_stripes = __cpu_to_le32(num_data_stripes); map->num_members = info->raid_disks; for (i = 0; i < map->num_members; i++) { @@ -2167,7 +3022,11 @@ static int init_super_imsm_volume(struct supertype *st, mdu_array_info_t *info, set_imsm_ord_tbl_ent(map, i, 0); } mpb->num_raid_devs++; - super->dev_tbl[super->current_vol] = dev; + + dv->dev = dev; + dv->index = super->current_vol; + dv->next = super->devlist; + super->devlist = dv; imsm_update_version_info(super); @@ -2191,24 +3050,33 @@ static int 
init_super_imsm(struct supertype *st, mdu_array_info_t *info, size_t mpb_size; char *version; - if (!info) { - st->sb = NULL; - return 0; - } if (st->sb) - return init_super_imsm_volume(st, info, size, name, homehost, - uuid); + return init_super_imsm_volume(st, info, size, name, homehost, uuid); + + if (info) + mpb_size = disks_to_mpb_size(info->nr_disks); + else + mpb_size = 512; super = alloc_super(1); - if (!super) - return 0; - mpb_size = disks_to_mpb_size(info->nr_disks); - if (posix_memalign(&super->buf, 512, mpb_size) != 0) { + if (super && posix_memalign(&super->buf, 512, mpb_size) != 0) { free(super); + super = NULL; + } + if (!super) { + fprintf(stderr, Name + ": %s could not allocate superblock\n", __func__); return 0; } + memset(super->buf, 0, mpb_size); mpb = super->buf; - memset(mpb, 0, mpb_size); + mpb->mpb_size = __cpu_to_le32(mpb_size); + st->sb = super; + + if (info == NULL) { + /* zeroing superblock */ + return 0; + } mpb->attributes = MPB_ATTRIB_CHECKSUM_VERIFY; @@ -2216,9 +3084,7 @@ static int init_super_imsm(struct supertype *st, mdu_array_info_t *info, strcpy(version, MPB_SIGNATURE); version += strlen(MPB_SIGNATURE); strcpy(version, MPB_VERSION_RAID0); - mpb->mpb_size = mpb_size; - st->sb = super; return 1; } @@ -2241,10 +3107,19 @@ static int add_to_super_imsm_volume(struct supertype *st, mdu_disk_info_t *dk, return 1; } - for (dl = super->disks; dl ; dl = dl->next) - if (dl->major == dk->major && - dl->minor == dk->minor) - break; + if (fd == -1) { + /* we're doing autolayout so grab the pre-marked (in + * validate_geometry) raid_disk + */ + for (dl = super->disks; dl; dl = dl->next) + if (dl->raiddisk == dk->raid_disk) + break; + } else { + for (dl = super->disks; dl ; dl = dl->next) + if (dl->major == dk->major && + dl->minor == dk->minor) + break; + } if (!dl) { fprintf(stderr, Name ": %s is not a member of the same container\n", devname); @@ -2257,7 +3132,7 @@ static int add_to_super_imsm_volume(struct supertype *st, mdu_disk_info_t 
*dk, super->anchor->num_disks++; } set_imsm_ord_tbl_ent(map, dk->number, dl->index); - dl->disk.status = CONFIGURED_DISK | USABLE_DISK; + dl->disk.status = CONFIGURED_DISK; /* if we are creating the first raid device update the family number */ if (super->current_vol == 0) { @@ -2267,8 +3142,10 @@ static int add_to_super_imsm_volume(struct supertype *st, mdu_disk_info_t *dk, *_dev = *dev; *_disk = dl->disk; - sum = __gen_imsm_checksum(mpb); + sum = random32(); + sum += __gen_imsm_checksum(mpb); mpb->family_num = __cpu_to_le32(sum); + mpb->orig_family_num = mpb->family_num; } return 0; @@ -2310,6 +3187,7 @@ static int add_to_super_imsm(struct supertype *st, mdu_disk_info_t *dk, dd->index = -1; dd->devname = devname ? strdup(devname) : NULL; dd->fd = fd; + dd->e = NULL; rv = imsm_read_serial(fd, devname, dd->serial); if (rv) { fprintf(stderr, @@ -2322,7 +3200,7 @@ static int add_to_super_imsm(struct supertype *st, mdu_disk_info_t *dk, size /= 512; serialcpy(dd->disk.serial, dd->serial); dd->disk.total_blocks = __cpu_to_le32(size); - dd->disk.status = USABLE_DISK | SPARE_DISK; + dd->disk.status = SPARE_DISK; if (sysfs_disk_to_scsi_id(fd, &id) == 0) dd->disk.scsi_id = __cpu_to_le32(id); else @@ -2339,38 +3217,48 @@ static int add_to_super_imsm(struct supertype *st, mdu_disk_info_t *dk, return 0; } -static int store_imsm_mpb(int fd, struct intel_super *super); +static int store_imsm_mpb(int fd, struct imsm_super *mpb); + +static union { + char buf[512]; + struct imsm_super anchor; +} spare_record __attribute__ ((aligned(512))); /* spare records have their own family number and do not have any defined raid * devices */ static int write_super_imsm_spares(struct intel_super *super, int doclose) { - struct imsm_super mpb_save; struct imsm_super *mpb = super->anchor; + struct imsm_super *spare = &spare_record.anchor; __u32 sum; struct dl *d; - mpb_save = *mpb; - mpb->num_raid_devs = 0; - mpb->num_disks = 1; - mpb->mpb_size = sizeof(struct imsm_super); - mpb->generation_num = 
__cpu_to_le32(1UL); + spare->mpb_size = __cpu_to_le32(sizeof(struct imsm_super)), + spare->generation_num = __cpu_to_le32(1UL), + spare->attributes = MPB_ATTRIB_CHECKSUM_VERIFY; + spare->num_disks = 1, + spare->num_raid_devs = 0, + spare->cache_size = mpb->cache_size, + spare->pwr_cycle_count = __cpu_to_le32(1), + + snprintf((char *) spare->sig, MAX_SIGNATURE_LENGTH, + MPB_SIGNATURE MPB_VERSION_RAID0); for (d = super->disks; d; d = d->next) { if (d->index != -1) continue; - mpb->disk[0] = d->disk; - sum = __gen_imsm_checksum(mpb); - mpb->family_num = __cpu_to_le32(sum); - sum = __gen_imsm_checksum(mpb); - mpb->check_sum = __cpu_to_le32(sum); + spare->disk[0] = d->disk; + sum = __gen_imsm_checksum(spare); + spare->family_num = __cpu_to_le32(sum); + spare->orig_family_num = 0; + sum = __gen_imsm_checksum(spare); + spare->check_sum = __cpu_to_le32(sum); - if (store_imsm_mpb(d->fd, super)) { + if (store_imsm_mpb(d->fd, spare)) { fprintf(stderr, "%s: failed for device %d:%d %s\n", __func__, d->major, d->minor, strerror(errno)); - *mpb = mpb_save; return 1; } if (doclose) { @@ -2379,7 +3267,6 @@ static int write_super_imsm_spares(struct intel_super *super, int doclose) } } - *mpb = mpb_save; return 0; } @@ -2398,6 +3285,12 @@ static int write_super_imsm(struct intel_super *super, int doclose) generation++; mpb->generation_num = __cpu_to_le32(generation); + /* fix up cases where previous mdadm releases failed to set + * orig_family_num + */ + if (mpb->orig_family_num == 0) + mpb->orig_family_num = mpb->family_num; + mpb_size += sizeof(struct imsm_disk) * mpb->num_disks; for (d = super->disks; d; d = d->next) { if (d->index == -1) @@ -2411,7 +3304,7 @@ static int write_super_imsm(struct intel_super *super, int doclose) for (i = 0; i < mpb->num_raid_devs; i++) { struct imsm_dev *dev = __get_imsm_dev(mpb, i); - imsm_copy_dev(dev, super->dev_tbl[i]); + imsm_copy_dev(dev, get_imsm_dev(super, i)); mpb_size += sizeof_imsm_dev(dev, 0); } mpb_size += 
__le32_to_cpu(mpb->bbm_log_size); @@ -2425,7 +3318,7 @@ static int write_super_imsm(struct intel_super *super, int doclose) for (d = super->disks; d ; d = d->next) { if (d->index < 0) continue; - if (store_imsm_mpb(d->fd, super)) + if (store_imsm_mpb(d->fd, mpb)) fprintf(stderr, "%s: failed for device %d:%d %s\n", __func__, d->major, d->minor, strerror(errno)); if (doclose) { @@ -2441,17 +3334,16 @@ static int write_super_imsm(struct intel_super *super, int doclose) } -static int create_array(struct supertype *st) +static int create_array(struct supertype *st, int dev_idx) { size_t len; struct imsm_update_create_array *u; struct intel_super *super = st->sb; - struct imsm_dev *dev = get_imsm_dev(super, super->current_vol); + struct imsm_dev *dev = get_imsm_dev(super, dev_idx); struct imsm_map *map = get_imsm_map(dev, 0); struct disk_info *inf; struct imsm_disk *disk; int i; - int idx; len = sizeof(*u) - sizeof(*dev) + sizeof_imsm_dev(dev, 0) + sizeof(*inf) * map->num_members; @@ -2463,11 +3355,12 @@ static int create_array(struct supertype *st) } u->type = update_create_array; - u->dev_idx = super->current_vol; + u->dev_idx = dev_idx; imsm_copy_dev(&u->dev, dev); inf = get_disk_info(u); for (i = 0; i < map->num_members; i++) { - idx = get_imsm_disk_idx(dev, i); + int idx = get_imsm_disk_idx(dev, i); + disk = get_imsm_disk(super, idx); serialcpy(inf[i].serial, disk->serial); } @@ -2501,21 +3394,26 @@ static int _add_disk(struct supertype *st) static int write_init_super_imsm(struct supertype *st) { + struct intel_super *super = st->sb; + int current_vol = super->current_vol; + + /* we are done with current_vol reset it to point st at the container */ + super->current_vol = -1; + if (st->update_tail) { /* queue the recently created array / added disk * as a metadata update */ - struct intel_super *super = st->sb; struct dl *d; int rv; /* determine if we are creating a volume or adding a disk */ - if (super->current_vol < 0) { + if (current_vol < 0) { /* in the add 
disk case we are running in mdmon * context, so don't close fd's */ return _add_disk(st); } else - rv = create_array(st); + rv = create_array(st, current_vol); for (d = super->disks; d ; d = d->next) { close(d->fd); @@ -2528,24 +3426,19 @@ static int write_init_super_imsm(struct supertype *st) } #endif -static int store_zero_imsm(struct supertype *st, int fd) +static int store_super_imsm(struct supertype *st, int fd) { - unsigned long long dsize; - void *buf; - - get_dev_size(fd, NULL, &dsize); - - /* first block is stored on second to last sector of the disk */ - if (lseek64(fd, dsize - (512 * 2), SEEK_SET) < 0) - return 1; + struct intel_super *super = st->sb; + struct imsm_super *mpb = super ? super->anchor : NULL; - if (posix_memalign(&buf, 512, 512) != 0) + if (!mpb) return 1; - memset(buf, 0, 512); - if (write(fd, buf, 512) != 512) - return 1; - return 0; +#ifndef MDASSEMBLE + return store_imsm_mpb(fd, mpb); +#else + return 1; +#endif } static int imsm_bbm_log_size(struct imsm_super *mpb) @@ -2637,12 +3530,12 @@ static unsigned long long merge_extents(struct intel_super *super, int sum_exten int i, j; int start_extent; unsigned long long pos; - unsigned long long start; + unsigned long long start = 0; unsigned long long maxsize; unsigned long reserve; if (!e) - return ~0ULL; /* error */ + return 0; /* coalesce and sort all extents. 
also, check to see if we need to * reserve space between member arrays @@ -2685,17 +3578,23 @@ static unsigned long long merge_extents(struct intel_super *super, int sum_exten } while (e[i-1].size); free(e); + if (maxsize == 0) + return 0; + + /* FIXME assumes volume at offset 0 is the first volume in a + * container + */ if (start_extent > 0) reserve = IMSM_RESERVED_SECTORS; /* gap between raid regions */ else reserve = 0; if (maxsize < reserve) - return ~0ULL; + return 0; super->create_offset = ~((__u32) 0); if (start + reserve > super->create_offset) - return ~0ULL; /* start overflows create_offset */ + return 0; /* start overflows create_offset */ super->create_offset = start + reserve; return maxsize - reserve; @@ -2713,10 +3612,9 @@ static int is_raid_level_supported(const struct imsm_orom *orom, int level, int case 1: if (raiddisks > 2) return imsm_orom_has_raid1e(orom); - else - return imsm_orom_has_raid1(orom); - case 10: return imsm_orom_has_raid10(orom); - case 5: return imsm_orom_has_raid5(orom); + return imsm_orom_has_raid1(orom) && raiddisks == 2; + case 10: return imsm_orom_has_raid10(orom) && raiddisks == 4; + case 5: return imsm_orom_has_raid5(orom) && raiddisks > 2; } else return 1; /* not on an Intel RAID platform so anything goes */ @@ -2724,7 +3622,35 @@ static int is_raid_level_supported(const struct imsm_orom *orom, int level, int return 0; } -#define vprintf(fmt, arg...) (void) (verbose && fprintf(stderr, Name fmt, ##arg)) +#define pr_vrb(fmt, arg...) (void) (verbose && fprintf(stderr, Name fmt, ##arg)) +static int +validate_geometry_imsm_orom(struct intel_super *super, int level, int layout, + int raiddisks, int chunk, int verbose) +{ + if (!is_raid_level_supported(super->orom, level, raiddisks)) { + pr_vrb(": platform does not support raid%d with %d disk%s\n", + level, raiddisks, raiddisks > 1 ? 
"s" : ""); + return 0; + } + if (super->orom && level != 1 && + !imsm_orom_has_chunk(super->orom, chunk)) { + pr_vrb(": platform does not support a chunk size of: %d\n", chunk); + return 0; + } + if (layout != imsm_level_to_layout(level)) { + if (level == 5) + pr_vrb(": imsm raid 5 only supports the left-asymmetric layout\n"); + else if (level == 10) + pr_vrb(": imsm raid 10 only supports the n2 layout\n"); + else + pr_vrb(": imsm unknown layout %#x for this raid level %d\n", + layout, level); + return 0; + } + + return 1; +} + /* validate_geometry_imsm_volume - lifted from validate_geometry_ddf_bvd * FIX ME add ahci details */ @@ -2736,6 +3662,7 @@ static int validate_geometry_imsm_volume(struct supertype *st, int level, { struct stat stb; struct intel_super *super = st->sb; + struct imsm_super *mpb = super->anchor; struct dl *dl; unsigned long long pos = 0; unsigned long long maxsize; @@ -2746,32 +3673,16 @@ static int validate_geometry_imsm_volume(struct supertype *st, int level, if (!super) return 0; - if (!is_raid_level_supported(super->orom, level, raiddisks)) { - vprintf(": platform does not support raid level: %d\n", level); - return 0; - } - if (super->orom && !imsm_orom_has_chunk(super->orom, chunk)) { - vprintf(": platform does not support a chunk size of: %d\n", chunk); + if (!validate_geometry_imsm_orom(super, level, layout, raiddisks, chunk, verbose)) return 0; - } - if (layout != imsm_level_to_layout(level)) { - if (level == 5) - vprintf(": imsm raid 5 only supports the left-asymmetric layout\n"); - else if (level == 10) - vprintf(": imsm raid 10 only supports the n2 layout\n"); - else - vprintf(": imsm unknown layout %#x for this raid level %d\n", - layout, level); - return 0; - } if (!dev) { /* General test: make sure there is space for * 'raiddisks' device extents of size 'size' at a given * offset */ - unsigned long long minsize = size*2 /* convert to blocks */; - unsigned long long start_offset = ~0ULL; + unsigned long long minsize = size; + 
unsigned long long start_offset = MaxSector; int dcnt = 0; if (minsize == 0) minsize = MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS; @@ -2787,7 +3698,7 @@ static int validate_geometry_imsm_volume(struct supertype *st, int level, esize = e[i].start - pos; if (esize >= minsize) found = 1; - if (found && start_offset == ~0ULL) { + if (found && start_offset == MaxSector) { start_offset = pos; break; } else if (found && pos != start_offset) { @@ -2827,6 +3738,17 @@ static int validate_geometry_imsm_volume(struct supertype *st, int level, fprintf(stderr, Name ": %s is not in the " "same imsm set\n", dev); return 0; + } else if (super->orom && dl->index < 0 && mpb->num_raid_devs) { + /* If a volume is present then the current creation attempt + * cannot incorporate new spares because the orom may not + * understand this configuration (all member disks must be + * members of each array in the container). + */ + fprintf(stderr, Name ": %s is a spare and a volume" + " is already defined for this container\n", dev); + fprintf(stderr, Name ": The option-rom requires all member" + " disks to be a member of all volumes\n"); + return 0; } /* retrieve the largest free space block */ @@ -2865,15 +3787,11 @@ static int validate_geometry_imsm_volume(struct supertype *st, int level, i += dl->extent_cnt; maxsize = merge_extents(super, i); - if (maxsize < size) { + if (maxsize < size || maxsize == 0) { if (verbose) fprintf(stderr, Name ": not enough space after merge (%llu < %llu)\n", maxsize, size); return 0; - } else if (maxsize == ~0ULL) { - if (verbose) - fprintf(stderr, Name ": failed to merge %d extents\n", i); - return 0; } *freesize = maxsize; @@ -2881,6 +3799,79 @@ static int validate_geometry_imsm_volume(struct supertype *st, int level, return 1; } +static int reserve_space(struct supertype *st, int raiddisks, + unsigned long long size, int chunk, + unsigned long long *freesize) +{ + struct intel_super *super = st->sb; + struct imsm_super *mpb = super->anchor; + struct dl *dl; + int 
i; + int extent_cnt; + struct extent *e; + unsigned long long maxsize; + unsigned long long minsize; + int cnt; + int used; + + /* find the largest common start free region of the possible disks */ + used = 0; + extent_cnt = 0; + cnt = 0; + for (dl = super->disks; dl; dl = dl->next) { + dl->raiddisk = -1; + + if (dl->index >= 0) + used++; + + /* don't activate new spares if we are orom constrained + * and there is already a volume active in the container + */ + if (super->orom && dl->index < 0 && mpb->num_raid_devs) + continue; + + e = get_extents(super, dl); + if (!e) + continue; + for (i = 1; e[i-1].size; i++) + ; + dl->e = e; + dl->extent_cnt = i; + extent_cnt += i; + cnt++; + } + + maxsize = merge_extents(super, extent_cnt); + minsize = size; + if (size == 0) + minsize = chunk; + + if (cnt < raiddisks || + (super->orom && used && used != raiddisks) || + maxsize < minsize || + maxsize == 0) { + fprintf(stderr, Name ": not enough devices with space to create array.\n"); + return 0; /* No enough free spaces large enough */ + } + + if (size == 0) { + size = maxsize; + if (chunk) { + size /= chunk; + size *= chunk; + } + } + + cnt = 0; + for (dl = super->disks; dl; dl = dl->next) + if (dl->e) + dl->raiddisk = cnt++; + + *freesize = size; + + return 1; +} + static int validate_geometry_imsm(struct supertype *st, int level, int layout, int raiddisks, int chunk, unsigned long long size, char *dev, unsigned long long *freesize, @@ -2900,6 +3891,24 @@ static int validate_geometry_imsm(struct supertype *st, int level, int layout, verbose); } + if (!dev) { + if (st->sb && freesize) { + /* we are being asked to automatically layout a + * new volume based on the current contents of + * the container. If the the parameters can be + * satisfied reserve_space will record the disks, + * start offset, and size of the volume to be + * created. add_to_super and getinfo_super + * detect when autolayout is in progress. 
+ */ + if (!validate_geometry_imsm_orom(st->sb, level, layout, + raiddisks, chunk, + verbose)) + return 0; + return reserve_space(st, raiddisks, size, chunk, freesize); + } + return 1; + } if (st->sb) { /* creating in a given container */ return validate_geometry_imsm_volume(st, level, layout, @@ -2914,8 +3923,11 @@ static int validate_geometry_imsm(struct supertype *st, int level, int layout, case 1: case 10: case 5: - break; + return 0; default: + if (verbose) + fprintf(stderr, Name + ": IMSM only supports levels 0,1,5,10\n"); return 1; } @@ -2970,6 +3982,46 @@ static int validate_geometry_imsm(struct supertype *st, int level, int layout, } #endif /* MDASSEMBLE */ +static int is_rebuilding(struct imsm_dev *dev) +{ + struct imsm_map *migr_map; + + if (!dev->vol.migr_state) + return 0; + + if (migr_type(dev) != MIGR_REBUILD) + return 0; + + migr_map = get_imsm_map(dev, 1); + + if (migr_map->map_state == IMSM_T_STATE_DEGRADED) + return 1; + else + return 0; +} + +static void update_recovery_start(struct imsm_dev *dev, struct mdinfo *array) +{ + struct mdinfo *rebuild = NULL; + struct mdinfo *d; + __u32 units; + + if (!is_rebuilding(dev)) + return; + + /* Find the rebuild target, but punt on the dual rebuild case */ + for (d = array->devs; d; d = d->next) + if (d->recovery_start == 0) { + if (rebuild) + return; + rebuild = d; + } + + units = __le32_to_cpu(dev->vol.curr_migr_unit); + rebuild->recovery_start = units * blocks_per_migr_unit(dev); +} + + static struct mdinfo *container_content_imsm(struct supertype *st) { /* Given a container loaded by load_super_imsm_all, @@ -2998,6 +4050,18 @@ static struct mdinfo *container_content_imsm(struct supertype *st) struct mdinfo *this; int slot; + /* do not publish arrays that are in the middle of an + * unsupported migration + */ + if (dev->vol.migr_state && + (migr_type(dev) == MIGR_GEN_MIGR || + migr_type(dev) == MIGR_STATE_CHANGE)) { + fprintf(stderr, Name ": cannot assemble volume '%.16s':" + " unsupported migration in 
progress\n", + dev->volume); + continue; + } + this = malloc(sizeof(*this)); memset(this, 0, sizeof(*this)); this->next = rest; @@ -3005,11 +4069,11 @@ static struct mdinfo *container_content_imsm(struct supertype *st) super->current_vol = i; getinfo_super_imsm_volume(st, this); for (slot = 0 ; slot < map->num_members; slot++) { + unsigned long long recovery_start; struct mdinfo *info_d; struct dl *d; int idx; int skip; - __u32 s; __u32 ord; skip = 0; @@ -3019,37 +4083,41 @@ static struct mdinfo *container_content_imsm(struct supertype *st) if (d->index == idx) break; + recovery_start = MaxSector; if (d == NULL) skip = 1; - - s = d ? d->disk.status : 0; - if (s & FAILED_DISK) - skip = 1; - if (!(s & USABLE_DISK)) + if (d && is_failed(&d->disk)) skip = 1; if (ord & IMSM_ORD_REBUILD) - skip = 1; + recovery_start = 0; /* * if we skip some disks the array will be assmebled degraded; - * reset resync start to avoid a dirty-degraded situation + * reset resync start to avoid a dirty-degraded + * situation when performing the intial sync * * FIXME handle dirty degraded */ - if (skip && !dev->vol.dirty) - this->resync_start = ~0ULL; + if ((skip || recovery_start == 0) && !dev->vol.dirty) + this->resync_start = MaxSector; if (skip) continue; - info_d = malloc(sizeof(*info_d)); + info_d = calloc(1, sizeof(*info_d)); if (!info_d) { fprintf(stderr, Name ": failed to allocate disk" - " for volume %s\n", (char *) dev->volume); + " for volume %.16s\n", dev->volume); + info_d = this->devs; + while (info_d) { + struct mdinfo *d = info_d->next; + + free(info_d); + info_d = d; + } free(this); this = rest; break; } - memset(info_d, 0, sizeof(*info_d)); info_d->next = this->devs; this->devs = info_d; @@ -3057,8 +4125,10 @@ static struct mdinfo *container_content_imsm(struct supertype *st) info_d->disk.major = d->major; info_d->disk.minor = d->minor; info_d->disk.raid_disk = slot; + info_d->recovery_start = recovery_start; - this->array.working_disks++; + if (info_d->recovery_start == 
MaxSector) + this->array.working_disks++; info_d->events = __le32_to_cpu(mpb->generation_num); info_d->data_offset = __le32_to_cpu(map->pba_of_lba0); @@ -3066,6 +4136,8 @@ static struct mdinfo *container_content_imsm(struct supertype *st) if (d->devname) strcpy(info_d->name, d->devname); } + /* now that the disk list is up-to-date fixup recovery_start */ + update_recovery_start(dev, this); rest = this; } @@ -3132,8 +4204,7 @@ static __u8 imsm_check_degraded(struct intel_super *super, struct imsm_dev *dev, insync = 2; disk = get_imsm_disk(super, idx); - if (!disk || disk->status & FAILED_DISK || - ord & IMSM_ORD_REBUILD) + if (!disk || is_failed(disk) || ord & IMSM_ORD_REBUILD) insync--; /* no in-sync disks left in this mirror the @@ -3164,14 +4235,26 @@ static int imsm_count_failed(struct intel_super *super, struct imsm_dev *dev) int failed = 0; struct imsm_disk *disk; struct imsm_map *map = get_imsm_map(dev, 0); + struct imsm_map *prev = get_imsm_map(dev, dev->vol.migr_state); + __u32 ord; + int idx; - for (i = 0; i < map->num_members; i++) { - __u32 ord = get_imsm_ord_tbl_ent(dev, i); - int idx = ord_to_idx(ord); + /* at the beginning of migration we set IMSM_ORD_REBUILD on + * disks that are being rebuilt. New failures are recorded to + * map[0]. 
So we look through all the disks we started with and + * see if any failures are still present, or if any new ones + * have arrived + * + * FIXME add support for online capacity expansion and + * raid-level-migration + */ + for (i = 0; i < prev->num_members; i++) { + ord = __le32_to_cpu(prev->disk_ord_tbl[i]); + ord |= __le32_to_cpu(map->disk_ord_tbl[i]); + idx = ord_to_idx(ord); disk = get_imsm_disk(super, idx); - if (!disk || disk->status & FAILED_DISK || - ord & IMSM_ORD_REBUILD) + if (!disk || is_failed(disk) || ord & IMSM_ORD_REBUILD) failed++; } @@ -3185,7 +4268,8 @@ static int is_resyncing(struct imsm_dev *dev) if (!dev->vol.migr_state) return 0; - if (dev->vol.migr_type == MIGR_INIT) + if (migr_type(dev) == MIGR_INIT || + migr_type(dev) == MIGR_REPAIR) return 1; migr_map = get_imsm_map(dev, 1); @@ -3196,29 +4280,39 @@ static int is_resyncing(struct imsm_dev *dev) return 0; } -static int is_rebuilding(struct imsm_dev *dev) +/* return true if we recorded new information */ +static int mark_failure(struct imsm_dev *dev, struct imsm_disk *disk, int idx) { - struct imsm_map *migr_map; + __u32 ord; + int slot; + struct imsm_map *map; - if (!dev->vol.migr_state) - return 0; + /* new failures are always set in map[0] */ + map = get_imsm_map(dev, 0); - if (dev->vol.migr_type != MIGR_REBUILD) + slot = get_imsm_disk_slot(map, idx); + if (slot < 0) return 0; - migr_map = get_imsm_map(dev, 1); - - if (migr_map->map_state == IMSM_T_STATE_DEGRADED) - return 1; - else + ord = __le32_to_cpu(map->disk_ord_tbl[slot]); + if (is_failed(disk) && (ord & IMSM_ORD_REBUILD)) return 0; + + disk->status |= FAILED_DISK; + disk->status &= ~CONFIGURED_DISK; + set_imsm_ord_tbl_ent(map, slot, idx | IMSM_ORD_REBUILD); + if (~map->failed_disk_num == 0) + map->failed_disk_num = slot; + return 1; } -static void mark_failure(struct imsm_disk *disk) +static void mark_missing(struct imsm_dev *dev, struct imsm_disk *disk, int idx) { - if (disk->status & FAILED_DISK) + mark_failure(dev, disk, idx); 
+ + if (disk->scsi_id == __cpu_to_le32(~(__u32)0)) return; - disk->status |= FAILED_DISK; + disk->scsi_id = __cpu_to_le32(~(__u32)0); memmove(&disk->serial[0], &disk->serial[1], MAX_RAID_SERIAL_LEN - 1); } @@ -3236,6 +4330,7 @@ static int imsm_set_array_state(struct active_array *a, int consistent) struct imsm_map *map = get_imsm_map(dev, 0); int failed = imsm_count_failed(super, dev); __u8 map_state = imsm_check_degraded(super, dev, failed); + __u32 blocks_per_unit; /* before we activate this array handle any missing disks */ if (consistent == 2 && super->missing) { @@ -3244,19 +4339,20 @@ static int imsm_set_array_state(struct active_array *a, int consistent) dprintf("imsm: mark missing\n"); end_migration(dev, map_state); for (dl = super->missing; dl; dl = dl->next) - mark_failure(&dl->disk); + mark_missing(dev, &dl->disk, dl->index); super->updates_pending++; } - + if (consistent == 2 && - (!is_resync_complete(a) || + (!is_resync_complete(&a->info) || map_state != IMSM_T_STATE_NORMAL || dev->vol.migr_state)) consistent = 0; - if (is_resync_complete(a)) { + if (is_resync_complete(&a->info)) { /* complete intialization / resync, - * recovery is completed in ->set_disk + * recovery and interrupted recovery is completed in + * ->set_disk */ if (is_resyncing(dev)) { dprintf("imsm: mark resync done\n"); @@ -3265,26 +4361,40 @@ static int imsm_set_array_state(struct active_array *a, int consistent) } } else if (!is_resyncing(dev) && !failed) { /* mark the start of the init process if nothing is failed */ - dprintf("imsm: mark resync start (%llu)\n", a->resync_start); - if (map->map_state == IMSM_T_STATE_NORMAL) - migrate(dev, IMSM_T_STATE_NORMAL, MIGR_REBUILD); - else + dprintf("imsm: mark resync start\n"); + if (map->map_state == IMSM_T_STATE_UNINITIALIZED) migrate(dev, IMSM_T_STATE_NORMAL, MIGR_INIT); + else + migrate(dev, IMSM_T_STATE_NORMAL, MIGR_REPAIR); super->updates_pending++; } - /* check if we can update the migration checkpoint */ - if (dev->vol.migr_state 
&& - __le32_to_cpu(dev->vol.curr_migr_unit) != a->resync_start) { - dprintf("imsm: checkpoint migration (%llu)\n", a->resync_start); - dev->vol.curr_migr_unit = __cpu_to_le32(a->resync_start); - super->updates_pending++; + /* check if we can update curr_migr_unit from resync_start, recovery_start */ + blocks_per_unit = blocks_per_migr_unit(dev); + if (blocks_per_unit && failed <= 1) { + __u32 units32; + __u64 units; + + if (migr_type(dev) == MIGR_REBUILD) + units = min_recovery_start(&a->info) / blocks_per_unit; + else + units = a->info.resync_start / blocks_per_unit; + units32 = units; + + /* check that we did not overflow 32-bits, and that + * curr_migr_unit needs updating + */ + if (units32 == units && + __le32_to_cpu(dev->vol.curr_migr_unit) != units32) { + dprintf("imsm: mark checkpoint (%u)\n", units32); + dev->vol.curr_migr_unit = __cpu_to_le32(units32); + super->updates_pending++; + } } /* mark dirty / clean */ if (dev->vol.dirty != !consistent) { - dprintf("imsm: mark '%s' (%llu)\n", - consistent ? "clean" : "dirty", a->resync_start); + dprintf("imsm: mark '%s'\n", consistent ? 
"clean" : "dirty"); if (consistent) dev->vol.dirty = 0; else @@ -3318,13 +4428,13 @@ static void imsm_set_disk(struct active_array *a, int n, int state) disk = get_imsm_disk(super, ord_to_idx(ord)); /* check for new failures */ - if ((state & DS_FAULTY) && !(disk->status & FAILED_DISK)) { - mark_failure(disk); - super->updates_pending++; + if (state & DS_FAULTY) { + if (mark_failure(dev, disk, ord_to_idx(ord))) + super->updates_pending++; } /* check if in_sync */ - if (state & DS_INSYNC && ord & IMSM_ORD_REBUILD) { + if (state & DS_INSYNC && ord & IMSM_ORD_REBUILD && is_rebuilding(dev)) { struct imsm_map *migr_map = get_imsm_map(dev, 1); set_imsm_ord_tbl_ent(migr_map, n, ord_to_idx(ord)); @@ -3337,6 +4447,8 @@ static void imsm_set_disk(struct active_array *a, int n, int state) /* check if recovery complete, newly degraded, or failed */ if (map_state == IMSM_T_STATE_NORMAL && is_rebuilding(dev)) { end_migration(dev, map_state); + map = get_imsm_map(dev, 0); + map->failed_disk_num = ~0; super->updates_pending++; } else if (map_state == IMSM_T_STATE_DEGRADED && map->map_state != map_state && @@ -3352,9 +4464,9 @@ static void imsm_set_disk(struct active_array *a, int n, int state) } } -static int store_imsm_mpb(int fd, struct intel_super *super) +static int store_imsm_mpb(int fd, struct imsm_super *mpb) { - struct imsm_super *mpb = super->anchor; + void *buf = mpb; __u32 mpb_size = __le32_to_cpu(mpb->mpb_size); unsigned long long dsize; unsigned long long sectors; @@ -3369,7 +4481,7 @@ static int store_imsm_mpb(int fd, struct intel_super *super) if (lseek64(fd, dsize - (512 * (2 + sectors)), SEEK_SET) < 0) return 1; - if (write(fd, super->buf + 512, 512 * sectors) != 512 * sectors) + if (write(fd, buf + 512, 512 * sectors) != 512 * sectors) return 1; } @@ -3377,7 +4489,7 @@ static int store_imsm_mpb(int fd, struct intel_super *super) if (lseek64(fd, dsize - (512 * 2), SEEK_SET) < 0) return 1; - if (write(fd, super->buf, 512) != 512) + if (write(fd, buf, 512) != 512) 
return 1; return 0; @@ -3405,7 +4517,7 @@ static struct dl *imsm_readd(struct intel_super *super, int idx, struct active_a if (dl->index == i) break; - if (dl && dl->disk.status & FAILED_DISK) + if (dl && is_failed(&dl->disk)) dl = NULL; if (dl) @@ -3414,18 +4526,20 @@ static struct dl *imsm_readd(struct intel_super *super, int idx, struct active_a return dl; } -static struct dl *imsm_add_spare(struct intel_super *super, int slot, struct active_array *a) +static struct dl *imsm_add_spare(struct intel_super *super, int slot, + struct active_array *a, int activate_new) { struct imsm_dev *dev = get_imsm_dev(super, a->info.container_member); int idx = get_imsm_disk_idx(dev, slot); - struct imsm_map *map = get_imsm_map(dev, 0); - unsigned long long esize; + struct imsm_super *mpb = super->anchor; + struct imsm_map *map; unsigned long long pos; struct mdinfo *d; struct extent *ex; - int j; + int i, j; int found; __u32 array_start; + __u32 array_end; struct dl *dl; for (dl = super->disks; dl; dl = dl->next) { @@ -3441,52 +4555,68 @@ static struct dl *imsm_add_spare(struct intel_super *super, int slot, struct act continue; /* skip in use or failed drives */ - if (dl->disk.status & FAILED_DISK || idx == dl->index) { - dprintf("%x:%x status ( %s%s)\n", - dl->major, dl->minor, - dl->disk.status & FAILED_DISK ? "failed " : "", - idx == dl->index ? "in use " : ""); + if (is_failed(&dl->disk) || idx == dl->index || + dl->index == -2) { + dprintf("%x:%x status (failed: %d index: %d)\n", + dl->major, dl->minor, is_failed(&dl->disk), idx); continue; } + /* skip pure spares when we are looking for partially + * assimilated drives + */ + if (dl->index == -1 && !activate_new) + continue; + /* Does this unused device have the requisite free space? 
- * We need a->info.component_size sectors + * It needs to be able to cover all member volumes */ ex = get_extents(super, dl); if (!ex) { dprintf("cannot get extents\n"); continue; } - found = 0; - j = 0; - pos = 0; - array_start = __le32_to_cpu(map->pba_of_lba0); + for (i = 0; i < mpb->num_raid_devs; i++) { + dev = get_imsm_dev(super, i); + map = get_imsm_map(dev, 0); - do { - /* check that we can start at pba_of_lba0 with - * a->info.component_size of space + /* check if this disk is already a member of + * this array */ - esize = ex[j].start - pos; - if (array_start >= pos && - array_start + a->info.component_size < ex[j].start) { - found = 1; + if (get_imsm_disk_slot(map, dl->index) >= 0) + continue; + + found = 0; + j = 0; + pos = 0; + array_start = __le32_to_cpu(map->pba_of_lba0); + array_end = array_start + + __le32_to_cpu(map->blocks_per_member) - 1; + + do { + /* check that we can start at pba_of_lba0 with + * blocks_per_member of space + */ + if (array_start >= pos && array_end < ex[j].start) { + found = 1; + break; + } + pos = ex[j].start + ex[j].size; + j++; + } while (ex[j-1].size); + + if (!found) break; - } - pos = ex[j].start + ex[j].size; - j++; - - } while (ex[j-1].size); + } free(ex); - if (!found) { - dprintf("%x:%x does not have %llu at %d\n", - dl->major, dl->minor, - a->info.component_size, - __le32_to_cpu(map->pba_of_lba0)); + if (i < mpb->num_raid_devs) { + dprintf("%x:%x does not have %u to %u available\n", + dl->major, dl->minor, array_start, array_end); /* No room */ continue; - } else - break; + } + return dl; } return dl; @@ -3544,12 +4674,17 @@ static struct mdinfo *imsm_activate_spare(struct active_array *a, continue; /* - * OK, this device needs recovery. Try to re-add the previous - * occupant of this slot, if this fails add a new spare + * OK, this device needs recovery. 
Try to re-add the + * previous occupant of this slot, if this fails see if + * we can continue the assimilation of a spare that was + * partially assimilated, finally try to activate a new + * spare. */ dl = imsm_readd(super, i, a); if (!dl) - dl = imsm_add_spare(super, i, a); + dl = imsm_add_spare(super, i, a, 0); + if (!dl) + dl = imsm_add_spare(super, i, a, 1); if (!dl) continue; @@ -3576,9 +4711,11 @@ static struct mdinfo *imsm_activate_spare(struct active_array *a, di->disk.major = dl->major; di->disk.minor = dl->minor; di->disk.state = 0; + di->recovery_start = 0; di->data_offset = __le32_to_cpu(map->pba_of_lba0); di->component_size = a->info.component_size; di->container_member = inst; + super->random = random32(); di->next = rv; rv = di; num_spares++; @@ -3724,7 +4861,7 @@ static void imsm_process_update(struct supertype *st, if (i == u->slot) continue; disk = get_imsm_disk(super, get_imsm_disk_idx(dev, i)); - if (!disk || disk->status & FAILED_DISK) + if (!disk || is_failed(disk)) failed++; } @@ -3745,13 +4882,23 @@ static void imsm_process_update(struct supertype *st, set_imsm_ord_tbl_ent(map, u->slot, dl->index); set_imsm_ord_tbl_ent(migr_map, u->slot, dl->index | IMSM_ORD_REBUILD); + /* update the family_num to mark a new container + * generation, being careful to record the existing + * family_num in orig_family_num to clean up after + * earlier mdadm versions that neglected to set it. 
+ */ + if (mpb->orig_family_num == 0) + mpb->orig_family_num = mpb->family_num; + mpb->family_num += super->random; + /* count arrays using the victim in the metadata */ found = 0; for (a = st->arrays; a ; a = a->next) { dev = get_imsm_dev(super, a->info.container_member); - for (i = 0; i < map->num_members; i++) - if (victim == get_imsm_disk_idx(dev, i)) - found++; + map = get_imsm_map(dev, 0); + + if (get_imsm_disk_slot(map, victim) >= 0) + found++; } /* delete the victim if it is no longer being @@ -3787,6 +4934,7 @@ static void imsm_process_update(struct supertype *st, * (FIX ME) notice that its update did not take hold. */ struct imsm_update_create_array *u = (void *) update->buf; + struct intel_dev *dv; struct imsm_dev *dev; struct imsm_map *map, *new_map; unsigned long long start, end; @@ -3799,14 +4947,14 @@ static void imsm_process_update(struct supertype *st, if (u->dev_idx < mpb->num_raid_devs) { dprintf("%s: subarray %d already defined\n", __func__, u->dev_idx); - return; + goto create_error; } /* check update is next in sequence */ if (u->dev_idx != mpb->num_raid_devs) { dprintf("%s: can not create array %d expected index %d\n", __func__, u->dev_idx, mpb->num_raid_devs); - return; + goto create_error; } new_map = get_imsm_map(&u->dev, 0); @@ -3831,14 +4979,14 @@ static void imsm_process_update(struct supertype *st, if (disks_overlap(super, i, u)) { dprintf("%s: arrays overlap\n", __func__); - return; + goto create_error; } } /* check that prepare update was successful */ if (!update->space) { dprintf("%s: prepare update failed\n", __func__); - return; + goto create_error; } /* check that all disks are still active before committing @@ -3850,7 +4998,7 @@ static void imsm_process_update(struct supertype *st, dl = serial_to_dl(inf[i].serial, super); if (!dl) { dprintf("%s: disk disappeared\n", __func__); - return; + goto create_error; } } @@ -3868,14 +5016,26 @@ static void imsm_process_update(struct supertype *st, set_imsm_ord_tbl_ent(new_map, i, 
dl->index); } - dev = update->space; + dv = update->space; + dev = dv->dev; update->space = NULL; imsm_copy_dev(dev, &u->dev); - super->dev_tbl[u->dev_idx] = dev; + dv->index = u->dev_idx; + dv->next = super->devlist; + super->devlist = dv; mpb->num_raid_devs++; imsm_update_version_info(super); break; + create_error: + /* mdmon knows how to release update->space, but not + * ((struct intel_dev *) update->space)->dev + */ + if (update->space) { + dv = update->space; + free(dv->dev); + } + break; } case update_add_disk: @@ -3923,6 +5083,7 @@ static void imsm_prepare_update(struct supertype *st, switch (type) { case update_create_array: { struct imsm_update_create_array *u = (void *) update->buf; + struct intel_dev *dv; struct imsm_dev *dev = &u->dev; struct imsm_map *map = get_imsm_map(dev, 0); struct dl *dl; @@ -3932,8 +5093,17 @@ static void imsm_prepare_update(struct supertype *st, inf = get_disk_info(u); len = sizeof_imsm_dev(dev, 1); - /* allocate a new super->dev_tbl entry */ - update->space = malloc(len); + /* allocate a new super->devlist entry */ + dv = malloc(sizeof(*dv)); + if (dv) { + dv->dev = malloc(len); + if (dv->dev) + update->space = dv; + else { + free(dv); + update->space = NULL; + } + } /* count how many spares will be converted to members */ for (i = 0; i < map->num_members; i++) { @@ -3970,7 +5140,9 @@ static void imsm_prepare_update(struct supertype *st, free(super->next_buf); super->next_len = buf_len; - if (posix_memalign(&super->next_buf, buf_len, 512) != 0) + if (posix_memalign(&super->next_buf, 512, buf_len) == 0) + memset(super->next_buf, 0, buf_len); + else super->next_buf = NULL; } } @@ -4032,6 +5204,8 @@ struct superswitch super_imsm = { #ifndef MDASSEMBLE .examine_super = examine_super_imsm, .brief_examine_super = brief_examine_super_imsm, + .brief_examine_subarrays = brief_examine_subarrays_imsm, + .export_examine_super = export_examine_super_imsm, .detail_super = detail_super_imsm, .brief_detail_super = brief_detail_super_imsm, 
.write_init_super = write_init_super_imsm, @@ -4050,10 +5224,11 @@ struct superswitch super_imsm = { .load_super = load_super_imsm, .init_super = init_super_imsm, - .store_super = store_zero_imsm, + .store_super = store_super_imsm, .free_super = free_super_imsm, .match_metadata_desc = match_metadata_desc_imsm, .container_content = container_content_imsm, + .default_layout = imsm_level_to_layout, .external = 1, .name = "imsm",