#define NUM_BLOCKS_DIRTY_STRIPE_REGION 2056
#define SECT_PER_MB_SHIFT 11
#define MAX_SECTOR_SIZE 4096
+#define MULTIPLE_PPL_AREA_SIZE_IMSM (1024 * 1024) /* Size of the whole
+ * multiple PPL area
+ */
/* Disk configuration info. */
#define IMSM_MAX_DEVICES 255
#define RWH_OFF 0
#define RWH_DISTRIBUTED 1
#define RWH_JOURNALING_DRIVE 2
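+/* The "multiple" policy variants below mark volumes that keep a chain of
+ * PPLs inside the 1MB MULTIPLE_PPL_AREA_SIZE_IMSM area instead of a single
+ * PPL_ENTRY_SPACE-sized PPL.
+ */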
+#define RWH_MULTIPLE_DISTRIBUTED 3
+#define RWH_MULTIPLE_PPLS_JOURNALING_DRIVE 4
+#define RWH_MULTIPLE_OFF 5
__u8 rwh_policy; /* Raid Write Hole Policy */
__u8 jd_serial[MAX_RAID_SERIAL_LEN]; /* Journal Drive serial number */
__u8 filler1;
* already been migrated and must
* be recovered from checkpoint area */
-#define PPL_ENTRY_SPACE (128 * 1024) /* Size of the PPL, without the header */
+#define PPL_ENTRY_SPACE (128 * 1024) /* Size of a single PPL, without the header */
struct migr_record {
__u32 rec_status; /* Status used to determine how to restart
return size;
}
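+/* Return true when a volume at @raid_level can still run a resync with
+ * @missing_disks members absent; only RAID10 is allowed one missing disk here.
+ */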
+static int able_to_resync(int raid_level, int missing_disks)
+{
+ int max_missing_disks = 0;
+
+ switch (raid_level) {
+ case 10:
+ max_missing_disks = 1;
+ break;
+ default:
+ max_missing_disks = 0;
+ }
+ return missing_disks <= max_missing_disks;
+}
+
/* try to determine how much space is reserved for metadata from
* the last get_extents() entry on the smallest active disk,
* otherwise fall back to the default
printf(" Dirty State : %s\n", (dev->vol.dirty & RAIDVOL_DIRTY) ?
"dirty" : "clean");
printf(" RWH Policy : ");
- if (dev->rwh_policy == RWH_OFF)
+ if (dev->rwh_policy == RWH_OFF || dev->rwh_policy == RWH_MULTIPLE_OFF)
printf("off\n");
else if (dev->rwh_policy == RWH_DISTRIBUTED)
printf("PPL distributed\n");
else if (dev->rwh_policy == RWH_JOURNALING_DRIVE)
printf("PPL journaling drive\n");
+ else if (dev->rwh_policy == RWH_MULTIPLE_DISTRIBUTED)
+ printf("Multiple distributed PPLs\n");
+ else if (dev->rwh_policy == RWH_MULTIPLE_PPLS_JOURNALING_DRIVE)
+ printf("Multiple PPLs on journaling drive\n");
else
printf("<unknown:%d>\n", dev->rwh_policy);
}
printf(" Platform : Intel(R) ");
if (orom->capabilities == 0 && orom->driver_features == 0)
printf("Matrix Storage Manager\n");
+ else if (imsm_orom_is_enterprise(orom) && orom->major_ver >= 6)
+ printf("Virtual RAID on CPU\n");
else
printf("Rapid Storage Technology%s\n",
imsm_orom_is_enterprise(orom) ? " enterprise" : "");
memset(info->uuid, 0, sizeof(info->uuid));
info->recovery_start = MaxSector;
- if (info->array.level == 5 && dev->rwh_policy == RWH_DISTRIBUTED) {
+ if (info->array.level == 5 &&
+ (dev->rwh_policy == RWH_DISTRIBUTED ||
+ dev->rwh_policy == RWH_MULTIPLE_DISTRIBUTED)) {
info->consistency_policy = CONSISTENCY_POLICY_PPL;
info->ppl_sector = get_ppl_sector(super, super->current_vol);
- info->ppl_size = (PPL_HEADER_SIZE + PPL_ENTRY_SPACE) >> 9;
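+ /* ppl_size is kept in 512-byte sectors, hence the >> 9 */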
+ if (dev->rwh_policy == RWH_MULTIPLE_DISTRIBUTED)
+ info->ppl_size = MULTIPLE_PPL_AREA_SIZE_IMSM >> 9;
+ else
+ info->ppl_size = (PPL_HEADER_SIZE + PPL_ENTRY_SPACE)
+ >> 9;
} else if (info->array.level <= 0) {
info->consistency_policy = CONSISTENCY_POLICY_NONE;
} else {
__u32 ord = get_imsm_ord_tbl_ent(dev, j, MAP_0);
__u32 idx = ord_to_idx(ord);
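+ /* if this slot references the disk the metadata was loaded from,
+ * record the slot as its raid_disk
+ */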
+ if (super->disks && super->disks->index == (int)idx)
+ info->disk.raid_disk = j;
+
if (!(ord & IMSM_ORD_REBUILD) &&
get_imsm_missing(super, idx)) {
missing = 1;
hba = hba->next;
}
fprintf(stderr, ").\n"
- " Mixing devices attached to different %s is not allowed.\n",
- hba_name->type == SYS_DEV_VMD ? "VMD domains" : "controllers");
+ " Mixing devices attached to different controllers is not allowed.\n");
}
return 2;
}
dev->my_vol_raid_dev_num = mpb->num_raid_devs_created;
if (s->consistency_policy <= CONSISTENCY_POLICY_RESYNC) {
- dev->rwh_policy = RWH_OFF;
+ dev->rwh_policy = RWH_MULTIPLE_OFF;
} else if (s->consistency_policy == CONSISTENCY_POLICY_PPL) {
- dev->rwh_policy = RWH_DISTRIBUTED;
+ dev->rwh_policy = RWH_MULTIPLE_DISTRIBUTED;
} else {
free(dev);
free(dv);
} else if (super->hba->type == SYS_DEV_VMD && super->orom &&
!imsm_orom_has_tpv_support(super->orom)) {
pr_err("\tPlatform configuration does not support non-Intel NVMe drives.\n"
- "\tPlease refer to Intel(R) RSTe user guide.\n");
+ "\tPlease refer to Intel(R) RSTe/VROC user guide.\n");
free(dd->devname);
free(dd);
return 1;
struct ppl_header *ppl_hdr;
int ret;
- ret = posix_memalign(&buf, 4096, PPL_HEADER_SIZE);
+ /* first clear the entire PPL area so no stale headers are left behind */
+ ret = zero_disk_range(fd, info->ppl_sector, info->ppl_size);
+ if (ret)
+ return ret;
+
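+ /* align the header buffer for drives with 4K sectors */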
+ ret = posix_memalign(&buf, MAX_SECTOR_SIZE, PPL_HEADER_SIZE);
if (ret) {
pr_err("Failed to allocate PPL header buffer\n");
return ret;
ppl_hdr = buf;
memset(ppl_hdr->reserved, 0xff, PPL_HDR_RESERVED);
ppl_hdr->signature = __cpu_to_le32(super->anchor->orig_family_num);
+
+ if (info->mismatch_cnt) {
+ /*
+ * We are overwriting an invalid ppl. Make one entry with wrong
+ * checksum to prevent the kernel from skipping resync.
+ */
+ ppl_hdr->entries_count = __cpu_to_le32(1);
+ ppl_hdr->entries[0].checksum = ~0;
+ }
+
ppl_hdr->checksum = __cpu_to_le32(~crc32c_le(~0, buf, PPL_HEADER_SIZE));
if (lseek64(fd, info->ppl_sector * 512, SEEK_SET) < 0) {
struct ppl_header *ppl_hdr;
__u32 crc;
struct imsm_dev *dev;
- struct imsm_map *map;
__u32 idx;
+ unsigned int i;
+ unsigned long long ppl_offset = 0;
+ unsigned long long prev_gen_num = 0;
if (disk->disk.raid_disk < 0)
return 0;
- if (posix_memalign(&buf, 4096, PPL_HEADER_SIZE)) {
+ if (posix_memalign(&buf, MAX_SECTOR_SIZE, PPL_HEADER_SIZE)) {
pr_err("Failed to allocate PPL header buffer\n");
return -1;
}
dev = get_imsm_dev(super, info->container_member);
- map = get_imsm_map(dev, MAP_X);
- idx = get_imsm_disk_idx(dev, disk->disk.raid_disk, MAP_X);
+ idx = get_imsm_disk_idx(dev, disk->disk.raid_disk, MAP_0);
d = get_imsm_dl_disk(super, idx);
if (!d || d->index < 0 || is_failed(&d->disk))
goto out;
- if (lseek64(d->fd, info->ppl_sector * 512, SEEK_SET) < 0) {
- perror("Failed to seek to PPL header location");
- ret = -1;
- goto out;
- }
+ ret = 1;
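+ /* Walk the chain of PPLs inside the area: each header is followed by
+ * the partial parity data its entries describe. The walk stops at the
+ * first corrupted or out-of-order header; the newest valid PPL wins.
+ */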
+ while (ppl_offset < MULTIPLE_PPL_AREA_SIZE_IMSM) {
+ dprintf("Checking potential PPL at offset: %llu\n", ppl_offset);
- if (read(d->fd, buf, PPL_HEADER_SIZE) != PPL_HEADER_SIZE) {
- perror("Read PPL header failed");
- ret = -1;
- goto out;
- }
+ if (lseek64(d->fd, info->ppl_sector * 512 + ppl_offset,
+ SEEK_SET) < 0) {
+ perror("Failed to seek to PPL header location");
+ ret = -1;
+ goto out;
+ }
- ppl_hdr = buf;
+ if (read(d->fd, buf, PPL_HEADER_SIZE) != PPL_HEADER_SIZE) {
+ perror("Read PPL header failed");
+ ret = -1;
+ goto out;
+ }
- crc = __le32_to_cpu(ppl_hdr->checksum);
- ppl_hdr->checksum = 0;
+ ppl_hdr = buf;
- if (crc != ~crc32c_le(~0, buf, PPL_HEADER_SIZE)) {
- dprintf("Wrong PPL header checksum on %s\n",
- d->devname);
- ret = 1;
- }
+ crc = __le32_to_cpu(ppl_hdr->checksum);
+ ppl_hdr->checksum = 0;
+
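+ /* a corrupted header ends the walk; ret stays 0 if a valid
+ * PPL was already found earlier in the chain
+ */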
+ if (crc != ~crc32c_le(~0, buf, PPL_HEADER_SIZE)) {
+ dprintf("Wrong PPL header checksum on %s\n",
+ d->devname);
+ goto out;
+ }
+
+ if (prev_gen_num > __le64_to_cpu(ppl_hdr->generation)) {
+ /* previous was newest, it was already checked */
+ goto out;
+ }
+
+ if ((__le32_to_cpu(ppl_hdr->signature) !=
+ super->anchor->orig_family_num)) {
+ dprintf("Wrong PPL header signature on %s\n",
+ d->devname);
+ ret = 1;
+ goto out;
+ }
- if (!ret && (__le32_to_cpu(ppl_hdr->signature) !=
- super->anchor->orig_family_num)) {
- dprintf("Wrong PPL header signature on %s\n",
- d->devname);
- ret = 1;
+ ret = 0;
+ prev_gen_num = __le64_to_cpu(ppl_hdr->generation);
+
+ ppl_offset += PPL_HEADER_SIZE;
+ for (i = 0; i < __le32_to_cpu(ppl_hdr->entries_count); i++)
+ ppl_offset +=
+ __le32_to_cpu(ppl_hdr->entries[i].pp_size);
}
out:
free(buf);
- if (ret == 1 && map->map_state == IMSM_T_STATE_UNINITIALIZED)
- return st->ss->write_init_ppl(st, info, d->fd);
+ /*
+ * Update metadata to use the multiple PPLs area (1MB).
+ * This is done once for all RAID members
+ */
+ if (info->consistency_policy == CONSISTENCY_POLICY_PPL &&
+ info->ppl_size != (MULTIPLE_PPL_AREA_SIZE_IMSM >> 9)) {
+ char subarray[20];
+ struct mdinfo *member_dev;
+
+ sprintf(subarray, "%d", info->container_member);
+
+ if (mdmon_running(st->container_devnm))
+ st->update_tail = &st->updates;
+
+ if (st->ss->update_subarray(st, subarray, "ppl", NULL)) {
+ pr_err("Failed to update subarray %s\n",
+ subarray);
+ } else {
+ if (st->update_tail)
+ flush_metadata_updates(st);
+ else
+ st->ss->sync_metadata(st);
+ info->ppl_size = (MULTIPLE_PPL_AREA_SIZE_IMSM >> 9);
+ for (member_dev = info->devs; member_dev;
+ member_dev = member_dev->next)
+ member_dev->ppl_size =
+ (MULTIPLE_PPL_AREA_SIZE_IMSM >> 9);
+ }
+ }
+
+ if (ret == 1) {
+ struct imsm_map *map = get_imsm_map(dev, MAP_X);
+
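+ /* Overwrite the invalid PPL only when it cannot hold recovery
+ * data: the volume was never initialized, it is clean, or this
+ * disk is a rebuild target that has not started yet; otherwise
+ * record a mismatch so the resync is not skipped
+ */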
+ if (map->map_state == IMSM_T_STATE_UNINITIALIZED ||
+ (map->map_state == IMSM_T_STATE_NORMAL &&
+ !(dev->vol.dirty & RAIDVOL_DIRTY)) ||
+ (is_rebuilding(dev) &&
+ dev->vol.curr_migr_unit == 0 &&
+ get_imsm_disk_idx(dev, disk->disk.raid_disk, MAP_1) != idx))
+ ret = st->ss->write_init_ppl(st, info, d->fd);
+ else
+ info->mismatch_cnt++;
+ }
return ret;
}
return 2;
if (strcmp(update, "ppl") == 0)
- new_policy = RWH_DISTRIBUTED;
+ new_policy = RWH_MULTIPLE_DISTRIBUTED;
else
- new_policy = RWH_OFF;
+ new_policy = RWH_MULTIPLE_OFF;
if (st->update_tail) {
struct imsm_update_rwh_policy *u = xmalloc(sizeof(*u));
int slot;
int chunk;
char *ep;
+ int level;
if (subarray &&
(i != strtoul(subarray, &ep, 10) || *ep != '\0'))
dev = get_imsm_dev(super, i);
map = get_imsm_map(dev, MAP_0);
map2 = get_imsm_map(dev, MAP_1);
+ level = get_imsm_raid_level(map);
/* do not publish arrays that are in the middle of an
* unsupported migration
chunk = __le16_to_cpu(map->blocks_per_strip) >> 1;
/* mdadm does not support all metadata features- set the bit in all arrays state */
if (!validate_geometry_imsm_orom(super,
- get_imsm_raid_level(map), /* RAID level */
- imsm_level_to_layout(get_imsm_raid_level(map)),
+ level, /* RAID level */
+ imsm_level_to_layout(level),
map->num_members, /* raid disks */
&chunk, join_u32(dev->size_low, dev->size_high),
1 /* verbose */)) {
int idx;
int skip;
__u32 ord;
+ int missing = 0;
skip = 0;
idx = get_imsm_disk_idx(dev, slot, MAP_0);
skip = 1;
if (d && is_failed(&d->disk))
skip = 1;
- if (ord & IMSM_ORD_REBUILD)
+ if (!skip && (ord & IMSM_ORD_REBUILD))
recovery_start = 0;
/*
* if we skip some disks the array will be assembled degraded;
* reset resync start to avoid a dirty-degraded
* situation when performing the initial sync
- *
- * FIXME handle dirty degraded
*/
- if ((skip || recovery_start == 0) &&
- !(dev->vol.dirty & RAIDVOL_DIRTY))
- this->resync_start = MaxSector;
+ if (skip)
+ missing++;
+
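+ /* on a clean array, cancel the initial resync when it cannot
+ * run on the missing disks or a member is still recovering
+ */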
+ if (!(dev->vol.dirty & RAIDVOL_DIRTY)) {
+ if ((!able_to_resync(level, missing) ||
+ recovery_start == 0))
+ this->resync_start = MaxSector;
+ } else {
+ /*
+ * FIXME handle dirty degraded
+ */
+ }
+
if (skip)
continue;
map->blocks_per_strip;
info_d->ppl_sector = this->ppl_sector;
info_d->ppl_size = this->ppl_size;
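+ /* with PPL, a member under rebuild means the volume must
+ * resync from the start
+ */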
+ if (this->consistency_policy == CONSISTENCY_POLICY_PPL &&
+ recovery_start == 0)
+ this->resync_start = 0;
} else {
info_d->component_size = blocks_per_member(map);
}
dev->vol.dirty = RAIDVOL_CLEAN;
} else {
dev->vol.dirty = RAIDVOL_DIRTY;
- if (dev->rwh_policy == RWH_DISTRIBUTED)
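+ /* a dirty volume with distributed PPL carries a valid dirty
+ * stripe record
+ */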
+ if (dev->rwh_policy == RWH_DISTRIBUTED ||
+ dev->rwh_policy == RWH_MULTIPLE_DISTRIBUTED)
dev->vol.dirty |= RAIDVOL_DSRECORD_VALID;
}
super->updates_pending++;
di->bb.supported = 1;
if (a->info.consistency_policy == CONSISTENCY_POLICY_PPL) {
di->ppl_sector = get_ppl_sector(super, inst);
- di->ppl_size = (PPL_HEADER_SIZE + PPL_ENTRY_SPACE) >> 9;
+ di->ppl_size = MULTIPLE_PPL_AREA_SIZE_IMSM >> 9;
}
super->random = random32();
di->next = rv;