#define NUM_BLOCKS_DIRTY_STRIPE_REGION 2056
#define SECT_PER_MB_SHIFT 11
#define MAX_SECTOR_SIZE 4096
+#define MULTIPLE_PPL_AREA_SIZE_IMSM (1024 * 1024) /* Size of the whole
+ * multiple PPL area
+ */
/* Disk configuration info. */
#define IMSM_MAX_DEVICES 255
#define RWH_OFF 0
#define RWH_DISTRIBUTED 1
#define RWH_JOURNALING_DRIVE 2
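+/* The "multiple" policy variants below mark volumes that keep a chain of
+ * PPLs inside the 1MB MULTIPLE_PPL_AREA_SIZE_IMSM area instead of a single
+ * PPL_ENTRY_SPACE-sized PPL.
+ */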
+#define RWH_MULTIPLE_DISTRIBUTED 3
+#define RWH_MULTIPLE_PPLS_JOURNALING_DRIVE 4
+#define RWH_MULTIPLE_OFF 5
__u8 rwh_policy; /* Raid Write Hole Policy */
__u8 jd_serial[MAX_RAID_SERIAL_LEN]; /* Journal Drive serial number */
__u8 filler1;
* already been migrated and must
* be recovered from checkpoint area */
-#define PPL_ENTRY_SPACE (128 * 1024) /* Size of the PPL, without the header */
+#define PPL_ENTRY_SPACE (128 * 1024) /* Size of a single PPL, without the header */
struct migr_record {
__u32 rec_status; /* Status used to determine how to restart
return size;
}
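+/* Return true when a volume at @raid_level can still run a resync with
+ * @missing_disks members absent; only RAID10 is allowed one missing disk here.
+ */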
+static int able_to_resync(int raid_level, int missing_disks)
+{
+ int max_missing_disks = 0;
+
+ switch (raid_level) {
+ case 10:
+ max_missing_disks = 1;
+ break;
+ default:
+ max_missing_disks = 0;
+ }
+ return missing_disks <= max_missing_disks;
+}
+
/* try to determine how much space is reserved for metadata from
* the last get_extents() entry on the smallest active disk,
* otherwise fall back to the default
printf(" Dirty State : %s\n", (dev->vol.dirty & RAIDVOL_DIRTY) ?
"dirty" : "clean");
printf(" RWH Policy : ");
- if (dev->rwh_policy == RWH_OFF)
+ if (dev->rwh_policy == RWH_OFF || dev->rwh_policy == RWH_MULTIPLE_OFF)
printf("off\n");
else if (dev->rwh_policy == RWH_DISTRIBUTED)
printf("PPL distributed\n");
else if (dev->rwh_policy == RWH_JOURNALING_DRIVE)
printf("PPL journaling drive\n");
+ else if (dev->rwh_policy == RWH_MULTIPLE_DISTRIBUTED)
+ printf("Multiple distributed PPLs\n");
+ else if (dev->rwh_policy == RWH_MULTIPLE_PPLS_JOURNALING_DRIVE)
+ printf("Multiple PPLs on journaling drive\n");
else
printf("<unknown:%d>\n", dev->rwh_policy);
}
printf(" Platform : Intel(R) ");
if (orom->capabilities == 0 && orom->driver_features == 0)
printf("Matrix Storage Manager\n");
+ else if (imsm_orom_is_enterprise(orom) && orom->major_ver >= 6)
+ printf("Virtual RAID on CPU\n");
else
printf("Rapid Storage Technology%s\n",
imsm_orom_is_enterprise(orom) ? " enterprise" : "");
memset(info->uuid, 0, sizeof(info->uuid));
info->recovery_start = MaxSector;
- if (info->array.level == 5 && dev->rwh_policy == RWH_DISTRIBUTED) {
+ if (info->array.level == 5 &&
+ (dev->rwh_policy == RWH_DISTRIBUTED ||
+ dev->rwh_policy == RWH_MULTIPLE_DISTRIBUTED)) {
info->consistency_policy = CONSISTENCY_POLICY_PPL;
info->ppl_sector = get_ppl_sector(super, super->current_vol);
- info->ppl_size = (PPL_HEADER_SIZE + PPL_ENTRY_SPACE) >> 9;
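+ /* ppl_size is kept in 512-byte sectors, hence the >> 9 */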
+ if (dev->rwh_policy == RWH_MULTIPLE_DISTRIBUTED)
+ info->ppl_size = MULTIPLE_PPL_AREA_SIZE_IMSM >> 9;
+ else
+ info->ppl_size = (PPL_HEADER_SIZE + PPL_ENTRY_SPACE)
+ >> 9;
} else if (info->array.level <= 0) {
info->consistency_policy = CONSISTENCY_POLICY_NONE;
} else {
__u32 ord = get_imsm_ord_tbl_ent(dev, j, MAP_0);
__u32 idx = ord_to_idx(ord);
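+ /* if this slot references the disk the metadata was loaded from,
+ * record the slot as its raid_disk
+ */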
+ if (super->disks && super->disks->index == (int)idx)
+ info->disk.raid_disk = j;
+
if (!(ord & IMSM_ORD_REBUILD) &&
get_imsm_missing(super, idx)) {
missing = 1;
hba = hba->next;
}
fprintf(stderr, ").\n"
- " Mixing devices attached to different %s is not allowed.\n",
- hba_name->type == SYS_DEV_VMD ? "VMD domains" : "controllers");
+ " Mixing devices attached to different controllers is not allowed.\n");
}
return 2;
}
dev->my_vol_raid_dev_num = mpb->num_raid_devs_created;
if (s->consistency_policy <= CONSISTENCY_POLICY_RESYNC) {
- dev->rwh_policy = RWH_OFF;
+ dev->rwh_policy = RWH_MULTIPLE_OFF;
} else if (s->consistency_policy == CONSISTENCY_POLICY_PPL) {
- dev->rwh_policy = RWH_DISTRIBUTED;
+ dev->rwh_policy = RWH_MULTIPLE_DISTRIBUTED;
} else {
free(dev);
free(dv);
} else if (super->hba->type == SYS_DEV_VMD && super->orom &&
!imsm_orom_has_tpv_support(super->orom)) {
pr_err("\tPlatform configuration does not support non-Intel NVMe drives.\n"
- "\tPlease refer to Intel(R) RSTe user guide.\n");
+ "\tPlease refer to Intel(R) RSTe/VROC user guide.\n");
free(dd->devname);
free(dd);
return 1;
struct ppl_header *ppl_hdr;
int ret;
- ret = posix_memalign(&buf, 4096, PPL_HEADER_SIZE);
+ /* first clear the entire PPL area so no stale headers are left behind */
+ ret = zero_disk_range(fd, info->ppl_sector, info->ppl_size);
+ if (ret)
+ return ret;
+
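+ /* align the header buffer for drives with 4K sectors */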
+ ret = posix_memalign(&buf, MAX_SECTOR_SIZE, PPL_HEADER_SIZE);
if (ret) {
pr_err("Failed to allocate PPL header buffer\n");
return ret;
ppl_hdr = buf;
memset(ppl_hdr->reserved, 0xff, PPL_HDR_RESERVED);
ppl_hdr->signature = __cpu_to_le32(super->anchor->orig_family_num);
+
+ if (info->mismatch_cnt) {
+ /*
+ * We are overwriting an invalid ppl. Make one entry with wrong
+ * checksum to prevent the kernel from skipping resync.
+ */
+ ppl_hdr->entries_count = __cpu_to_le32(1);
+ ppl_hdr->entries[0].checksum = ~0;
+ }
+
ppl_hdr->checksum = __cpu_to_le32(~crc32c_le(~0, buf, PPL_HEADER_SIZE));
if (lseek64(fd, info->ppl_sector * 512, SEEK_SET) < 0) {
struct ppl_header *ppl_hdr;
__u32 crc;
struct imsm_dev *dev;
- struct imsm_map *map;
__u32 idx;
+ unsigned int i;
+ unsigned long long ppl_offset = 0;
+ unsigned long long prev_gen_num = 0;
if (disk->disk.raid_disk < 0)
return 0;
- if (posix_memalign(&buf, 4096, PPL_HEADER_SIZE)) {
+ if (posix_memalign(&buf, MAX_SECTOR_SIZE, PPL_HEADER_SIZE)) {
pr_err("Failed to allocate PPL header buffer\n");
return -1;
}
dev = get_imsm_dev(super, info->container_member);
- map = get_imsm_map(dev, MAP_X);
- idx = get_imsm_disk_idx(dev, disk->disk.raid_disk, MAP_X);
+ idx = get_imsm_disk_idx(dev, disk->disk.raid_disk, MAP_0);
d = get_imsm_dl_disk(super, idx);
if (!d || d->index < 0 || is_failed(&d->disk))
goto out;
- if (lseek64(d->fd, info->ppl_sector * 512, SEEK_SET) < 0) {
- perror("Failed to seek to PPL header location");
- ret = -1;
- goto out;
- }
+ ret = 1;
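+ /* Walk the chain of PPLs inside the area: each header is followed by
+ * the partial parity data its entries describe. The walk stops at the
+ * first corrupted or out-of-order header; the newest valid PPL wins.
+ */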
+ while (ppl_offset < MULTIPLE_PPL_AREA_SIZE_IMSM) {
+ dprintf("Checking potential PPL at offset: %llu\n", ppl_offset);
- if (read(d->fd, buf, PPL_HEADER_SIZE) != PPL_HEADER_SIZE) {
- perror("Read PPL header failed");
- ret = -1;
- goto out;
- }
+ if (lseek64(d->fd, info->ppl_sector * 512 + ppl_offset,
+ SEEK_SET) < 0) {
+ perror("Failed to seek to PPL header location");
+ ret = -1;
+ goto out;
+ }
- ppl_hdr = buf;
+ if (read(d->fd, buf, PPL_HEADER_SIZE) != PPL_HEADER_SIZE) {
+ perror("Read PPL header failed");
+ ret = -1;
+ goto out;
+ }
- crc = __le32_to_cpu(ppl_hdr->checksum);
- ppl_hdr->checksum = 0;
+ ppl_hdr = buf;
- if (crc != ~crc32c_le(~0, buf, PPL_HEADER_SIZE)) {
- dprintf("Wrong PPL header checksum on %s\n",
- d->devname);
- ret = 1;
- }
+ crc = __le32_to_cpu(ppl_hdr->checksum);
+ ppl_hdr->checksum = 0;
+
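+ /* a corrupted header ends the walk; ret stays 0 if a valid
+ * PPL was already found earlier in the chain
+ */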
+ if (crc != ~crc32c_le(~0, buf, PPL_HEADER_SIZE)) {
+ dprintf("Wrong PPL header checksum on %s\n",
+ d->devname);
+ goto out;
+ }
+
+ if (prev_gen_num > __le64_to_cpu(ppl_hdr->generation)) {
+ /* previous was newest, it was already checked */
+ goto out;
+ }
+
+ if ((__le32_to_cpu(ppl_hdr->signature) !=
+ super->anchor->orig_family_num)) {
+ dprintf("Wrong PPL header signature on %s\n",
+ d->devname);
+ ret = 1;
+ goto out;
+ }
- if (!ret && (__le32_to_cpu(ppl_hdr->signature) !=
- super->anchor->orig_family_num)) {
- dprintf("Wrong PPL header signature on %s\n",
- d->devname);
- ret = 1;
+ ret = 0;
+ prev_gen_num = __le64_to_cpu(ppl_hdr->generation);
+
+ ppl_offset += PPL_HEADER_SIZE;
+ for (i = 0; i < __le32_to_cpu(ppl_hdr->entries_count); i++)
+ ppl_offset +=
+ __le32_to_cpu(ppl_hdr->entries[i].pp_size);
}
out:
free(buf);
- if (ret == 1 && map->map_state == IMSM_T_STATE_UNINITIALIZED)
- return st->ss->write_init_ppl(st, info, d->fd);
+ /*
+ * Update metadata to use the multiple PPLs area (1MB).
+ * This is done once for all RAID members
+ */
+ if (info->consistency_policy == CONSISTENCY_POLICY_PPL &&
+ info->ppl_size != (MULTIPLE_PPL_AREA_SIZE_IMSM >> 9)) {
+ char subarray[20];
+ struct mdinfo *member_dev;
+
+ sprintf(subarray, "%d", info->container_member);
+
+ if (mdmon_running(st->container_devnm))
+ st->update_tail = &st->updates;
+
+ if (st->ss->update_subarray(st, subarray, "ppl", NULL)) {
+ pr_err("Failed to update subarray %s\n",
+ subarray);
+ } else {
+ if (st->update_tail)
+ flush_metadata_updates(st);
+ else
+ st->ss->sync_metadata(st);
+ info->ppl_size = (MULTIPLE_PPL_AREA_SIZE_IMSM >> 9);
+ for (member_dev = info->devs; member_dev;
+ member_dev = member_dev->next)
+ member_dev->ppl_size =
+ (MULTIPLE_PPL_AREA_SIZE_IMSM >> 9);
+ }
+ }
+
+ if (ret == 1) {
+ struct imsm_map *map = get_imsm_map(dev, MAP_X);
+
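+ /* Overwrite the invalid PPL only when it cannot hold recovery
+ * data: the volume was never initialized, it is clean, or this
+ * disk is a rebuild target that has not started yet; otherwise
+ * record a mismatch so the resync is not skipped
+ */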
+ if (map->map_state == IMSM_T_STATE_UNINITIALIZED ||
+ (map->map_state == IMSM_T_STATE_NORMAL &&
+ !(dev->vol.dirty & RAIDVOL_DIRTY)) ||
+ (is_rebuilding(dev) &&
+ dev->vol.curr_migr_unit == 0 &&
+ get_imsm_disk_idx(dev, disk->disk.raid_disk, MAP_1) != idx))
+ ret = st->ss->write_init_ppl(st, info, d->fd);
+ else
+ info->mismatch_cnt++;
+ }
return ret;
}
return 2;
if (strcmp(update, "ppl") == 0)
- new_policy = RWH_DISTRIBUTED;
+ new_policy = RWH_MULTIPLE_DISTRIBUTED;
else
- new_policy = RWH_OFF;
+ new_policy = RWH_MULTIPLE_OFF;
if (st->update_tail) {
struct imsm_update_rwh_policy *u = xmalloc(sizeof(*u));
int slot;
int chunk;
char *ep;
+ int level;
if (subarray &&
(i != strtoul(subarray, &ep, 10) || *ep != '\0'))
dev = get_imsm_dev(super, i);
map = get_imsm_map(dev, MAP_0);
map2 = get_imsm_map(dev, MAP_1);
+ level = get_imsm_raid_level(map);
/* do not publish arrays that are in the middle of an
* unsupported migration
chunk = __le16_to_cpu(map->blocks_per_strip) >> 1;
/* mdadm does not support all metadata features- set the bit in all arrays state */
if (!validate_geometry_imsm_orom(super,
- get_imsm_raid_level(map), /* RAID level */
- imsm_level_to_layout(get_imsm_raid_level(map)),
+ level, /* RAID level */
+ imsm_level_to_layout(level),
map->num_members, /* raid disks */
&chunk, join_u32(dev->size_low, dev->size_high),
1 /* verbose */)) {
int idx;
int skip;
__u32 ord;
+ int missing = 0;
skip = 0;
idx = get_imsm_disk_idx(dev, slot, MAP_0);
skip = 1;
if (d && is_failed(&d->disk))
skip = 1;
- if (ord & IMSM_ORD_REBUILD)
+ if (!skip && (ord & IMSM_ORD_REBUILD))
recovery_start = 0;
/*
* if we skip some disks the array will be assembled degraded;
* reset resync start to avoid a dirty-degraded
* situation when performing the initial sync
- *
- * FIXME handle dirty degraded
*/
- if ((skip || recovery_start == 0) &&
- !(dev->vol.dirty & RAIDVOL_DIRTY))
- this->resync_start = MaxSector;
+ if (skip)
+ missing++;
+
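+ /* on a clean array, cancel the initial resync when it cannot
+ * run on the missing disks or a member is still recovering
+ */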
+ if (!(dev->vol.dirty & RAIDVOL_DIRTY)) {
+ if ((!able_to_resync(level, missing) ||
+ recovery_start == 0))
+ this->resync_start = MaxSector;
+ } else {
+ /*
+ * FIXME handle dirty degraded
+ */
+ }
+
if (skip)
continue;
map->blocks_per_strip;
info_d->ppl_sector = this->ppl_sector;
info_d->ppl_size = this->ppl_size;
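+ /* with PPL, a member under rebuild means the volume must
+ * resync from the start
+ */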
+ if (this->consistency_policy == CONSISTENCY_POLICY_PPL &&
+ recovery_start == 0)
+ this->resync_start = 0;
} else {
info_d->component_size = blocks_per_member(map);
}
dev->vol.dirty = RAIDVOL_CLEAN;
} else {
dev->vol.dirty = RAIDVOL_DIRTY;
- if (dev->rwh_policy == RWH_DISTRIBUTED)
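+ /* a dirty volume with distributed PPL carries a valid dirty
+ * stripe record
+ */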
+ if (dev->rwh_policy == RWH_DISTRIBUTED ||
+ dev->rwh_policy == RWH_MULTIPLE_DISTRIBUTED)
dev->vol.dirty |= RAIDVOL_DSRECORD_VALID;
}
super->updates_pending++;
di->bb.supported = 1;
if (a->info.consistency_policy == CONSISTENCY_POLICY_PPL) {
di->ppl_sector = get_ppl_sector(super, inst);
- di->ppl_size = (PPL_HEADER_SIZE + PPL_ENTRY_SPACE) >> 9;
+ di->ppl_size = MULTIPLE_PPL_AREA_SIZE_IMSM >> 9;
}
super->random = random32();
di->next = rv;