super-intel.c

   1 /*
   2  * mdadm - Intel(R) Matrix Storage Manager Support
   3  *
   4  * Copyright (C) 2002-2008 Intel Corporation
   5  *
   6  * This program is free software; you can redistribute it and/or modify it
   7  * under the terms and conditions of the GNU General Public License,
   8  * version 2, as published by the Free Software Foundation.
   9  *
  10  * This program is distributed in the hope it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  13  * more details.
  14  *
  15  * You should have received a copy of the GNU General Public License along with
  16  * this program; if not, write to the Free Software Foundation, Inc.,
  17  * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
  18  */
  19
  20 #define HAVE_STDINT_H 1
  21 #include "mdadm.h"
  22 #include "mdmon.h"
  23 #include "sha1.h"
  24 #include "platform-intel.h"
  25 #include <values.h>
  26 #include <scsi/sg.h>
  27 #include <ctype.h>
  28 #include <dirent.h>
  29
  30 /* MPB == Metadata Parameter Block
      *
      * MPB_SIGNATURE is 24 characters long.  get_imsm_version() treats the
      * bytes of imsm_super.sig that follow it (offset MPB_SIG_LEN) as the
      * version field; the MPB_VERSION_* strings are presumably the values
      * stored there -- confirm against the code that reads/writes it.
      */
  31 #define MPB_SIGNATURE "Intel Raid ISM Cfg Sig. "
  32 #define MPB_SIG_LEN (strlen(MPB_SIGNATURE)) /* a strlen() call at every use */
  33 #define MPB_VERSION_RAID0 "1.0.00"
  34 #define MPB_VERSION_RAID1 "1.1.00"
  35 #define MPB_VERSION_MANY_VOLUMES_PER_ARRAY "1.2.00"
  36 #define MPB_VERSION_3OR4_DISK_ARRAY "1.2.01"
  37 #define MPB_VERSION_RAID5 "1.2.02"
  38 #define MPB_VERSION_5OR6_DISK_ARRAY "1.2.04"
  39 #define MPB_VERSION_CNG "1.2.06"
  40 #define MPB_VERSION_ATTRIBS "1.3.00"
  41 #define MAX_SIGNATURE_LENGTH  32 /* size of imsm_super.sig[] */
  42 #define MAX_RAID_SERIAL_LEN   16 /* size of the serial/volume name arrays */
  43
  44 /* supports RAID0 */
     /* Every MPB_ATTRIB_* value is wrapped in __cpu_to_le32(), i.e. the masks
      * are expressed in little-endian byte order.
      */
  45 #define MPB_ATTRIB_RAID0                __cpu_to_le32(0x00000001)
  46 /* supports RAID1 */
  47 #define MPB_ATTRIB_RAID1                __cpu_to_le32(0x00000002)
  48 /* supports RAID10 */
  49 #define MPB_ATTRIB_RAID10               __cpu_to_le32(0x00000004)
  50 /* supports RAID1E */
  51 #define MPB_ATTRIB_RAID1E               __cpu_to_le32(0x00000008)
  52 /* supports RAID5 */
  53 #define MPB_ATTRIB_RAID5                __cpu_to_le32(0x00000010)
  54 /* supports RAID CNG */
  55 #define MPB_ATTRIB_RAIDCNG              __cpu_to_le32(0x00000020)
  56 /* supports expanded stripe sizes of 256K, 512K and 1MB */
  57 #define MPB_ATTRIB_EXP_STRIPE_SIZE      __cpu_to_le32(0x00000040)
  58
  59 /* The OROM supports RST caching of volumes */
  60 #define MPB_ATTRIB_NVM                  __cpu_to_le32(0x02000000)
  61 /* The OROM supports creating disks greater than 2TB */
  62 #define MPB_ATTRIB_2TB_DISK             __cpu_to_le32(0x04000000)
  63 /* The OROM supports Bad Block Management */
  64 #define MPB_ATTRIB_BBM                  __cpu_to_le32(0x08000000)
  65
  66 /* The OROM supports NVM caching of volumes */
  67 #define MPB_ATTRIB_NEVER_USE2           __cpu_to_le32(0x10000000)
  68 /* The OROM supports creating volumes greater than 2TB */
  69 #define MPB_ATTRIB_2TB                  __cpu_to_le32(0x20000000)
  70 /* originally for PMP, now it's wasted b/c. Never use this bit! */
  71 #define MPB_ATTRIB_NEVER_USE            __cpu_to_le32(0x40000000)
  72 /* Verify MPB contents against checksum after reading MPB */
  73 #define MPB_ATTRIB_CHECKSUM_VERIFY      __cpu_to_le32(0x80000000)
  74
  75 /* Define all supported attributes that have to be accepted by mdadm.
  76  * Note that MPB_ATTRIB_RAID1E, MPB_ATTRIB_RAIDCNG, MPB_ATTRIB_NVM and
      * MPB_ATTRIB_NEVER_USE2 are not part of this mask.
      */
  77 #define MPB_ATTRIB_SUPPORTED           (MPB_ATTRIB_CHECKSUM_VERIFY | \
  78                                         MPB_ATTRIB_2TB             | \
  79                                         MPB_ATTRIB_2TB_DISK        | \
  80                                         MPB_ATTRIB_RAID0           | \
  81                                         MPB_ATTRIB_RAID1           | \
  82                                         MPB_ATTRIB_RAID10          | \
  83                                         MPB_ATTRIB_RAID5           | \
  84                                         MPB_ATTRIB_EXP_STRIPE_SIZE | \
  85                                         MPB_ATTRIB_BBM)
  86
  87 /* Define attributes that are unused but not harmful */
  88 #define MPB_ATTRIB_IGNORED              (MPB_ATTRIB_NEVER_USE)
  89
  90 #define MPB_SECTOR_CNT 2210
  91 #define IMSM_RESERVED_SECTORS 8192
  92 #define NUM_BLOCKS_DIRTY_STRIPE_REGION 2048
  93 #define SECT_PER_MB_SHIFT 11 /* 1 << 11 == 2048; presumably 512-byte sectors per MiB */
  94 #define MAX_SECTOR_SIZE 4096
  95 #define MULTIPLE_PPL_AREA_SIZE_IMSM (1024 * 1024) /* Size of the whole
  96                                                    * multiple PPL area
  97                                                    */
  98
  99 /*
 100  * Internal Write-intent bitmap is stored in the same area as the PPL.
 101  * Both features are mutually exclusive, so it is not an issue.
 102  * The first 8KiB of the area are reserved and shall not be used.
      *
      * Resulting layout, as byte offsets into the area:
      *   0     .. 8191    reserved   (IMSM_BITMAP_AREA_RESERVED_SIZE)
      *   8192  .. 12287   header     (IMSM_BITMAP_HEADER_SIZE == MAX_SECTOR_SIZE)
      *   12288 .. 1048575 bitmap     (IMSM_BITMAP_AREA_SIZE == 1036288 bytes)
 103  */
 104 #define IMSM_BITMAP_AREA_RESERVED_SIZE 8192
 105
 106 #define IMSM_BITMAP_HEADER_OFFSET (IMSM_BITMAP_AREA_RESERVED_SIZE)
 107 #define IMSM_BITMAP_HEADER_SIZE MAX_SECTOR_SIZE
 108
 109 #define IMSM_BITMAP_START_OFFSET (IMSM_BITMAP_HEADER_OFFSET + IMSM_BITMAP_HEADER_SIZE)
 110 #define IMSM_BITMAP_AREA_SIZE (MULTIPLE_PPL_AREA_SIZE_IMSM - IMSM_BITMAP_START_OFFSET)
 111 #define IMSM_BITMAP_AND_HEADER_SIZE (IMSM_BITMAP_AREA_SIZE + IMSM_BITMAP_HEADER_SIZE)
 112
 113 #define IMSM_DEFAULT_BITMAP_CHUNKSIZE (64 * 1024 * 1024) /* 64 MiB */
 114 #define IMSM_DEFAULT_BITMAP_DAEMON_SLEEP 5
 115
 116 /*
 117  * This macro lets us ensure that no-one accidentally
 118  * changes the size of a struct
      *
      * It expands to an inline function whose switch carries the labels
      * "case 0" and "case (sizeof(struct _struct) == size)".  When the sizes
      * differ the second label also evaluates to 0, and the duplicate case
      * label is rejected at compile time; when they match it is "case 1"
      * and the function compiles.  The function is never meant to be called.
 119  */
 120 #define ASSERT_SIZE(_struct, size) \
 121 static inline void __assert_size_##_struct(void)        \
 122 {                                                       \
 123         switch (0) {                                    \
 124         case 0: break;                                  \
 125         case (sizeof(struct _struct) == size): break;   \
 126         }                                               \
 127 }
 128
 129 /* Disk configuration info.
      *
      * One entry of imsm_super.disk[].  The byte offsets in the field comments
      * are those of disk[0], counted from the start of struct imsm_super
      * (where disk[] begins at 0xD8); the entry is 48 bytes long.
      */
 130 #define IMSM_MAX_DEVICES 255
 131 struct imsm_disk {
 132         __u8 serial[MAX_RAID_SERIAL_LEN];/* 0xD8 - 0xE7 ascii serial number */
 133         __u32 total_blocks_lo;           /* 0xE8 - 0xEB total blocks lo */
 134         __u32 scsi_id;                   /* 0xEC - 0xEF scsi ID */
             /* presumably the flag bits of 'status'; each is wrapped in
              * __cpu_to_le32()
              */
 135 #define SPARE_DISK      __cpu_to_le32(0x01)  /* Spare */
 136 #define CONFIGURED_DISK __cpu_to_le32(0x02)  /* Member of some RaidDev */
 137 #define FAILED_DISK     __cpu_to_le32(0x04)  /* Permanent failure */
 138 #define JOURNAL_DISK    __cpu_to_le32(0x2000000) /* Device marked as Journaling Drive */
 139         __u32 status;                    /* 0xF0 - 0xF3 */
 140         __u32 owner_cfg_num; /* 0xF4 - 0xF7 which config 0,1,2... owns this disk */
 141         __u32 total_blocks_hi;           /* 0xF8 - 0xFB total blocks hi */
 142 #define IMSM_DISK_FILLERS       3
 143         __u32 filler[IMSM_DISK_FILLERS]; /* 0xFC - 0x107 MPB_DISK_FILLERS for future expansion */
 144 };
 145 ASSERT_SIZE(imsm_disk, 48)
 146
 147 /* map selector for map managment
 148  */
 149 #define MAP_0           0
 150 #define MAP_1           1
 151 #define MAP_X           -1
 152
 153 /* RAID map configuration infos.
      *
      * Variable length: disk_ord_tbl[] really holds num_members entries (see
      * sizeof_imsm_map()); the declared array, and the 52 bytes checked by
      * ASSERT_SIZE below, cover a single entry.  The _lo / _hi field pairs
      * are presumably the low and high 32 bits of one 64-bit value --
      * confirm against the accessors.
      */
 154 struct imsm_map {
 155         __u32 pba_of_lba0_lo;   /* start address of partition */
 156         __u32 blocks_per_member_lo;/* blocks per member */
 157         __u32 num_data_stripes_lo;      /* number of data stripes */
 158         __u16 blocks_per_strip;
 159         __u8  map_state;        /* Normal, Uninitialized, Degraded, Failed */
 160 #define IMSM_T_STATE_NORMAL 0
 161 #define IMSM_T_STATE_UNINITIALIZED 1
 162 #define IMSM_T_STATE_DEGRADED 2
 163 #define IMSM_T_STATE_FAILED 3
 164         __u8  raid_level;
 165 #define IMSM_T_RAID0 0
 166 #define IMSM_T_RAID1 1
 167 #define IMSM_T_RAID5 5          /* since metadata version 1.2.02 ? */
 168         __u8  num_members;      /* number of member disks */
 169         __u8  num_domains;      /* number of parity domains */
 170         __u8  failed_disk_num;  /* valid only when state is degraded */
 171         __u8  ddf;
 172         __u32 pba_of_lba0_hi;
 173         __u32 blocks_per_member_hi;
 174         __u32 num_data_stripes_hi;
 175         __u32 filler[4];        /* expansion area */
 176 #define IMSM_ORD_REBUILD (1 << 24)
 177         __u32 disk_ord_tbl[1];  /* disk_ord_tbl[num_members],
 178                                  * top byte contains some flags
 179                                  */
 180 };
 181 ASSERT_SIZE(imsm_map, 52)
 182
 183 struct imsm_vol {
             /* Volume state: migration progress plus one map, or two maps
              * while migr_state is non-zero (see get_imsm_map()).
              */
 184         __u32 curr_migr_unit_lo;
 185         __u32 checkpoint_id;    /* id to access curr_migr_unit */
 186         __u8  migr_state;       /* Normal or Migrating */
             /* The MIGR_* values below belong to migr_type, not migr_state.
              * Read the field through migr_type(), which reports MIGR_VERIFY
              * combined with DEV_VERIFY_AND_FIX as MIGR_REPAIR.
              */
 187 #define MIGR_INIT 0
 188 #define MIGR_REBUILD 1
 189 #define MIGR_VERIFY 2 /* analogous to echo check > sync_action */
 190 #define MIGR_GEN_MIGR 3
 191 #define MIGR_STATE_CHANGE 4
 192 #define MIGR_REPAIR 5
 193         __u8  migr_type;        /* Initializing, Rebuilding, ... */
             /* presumably the values of 'dirty' -- confirm with its users */
 194 #define RAIDVOL_CLEAN          0
 195 #define RAIDVOL_DIRTY          1
 196 #define RAIDVOL_DSRECORD_VALID 2
 197         __u8  dirty;
 198         __u8  fs_state;         /* fast-sync state for CnG (0xff == disabled) */
 199         __u16 verify_errors;    /* number of mismatches */
 200         __u16 bad_blocks;       /* number of bad blocks during verify */
 201         __u32 curr_migr_unit_hi;
 202         __u32 filler[3];
 203         struct imsm_map map[1];
 204         /* here comes another one if migr_state */
 205 };
 206 ASSERT_SIZE(imsm_vol, 84)
 207
 208 struct imsm_dev {
             /* Description of one volume (RaidDev).  It ends with a struct
              * imsm_vol, whose trailing map(s) make the record variable
              * length; use sizeof_imsm_dev() for the real size.  ASSERT_SIZE
              * below checks only the fixed 164-byte form.
              */
 209         __u8  volume[MAX_RAID_SERIAL_LEN];
 210         __u32 size_low;
 211         __u32 size_high;
             /* presumably the flag bits of 'status'; each is wrapped in
              * __cpu_to_le32().  DEV_VERIFY_AND_FIX is the bit that
              * set_migr_type() pairs with MIGR_VERIFY to encode MIGR_REPAIR.
              */
 212 #define DEV_BOOTABLE            __cpu_to_le32(0x01)
 213 #define DEV_BOOT_DEVICE         __cpu_to_le32(0x02)
 214 #define DEV_READ_COALESCING     __cpu_to_le32(0x04)
 215 #define DEV_WRITE_COALESCING    __cpu_to_le32(0x08)
 216 #define DEV_LAST_SHUTDOWN_DIRTY __cpu_to_le32(0x10)
 217 #define DEV_HIDDEN_AT_BOOT      __cpu_to_le32(0x20)
 218 #define DEV_CURRENTLY_HIDDEN    __cpu_to_le32(0x40)
 219 #define DEV_VERIFY_AND_FIX      __cpu_to_le32(0x80)
 220 #define DEV_MAP_STATE_UNINIT    __cpu_to_le32(0x100)
 221 #define DEV_NO_AUTO_RECOVERY    __cpu_to_le32(0x200)
 222 #define DEV_CLONE_N_GO          __cpu_to_le32(0x400)
 223 #define DEV_CLONE_MAN_SYNC      __cpu_to_le32(0x800)
 224 #define DEV_CNG_MASTER_DISK_NUM __cpu_to_le32(0x1000)
 225         __u32 status;   /* Persistent RaidDev status */
 226         __u32 reserved_blocks; /* Reserved blocks at beginning of volume */
 227         __u8  migr_priority;
 228         __u8  num_sub_vols;
 229         __u8  tid;
 230         __u8  cng_master_disk;
 231         __u16 cache_policy;
 232         __u8  cng_state;
 233         __u8  cng_sub_state;
 234         __u16 my_vol_raid_dev_num; /* Used in Unique volume Id for this RaidDev */
 235
 236         /* NVM_EN */
 237         __u8 nv_cache_mode;
 238         __u8 nv_cache_flags;
 239
 240         /* Unique Volume Id of the NvCache Volume associated with this volume */
 241         __u32 nvc_vol_orig_family_num;
 242         __u16 nvc_vol_raid_dev_num;
 243
             /* values of rwh_policy */
 244 #define RWH_OFF 0
 245 #define RWH_DISTRIBUTED 1
 246 #define RWH_JOURNALING_DRIVE 2
 247 #define RWH_MULTIPLE_DISTRIBUTED 3
 248 #define RWH_MULTIPLE_PPLS_JOURNALING_DRIVE 4
 249 #define RWH_MULTIPLE_OFF 5
 250 #define RWH_BITMAP 6
 251         __u8  rwh_policy; /* Raid Write Hole Policy */
 252         __u8  jd_serial[MAX_RAID_SERIAL_LEN]; /* Journal Drive serial number */
 253         __u8  filler1;
 254
 255 #define IMSM_DEV_FILLERS 3
 256         __u32 filler[IMSM_DEV_FILLERS];
 257         struct imsm_vol vol;
 258 };
 259 ASSERT_SIZE(imsm_dev, 164)
 260
 261 struct imsm_super {
             /* MPB anchor.  Field comments give byte offsets from the start
              * of the structure.  disk[] is declared with one entry (the
              * size ASSERT_SIZE checks) but holds num_disks entries, followed
              * by the imsm_dev records and the BBM log.
              */
 262         __u8 sig[MAX_SIGNATURE_LENGTH]; /* 0x00 - 0x1F */
 263         __u32 check_sum;                /* 0x20 - 0x23 MPB Checksum */
 264         __u32 mpb_size;                 /* 0x24 - 0x27 Size of MPB */
 265         __u32 family_num;               /* 0x28 - 0x2B Checksum from first time this config was written */
 266         __u32 generation_num;           /* 0x2C - 0x2F Incremented each time this array's MPB is written */
 267         __u32 error_log_size;           /* 0x30 - 0x33 in bytes */
 268         __u32 attributes;               /* 0x34 - 0x37 */
 269         __u8 num_disks;                 /* 0x38 Number of configured disks */
 270         __u8 num_raid_devs;             /* 0x39 Number of configured volumes */
 271         __u8 error_log_pos;             /* 0x3A  */
 272         __u8 fill[1];                   /* 0x3B */
 273         __u32 cache_size;               /* 0x3C - 0x3F in mb */
 274         __u32 orig_family_num;          /* 0x40 - 0x43 original family num */
 275         __u32 pwr_cycle_count;          /* 0x44 - 0x47 simulated power cycle count for array */
 276         __u32 bbm_log_size;             /* 0x48 - 0x4B - size of bad Block Mgmt Log in bytes */
 277         __u16 num_raid_devs_created;    /* 0x4C - 0x4D Used for generating unique
 278                                          * volume IDs for raid_dev created in this array
 279                                          * (starts at 1)
 280                                          */
 281         __u16 filler1;                  /* 0x4E - 0x4F */
 282         __u64 creation_time;            /* 0x50 - 0x57 Array creation time */
 283 #define IMSM_FILLERS 32
 284         __u32 filler[IMSM_FILLERS];     /* 0x58 - 0xD7 RAID_MPB_FILLERS */
 285         struct imsm_disk disk[1];       /* 0xD8 diskTbl[numDisks] */
 286         /* here comes imsm_dev[num_raid_devs] */
 287         /* here comes BBM logs */
 288 };
 289 ASSERT_SIZE(imsm_super, 264)
 290
 291 #define BBM_LOG_MAX_ENTRIES 254 /* capacity of bbm_log.marked_block_entries[] */
 292 #define BBM_LOG_MAX_LBA_ENTRY_VAL 256           /* Represents 256 LBAs */
 293 #define BBM_LOG_SIGNATURE 0xabadb10c
 294
 295 struct bbm_log_block_addr {
             /* packed 6-byte block address, split into a 16-bit and a 32-bit
              * part; which part is the high one is not visible here
              */
 296         __u16 w1;
 297         __u32 dw1;
 298 } __attribute__ ((__packed__));
 299
 300 struct bbm_log_entry {
             /* packed 8-byte record: two count/ordinal bytes plus the address */
 301         __u8 marked_count;              /* Number of blocks marked - 1 */
 302         __u8 disk_ordinal;              /* Disk entry within the imsm_super */
 303         struct bbm_log_block_addr defective_block_start;
 304 } __attribute__ ((__packed__));
 305
 306 struct bbm_log {
             /* Bad Block Management log: 8-byte header followed by
              * BBM_LOG_MAX_ENTRIES 8-byte entries (2040 bytes in total).
              */
 307         __u32 signature; /* 0xABADB10C */
 308         __u32 entry_count;
 309         struct bbm_log_entry marked_block_entries[BBM_LOG_MAX_ENTRIES];
 310 };
 311 ASSERT_SIZE(bbm_log, 2040)
 312
 313 static char *map_state_str[] = { "normal", "uninitialized", "degraded", "failed" };
 314
 315 #define BLOCKS_PER_KB   (1024/512) /* == 2, i.e. 512-byte blocks */
 316
 317 #define RAID_DISK_RESERVED_BLOCKS_IMSM_HI 2209
 318
 319 #define GEN_MIGR_AREA_SIZE 2048 /* General Migration Copy Area size in blocks */
 320
 321 #define MIGR_REC_BUF_SECTORS 1 /* size of migr_record i/o buffer in sectors */
 322 #define MIGR_REC_SECTOR_POSITION 1 /* migr_record position offset on disk,
 323                                * MIGR_REC_BUF_SECTORS <= MIGR_REC_SECTOR_POSITION
 324                                */
 325
     /* The two UNIT_SRC_* values are presumably stored in
      * migr_record.rec_status -- confirm against the users of that field.
      */
 326 #define UNIT_SRC_NORMAL     0   /* Source data for curr_migr_unit must
 327                                  *  be recovered using srcMap */
 328 #define UNIT_SRC_IN_CP_AREA 1   /* Source data for curr_migr_unit has
 329                                  *  already been migrated and must
 330                                  *  be recovered from checkpoint area */
 331
 332 #define PPL_ENTRY_SPACE (128 * 1024) /* Size of single PPL, without the header */
 333
 334 struct migr_record {
             /* Migration checkpoint record: 16 meaningful 32-bit fields plus
              * 16 filler words, 128 bytes in total.  Values wider than 32
              * bits are split into _lo / _hi members (post_migr_vol_cap has
              * no _lo suffix; its high half is post_migr_vol_cap_hi).
              */
 335         __u32 rec_status;           /* Status used to determine how to restart
 336                                      * migration in case it aborts
 337                                      * in some fashion */
 338         __u32 curr_migr_unit_lo;    /* 0..numMigrUnits-1 */
 339         __u32 family_num;           /* Family number of MPB
 340                                      * containing the RaidDev
 341                                      * that is migrating */
 342         __u32 ascending_migr;       /* True if migrating in increasing
 343                                      * order of lbas */
 344         __u32 blocks_per_unit;      /* Num disk blocks per unit of operation */
 345         __u32 dest_depth_per_unit;  /* Num member blocks each destMap
 346                                      * member disk
 347                                      * advances per unit-of-operation */
 348         __u32 ckpt_area_pba_lo;     /* Pba of first block of ckpt copy area */
 349         __u32 dest_1st_member_lba_lo;   /* First member lba on first
 350                                          * stripe of destination */
 351         __u32 num_migr_units_lo;    /* Total num migration units-of-op */
 352         __u32 post_migr_vol_cap;    /* Size of volume after
 353                                      * migration completes */
 354         __u32 post_migr_vol_cap_hi; /* Expansion space for LBA64 */
 355         __u32 ckpt_read_disk_num;   /* Which member disk in destSubMap[0] the
 356                                      * migration ckpt record was read from
 357                                      * (for recovered migrations) */
 358         __u32 curr_migr_unit_hi;    /* 0..numMigrUnits-1 high order 32 bits */
 359         __u32 ckpt_area_pba_hi;     /* Pba of first block of ckpt copy area
 360                                      * high order 32 bits */
 361         __u32 dest_1st_member_lba_hi; /* First member lba on first stripe of
 362                                        * destination - high order 32 bits */
 363         __u32 num_migr_units_hi;      /* Total num migration units-of-op
 364                                        * high order 32 bits */
 365         __u32 filler[16];
 366 };
 367 ASSERT_SIZE(migr_record, 128)
 368
 369 struct md_list {
             /* Node of a singly linked list (via 'next') of devices.  The
              * meaning of found/container is not visible in this part of
              * the file; judging by the names they are flags or counters.
              */
 370         /* usage marker:
 371          *  1: load metadata
 372          *  2: metadata does not match
 373          *  4: already checked
 374          */
 375         int   used;
 376         char  *devname;
 377         int   found;
 378         int   container;
 379         dev_t st_rdev;          /* device number; name matches stat(2)'s st_rdev */
 380         struct md_list *next;
 381 };
 382
 383 #define pr_vrb(fmt, arg...) (void) (verbose && pr_err(fmt, ##arg))
 384
 385 static __u8 migr_type(struct imsm_dev *dev)
 386 {
 387         if (dev->vol.migr_type == MIGR_VERIFY &&
 388             dev->status & DEV_VERIFY_AND_FIX)
 389                 return MIGR_REPAIR;
 390         else
 391                 return dev->vol.migr_type;
 392 }
 393
 394 static void set_migr_type(struct imsm_dev *dev, __u8 migr_type)
 395 {
 396         /* for compatibility with older oroms convert MIGR_REPAIR, into
 397          * MIGR_VERIFY w/ DEV_VERIFY_AND_FIX status
 398          */
 399         if (migr_type == MIGR_REPAIR) {
 400                 dev->vol.migr_type = MIGR_VERIFY;
 401                 dev->status |= DEV_VERIFY_AND_FIX;
 402         } else {
 403                 dev->vol.migr_type = migr_type;
 404                 dev->status &= ~DEV_VERIFY_AND_FIX;
 405         }
 406 }
 407
 408 static unsigned int sector_count(__u32 bytes, unsigned int sector_size)
 409 {
 410         return ROUND_UP(bytes, sector_size) / sector_size;
 411 }
 412
 413 static unsigned int mpb_sectors(struct imsm_super *mpb,
 414                                         unsigned int sector_size)
 415 {
 416         return sector_count(__le32_to_cpu(mpb->mpb_size), sector_size);
 417 }
 418
 419 struct intel_dev {
             /* List node (linked via 'next') wrapping one imsm_dev; it is the
              * element type of intel_super.devlist.
              */
 420         struct imsm_dev *dev;
 421         struct intel_dev *next;
 422         unsigned index;         /* presumably the volume's index -- confirm */
 423 };
 424
 425 struct intel_hba {
             /* One controller attached to a container; entries are chained
              * through 'next' (see attach_hba_to_super()).
              */
 426         enum sys_dev_type type;
 427         char *path;     /* copy of sys_dev.path made by alloc_intel_hba() */
 428         char *pci_id;   /* points into 'path', just past its last '/' */
 429         struct intel_hba *next;
 430 };
 431
 432 enum action {
             /* value type of struct dl's 'action' member */
 433         DISK_REMOVE = 1,
 434         DISK_ADD        /* == 2 */
 435 };
 436 /* internal representation of IMSM metadata
      *
      * The two anonymous unions let the same buffer be addressed either as raw
      * I/O memory (buf, migr_rec_buf) or as the typed structure stored in it
      * (anchor, migr_rec).
      */
 437 struct intel_super {
 438         union {
 439                 void *buf; /* O_DIRECT buffer for reading/writing metadata */
 440                 struct imsm_super *anchor; /* immovable parameters */
 441         };
 442         union {
 443                 void *migr_rec_buf; /* buffer for I/O operations */
 444                 struct migr_record *migr_rec; /* migration record */
 445         };
 446         int clean_migration_record_by_mdmon; /* when reshape is switched to next
 447                 array, it indicates that mdmon is allowed to clean migration
 448                 record */
 449         size_t len; /* size of the 'buf' allocation */
 450         size_t extra_space; /* extra space in 'buf' that is not used yet */
 451         void *next_buf; /* for realloc'ing buf from the manager */
 452         size_t next_len;
 453         int updates_pending; /* count of pending updates for mdmon */
 454         int current_vol; /* index of raid device undergoing creation */
 455         unsigned long long create_offset; /* common start for 'current_vol' */
 456         __u32 random; /* random data for seeding new family numbers */
 457         struct intel_dev *devlist;
 458         unsigned int sector_size; /* sector size of used member drives */
 459         struct dl {
                     /* per-disk bookkeeping; get_imsm_dl_disk() searches
                      * this list by 'index'
                      */
 460                 struct dl *next;
 461                 int index;
 462                 __u8 serial[MAX_RAID_SERIAL_LEN];
 463                 int major, minor;
 464                 char *devname;
 465                 struct imsm_disk disk;
 466                 int fd;
 467                 int extent_cnt;
 468                 struct extent *e; /* for determining freespace @ create */
 469                 int raiddisk; /* slot to fill in autolayout */
 470                 enum action action;
 471         } *disks, *current_disk;
 472         struct dl *disk_mgmt_list; /* list of disks to add/remove while mdmon
 473                                       active */
 474         struct dl *missing; /* disks removed while we weren't looking */
 475         struct bbm_log *bbm_log;
 476         struct intel_hba *hba; /* device path of the raid controller for this metadata */
 477         const struct imsm_orom *orom; /* platform firmware support */
 478         struct intel_super *next; /* (temp) list for disambiguating family_num */
 479         struct md_bb bb;        /* memory for get_bad_blocks call */
 480 };
 481
 482 struct intel_disk {
             /* List node (linked via 'next') holding a copy of an imsm_disk */
 483         struct imsm_disk disk;
 484         #define IMSM_UNKNOWN_OWNER (-1) /* presumably the "no owner" value of 'owner' */
 485         int owner;
 486         struct intel_disk *next;
 487 };
 488
 489 struct extent {
             /* A start/size range; struct dl keeps an array of these ('e')
              * for determining free space at create time.  Units are not
              * visible in this part of the file.
              */
 490         unsigned long long start, size;
 491 };
 492
 493 /* definitions of reshape process types */
 494 enum imsm_reshape_type {
 495         CH_TAKEOVER,    /* == 0 */
 496         CH_MIGRATION,   /* == 1 */
 497         CH_ARRAY_SIZE,  /* == 2 */
 498 };
 499
 500 /* definition of messages passed to imsm_process_update.
      * Every struct imsm_update_* in this file starts with a member of this
      * type, which identifies the message.
      */
 501 enum imsm_update_type {
 502         update_activate_spare,
 503         update_create_array,
 504         update_kill_array,
 505         update_rename_array,
 506         update_add_remove_disk,
 507         update_reshape_container_disks,
 508         update_reshape_migration,
 509         update_takeover,
 510         update_general_migration_checkpoint,
 511         update_size_change,
 512         update_prealloc_badblocks_mem,
 513         update_rwh_policy,
 514 };
 515
 516 struct imsm_update_activate_spare {
             /* Update message; several records can be chained via 'next'.
              * Presumably it names the disk (dl), the member slot and the
              * volume (array) being filled -- confirm with the handler.
              */
 517         enum imsm_update_type type;
 518         struct dl *dl;
 519         int slot;
 520         int array;
 521         struct imsm_update_activate_spare *next;
 522 };
 523
 524 struct geo_params {
             /* Bundle of array geometry values (size, level, layout, chunk
              * size, disk count) plus two device names.  Units and the
              * difference between devnm and dev_name are not visible here.
              */
 525         char devnm[32];
 526         char *dev_name;
 527         unsigned long long size;
 528         int level;
 529         int layout;
 530         int chunksize;
 531         int raid_disks;
 532 };
 533
 534 enum takeover_direction {
             /* judging by the names: RAID10 -> RAID0 and RAID0 -> RAID10 */
 535         R10_TO_R0,
 536         R0_TO_R10
 537 };
 538 struct imsm_update_takeover {
             /* update message carrying a subarray number and a direction */
 539         enum imsm_update_type type;
 540         int subarray;
 541         enum takeover_direction direction;
 542 };
 543
 544 struct imsm_update_reshape {
             /* Update message.  new_disks[] is declared with one element but,
              * per its comment, holds one entry for each added disk.
              */
 545         enum imsm_update_type type;
 546         int old_raid_disks;
 547         int new_raid_disks;
 548
 549         int new_disks[1]; /* new_raid_disks - old_raid_disks makedev number */
 550 };
 551
 552 struct imsm_update_reshape_migration {
             /* Update message: the members of imsm_update_reshape plus the
              * target sub-device and its new level/layout/chunk size.
              */
 553         enum imsm_update_type type;
 554         int old_raid_disks;
 555         int new_raid_disks;
 556         /* fields for array migration changes
 557          */
 558         int subdev;
 559         int new_level;
 560         int new_layout;
 561         int new_chunksize;
 562
 563         int new_disks[1]; /* new_raid_disks - old_raid_disks makedev number */
 564 };
 565
 566 struct imsm_update_size_change {
             /* update message: a sub-device number and its new size (units
              * not visible in this part of the file)
              */
 567         enum imsm_update_type type;
 568         int subdev;
 569         long long new_size;
 570 };
 571
 572 struct imsm_update_general_migration_checkpoint {
             /* update message carrying a 64-bit migration unit number */
 573         enum imsm_update_type type;
 574         __u64 curr_migr_unit;
 575 };
 576
 577 struct disk_info {
             /* a disk serial number; get_disk_info() locates an array of
              * these inside a create-array update
              */
 578         __u8 serial[MAX_RAID_SERIAL_LEN];
 579 };
 580
 581 struct imsm_update_create_array {
             /* Update message.  'dev' is variable length (see
              * sizeof_imsm_dev()), and get_disk_info() returns the address
              * just past it as a struct disk_info pointer.
              */
 582         enum imsm_update_type type;
 583         int dev_idx;
 584         struct imsm_dev dev;
 585 };
 586
 587 struct imsm_update_kill_array {
             /* update message naming one device by index */
 588         enum imsm_update_type type;
 589         int dev_idx;
 590 };
 591
 592 struct imsm_update_rename_array {
             /* update message: a device index and a name of the same length
              * as imsm_dev.volume
              */
 593         enum imsm_update_type type;
 594         __u8 name[MAX_RAID_SERIAL_LEN];
 595         int dev_idx;
 596 };
 597
 598 struct imsm_update_add_remove_disk {
             /* update message with no payload beyond its type tag */
 599         enum imsm_update_type type;
 600 };
 601
 602 struct imsm_update_prealloc_bb_mem {
             /* update message with no payload beyond its type tag */
 603         enum imsm_update_type type;
 604 };
 605
 606 struct imsm_update_rwh_policy {
             /* update message: a device index and the policy to apply
              * (presumably one of the RWH_* values -- confirm)
              */
 607         enum imsm_update_type type;
 608         int new_policy;
 609         int dev_idx;
 610 };
 611
 612 static const char *_sys_dev_type[] = {
             /* Display names indexed by enum sys_dev_type; read through
              * get_sys_dev_type().  An enum value without an entry here
              * yields NULL.
              */
 613         [SYS_DEV_UNKNOWN] = "Unknown",
 614         [SYS_DEV_SAS] = "SAS",
 615         [SYS_DEV_SATA] = "SATA",
 616         [SYS_DEV_NVME] = "NVMe",
 617         [SYS_DEV_VMD] = "VMD"
 618 };
 619
 620 const char *get_sys_dev_type(enum sys_dev_type type)
 621 {
 622         if (type >= SYS_DEV_MAX)
 623                 type = SYS_DEV_UNKNOWN;
 624
 625         return _sys_dev_type[type];
 626 }
 627
 628 static struct intel_hba * alloc_intel_hba(struct sys_dev *device)
 629 {
 630         struct intel_hba *result = xmalloc(sizeof(*result));
 631
 632         result->type = device->type;
 633         result->path = xstrdup(device->path);
 634         result->next = NULL;
 635         if (result->path && (result->pci_id = strrchr(result->path, '/')) != NULL)
 636                 result->pci_id++;
 637
 638         return result;
 639 }
 640
 641 static struct intel_hba * find_intel_hba(struct intel_hba *hba, struct sys_dev *device)
 642 {
 643         struct intel_hba *result;
 644
 645         for (result = hba; result; result = result->next) {
 646                 if (result->type == device->type && strcmp(result->path, device->path) == 0)
 647                         break;
 648         }
 649         return result;
 650 }
 651
 652 static int attach_hba_to_super(struct intel_super *super, struct sys_dev *device)
 653 {
 654         struct intel_hba *hba;
 655
 656         /* check if disk attached to Intel HBA */
 657         hba = find_intel_hba(super->hba, device);
 658         if (hba != NULL)
 659                 return 1;
 660         /* Check if HBA is already attached to super */
 661         if (super->hba == NULL) {
 662                 super->hba = alloc_intel_hba(device);
 663                 return 1;
 664         }
 665
 666         hba = super->hba;
 667         /* Intel metadata allows for all disks attached to the same type HBA.
 668          * Do not support HBA types mixing
 669          */
 670         if (device->type != hba->type)
 671                 return 2;
 672
 673         /* Multiple same type HBAs can be used if they share the same OROM */
 674         const struct imsm_orom *device_orom = get_orom_by_device_id(device->dev_id);
 675
 676         if (device_orom != super->orom)
 677                 return 2;
 678
 679         while (hba->next)
 680                 hba = hba->next;
 681
 682         hba->next = alloc_intel_hba(device);
 683         return 1;
 684 }
 685
 686 static struct sys_dev* find_disk_attached_hba(int fd, const char *devname)
 687 {
 688         struct sys_dev *list, *elem;
 689         char *disk_path;
 690
 691         if ((list = find_intel_devices()) == NULL)
 692                 return 0;
 693
 694         if (fd < 0)
 695                 disk_path  = (char *) devname;
 696         else
 697                 disk_path = diskfd_to_devpath(fd, 1, NULL);
 698
 699         if (!disk_path)
 700                 return 0;
 701
 702         for (elem = list; elem; elem = elem->next)
 703                 if (path_attached_to_hba(disk_path, elem->path))
 704                         return elem;
 705
 706         if (disk_path != devname)
 707                 free(disk_path);
 708
 709         return NULL;
 710 }
 711
 712 static int find_intel_hba_capability(int fd, struct intel_super *super,
 713                                      char *devname);
     /* forward declaration only; the definition is not in this part of the file */
 714
 715 static struct supertype *match_metadata_desc_imsm(char *arg)
 716 {
 717         struct supertype *st;
 718
 719         if (strcmp(arg, "imsm") != 0 &&
 720             strcmp(arg, "default") != 0
 721                 )
 722                 return NULL;
 723
 724         st = xcalloc(1, sizeof(*st));
 725         st->ss = &super_imsm;
 726         st->max_devs = IMSM_MAX_DEVICES;
 727         st->minor_version = 0;
 728         st->sb = NULL;
 729         return st;
 730 }
 731
 732 static __u8 *get_imsm_version(struct imsm_super *mpb)
 733 {
 734         return &mpb->sig[MPB_SIG_LEN];
 735 }
 736
 737 /* retrieve a disk directly from the anchor when the anchor is known to be
 738  * up-to-date, currently only at load time
 739  */
 740 static struct imsm_disk *__get_imsm_disk(struct imsm_super *mpb, __u8 index)
 741 {
 742         if (index >= mpb->num_disks)
 743                 return NULL;
 744         return &mpb->disk[index];
 745 }
 746
 747 /* retrieve the disk description based on a index of the disk
 748  * in the sub-array
 749  */
 750 static struct dl *get_imsm_dl_disk(struct intel_super *super, __u8 index)
 751 {
 752         struct dl *d;
 753
 754         for (d = super->disks; d; d = d->next)
 755                 if (d->index == index)
 756                         return d;
 757
 758         return NULL;
 759 }
 760 /* retrieve a disk from the parsed metadata */
 761 static struct imsm_disk *get_imsm_disk(struct intel_super *super, __u8 index)
 762 {
 763         struct dl *dl;
 764
 765         dl = get_imsm_dl_disk(super, index);
 766         if (dl)
 767                 return &dl->disk;
 768
 769         return NULL;
 770 }
 771
 772 /* generate a checksum directly from the anchor when the anchor is known to be
 773  * up-to-date, currently only at load or write_super after coalescing
 774  */
 775 static __u32 __gen_imsm_checksum(struct imsm_super *mpb)
 776 {
 777         __u32 end = mpb->mpb_size / sizeof(end);
 778         __u32 *p = (__u32 *) mpb;
 779         __u32 sum = 0;
 780
 781         while (end--) {
 782                 sum += __le32_to_cpu(*p);
 783                 p++;
 784         }
 785
 786         return sum - __le32_to_cpu(mpb->check_sum);
 787 }
 788
 789 static size_t sizeof_imsm_map(struct imsm_map *map)
 790 {
 791         return sizeof(struct imsm_map) + sizeof(__u32) * (map->num_members - 1);
 792 }
 793
 794 struct imsm_map *get_imsm_map(struct imsm_dev *dev, int second_map)
 795 {
 796         /* A device can have 2 maps if it is in the middle of a migration.
 797          * If second_map is:
 798          *    MAP_0 - we return the first map
 799          *    MAP_1 - we return the second map if it exists, else NULL
 800          *    MAP_X - we return the second map if it exists, else the first
 801          */
 802         struct imsm_map *map = &dev->vol.map[0];
 803         struct imsm_map *map2 = NULL;
 804
 805         if (dev->vol.migr_state)
 806                 map2 = (void *)map + sizeof_imsm_map(map);
 807
 808         switch (second_map) {
 809         case MAP_0:
 810                 break;
 811         case MAP_1:
 812                 map = map2;
 813                 break;
 814         case MAP_X:
 815                 if (map2)
 816                         map = map2;
 817                 break;
 818         default:
 819                 map = NULL;
 820         }
 821         return map;
 822
 823 }
 824
 825 /* return the size of the device.
 826  * migr_state increases the returned size if map[0] were to be duplicated
 827  */
 828 static size_t sizeof_imsm_dev(struct imsm_dev *dev, int migr_state)
 829 {
 830         size_t size = sizeof(*dev) - sizeof(struct imsm_map) +
 831                       sizeof_imsm_map(get_imsm_map(dev, MAP_0));
 832
 833         /* migrating means an additional map */
 834         if (dev->vol.migr_state)
 835                 size += sizeof_imsm_map(get_imsm_map(dev, MAP_1));
 836         else if (migr_state)
 837                 size += sizeof_imsm_map(get_imsm_map(dev, MAP_0));
 838
 839         return size;
 840 }
 841
 842 /* retrieve disk serial number list from a metadata update */
 843 static struct disk_info *get_disk_info(struct imsm_update_create_array *update)
 844 {
 845         void *u = update;
 846         struct disk_info *inf;
 847
 848         inf = u + sizeof(*update) - sizeof(struct imsm_dev) +
 849               sizeof_imsm_dev(&update->dev, 0);
 850
 851         return inf;
 852 }
 853
 854 static struct imsm_dev *__get_imsm_dev(struct imsm_super *mpb, __u8 index)
 855 {
 856         int offset;
 857         int i;
 858         void *_mpb = mpb;
 859
 860         if (index >= mpb->num_raid_devs)
 861                 return NULL;
 862
 863         /* devices start after all disks */
 864         offset = ((void *) &mpb->disk[mpb->num_disks]) - _mpb;
 865
 866         for (i = 0; i <= index; i++)
 867                 if (i == index)
 868                         return _mpb + offset;
 869                 else
 870                         offset += sizeof_imsm_dev(_mpb + offset, 0);
 871
 872         return NULL;
 873 }
 874
 875 static struct imsm_dev *get_imsm_dev(struct intel_super *super, __u8 index)
 876 {
 877         struct intel_dev *dv;
 878
 879         if (index >= super->anchor->num_raid_devs)
 880                 return NULL;
 881         for (dv = super->devlist; dv; dv = dv->next)
 882                 if (dv->index == index)
 883                         return dv->dev;
 884         return NULL;
 885 }
 886
 887 static inline unsigned long long __le48_to_cpu(const struct bbm_log_block_addr
 888                                                *addr)
 889 {
 890         return ((((__u64)__le32_to_cpu(addr->dw1)) << 16) |
 891                 __le16_to_cpu(addr->w1));
 892 }
 893
 894 static inline struct bbm_log_block_addr __cpu_to_le48(unsigned long long sec)
 895 {
 896         struct bbm_log_block_addr addr;
 897
 898         addr.w1 =  __cpu_to_le16((__u16)(sec & 0xffff));
 899         addr.dw1 = __cpu_to_le32((__u32)(sec >> 16) & 0xffffffff);
 900         return addr;
 901 }
 902
 903 /* get size of the bbm log */
 904 static __u32 get_imsm_bbm_log_size(struct bbm_log *log)
 905 {
 906         if (!log || log->entry_count == 0)
 907                 return 0;
 908
 909         return sizeof(log->signature) +
 910                 sizeof(log->entry_count) +
 911                 log->entry_count * sizeof(struct bbm_log_entry);
 912 }
 913
 914 /* check if bad block is not partially stored in bbm log */
 915 static int is_stored_in_bbm(struct bbm_log *log, const __u8 idx, const unsigned
 916                             long long sector, const int length, __u32 *pos)
 917 {
 918         __u32 i;
 919
 920         for (i = *pos; i < log->entry_count; i++) {
 921                 struct bbm_log_entry *entry = &log->marked_block_entries[i];
 922                 unsigned long long bb_start;
 923                 unsigned long long bb_end;
 924
 925                 bb_start = __le48_to_cpu(&entry->defective_block_start);
 926                 bb_end = bb_start + (entry->marked_count + 1);
 927
 928                 if ((entry->disk_ordinal == idx) && (bb_start >= sector) &&
 929                     (bb_end <= sector + length)) {
 930                         *pos = i;
 931                         return 1;
 932                 }
 933         }
 934         return 0;
 935 }
 936
/* Record the bad range [sector, sector + length) of disk idx in the bbm log.
 *
 * Entries already in the log that lie inside the range are reused first;
 * whatever is left is appended as new entries of at most
 * BBM_LOG_MAX_LBA_ENTRY_VAL sectors each.
 *
 * Returns:
 *   1       - the remaining range fit completely into a reused entry
 *   0       - appending would exceed BBM_LOG_MAX_ENTRIES (nothing appended)
 *   new_bb  - number of entries appended otherwise
 * NOTE(review): new_bb is also 0 when nothing is left to append after
 * skipping already-full entries, which callers cannot tell apart from the
 * "log full" case - confirm this is intended.
 */
static int record_new_badblock(struct bbm_log *log, const __u8 idx, unsigned
                               long long sector, int length)
{
        int new_bb = 0;
        __u32 pos = 0;
        struct bbm_log_entry *entry = NULL;

        /* look for an existing entry inside the range that can be reused */
        while (is_stored_in_bbm(log, idx, sector, length, &pos)) {
                struct bbm_log_entry *e = &log->marked_block_entries[pos];

                /* a maximum-sized entry starting exactly at 'sector' already
                 * covers the head of the range: step past it and continue
                 * the search with the following log position
                 */
                if ((e->marked_count + 1 == BBM_LOG_MAX_LBA_ENTRY_VAL) &&
                    (__le48_to_cpu(&e->defective_block_start) == sector)) {
                        sector += BBM_LOG_MAX_LBA_ENTRY_VAL;
                        length -= BBM_LOG_MAX_LBA_ENTRY_VAL;
                        pos = pos + 1;
                        continue;
                }
                entry = e;
                break;
        }

        /* rewrite the reusable entry so it covers the head of the range */
        if (entry) {
                int cnt = (length <= BBM_LOG_MAX_LBA_ENTRY_VAL) ? length :
                        BBM_LOG_MAX_LBA_ENTRY_VAL;
                entry->defective_block_start = __cpu_to_le48(sector);
                /* marked_count stores the sector count minus one */
                entry->marked_count = cnt - 1;
                if (cnt == length)
                        return 1;
                sector += cnt;
                length -= cnt;
        }

        /* number of additional entries needed for the rest of the range */
        new_bb = ROUND_UP(length, BBM_LOG_MAX_LBA_ENTRY_VAL) /
                BBM_LOG_MAX_LBA_ENTRY_VAL;
        if (log->entry_count + new_bb > BBM_LOG_MAX_ENTRIES)
                return 0;

        /* append the rest in chunks of at most BBM_LOG_MAX_LBA_ENTRY_VAL */
        while (length > 0) {
                int cnt = (length <= BBM_LOG_MAX_LBA_ENTRY_VAL) ? length :
                        BBM_LOG_MAX_LBA_ENTRY_VAL;
                /* note: shadows the function-scope 'entry' */
                struct bbm_log_entry *entry =
                        &log->marked_block_entries[log->entry_count];

                entry->defective_block_start = __cpu_to_le48(sector);
                entry->marked_count = cnt - 1;
                entry->disk_ordinal = idx;

                sector += cnt;
                length -= cnt;

                log->entry_count++;
        }

        return new_bb;
}
 993
 994 /* clear all bad blocks for given disk */
 995 static void clear_disk_badblocks(struct bbm_log *log, const __u8 idx)
 996 {
 997         __u32 i = 0;
 998
 999         while (i < log->entry_count) {
1000                 struct bbm_log_entry *entries = log->marked_block_entries;
1001
1002                 if (entries[i].disk_ordinal == idx) {
1003                         if (i < log->entry_count - 1)
1004                                 entries[i] = entries[log->entry_count - 1];
1005                         log->entry_count--;
1006                 } else {
1007                         i++;
1008                 }
1009         }
1010 }
1011
1012 /* clear given bad block */
1013 static int clear_badblock(struct bbm_log *log, const __u8 idx, const unsigned
1014                           long long sector, const int length) {
1015         __u32 i = 0;
1016
1017         while (i < log->entry_count) {
1018                 struct bbm_log_entry *entries = log->marked_block_entries;
1019
1020                 if ((entries[i].disk_ordinal == idx) &&
1021                     (__le48_to_cpu(&entries[i].defective_block_start) ==
1022                      sector) && (entries[i].marked_count + 1 == length)) {
1023                         if (i < log->entry_count - 1)
1024                                 entries[i] = entries[log->entry_count - 1];
1025                         log->entry_count--;
1026                         break;
1027                 }
1028                 i++;
1029         }
1030
1031         return 1;
1032 }
1033
1034 /* allocate and load BBM log from metadata */
1035 static int load_bbm_log(struct intel_super *super)
1036 {
1037         struct imsm_super *mpb = super->anchor;
1038         __u32 bbm_log_size =  __le32_to_cpu(mpb->bbm_log_size);
1039
1040         super->bbm_log = xcalloc(1, sizeof(struct bbm_log));
1041         if (!super->bbm_log)
1042                 return 1;
1043
1044         if (bbm_log_size) {
1045                 struct bbm_log *log = (void *)mpb +
1046                         __le32_to_cpu(mpb->mpb_size) - bbm_log_size;
1047
1048                 __u32 entry_count;
1049
1050                 if (bbm_log_size < sizeof(log->signature) +
1051                     sizeof(log->entry_count))
1052                         return 2;
1053
1054                 entry_count = __le32_to_cpu(log->entry_count);
1055                 if ((__le32_to_cpu(log->signature) != BBM_LOG_SIGNATURE) ||
1056                     (entry_count > BBM_LOG_MAX_ENTRIES))
1057                         return 3;
1058
1059                 if (bbm_log_size !=
1060                     sizeof(log->signature) + sizeof(log->entry_count) +
1061                     entry_count * sizeof(struct bbm_log_entry))
1062                         return 4;
1063
1064                 memcpy(super->bbm_log, log, bbm_log_size);
1065         } else {
1066                 super->bbm_log->signature = __cpu_to_le32(BBM_LOG_SIGNATURE);
1067                 super->bbm_log->entry_count = 0;
1068         }
1069
1070         return 0;
1071 }
1072
1073 /* checks if bad block is within volume boundaries */
1074 static int is_bad_block_in_volume(const struct bbm_log_entry *entry,
1075                         const unsigned long long start_sector,
1076                         const unsigned long long size)
1077 {
1078         unsigned long long bb_start;
1079         unsigned long long bb_end;
1080
1081         bb_start = __le48_to_cpu(&entry->defective_block_start);
1082         bb_end = bb_start + (entry->marked_count + 1);
1083
1084         if (((bb_start >= start_sector) && (bb_start < start_sector + size)) ||
1085             ((bb_end >= start_sector) && (bb_end <= start_sector + size)))
1086                 return 1;
1087
1088         return 0;
1089 }
1090
1091 /* get list of bad blocks on a drive for a volume */
1092 static void get_volume_badblocks(const struct bbm_log *log, const __u8 idx,
1093                         const unsigned long long start_sector,
1094                         const unsigned long long size,
1095                         struct md_bb *bbs)
1096 {
1097         __u32 count = 0;
1098         __u32 i;
1099
1100         for (i = 0; i < log->entry_count; i++) {
1101                 const struct bbm_log_entry *ent =
1102                         &log->marked_block_entries[i];
1103                 struct md_bb_entry *bb;
1104
1105                 if ((ent->disk_ordinal == idx) &&
1106                     is_bad_block_in_volume(ent, start_sector, size)) {
1107
1108                         if (!bbs->entries) {
1109                                 bbs->entries = xmalloc(BBM_LOG_MAX_ENTRIES *
1110                                                      sizeof(*bb));
1111                                 if (!bbs->entries)
1112                                         break;
1113                         }
1114
1115                         bb = &bbs->entries[count++];
1116                         bb->sector = __le48_to_cpu(&ent->defective_block_start);
1117                         bb->length = ent->marked_count + 1;
1118                 }
1119         }
1120         bbs->count = count;
1121 }
1122
1123 /*
1124  * for second_map:
1125  *  == MAP_0 get first map
1126  *  == MAP_1 get second map
1127  *  == MAP_X than get map according to the current migr_state
1128  */
1129 static __u32 get_imsm_ord_tbl_ent(struct imsm_dev *dev,
1130                                   int slot,
1131                                   int second_map)
1132 {
1133         struct imsm_map *map;
1134
1135         map = get_imsm_map(dev, second_map);
1136
1137         /* top byte identifies disk under rebuild */
1138         return __le32_to_cpu(map->disk_ord_tbl[slot]);
1139 }
1140
1141 #define ord_to_idx(ord) (((ord) << 8) >> 8)
1142 static __u32 get_imsm_disk_idx(struct imsm_dev *dev, int slot, int second_map)
1143 {
1144         __u32 ord = get_imsm_ord_tbl_ent(dev, slot, second_map);
1145
1146         return ord_to_idx(ord);
1147 }
1148
1149 static void set_imsm_ord_tbl_ent(struct imsm_map *map, int slot, __u32 ord)
1150 {
1151         map->disk_ord_tbl[slot] = __cpu_to_le32(ord);
1152 }
1153
1154 static int get_imsm_disk_slot(struct imsm_map *map, unsigned idx)
1155 {
1156         int slot;
1157         __u32 ord;
1158
1159         for (slot = 0; slot < map->num_members; slot++) {
1160                 ord = __le32_to_cpu(map->disk_ord_tbl[slot]);
1161                 if (ord_to_idx(ord) == idx)
1162                         return slot;
1163         }
1164
1165         return -1;
1166 }
1167
1168 static int get_imsm_raid_level(struct imsm_map *map)
1169 {
1170         if (map->raid_level == 1) {
1171                 if (map->num_members == 2)
1172                         return 1;
1173                 else
1174                         return 10;
1175         }
1176
1177         return map->raid_level;
1178 }
1179
1180 static int cmp_extent(const void *av, const void *bv)
1181 {
1182         const struct extent *a = av;
1183         const struct extent *b = bv;
1184         if (a->start < b->start)
1185                 return -1;
1186         if (a->start > b->start)
1187                 return 1;
1188         return 0;
1189 }
1190
1191 static int count_memberships(struct dl *dl, struct intel_super *super)
1192 {
1193         int memberships = 0;
1194         int i;
1195
1196         for (i = 0; i < super->anchor->num_raid_devs; i++) {
1197                 struct imsm_dev *dev = get_imsm_dev(super, i);
1198                 struct imsm_map *map = get_imsm_map(dev, MAP_0);
1199
1200                 if (get_imsm_disk_slot(map, dl->index) >= 0)
1201                         memberships++;
1202         }
1203
1204         return memberships;
1205 }
1206
/* forward declaration: get_extents() calls this before its definition */
static __u32 imsm_min_reserved_sectors(struct intel_super *super);
1208
1209 static int split_ull(unsigned long long n, void *lo, void *hi)
1210 {
1211         if (lo == 0 || hi == 0)
1212                 return 1;
1213         __put_unaligned32(__cpu_to_le32((__u32)n), lo);
1214         __put_unaligned32(__cpu_to_le32((n >> 32)), hi);
1215         return 0;
1216 }
1217
1218 static unsigned long long join_u32(__u32 lo, __u32 hi)
1219 {
1220         return (unsigned long long)__le32_to_cpu(lo) |
1221                (((unsigned long long)__le32_to_cpu(hi)) << 32);
1222 }
1223
1224 static unsigned long long total_blocks(struct imsm_disk *disk)
1225 {
1226         if (disk == NULL)
1227                 return 0;
1228         return join_u32(disk->total_blocks_lo, disk->total_blocks_hi);
1229 }
1230
1231 static unsigned long long pba_of_lba0(struct imsm_map *map)
1232 {
1233         if (map == NULL)
1234                 return 0;
1235         return join_u32(map->pba_of_lba0_lo, map->pba_of_lba0_hi);
1236 }
1237
1238 static unsigned long long blocks_per_member(struct imsm_map *map)
1239 {
1240         if (map == NULL)
1241                 return 0;
1242         return join_u32(map->blocks_per_member_lo, map->blocks_per_member_hi);
1243 }
1244
1245 static unsigned long long num_data_stripes(struct imsm_map *map)
1246 {
1247         if (map == NULL)
1248                 return 0;
1249         return join_u32(map->num_data_stripes_lo, map->num_data_stripes_hi);
1250 }
1251
1252 static unsigned long long vol_curr_migr_unit(struct imsm_dev *dev)
1253 {
1254         if (dev == NULL)
1255                 return 0;
1256
1257         return join_u32(dev->vol.curr_migr_unit_lo, dev->vol.curr_migr_unit_hi);
1258 }
1259
1260 static unsigned long long imsm_dev_size(struct imsm_dev *dev)
1261 {
1262         if (dev == NULL)
1263                 return 0;
1264         return join_u32(dev->size_low, dev->size_high);
1265 }
1266
1267 static unsigned long long migr_chkp_area_pba(struct migr_record *migr_rec)
1268 {
1269         if (migr_rec == NULL)
1270                 return 0;
1271         return join_u32(migr_rec->ckpt_area_pba_lo,
1272                         migr_rec->ckpt_area_pba_hi);
1273 }
1274
1275 static unsigned long long current_migr_unit(struct migr_record *migr_rec)
1276 {
1277         if (migr_rec == NULL)
1278                 return 0;
1279         return join_u32(migr_rec->curr_migr_unit_lo,
1280                         migr_rec->curr_migr_unit_hi);
1281 }
1282
1283 static unsigned long long migr_dest_1st_member_lba(struct migr_record *migr_rec)
1284 {
1285         if (migr_rec == NULL)
1286                 return 0;
1287         return join_u32(migr_rec->dest_1st_member_lba_lo,
1288                         migr_rec->dest_1st_member_lba_hi);
1289 }
1290
1291 static unsigned long long get_num_migr_units(struct migr_record *migr_rec)
1292 {
1293         if (migr_rec == NULL)
1294                 return 0;
1295         return join_u32(migr_rec->num_migr_units_lo,
1296                         migr_rec->num_migr_units_hi);
1297 }
1298
1299 static void set_total_blocks(struct imsm_disk *disk, unsigned long long n)
1300 {
1301         split_ull(n, &disk->total_blocks_lo, &disk->total_blocks_hi);
1302 }
1303
1304 static void set_pba_of_lba0(struct imsm_map *map, unsigned long long n)
1305 {
1306         split_ull(n, &map->pba_of_lba0_lo, &map->pba_of_lba0_hi);
1307 }
1308
1309 static void set_blocks_per_member(struct imsm_map *map, unsigned long long n)
1310 {
1311         split_ull(n, &map->blocks_per_member_lo, &map->blocks_per_member_hi);
1312 }
1313
1314 static void set_num_data_stripes(struct imsm_map *map, unsigned long long n)
1315 {
1316         split_ull(n, &map->num_data_stripes_lo, &map->num_data_stripes_hi);
1317 }
1318
1319 static void set_vol_curr_migr_unit(struct imsm_dev *dev, unsigned long long n)
1320 {
1321         if (dev == NULL)
1322                 return;
1323
1324         split_ull(n, &dev->vol.curr_migr_unit_lo, &dev->vol.curr_migr_unit_hi);
1325 }
1326
1327 static void set_imsm_dev_size(struct imsm_dev *dev, unsigned long long n)
1328 {
1329         split_ull(n, &dev->size_low, &dev->size_high);
1330 }
1331
1332 static void set_migr_chkp_area_pba(struct migr_record *migr_rec,
1333                                    unsigned long long n)
1334 {
1335         split_ull(n, &migr_rec->ckpt_area_pba_lo, &migr_rec->ckpt_area_pba_hi);
1336 }
1337
1338 static void set_current_migr_unit(struct migr_record *migr_rec,
1339                                   unsigned long long n)
1340 {
1341         split_ull(n, &migr_rec->curr_migr_unit_lo,
1342                   &migr_rec->curr_migr_unit_hi);
1343 }
1344
1345 static void set_migr_dest_1st_member_lba(struct migr_record *migr_rec,
1346                                          unsigned long long n)
1347 {
1348         split_ull(n, &migr_rec->dest_1st_member_lba_lo,
1349                   &migr_rec->dest_1st_member_lba_hi);
1350 }
1351
1352 static void set_num_migr_units(struct migr_record *migr_rec,
1353                                unsigned long long n)
1354 {
1355         split_ull(n, &migr_rec->num_migr_units_lo,
1356                   &migr_rec->num_migr_units_hi);
1357 }
1358
1359 static unsigned long long per_dev_array_size(struct imsm_map *map)
1360 {
1361         unsigned long long array_size = 0;
1362
1363         if (map == NULL)
1364                 return array_size;
1365
1366         array_size = num_data_stripes(map) * map->blocks_per_strip;
1367         if (get_imsm_raid_level(map) == 1 || get_imsm_raid_level(map) == 10)
1368                 array_size *= 2;
1369
1370         return array_size;
1371 }
1372
1373 static struct extent *get_extents(struct intel_super *super, struct dl *dl,
1374                                   int get_minimal_reservation)
1375 {
1376         /* find a list of used extents on the given physical device */
1377         struct extent *rv, *e;
1378         int i;
1379         int memberships = count_memberships(dl, super);
1380         __u32 reservation;
1381
1382         /* trim the reserved area for spares, so they can join any array
1383          * regardless of whether the OROM has assigned sectors from the
1384          * IMSM_RESERVED_SECTORS region
1385          */
1386         if (dl->index == -1 || get_minimal_reservation)
1387                 reservation = imsm_min_reserved_sectors(super);
1388         else
1389                 reservation = MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS;
1390
1391         rv = xcalloc(sizeof(struct extent), (memberships + 1));
1392         e = rv;
1393
1394         for (i = 0; i < super->anchor->num_raid_devs; i++) {
1395                 struct imsm_dev *dev = get_imsm_dev(super, i);
1396                 struct imsm_map *map = get_imsm_map(dev, MAP_0);
1397
1398                 if (get_imsm_disk_slot(map, dl->index) >= 0) {
1399                         e->start = pba_of_lba0(map);
1400                         e->size = per_dev_array_size(map);
1401                         e++;
1402                 }
1403         }
1404         qsort(rv, memberships, sizeof(*rv), cmp_extent);
1405
1406         /* determine the start of the metadata
1407          * when no raid devices are defined use the default
1408          * ...otherwise allow the metadata to truncate the value
1409          * as is the case with older versions of imsm
1410          */
1411         if (memberships) {
1412                 struct extent *last = &rv[memberships - 1];
1413                 unsigned long long remainder;
1414
1415                 remainder = total_blocks(&dl->disk) - (last->start + last->size);
1416                 /* round down to 1k block to satisfy precision of the kernel
1417                  * 'size' interface
1418                  */
1419                 remainder &= ~1UL;
1420                 /* make sure remainder is still sane */
1421                 if (remainder < (unsigned)ROUND_UP(super->len, 512) >> 9)
1422                         remainder = ROUND_UP(super->len, 512) >> 9;
1423                 if (reservation > remainder)
1424                         reservation = remainder;
1425         }
1426         e->start = total_blocks(&dl->disk) - reservation;
1427         e->size = 0;
1428         return rv;
1429 }
1430
1431 /* try to determine how much space is reserved for metadata from
1432  * the last get_extents() entry, otherwise fallback to the
1433  * default
1434  */
1435 static __u32 imsm_reserved_sectors(struct intel_super *super, struct dl *dl)
1436 {
1437         struct extent *e;
1438         int i;
1439         __u32 rv;
1440
1441         /* for spares just return a minimal reservation which will grow
1442          * once the spare is picked up by an array
1443          */
1444         if (dl->index == -1)
1445                 return MPB_SECTOR_CNT;
1446
1447         e = get_extents(super, dl, 0);
1448         if (!e)
1449                 return MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS;
1450
1451         /* scroll to last entry */
1452         for (i = 0; e[i].size; i++)
1453                 continue;
1454
1455         rv = total_blocks(&dl->disk) - e[i].start;
1456
1457         free(e);
1458
1459         return rv;
1460 }
1461
1462 static int is_spare(struct imsm_disk *disk)
1463 {
1464         return (disk->status & SPARE_DISK) == SPARE_DISK;
1465 }
1466
1467 static int is_configured(struct imsm_disk *disk)
1468 {
1469         return (disk->status & CONFIGURED_DISK) == CONFIGURED_DISK;
1470 }
1471
1472 static int is_failed(struct imsm_disk *disk)
1473 {
1474         return (disk->status & FAILED_DISK) == FAILED_DISK;
1475 }
1476
1477 static int is_journal(struct imsm_disk *disk)
1478 {
1479         return (disk->status & JOURNAL_DISK) == JOURNAL_DISK;
1480 }
1481
1482 /* round array size down to closest MB and ensure it splits evenly
1483  * between members
1484  */
1485 static unsigned long long round_size_to_mb(unsigned long long size, unsigned int
1486                                            disk_count)
1487 {
1488         size /= disk_count;
1489         size = (size >> SECT_PER_MB_SHIFT) << SECT_PER_MB_SHIFT;
1490         size *= disk_count;
1491
1492         return size;
1493 }
1494
/* Tell whether missing_disks is within the limit for raid_level: level 10
 * allows one missing disk, every other level allows none.
 * Returns non-zero when the limit is not exceeded.
 */
static int able_to_resync(int raid_level, int missing_disks)
{
        int tolerated = (raid_level == 10) ? 1 : 0;

        return missing_disks <= tolerated;
}
1508
1509 /* try to determine how much space is reserved for metadata from
1510  * the last get_extents() entry on the smallest active disk,
1511  * otherwise fallback to the default
1512  */
1513 static __u32 imsm_min_reserved_sectors(struct intel_super *super)
1514 {
1515         struct extent *e;
1516         int i;
1517         unsigned long long min_active;
1518         __u32 remainder;
1519         __u32 rv = MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS;
1520         struct dl *dl, *dl_min = NULL;
1521
1522         if (!super)
1523                 return rv;
1524
1525         min_active = 0;
1526         for (dl = super->disks; dl; dl = dl->next) {
1527                 if (dl->index < 0)
1528                         continue;
1529                 unsigned long long blocks = total_blocks(&dl->disk);
1530                 if (blocks < min_active || min_active == 0) {
1531                         dl_min = dl;
1532                         min_active = blocks;
1533                 }
1534         }
1535         if (!dl_min)
1536                 return rv;
1537
1538         /* find last lba used by subarrays on the smallest active disk */
1539         e = get_extents(super, dl_min, 0);
1540         if (!e)
1541                 return rv;
1542         for (i = 0; e[i].size; i++)
1543                 continue;
1544
1545         remainder = min_active - e[i].start;
1546         free(e);
1547
1548         /* to give priority to recovery we should not require full
1549            IMSM_RESERVED_SECTORS from the spare */
1550         rv = MPB_SECTOR_CNT + NUM_BLOCKS_DIRTY_STRIPE_REGION;
1551
1552         /* if real reservation is smaller use that value */
1553         return  (remainder < rv) ? remainder : rv;
1554 }
1555
1556 /*
1557  * Return minimum size of a spare and sector size
1558  * that can be used in this array
1559  */
1560 int get_spare_criteria_imsm(struct supertype *st, struct spare_criteria *c)
1561 {
1562         struct intel_super *super = st->sb;
1563         struct dl *dl;
1564         struct extent *e;
1565         int i;
1566         unsigned long long size = 0;
1567
1568         c->min_size = 0;
1569         c->sector_size = 0;
1570
1571         if (!super)
1572                 return -EINVAL;
1573         /* find first active disk in array */
1574         dl = super->disks;
1575         while (dl && (is_failed(&dl->disk) || dl->index == -1))
1576                 dl = dl->next;
1577         if (!dl)
1578                 return -EINVAL;
1579         /* find last lba used by subarrays */
1580         e = get_extents(super, dl, 0);
1581         if (!e)
1582                 return -EINVAL;
1583         for (i = 0; e[i].size; i++)
1584                 continue;
1585         if (i > 0)
1586                 size = e[i-1].start + e[i-1].size;
1587         free(e);
1588
1589         /* add the amount of space needed for metadata */
1590         size += imsm_min_reserved_sectors(super);
1591
1592         c->min_size = size * 512;
1593         c->sector_size = super->sector_size;
1594
1595         return 0;
1596 }
1597
/* forward declaration: used by print_imsm_dev() before its definition */
static int is_gen_migration(struct imsm_dev *dev);

/* NOTE(review): presumably the number of 512-byte sectors in one 4K sector
 * (4096 / 512) - confirm against the users of this constant
 */
#define IMSM_4K_DIV 8

/* forward declaration: used by print_imsm_dev() before its definition */
static __u64 blocks_per_migr_unit(struct intel_super *super,
                                  struct imsm_dev *dev);
1604
1605 static void print_imsm_dev(struct intel_super *super,
1606                            struct imsm_dev *dev,
1607                            char *uuid,
1608                            int disk_idx)
1609 {
1610         __u64 sz;
1611         int slot, i;
1612         struct imsm_map *map = get_imsm_map(dev, MAP_0);
1613         struct imsm_map *map2 = get_imsm_map(dev, MAP_1);
1614         __u32 ord;
1615
1616         printf("\n");
1617         printf("[%.16s]:\n", dev->volume);
1618         printf("       Subarray : %d\n", super->current_vol);
1619         printf("           UUID : %s\n", uuid);
1620         printf("     RAID Level : %d", get_imsm_raid_level(map));
1621         if (map2)
1622                 printf(" <-- %d", get_imsm_raid_level(map2));
1623         printf("\n");
1624         printf("        Members : %d", map->num_members);
1625         if (map2)
1626                 printf(" <-- %d", map2->num_members);
1627         printf("\n");
1628         printf("          Slots : [");
1629         for (i = 0; i < map->num_members; i++) {
1630                 ord = get_imsm_ord_tbl_ent(dev, i, MAP_0);
1631                 printf("%s", ord & IMSM_ORD_REBUILD ? "_" : "U");
1632         }
1633         printf("]");
1634         if (map2) {
1635                 printf(" <-- [");
1636                 for (i = 0; i < map2->num_members; i++) {
1637                         ord = get_imsm_ord_tbl_ent(dev, i, MAP_1);
1638                         printf("%s", ord & IMSM_ORD_REBUILD ? "_" : "U");
1639                 }
1640                 printf("]");
1641         }
1642         printf("\n");
1643         printf("    Failed disk : ");
1644         if (map->failed_disk_num == 0xff)
1645                 printf("none");
1646         else
1647                 printf("%i", map->failed_disk_num);
1648         printf("\n");
1649         slot = get_imsm_disk_slot(map, disk_idx);
1650         if (slot >= 0) {
1651                 ord = get_imsm_ord_tbl_ent(dev, slot, MAP_X);
1652                 printf("      This Slot : %d%s\n", slot,
1653                        ord & IMSM_ORD_REBUILD ? " (out-of-sync)" : "");
1654         } else
1655                 printf("      This Slot : ?\n");
1656         printf("    Sector Size : %u\n", super->sector_size);
1657         sz = imsm_dev_size(dev);
1658         printf("     Array Size : %llu%s\n",
1659                    (unsigned long long)sz * 512 / super->sector_size,
1660                human_size(sz * 512));
1661         sz = blocks_per_member(map);
1662         printf("   Per Dev Size : %llu%s\n",
1663                    (unsigned long long)sz * 512 / super->sector_size,
1664                human_size(sz * 512));
1665         printf("  Sector Offset : %llu\n",
1666                 pba_of_lba0(map));
1667         printf("    Num Stripes : %llu\n",
1668                 num_data_stripes(map));
1669         printf("     Chunk Size : %u KiB",
1670                 __le16_to_cpu(map->blocks_per_strip) / 2);
1671         if (map2)
1672                 printf(" <-- %u KiB",
1673                         __le16_to_cpu(map2->blocks_per_strip) / 2);
1674         printf("\n");
1675         printf("       Reserved : %d\n", __le32_to_cpu(dev->reserved_blocks));
1676         printf("  Migrate State : ");
1677         if (dev->vol.migr_state) {
1678                 if (migr_type(dev) == MIGR_INIT)
1679                         printf("initialize\n");
1680                 else if (migr_type(dev) == MIGR_REBUILD)
1681                         printf("rebuild\n");
1682                 else if (migr_type(dev) == MIGR_VERIFY)
1683                         printf("check\n");
1684                 else if (migr_type(dev) == MIGR_GEN_MIGR)
1685                         printf("general migration\n");
1686                 else if (migr_type(dev) == MIGR_STATE_CHANGE)
1687                         printf("state change\n");
1688                 else if (migr_type(dev) == MIGR_REPAIR)
1689                         printf("repair\n");
1690                 else
1691                         printf("<unknown:%d>\n", migr_type(dev));
1692         } else
1693                 printf("idle\n");
1694         printf("      Map State : %s", map_state_str[map->map_state]);
1695         if (dev->vol.migr_state) {
1696                 struct imsm_map *map = get_imsm_map(dev, MAP_1);
1697
1698                 printf(" <-- %s", map_state_str[map->map_state]);
1699                 printf("\n     Checkpoint : %llu ", vol_curr_migr_unit(dev));
1700                 if (is_gen_migration(dev) && (slot > 1 || slot < 0))
1701                         printf("(N/A)");
1702                 else
1703                         printf("(%llu)", (unsigned long long)
1704                                    blocks_per_migr_unit(super, dev));
1705         }
1706         printf("\n");
1707         printf("    Dirty State : %s\n", (dev->vol.dirty & RAIDVOL_DIRTY) ?
1708                                          "dirty" : "clean");
1709         printf("     RWH Policy : ");
1710         if (dev->rwh_policy == RWH_OFF || dev->rwh_policy == RWH_MULTIPLE_OFF)
1711                 printf("off\n");
1712         else if (dev->rwh_policy == RWH_DISTRIBUTED)
1713                 printf("PPL distributed\n");
1714         else if (dev->rwh_policy == RWH_JOURNALING_DRIVE)
1715                 printf("PPL journaling drive\n");
1716         else if (dev->rwh_policy == RWH_MULTIPLE_DISTRIBUTED)
1717                 printf("Multiple distributed PPLs\n");
1718         else if (dev->rwh_policy == RWH_MULTIPLE_PPLS_JOURNALING_DRIVE)
1719                 printf("Multiple PPLs on journaling drive\n");
1720         else if (dev->rwh_policy == RWH_BITMAP)
1721                 printf("Write-intent bitmap\n");
1722         else
1723                 printf("<unknown:%d>\n", dev->rwh_policy);
1724
1725         printf("      Volume ID : %u\n", dev->my_vol_raid_dev_num);
1726 }
1727
1728 static void print_imsm_disk(struct imsm_disk *disk,
1729                             int index,
1730                             __u32 reserved,
1731                             unsigned int sector_size) {
1732         char str[MAX_RAID_SERIAL_LEN + 1];
1733         __u64 sz;
1734
1735         if (index < -1 || !disk)
1736                 return;
1737
1738         printf("\n");
1739         snprintf(str, MAX_RAID_SERIAL_LEN + 1, "%s", disk->serial);
1740         if (index >= 0)
1741                 printf("  Disk%02d Serial : %s\n", index, str);
1742         else
1743                 printf("    Disk Serial : %s\n", str);
1744         printf("          State :%s%s%s%s\n", is_spare(disk) ? " spare" : "",
1745                                               is_configured(disk) ? " active" : "",
1746                                               is_failed(disk) ? " failed" : "",
1747                                               is_journal(disk) ? " journal" : "");
1748         printf("             Id : %08x\n", __le32_to_cpu(disk->scsi_id));
1749         sz = total_blocks(disk) - reserved;
1750         printf("    Usable Size : %llu%s\n",
1751                (unsigned long long)sz * 512 / sector_size,
1752                human_size(sz * 512));
1753 }
1754
1755 void convert_to_4k_imsm_migr_rec(struct intel_super *super)
1756 {
1757         struct migr_record *migr_rec = super->migr_rec;
1758
1759         migr_rec->blocks_per_unit /= IMSM_4K_DIV;
1760         migr_rec->dest_depth_per_unit /= IMSM_4K_DIV;
1761         split_ull((join_u32(migr_rec->post_migr_vol_cap,
1762                  migr_rec->post_migr_vol_cap_hi) / IMSM_4K_DIV),
1763                  &migr_rec->post_migr_vol_cap, &migr_rec->post_migr_vol_cap_hi);
1764         set_migr_chkp_area_pba(migr_rec,
1765                  migr_chkp_area_pba(migr_rec) / IMSM_4K_DIV);
1766         set_migr_dest_1st_member_lba(migr_rec,
1767                  migr_dest_1st_member_lba(migr_rec) / IMSM_4K_DIV);
1768 }
1769
1770 void convert_to_4k_imsm_disk(struct imsm_disk *disk)
1771 {
1772         set_total_blocks(disk, (total_blocks(disk)/IMSM_4K_DIV));
1773 }
1774
1775 void convert_to_4k(struct intel_super *super)
1776 {
1777         struct imsm_super *mpb = super->anchor;
1778         struct imsm_disk *disk;
1779         int i;
1780         __u32 bbm_log_size = __le32_to_cpu(mpb->bbm_log_size);
1781
1782         for (i = 0; i < mpb->num_disks ; i++) {
1783                 disk = __get_imsm_disk(mpb, i);
1784                 /* disk */
1785                 convert_to_4k_imsm_disk(disk);
1786         }
1787         for (i = 0; i < mpb->num_raid_devs; i++) {
1788                 struct imsm_dev *dev = __get_imsm_dev(mpb, i);
1789                 struct imsm_map *map = get_imsm_map(dev, MAP_0);
1790                 /* dev */
1791                 set_imsm_dev_size(dev, imsm_dev_size(dev)/IMSM_4K_DIV);
1792                 set_vol_curr_migr_unit(dev,
1793                                        vol_curr_migr_unit(dev) / IMSM_4K_DIV);
1794
1795                 /* map0 */
1796                 set_blocks_per_member(map, blocks_per_member(map)/IMSM_4K_DIV);
1797                 map->blocks_per_strip /= IMSM_4K_DIV;
1798                 set_pba_of_lba0(map, pba_of_lba0(map)/IMSM_4K_DIV);
1799
1800                 if (dev->vol.migr_state) {
1801                         /* map1 */
1802                         map = get_imsm_map(dev, MAP_1);
1803                         set_blocks_per_member(map,
1804                             blocks_per_member(map)/IMSM_4K_DIV);
1805                         map->blocks_per_strip /= IMSM_4K_DIV;
1806                         set_pba_of_lba0(map, pba_of_lba0(map)/IMSM_4K_DIV);
1807                 }
1808         }
1809         if (bbm_log_size) {
1810                 struct bbm_log *log = (void *)mpb +
1811                         __le32_to_cpu(mpb->mpb_size) - bbm_log_size;
1812                 __u32 i;
1813
1814                 for (i = 0; i < log->entry_count; i++) {
1815                         struct bbm_log_entry *entry =
1816                                 &log->marked_block_entries[i];
1817
1818                         __u8 count = entry->marked_count + 1;
1819                         unsigned long long sector =
1820                                 __le48_to_cpu(&entry->defective_block_start);
1821
1822                         entry->defective_block_start =
1823                                 __cpu_to_le48(sector/IMSM_4K_DIV);
1824                         entry->marked_count = max(count/IMSM_4K_DIV, 1) - 1;
1825                 }
1826         }
1827
1828         mpb->check_sum = __gen_imsm_checksum(mpb);
1829 }
1830
1831 void examine_migr_rec_imsm(struct intel_super *super)
1832 {
1833         struct migr_record *migr_rec = super->migr_rec;
1834         struct imsm_super *mpb = super->anchor;
1835         int i;
1836
1837         for (i = 0; i < mpb->num_raid_devs; i++) {
1838                 struct imsm_dev *dev = __get_imsm_dev(mpb, i);
1839                 struct imsm_map *map;
1840                 int slot = -1;
1841
1842                 if (is_gen_migration(dev) == 0)
1843                                 continue;
1844
1845                 printf("\nMigration Record Information:");
1846
1847                 /* first map under migration */
1848                 map = get_imsm_map(dev, MAP_0);
1849                 if (map)
1850                         slot = get_imsm_disk_slot(map, super->disks->index);
1851                 if (map == NULL || slot > 1 || slot < 0) {
1852                         printf(" Empty\n                              ");
1853                         printf("Examine one of first two disks in array\n");
1854                         break;
1855                 }
1856                 printf("\n                     Status : ");
1857                 if (__le32_to_cpu(migr_rec->rec_status) == UNIT_SRC_NORMAL)
1858                         printf("Normal\n");
1859                 else
1860                         printf("Contains Data\n");
1861                 printf("               Current Unit : %llu\n",
1862                        current_migr_unit(migr_rec));
1863                 printf("                     Family : %u\n",
1864                        __le32_to_cpu(migr_rec->family_num));
1865                 printf("                  Ascending : %u\n",
1866                        __le32_to_cpu(migr_rec->ascending_migr));
1867                 printf("            Blocks Per Unit : %u\n",
1868                        __le32_to_cpu(migr_rec->blocks_per_unit));
1869                 printf("       Dest. Depth Per Unit : %u\n",
1870                        __le32_to_cpu(migr_rec->dest_depth_per_unit));
1871                 printf("        Checkpoint Area pba : %llu\n",
1872                        migr_chkp_area_pba(migr_rec));
1873                 printf("           First member lba : %llu\n",
1874                        migr_dest_1st_member_lba(migr_rec));
1875                 printf("      Total Number of Units : %llu\n",
1876                        get_num_migr_units(migr_rec));
1877                 printf("             Size of volume : %llu\n",
1878                        join_u32(migr_rec->post_migr_vol_cap,
1879                                 migr_rec->post_migr_vol_cap_hi));
1880                 printf("       Record was read from : %u\n",
1881                        __le32_to_cpu(migr_rec->ckpt_read_disk_num));
1882
1883                 break;
1884         }
1885 }
1886
1887 void convert_from_4k_imsm_migr_rec(struct intel_super *super)
1888 {
1889         struct migr_record *migr_rec = super->migr_rec;
1890
1891         migr_rec->blocks_per_unit *= IMSM_4K_DIV;
1892         migr_rec->dest_depth_per_unit *= IMSM_4K_DIV;
1893         split_ull((join_u32(migr_rec->post_migr_vol_cap,
1894                  migr_rec->post_migr_vol_cap_hi) * IMSM_4K_DIV),
1895                  &migr_rec->post_migr_vol_cap,
1896                  &migr_rec->post_migr_vol_cap_hi);
1897         set_migr_chkp_area_pba(migr_rec,
1898                  migr_chkp_area_pba(migr_rec) * IMSM_4K_DIV);
1899         set_migr_dest_1st_member_lba(migr_rec,
1900                  migr_dest_1st_member_lba(migr_rec) * IMSM_4K_DIV);
1901 }
1902
1903 void convert_from_4k(struct intel_super *super)
1904 {
1905         struct imsm_super *mpb = super->anchor;
1906         struct imsm_disk *disk;
1907         int i;
1908         __u32 bbm_log_size = __le32_to_cpu(mpb->bbm_log_size);
1909
1910         for (i = 0; i < mpb->num_disks ; i++) {
1911                 disk = __get_imsm_disk(mpb, i);
1912                 /* disk */
1913                 set_total_blocks(disk, (total_blocks(disk)*IMSM_4K_DIV));
1914         }
1915
1916         for (i = 0; i < mpb->num_raid_devs; i++) {
1917                 struct imsm_dev *dev = __get_imsm_dev(mpb, i);
1918                 struct imsm_map *map = get_imsm_map(dev, MAP_0);
1919                 /* dev */
1920                 set_imsm_dev_size(dev, imsm_dev_size(dev)*IMSM_4K_DIV);
1921                 set_vol_curr_migr_unit(dev,
1922                                        vol_curr_migr_unit(dev) * IMSM_4K_DIV);
1923
1924                 /* map0 */
1925                 set_blocks_per_member(map, blocks_per_member(map)*IMSM_4K_DIV);
1926                 map->blocks_per_strip *= IMSM_4K_DIV;
1927                 set_pba_of_lba0(map, pba_of_lba0(map)*IMSM_4K_DIV);
1928
1929                 if (dev->vol.migr_state) {
1930                         /* map1 */
1931                         map = get_imsm_map(dev, MAP_1);
1932                         set_blocks_per_member(map,
1933                             blocks_per_member(map)*IMSM_4K_DIV);
1934                         map->blocks_per_strip *= IMSM_4K_DIV;
1935                         set_pba_of_lba0(map, pba_of_lba0(map)*IMSM_4K_DIV);
1936                 }
1937         }
1938         if (bbm_log_size) {
1939                 struct bbm_log *log = (void *)mpb +
1940                         __le32_to_cpu(mpb->mpb_size) - bbm_log_size;
1941                 __u32 i;
1942
1943                 for (i = 0; i < log->entry_count; i++) {
1944                         struct bbm_log_entry *entry =
1945                                 &log->marked_block_entries[i];
1946
1947                         __u8 count = entry->marked_count + 1;
1948                         unsigned long long sector =
1949                                 __le48_to_cpu(&entry->defective_block_start);
1950
1951                         entry->defective_block_start =
1952                                 __cpu_to_le48(sector*IMSM_4K_DIV);
1953                         entry->marked_count = count*IMSM_4K_DIV - 1;
1954                 }
1955         }
1956
1957         mpb->check_sum = __gen_imsm_checksum(mpb);
1958 }
1959
1960 /*******************************************************************************
1961  * function: imsm_check_attributes
1962  * Description: Function checks if features represented by attributes flags
1963  *              are supported by mdadm.
1964  * Parameters:
1965  *              attributes - Attributes read from metadata
1966  * Returns:
1967  *              0 - passed attributes contains unsupported features flags
1968  *              1 - all features are supported
1969  ******************************************************************************/
1970 static int imsm_check_attributes(__u32 attributes)
1971 {
1972         int ret_val = 1;
1973         __u32 not_supported = MPB_ATTRIB_SUPPORTED^0xffffffff;
1974
1975         not_supported &= ~MPB_ATTRIB_IGNORED;
1976
1977         not_supported &= attributes;
1978         if (not_supported) {
1979                 pr_err("(IMSM): Unsupported attributes : %x\n",
1980                         (unsigned)__le32_to_cpu(not_supported));
1981                 if (not_supported & MPB_ATTRIB_CHECKSUM_VERIFY) {
1982                         dprintf("\t\tMPB_ATTRIB_CHECKSUM_VERIFY \n");
1983                         not_supported ^= MPB_ATTRIB_CHECKSUM_VERIFY;
1984                 }
1985                 if (not_supported & MPB_ATTRIB_2TB) {
1986                         dprintf("\t\tMPB_ATTRIB_2TB\n");
1987                         not_supported ^= MPB_ATTRIB_2TB;
1988                 }
1989                 if (not_supported & MPB_ATTRIB_RAID0) {
1990                         dprintf("\t\tMPB_ATTRIB_RAID0\n");
1991                         not_supported ^= MPB_ATTRIB_RAID0;
1992                 }
1993                 if (not_supported & MPB_ATTRIB_RAID1) {
1994                         dprintf("\t\tMPB_ATTRIB_RAID1\n");
1995                         not_supported ^= MPB_ATTRIB_RAID1;
1996                 }
1997                 if (not_supported & MPB_ATTRIB_RAID10) {
1998                         dprintf("\t\tMPB_ATTRIB_RAID10\n");
1999                         not_supported ^= MPB_ATTRIB_RAID10;
2000                 }
2001                 if (not_supported & MPB_ATTRIB_RAID1E) {
2002                         dprintf("\t\tMPB_ATTRIB_RAID1E\n");
2003                         not_supported ^= MPB_ATTRIB_RAID1E;
2004                 }
2005                 if (not_supported & MPB_ATTRIB_RAID5) {
2006                 dprintf("\t\tMPB_ATTRIB_RAID5\n");
2007                         not_supported ^= MPB_ATTRIB_RAID5;
2008                 }
2009                 if (not_supported & MPB_ATTRIB_RAIDCNG) {
2010                         dprintf("\t\tMPB_ATTRIB_RAIDCNG\n");
2011                         not_supported ^= MPB_ATTRIB_RAIDCNG;
2012                 }
2013                 if (not_supported & MPB_ATTRIB_BBM) {
2014                         dprintf("\t\tMPB_ATTRIB_BBM\n");
2015                 not_supported ^= MPB_ATTRIB_BBM;
2016                 }
2017                 if (not_supported & MPB_ATTRIB_CHECKSUM_VERIFY) {
2018                         dprintf("\t\tMPB_ATTRIB_CHECKSUM_VERIFY (== MPB_ATTRIB_LEGACY)\n");
2019                         not_supported ^= MPB_ATTRIB_CHECKSUM_VERIFY;
2020                 }
2021                 if (not_supported & MPB_ATTRIB_EXP_STRIPE_SIZE) {
2022                         dprintf("\t\tMPB_ATTRIB_EXP_STRIP_SIZE\n");
2023                         not_supported ^= MPB_ATTRIB_EXP_STRIPE_SIZE;
2024                 }
2025                 if (not_supported & MPB_ATTRIB_2TB_DISK) {
2026                         dprintf("\t\tMPB_ATTRIB_2TB_DISK\n");
2027                         not_supported ^= MPB_ATTRIB_2TB_DISK;
2028                 }
2029                 if (not_supported & MPB_ATTRIB_NEVER_USE2) {
2030                         dprintf("\t\tMPB_ATTRIB_NEVER_USE2\n");
2031                         not_supported ^= MPB_ATTRIB_NEVER_USE2;
2032                 }
2033                 if (not_supported & MPB_ATTRIB_NEVER_USE) {
2034                         dprintf("\t\tMPB_ATTRIB_NEVER_USE\n");
2035                         not_supported ^= MPB_ATTRIB_NEVER_USE;
2036                 }
2037
2038                 if (not_supported)
2039                         dprintf("(IMSM): Unknown attributes : %x\n", not_supported);
2040
2041                 ret_val = 0;
2042         }
2043
2044         return ret_val;
2045 }
2046
2047 static void getinfo_super_imsm(struct supertype *st, struct mdinfo *info, char *map);
2048
2049 static void examine_super_imsm(struct supertype *st, char *homehost)
2050 {
2051         struct intel_super *super = st->sb;
2052         struct imsm_super *mpb = super->anchor;
2053         char str[MAX_SIGNATURE_LENGTH];
2054         int i;
2055         struct mdinfo info;
2056         char nbuf[64];
2057         __u32 sum;
2058         __u32 reserved = imsm_reserved_sectors(super, super->disks);
2059         struct dl *dl;
2060         time_t creation_time;
2061
2062         strncpy(str, (char *)mpb->sig, MPB_SIG_LEN);
2063         str[MPB_SIG_LEN-1] = '\0';
2064         printf("          Magic : %s\n", str);
2065         printf("        Version : %s\n", get_imsm_version(mpb));
2066         printf("    Orig Family : %08x\n", __le32_to_cpu(mpb->orig_family_num));
2067         printf("         Family : %08x\n", __le32_to_cpu(mpb->family_num));
2068         printf("     Generation : %08x\n", __le32_to_cpu(mpb->generation_num));
2069         creation_time = __le64_to_cpu(mpb->creation_time);
2070         printf("  Creation Time : %.24s\n",
2071                 creation_time ? ctime(&creation_time) : "Unknown");
2072         printf("     Attributes : ");
2073         if (imsm_check_attributes(mpb->attributes))
2074                 printf("All supported\n");
2075         else
2076                 printf("not supported\n");
2077         getinfo_super_imsm(st, &info, NULL);
2078         fname_from_uuid(st, &info, nbuf, ':');
2079         printf("           UUID : %s\n", nbuf + 5);
2080         sum = __le32_to_cpu(mpb->check_sum);
2081         printf("       Checksum : %08x %s\n", sum,
2082                 __gen_imsm_checksum(mpb) == sum ? "correct" : "incorrect");
2083         printf("    MPB Sectors : %d\n", mpb_sectors(mpb, super->sector_size));
2084         printf("          Disks : %d\n", mpb->num_disks);
2085         printf("   RAID Devices : %d\n", mpb->num_raid_devs);
2086         print_imsm_disk(__get_imsm_disk(mpb, super->disks->index),
2087                         super->disks->index, reserved, super->sector_size);
2088         if (get_imsm_bbm_log_size(super->bbm_log)) {
2089                 struct bbm_log *log = super->bbm_log;
2090
2091                 printf("\n");
2092                 printf("Bad Block Management Log:\n");
2093                 printf("       Log Size : %d\n", __le32_to_cpu(mpb->bbm_log_size));
2094                 printf("      Signature : %x\n", __le32_to_cpu(log->signature));
2095                 printf("    Entry Count : %d\n", __le32_to_cpu(log->entry_count));
2096         }
2097         for (i = 0; i < mpb->num_raid_devs; i++) {
2098                 struct mdinfo info;
2099                 struct imsm_dev *dev = __get_imsm_dev(mpb, i);
2100
2101                 super->current_vol = i;
2102                 getinfo_super_imsm(st, &info, NULL);
2103                 fname_from_uuid(st, &info, nbuf, ':');
2104                 print_imsm_dev(super, dev, nbuf + 5, super->disks->index);
2105         }
2106         for (i = 0; i < mpb->num_disks; i++) {
2107                 if (i == super->disks->index)
2108                         continue;
2109                 print_imsm_disk(__get_imsm_disk(mpb, i), i, reserved,
2110                                 super->sector_size);
2111         }
2112
2113         for (dl = super->disks; dl; dl = dl->next)
2114                 if (dl->index == -1)
2115                         print_imsm_disk(&dl->disk, -1, reserved,
2116                                         super->sector_size);
2117
2118         examine_migr_rec_imsm(super);
2119 }
2120
2121 static void brief_examine_super_imsm(struct supertype *st, int verbose)
2122 {
2123         /* We just write a generic IMSM ARRAY entry */
2124         struct mdinfo info;
2125         char nbuf[64];
2126         struct intel_super *super = st->sb;
2127
2128         if (!super->anchor->num_raid_devs) {
2129                 printf("ARRAY metadata=imsm\n");
2130                 return;
2131         }
2132
2133         getinfo_super_imsm(st, &info, NULL);
2134         fname_from_uuid(st, &info, nbuf, ':');
2135         printf("ARRAY metadata=imsm UUID=%s\n", nbuf + 5);
2136 }
2137
2138 static void brief_examine_subarrays_imsm(struct supertype *st, int verbose)
2139 {
2140         /* We just write a generic IMSM ARRAY entry */
2141         struct mdinfo info;
2142         char nbuf[64];
2143         char nbuf1[64];
2144         struct intel_super *super = st->sb;
2145         int i;
2146
2147         if (!super->anchor->num_raid_devs)
2148                 return;
2149
2150         getinfo_super_imsm(st, &info, NULL);
2151         fname_from_uuid(st, &info, nbuf, ':');
2152         for (i = 0; i < super->anchor->num_raid_devs; i++) {
2153                 struct imsm_dev *dev = get_imsm_dev(super, i);
2154
2155                 super->current_vol = i;
2156                 getinfo_super_imsm(st, &info, NULL);
2157                 fname_from_uuid(st, &info, nbuf1, ':');
2158                 printf("ARRAY /dev/md/%.16s container=%s member=%d UUID=%s\n",
2159                        dev->volume, nbuf + 5, i, nbuf1 + 5);
2160         }
2161 }
2162
2163 static void export_examine_super_imsm(struct supertype *st)
2164 {
2165         struct intel_super *super = st->sb;
2166         struct imsm_super *mpb = super->anchor;
2167         struct mdinfo info;
2168         char nbuf[64];
2169
2170         getinfo_super_imsm(st, &info, NULL);
2171         fname_from_uuid(st, &info, nbuf, ':');
2172         printf("MD_METADATA=imsm\n");
2173         printf("MD_LEVEL=container\n");
2174         printf("MD_UUID=%s\n", nbuf+5);
2175         printf("MD_DEVICES=%u\n", mpb->num_disks);
2176         printf("MD_CREATION_TIME=%llu\n", __le64_to_cpu(mpb->creation_time));
2177 }
2178
2179 static void detail_super_imsm(struct supertype *st, char *homehost,
2180                               char *subarray)
2181 {
2182         struct mdinfo info;
2183         char nbuf[64];
2184         struct intel_super *super = st->sb;
2185         int temp_vol = super->current_vol;
2186
2187         if (subarray)
2188                 super->current_vol = strtoul(subarray, NULL, 10);
2189
2190         getinfo_super_imsm(st, &info, NULL);
2191         fname_from_uuid(st, &info, nbuf, ':');
2192         printf("\n              UUID : %s\n", nbuf + 5);
2193
2194         super->current_vol = temp_vol;
2195 }
2196
2197 static void brief_detail_super_imsm(struct supertype *st, char *subarray)
2198 {
2199         struct mdinfo info;
2200         char nbuf[64];
2201         struct intel_super *super = st->sb;
2202         int temp_vol = super->current_vol;
2203
2204         if (subarray)
2205                 super->current_vol = strtoul(subarray, NULL, 10);
2206
2207         getinfo_super_imsm(st, &info, NULL);
2208         fname_from_uuid(st, &info, nbuf, ':');
2209         printf(" UUID=%s", nbuf + 5);
2210
2211         super->current_vol = temp_vol;
2212 }
2213
2214 static int imsm_read_serial(int fd, char *devname, __u8 *serial,
2215                             size_t serial_buf_len);
2216 static void fd2devname(int fd, char *name);
2217
2218 static int ahci_enumerate_ports(const char *hba_path, int port_count, int host_base, int verbose)
2219 {
2220         /* dump an unsorted list of devices attached to AHCI Intel storage
2221          * controller, as well as non-connected ports
2222          */
2223         int hba_len = strlen(hba_path) + 1;
2224         struct dirent *ent;
2225         DIR *dir;
2226         char *path = NULL;
2227         int err = 0;
2228         unsigned long port_mask = (1 << port_count) - 1;
2229
2230         if (port_count > (int)sizeof(port_mask) * 8) {
2231                 if (verbose > 0)
2232                         pr_err("port_count %d out of range\n", port_count);
2233                 return 2;
2234         }
2235
2236         /* scroll through /sys/dev/block looking for devices attached to
2237          * this hba
2238          */
2239         dir = opendir("/sys/dev/block");
2240         if (!dir)
2241                 return 1;
2242
2243         for (ent = readdir(dir); ent; ent = readdir(dir)) {
2244                 int fd;
2245                 char model[64];
2246                 char vendor[64];
2247                 char buf[1024];
2248                 int major, minor;
2249                 char device[PATH_MAX];
2250                 char *c;
2251                 int port;
2252                 int type;
2253
2254                 if (sscanf(ent->d_name, "%d:%d", &major, &minor) != 2)
2255                         continue;
2256                 path = devt_to_devpath(makedev(major, minor), 1, NULL);
2257                 if (!path)
2258                         continue;
2259                 if (!path_attached_to_hba(path, hba_path)) {
2260                         free(path);
2261                         path = NULL;
2262                         continue;
2263                 }
2264
2265                 /* retrieve the scsi device */
2266                 if (!devt_to_devpath(makedev(major, minor), 1, device)) {
2267                         if (verbose > 0)
2268                                 pr_err("failed to get device\n");
2269                         err = 2;
2270                         break;
2271                 }
2272                 if (devpath_to_char(device, "type", buf, sizeof(buf), 0)) {
2273                         err = 2;
2274                         break;
2275                 }
2276                 type = strtoul(buf, NULL, 10);
2277
2278                 /* if it's not a disk print the vendor and model */
2279                 if (!(type == 0 || type == 7 || type == 14)) {
2280                         vendor[0] = '\0';
2281                         model[0] = '\0';
2282
2283                         if (devpath_to_char(device, "vendor", buf,
2284                                             sizeof(buf), 0) == 0) {
2285                                 strncpy(vendor, buf, sizeof(vendor));
2286                                 vendor[sizeof(vendor) - 1] = '\0';
2287                                 c = (char *) &vendor[sizeof(vendor) - 1];
2288                                 while (isspace(*c) || *c == '\0')
2289                                         *c-- = '\0';
2290
2291                         }
2292
2293                         if (devpath_to_char(device, "model", buf,
2294                                             sizeof(buf), 0) == 0) {
2295                                 strncpy(model, buf, sizeof(model));
2296                                 model[sizeof(model) - 1] = '\0';
2297                                 c = (char *) &model[sizeof(model) - 1];
2298                                 while (isspace(*c) || *c == '\0')
2299                                         *c-- = '\0';
2300                         }
2301
2302                         if (vendor[0] && model[0])
2303                                 sprintf(buf, "%.64s %.64s", vendor, model);
2304                         else
2305                                 switch (type) { /* numbers from hald/linux/device.c */
2306                                 case 1: sprintf(buf, "tape"); break;
2307                                 case 2: sprintf(buf, "printer"); break;
2308                                 case 3: sprintf(buf, "processor"); break;
2309                                 case 4:
2310                                 case 5: sprintf(buf, "cdrom"); break;
2311                                 case 6: sprintf(buf, "scanner"); break;
2312                                 case 8: sprintf(buf, "media_changer"); break;
2313                                 case 9: sprintf(buf, "comm"); break;
2314                                 case 12: sprintf(buf, "raid"); break;
2315                                 default: sprintf(buf, "unknown");
2316                                 }
2317                 } else
2318                         buf[0] = '\0';
2319
2320                 /* chop device path to 'host%d' and calculate the port number */
2321                 c = strchr(&path[hba_len], '/');
2322                 if (!c) {
2323                         if (verbose > 0)
2324                                 pr_err("%s - invalid path name\n", path + hba_len);
2325                         err = 2;
2326                         break;
2327                 }
2328                 *c = '\0';
2329                 if ((sscanf(&path[hba_len], "ata%d", &port) == 1) ||
2330                    ((sscanf(&path[hba_len], "host%d", &port) == 1)))
2331                         port -= host_base;
2332                 else {
2333                         if (verbose > 0) {
2334                                 *c = '/'; /* repair the full string */
2335                                 pr_err("failed to determine port number for %s\n",
2336                                         path);
2337                         }
2338                         err = 2;
2339                         break;
2340                 }
2341
2342                 /* mark this port as used */
2343                 port_mask &= ~(1 << port);
2344
2345                 /* print out the device information */
2346                 if (buf[0]) {
2347                         printf("          Port%d : - non-disk device (%s) -\n", port, buf);
2348                         continue;
2349                 }
2350
2351                 fd = dev_open(ent->d_name, O_RDONLY);
2352                 if (fd < 0)
2353                         printf("          Port%d : - disk info unavailable -\n", port);
2354                 else {
2355                         fd2devname(fd, buf);
2356                         printf("          Port%d : %s", port, buf);
2357                         if (imsm_read_serial(fd, NULL, (__u8 *)buf,
2358                                              sizeof(buf)) == 0)
2359                                 printf(" (%s)\n", buf);
2360                         else
2361                                 printf(" ()\n");
2362                         close(fd);
2363                 }
2364                 free(path);
2365                 path = NULL;
2366         }
2367         if (path)
2368                 free(path);
2369         if (dir)
2370                 closedir(dir);
2371         if (err == 0) {
2372                 int i;
2373
2374                 for (i = 0; i < port_count; i++)
2375                         if (port_mask & (1 << i))
2376                                 printf("          Port%d : - no device attached -\n", i);
2377         }
2378
2379         return err;
2380 }
2381
2382 static int print_nvme_info(struct sys_dev *hba)
2383 {
2384         struct dirent *ent;
2385         DIR *dir;
2386
2387         dir = opendir("/sys/block/");
2388         if (!dir)
2389                 return 1;
2390
2391         for (ent = readdir(dir); ent; ent = readdir(dir)) {
2392                 char ns_path[PATH_MAX];
2393                 char cntrl_path[PATH_MAX];
2394                 char buf[PATH_MAX];
2395                 int fd = -1;
2396
2397                 if (!strstr(ent->d_name, "nvme"))
2398                         goto skip;
2399
2400                 fd = open_dev(ent->d_name);
2401                 if (fd < 0)
2402                         goto skip;
2403
2404                 if (!diskfd_to_devpath(fd, 0, ns_path) ||
2405                     !diskfd_to_devpath(fd, 1, cntrl_path))
2406                         goto skip;
2407
2408                 if (!path_attached_to_hba(cntrl_path, hba->path))
2409                         goto skip;
2410
2411                 if (!imsm_is_nvme_namespace_supported(fd, 0))
2412                         goto skip;
2413
2414                 fd2devname(fd, buf);
2415                 if (hba->type == SYS_DEV_VMD)
2416                         printf(" NVMe under VMD : %s", buf);
2417                 else if (hba->type == SYS_DEV_NVME)
2418                         printf("    NVMe Device : %s", buf);
2419
2420                 if (!imsm_read_serial(fd, NULL, (__u8 *)buf,
2421                                       sizeof(buf)))
2422                         printf(" (%s)\n", buf);
2423                 else
2424                         printf("()\n");
2425
2426 skip:
2427                 if (fd > -1)
2428                         close(fd);
2429         }
2430
2431         closedir(dir);
2432         return 0;
2433 }
2434
2435 static void print_found_intel_controllers(struct sys_dev *elem)
2436 {
2437         for (; elem; elem = elem->next) {
2438                 pr_err("found Intel(R) ");
2439                 if (elem->type == SYS_DEV_SATA)
2440                         fprintf(stderr, "SATA ");
2441                 else if (elem->type == SYS_DEV_SAS)
2442                         fprintf(stderr, "SAS ");
2443                 else if (elem->type == SYS_DEV_NVME)
2444                         fprintf(stderr, "NVMe ");
2445
2446                 if (elem->type == SYS_DEV_VMD)
2447                         fprintf(stderr, "VMD domain");
2448                 else
2449                         fprintf(stderr, "RAID controller");
2450
2451                 if (elem->pci_id)
2452                         fprintf(stderr, " at %s", elem->pci_id);
2453                 fprintf(stderr, ".\n");
2454         }
2455         fflush(stderr);
2456 }
2457
/*
 * Determine the range of port ("ataN" / "hostN") entries under an AHCI
 * controller's sysfs directory.
 *
 * hba_path   : directory to scan
 * port_count : out - number of ports spanned by the entries found, i.e.
 *              (highest N - lowest N + 1); 0 if none were found or the
 *              directory could not be opened
 *
 * Returns the lowest N found (the host base), or -1 if the directory cannot
 * be opened or contains no matching entry.
 *
 * The minimum and maximum are tracked independently because readdir() does
 * not return entries in any particular order; deriving the count
 * incrementally from a moving base under-counts when a lower-numbered entry
 * is seen after a higher-numbered one.
 */
static int ahci_get_port_count(const char *hba_path, int *port_count)
{
	struct dirent *ent;
	DIR *dir;
	int host_min = -1;
	int host_max = -1;
	int found = 0;

	*port_count = 0;
	dir = opendir(hba_path);
	if (dir == NULL)
		return -1;

	while ((ent = readdir(dir)) != NULL) {
		int host;

		if (sscanf(ent->d_name, "ata%d", &host) != 1 &&
		    sscanf(ent->d_name, "host%d", &host) != 1)
			continue;

		if (!found) {
			host_min = host;
			host_max = host;
			found = 1;
			continue;
		}
		if (host < host_min)
			host_min = host;
		if (host > host_max)
			host_max = host;
	}
	closedir(dir);

	if (found)
		*port_count = host_max - host_min + 1;

	return host_min;
}
2485
2486 static void print_imsm_capability(const struct imsm_orom *orom)
2487 {
2488         printf("       Platform : Intel(R) ");
2489         if (orom->capabilities == 0 && orom->driver_features == 0)
2490                 printf("Matrix Storage Manager\n");
2491         else if (imsm_orom_is_enterprise(orom) && orom->major_ver >= 6)
2492                 printf("Virtual RAID on CPU\n");
2493         else
2494                 printf("Rapid Storage Technology%s\n",
2495                         imsm_orom_is_enterprise(orom) ? " enterprise" : "");
2496         if (orom->major_ver || orom->minor_ver || orom->hotfix_ver || orom->build)
2497                 printf("        Version : %d.%d.%d.%d\n", orom->major_ver,
2498                                 orom->minor_ver, orom->hotfix_ver, orom->build);
2499         printf("    RAID Levels :%s%s%s%s%s\n",
2500                imsm_orom_has_raid0(orom) ? " raid0" : "",
2501                imsm_orom_has_raid1(orom) ? " raid1" : "",
2502                imsm_orom_has_raid1e(orom) ? " raid1e" : "",
2503                imsm_orom_has_raid10(orom) ? " raid10" : "",
2504                imsm_orom_has_raid5(orom) ? " raid5" : "");
2505         printf("    Chunk Sizes :%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n",
2506                imsm_orom_has_chunk(orom, 2) ? " 2k" : "",
2507                imsm_orom_has_chunk(orom, 4) ? " 4k" : "",
2508                imsm_orom_has_chunk(orom, 8) ? " 8k" : "",
2509                imsm_orom_has_chunk(orom, 16) ? " 16k" : "",
2510                imsm_orom_has_chunk(orom, 32) ? " 32k" : "",
2511                imsm_orom_has_chunk(orom, 64) ? " 64k" : "",
2512                imsm_orom_has_chunk(orom, 128) ? " 128k" : "",
2513                imsm_orom_has_chunk(orom, 256) ? " 256k" : "",
2514                imsm_orom_has_chunk(orom, 512) ? " 512k" : "",
2515                imsm_orom_has_chunk(orom, 1024*1) ? " 1M" : "",
2516                imsm_orom_has_chunk(orom, 1024*2) ? " 2M" : "",
2517                imsm_orom_has_chunk(orom, 1024*4) ? " 4M" : "",
2518                imsm_orom_has_chunk(orom, 1024*8) ? " 8M" : "",
2519                imsm_orom_has_chunk(orom, 1024*16) ? " 16M" : "",
2520                imsm_orom_has_chunk(orom, 1024*32) ? " 32M" : "",
2521                imsm_orom_has_chunk(orom, 1024*64) ? " 64M" : "");
2522         printf("    2TB volumes :%s supported\n",
2523                (orom->attr & IMSM_OROM_ATTR_2TB)?"":" not");
2524         printf("      2TB disks :%s supported\n",
2525                (orom->attr & IMSM_OROM_ATTR_2TB_DISK)?"":" not");
2526         printf("      Max Disks : %d\n", orom->tds);
2527         printf("    Max Volumes : %d per array, %d per %s\n",
2528                orom->vpa, orom->vphba,
2529                imsm_orom_is_nvme(orom) ? "platform" : "controller");
2530         return;
2531 }
2532
2533 static void print_imsm_capability_export(const struct imsm_orom *orom)
2534 {
2535         printf("MD_FIRMWARE_TYPE=imsm\n");
2536         if (orom->major_ver || orom->minor_ver || orom->hotfix_ver || orom->build)
2537                 printf("IMSM_VERSION=%d.%d.%d.%d\n", orom->major_ver, orom->minor_ver,
2538                                 orom->hotfix_ver, orom->build);
2539         printf("IMSM_SUPPORTED_RAID_LEVELS=%s%s%s%s%s\n",
2540                         imsm_orom_has_raid0(orom) ? "raid0 " : "",
2541                         imsm_orom_has_raid1(orom) ? "raid1 " : "",
2542                         imsm_orom_has_raid1e(orom) ? "raid1e " : "",
2543                         imsm_orom_has_raid5(orom) ? "raid10 " : "",
2544                         imsm_orom_has_raid10(orom) ? "raid5 " : "");
2545         printf("IMSM_SUPPORTED_CHUNK_SIZES=%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n",
2546                         imsm_orom_has_chunk(orom, 2) ? "2k " : "",
2547                         imsm_orom_has_chunk(orom, 4) ? "4k " : "",
2548                         imsm_orom_has_chunk(orom, 8) ? "8k " : "",
2549                         imsm_orom_has_chunk(orom, 16) ? "16k " : "",
2550                         imsm_orom_has_chunk(orom, 32) ? "32k " : "",
2551                         imsm_orom_has_chunk(orom, 64) ? "64k " : "",
2552                         imsm_orom_has_chunk(orom, 128) ? "128k " : "",
2553                         imsm_orom_has_chunk(orom, 256) ? "256k " : "",
2554                         imsm_orom_has_chunk(orom, 512) ? "512k " : "",
2555                         imsm_orom_has_chunk(orom, 1024*1) ? "1M " : "",
2556                         imsm_orom_has_chunk(orom, 1024*2) ? "2M " : "",
2557                         imsm_orom_has_chunk(orom, 1024*4) ? "4M " : "",
2558                         imsm_orom_has_chunk(orom, 1024*8) ? "8M " : "",
2559                         imsm_orom_has_chunk(orom, 1024*16) ? "16M " : "",
2560                         imsm_orom_has_chunk(orom, 1024*32) ? "32M " : "",
2561                         imsm_orom_has_chunk(orom, 1024*64) ? "64M " : "");
2562         printf("IMSM_2TB_VOLUMES=%s\n",(orom->attr & IMSM_OROM_ATTR_2TB) ? "yes" : "no");
2563         printf("IMSM_2TB_DISKS=%s\n",(orom->attr & IMSM_OROM_ATTR_2TB_DISK) ? "yes" : "no");
2564         printf("IMSM_MAX_DISKS=%d\n",orom->tds);
2565         printf("IMSM_MAX_VOLUMES_PER_ARRAY=%d\n",orom->vpa);
2566         printf("IMSM_MAX_VOLUMES_PER_CONTROLLER=%d\n",orom->vphba);
2567 }
2568
2569 static int detail_platform_imsm(int verbose, int enumerate_only, char *controller_path)
2570 {
2571         /* There are two components to imsm platform support, the ahci SATA
2572          * controller and the option-rom.  To find the SATA controller we
2573          * simply look in /sys/bus/pci/drivers/ahci to see if an ahci
2574          * controller with the Intel vendor id is present.  This approach
2575          * allows mdadm to leverage the kernel's ahci detection logic, with the
2576          * caveat that if ahci.ko is not loaded mdadm will not be able to
2577          * detect platform raid capabilities.  The option-rom resides in a
2578          * platform "Adapter ROM".  We scan for its signature to retrieve the
2579          * platform capabilities.  If raid support is disabled in the BIOS the
2580          * option-rom capability structure will not be available.
2581          */
2582         struct sys_dev *list, *hba;
2583         int host_base = 0;
2584         int port_count = 0;
2585         int result=1;
2586
2587         if (enumerate_only) {
2588                 if (check_env("IMSM_NO_PLATFORM"))
2589                         return 0;
2590                 list = find_intel_devices();
2591                 if (!list)
2592                         return 2;
2593                 for (hba = list; hba; hba = hba->next) {
2594                         if (find_imsm_capability(hba)) {
2595                                 result = 0;
2596                                 break;
2597                         }
2598                         else
2599                                 result = 2;
2600                 }
2601                 return result;
2602         }
2603
2604         list = find_intel_devices();
2605         if (!list) {
2606                 if (verbose > 0)
2607                         pr_err("no active Intel(R) RAID controller found.\n");
2608                 return 2;
2609         } else if (verbose > 0)
2610                 print_found_intel_controllers(list);
2611
2612         for (hba = list; hba; hba = hba->next) {
2613                 if (controller_path && (compare_paths(hba->path, controller_path) != 0))
2614                         continue;
2615                 if (!find_imsm_capability(hba)) {
2616                         char buf[PATH_MAX];
2617                         pr_err("imsm capabilities not found for controller: %s (type %s)\n",
2618                                   hba->type == SYS_DEV_VMD ? vmd_domain_to_controller(hba, buf) : hba->path,
2619                                   get_sys_dev_type(hba->type));
2620                         continue;
2621                 }
2622                 result = 0;
2623         }
2624
2625         if (controller_path && result == 1) {
2626                 pr_err("no active Intel(R) RAID controller found under %s\n",
2627                                 controller_path);
2628                 return result;
2629         }
2630
2631         const struct orom_entry *entry;
2632
2633         for (entry = orom_entries; entry; entry = entry->next) {
2634                 if (entry->type == SYS_DEV_VMD) {
2635                         print_imsm_capability(&entry->orom);
2636                         printf(" 3rd party NVMe :%s supported\n",
2637                             imsm_orom_has_tpv_support(&entry->orom)?"":" not");
2638                         for (hba = list; hba; hba = hba->next) {
2639                                 if (hba->type == SYS_DEV_VMD) {
2640                                         char buf[PATH_MAX];
2641                                         printf(" I/O Controller : %s (%s)\n",
2642                                                 vmd_domain_to_controller(hba, buf), get_sys_dev_type(hba->type));
2643                                         if (print_nvme_info(hba)) {
2644                                                 if (verbose > 0)
2645                                                         pr_err("failed to get devices attached to VMD domain.\n");
2646                                                 result |= 2;
2647                                         }
2648                                 }
2649                         }
2650                         printf("\n");
2651                         continue;
2652                 }
2653
2654                 print_imsm_capability(&entry->orom);
2655                 if (entry->type == SYS_DEV_NVME) {
2656                         for (hba = list; hba; hba = hba->next) {
2657                                 if (hba->type == SYS_DEV_NVME)
2658                                         print_nvme_info(hba);
2659                         }
2660                         printf("\n");
2661                         continue;
2662                 }
2663
2664                 struct devid_list *devid;
2665                 for (devid = entry->devid_list; devid; devid = devid->next) {
2666                         hba = device_by_id(devid->devid);
2667                         if (!hba)
2668                                 continue;
2669
2670                         printf(" I/O Controller : %s (%s)\n",
2671                                 hba->path, get_sys_dev_type(hba->type));
2672                         if (hba->type == SYS_DEV_SATA) {
2673                                 host_base = ahci_get_port_count(hba->path, &port_count);
2674                                 if (ahci_enumerate_ports(hba->path, port_count, host_base, verbose)) {
2675                                         if (verbose > 0)
2676                                                 pr_err("failed to enumerate ports on SATA controller at %s.\n", hba->pci_id);
2677                                         result |= 2;
2678                                 }
2679                         }
2680                 }
2681                 printf("\n");
2682         }
2683
2684         return result;
2685 }
2686
2687 static int export_detail_platform_imsm(int verbose, char *controller_path)
2688 {
2689         struct sys_dev *list, *hba;
2690         int result=1;
2691
2692         list = find_intel_devices();
2693         if (!list) {
2694                 if (verbose > 0)
2695                         pr_err("IMSM_DETAIL_PLATFORM_ERROR=NO_INTEL_DEVICES\n");
2696                 result = 2;
2697                 return result;
2698         }
2699
2700         for (hba = list; hba; hba = hba->next) {
2701                 if (controller_path && (compare_paths(hba->path,controller_path) != 0))
2702                         continue;
2703                 if (!find_imsm_capability(hba) && verbose > 0) {
2704                         char buf[PATH_MAX];
2705                         pr_err("IMSM_DETAIL_PLATFORM_ERROR=NO_IMSM_CAPABLE_DEVICE_UNDER_%s\n",
2706                         hba->type == SYS_DEV_VMD ? vmd_domain_to_controller(hba, buf) : hba->path);
2707                 }
2708                 else
2709                         result = 0;
2710         }
2711
2712         const struct orom_entry *entry;
2713
2714         for (entry = orom_entries; entry; entry = entry->next) {
2715                 if (entry->type == SYS_DEV_VMD) {
2716                         for (hba = list; hba; hba = hba->next)
2717                                 print_imsm_capability_export(&entry->orom);
2718                         continue;
2719                 }
2720                 print_imsm_capability_export(&entry->orom);
2721         }
2722
2723         return result;
2724 }
2725
/*
 * Host matching for imsm containers.
 *
 * imsm metadata carries no host identification, so it is impossible to
 * either confirm or deny that an array is "meant" for this host; -1 is
 * returned unconditionally.  Foreign member disks are filtered out by
 * compare_super and the 'family_num' fields, and mdadm.conf names the
 * arrays to assemble.  Auto-assembly may still pick up "foreign" arrays.
 */
static int match_home_imsm(struct supertype *st, char *homehost)
{
	return -1;
}
2739
2740 static void uuid_from_super_imsm(struct supertype *st, int uuid[4])
2741 {
2742         /* The uuid returned here is used for:
2743          *  uuid to put into bitmap file (Create, Grow)
2744          *  uuid for backup header when saving critical section (Grow)
2745          *  comparing uuids when re-adding a device into an array
2746          *    In these cases the uuid required is that of the data-array,
2747          *    not the device-set.
2748          *  uuid to recognise same set when adding a missing device back
2749          *    to an array.   This is a uuid for the device-set.
2750          *
2751          * For each of these we can make do with a truncated
2752          * or hashed uuid rather than the original, as long as
2753          * everyone agrees.
2754          * In each case the uuid required is that of the data-array,
2755          * not the device-set.
2756          */
2757         /* imsm does not track uuid's so we synthesis one using sha1 on
2758          * - The signature (Which is constant for all imsm array, but no matter)
2759          * - the orig_family_num of the container
2760          * - the index number of the volume
2761          * - the 'serial' number of the volume.
2762          * Hopefully these are all constant.
2763          */
2764         struct intel_super *super = st->sb;
2765
2766         char buf[20];
2767         struct sha1_ctx ctx;
2768         struct imsm_dev *dev = NULL;
2769         __u32 family_num;
2770
2771         /* some mdadm versions failed to set ->orig_family_num, in which
2772          * case fall back to ->family_num.  orig_family_num will be
2773          * fixed up with the first metadata update.
2774          */
2775         family_num = super->anchor->orig_family_num;
2776         if (family_num == 0)
2777                 family_num = super->anchor->family_num;
2778         sha1_init_ctx(&ctx);
2779         sha1_process_bytes(super->anchor->sig, MPB_SIG_LEN, &ctx);
2780         sha1_process_bytes(&family_num, sizeof(__u32), &ctx);
2781         if (super->current_vol >= 0)
2782                 dev = get_imsm_dev(super, super->current_vol);
2783         if (dev) {
2784                 __u32 vol = super->current_vol;
2785                 sha1_process_bytes(&vol, sizeof(vol), &ctx);
2786                 sha1_process_bytes(dev->volume, MAX_RAID_SERIAL_LEN, &ctx);
2787         }
2788         sha1_finish_ctx(&ctx, buf);
2789         memcpy(uuid, buf, 4*4);
2790 }
2791
#if 0
/*
 * Disabled code: split the "major.minor.patch" version string that follows
 * the signature into numbers.  *m receives the minor and *p the patch
 * component; the major component is parsed but not returned.
 *
 * At most two characters per component are kept.  Parsing stops at the end
 * of the signature field, at a NUL, or once three components were consumed,
 * so ver_parse[] is never indexed out of range.  Components are converted
 * as decimal so that a leading zero (e.g. "08") is not taken as octal.
 */
static void
get_imsm_numerical_version(struct imsm_super *mpb, int *m, int *p)
{
	__u8 *v = get_imsm_version(mpb);
	__u8 *end = mpb->sig + MAX_SIGNATURE_LENGTH;
	char major[] = { 0, 0, 0 };
	char minor[] = { 0 ,0, 0 };
	char patch[] = { 0, 0, 0 };
	char *ver_parse[] = { major, minor, patch };
	int i, j;

	i = j = 0;
	/* bounds are checked before *v is read */
	while (v < end && i < 3 && *v != '\0') {
		if (*v != '.' && j < 2)
			ver_parse[i][j++] = *v;
		else {
			i++;
			j = 0;
		}
		v++;
	}

	*m = strtol(minor, NULL, 10);
	*p = strtol(patch, NULL, 10);
}
#endif
2819
2820 static __u32 migr_strip_blocks_resync(struct imsm_dev *dev)
2821 {
2822         /* migr_strip_size when repairing or initializing parity */
2823         struct imsm_map *map = get_imsm_map(dev, MAP_0);
2824         __u32 chunk = __le32_to_cpu(map->blocks_per_strip);
2825
2826         switch (get_imsm_raid_level(map)) {
2827         case 5:
2828         case 10:
2829                 return chunk;
2830         default:
2831                 return 128*1024 >> 9;
2832         }
2833 }
2834
2835 static __u32 migr_strip_blocks_rebuild(struct imsm_dev *dev)
2836 {
2837         /* migr_strip_size when rebuilding a degraded disk, no idea why
2838          * this is different than migr_strip_size_resync(), but it's good
2839          * to be compatible
2840          */
2841         struct imsm_map *map = get_imsm_map(dev, MAP_1);
2842         __u32 chunk = __le32_to_cpu(map->blocks_per_strip);
2843
2844         switch (get_imsm_raid_level(map)) {
2845         case 1:
2846         case 10:
2847                 if (map->num_members % map->num_domains == 0)
2848                         return 128*1024 >> 9;
2849                 else
2850                         return chunk;
2851         case 5:
2852                 return max((__u32) 64*1024 >> 9, chunk);
2853         default:
2854                 return 128*1024 >> 9;
2855         }
2856 }
2857
2858 static __u32 num_stripes_per_unit_resync(struct imsm_dev *dev)
2859 {
2860         struct imsm_map *lo = get_imsm_map(dev, MAP_0);
2861         struct imsm_map *hi = get_imsm_map(dev, MAP_1);
2862         __u32 lo_chunk = __le32_to_cpu(lo->blocks_per_strip);
2863         __u32 hi_chunk = __le32_to_cpu(hi->blocks_per_strip);
2864
2865         return max((__u32) 1, hi_chunk / lo_chunk);
2866 }
2867
2868 static __u32 num_stripes_per_unit_rebuild(struct imsm_dev *dev)
2869 {
2870         struct imsm_map *lo = get_imsm_map(dev, MAP_0);
2871         int level = get_imsm_raid_level(lo);
2872
2873         if (level == 1 || level == 10) {
2874                 struct imsm_map *hi = get_imsm_map(dev, MAP_1);
2875
2876                 return hi->num_domains;
2877         } else
2878                 return num_stripes_per_unit_resync(dev);
2879 }
2880
2881 static __u8 imsm_num_data_members(struct imsm_map *map)
2882 {
2883         /* named 'imsm_' because raid0, raid1 and raid10
2884          * counter-intuitively have the same number of data disks
2885          */
2886         switch (get_imsm_raid_level(map)) {
2887         case 0:
2888                 return map->num_members;
2889                 break;
2890         case 1:
2891         case 10:
2892                 return map->num_members/2;
2893         case 5:
2894                 return map->num_members - 1;
2895         default:
2896                 dprintf("unsupported raid level\n");
2897                 return 0;
2898         }
2899 }
2900
2901 static unsigned long long calc_component_size(struct imsm_map *map,
2902                                               struct imsm_dev *dev)
2903 {
2904         unsigned long long component_size;
2905         unsigned long long dev_size = imsm_dev_size(dev);
2906         long long calc_dev_size = 0;
2907         unsigned int member_disks = imsm_num_data_members(map);
2908
2909         if (member_disks == 0)
2910                 return 0;
2911
2912         component_size = per_dev_array_size(map);
2913         calc_dev_size = component_size * member_disks;
2914
2915         /* Component size is rounded to 1MB so difference between size from
2916          * metadata and size calculated from num_data_stripes equals up to
2917          * 2048 blocks per each device. If the difference is higher it means
2918          * that array size was expanded and num_data_stripes was not updated.
2919          */
2920         if (llabs(calc_dev_size - (long long)dev_size) >
2921             (1 << SECT_PER_MB_SHIFT) * member_disks) {
2922                 component_size = dev_size / member_disks;
2923                 dprintf("Invalid num_data_stripes in metadata; expected=%llu, found=%llu\n",
2924                         component_size / map->blocks_per_strip,
2925                         num_data_stripes(map));
2926         }
2927
2928         return component_size;
2929 }
2930
2931 static __u32 parity_segment_depth(struct imsm_dev *dev)
2932 {
2933         struct imsm_map *map = get_imsm_map(dev, MAP_0);
2934         __u32 chunk =  __le32_to_cpu(map->blocks_per_strip);
2935
2936         switch(get_imsm_raid_level(map)) {
2937         case 1:
2938         case 10:
2939                 return chunk * map->num_domains;
2940         case 5:
2941                 return chunk * map->num_members;
2942         default:
2943                 return chunk;
2944         }
2945 }
2946
2947 static __u32 map_migr_block(struct imsm_dev *dev, __u32 block)
2948 {
2949         struct imsm_map *map = get_imsm_map(dev, MAP_1);
2950         __u32 chunk = __le32_to_cpu(map->blocks_per_strip);
2951         __u32 strip = block / chunk;
2952
2953         switch (get_imsm_raid_level(map)) {
2954         case 1:
2955         case 10: {
2956                 __u32 vol_strip = (strip * map->num_domains) + 1;
2957                 __u32 vol_stripe = vol_strip / map->num_members;
2958
2959                 return vol_stripe * chunk + block % chunk;
2960         } case 5: {
2961                 __u32 stripe = strip / (map->num_members - 1);
2962
2963                 return stripe * chunk + block % chunk;
2964         }
2965         default:
2966                 return 0;
2967         }
2968 }
2969
2970 static __u64 blocks_per_migr_unit(struct intel_super *super,
2971                                   struct imsm_dev *dev)
2972 {
2973         /* calculate the conversion factor between per member 'blocks'
2974          * (md/{resync,rebuild}_start) and imsm migration units, return
2975          * 0 for the 'not migrating' and 'unsupported migration' cases
2976          */
2977         if (!dev->vol.migr_state)
2978                 return 0;
2979
2980         switch (migr_type(dev)) {
2981         case MIGR_GEN_MIGR: {
2982                 struct migr_record *migr_rec = super->migr_rec;
2983                 return __le32_to_cpu(migr_rec->blocks_per_unit);
2984         }
2985         case MIGR_VERIFY:
2986         case MIGR_REPAIR:
2987         case MIGR_INIT: {
2988                 struct imsm_map *map = get_imsm_map(dev, MAP_0);
2989                 __u32 stripes_per_unit;
2990                 __u32 blocks_per_unit;
2991                 __u32 parity_depth;
2992                 __u32 migr_chunk;
2993                 __u32 block_map;
2994                 __u32 block_rel;
2995                 __u32 segment;
2996                 __u32 stripe;
2997                 __u8  disks;
2998
2999                 /* yes, this is really the translation of migr_units to
3000                  * per-member blocks in the 'resync' case
3001                  */
3002                 stripes_per_unit = num_stripes_per_unit_resync(dev);
3003                 migr_chunk = migr_strip_blocks_resync(dev);
3004                 disks = imsm_num_data_members(map);
3005                 blocks_per_unit = stripes_per_unit * migr_chunk * disks;
3006                 stripe = __le16_to_cpu(map->blocks_per_strip) * disks;
3007                 segment = blocks_per_unit / stripe;
3008                 block_rel = blocks_per_unit - segment * stripe;
3009                 parity_depth = parity_segment_depth(dev);
3010                 block_map = map_migr_block(dev, block_rel);
3011                 return block_map + parity_depth * segment;
3012         }
3013         case MIGR_REBUILD: {
3014                 __u32 stripes_per_unit;
3015                 __u32 migr_chunk;
3016
3017                 stripes_per_unit = num_stripes_per_unit_rebuild(dev);
3018                 migr_chunk = migr_strip_blocks_rebuild(dev);
3019                 return migr_chunk * stripes_per_unit;
3020         }
3021         case MIGR_STATE_CHANGE:
3022         default:
3023                 return 0;
3024         }
3025 }
3026
3027 static int imsm_level_to_layout(int level)
3028 {
3029         switch (level) {
3030         case 0:
3031         case 1:
3032                 return 0;
3033         case 5:
3034         case 6:
3035                 return ALGORITHM_LEFT_ASYMMETRIC;
3036         case 10:
3037                 return 0x102;
3038         }
3039         return UnSet;
3040 }
3041
3042 /*******************************************************************************
3043  * Function:    read_imsm_migr_rec
3044  * Description: Function reads imsm migration record from last sector of disk
3045  * Parameters:
3046  *      fd      : disk descriptor
3047  *      super   : metadata info
3048  * Returns:
3049  *       0 : success,
3050  *      -1 : fail
3051  ******************************************************************************/
3052 static int read_imsm_migr_rec(int fd, struct intel_super *super)
3053 {
3054         int ret_val = -1;
3055         unsigned int sector_size = super->sector_size;
3056         unsigned long long dsize;
3057
3058         get_dev_size(fd, NULL, &dsize);
3059         if (lseek64(fd, dsize - (sector_size*MIGR_REC_SECTOR_POSITION),
3060                    SEEK_SET) < 0) {
3061                 pr_err("Cannot seek to anchor block: %s\n",
3062                        strerror(errno));
3063                 goto out;
3064         }
3065         if ((unsigned int)read(fd, super->migr_rec_buf,
3066             MIGR_REC_BUF_SECTORS*sector_size) !=
3067             MIGR_REC_BUF_SECTORS*sector_size) {
3068                 pr_err("Cannot read migr record block: %s\n",
3069                        strerror(errno));
3070                 goto out;
3071         }
3072         ret_val = 0;
3073         if (sector_size == 4096)
3074                 convert_from_4k_imsm_migr_rec(super);
3075
3076 out:
3077         return ret_val;
3078 }
3079
3080 static struct imsm_dev *imsm_get_device_during_migration(
3081         struct intel_super *super)
3082 {
3083
3084         struct intel_dev *dv;
3085
3086         for (dv = super->devlist; dv; dv = dv->next) {
3087                 if (is_gen_migration(dv->dev))
3088                         return dv->dev;
3089         }
3090         return NULL;
3091 }
3092
3093 /*******************************************************************************
3094  * Function:    load_imsm_migr_rec
3095  * Description: Function reads imsm migration record (it is stored at the last
3096  *              sector of disk)
3097  * Parameters:
3098  *      super   : imsm internal array info
3099  * Returns:
3100  *       0 : success
3101  *      -1 : fail
3102  *      -2 : no migration in progress
3103  ******************************************************************************/
3104 static int load_imsm_migr_rec(struct intel_super *super)
3105 {
3106         struct dl *dl;
3107         char nm[30];
3108         int retval = -1;
3109         int fd = -1;
3110         struct imsm_dev *dev;
3111         struct imsm_map *map;
3112         int slot = -1;
3113         int keep_fd = 1;
3114
3115         /* find map under migration */
3116         dev = imsm_get_device_during_migration(super);
3117         /* nothing to load,no migration in progress?
3118         */
3119         if (dev == NULL)
3120                 return -2;
3121
3122         map = get_imsm_map(dev, MAP_0);
3123         if (!map)
3124                 return -1;
3125
3126         for (dl = super->disks; dl; dl = dl->next) {
3127                 /* skip spare and failed disks
3128                  */
3129                 if (dl->index < 0)
3130                         continue;
3131                 /* read only from one of the first two slots
3132                  */
3133                 slot = get_imsm_disk_slot(map, dl->index);
3134                 if (slot > 1 || slot < 0)
3135                         continue;
3136
3137                 if (dl->fd < 0) {
3138                         sprintf(nm, "%d:%d", dl->major, dl->minor);
3139                         fd = dev_open(nm, O_RDONLY);
3140                         if (fd >= 0) {
3141                                 keep_fd = 0;
3142                                 break;
3143                         }
3144                 } else {
3145                         fd = dl->fd;
3146                         break;
3147                 }
3148         }
3149
3150         if (fd < 0)
3151                 return retval;
3152         retval = read_imsm_migr_rec(fd, super);
3153         if (!keep_fd)
3154                 close(fd);
3155
3156         return retval;
3157 }
3158
3159 /*******************************************************************************
3160  * function: imsm_create_metadata_checkpoint_update
3161  * Description: It creates update for checkpoint change.
3162  * Parameters:
3163  *      super   : imsm internal array info
3164  *      u       : pointer to prepared update
3165  * Returns:
3166  *      Uptate length.
3167  *      If length is equal to 0, input pointer u contains no update
3168  ******************************************************************************/
3169 static int imsm_create_metadata_checkpoint_update(
3170         struct intel_super *super,
3171         struct imsm_update_general_migration_checkpoint **u)
3172 {
3173
3174         int update_memory_size = 0;
3175
3176         dprintf("(enter)\n");
3177
3178         if (u == NULL)
3179                 return 0;
3180         *u = NULL;
3181
3182         /* size of all update data without anchor */
3183         update_memory_size =
3184                 sizeof(struct imsm_update_general_migration_checkpoint);
3185
3186         *u = xcalloc(1, update_memory_size);
3187         if (*u == NULL) {
3188                 dprintf("error: cannot get memory\n");
3189                 return 0;
3190         }
3191         (*u)->type = update_general_migration_checkpoint;
3192         (*u)->curr_migr_unit = current_migr_unit(super->migr_rec);
3193         dprintf("prepared for %llu\n", (unsigned long long)(*u)->curr_migr_unit);
3194
3195         return update_memory_size;
3196 }
3197
3198 static void imsm_update_metadata_locally(struct supertype *st,
3199                                          void *buf, int len);
3200
3201 /*******************************************************************************
3202  * Function:    write_imsm_migr_rec
3203  * Description: Function writes imsm migration record
3204  *              (at the last sector of disk)
3205  * Parameters:
3206  *      super   : imsm internal array info
3207  * Returns:
3208  *       0 : success
3209  *      -1 : if fail
3210  ******************************************************************************/
3211 static int write_imsm_migr_rec(struct supertype *st)
3212 {
3213         struct intel_super *super = st->sb;
3214         unsigned int sector_size = super->sector_size;
3215         unsigned long long dsize;
3216         int retval = -1;
3217         struct dl *sd;
3218         int len;
3219         struct imsm_update_general_migration_checkpoint *u;
3220         struct imsm_dev *dev;
3221         struct imsm_map *map;
3222
3223         /* find map under migration */
3224         dev = imsm_get_device_during_migration(super);
3225         /* if no migration, write buffer anyway to clear migr_record
3226          * on disk based on first available device
3227         */
3228         if (dev == NULL)
3229                 dev = get_imsm_dev(super, super->current_vol < 0 ? 0 :
3230                                           super->current_vol);
3231
3232         map = get_imsm_map(dev, MAP_0);
3233
3234         if (sector_size == 4096)
3235                 convert_to_4k_imsm_migr_rec(super);
3236         for (sd = super->disks ; sd ; sd = sd->next) {
3237                 int slot = -1;
3238
3239                 /* skip failed and spare devices */
3240                 if (sd->index < 0)
3241                         continue;
3242                 /* write to 2 first slots only */
3243                 if (map)
3244                         slot = get_imsm_disk_slot(map, sd->index);
3245                 if (map == NULL || slot > 1 || slot < 0)
3246                         continue;
3247
3248                 get_dev_size(sd->fd, NULL, &dsize);
3249                 if (lseek64(sd->fd, dsize - (MIGR_REC_SECTOR_POSITION *
3250                     sector_size),
3251                     SEEK_SET) < 0) {
3252                         pr_err("Cannot seek to anchor block: %s\n",
3253                                strerror(errno));
3254                         goto out;
3255                 }
3256                 if ((unsigned int)write(sd->fd, super->migr_rec_buf,
3257                     MIGR_REC_BUF_SECTORS*sector_size) !=
3258                     MIGR_REC_BUF_SECTORS*sector_size) {
3259                         pr_err("Cannot write migr record block: %s\n",
3260                                strerror(errno));
3261                         goto out;
3262                 }
3263         }
3264         if (sector_size == 4096)
3265                 convert_from_4k_imsm_migr_rec(super);
3266         /* update checkpoint information in metadata */
3267         len = imsm_create_metadata_checkpoint_update(super, &u);
3268         if (len <= 0) {
3269                 dprintf("imsm: Cannot prepare update\n");
3270                 goto out;
3271         }
3272         /* update metadata locally */
3273         imsm_update_metadata_locally(st, u, len);
3274         /* and possibly remotely */
3275         if (st->update_tail) {
3276                 append_metadata_update(st, u, len);
3277                 /* during reshape we do all work inside metadata handler
3278                  * manage_reshape(), so metadata update has to be triggered
3279                  * insida it
3280                  */
3281                 flush_metadata_updates(st);
3282                 st->update_tail = &st->updates;
3283         } else
3284                 free(u);
3285
3286         retval = 0;
3287  out:
3288         return retval;
3289 }
3290
3291 /* spare/missing disks activations are not allowe when
3292  * array/container performs reshape operation, because
3293  * all arrays in container works on the same disks set
3294  */
3295 int imsm_reshape_blocks_arrays_changes(struct intel_super *super)
3296 {
3297         int rv = 0;
3298         struct intel_dev *i_dev;
3299         struct imsm_dev *dev;
3300
3301         /* check whole container
3302          */
3303         for (i_dev = super->devlist; i_dev; i_dev = i_dev->next) {
3304                 dev = i_dev->dev;
3305                 if (is_gen_migration(dev)) {
3306                         /* No repair during any migration in container
3307                          */
3308                         rv = 1;
3309                         break;
3310                 }
3311         }
3312         return rv;
3313 }
3314 static unsigned long long imsm_component_size_alignment_check(int level,
3315                                               int chunk_size,
3316                                               unsigned int sector_size,
3317                                               unsigned long long component_size)
3318 {
3319         unsigned int component_size_alignment;
3320
3321         /* check component size alignment
3322         */
3323         component_size_alignment = component_size % (chunk_size/sector_size);
3324
3325         dprintf("(Level: %i, chunk_size = %i, component_size = %llu), component_size_alignment = %u\n",
3326                 level, chunk_size, component_size,
3327                 component_size_alignment);
3328
3329         if (component_size_alignment && (level != 1) && (level != UnSet)) {
3330                 dprintf("imsm: reported component size aligned from %llu ",
3331                         component_size);
3332                 component_size -= component_size_alignment;
3333                 dprintf_cont("to %llu (%i).\n",
3334                         component_size, component_size_alignment);
3335         }
3336
3337         return component_size;
3338 }
3339
3340 /*******************************************************************************
3341  * Function:    get_bitmap_header_sector
3342  * Description: Returns the sector where the bitmap header is placed.
3343  * Parameters:
3344  *      st              : supertype information
3345  *      dev_idx         : index of the device with bitmap
3346  *
3347  * Returns:
3348  *       The sector where the bitmap header is placed
3349  ******************************************************************************/
3350 static unsigned long long get_bitmap_header_sector(struct intel_super *super,
3351                                                    int dev_idx)
3352 {
3353         struct imsm_dev *dev = get_imsm_dev(super, dev_idx);
3354         struct imsm_map *map = get_imsm_map(dev, MAP_0);
3355
3356         if (!super->sector_size) {
3357                 dprintf("sector size is not set\n");
3358                 return 0;
3359         }
3360
3361         return pba_of_lba0(map) + calc_component_size(map, dev) +
3362                (IMSM_BITMAP_HEADER_OFFSET / super->sector_size);
3363 }
3364
3365 /*******************************************************************************
3366  * Function:    get_bitmap_sector
3367  * Description: Returns the sector where the bitmap is placed.
3368  * Parameters:
3369  *      st              : supertype information
3370  *      dev_idx         : index of the device with bitmap
3371  *
3372  * Returns:
3373  *       The sector where the bitmap is placed
3374  ******************************************************************************/
3375 static unsigned long long get_bitmap_sector(struct intel_super *super,
3376                                             int dev_idx)
3377 {
3378         if (!super->sector_size) {
3379                 dprintf("sector size is not set\n");
3380                 return 0;
3381         }
3382
3383         return get_bitmap_header_sector(super, dev_idx) +
3384                (IMSM_BITMAP_HEADER_SIZE / super->sector_size);
3385 }
3386
3387 static unsigned long long get_ppl_sector(struct intel_super *super, int dev_idx)
3388 {
3389         struct imsm_dev *dev = get_imsm_dev(super, dev_idx);
3390         struct imsm_map *map = get_imsm_map(dev, MAP_0);
3391
3392         return pba_of_lba0(map) +
3393                (num_data_stripes(map) * map->blocks_per_strip);
3394 }
3395
/*
 * Fill *info with the md view of the volume selected by super->current_vol.
 *
 * info->array.raid_disks is sampled into map_disks before *info is cleared;
 * it bounds the number of dmap entries written at the end.
 *
 * When the volume has a second map (MAP_1), the level, chunk size, data
 * offset and the current disk's slot are taken from that map instead of
 * MAP_0; raid_disks is initially taken from MAP_0.
 *
 * dmap, when non-NULL, receives one byte per slot: 1 if the slot is below
 * info->array.raid_disks and its disk has CONFIGURED_DISK set, 0 otherwise.
 */
3396 static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info, char *dmap)
3397 {
3398         struct intel_super *super = st->sb;
3399         struct migr_record *migr_rec = super->migr_rec;
3400         struct imsm_dev *dev = get_imsm_dev(super, super->current_vol);
3401         struct imsm_map *map = get_imsm_map(dev, MAP_0);
3402         struct imsm_map *prev_map = get_imsm_map(dev, MAP_1);
3403         struct imsm_map *map_to_analyse = map;
3404         struct dl *dl;
             /* must be read here: the memset() below wipes *info */
3405         int map_disks = info->array.raid_disks;
3406
3407         memset(info, 0, sizeof(*info));
             /* prefer the second map whenever the volume has one */
3408         if (prev_map)
3409                 map_to_analyse = prev_map;
3410
3411         dl = super->current_disk;
3412
3413         info->container_member    = super->current_vol;
3414         info->array.raid_disks    = map->num_members;
3415         info->array.level         = get_imsm_raid_level(map_to_analyse);
3416         info->array.layout        = imsm_level_to_layout(info->array.level);
3417         info->array.md_minor      = -1;
3418         info->array.ctime         = 0;
3419         info->array.utime         = 0;
3420         info->array.chunk_size    =
3421                 __le16_to_cpu(map_to_analyse->blocks_per_strip) << 9;
3422         info->array.state         = !(dev->vol.dirty & RAIDVOL_DIRTY);
3423         info->custom_array_size   = imsm_dev_size(dev);
3424         info->recovery_blocked = imsm_reshape_blocks_arrays_changes(st->sb);
3425
3426         if (is_gen_migration(dev)) {
                     /* the "new" geometry is described by MAP_0, the
                      * current one (set above) by the second map
                      */
3427                 info->reshape_active = 1;
3428                 info->new_level = get_imsm_raid_level(map);
3429                 info->new_layout = imsm_level_to_layout(info->new_level);
3430                 info->new_chunk = __le16_to_cpu(map->blocks_per_strip) << 9;
3431                 info->delta_disks = map->num_members - prev_map->num_members;
3432                 if (info->delta_disks) {
3433                         /* this needs to be applied to every array
3434                          * in the container.
3435                          */
3436                         info->reshape_active = CONTAINER_RESHAPE;
3437                 }
3438                 /* The shape information that we give to md might have to be
3439                  * modified to cope with md's requirements for reshaping arrays.
3440                  * For example, when reshaping a RAID0, md requires it to be
3441                  * presented as a degraded RAID4.
3442                  * Also if a RAID0 is migrating to a RAID5 we need to specify
3443                  * the array as already being RAID5, but the 'before' layout
3444                  * is a RAID4-like layout.
3445                  */
3446                 switch (info->array.level) {
3447                 case 0:
3448                         switch(info->new_level) {
3449                         case 0:
3450                                 /* conversion is happening as RAID4 */
3451                                 info->array.level = 4;
3452                                 info->array.raid_disks += 1;
3453                                 break;
3454                         case 5:
3455                                 /* conversion is happening as RAID5 */
3456                                 info->array.level = 5;
3457                                 info->array.layout = ALGORITHM_PARITY_N;
3458                                 info->delta_disks -= 1;
3459                                 break;
3460                         default:
3461                                 /* FIXME error message */
3462                                 info->array.level = UnSet;
3463                                 break;
3464                         }
3465                         break;
3466                 }
3467         } else {
3468                 info->new_level = UnSet;
3469                 info->new_layout = UnSet;
3470                 info->new_chunk = info->array.chunk_size;
3471                 info->delta_disks = 0;
3472         }
3473
3474         if (dl) {
3475                 info->disk.major = dl->major;
3476                 info->disk.minor = dl->minor;
3477                 info->disk.number = dl->index;
3478                 info->disk.raid_disk = get_imsm_disk_slot(map_to_analyse,
3479                                                           dl->index);
3480         }
3481
3482         info->data_offset         = pba_of_lba0(map_to_analyse);
3483         info->component_size = calc_component_size(map, dev);
3484         info->component_size = imsm_component_size_alignment_check(
3485                                                         info->array.level,
3486                                                         info->array.chunk_size,
3487                                                         super->sector_size,
3488                                                         info->component_size);
3489         info->bb.supported = 1;
3490
3491         memset(info->uuid, 0, sizeof(info->uuid));
3492         info->recovery_start = MaxSector;
3493
             /* consistency policy: PPL for RAID5 with a distributed RWH
              * policy, none for level <= 0, otherwise bitmap or resync
              */
3494         if (info->array.level == 5 &&
3495             (dev->rwh_policy == RWH_DISTRIBUTED ||
3496              dev->rwh_policy == RWH_MULTIPLE_DISTRIBUTED)) {
3497                 info->consistency_policy = CONSISTENCY_POLICY_PPL;
3498                 info->ppl_sector = get_ppl_sector(super, super->current_vol);
3499                 if (dev->rwh_policy == RWH_MULTIPLE_DISTRIBUTED)
3500                         info->ppl_size = MULTIPLE_PPL_AREA_SIZE_IMSM >> 9;
3501                 else
3502                         info->ppl_size = (PPL_HEADER_SIZE + PPL_ENTRY_SPACE)
3503                                           >> 9;
3504         } else if (info->array.level <= 0) {
3505                 info->consistency_policy = CONSISTENCY_POLICY_NONE;
3506         } else {
3507                 if (dev->rwh_policy == RWH_BITMAP) {
3508                         info->bitmap_offset = get_bitmap_sector(super, super->current_vol);
3509                         info->consistency_policy = CONSISTENCY_POLICY_BITMAP;
3510                 } else {
3511                         info->consistency_policy = CONSISTENCY_POLICY_RESYNC;
3512                 }
3513         }
3514
3515         info->reshape_progress = 0;
3516         info->resync_start = MaxSector;
             /* an uninitialized or dirty volume is resynced from sector 0,
              * but only while no volume in the container is migrating
              */
3517         if ((map_to_analyse->map_state == IMSM_T_STATE_UNINITIALIZED ||
3518             !(info->array.state & 1)) &&
3519             imsm_reshape_blocks_arrays_changes(super) == 0) {
3520                 info->resync_start = 0;
3521         }
3522         if (dev->vol.migr_state) {
3523                 switch (migr_type(dev)) {
3524                 case MIGR_REPAIR:
3525                 case MIGR_INIT: {
3526                         __u64 blocks_per_unit = blocks_per_migr_unit(super,
3527                                                                      dev);
3528                         __u64 units = vol_curr_migr_unit(dev);
3529
3530                         info->resync_start = blocks_per_unit * units;
3531                         break;
3532                 }
3533                 case MIGR_GEN_MIGR: {
3534                         __u64 blocks_per_unit = blocks_per_migr_unit(super,
3535                                                                      dev);
3536                         __u64 units = current_migr_unit(migr_rec);
3537                         int used_disks;
3538
3539                         if (__le32_to_cpu(migr_rec->ascending_migr) &&
3540                             (units <
3541                                 (get_num_migr_units(migr_rec)-1)) &&
3542                             (super->migr_rec->rec_status ==
3543                                         __cpu_to_le32(UNIT_SRC_IN_CP_AREA)))
3544                                 units++;
3545
3546                         info->reshape_progress = blocks_per_unit * units;
3547
3548                         dprintf("IMSM: General Migration checkpoint : %llu (%llu) -> read reshape progress : %llu\n",
3549                                 (unsigned long long)units,
3550                                 (unsigned long long)blocks_per_unit,
3551                                 info->reshape_progress);
3552
3553                         used_disks = imsm_num_data_members(prev_map);
3554                         if (used_disks > 0) {
3555                                 info->custom_array_size = per_dev_array_size(map) *
3556                                         used_disks;
3557                         }
3558                 }
                     /* fallthrough - resync_start is reset to MaxSector below */
3559                 case MIGR_VERIFY:
3560                         /* we could emulate the checkpointing of
3561                          * 'sync_action=check' migrations, but for now
3562                          * we just immediately complete them
3563                          */
3564                 case MIGR_REBUILD:
3565                         /* this is handled by container_content_imsm() */
3566                 case MIGR_STATE_CHANGE:
3567                         /* FIXME handle other migrations */
3568                 default:
3569                         /* we are not dirty, so... */
3570                         info->resync_start = MaxSector;
3571                 }
3572         }
3573
             /* copy at most MAX_RAID_SERIAL_LEN bytes of the volume name and
              * terminate explicitly, since strncpy() may not
              */
3574         strncpy(info->name, (char *) dev->volume, MAX_RAID_SERIAL_LEN);
3575         info->name[MAX_RAID_SERIAL_LEN] = 0;
3576
3577         info->array.major_version = -1;
3578         info->array.minor_version = -2;
3579         sprintf(info->text_version, "/%s/%d", st->container_devnm, info->container_member);
3580         info->safe_mode_delay = 4000;  /* 4 secs like the Matrix driver */
3581         uuid_from_super_imsm(st, info->uuid);
3582
3583         if (dmap) {
3584                 int i, j;
                     /* map_disks is the caller's slot count sampled on entry */
3585                 for (i=0; i<map_disks; i++) {
3586                         dmap[i] = 0;
3587                         if (i < info->array.raid_disks) {
3588                                 struct imsm_disk *dsk;
3589                                 j = get_imsm_disk_idx(dev, i, MAP_X);
3590                                 dsk = get_imsm_disk(super, j);
3591                                 if (dsk && (dsk->status & CONFIGURED_DISK))
3592                                         dmap[i] = 1;
3593                         }
3594                 }
3595         }
3596 }
3597
3598 static __u8 imsm_check_degraded(struct intel_super *super, struct imsm_dev *dev,
3599                                 int failed, int look_in_map);
3600
3601 static int imsm_count_failed(struct intel_super *super, struct imsm_dev *dev,
3602                              int look_in_map);
3603
3604 static void manage_second_map(struct intel_super *super, struct imsm_dev *dev)
3605 {
3606         if (is_gen_migration(dev)) {
3607                 int failed;
3608                 __u8 map_state;
3609                 struct imsm_map *map2 = get_imsm_map(dev, MAP_1);
3610
3611                 failed = imsm_count_failed(super, dev, MAP_1);
3612                 map_state = imsm_check_degraded(super, dev, failed, MAP_1);
3613                 if (map2->map_state != map_state) {
3614                         map2->map_state = map_state;
3615                         super->updates_pending++;
3616                 }
3617         }
3618 }
3619
3620 static struct imsm_disk *get_imsm_missing(struct intel_super *super, __u8 index)
3621 {
3622         struct dl *d;
3623
3624         for (d = super->missing; d; d = d->next)
3625                 if (d->index == index)
3626                         return &d->disk;
3627         return NULL;
3628 }
3629
3630 static void getinfo_super_imsm(struct supertype *st, struct mdinfo *info, char *map)
3631 {
3632         struct intel_super *super = st->sb;
3633         struct imsm_disk *disk;
3634         int map_disks = info->array.raid_disks;
3635         int max_enough = -1;
3636         int i;
3637         struct imsm_super *mpb;
3638
3639         if (super->current_vol >= 0) {
3640                 getinfo_super_imsm_volume(st, info, map);
3641                 return;
3642         }
3643         memset(info, 0, sizeof(*info));
3644
3645         /* Set raid_disks to zero so that Assemble will always pull in valid
3646          * spares
3647          */
3648         info->array.raid_disks    = 0;
3649         info->array.level         = LEVEL_CONTAINER;
3650         info->array.layout        = 0;
3651         info->array.md_minor      = -1;
3652         info->array.ctime         = 0; /* N/A for imsm */
3653         info->array.utime         = 0;
3654         info->array.chunk_size    = 0;
3655
3656         info->disk.major = 0;
3657         info->disk.minor = 0;
3658         info->disk.raid_disk = -1;
3659         info->reshape_active = 0;
3660         info->array.major_version = -1;
3661         info->array.minor_version = -2;
3662         strcpy(info->text_version, "imsm");
3663         info->safe_mode_delay = 0;
3664         info->disk.number = -1;
3665         info->disk.state = 0;
3666         info->name[0] = 0;
3667         info->recovery_start = MaxSector;
3668         info->recovery_blocked = imsm_reshape_blocks_arrays_changes(st->sb);
3669         info->bb.supported = 1;
3670
3671         /* do we have the all the insync disks that we expect? */
3672         mpb = super->anchor;
3673         info->events = __le32_to_cpu(mpb->generation_num);
3674
3675         for (i = 0; i < mpb->num_raid_devs; i++) {
3676                 struct imsm_dev *dev = get_imsm_dev(super, i);
3677                 int failed, enough, j, missing = 0;
3678                 struct imsm_map *map;
3679                 __u8 state;
3680
3681                 failed = imsm_count_failed(super, dev, MAP_0);
3682                 state = imsm_check_degraded(super, dev, failed, MAP_0);
3683                 map = get_imsm_map(dev, MAP_0);
3684
3685                 /* any newly missing disks?
3686                  * (catches single-degraded vs double-degraded)
3687                  */
3688                 for (j = 0; j < map->num_members; j++) {
3689                         __u32 ord = get_imsm_ord_tbl_ent(dev, j, MAP_0);
3690                         __u32 idx = ord_to_idx(ord);
3691
3692                         if (super->disks && super->disks->index == (int)idx)
3693                                 info->disk.raid_disk = j;
3694
3695                         if (!(ord & IMSM_ORD_REBUILD) &&
3696                             get_imsm_missing(super, idx)) {
3697                                 missing = 1;
3698                                 break;
3699                         }
3700                 }
3701
3702                 if (state == IMSM_T_STATE_FAILED)
3703                         enough = -1;
3704                 else if (state == IMSM_T_STATE_DEGRADED &&
3705                          (state != map->map_state || missing))
3706                         enough = 0;
3707                 else /* we're normal, or already degraded */
3708                         enough = 1;
3709                 if (is_gen_migration(dev) && missing) {
3710                         /* during general migration we need all disks
3711                          * that process is running on.
3712                          * No new missing disk is allowed.
3713                          */
3714                         max_enough = -1;
3715                         enough = -1;
3716                         /* no more checks necessary
3717                          */
3718                         break;
3719                 }
3720                 /* in the missing/failed disk case check to see
3721                  * if at least one array is runnable
3722                  */
3723                 max_enough = max(max_enough, enough);
3724         }
3725         dprintf("enough: %d\n", max_enough);
3726         info->container_enough = max_enough;
3727
3728         if (super->disks) {
3729                 __u32 reserved = imsm_reserved_sectors(super, super->disks);
3730
3731                 disk = &super->disks->disk;
3732                 info->data_offset = total_blocks(&super->disks->disk) - reserved;
3733                 info->component_size = reserved;
3734                 info->disk.state  = is_configured(disk) ? (1 << MD_DISK_ACTIVE) : 0;
3735                 /* we don't change info->disk.raid_disk here because
3736                  * this state will be finalized in mdmon after we have
3737                  * found the 'most fresh' version of the metadata
3738                  */
3739                 info->disk.state |= is_failed(disk) ? (1 << MD_DISK_FAULTY) : 0;
3740                 info->disk.state |= (is_spare(disk) || is_journal(disk)) ?
3741                                     0 : (1 << MD_DISK_SYNC);
3742         }
3743
3744         /* only call uuid_from_super_imsm when this disk is part of a populated container,
3745          * ->compare_super may have updated the 'num_raid_devs' field for spares
3746          */
3747         if (info->disk.state & (1 << MD_DISK_SYNC) || super->anchor->num_raid_devs)
3748                 uuid_from_super_imsm(st, info->uuid);
3749         else
3750                 memcpy(info->uuid, uuid_zero, sizeof(uuid_zero));
3751
3752         /* I don't know how to compute 'map' on imsm, so use safe default */
3753         if (map) {
3754                 int i;
3755                 for (i = 0; i < map_disks; i++)
3756                         map[i] = 1;
3757         }
3758
3759 }
3760
3761 /* allocates memory and fills disk in mdinfo structure
3762  * for each disk in array */
3763 struct mdinfo *getinfo_super_disks_imsm(struct supertype *st)
3764 {
3765         struct mdinfo *mddev;
3766         struct intel_super *super = st->sb;
3767         struct imsm_disk *disk;
3768         int count = 0;
3769         struct dl *dl;
3770         if (!super || !super->disks)
3771                 return NULL;
3772         dl = super->disks;
3773         mddev = xcalloc(1, sizeof(*mddev));
3774         while (dl) {
3775                 struct mdinfo *tmp;
3776                 disk = &dl->disk;
3777                 tmp = xcalloc(1, sizeof(*tmp));
3778                 if (mddev->devs)
3779                         tmp->next = mddev->devs;
3780                 mddev->devs = tmp;
3781                 tmp->disk.number = count++;
3782                 tmp->disk.major = dl->major;
3783                 tmp->disk.minor = dl->minor;
3784                 tmp->disk.state = is_configured(disk) ?
3785                                   (1 << MD_DISK_ACTIVE) : 0;
3786                 tmp->disk.state |= is_failed(disk) ? (1 << MD_DISK_FAULTY) : 0;
3787                 tmp->disk.state |= is_spare(disk) ? 0 : (1 << MD_DISK_SYNC);
3788                 tmp->disk.raid_disk = -1;
3789                 dl = dl->next;
3790         }
3791         return mddev;
3792 }
3793
3794 static int update_super_imsm(struct supertype *st, struct mdinfo *info,
3795                              char *update, char *devname, int verbose,
3796                              int uuid_set, char *homehost)
3797 {
3798         /* For 'assemble' and 'force' we need to return non-zero if any
3799          * change was made.  For others, the return value is ignored.
3800          * Update options are:
3801          *  force-one : This device looks a bit old but needs to be included,
3802          *        update age info appropriately.
3803          *  assemble: clear any 'faulty' flag to allow this device to
3804          *              be assembled.
3805          *  force-array: Array is degraded but being forced, mark it clean
3806          *         if that will be needed to assemble it.
3807          *
3808          *  newdev:  not used ????
3809          *  grow:  Array has gained a new device - this is currently for
3810          *              linear only
3811          *  resync: mark as dirty so a resync will happen.
3812          *  name:  update the name - preserving the homehost
3813          *  uuid:  Change the uuid of the array to match watch is given
3814          *
3815          * Following are not relevant for this imsm:
3816          *  sparc2.2 : update from old dodgey metadata
3817          *  super-minor: change the preferred_minor number
3818          *  summaries:  update redundant counters.
3819          *  homehost:  update the recorded homehost
3820          *  _reshape_progress: record new reshape_progress position.
3821          */
3822         int rv = 1;
3823         struct intel_super *super = st->sb;
3824         struct imsm_super *mpb;
3825
3826         /* we can only update container info */
3827         if (!super || super->current_vol >= 0 || !super->anchor)
3828                 return 1;
3829
3830         mpb = super->anchor;
3831
3832         if (strcmp(update, "uuid") == 0) {
3833                 /* We take this to mean that the family_num should be updated.
3834                  * However that is much smaller than the uuid so we cannot really
3835                  * allow an explicit uuid to be given.  And it is hard to reliably
3836                  * know if one was.
3837                  * So if !uuid_set we know the current uuid is random and just used
3838                  * the first 'int' and copy it to the other 3 positions.
3839                  * Otherwise we require the 4 'int's to be the same as would be the
3840                  * case if we are using a random uuid.  So an explicit uuid will be
3841                  * accepted as long as all for ints are the same... which shouldn't hurt
3842                  */
3843                 if (!uuid_set) {
3844                         info->uuid[1] = info->uuid[2] = info->uuid[3] = info->uuid[0];
3845                         rv = 0;
3846                 } else {
3847                         if (info->uuid[0] != info->uuid[1] ||
3848                             info->uuid[1] != info->uuid[2] ||
3849                             info->uuid[2] != info->uuid[3])
3850                                 rv = -1;
3851                         else
3852                                 rv = 0;
3853                 }
3854                 if (rv == 0)
3855                         mpb->orig_family_num = info->uuid[0];
3856         } else if (strcmp(update, "assemble") == 0)
3857                 rv = 0;
3858         else
3859                 rv = -1;
3860
3861         /* successful update? recompute checksum */
3862         if (rv == 0)
3863                 mpb->check_sum = __le32_to_cpu(__gen_imsm_checksum(mpb));
3864
3865         return rv;
3866 }
3867
3868 static size_t disks_to_mpb_size(int disks)
3869 {
3870         size_t size;
3871
3872         size = sizeof(struct imsm_super);
3873         size += (disks - 1) * sizeof(struct imsm_disk);
3874         size += 2 * sizeof(struct imsm_dev);
3875         /* up to 2 maps per raid device (-2 for imsm_maps in imsm_dev */
3876         size += (4 - 2) * sizeof(struct imsm_map);
3877         /* 4 possible disk_ord_tbl's */
3878         size += 4 * (disks - 1) * sizeof(__u32);
3879         /* maximum bbm log */
3880         size += sizeof(struct bbm_log);
3881
3882         return size;
3883 }
3884
3885 static __u64 avail_size_imsm(struct supertype *st, __u64 devsize,
3886                              unsigned long long data_offset)
3887 {
3888         if (devsize < (MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS))
3889                 return 0;
3890
3891         return devsize - (MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS);
3892 }
3893
3894 static void free_devlist(struct intel_super *super)
3895 {
3896         struct intel_dev *dv;
3897
3898         while (super->devlist) {
3899                 dv = super->devlist->next;
3900                 free(super->devlist->dev);
3901                 free(super->devlist);
3902                 super->devlist = dv;
3903         }
3904 }
3905
/*
 * imsm_copy_dev() - copy @src into @dest.
 *
 * The number of bytes copied is sizeof_imsm_dev(src, 0); @dest must be at
 * least that large.
 */
static void imsm_copy_dev(struct imsm_dev *dest, struct imsm_dev *src)
{
	const size_t nbytes = sizeof_imsm_dev(src, 0);

	memcpy(dest, src, nbytes);
}
3910
3911 static int compare_super_imsm(struct supertype *st, struct supertype *tst,
3912                               int verbose)
3913 {
3914         /*
3915          * return:
3916          *  0 same, or first was empty, and second was copied
3917          *  1 second had wrong number
3918          *  2 wrong uuid
3919          *  3 wrong other info
3920          */
3921         struct intel_super *first = st->sb;
3922         struct intel_super *sec = tst->sb;
3923
3924         if (!first) {
3925                 st->sb = tst->sb;
3926                 tst->sb = NULL;
3927                 return 0;
3928         }
3929         /* in platform dependent environment test if the disks
3930          * use the same Intel hba
3931          * If not on Intel hba at all, allow anything.
3932          */
3933         if (!check_env("IMSM_NO_PLATFORM") && first->hba && sec->hba) {
3934                 if (first->hba->type != sec->hba->type) {
3935                         if (verbose)
3936                                 pr_err("HBAs of devices do not match %s != %s\n",
3937                                        get_sys_dev_type(first->hba->type),
3938                                        get_sys_dev_type(sec->hba->type));
3939                         return 3;
3940                 }
3941
3942                 if (first->orom != sec->orom) {
3943                         if (verbose)
3944                                 pr_err("HBAs of devices do not match %s != %s\n",
3945                                        first->hba->pci_id, sec->hba->pci_id);
3946                         return 3;
3947                 }
3948
3949         }
3950
3951         /* if an anchor does not have num_raid_devs set then it is a free
3952          * floating spare
3953          */
3954         if (first->anchor->num_raid_devs > 0 &&
3955             sec->anchor->num_raid_devs > 0) {
3956                 /* Determine if these disks might ever have been
3957                  * related.  Further disambiguation can only take place
3958                  * in load_super_imsm_all
3959                  */
3960                 __u32 first_family = first->anchor->orig_family_num;
3961                 __u32 sec_family = sec->anchor->orig_family_num;
3962
3963                 if (memcmp(first->anchor->sig, sec->anchor->sig,
3964                            MAX_SIGNATURE_LENGTH) != 0)
3965                         return 3;
3966
3967                 if (first_family == 0)
3968                         first_family = first->anchor->family_num;
3969                 if (sec_family == 0)
3970                         sec_family = sec->anchor->family_num;
3971
3972                 if (first_family != sec_family)
3973                         return 3;
3974
3975         }
3976
3977         /* if 'first' is a spare promote it to a populated mpb with sec's
3978          * family number
3979          */
3980         if (first->anchor->num_raid_devs == 0 &&
3981             sec->anchor->num_raid_devs > 0) {
3982                 int i;
3983                 struct intel_dev *dv;
3984                 struct imsm_dev *dev;
3985
3986                 /* we need to copy raid device info from sec if an allocation
3987                  * fails here we don't associate the spare
3988                  */
3989                 for (i = 0; i < sec->anchor->num_raid_devs; i++) {
3990                         dv = xmalloc(sizeof(*dv));
3991                         dev = xmalloc(sizeof_imsm_dev(get_imsm_dev(sec, i), 1));
3992                         dv->dev = dev;
3993                         dv->index = i;
3994                         dv->next = first->devlist;
3995                         first->devlist = dv;
3996                 }
3997                 if (i < sec->anchor->num_raid_devs) {
3998                         /* allocation failure */
3999                         free_devlist(first);
4000                         pr_err("imsm: failed to associate spare\n");
4001                         return 3;
4002                 }
4003                 first->anchor->num_raid_devs = sec->anchor->num_raid_devs;
4004                 first->anchor->orig_family_num = sec->anchor->orig_family_num;
4005                 first->anchor->family_num = sec->anchor->family_num;
4006                 memcpy(first->anchor->sig, sec->anchor->sig, MAX_SIGNATURE_LENGTH);
4007                 for (i = 0; i < sec->anchor->num_raid_devs; i++)
4008                         imsm_copy_dev(get_imsm_dev(first, i), get_imsm_dev(sec, i));
4009         }
4010
4011         return 0;
4012 }
4013
4014 static void fd2devname(int fd, char *name)
4015 {
4016         char *nm;
4017
4018         nm = fd2kname(fd);
4019         if (!nm)
4020                 return;
4021
4022         snprintf(name, MAX_RAID_SERIAL_LEN, "/dev/%s", nm);
4023 }
4024
4025 static int nvme_get_serial(int fd, void *buf, size_t buf_len)
4026 {
4027         char path[PATH_MAX];
4028         char *name = fd2kname(fd);
4029
4030         if (!name)
4031                 return 1;
4032
4033         if (strncmp(name, "nvme", 4) != 0)
4034                 return 1;
4035
4036         if (!diskfd_to_devpath(fd, 1, path))
4037                 return 1;
4038
4039         return devpath_to_char(path, "serial", buf, buf_len, 0);
4040 }
4041
4042 extern int scsi_get_serial(int fd, void *buf, size_t buf_len);
4043
4044 static int imsm_read_serial(int fd, char *devname,
4045                             __u8 *serial, size_t serial_buf_len)
4046 {
4047         char buf[50];
4048         int rv;
4049         size_t len;
4050         char *dest;
4051         char *src;
4052         unsigned int i;
4053
4054         memset(buf, 0, sizeof(buf));
4055
4056         rv = nvme_get_serial(fd, buf, sizeof(buf));
4057
4058         if (rv)
4059                 rv = scsi_get_serial(fd, buf, sizeof(buf));
4060
4061         if (rv && check_env("IMSM_DEVNAME_AS_SERIAL")) {
4062                 memset(serial, 0, MAX_RAID_SERIAL_LEN);
4063                 fd2devname(fd, (char *) serial);
4064                 return 0;
4065         }
4066
4067         if (rv != 0) {
4068                 if (devname)
4069                         pr_err("Failed to retrieve serial for %s\n",
4070                                devname);
4071                 return rv;
4072         }
4073
4074         /* trim all whitespace and non-printable characters and convert
4075          * ':' to ';'
4076          */
4077         for (i = 0, dest = buf; i < sizeof(buf) && buf[i]; i++) {
4078                 src = &buf[i];
4079                 if (*src > 0x20) {
4080                         /* ':' is reserved for use in placeholder serial
4081                          * numbers for missing disks
4082                          */
4083                         if (*src == ':')
4084                                 *dest++ = ';';
4085                         else
4086                                 *dest++ = *src;
4087                 }
4088         }
4089         len = dest - buf;
4090         dest = buf;
4091
4092         if (len > serial_buf_len) {
4093                 /* truncate leading characters */
4094                 dest += len - serial_buf_len;
4095                 len = serial_buf_len;
4096         }
4097
4098         memset(serial, 0, serial_buf_len);
4099         memcpy(serial, dest, len);
4100
4101         return 0;
4102 }
4103
4104 static int serialcmp(__u8 *s1, __u8 *s2)
4105 {
4106         return strncmp((char *) s1, (char *) s2, MAX_RAID_SERIAL_LEN);
4107 }
4108
4109 static void serialcpy(__u8 *dest, __u8 *src)
4110 {
4111         strncpy((char *) dest, (char *) src, MAX_RAID_SERIAL_LEN);
4112 }
4113
4114 static struct dl *serial_to_dl(__u8 *serial, struct intel_super *super)
4115 {
4116         struct dl *dl;
4117
4118         for (dl = super->disks; dl; dl = dl->next)
4119                 if (serialcmp(dl->serial, serial) == 0)
4120                         break;
4121
4122         return dl;
4123 }
4124
4125 static struct imsm_disk *
4126 __serial_to_disk(__u8 *serial, struct imsm_super *mpb, int *idx)
4127 {
4128         int i;
4129
4130         for (i = 0; i < mpb->num_disks; i++) {
4131                 struct imsm_disk *disk = __get_imsm_disk(mpb, i);
4132
4133                 if (serialcmp(disk->serial, serial) == 0) {
4134                         if (idx)
4135                                 *idx = i;
4136                         return disk;
4137                 }
4138         }
4139
4140         return NULL;
4141 }
4142
4143 static int
4144 load_imsm_disk(int fd, struct intel_super *super, char *devname, int keep_fd)
4145 {
4146         struct imsm_disk *disk;
4147         struct dl *dl;
4148         struct stat stb;
4149         int rv;
4150         char name[40];
4151         __u8 serial[MAX_RAID_SERIAL_LEN];
4152
4153         rv = imsm_read_serial(fd, devname, serial, MAX_RAID_SERIAL_LEN);
4154
4155         if (rv != 0)
4156                 return 2;
4157
4158         dl = xcalloc(1, sizeof(*dl));
4159
4160         fstat(fd, &stb);
4161         dl->major = major(stb.st_rdev);
4162         dl->minor = minor(stb.st_rdev);
4163         dl->next = super->disks;
4164         dl->fd = keep_fd ? fd : -1;
4165         assert(super->disks == NULL);
4166         super->disks = dl;
4167         serialcpy(dl->serial, serial);
4168         dl->index = -2;
4169         dl->e = NULL;
4170         fd2devname(fd, name);
4171         if (devname)
4172                 dl->devname = xstrdup(devname);
4173         else
4174                 dl->devname = xstrdup(name);
4175
4176         /* look up this disk's index in the current anchor */
4177         disk = __serial_to_disk(dl->serial, super->anchor, &dl->index);
4178         if (disk) {
4179                 dl->disk = *disk;
4180                 /* only set index on disks that are a member of a
4181                  * populated contianer, i.e. one with raid_devs
4182                  */
4183                 if (is_failed(&dl->disk))
4184                         dl->index = -2;
4185                 else if (is_spare(&dl->disk) || is_journal(&dl->disk))
4186                         dl->index = -1;
4187         }
4188
4189         return 0;
4190 }
4191
4192 /* When migrating map0 contains the 'destination' state while map1
4193  * contains the current state.  When not migrating map0 contains the
4194  * current state.  This routine assumes that map[0].map_state is set to
4195  * the current array state before being called.
4196  *
4197  * Migration is indicated by one of the following states
4198  * 1/ Idle (migr_state=0 map0state=normal||unitialized||degraded||failed)
4199  * 2/ Initialize (migr_state=1 migr_type=MIGR_INIT map0state=normal
4200  *    map1state=unitialized)
4201  * 3/ Repair (Resync) (migr_state=1 migr_type=MIGR_REPAIR  map0state=normal
4202  *    map1state=normal)
4203  * 4/ Rebuild (migr_state=1 migr_type=MIGR_REBUILD map0state=normal
4204  *    map1state=degraded)
4205  * 5/ Migration (mig_state=1 migr_type=MIGR_GEN_MIGR map0state=normal
4206  *    map1state=normal)
4207  */
4208 static void migrate(struct imsm_dev *dev, struct intel_super *super,
4209                     __u8 to_state, int migr_type)
4210 {
4211         struct imsm_map *dest;
4212         struct imsm_map *src = get_imsm_map(dev, MAP_0);
4213
4214         dev->vol.migr_state = 1;
4215         set_migr_type(dev, migr_type);
4216         set_vol_curr_migr_unit(dev, 0);
4217         dest = get_imsm_map(dev, MAP_1);
4218
4219         /* duplicate and then set the target end state in map[0] */
4220         memcpy(dest, src, sizeof_imsm_map(src));
4221         if (migr_type == MIGR_GEN_MIGR) {
4222                 __u32 ord;
4223                 int i;
4224
4225                 for (i = 0; i < src->num_members; i++) {
4226                         ord = __le32_to_cpu(src->disk_ord_tbl[i]);
4227                         set_imsm_ord_tbl_ent(src, i, ord_to_idx(ord));
4228                 }
4229         }
4230
4231         if (migr_type == MIGR_GEN_MIGR)
4232                 /* Clear migration record */
4233                 memset(super->migr_rec, 0, sizeof(struct migr_record));
4234
4235         src->map_state = to_state;
4236 }
4237
4238 static void end_migration(struct imsm_dev *dev, struct intel_super *super,
4239                           __u8 map_state)
4240 {
4241         struct imsm_map *map = get_imsm_map(dev, MAP_0);
4242         struct imsm_map *prev = get_imsm_map(dev, dev->vol.migr_state == 0 ?
4243                                                     MAP_0 : MAP_1);
4244         int i, j;
4245
4246         /* merge any IMSM_ORD_REBUILD bits that were not successfully
4247          * completed in the last migration.
4248          *
4249          * FIXME add support for raid-level-migration
4250          */
4251         if (map_state != map->map_state && (is_gen_migration(dev) == 0) &&
4252             prev->map_state != IMSM_T_STATE_UNINITIALIZED) {
4253                 /* when final map state is other than expected
4254                  * merge maps (not for migration)
4255                  */
4256                 int failed;
4257
4258                 for (i = 0; i < prev->num_members; i++)
4259                         for (j = 0; j < map->num_members; j++)
4260                                 /* during online capacity expansion
4261                                  * disks position can be changed
4262                                  * if takeover is used
4263                                  */
4264                                 if (ord_to_idx(map->disk_ord_tbl[j]) ==
4265                                     ord_to_idx(prev->disk_ord_tbl[i])) {
4266                                         map->disk_ord_tbl[j] |=
4267                                                 prev->disk_ord_tbl[i];
4268                                         break;
4269                                 }
4270                 failed = imsm_count_failed(super, dev, MAP_0);
4271                 map_state = imsm_check_degraded(super, dev, failed, MAP_0);
4272         }
4273
4274         dev->vol.migr_state = 0;
4275         set_migr_type(dev, 0);
4276         set_vol_curr_migr_unit(dev, 0);
4277         map->map_state = map_state;
4278 }
4279
4280 static int parse_raid_devices(struct intel_super *super)
4281 {
4282         int i;
4283         struct imsm_dev *dev_new;
4284         size_t len, len_migr;
4285         size_t max_len = 0;
4286         size_t space_needed = 0;
4287         struct imsm_super *mpb = super->anchor;
4288
4289         for (i = 0; i < super->anchor->num_raid_devs; i++) {
4290                 struct imsm_dev *dev_iter = __get_imsm_dev(super->anchor, i);
4291                 struct intel_dev *dv;
4292
4293                 len = sizeof_imsm_dev(dev_iter, 0);
4294                 len_migr = sizeof_imsm_dev(dev_iter, 1);
4295                 if (len_migr > len)
4296                         space_needed += len_migr - len;
4297
4298                 dv = xmalloc(sizeof(*dv));
4299                 if (max_len < len_migr)
4300                         max_len = len_migr;
4301                 if (max_len > len_migr)
4302                         space_needed += max_len - len_migr;
4303                 dev_new = xmalloc(max_len);
4304                 imsm_copy_dev(dev_new, dev_iter);
4305                 dv->dev = dev_new;
4306                 dv->index = i;
4307                 dv->next = super->devlist;
4308                 super->devlist = dv;
4309         }
4310
4311         /* ensure that super->buf is large enough when all raid devices
4312          * are migrating
4313          */
4314         if (__le32_to_cpu(mpb->mpb_size) + space_needed > super->len) {
4315                 void *buf;
4316
4317                 len = ROUND_UP(__le32_to_cpu(mpb->mpb_size) + space_needed,
4318                               super->sector_size);
4319                 if (posix_memalign(&buf, MAX_SECTOR_SIZE, len) != 0)
4320                         return 1;
4321
4322                 memcpy(buf, super->buf, super->len);
4323                 memset(buf + super->len, 0, len - super->len);
4324                 free(super->buf);
4325                 super->buf = buf;
4326                 super->len = len;
4327         }
4328
4329         super->extra_space += space_needed;
4330
4331         return 0;
4332 }
4333
4334 /*******************************************************************************
4335  * Function:    check_mpb_migr_compatibility
4336  * Description: Function checks for unsupported migration features:
4337  *              - migration optimization area (pba_of_lba0)
4338  *              - descending reshape (ascending_migr)
4339  * Parameters:
4340  *      super   : imsm metadata information
4341  * Returns:
4342  *       0 : migration is compatible
4343  *      -1 : migration is not compatible
4344  ******************************************************************************/
4345 int check_mpb_migr_compatibility(struct intel_super *super)
4346 {
4347         struct imsm_map *map0, *map1;
4348         struct migr_record *migr_rec = super->migr_rec;
4349         int i;
4350
4351         for (i = 0; i < super->anchor->num_raid_devs; i++) {
4352                 struct imsm_dev *dev_iter = __get_imsm_dev(super->anchor, i);
4353
4354                 if (dev_iter &&
4355                     dev_iter->vol.migr_state == 1 &&
4356                     dev_iter->vol.migr_type == MIGR_GEN_MIGR) {
4357                         /* This device is migrating */
4358                         map0 = get_imsm_map(dev_iter, MAP_0);
4359                         map1 = get_imsm_map(dev_iter, MAP_1);
4360                         if (pba_of_lba0(map0) != pba_of_lba0(map1))
4361                                 /* migration optimization area was used */
4362                                 return -1;
4363                         if (migr_rec->ascending_migr == 0 &&
4364                             migr_rec->dest_depth_per_unit > 0)
4365                                 /* descending reshape not supported yet */
4366                                 return -1;
4367                 }
4368         }
4369         return 0;
4370 }
4371
4372 static void __free_imsm(struct intel_super *super, int free_disks);
4373
4374 /* load_imsm_mpb - read matrix metadata
4375  * allocates super->mpb to be freed by free_imsm
4376  */
4377 static int load_imsm_mpb(int fd, struct intel_super *super, char *devname)
4378 {
4379         unsigned long long dsize;
4380         unsigned long long sectors;
4381         unsigned int sector_size = super->sector_size;
4382         struct stat;
4383         struct imsm_super *anchor;
4384         __u32 check_sum;
4385
4386         get_dev_size(fd, NULL, &dsize);
4387         if (dsize < 2*sector_size) {
4388                 if (devname)
4389                         pr_err("%s: device to small for imsm\n",
4390                                devname);
4391                 return 1;
4392         }
4393
4394         if (lseek64(fd, dsize - (sector_size * 2), SEEK_SET) < 0) {
4395                 if (devname)
4396                         pr_err("Cannot seek to anchor block on %s: %s\n",
4397                                devname, strerror(errno));
4398                 return 1;
4399         }
4400
4401         if (posix_memalign((void **)&anchor, sector_size, sector_size) != 0) {
4402                 if (devname)
4403                         pr_err("Failed to allocate imsm anchor buffer on %s\n", devname);
4404                 return 1;
4405         }
4406         if ((unsigned int)read(fd, anchor, sector_size) != sector_size) {
4407                 if (devname)
4408                         pr_err("Cannot read anchor block on %s: %s\n",
4409                                devname, strerror(errno));
4410                 free(anchor);
4411                 return 1;
4412         }
4413
4414         if (strncmp((char *) anchor->sig, MPB_SIGNATURE, MPB_SIG_LEN) != 0) {
4415                 if (devname)
4416                         pr_err("no IMSM anchor on %s\n", devname);
4417                 free(anchor);
4418                 return 2;
4419         }
4420
4421         __free_imsm(super, 0);
4422         /*  reload capability and hba */
4423
4424         /* capability and hba must be updated with new super allocation */
4425         find_intel_hba_capability(fd, super, devname);
4426         super->len = ROUND_UP(anchor->mpb_size, sector_size);
4427         if (posix_memalign(&super->buf, MAX_SECTOR_SIZE, super->len) != 0) {
4428                 if (devname)
4429                         pr_err("unable to allocate %zu byte mpb buffer\n",
4430                                super->len);
4431                 free(anchor);
4432                 return 2;
4433         }
4434         memcpy(super->buf, anchor, sector_size);
4435
4436         sectors = mpb_sectors(anchor, sector_size) - 1;
4437         free(anchor);
4438
4439         if (posix_memalign(&super->migr_rec_buf, MAX_SECTOR_SIZE,
4440             MIGR_REC_BUF_SECTORS*MAX_SECTOR_SIZE) != 0) {
4441                 pr_err("could not allocate migr_rec buffer\n");
4442                 free(super->buf);
4443                 return 2;
4444         }
4445         super->clean_migration_record_by_mdmon = 0;
4446
4447         if (!sectors) {
4448                 check_sum = __gen_imsm_checksum(super->anchor);
4449                 if (check_sum != __le32_to_cpu(super->anchor->check_sum)) {
4450                         if (devname)
4451                                 pr_err("IMSM checksum %x != %x on %s\n",
4452                                        check_sum,
4453                                        __le32_to_cpu(super->anchor->check_sum),
4454                                        devname);
4455                         return 2;
4456                 }
4457
4458                 return 0;
4459         }
4460
4461         /* read the extended mpb */
4462         if (lseek64(fd, dsize - (sector_size * (2 + sectors)), SEEK_SET) < 0) {
4463                 if (devname)
4464                         pr_err("Cannot seek to extended mpb on %s: %s\n",
4465                                devname, strerror(errno));
4466                 return 1;
4467         }
4468
4469         if ((unsigned int)read(fd, super->buf + sector_size,
4470                     super->len - sector_size) != super->len - sector_size) {
4471                 if (devname)
4472                         pr_err("Cannot read extended mpb on %s: %s\n",
4473                                devname, strerror(errno));
4474                 return 2;
4475         }
4476
4477         check_sum = __gen_imsm_checksum(super->anchor);
4478         if (check_sum != __le32_to_cpu(super->anchor->check_sum)) {
4479                 if (devname)
4480                         pr_err("IMSM checksum %x != %x on %s\n",
4481                                check_sum, __le32_to_cpu(super->anchor->check_sum),
4482                                devname);
4483                 return 3;
4484         }
4485
4486         return 0;
4487 }
4488
4489 static int read_imsm_migr_rec(int fd, struct intel_super *super);
4490
4491 /* clears hi bits in metadata if MPB_ATTRIB_2TB_DISK not set */
4492 static void clear_hi(struct intel_super *super)
4493 {
4494         struct imsm_super *mpb = super->anchor;
4495         int i, n;
4496         if (mpb->attributes & MPB_ATTRIB_2TB_DISK)
4497                 return;
4498         for (i = 0; i < mpb->num_disks; ++i) {
4499                 struct imsm_disk *disk = &mpb->disk[i];
4500                 disk->total_blocks_hi = 0;
4501         }
4502         for (i = 0; i < mpb->num_raid_devs; ++i) {
4503                 struct imsm_dev *dev = get_imsm_dev(super, i);
4504                 if (!dev)
4505                         return;
4506                 for (n = 0; n < 2; ++n) {
4507                         struct imsm_map *map = get_imsm_map(dev, n);
4508                         if (!map)
4509                                 continue;
4510                         map->pba_of_lba0_hi = 0;
4511                         map->blocks_per_member_hi = 0;
4512                         map->num_data_stripes_hi = 0;
4513                 }
4514         }
4515 }
4516
4517 static int
4518 load_and_parse_mpb(int fd, struct intel_super *super, char *devname, int keep_fd)
4519 {
4520         int err;
4521
4522         err = load_imsm_mpb(fd, super, devname);
4523         if (err)
4524                 return err;
4525         if (super->sector_size == 4096)
4526                 convert_from_4k(super);
4527         err = load_imsm_disk(fd, super, devname, keep_fd);
4528         if (err)
4529                 return err;
4530         err = parse_raid_devices(super);
4531         if (err)
4532                 return err;
4533         err = load_bbm_log(super);
4534         clear_hi(super);
4535         return err;
4536 }
4537
4538 static void __free_imsm_disk(struct dl *d)
4539 {
4540         if (d->fd >= 0)
4541                 close(d->fd);
4542         if (d->devname)
4543                 free(d->devname);
4544         if (d->e)
4545                 free(d->e);
4546         free(d);
4547
4548 }
4549
4550 static void free_imsm_disks(struct intel_super *super)
4551 {
4552         struct dl *d;
4553
4554         while (super->disks) {
4555                 d = super->disks;
4556                 super->disks = d->next;
4557                 __free_imsm_disk(d);
4558         }
4559         while (super->disk_mgmt_list) {
4560                 d = super->disk_mgmt_list;
4561                 super->disk_mgmt_list = d->next;
4562                 __free_imsm_disk(d);
4563         }
4564         while (super->missing) {
4565                 d = super->missing;
4566                 super->missing = d->next;
4567                 __free_imsm_disk(d);
4568         }
4569
4570 }
4571
4572 /* free all the pieces hanging off of a super pointer */
4573 static void __free_imsm(struct intel_super *super, int free_disks)
4574 {
4575         struct intel_hba *elem, *next;
4576
4577         if (super->buf) {
4578                 free(super->buf);
4579                 super->buf = NULL;
4580         }
4581         /* unlink capability description */
4582         super->orom = NULL;
4583         if (super->migr_rec_buf) {
4584                 free(super->migr_rec_buf);
4585                 super->migr_rec_buf = NULL;
4586         }
4587         if (free_disks)
4588                 free_imsm_disks(super);
4589         free_devlist(super);
4590         elem = super->hba;
4591         while (elem) {
4592                 if (elem->path)
4593                         free((void *)elem->path);
4594                 next = elem->next;
4595                 free(elem);
4596                 elem = next;
4597         }
4598         if (super->bbm_log)
4599                 free(super->bbm_log);
4600         super->hba = NULL;
4601 }
4602
4603 static void free_imsm(struct intel_super *super)
4604 {
4605         __free_imsm(super, 1);
4606         free(super->bb.entries);
4607         free(super);
4608 }
4609
4610 static void free_super_imsm(struct supertype *st)
4611 {
4612         struct intel_super *super = st->sb;
4613
4614         if (!super)
4615                 return;
4616
4617         free_imsm(super);
4618         st->sb = NULL;
4619 }
4620
4621 static struct intel_super *alloc_super(void)
4622 {
4623         struct intel_super *super = xcalloc(1, sizeof(*super));
4624
4625         super->current_vol = -1;
4626         super->create_offset = ~((unsigned long long) 0);
4627
4628         super->bb.entries = xmalloc(BBM_LOG_MAX_ENTRIES *
4629                                    sizeof(struct md_bb_entry));
4630         if (!super->bb.entries) {
4631                 free(super);
4632                 return NULL;
4633         }
4634
4635         return super;
4636 }
4637
4638 /*
4639  * find and allocate hba and OROM/EFI based on valid fd of RAID component device
4640  */
4641 static int find_intel_hba_capability(int fd, struct intel_super *super, char *devname)
4642 {
4643         struct sys_dev *hba_name;
4644         int rv = 0;
4645
4646         if (fd >= 0 && test_partition(fd)) {
4647                 pr_err("imsm: %s is a partition, cannot be used in IMSM\n",
4648                        devname);
4649                 return 1;
4650         }
4651         if (fd < 0 || check_env("IMSM_NO_PLATFORM")) {
4652                 super->orom = NULL;
4653                 super->hba = NULL;
4654                 return 0;
4655         }
4656         hba_name = find_disk_attached_hba(fd, NULL);
4657         if (!hba_name) {
4658                 if (devname)
4659                         pr_err("%s is not attached to Intel(R) RAID controller.\n",
4660                                devname);
4661                 return 1;
4662         }
4663         rv = attach_hba_to_super(super, hba_name);
4664         if (rv == 2) {
4665                 if (devname) {
4666                         struct intel_hba *hba = super->hba;
4667
4668                         pr_err("%s is attached to Intel(R) %s %s (%s),\n"
4669                                 "    but the container is assigned to Intel(R) %s %s (",
4670                                 devname,
4671                                 get_sys_dev_type(hba_name->type),
4672                                 hba_name->type == SYS_DEV_VMD ? "domain" : "RAID controller",
4673                                 hba_name->pci_id ? : "Err!",
4674                                 get_sys_dev_type(super->hba->type),
4675                                 hba->type == SYS_DEV_VMD ? "domain" : "RAID controller");
4676
4677                         while (hba) {
4678                                 fprintf(stderr, "%s", hba->pci_id ? : "Err!");
4679                                 if (hba->next)
4680                                         fprintf(stderr, ", ");
4681                                 hba = hba->next;
4682                         }
4683                         fprintf(stderr, ").\n"
4684                                 "    Mixing devices attached to different controllers is not allowed.\n");
4685                 }
4686                 return 2;
4687         }
4688         super->orom = find_imsm_capability(hba_name);
4689         if (!super->orom)
4690                 return 3;
4691
4692         return 0;
4693 }
4694
4695 /* find_missing - helper routine for load_super_imsm_all that identifies
4696  * disks that have disappeared from the system.  This routine relies on
4697  * the mpb being uptodate, which it is at load time.
4698  */
4699 static int find_missing(struct intel_super *super)
4700 {
4701         int i;
4702         struct imsm_super *mpb = super->anchor;
4703         struct dl *dl;
4704         struct imsm_disk *disk;
4705
4706         for (i = 0; i < mpb->num_disks; i++) {
4707                 disk = __get_imsm_disk(mpb, i);
4708                 dl = serial_to_dl(disk->serial, super);
4709                 if (dl)
4710                         continue;
4711
4712                 dl = xmalloc(sizeof(*dl));
4713                 dl->major = 0;
4714                 dl->minor = 0;
4715                 dl->fd = -1;
4716                 dl->devname = xstrdup("missing");
4717                 dl->index = i;
4718                 serialcpy(dl->serial, disk->serial);
4719                 dl->disk = *disk;
4720                 dl->e = NULL;
4721                 dl->next = super->missing;
4722                 super->missing = dl;
4723         }
4724
4725         return 0;
4726 }
4727
4728 static struct intel_disk *disk_list_get(__u8 *serial, struct intel_disk *disk_list)
4729 {
4730         struct intel_disk *idisk = disk_list;
4731
4732         while (idisk) {
4733                 if (serialcmp(idisk->disk.serial, serial) == 0)
4734                         break;
4735                 idisk = idisk->next;
4736         }
4737
4738         return idisk;
4739 }
4740
4741 static int __prep_thunderdome(struct intel_super **table, int tbl_size,
4742                               struct intel_super *super,
4743                               struct intel_disk **disk_list)
4744 {
4745         struct imsm_disk *d = &super->disks->disk;
4746         struct imsm_super *mpb = super->anchor;
4747         int i, j;
4748
4749         for (i = 0; i < tbl_size; i++) {
4750                 struct imsm_super *tbl_mpb = table[i]->anchor;
4751                 struct imsm_disk *tbl_d = &table[i]->disks->disk;
4752
4753                 if (tbl_mpb->family_num == mpb->family_num) {
4754                         if (tbl_mpb->check_sum == mpb->check_sum) {
4755                                 dprintf("mpb from %d:%d matches %d:%d\n",
4756                                         super->disks->major,
4757                                         super->disks->minor,
4758                                         table[i]->disks->major,
4759                                         table[i]->disks->minor);
4760                                 break;
4761                         }
4762
4763                         if (((is_configured(d) && !is_configured(tbl_d)) ||
4764                              is_configured(d) == is_configured(tbl_d)) &&
4765                             tbl_mpb->generation_num < mpb->generation_num) {
4766                                 /* current version of the mpb is a
4767                                  * better candidate than the one in
4768                                  * super_table, but copy over "cross
4769                                  * generational" status
4770                                  */
4771                                 struct intel_disk *idisk;
4772
4773                                 dprintf("mpb from %d:%d replaces %d:%d\n",
4774                                         super->disks->major,
4775                                         super->disks->minor,
4776                                         table[i]->disks->major,
4777                                         table[i]->disks->minor);
4778
4779                                 idisk = disk_list_get(tbl_d->serial, *disk_list);
4780                                 if (idisk && is_failed(&idisk->disk))
4781                                         tbl_d->status |= FAILED_DISK;
4782                                 break;
4783                         } else {
4784                                 struct intel_disk *idisk;
4785                                 struct imsm_disk *disk;
4786
4787                                 /* tbl_mpb is more up to date, but copy
4788                                  * over cross generational status before
4789                                  * returning
4790                                  */
4791                                 disk = __serial_to_disk(d->serial, mpb, NULL);
4792                                 if (disk && is_failed(disk))
4793                                         d->status |= FAILED_DISK;
4794
4795                                 idisk = disk_list_get(d->serial, *disk_list);
4796                                 if (idisk) {
4797                                         idisk->owner = i;
4798                                         if (disk && is_configured(disk))
4799                                                 idisk->disk.status |= CONFIGURED_DISK;
4800                                 }
4801
4802                                 dprintf("mpb from %d:%d prefer %d:%d\n",
4803                                         super->disks->major,
4804                                         super->disks->minor,
4805                                         table[i]->disks->major,
4806                                         table[i]->disks->minor);
4807
4808                                 return tbl_size;
4809                         }
4810                 }
4811         }
4812
4813         if (i >= tbl_size)
4814                 table[tbl_size++] = super;
4815         else
4816                 table[i] = super;
4817
4818         /* update/extend the merged list of imsm_disk records */
4819         for (j = 0; j < mpb->num_disks; j++) {
4820                 struct imsm_disk *disk = __get_imsm_disk(mpb, j);
4821                 struct intel_disk *idisk;
4822
4823                 idisk = disk_list_get(disk->serial, *disk_list);
4824                 if (idisk) {
4825                         idisk->disk.status |= disk->status;
4826                         if (is_configured(&idisk->disk) ||
4827                             is_failed(&idisk->disk))
4828                                 idisk->disk.status &= ~(SPARE_DISK);
4829                 } else {
4830                         idisk = xcalloc(1, sizeof(*idisk));
4831                         idisk->owner = IMSM_UNKNOWN_OWNER;
4832                         idisk->disk = *disk;
4833                         idisk->next = *disk_list;
4834                         *disk_list = idisk;
4835                 }
4836
4837                 if (serialcmp(idisk->disk.serial, d->serial) == 0)
4838                         idisk->owner = i;
4839         }
4840
4841         return tbl_size;
4842 }
4843
4844 static struct intel_super *
4845 validate_members(struct intel_super *super, struct intel_disk *disk_list,
4846                  const int owner)
4847 {
4848         struct imsm_super *mpb = super->anchor;
4849         int ok_count = 0;
4850         int i;
4851
4852         for (i = 0; i < mpb->num_disks; i++) {
4853                 struct imsm_disk *disk = __get_imsm_disk(mpb, i);
4854                 struct intel_disk *idisk;
4855
4856                 idisk = disk_list_get(disk->serial, disk_list);
4857                 if (idisk) {
4858                         if (idisk->owner == owner ||
4859                             idisk->owner == IMSM_UNKNOWN_OWNER)
4860                                 ok_count++;
4861                         else
4862                                 dprintf("'%.16s' owner %d != %d\n",
4863                                         disk->serial, idisk->owner,
4864                                         owner);
4865                 } else {
4866                         dprintf("unknown disk %x [%d]: %.16s\n",
4867                                 __le32_to_cpu(mpb->family_num), i,
4868                                 disk->serial);
4869                         break;
4870                 }
4871         }
4872
4873         if (ok_count == mpb->num_disks)
4874                 return super;
4875         return NULL;
4876 }
4877
4878 static void show_conflicts(__u32 family_num, struct intel_super *super_list)
4879 {
4880         struct intel_super *s;
4881
4882         for (s = super_list; s; s = s->next) {
4883                 if (family_num != s->anchor->family_num)
4884                         continue;
4885                 pr_err("Conflict, offlining family %#x on '%s'\n",
4886                         __le32_to_cpu(family_num), s->disks->devname);
4887         }
4888 }
4889
4890 static struct intel_super *
4891 imsm_thunderdome(struct intel_super **super_list, int len)
4892 {
4893         struct intel_super *super_table[len];
4894         struct intel_disk *disk_list = NULL;
4895         struct intel_super *champion, *spare;
4896         struct intel_super *s, **del;
4897         int tbl_size = 0;
4898         int conflict;
4899         int i;
4900
4901         memset(super_table, 0, sizeof(super_table));
4902         for (s = *super_list; s; s = s->next)
4903                 tbl_size = __prep_thunderdome(super_table, tbl_size, s, &disk_list);
4904
4905         for (i = 0; i < tbl_size; i++) {
4906                 struct imsm_disk *d;
4907                 struct intel_disk *idisk;
4908                 struct imsm_super *mpb = super_table[i]->anchor;
4909
4910                 s = super_table[i];
4911                 d = &s->disks->disk;
4912
4913                 /* 'd' must appear in merged disk list for its
4914                  * configuration to be valid
4915                  */
4916                 idisk = disk_list_get(d->serial, disk_list);
4917                 if (idisk && idisk->owner == i)
4918                         s = validate_members(s, disk_list, i);
4919                 else
4920                         s = NULL;
4921
4922                 if (!s)
4923                         dprintf("marking family: %#x from %d:%d offline\n",
4924                                 mpb->family_num,
4925                                 super_table[i]->disks->major,
4926                                 super_table[i]->disks->minor);
4927                 super_table[i] = s;
4928         }
4929
4930         /* This is where the mdadm implementation differs from the Windows
4931          * driver which has no strict concept of a container.  We can only
4932          * assemble one family from a container, so when returning a prodigal
4933          * array member to this system the code will not be able to disambiguate
4934          * the container contents that should be assembled ("foreign" versus
4935          * "local").  It requires user intervention to set the orig_family_num
4936          * to a new value to establish a new container.  The Windows driver in
4937          * this situation fixes up the volume name in place and manages the
4938          * foreign array as an independent entity.
4939          */
4940         s = NULL;
4941         spare = NULL;
4942         conflict = 0;
4943         for (i = 0; i < tbl_size; i++) {
4944                 struct intel_super *tbl_ent = super_table[i];
4945                 int is_spare = 0;
4946
4947                 if (!tbl_ent)
4948                         continue;
4949
4950                 if (tbl_ent->anchor->num_raid_devs == 0) {
4951                         spare = tbl_ent;
4952                         is_spare = 1;
4953                 }
4954
4955                 if (s && !is_spare) {
4956                         show_conflicts(tbl_ent->anchor->family_num, *super_list);
4957                         conflict++;
4958                 } else if (!s && !is_spare)
4959                         s = tbl_ent;
4960         }
4961
4962         if (!s)
4963                 s = spare;
4964         if (!s) {
4965                 champion = NULL;
4966                 goto out;
4967         }
4968         champion = s;
4969
4970         if (conflict)
4971                 pr_err("Chose family %#x on '%s', assemble conflicts to new container with '--update=uuid'\n",
4972                         __le32_to_cpu(s->anchor->family_num), s->disks->devname);
4973
4974         /* collect all dl's onto 'champion', and update them to
4975          * champion's version of the status
4976          */
4977         for (s = *super_list; s; s = s->next) {
4978                 struct imsm_super *mpb = champion->anchor;
4979                 struct dl *dl = s->disks;
4980
4981                 if (s == champion)
4982                         continue;
4983
4984                 mpb->attributes |= s->anchor->attributes & MPB_ATTRIB_2TB_DISK;
4985
4986                 for (i = 0; i < mpb->num_disks; i++) {
4987                         struct imsm_disk *disk;
4988
4989                         disk = __serial_to_disk(dl->serial, mpb, &dl->index);
4990                         if (disk) {
4991                                 dl->disk = *disk;
4992                                 /* only set index on disks that are a member of
4993                                  * a populated contianer, i.e. one with
4994                                  * raid_devs
4995                                  */
4996                                 if (is_failed(&dl->disk))
4997                                         dl->index = -2;
4998                                 else if (is_spare(&dl->disk))
4999                                         dl->index = -1;
5000                                 break;
5001                         }
5002                 }
5003
5004                 if (i >= mpb->num_disks) {
5005                         struct intel_disk *idisk;
5006
5007                         idisk = disk_list_get(dl->serial, disk_list);
5008                         if (idisk && is_spare(&idisk->disk) &&
5009                             !is_failed(&idisk->disk) && !is_configured(&idisk->disk))
5010                                 dl->index = -1;
5011                         else {
5012                                 dl->index = -2;
5013                                 continue;
5014                         }
5015                 }
5016
5017                 dl->next = champion->disks;
5018                 champion->disks = dl;
5019                 s->disks = NULL;
5020         }
5021
5022         /* delete 'champion' from super_list */
5023         for (del = super_list; *del; ) {
5024                 if (*del == champion) {
5025                         *del = (*del)->next;
5026                         break;
5027                 } else
5028                         del = &(*del)->next;
5029         }
5030         champion->next = NULL;
5031
5032  out:
5033         while (disk_list) {
5034                 struct intel_disk *idisk = disk_list;
5035
5036                 disk_list = disk_list->next;
5037                 free(idisk);
5038         }
5039
5040         return champion;
5041 }
5042
5043 static int
5044 get_sra_super_block(int fd, struct intel_super **super_list, char *devname, int *max, int keep_fd);
5045 static int get_super_block(struct intel_super **super_list, char *devnm, char *devname,
5046                            int major, int minor, int keep_fd);
5047 static int
5048 get_devlist_super_block(struct md_list *devlist, struct intel_super **super_list,
5049                         int *max, int keep_fd);
5050
5051 static int load_super_imsm_all(struct supertype *st, int fd, void **sbp,
5052                                char *devname, struct md_list *devlist,
5053                                int keep_fd)
5054 {
5055         struct intel_super *super_list = NULL;
5056         struct intel_super *super = NULL;
5057         int err = 0;
5058         int i = 0;
5059
5060         if (fd >= 0)
5061                 /* 'fd' is an opened container */
5062                 err = get_sra_super_block(fd, &super_list, devname, &i, keep_fd);
5063         else
5064                 /* get super block from devlist devices */
5065                 err = get_devlist_super_block(devlist, &super_list, &i, keep_fd);
5066         if (err)
5067                 goto error;
5068         /* all mpbs enter, maybe one leaves */
5069         super = imsm_thunderdome(&super_list, i);
5070         if (!super) {
5071                 err = 1;
5072                 goto error;
5073         }
5074
5075         if (find_missing(super) != 0) {
5076                 free_imsm(super);
5077                 err = 2;
5078                 goto error;
5079         }
5080
5081         /* load migration record */
5082         err = load_imsm_migr_rec(super);
5083         if (err == -1) {
5084                 /* migration is in progress,
5085                  * but migr_rec cannot be loaded,
5086                  */
5087                 err = 4;
5088                 goto error;
5089         }
5090
5091         /* Check migration compatibility */
5092         if (err == 0 && check_mpb_migr_compatibility(super) != 0) {
5093                 pr_err("Unsupported migration detected");
5094                 if (devname)
5095                         fprintf(stderr, " on %s\n", devname);
5096                 else
5097                         fprintf(stderr, " (IMSM).\n");
5098
5099                 err = 5;
5100                 goto error;
5101         }
5102
5103         err = 0;
5104
5105  error:
5106         while (super_list) {
5107                 struct intel_super *s = super_list;
5108
5109                 super_list = super_list->next;
5110                 free_imsm(s);
5111         }
5112
5113         if (err)
5114                 return err;
5115
5116         *sbp = super;
5117         if (fd >= 0)
5118                 strcpy(st->container_devnm, fd2devnm(fd));
5119         else
5120                 st->container_devnm[0] = 0;
5121         if (err == 0 && st->ss == NULL) {
5122                 st->ss = &super_imsm;
5123                 st->minor_version = 0;
5124                 st->max_devs = IMSM_MAX_DEVICES;
5125         }
5126         return 0;
5127 }
5128
5129 static int
5130 get_devlist_super_block(struct md_list *devlist, struct intel_super **super_list,
5131                         int *max, int keep_fd)
5132 {
5133         struct md_list *tmpdev;
5134         int err = 0;
5135         int i = 0;
5136
5137         for (i = 0, tmpdev = devlist; tmpdev; tmpdev = tmpdev->next) {
5138                 if (tmpdev->used != 1)
5139                         continue;
5140                 if (tmpdev->container == 1) {
5141                         int lmax = 0;
5142                         int fd = dev_open(tmpdev->devname, O_RDONLY|O_EXCL);
5143                         if (fd < 0) {
5144                                 pr_err("cannot open device %s: %s\n",
5145                                         tmpdev->devname, strerror(errno));
5146                                 err = 8;
5147                                 goto error;
5148                         }
5149                         err = get_sra_super_block(fd, super_list,
5150                                                   tmpdev->devname, &lmax,
5151                                                   keep_fd);
5152                         i += lmax;
5153                         close(fd);
5154                         if (err) {
5155                                 err = 7;
5156                                 goto error;
5157                         }
5158                 } else {
5159                         int major = major(tmpdev->st_rdev);
5160                         int minor = minor(tmpdev->st_rdev);
5161                         err = get_super_block(super_list,
5162                                               NULL,
5163                                               tmpdev->devname,
5164                                               major, minor,
5165                                               keep_fd);
5166                         i++;
5167                         if (err) {
5168                                 err = 6;
5169                                 goto error;
5170                         }
5171                 }
5172         }
5173  error:
5174         *max = i;
5175         return err;
5176 }
5177
5178 static int get_super_block(struct intel_super **super_list, char *devnm, char *devname,
5179                            int major, int minor, int keep_fd)
5180 {
5181         struct intel_super *s;
5182         char nm[32];
5183         int dfd = -1;
5184         int err = 0;
5185         int retry;
5186
5187         s = alloc_super();
5188         if (!s) {
5189                 err = 1;
5190                 goto error;
5191         }
5192
5193         sprintf(nm, "%d:%d", major, minor);
5194         dfd = dev_open(nm, O_RDWR);
5195         if (dfd < 0) {
5196                 err = 2;
5197                 goto error;
5198         }
5199
5200         if (!get_dev_sector_size(dfd, NULL, &s->sector_size)) {
5201                 err = 2;
5202                 goto error;
5203         }
5204         find_intel_hba_capability(dfd, s, devname);
5205         err = load_and_parse_mpb(dfd, s, NULL, keep_fd);
5206
5207         /* retry the load if we might have raced against mdmon */
5208         if (err == 3 && devnm && mdmon_running(devnm))
5209                 for (retry = 0; retry < 3; retry++) {
5210                         usleep(3000);
5211                         err = load_and_parse_mpb(dfd, s, NULL, keep_fd);
5212                         if (err != 3)
5213                                 break;
5214                 }
5215  error:
5216         if (!err) {
5217                 s->next = *super_list;
5218                 *super_list = s;
5219         } else {
5220                 if (s)
5221                         free_imsm(s);
5222                 if (dfd >= 0)
5223                         close(dfd);
5224         }
5225         if (dfd >= 0 && !keep_fd)
5226                 close(dfd);
5227         return err;
5228
5229 }
5230
5231 static int
5232 get_sra_super_block(int fd, struct intel_super **super_list, char *devname, int *max, int keep_fd)
5233 {
5234         struct mdinfo *sra;
5235         char *devnm;
5236         struct mdinfo *sd;
5237         int err = 0;
5238         int i = 0;
5239         sra = sysfs_read(fd, NULL, GET_LEVEL|GET_VERSION|GET_DEVS|GET_STATE);
5240         if (!sra)
5241                 return 1;
5242
5243         if (sra->array.major_version != -1 ||
5244             sra->array.minor_version != -2 ||
5245             strcmp(sra->text_version, "imsm") != 0) {
5246                 err = 1;
5247                 goto error;
5248         }
5249         /* load all mpbs */
5250         devnm = fd2devnm(fd);
5251         for (sd = sra->devs, i = 0; sd; sd = sd->next, i++) {
5252                 if (get_super_block(super_list, devnm, devname,
5253                                     sd->disk.major, sd->disk.minor, keep_fd) != 0) {
5254                         err = 7;
5255                         goto error;
5256                 }
5257         }
5258  error:
5259         sysfs_free(sra);
5260         *max = i;
5261         return err;
5262 }
5263
5264 static int load_container_imsm(struct supertype *st, int fd, char *devname)
5265 {
5266         return load_super_imsm_all(st, fd, &st->sb, devname, NULL, 1);
5267 }
5268
5269 static int load_super_imsm(struct supertype *st, int fd, char *devname)
5270 {
5271         struct intel_super *super;
5272         int rv;
5273         int retry;
5274
5275         if (test_partition(fd))
5276                 /* IMSM not allowed on partitions */
5277                 return 1;
5278
5279         free_super_imsm(st);
5280
5281         super = alloc_super();
5282         if (!get_dev_sector_size(fd, NULL, &super->sector_size))
5283                 return 1;
5284         if (!super)
5285                 return 1;
5286         /* Load hba and capabilities if they exist.
5287          * But do not preclude loading metadata in case capabilities or hba are
5288          * non-compliant and ignore_hw_compat is set.
5289          */
5290         rv = find_intel_hba_capability(fd, super, devname);
5291         /* no orom/efi or non-intel hba of the disk */
5292         if (rv != 0 && st->ignore_hw_compat == 0) {
5293                 if (devname)
5294                         pr_err("No OROM/EFI properties for %s\n", devname);
5295                 free_imsm(super);
5296                 return 2;
5297         }
5298         rv = load_and_parse_mpb(fd, super, devname, 0);
5299
5300         /* retry the load if we might have raced against mdmon */
5301         if (rv == 3) {
5302                 struct mdstat_ent *mdstat = NULL;
5303                 char *name = fd2kname(fd);
5304
5305                 if (name)
5306                         mdstat = mdstat_by_component(name);
5307
5308                 if (mdstat && mdmon_running(mdstat->devnm) && getpid() != mdmon_pid(mdstat->devnm)) {
5309                         for (retry = 0; retry < 3; retry++) {
5310                                 usleep(3000);
5311                                 rv = load_and_parse_mpb(fd, super, devname, 0);
5312                                 if (rv != 3)
5313                                         break;
5314                         }
5315                 }
5316
5317                 free_mdstat(mdstat);
5318         }
5319
5320         if (rv) {
5321                 if (devname)
5322                         pr_err("Failed to load all information sections on %s\n", devname);
5323                 free_imsm(super);
5324                 return rv;
5325         }
5326
5327         st->sb = super;
5328         if (st->ss == NULL) {
5329                 st->ss = &super_imsm;
5330                 st->minor_version = 0;
5331                 st->max_devs = IMSM_MAX_DEVICES;
5332         }
5333
5334         /* load migration record */
5335         if (load_imsm_migr_rec(super) == 0) {
5336                 /* Check for unsupported migration features */
5337                 if (check_mpb_migr_compatibility(super) != 0) {
5338                         pr_err("Unsupported migration detected");
5339                         if (devname)
5340                                 fprintf(stderr, " on %s\n", devname);
5341                         else
5342                                 fprintf(stderr, " (IMSM).\n");
5343                         return 3;
5344                 }
5345         }
5346
5347         return 0;
5348 }
5349
5350 static __u16 info_to_blocks_per_strip(mdu_array_info_t *info)
5351 {
5352         if (info->level == 1)
5353                 return 128;
5354         return info->chunk_size >> 9;
5355 }
5356
5357 static unsigned long long info_to_blocks_per_member(mdu_array_info_t *info,
5358                                                     unsigned long long size)
5359 {
5360         if (info->level == 1)
5361                 return size * 2;
5362         else
5363                 return (size * 2) & ~(info_to_blocks_per_strip(info) - 1);
5364 }
5365
5366 static void imsm_update_version_info(struct intel_super *super)
5367 {
5368         /* update the version and attributes */
5369         struct imsm_super *mpb = super->anchor;
5370         char *version;
5371         struct imsm_dev *dev;
5372         struct imsm_map *map;
5373         int i;
5374
5375         for (i = 0; i < mpb->num_raid_devs; i++) {
5376                 dev = get_imsm_dev(super, i);
5377                 map = get_imsm_map(dev, MAP_0);
5378                 if (__le32_to_cpu(dev->size_high) > 0)
5379                         mpb->attributes |= MPB_ATTRIB_2TB;
5380
5381                 /* FIXME detect when an array spans a port multiplier */
5382                 #if 0
5383                 mpb->attributes |= MPB_ATTRIB_PM;
5384                 #endif
5385
5386                 if (mpb->num_raid_devs > 1 ||
5387                     mpb->attributes != MPB_ATTRIB_CHECKSUM_VERIFY) {
5388                         version = MPB_VERSION_ATTRIBS;
5389                         switch (get_imsm_raid_level(map)) {
5390                         case 0: mpb->attributes |= MPB_ATTRIB_RAID0; break;
5391                         case 1: mpb->attributes |= MPB_ATTRIB_RAID1; break;
5392                         case 10: mpb->attributes |= MPB_ATTRIB_RAID10; break;
5393                         case 5: mpb->attributes |= MPB_ATTRIB_RAID5; break;
5394                         }
5395                 } else {
5396                         if (map->num_members >= 5)
5397                                 version = MPB_VERSION_5OR6_DISK_ARRAY;
5398                         else if (dev->status == DEV_CLONE_N_GO)
5399                                 version = MPB_VERSION_CNG;
5400                         else if (get_imsm_raid_level(map) == 5)
5401                                 version = MPB_VERSION_RAID5;
5402                         else if (map->num_members >= 3)
5403                                 version = MPB_VERSION_3OR4_DISK_ARRAY;
5404                         else if (get_imsm_raid_level(map) == 1)
5405                                 version = MPB_VERSION_RAID1;
5406                         else
5407                                 version = MPB_VERSION_RAID0;
5408                 }
5409                 strcpy(((char *) mpb->sig) + strlen(MPB_SIGNATURE), version);
5410         }
5411 }
5412
5413 static int check_name(struct intel_super *super, char *name, int quiet)
5414 {
5415         struct imsm_super *mpb = super->anchor;
5416         char *reason = NULL;
5417         char *start = name;
5418         size_t len = strlen(name);
5419         int i;
5420
5421         if (len > 0) {
5422                 while (isspace(start[len - 1]))
5423                         start[--len] = 0;
5424                 while (*start && isspace(*start))
5425                         ++start, --len;
5426                 memmove(name, start, len + 1);
5427         }
5428
5429         if (len > MAX_RAID_SERIAL_LEN)
5430                 reason = "must be 16 characters or less";
5431         else if (len == 0)
5432                 reason = "must be a non-empty string";
5433
5434         for (i = 0; i < mpb->num_raid_devs; i++) {
5435                 struct imsm_dev *dev = get_imsm_dev(super, i);
5436
5437                 if (strncmp((char *) dev->volume, name, MAX_RAID_SERIAL_LEN) == 0) {
5438                         reason = "already exists";
5439                         break;
5440                 }
5441         }
5442
5443         if (reason && !quiet)
5444                 pr_err("imsm volume name %s\n", reason);
5445
5446         return !reason;
5447 }
5448
5449 static int init_super_imsm_volume(struct supertype *st, mdu_array_info_t *info,
5450                                   struct shape *s, char *name,
5451                                   char *homehost, int *uuid,
5452                                   long long data_offset)
5453 {
5454         /* We are creating a volume inside a pre-existing container.
5455          * so st->sb is already set.
5456          */
5457         struct intel_super *super = st->sb;
5458         unsigned int sector_size = super->sector_size;
5459         struct imsm_super *mpb = super->anchor;
5460         struct intel_dev *dv;
5461         struct imsm_dev *dev;
5462         struct imsm_vol *vol;
5463         struct imsm_map *map;
5464         int idx = mpb->num_raid_devs;
5465         int i;
5466         int namelen;
5467         unsigned long long array_blocks;
5468         size_t size_old, size_new;
5469         unsigned long long num_data_stripes;
5470         unsigned int data_disks;
5471         unsigned long long size_per_member;
5472
5473         if (super->orom && mpb->num_raid_devs >= super->orom->vpa) {
5474                 pr_err("This imsm-container already has the maximum of %d volumes\n", super->orom->vpa);
5475                 return 0;
5476         }
5477
5478         /* ensure the mpb is large enough for the new data */
5479         size_old = __le32_to_cpu(mpb->mpb_size);
5480         size_new = disks_to_mpb_size(info->nr_disks);
5481         if (size_new > size_old) {
5482                 void *mpb_new;
5483                 size_t size_round = ROUND_UP(size_new, sector_size);
5484
5485                 if (posix_memalign(&mpb_new, sector_size, size_round) != 0) {
5486                         pr_err("could not allocate new mpb\n");
5487                         return 0;
5488                 }
5489                 if (posix_memalign(&super->migr_rec_buf, MAX_SECTOR_SIZE,
5490                                    MIGR_REC_BUF_SECTORS*
5491                                    MAX_SECTOR_SIZE) != 0) {
5492                         pr_err("could not allocate migr_rec buffer\n");
5493                         free(super->buf);
5494                         free(super);
5495                         free(mpb_new);
5496                         return 0;
5497                 }
5498                 memcpy(mpb_new, mpb, size_old);
5499                 free(mpb);
5500                 mpb = mpb_new;
5501                 super->anchor = mpb_new;
5502                 mpb->mpb_size = __cpu_to_le32(size_new);
5503                 memset(mpb_new + size_old, 0, size_round - size_old);
5504                 super->len = size_round;
5505         }
5506         super->current_vol = idx;
5507
5508         /* handle 'failed_disks' by either:
5509          * a) create dummy disk entries in the table if this the first
5510          *    volume in the array.  We add them here as this is the only
5511          *    opportunity to add them. add_to_super_imsm_volume()
5512          *    handles the non-failed disks and continues incrementing
5513          *    mpb->num_disks.
5514          * b) validate that 'failed_disks' matches the current number
5515          *    of missing disks if the container is populated
5516          */
5517         if (super->current_vol == 0) {
5518                 mpb->num_disks = 0;
5519                 for (i = 0; i < info->failed_disks; i++) {
5520                         struct imsm_disk *disk;
5521
5522                         mpb->num_disks++;
5523                         disk = __get_imsm_disk(mpb, i);
5524                         disk->status = CONFIGURED_DISK | FAILED_DISK;
5525                         disk->scsi_id = __cpu_to_le32(~(__u32)0);
5526                         snprintf((char *) disk->serial, MAX_RAID_SERIAL_LEN,
5527                                  "missing:%d", (__u8)i);
5528                 }
5529                 find_missing(super);
5530         } else {
5531                 int missing = 0;
5532                 struct dl *d;
5533
5534                 for (d = super->missing; d; d = d->next)
5535                         missing++;
5536                 if (info->failed_disks > missing) {
5537                         pr_err("unable to add 'missing' disk to container\n");
5538                         return 0;
5539                 }
5540         }
5541
5542         if (!check_name(super, name, 0))
5543                 return 0;
5544         dv = xmalloc(sizeof(*dv));
5545         dev = xcalloc(1, sizeof(*dev) + sizeof(__u32) * (info->raid_disks - 1));
5546         /*
5547          * Explicitly allow truncating to not confuse gcc's
5548          * -Werror=stringop-truncation
5549          */
5550         namelen = min((int) strlen(name), MAX_RAID_SERIAL_LEN);
5551         memcpy(dev->volume, name, namelen);
5552         array_blocks = calc_array_size(info->level, info->raid_disks,
5553                                                info->layout, info->chunk_size,
5554                                                s->size * BLOCKS_PER_KB);
5555         data_disks = get_data_disks(info->level, info->layout,
5556                                     info->raid_disks);
5557         array_blocks = round_size_to_mb(array_blocks, data_disks);
5558         size_per_member = array_blocks / data_disks;
5559
5560         set_imsm_dev_size(dev, array_blocks);
5561         dev->status = (DEV_READ_COALESCING | DEV_WRITE_COALESCING);
5562         vol = &dev->vol;
5563         vol->migr_state = 0;
5564         set_migr_type(dev, MIGR_INIT);
5565         vol->dirty = !info->state;
5566         set_vol_curr_migr_unit(dev, 0);
5567         map = get_imsm_map(dev, MAP_0);
5568         set_pba_of_lba0(map, super->create_offset);
5569         map->blocks_per_strip = __cpu_to_le16(info_to_blocks_per_strip(info));
5570         map->failed_disk_num = ~0;
5571         if (info->level > 0)
5572                 map->map_state = (info->state ? IMSM_T_STATE_NORMAL
5573                                   : IMSM_T_STATE_UNINITIALIZED);
5574         else
5575                 map->map_state = info->failed_disks ? IMSM_T_STATE_FAILED :
5576                                                       IMSM_T_STATE_NORMAL;
5577         map->ddf = 1;
5578
5579         if (info->level == 1 && info->raid_disks > 2) {
5580                 free(dev);
5581                 free(dv);
5582                 pr_err("imsm does not support more than 2 disksin a raid1 volume\n");
5583                 return 0;
5584         }
5585
5586         map->raid_level = info->level;
5587         if (info->level == 10) {
5588                 map->raid_level = 1;
5589                 map->num_domains = info->raid_disks / 2;
5590         } else if (info->level == 1)
5591                 map->num_domains = info->raid_disks;
5592         else
5593                 map->num_domains = 1;
5594
5595         /* info->size is only int so use the 'size' parameter instead */
5596         num_data_stripes = size_per_member / info_to_blocks_per_strip(info);
5597         num_data_stripes /= map->num_domains;
5598         set_num_data_stripes(map, num_data_stripes);
5599
5600         size_per_member += NUM_BLOCKS_DIRTY_STRIPE_REGION;
5601         set_blocks_per_member(map, info_to_blocks_per_member(info,
5602                                                              size_per_member /
5603                                                              BLOCKS_PER_KB));
5604
5605         map->num_members = info->raid_disks;
5606         for (i = 0; i < map->num_members; i++) {
5607                 /* initialized in add_to_super */
5608                 set_imsm_ord_tbl_ent(map, i, IMSM_ORD_REBUILD);
5609         }
5610         mpb->num_raid_devs++;
5611         mpb->num_raid_devs_created++;
5612         dev->my_vol_raid_dev_num = mpb->num_raid_devs_created;
5613
5614         if (s->consistency_policy <= CONSISTENCY_POLICY_RESYNC) {
5615                 dev->rwh_policy = RWH_MULTIPLE_OFF;
5616         } else if (s->consistency_policy == CONSISTENCY_POLICY_PPL) {
5617                 dev->rwh_policy = RWH_MULTIPLE_DISTRIBUTED;
5618         } else {
5619                 free(dev);
5620                 free(dv);
5621                 pr_err("imsm does not support consistency policy %s\n",
5622                        map_num(consistency_policies, s->consistency_policy));
5623                 return 0;
5624         }
5625
5626         dv->dev = dev;
5627         dv->index = super->current_vol;
5628         dv->next = super->devlist;
5629         super->devlist = dv;
5630
5631         imsm_update_version_info(super);
5632
5633         return 1;
5634 }
5635
5636 static int init_super_imsm(struct supertype *st, mdu_array_info_t *info,
5637                            struct shape *s, char *name,
5638                            char *homehost, int *uuid,
5639                            unsigned long long data_offset)
5640 {
5641         /* This is primarily called by Create when creating a new array.
5642          * We will then get add_to_super called for each component, and then
5643          * write_init_super called to write it out to each device.
5644          * For IMSM, Create can create on fresh devices or on a pre-existing
5645          * array.
5646          * To create on a pre-existing array a different method will be called.
5647          * This one is just for fresh drives.
5648          */
5649         struct intel_super *super;
5650         struct imsm_super *mpb;
5651         size_t mpb_size;
5652         char *version;
5653
5654         if (data_offset != INVALID_SECTORS) {
5655                 pr_err("data-offset not supported by imsm\n");
5656                 return 0;
5657         }
5658
5659         if (st->sb)
5660                 return init_super_imsm_volume(st, info, s, name, homehost, uuid,
5661                                               data_offset);
5662
5663         if (info)
5664                 mpb_size = disks_to_mpb_size(info->nr_disks);
5665         else
5666                 mpb_size = MAX_SECTOR_SIZE;
5667
5668         super = alloc_super();
5669         if (super &&
5670             posix_memalign(&super->buf, MAX_SECTOR_SIZE, mpb_size) != 0) {
5671                 free_imsm(super);
5672                 super = NULL;
5673         }
5674         if (!super) {
5675                 pr_err("could not allocate superblock\n");
5676                 return 0;
5677         }
5678         if (posix_memalign(&super->migr_rec_buf, MAX_SECTOR_SIZE,
5679             MIGR_REC_BUF_SECTORS*MAX_SECTOR_SIZE) != 0) {
5680                 pr_err("could not allocate migr_rec buffer\n");
5681                 free(super->buf);
5682                 free_imsm(super);
5683                 return 0;
5684         }
5685         memset(super->buf, 0, mpb_size);
5686         mpb = super->buf;
5687         mpb->mpb_size = __cpu_to_le32(mpb_size);
5688         st->sb = super;
5689
5690         if (info == NULL) {
5691                 /* zeroing superblock */
5692                 return 0;
5693         }
5694
5695         mpb->attributes = MPB_ATTRIB_CHECKSUM_VERIFY;
5696
5697         version = (char *) mpb->sig;
5698         strcpy(version, MPB_SIGNATURE);
5699         version += strlen(MPB_SIGNATURE);
5700         strcpy(version, MPB_VERSION_RAID0);
5701
5702         return 1;
5703 }
5704
5705 static int drive_validate_sector_size(struct intel_super *super, struct dl *dl)
5706 {
5707         unsigned int member_sector_size;
5708
5709         if (dl->fd < 0) {
5710                 pr_err("Invalid file descriptor for %s\n", dl->devname);
5711                 return 0;
5712         }
5713
5714         if (!get_dev_sector_size(dl->fd, dl->devname, &member_sector_size))
5715                 return 0;
5716         if (member_sector_size != super->sector_size)
5717                 return 0;
5718         return 1;
5719 }
5720
5721 static int add_to_super_imsm_volume(struct supertype *st, mdu_disk_info_t *dk,
5722                                      int fd, char *devname)
5723 {
5724         struct intel_super *super = st->sb;
5725         struct imsm_super *mpb = super->anchor;
5726         struct imsm_disk *_disk;
5727         struct imsm_dev *dev;
5728         struct imsm_map *map;
5729         struct dl *dl, *df;
5730         int slot;
5731
5732         dev = get_imsm_dev(super, super->current_vol);
5733         map = get_imsm_map(dev, MAP_0);
5734
5735         if (! (dk->state & (1<<MD_DISK_SYNC))) {
5736                 pr_err("%s: Cannot add spare devices to IMSM volume\n",
5737                         devname);
5738                 return 1;
5739         }
5740
5741         if (fd == -1) {
5742                 /* we're doing autolayout so grab the pre-marked (in
5743                  * validate_geometry) raid_disk
5744                  */
5745                 for (dl = super->disks; dl; dl = dl->next)
5746                         if (dl->raiddisk == dk->raid_disk)
5747                                 break;
5748         } else {
5749                 for (dl = super->disks; dl ; dl = dl->next)
5750                         if (dl->major == dk->major &&
5751                             dl->minor == dk->minor)
5752                                 break;
5753         }
5754
5755         if (!dl) {
5756                 pr_err("%s is not a member of the same container\n", devname);
5757                 return 1;
5758         }
5759
5760         if (mpb->num_disks == 0)
5761                 if (!get_dev_sector_size(dl->fd, dl->devname,
5762                                          &super->sector_size))
5763                         return 1;
5764
5765         if (!drive_validate_sector_size(super, dl)) {
5766                 pr_err("Combining drives of different sector size in one volume is not allowed\n");
5767                 return 1;
5768         }
5769
5770         /* add a pristine spare to the metadata */
5771         if (dl->index < 0) {
5772                 dl->index = super->anchor->num_disks;
5773                 super->anchor->num_disks++;
5774         }
5775         /* Check the device has not already been added */
5776         slot = get_imsm_disk_slot(map, dl->index);
5777         if (slot >= 0 &&
5778             (get_imsm_ord_tbl_ent(dev, slot, MAP_X) & IMSM_ORD_REBUILD) == 0) {
5779                 pr_err("%s has been included in this array twice\n",
5780                         devname);
5781                 return 1;
5782         }
5783         set_imsm_ord_tbl_ent(map, dk->raid_disk, dl->index);
5784         dl->disk.status = CONFIGURED_DISK;
5785
5786         /* update size of 'missing' disks to be at least as large as the
5787          * largest acitve member (we only have dummy missing disks when
5788          * creating the first volume)
5789          */
5790         if (super->current_vol == 0) {
5791                 for (df = super->missing; df; df = df->next) {
5792                         if (total_blocks(&dl->disk) > total_blocks(&df->disk))
5793                                 set_total_blocks(&df->disk, total_blocks(&dl->disk));
5794                         _disk = __get_imsm_disk(mpb, df->index);
5795                         *_disk = df->disk;
5796                 }
5797         }
5798
5799         /* refresh unset/failed slots to point to valid 'missing' entries */
5800         for (df = super->missing; df; df = df->next)
5801                 for (slot = 0; slot < mpb->num_disks; slot++) {
5802                         __u32 ord = get_imsm_ord_tbl_ent(dev, slot, MAP_X);
5803
5804                         if ((ord & IMSM_ORD_REBUILD) == 0)
5805                                 continue;
5806                         set_imsm_ord_tbl_ent(map, slot, df->index | IMSM_ORD_REBUILD);
5807                         if (is_gen_migration(dev)) {
5808                                 struct imsm_map *map2 = get_imsm_map(dev,
5809                                                                      MAP_1);
5810                                 int slot2 = get_imsm_disk_slot(map2, df->index);
5811                                 if (slot2 < map2->num_members && slot2 >= 0) {
5812                                         __u32 ord2 = get_imsm_ord_tbl_ent(dev,
5813                                                                          slot2,
5814                                                                          MAP_1);
5815                                         if ((unsigned)df->index ==
5816                                                                ord_to_idx(ord2))
5817                                                 set_imsm_ord_tbl_ent(map2,
5818                                                         slot2,
5819                                                         df->index |
5820                                                         IMSM_ORD_REBUILD);
5821                                 }
5822                         }
5823                         dprintf("set slot:%d to missing disk:%d\n", slot, df->index);
5824                         break;
5825                 }
5826
5827         /* if we are creating the first raid device update the family number */
5828         if (super->current_vol == 0) {
5829                 __u32 sum;
5830                 struct imsm_dev *_dev = __get_imsm_dev(mpb, 0);
5831
5832                 _disk = __get_imsm_disk(mpb, dl->index);
5833                 if (!_dev || !_disk) {
5834                         pr_err("BUG mpb setup error\n");
5835                         return 1;
5836                 }
5837                 *_dev = *dev;
5838                 *_disk = dl->disk;
5839                 sum = random32();
5840                 sum += __gen_imsm_checksum(mpb);
5841                 mpb->family_num = __cpu_to_le32(sum);
5842                 mpb->orig_family_num = mpb->family_num;
5843                 mpb->creation_time = __cpu_to_le64((__u64)time(NULL));
5844         }
5845         super->current_disk = dl;
5846         return 0;
5847 }
5848
5849 /* mark_spare()
5850  *   Function marks disk as spare and restores disk serial
5851  *   in case it was previously marked as failed by takeover operation
5852  * reruns:
5853  *   -1 : critical error
5854  *    0 : disk is marked as spare but serial is not set
5855  *    1 : success
5856  */
5857 int mark_spare(struct dl *disk)
5858 {
5859         __u8 serial[MAX_RAID_SERIAL_LEN];
5860         int ret_val = -1;
5861
5862         if (!disk)
5863                 return ret_val;
5864
5865         ret_val = 0;
5866         if (!imsm_read_serial(disk->fd, NULL, serial, MAX_RAID_SERIAL_LEN)) {
5867                 /* Restore disk serial number, because takeover marks disk
5868                  * as failed and adds to serial ':0' before it becomes
5869                  * a spare disk.
5870                  */
5871                 serialcpy(disk->serial, serial);
5872                 serialcpy(disk->disk.serial, serial);
5873                 ret_val = 1;
5874         }
5875         disk->disk.status = SPARE_DISK;
5876         disk->index = -1;
5877
5878         return ret_val;
5879 }
5880
5881
5882 static int write_super_imsm_spare(struct intel_super *super, struct dl *d);
5883
5884 static int add_to_super_imsm(struct supertype *st, mdu_disk_info_t *dk,
5885                              int fd, char *devname,
5886                              unsigned long long data_offset)
5887 {
5888         struct intel_super *super = st->sb;
5889         struct dl *dd;
5890         unsigned long long size;
5891         unsigned int member_sector_size;
5892         __u32 id;
5893         int rv;
5894         struct stat stb;
5895
5896         /* If we are on an RAID enabled platform check that the disk is
5897          * attached to the raid controller.
5898          * We do not need to test disks attachment for container based additions,
5899          * they shall be already tested when container was created/assembled.
5900          */
5901         rv = find_intel_hba_capability(fd, super, devname);
5902         /* no orom/efi or non-intel hba of the disk */
5903         if (rv != 0) {
5904                 dprintf("capability: %p fd: %d ret: %d\n",
5905                         super->orom, fd, rv);
5906                 return 1;
5907         }
5908
5909         if (super->current_vol >= 0)
5910                 return add_to_super_imsm_volume(st, dk, fd, devname);
5911
5912         fstat(fd, &stb);
5913         dd = xcalloc(sizeof(*dd), 1);
5914         dd->major = major(stb.st_rdev);
5915         dd->minor = minor(stb.st_rdev);
5916         dd->devname = devname ? xstrdup(devname) : NULL;
5917         dd->fd = fd;
5918         dd->e = NULL;
5919         dd->action = DISK_ADD;
5920         rv = imsm_read_serial(fd, devname, dd->serial, MAX_RAID_SERIAL_LEN);
5921         if (rv) {
5922                 pr_err("failed to retrieve scsi serial, aborting\n");
5923                 if (dd->devname)
5924                         free(dd->devname);
5925                 free(dd);
5926                 abort();
5927         }
5928
5929         if (super->hba && ((super->hba->type == SYS_DEV_NVME) ||
5930            (super->hba->type == SYS_DEV_VMD))) {
5931                 int i;
5932                 char cntrl_path[PATH_MAX];
5933                 char *cntrl_name;
5934                 char pci_dev_path[PATH_MAX];
5935
5936                 if (!diskfd_to_devpath(fd, 2, pci_dev_path) ||
5937                     !diskfd_to_devpath(fd, 1, cntrl_path)) {
5938                         pr_err("failed to get dev paths, aborting\n");
5939
5940                         if (dd->devname)
5941                                 free(dd->devname);
5942                         free(dd);
5943                         return 1;
5944                 }
5945
5946                 cntrl_name = basename(cntrl_path);
5947                 if (is_multipath_nvme(fd))
5948                         pr_err("%s controller supports Multi-Path I/O, Intel (R) VROC does not support multipathing\n",
5949                                cntrl_name);
5950
5951                 if (devpath_to_vendor(pci_dev_path) == 0x8086) {
5952                         /*
5953                          * If Intel's NVMe drive has serial ended with
5954                          * "-A","-B","-1" or "-2" it means that this is "x8"
5955                          * device (double drive on single PCIe card).
5956                          * User should be warned about potential data loss.
5957                          */
5958                         for (i = MAX_RAID_SERIAL_LEN-1; i > 0; i--) {
5959                                 /* Skip empty character at the end */
5960                                 if (dd->serial[i] == 0)
5961                                         continue;
5962
5963                                 if (((dd->serial[i] == 'A') ||
5964                                    (dd->serial[i] == 'B') ||
5965                                    (dd->serial[i] == '1') ||
5966                                    (dd->serial[i] == '2')) &&
5967                                    (dd->serial[i-1] == '-'))
5968                                         pr_err("\tThe action you are about to take may put your data at risk.\n"
5969                                                 "\tPlease note that x8 devices may consist of two separate x4 devices "
5970                                                 "located on a single PCIe port.\n"
5971                                                 "\tRAID 0 is the only supported configuration for this type of x8 device.\n");
5972                                 break;
5973                         }
5974                 } else if (super->hba->type == SYS_DEV_VMD && super->orom &&
5975                     !imsm_orom_has_tpv_support(super->orom)) {
5976                         pr_err("\tPlatform configuration does not support non-Intel NVMe drives.\n"
5977                                "\tPlease refer to Intel(R) RSTe/VROC user guide.\n");
5978                         free(dd->devname);
5979                         free(dd);
5980                         return 1;
5981                 }
5982         }
5983
5984         get_dev_size(fd, NULL, &size);
5985         if (!get_dev_sector_size(fd, NULL, &member_sector_size))
5986                 return 1;
5987
5988         if (super->sector_size == 0) {
5989                 /* this a first device, so sector_size is not set yet */
5990                 super->sector_size = member_sector_size;
5991         }
5992
5993         /* clear migr_rec when adding disk to container */
5994         memset(super->migr_rec_buf, 0, MIGR_REC_BUF_SECTORS*MAX_SECTOR_SIZE);
5995         if (lseek64(fd, size - MIGR_REC_SECTOR_POSITION*member_sector_size,
5996             SEEK_SET) >= 0) {
5997                 if ((unsigned int)write(fd, super->migr_rec_buf,
5998                     MIGR_REC_BUF_SECTORS*member_sector_size) !=
5999                     MIGR_REC_BUF_SECTORS*member_sector_size)
6000                         perror("Write migr_rec failed");
6001         }
6002
6003         size /= 512;
6004         serialcpy(dd->disk.serial, dd->serial);
6005         set_total_blocks(&dd->disk, size);
6006         if (__le32_to_cpu(dd->disk.total_blocks_hi) > 0) {
6007                 struct imsm_super *mpb = super->anchor;
6008                 mpb->attributes |= MPB_ATTRIB_2TB_DISK;
6009         }
6010         mark_spare(dd);
6011         if (sysfs_disk_to_scsi_id(fd, &id) == 0)
6012                 dd->disk.scsi_id = __cpu_to_le32(id);
6013         else
6014                 dd->disk.scsi_id = __cpu_to_le32(0);
6015
6016         if (st->update_tail) {
6017                 dd->next = super->disk_mgmt_list;
6018                 super->disk_mgmt_list = dd;
6019         } else {
6020                 /* this is called outside of mdmon
6021                  * write initial spare metadata
6022                  * mdmon will overwrite it.
6023                  */
6024                 dd->next = super->disks;
6025                 super->disks = dd;
6026                 write_super_imsm_spare(super, dd);
6027         }
6028
6029         return 0;
6030 }
6031
6032 static int remove_from_super_imsm(struct supertype *st, mdu_disk_info_t *dk)
6033 {
6034         struct intel_super *super = st->sb;
6035         struct dl *dd;
6036
6037         /* remove from super works only in mdmon - for communication
6038          * manager - monitor. Check if communication memory buffer
6039          * is prepared.
6040          */
6041         if (!st->update_tail) {
6042                 pr_err("shall be used in mdmon context only\n");
6043                 return 1;
6044         }
6045         dd = xcalloc(1, sizeof(*dd));
6046         dd->major = dk->major;
6047         dd->minor = dk->minor;
6048         dd->fd = -1;
6049         mark_spare(dd);
6050         dd->action = DISK_REMOVE;
6051
6052         dd->next = super->disk_mgmt_list;
6053         super->disk_mgmt_list = dd;
6054
6055         return 0;
6056 }
6057
6058 static int store_imsm_mpb(int fd, struct imsm_super *mpb);
6059
6060 static union {
6061         char buf[MAX_SECTOR_SIZE];
6062         struct imsm_super anchor;
6063 } spare_record __attribute__ ((aligned(MAX_SECTOR_SIZE)));
6064
6065
6066 static int write_super_imsm_spare(struct intel_super *super, struct dl *d)
6067 {
6068         struct imsm_super *mpb = super->anchor;
6069         struct imsm_super *spare = &spare_record.anchor;
6070         __u32 sum;
6071
6072         if (d->index != -1)
6073                 return 1;
6074
6075         spare->mpb_size = __cpu_to_le32(sizeof(struct imsm_super));
6076         spare->generation_num = __cpu_to_le32(1UL);
6077         spare->attributes = MPB_ATTRIB_CHECKSUM_VERIFY;
6078         spare->num_disks = 1;
6079         spare->num_raid_devs = 0;
6080         spare->cache_size = mpb->cache_size;
6081         spare->pwr_cycle_count = __cpu_to_le32(1);
6082
6083         snprintf((char *) spare->sig, MAX_SIGNATURE_LENGTH,
6084                  MPB_SIGNATURE MPB_VERSION_RAID0);
6085
6086         spare->disk[0] = d->disk;
6087         if (__le32_to_cpu(d->disk.total_blocks_hi) > 0)
6088                 spare->attributes |= MPB_ATTRIB_2TB_DISK;
6089
6090         if (super->sector_size == 4096)
6091                 convert_to_4k_imsm_disk(&spare->disk[0]);
6092
6093         sum = __gen_imsm_checksum(spare);
6094         spare->family_num = __cpu_to_le32(sum);
6095         spare->orig_family_num = 0;
6096         sum = __gen_imsm_checksum(spare);
6097         spare->check_sum = __cpu_to_le32(sum);
6098
6099         if (store_imsm_mpb(d->fd, spare)) {
6100                 pr_err("failed for device %d:%d %s\n",
6101                         d->major, d->minor, strerror(errno));
6102                 return 1;
6103         }
6104
6105         return 0;
6106 }
6107 /* spare records have their own family number and do not have any defined raid
6108  * devices
6109  */
6110 static int write_super_imsm_spares(struct intel_super *super, int doclose)
6111 {
6112         struct dl *d;
6113
6114         for (d = super->disks; d; d = d->next) {
6115                 if (d->index != -1)
6116                         continue;
6117
6118                 if (write_super_imsm_spare(super, d))
6119                         return 1;
6120
6121                 if (doclose) {
6122                         close(d->fd);
6123                         d->fd = -1;
6124                 }
6125         }
6126
6127         return 0;
6128 }
6129
6130 static int write_super_imsm(struct supertype *st, int doclose)
6131 {
6132         struct intel_super *super = st->sb;
6133         unsigned int sector_size = super->sector_size;
6134         struct imsm_super *mpb = super->anchor;
6135         struct dl *d;
6136         __u32 generation;
6137         __u32 sum;
6138         int spares = 0;
6139         int i;
6140         __u32 mpb_size = sizeof(struct imsm_super) - sizeof(struct imsm_disk);
6141         int num_disks = 0;
6142         int clear_migration_record = 1;
6143         __u32 bbm_log_size;
6144
6145         /* 'generation' is incremented everytime the metadata is written */
6146         generation = __le32_to_cpu(mpb->generation_num);
6147         generation++;
6148         mpb->generation_num = __cpu_to_le32(generation);
6149
6150         /* fix up cases where previous mdadm releases failed to set
6151          * orig_family_num
6152          */
6153         if (mpb->orig_family_num == 0)
6154                 mpb->orig_family_num = mpb->family_num;
6155
6156         for (d = super->disks; d; d = d->next) {
6157                 if (d->index == -1)
6158                         spares++;
6159                 else {
6160                         mpb->disk[d->index] = d->disk;
6161                         num_disks++;
6162                 }
6163         }
6164         for (d = super->missing; d; d = d->next) {
6165                 mpb->disk[d->index] = d->disk;
6166                 num_disks++;
6167         }
6168         mpb->num_disks = num_disks;
6169         mpb_size += sizeof(struct imsm_disk) * mpb->num_disks;
6170
6171         for (i = 0; i < mpb->num_raid_devs; i++) {
6172                 struct imsm_dev *dev = __get_imsm_dev(mpb, i);
6173                 struct imsm_dev *dev2 = get_imsm_dev(super, i);
6174                 if (dev && dev2) {
6175                         imsm_copy_dev(dev, dev2);
6176                         mpb_size += sizeof_imsm_dev(dev, 0);
6177                 }
6178                 if (is_gen_migration(dev2))
6179                         clear_migration_record = 0;
6180         }
6181
6182         bbm_log_size = get_imsm_bbm_log_size(super->bbm_log);
6183
6184         if (bbm_log_size) {
6185                 memcpy((void *)mpb + mpb_size, super->bbm_log, bbm_log_size);
6186                 mpb->attributes |= MPB_ATTRIB_BBM;
6187         } else
6188                 mpb->attributes &= ~MPB_ATTRIB_BBM;
6189
6190         super->anchor->bbm_log_size = __cpu_to_le32(bbm_log_size);
6191         mpb_size += bbm_log_size;
6192         mpb->mpb_size = __cpu_to_le32(mpb_size);
6193
6194 #ifdef DEBUG
6195         assert(super->len == 0 || mpb_size <= super->len);
6196 #endif
6197
6198         /* recalculate checksum */
6199         sum = __gen_imsm_checksum(mpb);
6200         mpb->check_sum = __cpu_to_le32(sum);
6201
6202         if (super->clean_migration_record_by_mdmon) {
6203                 clear_migration_record = 1;
6204                 super->clean_migration_record_by_mdmon = 0;
6205         }
6206         if (clear_migration_record)
6207                 memset(super->migr_rec_buf, 0,
6208                     MIGR_REC_BUF_SECTORS*MAX_SECTOR_SIZE);
6209
6210         if (sector_size == 4096)
6211                 convert_to_4k(super);
6212
6213         /* write the mpb for disks that compose raid devices */
6214         for (d = super->disks; d ; d = d->next) {
6215                 if (d->index < 0 || is_failed(&d->disk))
6216                         continue;
6217
6218                 if (clear_migration_record) {
6219                         unsigned long long dsize;
6220
6221                         get_dev_size(d->fd, NULL, &dsize);
6222                         if (lseek64(d->fd, dsize - sector_size,
6223                             SEEK_SET) >= 0) {
6224                                 if ((unsigned int)write(d->fd,
6225                                     super->migr_rec_buf,
6226                                     MIGR_REC_BUF_SECTORS*sector_size) !=
6227                                     MIGR_REC_BUF_SECTORS*sector_size)
6228                                         perror("Write migr_rec failed");
6229                         }
6230                 }
6231
6232                 if (store_imsm_mpb(d->fd, mpb))
6233                         fprintf(stderr,
6234                                 "failed for device %d:%d (fd: %d)%s\n",
6235                                 d->major, d->minor,
6236                                 d->fd, strerror(errno));
6237
6238                 if (doclose) {
6239                         close(d->fd);
6240                         d->fd = -1;
6241                 }
6242         }
6243
6244         if (spares)
6245                 return write_super_imsm_spares(super, doclose);
6246
6247         return 0;
6248 }
6249
6250 static int create_array(struct supertype *st, int dev_idx)
6251 {
6252         size_t len;
6253         struct imsm_update_create_array *u;
6254         struct intel_super *super = st->sb;
6255         struct imsm_dev *dev = get_imsm_dev(super, dev_idx);
6256         struct imsm_map *map = get_imsm_map(dev, MAP_0);
6257         struct disk_info *inf;
6258         struct imsm_disk *disk;
6259         int i;
6260
6261         len = sizeof(*u) - sizeof(*dev) + sizeof_imsm_dev(dev, 0) +
6262               sizeof(*inf) * map->num_members;
6263         u = xmalloc(len);
6264         u->type = update_create_array;
6265         u->dev_idx = dev_idx;
6266         imsm_copy_dev(&u->dev, dev);
6267         inf = get_disk_info(u);
6268         for (i = 0; i < map->num_members; i++) {
6269                 int idx = get_imsm_disk_idx(dev, i, MAP_X);
6270
6271                 disk = get_imsm_disk(super, idx);
6272                 if (!disk)
6273                         disk = get_imsm_missing(super, idx);
6274                 serialcpy(inf[i].serial, disk->serial);
6275         }
6276         append_metadata_update(st, u, len);
6277
6278         return 0;
6279 }
6280
6281 static int mgmt_disk(struct supertype *st)
6282 {
6283         struct intel_super *super = st->sb;
6284         size_t len;
6285         struct imsm_update_add_remove_disk *u;
6286
6287         if (!super->disk_mgmt_list)
6288                 return 0;
6289
6290         len = sizeof(*u);
6291         u = xmalloc(len);
6292         u->type = update_add_remove_disk;
6293         append_metadata_update(st, u, len);
6294
6295         return 0;
6296 }
6297
6298 __u32 crc32c_le(__u32 crc, unsigned char const *p, size_t len);
6299
6300 static int write_ppl_header(unsigned long long ppl_sector, int fd, void *buf)
6301 {
6302         struct ppl_header *ppl_hdr = buf;
6303         int ret;
6304
6305         ppl_hdr->checksum = __cpu_to_le32(~crc32c_le(~0, buf, PPL_HEADER_SIZE));
6306
6307         if (lseek64(fd, ppl_sector * 512, SEEK_SET) < 0) {
6308                 ret = -errno;
6309                 perror("Failed to seek to PPL header location");
6310                 return ret;
6311         }
6312
6313         if (write(fd, buf, PPL_HEADER_SIZE) != PPL_HEADER_SIZE) {
6314                 ret = -errno;
6315                 perror("Write PPL header failed");
6316                 return ret;
6317         }
6318
6319         fsync(fd);
6320
6321         return 0;
6322 }
6323
6324 static int write_init_ppl_imsm(struct supertype *st, struct mdinfo *info, int fd)
6325 {
6326         struct intel_super *super = st->sb;
6327         void *buf;
6328         struct ppl_header *ppl_hdr;
6329         int ret;
6330
6331         /* first clear entire ppl space */
6332         ret = zero_disk_range(fd, info->ppl_sector, info->ppl_size);
6333         if (ret)
6334                 return ret;
6335
6336         ret = posix_memalign(&buf, MAX_SECTOR_SIZE, PPL_HEADER_SIZE);
6337         if (ret) {
6338                 pr_err("Failed to allocate PPL header buffer\n");
6339                 return -ret;
6340         }
6341
6342         memset(buf, 0, PPL_HEADER_SIZE);
6343         ppl_hdr = buf;
6344         memset(ppl_hdr->reserved, 0xff, PPL_HDR_RESERVED);
6345         ppl_hdr->signature = __cpu_to_le32(super->anchor->orig_family_num);
6346
6347         if (info->mismatch_cnt) {
6348                 /*
6349                  * We are overwriting an invalid ppl. Make one entry with wrong
6350                  * checksum to prevent the kernel from skipping resync.
6351                  */
6352                 ppl_hdr->entries_count = __cpu_to_le32(1);
6353                 ppl_hdr->entries[0].checksum = ~0;
6354         }
6355
6356         ret = write_ppl_header(info->ppl_sector, fd, buf);
6357
6358         free(buf);
6359         return ret;
6360 }
6361
6362 static int is_rebuilding(struct imsm_dev *dev);
6363
6364 static int validate_ppl_imsm(struct supertype *st, struct mdinfo *info,
6365                              struct mdinfo *disk)
6366 {
6367         struct intel_super *super = st->sb;
6368         struct dl *d;
6369         void *buf_orig, *buf, *buf_prev = NULL;
6370         int ret = 0;
6371         struct ppl_header *ppl_hdr = NULL;
6372         __u32 crc;
6373         struct imsm_dev *dev;
6374         __u32 idx;
6375         unsigned int i;
6376         unsigned long long ppl_offset = 0;
6377         unsigned long long prev_gen_num = 0;
6378
6379         if (disk->disk.raid_disk < 0)
6380                 return 0;
6381
6382         dev = get_imsm_dev(super, info->container_member);
6383         idx = get_imsm_disk_idx(dev, disk->disk.raid_disk, MAP_0);
6384         d = get_imsm_dl_disk(super, idx);
6385
6386         if (!d || d->index < 0 || is_failed(&d->disk))
6387                 return 0;
6388
6389         if (posix_memalign(&buf_orig, MAX_SECTOR_SIZE, PPL_HEADER_SIZE * 2)) {
6390                 pr_err("Failed to allocate PPL header buffer\n");
6391                 return -1;
6392         }
6393         buf = buf_orig;
6394
6395         ret = 1;
6396         while (ppl_offset < MULTIPLE_PPL_AREA_SIZE_IMSM) {
6397                 void *tmp;
6398
6399                 dprintf("Checking potential PPL at offset: %llu\n", ppl_offset);
6400
6401                 if (lseek64(d->fd, info->ppl_sector * 512 + ppl_offset,
6402                             SEEK_SET) < 0) {
6403                         perror("Failed to seek to PPL header location");
6404                         ret = -1;
6405                         break;
6406                 }
6407
6408                 if (read(d->fd, buf, PPL_HEADER_SIZE) != PPL_HEADER_SIZE) {
6409                         perror("Read PPL header failed");
6410                         ret = -1;
6411                         break;
6412                 }
6413
6414                 ppl_hdr = buf;
6415
6416                 crc = __le32_to_cpu(ppl_hdr->checksum);
6417                 ppl_hdr->checksum = 0;
6418
6419                 if (crc != ~crc32c_le(~0, buf, PPL_HEADER_SIZE)) {
6420                         dprintf("Wrong PPL header checksum on %s\n",
6421                                 d->devname);
6422                         break;
6423                 }
6424
6425                 if (prev_gen_num > __le64_to_cpu(ppl_hdr->generation)) {
6426                         /* previous was newest, it was already checked */
6427                         break;
6428                 }
6429
6430                 if ((__le32_to_cpu(ppl_hdr->signature) !=
6431                               super->anchor->orig_family_num)) {
6432                         dprintf("Wrong PPL header signature on %s\n",
6433                                 d->devname);
6434                         ret = 1;
6435                         break;
6436                 }
6437
6438                 ret = 0;
6439                 prev_gen_num = __le64_to_cpu(ppl_hdr->generation);
6440
6441                 ppl_offset += PPL_HEADER_SIZE;
6442                 for (i = 0; i < __le32_to_cpu(ppl_hdr->entries_count); i++)
6443                         ppl_offset +=
6444                                    __le32_to_cpu(ppl_hdr->entries[i].pp_size);
6445
6446                 if (!buf_prev)
6447                         buf_prev = buf + PPL_HEADER_SIZE;
6448                 tmp = buf_prev;
6449                 buf_prev = buf;
6450                 buf = tmp;
6451         }
6452
6453         if (buf_prev) {
6454                 buf = buf_prev;
6455                 ppl_hdr = buf_prev;
6456         }
6457
6458         /*
6459          * Update metadata to use mutliple PPLs area (1MB).
6460          * This is done once for all RAID members
6461          */
6462         if (info->consistency_policy == CONSISTENCY_POLICY_PPL &&
6463             info->ppl_size != (MULTIPLE_PPL_AREA_SIZE_IMSM >> 9)) {
6464                 char subarray[20];
6465                 struct mdinfo *member_dev;
6466
6467                 sprintf(subarray, "%d", info->container_member);
6468
6469                 if (mdmon_running(st->container_devnm))
6470                         st->update_tail = &st->updates;
6471
6472                 if (st->ss->update_subarray(st, subarray, "ppl", NULL)) {
6473                         pr_err("Failed to update subarray %s\n",
6474                               subarray);
6475                 } else {
6476                         if (st->update_tail)
6477                                 flush_metadata_updates(st);
6478                         else
6479                                 st->ss->sync_metadata(st);
6480                         info->ppl_size = (MULTIPLE_PPL_AREA_SIZE_IMSM >> 9);
6481                         for (member_dev = info->devs; member_dev;
6482                              member_dev = member_dev->next)
6483                                 member_dev->ppl_size =
6484                                     (MULTIPLE_PPL_AREA_SIZE_IMSM >> 9);
6485                 }
6486         }
6487
6488         if (ret == 1) {
6489                 struct imsm_map *map = get_imsm_map(dev, MAP_X);
6490
6491                 if (map->map_state == IMSM_T_STATE_UNINITIALIZED ||
6492                    (map->map_state == IMSM_T_STATE_NORMAL &&
6493                    !(dev->vol.dirty & RAIDVOL_DIRTY)) ||
6494                    (is_rebuilding(dev) &&
6495                     vol_curr_migr_unit(dev) == 0 &&
6496                     get_imsm_disk_idx(dev, disk->disk.raid_disk, MAP_1) != idx))
6497                         ret = st->ss->write_init_ppl(st, info, d->fd);
6498                 else
6499                         info->mismatch_cnt++;
6500         } else if (ret == 0 &&
6501                    ppl_hdr->entries_count == 0 &&
6502                    is_rebuilding(dev) &&
6503                    info->resync_start == 0) {
6504                 /*
6505                  * The header has no entries - add a single empty entry and
6506                  * rewrite the header to prevent the kernel from going into
6507                  * resync after an interrupted rebuild.
6508                  */
6509                 ppl_hdr->entries_count = __cpu_to_le32(1);
6510                 ret = write_ppl_header(info->ppl_sector, d->fd, buf);
6511         }
6512
6513         free(buf_orig);
6514
6515         return ret;
6516 }
6517
6518 static int write_init_ppl_imsm_all(struct supertype *st, struct mdinfo *info)
6519 {
6520         struct intel_super *super = st->sb;
6521         struct dl *d;
6522         int ret = 0;
6523
6524         if (info->consistency_policy != CONSISTENCY_POLICY_PPL ||
6525             info->array.level != 5)
6526                 return 0;
6527
6528         for (d = super->disks; d ; d = d->next) {
6529                 if (d->index < 0 || is_failed(&d->disk))
6530                         continue;
6531
6532                 ret = st->ss->write_init_ppl(st, info, d->fd);
6533                 if (ret)
6534                         break;
6535         }
6536
6537         return ret;
6538 }
6539
6540 /*******************************************************************************
6541  * Function:    write_init_bitmap_imsm_vol
6542  * Description: Write a bitmap header and prepares the area for the bitmap.
6543  * Parameters:
6544  *      st      : supertype information
6545  *      vol_idx : the volume index to use
6546  *
6547  * Returns:
6548  *       0 : success
6549  *      -1 : fail
6550  ******************************************************************************/
6551 static int write_init_bitmap_imsm_vol(struct supertype *st, int vol_idx)
6552 {
6553         struct intel_super *super = st->sb;
6554         int prev_current_vol = super->current_vol;
6555         struct dl *d;
6556         int ret = 0;
6557
6558         super->current_vol = vol_idx;
6559         for (d = super->disks; d; d = d->next) {
6560                 if (d->index < 0 || is_failed(&d->disk))
6561                         continue;
6562                 ret = st->ss->write_bitmap(st, d->fd, NoUpdate);
6563                 if (ret)
6564                         break;
6565         }
6566         super->current_vol = prev_current_vol;
6567         return ret;
6568 }
6569
6570 /*******************************************************************************
6571  * Function:    write_init_bitmap_imsm_all
6572  * Description: Write a bitmap header and prepares the area for the bitmap.
6573  *              Operation is executed for volumes with CONSISTENCY_POLICY_BITMAP.
6574  * Parameters:
6575  *      st      : supertype information
6576  *      info    : info about the volume where the bitmap should be written
6577  *      vol_idx : the volume index to use
6578  *
6579  * Returns:
6580  *       0 : success
6581  *      -1 : fail
6582  ******************************************************************************/
6583 static int write_init_bitmap_imsm_all(struct supertype *st, struct mdinfo *info,
6584                                       int vol_idx)
6585 {
6586         int ret = 0;
6587
6588         if (info && (info->consistency_policy == CONSISTENCY_POLICY_BITMAP))
6589                 ret = write_init_bitmap_imsm_vol(st, vol_idx);
6590
6591         return ret;
6592 }
6593
6594 static int write_init_super_imsm(struct supertype *st)
6595 {
6596         struct intel_super *super = st->sb;
6597         int current_vol = super->current_vol;
6598         int rv = 0;
6599         struct mdinfo info;
6600
6601         getinfo_super_imsm(st, &info, NULL);
6602
6603         /* we are done with current_vol reset it to point st at the container */
6604         super->current_vol = -1;
6605
6606         if (st->update_tail) {
6607                 /* queue the recently created array / added disk
6608                  * as a metadata update */
6609
6610                 /* determine if we are creating a volume or adding a disk */
6611                 if (current_vol < 0) {
6612                         /* in the mgmt (add/remove) disk case we are running
6613                          * in mdmon context, so don't close fd's
6614                          */
6615                         rv = mgmt_disk(st);
6616                 } else {
6617                         /* adding the second volume to the array */
6618                         rv = write_init_ppl_imsm_all(st, &info);
6619                         if (!rv)
6620                                 rv = write_init_bitmap_imsm_all(st, &info, current_vol);
6621                         if (!rv)
6622                                 rv = create_array(st, current_vol);
6623                 }
6624         } else {
6625                 struct dl *d;
6626                 for (d = super->disks; d; d = d->next)
6627                         Kill(d->devname, NULL, 0, -1, 1);
6628                 if (current_vol >= 0) {
6629                         rv = write_init_ppl_imsm_all(st, &info);
6630                         if (!rv)
6631                                 rv = write_init_bitmap_imsm_all(st, &info, current_vol);
6632                 }
6633
6634                 if (!rv)
6635                         rv = write_super_imsm(st, 1);
6636         }
6637
6638         return rv;
6639 }
6640
6641 static int store_super_imsm(struct supertype *st, int fd)
6642 {
6643         struct intel_super *super = st->sb;
6644         struct imsm_super *mpb = super ? super->anchor : NULL;
6645
6646         if (!mpb)
6647                 return 1;
6648
6649         if (super->sector_size == 4096)
6650                 convert_to_4k(super);
6651         return store_imsm_mpb(fd, mpb);
6652 }
6653
6654 static int validate_geometry_imsm_container(struct supertype *st, int level,
6655                                             int layout, int raiddisks, int chunk,
6656                                             unsigned long long size,
6657                                             unsigned long long data_offset,
6658                                             char *dev,
6659                                             unsigned long long *freesize,
6660                                             int verbose)
6661 {
6662         int fd;
6663         unsigned long long ldsize;
6664         struct intel_super *super = NULL;
6665         int rv = 0;
6666
6667         if (level != LEVEL_CONTAINER)
6668                 return 0;
6669         if (!dev)
6670                 return 1;
6671
6672         fd = open(dev, O_RDONLY|O_EXCL, 0);
6673         if (fd < 0) {
6674                 if (verbose > 0)
6675                         pr_err("imsm: Cannot open %s: %s\n",
6676                                 dev, strerror(errno));
6677                 return 0;
6678         }
6679         if (!get_dev_size(fd, dev, &ldsize))
6680                 goto exit;
6681
6682         /* capabilities retrieve could be possible
6683          * note that there is no fd for the disks in array.
6684          */
6685         super = alloc_super();
6686         if (!super)
6687                 goto exit;
6688
6689         if (!get_dev_sector_size(fd, NULL, &super->sector_size))
6690                 goto exit;
6691
6692         rv = find_intel_hba_capability(fd, super, verbose > 0 ? dev : NULL);
6693         if (rv != 0) {
6694 #if DEBUG
6695                 char str[256];
6696                 fd2devname(fd, str);
6697                 dprintf("fd: %d %s orom: %p rv: %d raiddisk: %d\n",
6698                         fd, str, super->orom, rv, raiddisks);
6699 #endif
6700                 /* no orom/efi or non-intel hba of the disk */
6701                 rv = 0;
6702                 goto exit;
6703         }
6704         if (super->orom) {
6705                 if (raiddisks > super->orom->tds) {
6706                         if (verbose)
6707                                 pr_err("%d exceeds maximum number of platform supported disks: %d\n",
6708                                         raiddisks, super->orom->tds);
6709                         goto exit;
6710                 }
6711                 if ((super->orom->attr & IMSM_OROM_ATTR_2TB_DISK) == 0 &&
6712                     (ldsize >> 9) >> 32 > 0) {
6713                         if (verbose)
6714                                 pr_err("%s exceeds maximum platform supported size\n", dev);
6715                         goto exit;
6716                 }
6717
6718                 if (super->hba->type == SYS_DEV_VMD ||
6719                     super->hba->type == SYS_DEV_NVME) {
6720                         if (!imsm_is_nvme_namespace_supported(fd, 1)) {
6721                                 if (verbose)
6722                                         pr_err("NVMe namespace %s is not supported by IMSM\n",
6723                                                 basename(dev));
6724                                 goto exit;
6725                         }
6726                 }
6727         }
6728
6729         *freesize = avail_size_imsm(st, ldsize >> 9, data_offset);
6730         rv = 1;
6731 exit:
6732         if (super)
6733                 free_imsm(super);
6734         close(fd);
6735
6736         return rv;
6737 }
6738
6739 static unsigned long long find_size(struct extent *e, int *idx, int num_extents)
6740 {
6741         const unsigned long long base_start = e[*idx].start;
6742         unsigned long long end = base_start + e[*idx].size;
6743         int i;
6744
6745         if (base_start == end)
6746                 return 0;
6747
6748         *idx = *idx + 1;
6749         for (i = *idx; i < num_extents; i++) {
6750                 /* extend overlapping extents */
6751                 if (e[i].start >= base_start &&
6752                     e[i].start <= end) {
6753                         if (e[i].size == 0)
6754                                 return 0;
6755                         if (e[i].start + e[i].size > end)
6756                                 end = e[i].start + e[i].size;
6757                 } else if (e[i].start > end) {
6758                         *idx = i;
6759                         break;
6760                 }
6761         }
6762
6763         return end - base_start;
6764 }
6765
6766 static unsigned long long merge_extents(struct intel_super *super, int sum_extents)
6767 {
6768         /* build a composite disk with all known extents and generate a new
6769          * 'maxsize' given the "all disks in an array must share a common start
6770          * offset" constraint
6771          */
6772         struct extent *e = xcalloc(sum_extents, sizeof(*e));
6773         struct dl *dl;
6774         int i, j;
6775         int start_extent;
6776         unsigned long long pos;
6777         unsigned long long start = 0;
6778         unsigned long long maxsize;
6779         unsigned long reserve;
6780
6781         /* coalesce and sort all extents. also, check to see if we need to
6782          * reserve space between member arrays
6783          */
6784         j = 0;
6785         for (dl = super->disks; dl; dl = dl->next) {
6786                 if (!dl->e)
6787                         continue;
6788                 for (i = 0; i < dl->extent_cnt; i++)
6789                         e[j++] = dl->e[i];
6790         }
6791         qsort(e, sum_extents, sizeof(*e), cmp_extent);
6792
6793         /* merge extents */
6794         i = 0;
6795         j = 0;
6796         while (i < sum_extents) {
6797                 e[j].start = e[i].start;
6798                 e[j].size = find_size(e, &i, sum_extents);
6799                 j++;
6800                 if (e[j-1].size == 0)
6801                         break;
6802         }
6803
6804         pos = 0;
6805         maxsize = 0;
6806         start_extent = 0;
6807         i = 0;
6808         do {
6809                 unsigned long long esize;
6810
6811                 esize = e[i].start - pos;
6812                 if (esize >= maxsize) {
6813                         maxsize = esize;
6814                         start = pos;
6815                         start_extent = i;
6816                 }
6817                 pos = e[i].start + e[i].size;
6818                 i++;
6819         } while (e[i-1].size);
6820         free(e);
6821
6822         if (maxsize == 0)
6823                 return 0;
6824
6825         /* FIXME assumes volume at offset 0 is the first volume in a
6826          * container
6827          */
6828         if (start_extent > 0)
6829                 reserve = IMSM_RESERVED_SECTORS; /* gap between raid regions */
6830         else
6831                 reserve = 0;
6832
6833         if (maxsize < reserve)
6834                 return 0;
6835
6836         super->create_offset = ~((unsigned long long) 0);
6837         if (start + reserve > super->create_offset)
6838                 return 0; /* start overflows create_offset */
6839         super->create_offset = start + reserve;
6840
6841         return maxsize - reserve;
6842 }
6843
/*
 * Tell whether @level with @raiddisks members may be created.
 *
 * Negative levels and levels 4 and 6 are always rejected.  Without an
 * option ROM (@orom == NULL) everything else is accepted; with one, the
 * level must be advertised by it and the member count must fit:
 * RAID1 needs exactly 2 disks (more than 2 requires RAID1E support),
 * RAID10 exactly 4 and RAID5 more than 2.
 *
 * Returns non-zero when supported, 0 otherwise.
 */
static int is_raid_level_supported(const struct imsm_orom *orom, int level, int raiddisks)
{
	if (level < 0 || level == 4 || level == 6)
		return 0;

	/* not on an Intel RAID platform so anything goes */
	if (!orom)
		return 1;

	/* if we have an orom prevent invalid raid levels */
	switch (level) {
	case 0:
		return imsm_orom_has_raid0(orom);
	case 1:
		if (raiddisks > 2)
			return imsm_orom_has_raid1e(orom);
		return imsm_orom_has_raid1(orom) && raiddisks == 2;
	case 5:
		return imsm_orom_has_raid5(orom) && raiddisks > 2;
	case 10:
		return imsm_orom_has_raid10(orom) && raiddisks == 4;
	default:
		return 0;
	}
}
6865
6866 static int
6867 active_arrays_by_format(char *name, char* hba, struct md_list **devlist,
6868                         int dpa, int verbose)
6869 {
6870         struct mdstat_ent *mdstat = mdstat_read(0, 0);
6871         struct mdstat_ent *memb;
6872         int count = 0;
6873         int num = 0;
6874         struct md_list *dv;
6875         int found;
6876
6877         for (memb = mdstat ; memb ; memb = memb->next) {
6878                 if (memb->metadata_version &&
6879                     (strncmp(memb->metadata_version, "external:", 9) == 0) &&
6880                     (strcmp(&memb->metadata_version[9], name) == 0) &&
6881                     !is_subarray(memb->metadata_version+9) &&
6882                     memb->members) {
6883                         struct dev_member *dev = memb->members;
6884                         int fd = -1;
6885                         while(dev && (fd < 0)) {
6886                                 char *path = xmalloc(strlen(dev->name) + strlen("/dev/") + 1);
6887                                 num = sprintf(path, "%s%s", "/dev/", dev->name);
6888                                 if (num > 0)
6889                                         fd = open(path, O_RDONLY, 0);
6890                                 if (num <= 0 || fd < 0) {
6891                                         pr_vrb("Cannot open %s: %s\n",
6892                                                dev->name, strerror(errno));
6893                                 }
6894                                 free(path);
6895                                 dev = dev->next;
6896                         }
6897                         found = 0;
6898                         if (fd >= 0 && disk_attached_to_hba(fd, hba)) {
6899                                 struct mdstat_ent *vol;
6900                                 for (vol = mdstat ; vol ; vol = vol->next) {
6901                                         if (vol->active > 0 &&
6902                                             vol->metadata_version &&
6903                                             is_container_member(vol, memb->devnm)) {
6904                                                 found++;
6905                                                 count++;
6906                                         }
6907                                 }
6908                                 if (*devlist && (found < dpa)) {
6909                                         dv = xcalloc(1, sizeof(*dv));
6910                                         dv->devname = xmalloc(strlen(memb->devnm) + strlen("/dev/") + 1);
6911                                         sprintf(dv->devname, "%s%s", "/dev/", memb->devnm);
6912                                         dv->found = found;
6913                                         dv->used = 0;
6914                                         dv->next = *devlist;
6915                                         *devlist = dv;
6916                                 }
6917                         }
6918                         if (fd >= 0)
6919                                 close(fd);
6920                 }
6921         }
6922         free_mdstat(mdstat);
6923         return count;
6924 }
6925
#ifdef DEBUG_LOOP
/*
 * Debug helper: build a list naming /dev/loop0 .. /dev/loop11.
 * Entries are pushed at the head, so the list ends up in descending order.
 */
static struct md_list*
get_loop_devices(void)
{
	struct md_list *head = NULL;
	int n;

	for (n = 0; n < 12; n++) {
		struct md_list *node = xcalloc(1, sizeof(*node));

		node->devname = xmalloc(40);
		sprintf(node->devname, "/dev/loop%d", n);
		node->next = head;
		head = node;
	}

	return head;
}
#endif
6944
6945 static struct md_list*
6946 get_devices(const char *hba_path)
6947 {
6948         struct md_list *devlist = NULL;
6949         struct md_list *dv;
6950         struct dirent *ent;
6951         DIR *dir;
6952         int err = 0;
6953
6954 #if DEBUG_LOOP
6955         devlist = get_loop_devices();
6956         return devlist;
6957 #endif
6958         /* scroll through /sys/dev/block looking for devices attached to
6959          * this hba
6960          */
6961         dir = opendir("/sys/dev/block");
6962         for (ent = dir ? readdir(dir) : NULL; ent; ent = readdir(dir)) {
6963                 int fd;
6964                 char buf[1024];
6965                 int major, minor;
6966                 char *path = NULL;
6967                 if (sscanf(ent->d_name, "%d:%d", &major, &minor) != 2)
6968                         continue;
6969                 path = devt_to_devpath(makedev(major, minor), 1, NULL);
6970                 if (!path)
6971                         continue;
6972                 if (!path_attached_to_hba(path, hba_path)) {
6973                         free(path);
6974                         path = NULL;
6975                         continue;
6976                 }
6977                 free(path);
6978                 path = NULL;
6979                 fd = dev_open(ent->d_name, O_RDONLY);
6980                 if (fd >= 0) {
6981                         fd2devname(fd, buf);
6982                         close(fd);
6983                 } else {
6984                         pr_err("cannot open device: %s\n",
6985                                 ent->d_name);
6986                         continue;
6987                 }
6988
6989                 dv = xcalloc(1, sizeof(*dv));
6990                 dv->devname = xstrdup(buf);
6991                 dv->next = devlist;
6992                 devlist = dv;
6993         }
6994         if (err) {
6995                 while(devlist) {
6996                         dv = devlist;
6997                         devlist = devlist->next;
6998                         free(dv->devname);
6999                         free(dv);
7000                 }
7001         }
7002         closedir(dir);
7003         return devlist;
7004 }
7005
7006 static int
7007 count_volumes_list(struct md_list *devlist, char *homehost,
7008                    int verbose, int *found)
7009 {
7010         struct md_list *tmpdev;
7011         int count = 0;
7012         struct supertype *st;
7013
7014         /* first walk the list of devices to find a consistent set
7015          * that match the criterea, if that is possible.
7016          * We flag the ones we like with 'used'.
7017          */
7018         *found = 0;
7019         st = match_metadata_desc_imsm("imsm");
7020         if (st == NULL) {
7021                 pr_vrb("cannot allocate memory for imsm supertype\n");
7022                 return 0;
7023         }
7024
7025         for (tmpdev = devlist; tmpdev; tmpdev = tmpdev->next) {
7026                 char *devname = tmpdev->devname;
7027                 dev_t rdev;
7028                 struct supertype *tst;
7029                 int dfd;
7030                 if (tmpdev->used > 1)
7031                         continue;
7032                 tst = dup_super(st);
7033                 if (tst == NULL) {
7034                         pr_vrb("cannot allocate memory for imsm supertype\n");
7035                         goto err_1;
7036                 }
7037                 tmpdev->container = 0;
7038                 dfd = dev_open(devname, O_RDONLY|O_EXCL);
7039                 if (dfd < 0) {
7040                         dprintf("cannot open device %s: %s\n",
7041                                 devname, strerror(errno));
7042                         tmpdev->used = 2;
7043                 } else if (!fstat_is_blkdev(dfd, devname, &rdev)) {
7044                         tmpdev->used = 2;
7045                 } else if (must_be_container(dfd)) {
7046                         struct supertype *cst;
7047                         cst = super_by_fd(dfd, NULL);
7048                         if (cst == NULL) {
7049                                 dprintf("cannot recognize container type %s\n",
7050                                         devname);
7051                                 tmpdev->used = 2;
7052                         } else if (tst->ss != st->ss) {
7053                                 dprintf("non-imsm container - ignore it: %s\n",
7054                                         devname);
7055                                 tmpdev->used = 2;
7056                         } else if (!tst->ss->load_container ||
7057                                    tst->ss->load_container(tst, dfd, NULL))
7058                                 tmpdev->used = 2;
7059                         else {
7060                                 tmpdev->container = 1;
7061                         }
7062                         if (cst)
7063                                 cst->ss->free_super(cst);
7064                 } else {
7065                         tmpdev->st_rdev = rdev;
7066                         if (tst->ss->load_super(tst,dfd, NULL)) {
7067                                 dprintf("no RAID superblock on %s\n",
7068                                         devname);
7069                                 tmpdev->used = 2;
7070                         } else if (tst->ss->compare_super == NULL) {
7071                                 dprintf("Cannot assemble %s metadata on %s\n",
7072                                         tst->ss->name, devname);
7073                                 tmpdev->used = 2;
7074                         }
7075                 }
7076                 if (dfd >= 0)
7077                         close(dfd);
7078                 if (tmpdev->used == 2 || tmpdev->used == 4) {
7079                         /* Ignore unrecognised devices during auto-assembly */
7080                         goto loop;
7081                 }
7082                 else {
7083                         struct mdinfo info;
7084                         tst->ss->getinfo_super(tst, &info, NULL);
7085
7086                         if (st->minor_version == -1)
7087                                 st->minor_version = tst->minor_version;
7088
7089                         if (memcmp(info.uuid, uuid_zero,
7090                                    sizeof(int[4])) == 0) {
7091                                 /* this is a floating spare.  It cannot define
7092                                  * an array unless there are no more arrays of
7093                                  * this type to be found.  It can be included
7094                                  * in an array of this type though.
7095                                  */
7096                                 tmpdev->used = 3;
7097                                 goto loop;
7098                         }
7099
7100                         if (st->ss != tst->ss ||
7101                             st->minor_version != tst->minor_version ||
7102                             st->ss->compare_super(st, tst, 1) != 0) {
7103                                 /* Some mismatch. If exactly one array matches this host,
7104                                  * we can resolve on that one.
7105                                  * Or, if we are auto assembling, we just ignore the second
7106                                  * for now.
7107                                  */
7108                                 dprintf("superblock on %s doesn't match others - assembly aborted\n",
7109                                         devname);
7110                                 goto loop;
7111                         }
7112                         tmpdev->used = 1;
7113                         *found = 1;
7114                         dprintf("found: devname: %s\n", devname);
7115                 }
7116         loop:
7117                 if (tst)
7118                         tst->ss->free_super(tst);
7119         }
7120         if (*found != 0) {
7121                 int err;
7122                 if ((err = load_super_imsm_all(st, -1, &st->sb, NULL, devlist, 0)) == 0) {
7123                         struct mdinfo *iter, *head = st->ss->container_content(st, NULL);
7124                         for (iter = head; iter; iter = iter->next) {
7125                                 dprintf("content->text_version: %s vol\n",
7126                                         iter->text_version);
7127                                 if (iter->array.state & (1<<MD_SB_BLOCK_VOLUME)) {
7128                                         /* do not assemble arrays with unsupported
7129                                            configurations */
7130                                         dprintf("Cannot activate member %s.\n",
7131                                                 iter->text_version);
7132                                 } else
7133                                         count++;
7134                         }
7135                         sysfs_free(head);
7136
7137                 } else {
7138                         dprintf("No valid super block on device list: err: %d %p\n",
7139                                 err, st->sb);
7140                 }
7141         } else {
7142                 dprintf("no more devices to examine\n");
7143         }
7144
7145         for (tmpdev = devlist; tmpdev; tmpdev = tmpdev->next) {
7146                 if (tmpdev->used == 1 && tmpdev->found) {
7147                         if (count) {
7148                                 if (count < tmpdev->found)
7149                                         count = 0;
7150                                 else
7151                                         count -= tmpdev->found;
7152                         }
7153                 }
7154                 if (tmpdev->used == 1)
7155                         tmpdev->used = 4;
7156         }
7157         err_1:
7158         if (st)
7159                 st->ss->free_super(st);
7160         return count;
7161 }
7162
7163 static int __count_volumes(char *hba_path, int dpa, int verbose,
7164                            int cmp_hba_path)
7165 {
7166         struct sys_dev *idev, *intel_devices = find_intel_devices();
7167         int count = 0;
7168         const struct orom_entry *entry;
7169         struct devid_list *dv, *devid_list;
7170
7171         if (!hba_path)
7172                 return 0;
7173
7174         for (idev = intel_devices; idev; idev = idev->next) {
7175                 if (strstr(idev->path, hba_path))
7176                         break;
7177         }
7178
7179         if (!idev || !idev->dev_id)
7180                 return 0;
7181
7182         entry = get_orom_entry_by_device_id(idev->dev_id);
7183
7184         if (!entry || !entry->devid_list)
7185                 return 0;
7186
7187         devid_list = entry->devid_list;
7188         for (dv = devid_list; dv; dv = dv->next) {
7189                 struct md_list *devlist;
7190                 struct sys_dev *device = NULL;
7191                 char *hpath;
7192                 int found = 0;
7193
7194                 if (cmp_hba_path)
7195                         device = device_by_id_and_path(dv->devid, hba_path);
7196                 else
7197                         device = device_by_id(dv->devid);
7198
7199                 if (device)
7200                         hpath = device->path;
7201                 else
7202                         return 0;
7203
7204                 devlist = get_devices(hpath);
7205                 /* if no intel devices return zero volumes */
7206                 if (devlist == NULL)
7207                         return 0;
7208
7209                 count += active_arrays_by_format("imsm", hpath, &devlist, dpa,
7210                                                  verbose);
7211                 dprintf("path: %s active arrays: %d\n", hpath, count);
7212                 if (devlist == NULL)
7213                         return 0;
7214                 do  {
7215                         found = 0;
7216                         count += count_volumes_list(devlist,
7217                                                         NULL,
7218                                                         verbose,
7219                                                         &found);
7220                         dprintf("found %d count: %d\n", found, count);
7221                 } while (found);
7222
7223                 dprintf("path: %s total number of volumes: %d\n", hpath, count);
7224
7225                 while (devlist) {
7226                         struct md_list *dv = devlist;
7227                         devlist = devlist->next;
7228                         free(dv->devname);
7229                         free(dv);
7230                 }
7231         }
7232         return count;
7233 }
7234
7235 static int count_volumes(struct intel_hba *hba, int dpa, int verbose)
7236 {
7237         if (!hba)
7238                 return 0;
7239         if (hba->type == SYS_DEV_VMD) {
7240                 struct sys_dev *dev;
7241                 int count = 0;
7242
7243                 for (dev = find_intel_devices(); dev; dev = dev->next) {
7244                         if (dev->type == SYS_DEV_VMD)
7245                                 count += __count_volumes(dev->path, dpa,
7246                                                          verbose, 1);
7247                 }
7248                 return count;
7249         }
7250         return __count_volumes(hba->path, dpa, verbose, 0);
7251 }
7252
7253 static int imsm_default_chunk(const struct imsm_orom *orom)
7254 {
7255         /* up to 512 if the plaform supports it, otherwise the platform max.
7256          * 128 if no platform detected
7257          */
7258         int fs = max(7, orom ? fls(orom->sss) : 0);
7259
7260         return min(512, (1 << fs));
7261 }
7262
7263 static int
7264 validate_geometry_imsm_orom(struct intel_super *super, int level, int layout,
7265                             int raiddisks, int *chunk, unsigned long long size, int verbose)
7266 {
7267         /* check/set platform and metadata limits/defaults */
7268         if (super->orom && raiddisks > super->orom->dpa) {
7269                 pr_vrb("platform supports a maximum of %d disks per array\n",
7270                        super->orom->dpa);
7271                 return 0;
7272         }
7273
7274         /* capabilities of OROM tested - copied from validate_geometry_imsm_volume */
7275         if (!is_raid_level_supported(super->orom, level, raiddisks)) {
7276                 pr_vrb("platform does not support raid%d with %d disk%s\n",
7277                         level, raiddisks, raiddisks > 1 ? "s" : "");
7278                 return 0;
7279         }
7280
7281         if (*chunk == 0 || *chunk == UnSet)
7282                 *chunk = imsm_default_chunk(super->orom);
7283
7284         if (super->orom && !imsm_orom_has_chunk(super->orom, *chunk)) {
7285                 pr_vrb("platform does not support a chunk size of: %d\n", *chunk);
7286                 return 0;
7287         }
7288
7289         if (layout != imsm_level_to_layout(level)) {
7290                 if (level == 5)
7291                         pr_vrb("imsm raid 5 only supports the left-asymmetric layout\n");
7292                 else if (level == 10)
7293                         pr_vrb("imsm raid 10 only supports the n2 layout\n");
7294                 else
7295                         pr_vrb("imsm unknown layout %#x for this raid level %d\n",
7296                                 layout, level);
7297                 return 0;
7298         }
7299
7300         if (super->orom && (super->orom->attr & IMSM_OROM_ATTR_2TB) == 0 &&
7301                         (calc_array_size(level, raiddisks, layout, *chunk, size) >> 32) > 0) {
7302                 pr_vrb("platform does not support a volume size over 2TB\n");
7303                 return 0;
7304         }
7305
7306         return 1;
7307 }
7308
7309 /* validate_geometry_imsm_volume - lifted from validate_geometry_ddf_bvd
7310  * FIX ME add ahci details
7311  */
7312 static int validate_geometry_imsm_volume(struct supertype *st, int level,
7313                                          int layout, int raiddisks, int *chunk,
7314                                          unsigned long long size,
7315                                          unsigned long long data_offset,
7316                                          char *dev,
7317                                          unsigned long long *freesize,
7318                                          int verbose)
7319 {
7320         dev_t rdev;
7321         struct intel_super *super = st->sb;
7322         struct imsm_super *mpb;
7323         struct dl *dl;
7324         unsigned long long pos = 0;
7325         unsigned long long maxsize;
7326         struct extent *e;
7327         int i;
7328
7329         /* We must have the container info already read in. */
7330         if (!super)
7331                 return 0;
7332
7333         mpb = super->anchor;
7334
7335         if (!validate_geometry_imsm_orom(super, level, layout, raiddisks, chunk, size, verbose)) {
7336                 pr_err("RAID geometry validation failed. Cannot proceed with the action(s).\n");
7337                 return 0;
7338         }
7339         if (!dev) {
7340                 /* General test:  make sure there is space for
7341                  * 'raiddisks' device extents of size 'size' at a given
7342                  * offset
7343                  */
7344                 unsigned long long minsize = size;
7345                 unsigned long long start_offset = MaxSector;
7346                 int dcnt = 0;
7347                 if (minsize == 0)
7348                         minsize = MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS;
7349                 for (dl = super->disks; dl ; dl = dl->next) {
7350                         int found = 0;
7351
7352                         pos = 0;
7353                         i = 0;
7354                         e = get_extents(super, dl, 0);
7355                         if (!e) continue;
7356                         do {
7357                                 unsigned long long esize;
7358                                 esize = e[i].start - pos;
7359                                 if (esize >= minsize)
7360                                         found = 1;
7361                                 if (found && start_offset == MaxSector) {
7362                                         start_offset = pos;
7363                                         break;
7364                                 } else if (found && pos != start_offset) {
7365                                         found = 0;
7366                                         break;
7367                                 }
7368                                 pos = e[i].start + e[i].size;
7369                                 i++;
7370                         } while (e[i-1].size);
7371                         if (found)
7372                                 dcnt++;
7373                         free(e);
7374                 }
7375                 if (dcnt < raiddisks) {
7376                         if (verbose)
7377                                 pr_err("imsm: Not enough devices with space for this array (%d < %d)\n",
7378                                         dcnt, raiddisks);
7379                         return 0;
7380                 }
7381                 return 1;
7382         }
7383
7384         /* This device must be a member of the set */
7385         if (!stat_is_blkdev(dev, &rdev))
7386                 return 0;
7387         for (dl = super->disks ; dl ; dl = dl->next) {
7388                 if (dl->major == (int)major(rdev) &&
7389                     dl->minor == (int)minor(rdev))
7390                         break;
7391         }
7392         if (!dl) {
7393                 if (verbose)
7394                         pr_err("%s is not in the same imsm set\n", dev);
7395                 return 0;
7396         } else if (super->orom && dl->index < 0 && mpb->num_raid_devs) {
7397                 /* If a volume is present then the current creation attempt
7398                  * cannot incorporate new spares because the orom may not
7399                  * understand this configuration (all member disks must be
7400                  * members of each array in the container).
7401                  */
7402                 pr_err("%s is a spare and a volume is already defined for this container\n", dev);
7403                 pr_err("The option-rom requires all member disks to be a member of all volumes\n");
7404                 return 0;
7405         } else if (super->orom && mpb->num_raid_devs > 0 &&
7406                    mpb->num_disks != raiddisks) {
7407                 pr_err("The option-rom requires all member disks to be a member of all volumes\n");
7408                 return 0;
7409         }
7410
7411         /* retrieve the largest free space block */
7412         e = get_extents(super, dl, 0);
7413         maxsize = 0;
7414         i = 0;
7415         if (e) {
7416                 do {
7417                         unsigned long long esize;
7418
7419                         esize = e[i].start - pos;
7420                         if (esize >= maxsize)
7421                                 maxsize = esize;
7422                         pos = e[i].start + e[i].size;
7423                         i++;
7424                 } while (e[i-1].size);
7425                 dl->e = e;
7426                 dl->extent_cnt = i;
7427         } else {
7428                 if (verbose)
7429                         pr_err("unable to determine free space for: %s\n",
7430                                 dev);
7431                 return 0;
7432         }
7433         if (maxsize < size) {
7434                 if (verbose)
7435                         pr_err("%s not enough space (%llu < %llu)\n",
7436                                 dev, maxsize, size);
7437                 return 0;
7438         }
7439
7440         /* count total number of extents for merge */
7441         i = 0;
7442         for (dl = super->disks; dl; dl = dl->next)
7443                 if (dl->e)
7444                         i += dl->extent_cnt;
7445
7446         maxsize = merge_extents(super, i);
7447
7448         if (mpb->num_raid_devs > 0 && size && size != maxsize)
7449                 pr_err("attempting to create a second volume with size less then remaining space.\n");
7450
7451         if (maxsize < size || maxsize == 0) {
7452                 if (verbose) {
7453                         if (maxsize == 0)
7454                                 pr_err("no free space left on device. Aborting...\n");
7455                         else
7456                                 pr_err("not enough space to create volume of given size (%llu < %llu). Aborting...\n",
7457                                                 maxsize, size);
7458                 }
7459                 return 0;
7460         }
7461
7462         *freesize = maxsize;
7463
7464         if (super->orom) {
7465                 int count = count_volumes(super->hba,
7466                                       super->orom->dpa, verbose);
7467                 if (super->orom->vphba <= count) {
7468                         pr_vrb("platform does not support more than %d raid volumes.\n",
7469                                super->orom->vphba);
7470                         return 0;
7471                 }
7472         }
7473         return 1;
7474 }
7475
7476 static int imsm_get_free_size(struct supertype *st, int raiddisks,
7477                          unsigned long long size, int chunk,
7478                          unsigned long long *freesize)
7479 {
7480         struct intel_super *super = st->sb;
7481         struct imsm_super *mpb = super->anchor;
7482         struct dl *dl;
7483         int i;
7484         int extent_cnt;
7485         struct extent *e;
7486         unsigned long long maxsize;
7487         unsigned long long minsize;
7488         int cnt;
7489         int used;
7490
7491         /* find the largest common start free region of the possible disks */
7492         used = 0;
7493         extent_cnt = 0;
7494         cnt = 0;
7495         for (dl = super->disks; dl; dl = dl->next) {
7496                 dl->raiddisk = -1;
7497
7498                 if (dl->index >= 0)
7499                         used++;
7500
7501                 /* don't activate new spares if we are orom constrained
7502                  * and there is already a volume active in the container
7503                  */
7504                 if (super->orom && dl->index < 0 && mpb->num_raid_devs)
7505                         continue;
7506
7507                 e = get_extents(super, dl, 0);
7508                 if (!e)
7509                         continue;
7510                 for (i = 1; e[i-1].size; i++)
7511                         ;
7512                 dl->e = e;
7513                 dl->extent_cnt = i;
7514                 extent_cnt += i;
7515                 cnt++;
7516         }
7517
7518         maxsize = merge_extents(super, extent_cnt);
7519         minsize = size;
7520         if (size == 0)
7521                 /* chunk is in K */
7522                 minsize = chunk * 2;
7523
7524         if (cnt < raiddisks ||
7525             (super->orom && used && used != raiddisks) ||
7526             maxsize < minsize ||
7527             maxsize == 0) {
7528                 pr_err("not enough devices with space to create array.\n");
7529                 return 0; /* No enough free spaces large enough */
7530         }
7531
7532         if (size == 0) {
7533                 size = maxsize;
7534                 if (chunk) {
7535                         size /= 2 * chunk;
7536                         size *= 2 * chunk;
7537                 }
7538                 maxsize = size;
7539         }
7540         if (mpb->num_raid_devs > 0 && size && size != maxsize)
7541                 pr_err("attempting to create a second volume with size less then remaining space.\n");
7542         cnt = 0;
7543         for (dl = super->disks; dl; dl = dl->next)
7544                 if (dl->e)
7545                         dl->raiddisk = cnt++;
7546
7547         *freesize = size;
7548
7549         dprintf("imsm: imsm_get_free_size() returns : %llu\n", size);
7550
7551         return 1;
7552 }
7553
7554 static int reserve_space(struct supertype *st, int raiddisks,
7555                          unsigned long long size, int chunk,
7556                          unsigned long long *freesize)
7557 {
7558         struct intel_super *super = st->sb;
7559         struct dl *dl;
7560         int cnt;
7561         int rv = 0;
7562
7563         rv = imsm_get_free_size(st, raiddisks, size, chunk, freesize);
7564         if (rv) {
7565                 cnt = 0;
7566                 for (dl = super->disks; dl; dl = dl->next)
7567                         if (dl->e)
7568                                 dl->raiddisk = cnt++;
7569                 rv = 1;
7570         }
7571
7572         return rv;
7573 }
7574
7575 static int validate_geometry_imsm(struct supertype *st, int level, int layout,
7576                                   int raiddisks, int *chunk, unsigned long long size,
7577                                   unsigned long long data_offset,
7578                                   char *dev, unsigned long long *freesize,
7579                                   int consistency_policy, int verbose)
7580 {
7581         int fd, cfd;
7582         struct mdinfo *sra;
7583         int is_member = 0;
7584
7585         /* load capability
7586          * if given unused devices create a container
7587          * if given given devices in a container create a member volume
7588          */
7589         if (level == LEVEL_CONTAINER) {
7590                 /* Must be a fresh device to add to a container */
7591                 return validate_geometry_imsm_container(st, level, layout,
7592                                                         raiddisks,
7593                                                         *chunk,
7594                                                         size, data_offset,
7595                                                         dev, freesize,
7596                                                         verbose);
7597         }
7598
7599         /*
7600          * Size is given in sectors.
7601          */
7602         if (size && (size < 2048)) {
7603                 pr_err("Given size must be greater than 1M.\n");
7604                 /* Depends on algorithm in Create.c :
7605                  * if container was given (dev == NULL) return -1,
7606                  * if block device was given ( dev != NULL) return 0.
7607                  */
7608                 return dev ? -1 : 0;
7609         }
7610
7611         if (!dev) {
7612                 if (st->sb) {
7613                         struct intel_super *super = st->sb;
7614                         if (!validate_geometry_imsm_orom(st->sb, level, layout,
7615                                                          raiddisks, chunk, size,
7616                                                          verbose))
7617                                 return 0;
7618                         /* we are being asked to automatically layout a
7619                          * new volume based on the current contents of
7620                          * the container.  If the the parameters can be
7621                          * satisfied reserve_space will record the disks,
7622                          * start offset, and size of the volume to be
7623                          * created.  add_to_super and getinfo_super
7624                          * detect when autolayout is in progress.
7625                          */
7626                         /* assuming that freesize is always given when array is
7627                            created */
7628                         if (super->orom && freesize) {
7629                                 int count;
7630                                 count = count_volumes(super->hba,
7631                                                       super->orom->dpa, verbose);
7632                                 if (super->orom->vphba <= count) {
7633                                         pr_vrb("platform does not support more than %d raid volumes.\n",
7634                                                super->orom->vphba);
7635                                         return 0;
7636                                 }
7637                         }
7638                         if (freesize)
7639                                 return reserve_space(st, raiddisks, size,
7640                                                      *chunk, freesize);
7641                 }
7642                 return 1;
7643         }
7644         if (st->sb) {
7645                 /* creating in a given container */
7646                 return validate_geometry_imsm_volume(st, level, layout,
7647                                                      raiddisks, chunk, size,
7648                                                      data_offset,
7649                                                      dev, freesize, verbose);
7650         }
7651
7652         /* This device needs to be a device in an 'imsm' container */
7653         fd = open(dev, O_RDONLY|O_EXCL, 0);
7654         if (fd >= 0) {
7655                 if (verbose)
7656                         pr_err("Cannot create this array on device %s\n",
7657                                dev);
7658                 close(fd);
7659                 return 0;
7660         }
7661         if (errno != EBUSY || (fd = open(dev, O_RDONLY, 0)) < 0) {
7662                 if (verbose)
7663                         pr_err("Cannot open %s: %s\n",
7664                                 dev, strerror(errno));
7665                 return 0;
7666         }
7667         /* Well, it is in use by someone, maybe an 'imsm' container. */
7668         cfd = open_container(fd);
7669         close(fd);
7670         if (cfd < 0) {
7671                 if (verbose)
7672                         pr_err("Cannot use %s: It is busy\n",
7673                                 dev);
7674                 return 0;
7675         }
7676         sra = sysfs_read(cfd, NULL, GET_VERSION);
7677         if (sra && sra->array.major_version == -1 &&
7678             strcmp(sra->text_version, "imsm") == 0)
7679                 is_member = 1;
7680         sysfs_free(sra);
7681         if (is_member) {
7682                 /* This is a member of a imsm container.  Load the container
7683                  * and try to create a volume
7684                  */
7685                 struct intel_super *super;
7686
7687                 if (load_super_imsm_all(st, cfd, (void **) &super, NULL, NULL, 1) == 0) {
7688                         st->sb = super;
7689                         strcpy(st->container_devnm, fd2devnm(cfd));
7690                         close(cfd);
7691                         return validate_geometry_imsm_volume(st, level, layout,
7692                                                              raiddisks, chunk,
7693                                                              size, data_offset, dev,
7694                                                              freesize, 1)
7695                                 ? 1 : -1;
7696                 }
7697         }
7698
7699         if (verbose)
7700                 pr_err("failed container membership check\n");
7701
7702         close(cfd);
7703         return 0;
7704 }
7705
7706 static void default_geometry_imsm(struct supertype *st, int *level, int *layout, int *chunk)
7707 {
7708         struct intel_super *super = st->sb;
7709
7710         if (level && *level == UnSet)
7711                 *level = LEVEL_CONTAINER;
7712
7713         if (level && layout && *layout == UnSet)
7714                 *layout = imsm_level_to_layout(*level);
7715
7716         if (chunk && (*chunk == UnSet || *chunk == 0))
7717                 *chunk = imsm_default_chunk(super->orom);
7718 }
7719
7720 static void handle_missing(struct intel_super *super, struct imsm_dev *dev);
7721
7722 static int kill_subarray_imsm(struct supertype *st, char *subarray_id)
7723 {
7724         /* remove the subarray currently referenced by subarray_id */
7725         __u8 i;
7726         struct intel_dev **dp;
7727         struct intel_super *super = st->sb;
7728         __u8 current_vol = strtoul(subarray_id, NULL, 10);
7729         struct imsm_super *mpb = super->anchor;
7730
7731         if (mpb->num_raid_devs == 0)
7732                 return 2;
7733
7734         /* block deletions that would change the uuid of active subarrays
7735          *
7736          * FIXME when immutable ids are available, but note that we'll
7737          * also need to fixup the invalidated/active subarray indexes in
7738          * mdstat
7739          */
7740         for (i = 0; i < mpb->num_raid_devs; i++) {
7741                 char subarray[4];
7742
7743                 if (i < current_vol)
7744                         continue;
7745                 sprintf(subarray, "%u", i);
7746                 if (is_subarray_active(subarray, st->devnm)) {
7747                         pr_err("deleting subarray-%d would change the UUID of active subarray-%d, aborting\n",
7748                                current_vol, i);
7749
7750                         return 2;
7751                 }
7752         }
7753
7754         if (st->update_tail) {
7755                 struct imsm_update_kill_array *u = xmalloc(sizeof(*u));
7756
7757                 u->type = update_kill_array;
7758                 u->dev_idx = current_vol;
7759                 append_metadata_update(st, u, sizeof(*u));
7760
7761                 return 0;
7762         }
7763
7764         for (dp = &super->devlist; *dp;)
7765                 if ((*dp)->index == current_vol) {
7766                         *dp = (*dp)->next;
7767                 } else {
7768                         handle_missing(super, (*dp)->dev);
7769                         if ((*dp)->index > current_vol)
7770                                 (*dp)->index--;
7771                         dp = &(*dp)->next;
7772                 }
7773
7774         /* no more raid devices, all active components are now spares,
7775          * but of course failed are still failed
7776          */
7777         if (--mpb->num_raid_devs == 0) {
7778                 struct dl *d;
7779
7780                 for (d = super->disks; d; d = d->next)
7781                         if (d->index > -2)
7782                                 mark_spare(d);
7783         }
7784
7785         super->updates_pending++;
7786
7787         return 0;
7788 }
7789
7790 static int get_rwh_policy_from_update(char *update)
7791 {
7792         if (strcmp(update, "ppl") == 0)
7793                 return RWH_MULTIPLE_DISTRIBUTED;
7794         else if (strcmp(update, "no-ppl") == 0)
7795                 return RWH_MULTIPLE_OFF;
7796         else if (strcmp(update, "bitmap") == 0)
7797                 return RWH_BITMAP;
7798         else if (strcmp(update, "no-bitmap") == 0)
7799                 return RWH_OFF;
7800         return -1;
7801 }
7802
7803 static int update_subarray_imsm(struct supertype *st, char *subarray,
7804                                 char *update, struct mddev_ident *ident)
7805 {
7806         /* update the subarray currently referenced by ->current_vol */
7807         struct intel_super *super = st->sb;
7808         struct imsm_super *mpb = super->anchor;
7809
7810         if (strcmp(update, "name") == 0) {
7811                 char *name = ident->name;
7812                 char *ep;
7813                 int vol;
7814
7815                 if (is_subarray_active(subarray, st->devnm)) {
7816                         pr_err("Unable to update name of active subarray\n");
7817                         return 2;
7818                 }
7819
7820                 if (!check_name(super, name, 0))
7821                         return 2;
7822
7823                 vol = strtoul(subarray, &ep, 10);
7824                 if (*ep != '\0' || vol >= super->anchor->num_raid_devs)
7825                         return 2;
7826
7827                 if (st->update_tail) {
7828                         struct imsm_update_rename_array *u = xmalloc(sizeof(*u));
7829
7830                         u->type = update_rename_array;
7831                         u->dev_idx = vol;
7832                         strncpy((char *) u->name, name, MAX_RAID_SERIAL_LEN);
7833                         u->name[MAX_RAID_SERIAL_LEN-1] = '\0';
7834                         append_metadata_update(st, u, sizeof(*u));
7835                 } else {
7836                         struct imsm_dev *dev;
7837                         int i, namelen;
7838
7839                         dev = get_imsm_dev(super, vol);
7840                         memset(dev->volume, '\0', MAX_RAID_SERIAL_LEN);
7841                         namelen = min((int)strlen(name), MAX_RAID_SERIAL_LEN);
7842                         memcpy(dev->volume, name, namelen);
7843                         for (i = 0; i < mpb->num_raid_devs; i++) {
7844                                 dev = get_imsm_dev(super, i);
7845                                 handle_missing(super, dev);
7846                         }
7847                         super->updates_pending++;
7848                 }
7849         } else if (get_rwh_policy_from_update(update) != -1) {
7850                 int new_policy;
7851                 char *ep;
7852                 int vol = strtoul(subarray, &ep, 10);
7853
7854                 if (*ep != '\0' || vol >= super->anchor->num_raid_devs)
7855                         return 2;
7856
7857                 new_policy = get_rwh_policy_from_update(update);
7858
7859                 if (st->update_tail) {
7860                         struct imsm_update_rwh_policy *u = xmalloc(sizeof(*u));
7861
7862                         u->type = update_rwh_policy;
7863                         u->dev_idx = vol;
7864                         u->new_policy = new_policy;
7865                         append_metadata_update(st, u, sizeof(*u));
7866                 } else {
7867                         struct imsm_dev *dev;
7868
7869                         dev = get_imsm_dev(super, vol);
7870                         dev->rwh_policy = new_policy;
7871                         super->updates_pending++;
7872                 }
7873                 if (new_policy == RWH_BITMAP)
7874                         return write_init_bitmap_imsm_vol(st, vol);
7875         } else
7876                 return 2;
7877
7878         return 0;
7879 }
7880
7881 static int is_gen_migration(struct imsm_dev *dev)
7882 {
7883         if (dev == NULL)
7884                 return 0;
7885
7886         if (!dev->vol.migr_state)
7887                 return 0;
7888
7889         if (migr_type(dev) == MIGR_GEN_MIGR)
7890                 return 1;
7891
7892         return 0;
7893 }
7894
7895 static int is_rebuilding(struct imsm_dev *dev)
7896 {
7897         struct imsm_map *migr_map;
7898
7899         if (!dev->vol.migr_state)
7900                 return 0;
7901
7902         if (migr_type(dev) != MIGR_REBUILD)
7903                 return 0;
7904
7905         migr_map = get_imsm_map(dev, MAP_1);
7906
7907         if (migr_map->map_state == IMSM_T_STATE_DEGRADED)
7908                 return 1;
7909         else
7910                 return 0;
7911 }
7912
7913 static int is_initializing(struct imsm_dev *dev)
7914 {
7915         struct imsm_map *migr_map;
7916
7917         if (!dev->vol.migr_state)
7918                 return 0;
7919
7920         if (migr_type(dev) != MIGR_INIT)
7921                 return 0;
7922
7923         migr_map = get_imsm_map(dev, MAP_1);
7924
7925         if (migr_map->map_state == IMSM_T_STATE_UNINITIALIZED)
7926                 return 1;
7927
7928         return 0;
7929 }
7930
7931 static void update_recovery_start(struct intel_super *super,
7932                                         struct imsm_dev *dev,
7933                                         struct mdinfo *array)
7934 {
7935         struct mdinfo *rebuild = NULL;
7936         struct mdinfo *d;
7937         __u32 units;
7938
7939         if (!is_rebuilding(dev))
7940                 return;
7941
7942         /* Find the rebuild target, but punt on the dual rebuild case */
7943         for (d = array->devs; d; d = d->next)
7944                 if (d->recovery_start == 0) {
7945                         if (rebuild)
7946                                 return;
7947                         rebuild = d;
7948                 }
7949
7950         if (!rebuild) {
7951                 /* (?) none of the disks are marked with
7952                  * IMSM_ORD_REBUILD, so assume they are missing and the
7953                  * disk_ord_tbl was not correctly updated
7954                  */
7955                 dprintf("failed to locate out-of-sync disk\n");
7956                 return;
7957         }
7958
7959         units = vol_curr_migr_unit(dev);
7960         rebuild->recovery_start = units * blocks_per_migr_unit(super, dev);
7961 }
7962
7963 static int recover_backup_imsm(struct supertype *st, struct mdinfo *info);
7964
7965 static struct mdinfo *container_content_imsm(struct supertype *st, char *subarray)
7966 {
7967         /* Given a container loaded by load_super_imsm_all,
7968          * extract information about all the arrays into
7969          * an mdinfo tree.
7970          * If 'subarray' is given, just extract info about that array.
7971          *
7972          * For each imsm_dev create an mdinfo, fill it in,
7973          *  then look for matching devices in super->disks
7974          *  and create appropriate device mdinfo.
7975          */
7976         struct intel_super *super = st->sb;
7977         struct imsm_super *mpb = super->anchor;
7978         struct mdinfo *rest = NULL;
7979         unsigned int i;
7980         int sb_errors = 0;
7981         struct dl *d;
7982         int spare_disks = 0;
7983         int current_vol = super->current_vol;
7984
7985         /* do not assemble arrays when not all attributes are supported */
7986         if (imsm_check_attributes(mpb->attributes) == 0) {
7987                 sb_errors = 1;
7988                 pr_err("Unsupported attributes in IMSM metadata.Arrays activation is blocked.\n");
7989         }
7990
7991         /* count spare devices, not used in maps
7992          */
7993         for (d = super->disks; d; d = d->next)
7994                 if (d->index == -1)
7995                         spare_disks++;
7996
7997         for (i = 0; i < mpb->num_raid_devs; i++) {
7998                 struct imsm_dev *dev;
7999                 struct imsm_map *map;
8000                 struct imsm_map *map2;
8001                 struct mdinfo *this;
8002                 int slot;
8003                 int chunk;
8004                 char *ep;
8005                 int level;
8006
8007                 if (subarray &&
8008                     (i != strtoul(subarray, &ep, 10) || *ep != '\0'))
8009                         continue;
8010
8011                 dev = get_imsm_dev(super, i);
8012                 map = get_imsm_map(dev, MAP_0);
8013                 map2 = get_imsm_map(dev, MAP_1);
8014                 level = get_imsm_raid_level(map);
8015
8016                 /* do not publish arrays that are in the middle of an
8017                  * unsupported migration
8018                  */
8019                 if (dev->vol.migr_state &&
8020                     (migr_type(dev) == MIGR_STATE_CHANGE)) {
8021                         pr_err("cannot assemble volume '%.16s': unsupported migration in progress\n",
8022                                 dev->volume);
8023                         continue;
8024                 }
8025                 /* do not publish arrays that are not support by controller's
8026                  * OROM/EFI
8027                  */
8028
8029                 this = xmalloc(sizeof(*this));
8030
8031                 super->current_vol = i;
8032                 getinfo_super_imsm_volume(st, this, NULL);
8033                 this->next = rest;
8034                 chunk = __le16_to_cpu(map->blocks_per_strip) >> 1;
8035                 /* mdadm does not support all metadata features- set the bit in all arrays state */
8036                 if (!validate_geometry_imsm_orom(super,
8037                                                  level, /* RAID level */
8038                                                  imsm_level_to_layout(level),
8039                                                  map->num_members, /* raid disks */
8040                                                  &chunk, imsm_dev_size(dev),
8041                                                  1 /* verbose */)) {
8042                         pr_err("IMSM RAID geometry validation failed.  Array %s activation is blocked.\n",
8043                                 dev->volume);
8044                         this->array.state |=
8045                           (1<<MD_SB_BLOCK_CONTAINER_RESHAPE) |
8046                           (1<<MD_SB_BLOCK_VOLUME);
8047                 }
8048
8049                 /* if array has bad blocks, set suitable bit in all arrays state */
8050                 if (sb_errors)
8051                         this->array.state |=
8052                           (1<<MD_SB_BLOCK_CONTAINER_RESHAPE) |
8053                           (1<<MD_SB_BLOCK_VOLUME);
8054
8055                 for (slot = 0 ; slot <  map->num_members; slot++) {
8056                         unsigned long long recovery_start;
8057                         struct mdinfo *info_d;
8058                         struct dl *d;
8059                         int idx;
8060                         int skip;
8061                         __u32 ord;
8062                         int missing = 0;
8063
8064                         skip = 0;
8065                         idx = get_imsm_disk_idx(dev, slot, MAP_0);
8066                         ord = get_imsm_ord_tbl_ent(dev, slot, MAP_X);
8067                         for (d = super->disks; d ; d = d->next)
8068                                 if (d->index == idx)
8069                                         break;
8070
8071                         recovery_start = MaxSector;
8072                         if (d == NULL)
8073                                 skip = 1;
8074                         if (d && is_failed(&d->disk))
8075                                 skip = 1;
8076                         if (!skip && (ord & IMSM_ORD_REBUILD))
8077                                 recovery_start = 0;
8078                         if (!(ord & IMSM_ORD_REBUILD))
8079                                 this->array.working_disks++;
8080                         /*
8081                          * if we skip some disks the array will be assmebled degraded;
8082                          * reset resync start to avoid a dirty-degraded
8083                          * situation when performing the intial sync
8084                          */
8085                         if (skip)
8086                                 missing++;
8087
8088                         if (!(dev->vol.dirty & RAIDVOL_DIRTY)) {
8089                                 if ((!able_to_resync(level, missing) ||
8090                                      recovery_start == 0))
8091                                         this->resync_start = MaxSector;
8092                         } else {
8093                                 /*
8094                                  * FIXME handle dirty degraded
8095                                  */
8096                         }
8097
8098                         if (skip)
8099                                 continue;
8100
8101                         info_d = xcalloc(1, sizeof(*info_d));
8102                         info_d->next = this->devs;
8103                         this->devs = info_d;
8104
8105                         info_d->disk.number = d->index;
8106                         info_d->disk.major = d->major;
8107                         info_d->disk.minor = d->minor;
8108                         info_d->disk.raid_disk = slot;
8109                         info_d->recovery_start = recovery_start;
8110                         if (map2) {
8111                                 if (slot < map2->num_members)
8112                                         info_d->disk.state = (1 << MD_DISK_ACTIVE);
8113                                 else
8114                                         this->array.spare_disks++;
8115                         } else {
8116                                 if (slot < map->num_members)
8117                                         info_d->disk.state = (1 << MD_DISK_ACTIVE);
8118                                 else
8119                                         this->array.spare_disks++;
8120                         }
8121
8122                         info_d->events = __le32_to_cpu(mpb->generation_num);
8123                         info_d->data_offset = pba_of_lba0(map);
8124                         info_d->component_size = calc_component_size(map, dev);
8125
8126                         if (map->raid_level == 5) {
8127                                 info_d->ppl_sector = this->ppl_sector;
8128                                 info_d->ppl_size = this->ppl_size;
8129                                 if (this->consistency_policy == CONSISTENCY_POLICY_PPL &&
8130                                     recovery_start == 0)
8131                                         this->resync_start = 0;
8132                         }
8133
8134                         info_d->bb.supported = 1;
8135                         get_volume_badblocks(super->bbm_log, ord_to_idx(ord),
8136                                              info_d->data_offset,
8137                                              info_d->component_size,
8138                                              &info_d->bb);
8139                 }
8140                 /* now that the disk list is up-to-date fixup recovery_start */
8141                 update_recovery_start(super, dev, this);
8142                 this->array.spare_disks += spare_disks;
8143
8144                 /* check for reshape */
8145                 if (this->reshape_active == 1)
8146                         recover_backup_imsm(st, this);
8147                 rest = this;
8148         }
8149
8150         super->current_vol = current_vol;
8151         return rest;
8152 }
8153
8154 static __u8 imsm_check_degraded(struct intel_super *super, struct imsm_dev *dev,
8155                                 int failed, int look_in_map)
8156 {
8157         struct imsm_map *map;
8158
8159         map = get_imsm_map(dev, look_in_map);
8160
8161         if (!failed)
8162                 return map->map_state == IMSM_T_STATE_UNINITIALIZED ?
8163                         IMSM_T_STATE_UNINITIALIZED : IMSM_T_STATE_NORMAL;
8164
8165         switch (get_imsm_raid_level(map)) {
8166         case 0:
8167                 return IMSM_T_STATE_FAILED;
8168                 break;
8169         case 1:
8170                 if (failed < map->num_members)
8171                         return IMSM_T_STATE_DEGRADED;
8172                 else
8173                         return IMSM_T_STATE_FAILED;
8174                 break;
8175         case 10:
8176         {
8177                 /**
8178                  * check to see if any mirrors have failed, otherwise we
8179                  * are degraded.  Even numbered slots are mirrored on
8180                  * slot+1
8181                  */
8182                 int i;
8183                 /* gcc -Os complains that this is unused */
8184                 int insync = insync;
8185
8186                 for (i = 0; i < map->num_members; i++) {
8187                         __u32 ord = get_imsm_ord_tbl_ent(dev, i, MAP_X);
8188                         int idx = ord_to_idx(ord);
8189                         struct imsm_disk *disk;
8190
8191                         /* reset the potential in-sync count on even-numbered
8192                          * slots.  num_copies is always 2 for imsm raid10
8193                          */
8194                         if ((i & 1) == 0)
8195                                 insync = 2;
8196
8197                         disk = get_imsm_disk(super, idx);
8198                         if (!disk || is_failed(disk) || ord & IMSM_ORD_REBUILD)
8199                                 insync--;
8200
8201                         /* no in-sync disks left in this mirror the
8202                          * array has failed
8203                          */
8204                         if (insync == 0)
8205                                 return IMSM_T_STATE_FAILED;
8206                 }
8207
8208                 return IMSM_T_STATE_DEGRADED;
8209         }
8210         case 5:
8211                 if (failed < 2)
8212                         return IMSM_T_STATE_DEGRADED;
8213                 else
8214                         return IMSM_T_STATE_FAILED;
8215                 break;
8216         default:
8217                 break;
8218         }
8219
8220         return map->map_state;
8221 }
8222
8223 static int imsm_count_failed(struct intel_super *super, struct imsm_dev *dev,
8224                              int look_in_map)
8225 {
8226         int i;
8227         int failed = 0;
8228         struct imsm_disk *disk;
8229         struct imsm_map *map = get_imsm_map(dev, MAP_0);
8230         struct imsm_map *prev = get_imsm_map(dev, MAP_1);
8231         struct imsm_map *map_for_loop;
8232         __u32 ord;
8233         int idx;
8234         int idx_1;
8235
8236         /* at the beginning of migration we set IMSM_ORD_REBUILD on
8237          * disks that are being rebuilt.  New failures are recorded to
8238          * map[0].  So we look through all the disks we started with and
8239          * see if any failures are still present, or if any new ones
8240          * have arrived
8241          */
8242         map_for_loop = map;
8243         if (prev && (map->num_members < prev->num_members))
8244                 map_for_loop = prev;
8245
8246         for (i = 0; i < map_for_loop->num_members; i++) {
8247                 idx_1 = -255;
8248                 /* when MAP_X is passed both maps failures are counted
8249                  */
8250                 if (prev &&
8251                     (look_in_map == MAP_1 || look_in_map == MAP_X) &&
8252                     i < prev->num_members) {
8253                         ord = __le32_to_cpu(prev->disk_ord_tbl[i]);
8254                         idx_1 = ord_to_idx(ord);
8255
8256                         disk = get_imsm_disk(super, idx_1);
8257                         if (!disk || is_failed(disk) || ord & IMSM_ORD_REBUILD)
8258                                 failed++;
8259                 }
8260                 if ((look_in_map == MAP_0 || look_in_map == MAP_X) &&
8261                     i < map->num_members) {
8262                         ord = __le32_to_cpu(map->disk_ord_tbl[i]);
8263                         idx = ord_to_idx(ord);
8264
8265                         if (idx != idx_1) {
8266                                 disk = get_imsm_disk(super, idx);
8267                                 if (!disk || is_failed(disk) ||
8268                                     ord & IMSM_ORD_REBUILD)
8269                                         failed++;
8270                         }
8271                 }
8272         }
8273
8274         return failed;
8275 }
8276
8277 static int imsm_open_new(struct supertype *c, struct active_array *a,
8278                          char *inst)
8279 {
8280         struct intel_super *super = c->sb;
8281         struct imsm_super *mpb = super->anchor;
8282         struct imsm_update_prealloc_bb_mem u;
8283
8284         if (atoi(inst) >= mpb->num_raid_devs) {
8285                 pr_err("subarry index %d, out of range\n", atoi(inst));
8286                 return -ENODEV;
8287         }
8288
8289         dprintf("imsm: open_new %s\n", inst);
8290         a->info.container_member = atoi(inst);
8291
8292         u.type = update_prealloc_badblocks_mem;
8293         imsm_update_metadata_locally(c, &u, sizeof(u));
8294
8295         return 0;
8296 }
8297
8298 static int is_resyncing(struct imsm_dev *dev)
8299 {
8300         struct imsm_map *migr_map;
8301
8302         if (!dev->vol.migr_state)
8303                 return 0;
8304
8305         if (migr_type(dev) == MIGR_INIT ||
8306             migr_type(dev) == MIGR_REPAIR)
8307                 return 1;
8308
8309         if (migr_type(dev) == MIGR_GEN_MIGR)
8310                 return 0;
8311
8312         migr_map = get_imsm_map(dev, MAP_1);
8313
8314         if (migr_map->map_state == IMSM_T_STATE_NORMAL &&
8315             dev->vol.migr_type != MIGR_GEN_MIGR)
8316                 return 1;
8317         else
8318                 return 0;
8319 }
8320
8321 /* return true if we recorded new information */
8322 static int mark_failure(struct intel_super *super,
8323                         struct imsm_dev *dev, struct imsm_disk *disk, int idx)
8324 {
8325         __u32 ord;
8326         int slot;
8327         struct imsm_map *map;
8328         char buf[MAX_RAID_SERIAL_LEN+3];
8329         unsigned int len, shift = 0;
8330
8331         /* new failures are always set in map[0] */
8332         map = get_imsm_map(dev, MAP_0);
8333
8334         slot = get_imsm_disk_slot(map, idx);
8335         if (slot < 0)
8336                 return 0;
8337
8338         ord = __le32_to_cpu(map->disk_ord_tbl[slot]);
8339         if (is_failed(disk) && (ord & IMSM_ORD_REBUILD))
8340                 return 0;
8341
8342         memcpy(buf, disk->serial, MAX_RAID_SERIAL_LEN);
8343         buf[MAX_RAID_SERIAL_LEN] = '\000';
8344         strcat(buf, ":0");
8345         if ((len = strlen(buf)) >= MAX_RAID_SERIAL_LEN)
8346                 shift = len - MAX_RAID_SERIAL_LEN + 1;
8347         memcpy(disk->serial, &buf[shift], len + 1 - shift);
8348
8349         disk->status |= FAILED_DISK;
8350         set_imsm_ord_tbl_ent(map, slot, idx | IMSM_ORD_REBUILD);
8351         /* mark failures in second map if second map exists and this disk
8352          * in this slot.
8353          * This is valid for migration, initialization and rebuild
8354          */
8355         if (dev->vol.migr_state) {
8356                 struct imsm_map *map2 = get_imsm_map(dev, MAP_1);
8357                 int slot2 = get_imsm_disk_slot(map2, idx);
8358
8359                 if (slot2 < map2->num_members && slot2 >= 0)
8360                         set_imsm_ord_tbl_ent(map2, slot2,
8361                                              idx | IMSM_ORD_REBUILD);
8362         }
8363         if (map->failed_disk_num == 0xff ||
8364                 (!is_rebuilding(dev) && map->failed_disk_num > slot))
8365                 map->failed_disk_num = slot;
8366
8367         clear_disk_badblocks(super->bbm_log, ord_to_idx(ord));
8368
8369         return 1;
8370 }
8371
8372 static void mark_missing(struct intel_super *super,
8373                          struct imsm_dev *dev, struct imsm_disk *disk, int idx)
8374 {
8375         mark_failure(super, dev, disk, idx);
8376
8377         if (disk->scsi_id == __cpu_to_le32(~(__u32)0))
8378                 return;
8379
8380         disk->scsi_id = __cpu_to_le32(~(__u32)0);
8381         memmove(&disk->serial[0], &disk->serial[1], MAX_RAID_SERIAL_LEN - 1);
8382 }
8383
8384 static void handle_missing(struct intel_super *super, struct imsm_dev *dev)
8385 {
8386         struct dl *dl;
8387
8388         if (!super->missing)
8389                 return;
8390
8391         /* When orom adds replacement for missing disk it does
8392          * not remove entry of missing disk, but just updates map with
8393          * new added disk. So it is not enough just to test if there is
8394          * any missing disk, we have to look if there are any failed disks
8395          * in map to stop migration */
8396
8397         dprintf("imsm: mark missing\n");
8398         /* end process for initialization and rebuild only
8399          */
8400         if (is_gen_migration(dev) == 0) {
8401                 int failed = imsm_count_failed(super, dev, MAP_0);
8402
8403                 if (failed) {
8404                         __u8 map_state;
8405                         struct imsm_map *map = get_imsm_map(dev, MAP_0);
8406                         struct imsm_map *map1;
8407                         int i, ord, ord_map1;
8408                         int rebuilt = 1;
8409
8410                         for (i = 0; i < map->num_members; i++) {
8411                                 ord = get_imsm_ord_tbl_ent(dev, i, MAP_0);
8412                                 if (!(ord & IMSM_ORD_REBUILD))
8413                                         continue;
8414
8415                                 map1 = get_imsm_map(dev, MAP_1);
8416                                 if (!map1)
8417                                         continue;
8418
8419                                 ord_map1 = __le32_to_cpu(map1->disk_ord_tbl[i]);
8420                                 if (ord_map1 & IMSM_ORD_REBUILD)
8421                                         rebuilt = 0;
8422                         }
8423
8424                         if (rebuilt) {
8425                                 map_state = imsm_check_degraded(super, dev,
8426                                                                 failed, MAP_0);
8427                                 end_migration(dev, super, map_state);
8428                         }
8429                 }
8430         }
8431         for (dl = super->missing; dl; dl = dl->next)
8432                 mark_missing(super, dev, &dl->disk, dl->index);
8433         super->updates_pending++;
8434 }
8435
8436 static unsigned long long imsm_set_array_size(struct imsm_dev *dev,
8437                                               long long new_size)
8438 {
8439         unsigned long long array_blocks;
8440         struct imsm_map *map = get_imsm_map(dev, MAP_0);
8441         int used_disks = imsm_num_data_members(map);
8442
8443         if (used_disks == 0) {
8444                 /* when problems occures
8445                  * return current array_blocks value
8446                  */
8447                 array_blocks = imsm_dev_size(dev);
8448
8449                 return array_blocks;
8450         }
8451
8452         /* set array size in metadata
8453          */
8454         if (new_size <= 0)
8455                 /* OLCE size change is caused by added disks
8456                  */
8457                 array_blocks = per_dev_array_size(map) * used_disks;
8458         else
8459                 /* Online Volume Size Change
8460                  * Using  available free space
8461                  */
8462                 array_blocks = new_size;
8463
8464         array_blocks = round_size_to_mb(array_blocks, used_disks);
8465         set_imsm_dev_size(dev, array_blocks);
8466
8467         return array_blocks;
8468 }
8469
8470 static void imsm_set_disk(struct active_array *a, int n, int state);
8471
8472 static void imsm_progress_container_reshape(struct intel_super *super)
8473 {
8474         /* if no device has a migr_state, but some device has a
8475          * different number of members than the previous device, start
8476          * changing the number of devices in this device to match
8477          * previous.
8478          */
8479         struct imsm_super *mpb = super->anchor;
8480         int prev_disks = -1;
8481         int i;
8482         int copy_map_size;
8483
8484         for (i = 0; i < mpb->num_raid_devs; i++) {
8485                 struct imsm_dev *dev = get_imsm_dev(super, i);
8486                 struct imsm_map *map = get_imsm_map(dev, MAP_0);
8487                 struct imsm_map *map2;
8488                 int prev_num_members;
8489
8490                 if (dev->vol.migr_state)
8491                         return;
8492
8493                 if (prev_disks == -1)
8494                         prev_disks = map->num_members;
8495                 if (prev_disks == map->num_members)
8496                         continue;
8497
8498                 /* OK, this array needs to enter reshape mode.
8499                  * i.e it needs a migr_state
8500                  */
8501
8502                 copy_map_size = sizeof_imsm_map(map);
8503                 prev_num_members = map->num_members;
8504                 map->num_members = prev_disks;
8505                 dev->vol.migr_state = 1;
8506                 set_vol_curr_migr_unit(dev, 0);
8507                 set_migr_type(dev, MIGR_GEN_MIGR);
8508                 for (i = prev_num_members;
8509                      i < map->num_members; i++)
8510                         set_imsm_ord_tbl_ent(map, i, i);
8511                 map2 = get_imsm_map(dev, MAP_1);
8512                 /* Copy the current map */
8513                 memcpy(map2, map, copy_map_size);
8514                 map2->num_members = prev_num_members;
8515
8516                 imsm_set_array_size(dev, -1);
8517                 super->clean_migration_record_by_mdmon = 1;
8518                 super->updates_pending++;
8519         }
8520 }
8521
8522 /* Handle dirty -> clean transititions, resync and reshape.  Degraded and rebuild
8523  * states are handled in imsm_set_disk() with one exception, when a
8524  * resync is stopped due to a new failure this routine will set the
8525  * 'degraded' state for the array.
8526  */
8527 static int imsm_set_array_state(struct active_array *a, int consistent)
8528 {
8529         int inst = a->info.container_member;
8530         struct intel_super *super = a->container->sb;
8531         struct imsm_dev *dev = get_imsm_dev(super, inst);
8532         struct imsm_map *map = get_imsm_map(dev, MAP_0);
8533         int failed = imsm_count_failed(super, dev, MAP_0);
8534         __u8 map_state = imsm_check_degraded(super, dev, failed, MAP_0);
8535         __u32 blocks_per_unit;
8536
8537         if (dev->vol.migr_state &&
8538             dev->vol.migr_type  == MIGR_GEN_MIGR) {
8539                 /* array state change is blocked due to reshape action
8540                  * We might need to
8541                  * - abort the reshape (if last_checkpoint is 0 and action!= reshape)
8542                  * - finish the reshape (if last_checkpoint is big and action != reshape)
8543                  * - update vol_curr_migr_unit
8544                  */
8545                 if (a->curr_action == reshape) {
8546                         /* still reshaping, maybe update vol_curr_migr_unit */
8547                         goto mark_checkpoint;
8548                 } else {
8549                         if (a->last_checkpoint == 0 && a->prev_action == reshape) {
8550                                 /* for some reason we aborted the reshape.
8551                                  *
8552                                  * disable automatic metadata rollback
8553                                  * user action is required to recover process
8554                                  */
8555                                 if (0) {
8556                                         struct imsm_map *map2 =
8557                                                 get_imsm_map(dev, MAP_1);
8558                                         dev->vol.migr_state = 0;
8559                                         set_migr_type(dev, 0);
8560                                         set_vol_curr_migr_unit(dev, 0);
8561                                         memcpy(map, map2,
8562                                                sizeof_imsm_map(map2));
8563                                         super->updates_pending++;
8564                                 }
8565                         }
8566                         if (a->last_checkpoint >= a->info.component_size) {
8567                                 unsigned long long array_blocks;
8568                                 int used_disks;
8569                                 struct mdinfo *mdi;
8570
8571                                 used_disks = imsm_num_data_members(map);
8572                                 if (used_disks > 0) {
8573                                         array_blocks =
8574                                                 per_dev_array_size(map) *
8575                                                 used_disks;
8576                                         array_blocks =
8577                                                 round_size_to_mb(array_blocks,
8578                                                                  used_disks);
8579                                         a->info.custom_array_size = array_blocks;
8580                                         /* encourage manager to update array
8581                                          * size
8582                                          */
8583
8584                                         a->check_reshape = 1;
8585                                 }
8586                                 /* finalize online capacity expansion/reshape */
8587                                 for (mdi = a->info.devs; mdi; mdi = mdi->next)
8588                                         imsm_set_disk(a,
8589                                                       mdi->disk.raid_disk,
8590                                                       mdi->curr_state);
8591
8592                                 imsm_progress_container_reshape(super);
8593                         }
8594                 }
8595         }
8596
8597         /* before we activate this array handle any missing disks */
8598         if (consistent == 2)
8599                 handle_missing(super, dev);
8600
8601         if (consistent == 2 &&
8602             (!is_resync_complete(&a->info) ||
8603              map_state != IMSM_T_STATE_NORMAL ||
8604              dev->vol.migr_state))
8605                 consistent = 0;
8606
8607         if (is_resync_complete(&a->info)) {
8608                 /* complete intialization / resync,
8609                  * recovery and interrupted recovery is completed in
8610                  * ->set_disk
8611                  */
8612                 if (is_resyncing(dev)) {
8613                         dprintf("imsm: mark resync done\n");
8614                         end_migration(dev, super, map_state);
8615                         super->updates_pending++;
8616                         a->last_checkpoint = 0;
8617                 }
8618         } else if ((!is_resyncing(dev) && !failed) &&
8619                    (imsm_reshape_blocks_arrays_changes(super) == 0)) {
8620                 /* mark the start of the init process if nothing is failed */
8621                 dprintf("imsm: mark resync start\n");
8622                 if (map->map_state == IMSM_T_STATE_UNINITIALIZED)
8623                         migrate(dev, super, IMSM_T_STATE_NORMAL, MIGR_INIT);
8624                 else
8625                         migrate(dev, super, IMSM_T_STATE_NORMAL, MIGR_REPAIR);
8626                 super->updates_pending++;
8627         }
8628
8629 mark_checkpoint:
8630         /* skip checkpointing for general migration,
8631          * it is controlled in mdadm
8632          */
8633         if (is_gen_migration(dev))
8634                 goto skip_mark_checkpoint;
8635
8636         /* check if we can update vol_curr_migr_unit from resync_start,
8637          * recovery_start
8638          */
8639         blocks_per_unit = blocks_per_migr_unit(super, dev);
8640         if (blocks_per_unit) {
8641                 set_vol_curr_migr_unit(dev,
8642                                        a->last_checkpoint / blocks_per_unit);
8643                 dprintf("imsm: mark checkpoint (%llu)\n",
8644                         vol_curr_migr_unit(dev));
8645                 super->updates_pending++;
8646         }
8647
8648 skip_mark_checkpoint:
8649         /* mark dirty / clean */
8650         if (((dev->vol.dirty & RAIDVOL_DIRTY) && consistent) ||
8651             (!(dev->vol.dirty & RAIDVOL_DIRTY) && !consistent)) {
8652                 dprintf("imsm: mark '%s'\n", consistent ? "clean" : "dirty");
8653                 if (consistent) {
8654                         dev->vol.dirty = RAIDVOL_CLEAN;
8655                 } else {
8656                         dev->vol.dirty = RAIDVOL_DIRTY;
8657                         if (dev->rwh_policy == RWH_DISTRIBUTED ||
8658                             dev->rwh_policy == RWH_MULTIPLE_DISTRIBUTED)
8659                                 dev->vol.dirty |= RAIDVOL_DSRECORD_VALID;
8660                 }
8661                 super->updates_pending++;
8662         }
8663
8664         return consistent;
8665 }
8666
8667 static int imsm_disk_slot_to_ord(struct active_array *a, int slot)
8668 {
8669         int inst = a->info.container_member;
8670         struct intel_super *super = a->container->sb;
8671         struct imsm_dev *dev = get_imsm_dev(super, inst);
8672         struct imsm_map *map = get_imsm_map(dev, MAP_0);
8673
8674         if (slot > map->num_members) {
8675                 pr_err("imsm: imsm_disk_slot_to_ord %d out of range 0..%d\n",
8676                        slot, map->num_members - 1);
8677                 return -1;
8678         }
8679
8680         if (slot < 0)
8681                 return -1;
8682
8683         return get_imsm_ord_tbl_ent(dev, slot, MAP_0);
8684 }
8685
8686 static void imsm_set_disk(struct active_array *a, int n, int state)
8687 {
8688         int inst = a->info.container_member;
8689         struct intel_super *super = a->container->sb;
8690         struct imsm_dev *dev = get_imsm_dev(super, inst);
8691         struct imsm_map *map = get_imsm_map(dev, MAP_0);
8692         struct imsm_disk *disk;
8693         struct mdinfo *mdi;
8694         int recovery_not_finished = 0;
8695         int failed;
8696         int ord;
8697         __u8 map_state;
8698         int rebuild_done = 0;
8699         int i;
8700
8701         ord = get_imsm_ord_tbl_ent(dev, n, MAP_X);
8702         if (ord < 0)
8703                 return;
8704
8705         dprintf("imsm: set_disk %d:%x\n", n, state);
8706         disk = get_imsm_disk(super, ord_to_idx(ord));
8707
8708         /* check for new failures */
8709         if (disk && (state & DS_FAULTY)) {
8710                 if (mark_failure(super, dev, disk, ord_to_idx(ord)))
8711                         super->updates_pending++;
8712         }
8713
8714         /* check if in_sync */
8715         if (state & DS_INSYNC && ord & IMSM_ORD_REBUILD && is_rebuilding(dev)) {
8716                 struct imsm_map *migr_map = get_imsm_map(dev, MAP_1);
8717
8718                 set_imsm_ord_tbl_ent(migr_map, n, ord_to_idx(ord));
8719                 rebuild_done = 1;
8720                 super->updates_pending++;
8721         }
8722
8723         failed = imsm_count_failed(super, dev, MAP_0);
8724         map_state = imsm_check_degraded(super, dev, failed, MAP_0);
8725
8726         /* check if recovery complete, newly degraded, or failed */
8727         dprintf("imsm: Detected transition to state ");
8728         switch (map_state) {
8729         case IMSM_T_STATE_NORMAL: /* transition to normal state */
8730                 dprintf("normal: ");
8731                 if (is_rebuilding(dev)) {
8732                         dprintf_cont("while rebuilding");
8733                         /* check if recovery is really finished */
8734                         for (mdi = a->info.devs; mdi ; mdi = mdi->next)
8735                                 if (mdi->recovery_start != MaxSector) {
8736                                         recovery_not_finished = 1;
8737                                         break;
8738                                 }
8739                         if (recovery_not_finished) {
8740                                 dprintf_cont("\n");
8741                                 dprintf("Rebuild has not finished yet, state not changed");
8742                                 if (a->last_checkpoint < mdi->recovery_start) {
8743                                         a->last_checkpoint = mdi->recovery_start;
8744                                         super->updates_pending++;
8745                                 }
8746                                 break;
8747                         }
8748                         end_migration(dev, super, map_state);
8749                         map->failed_disk_num = ~0;
8750                         super->updates_pending++;
8751                         a->last_checkpoint = 0;
8752                         break;
8753                 }
8754                 if (is_gen_migration(dev)) {
8755                         dprintf_cont("while general migration");
8756                         if (a->last_checkpoint >= a->info.component_size)
8757                                 end_migration(dev, super, map_state);
8758                         else
8759                                 map->map_state = map_state;
8760                         map->failed_disk_num = ~0;
8761                         super->updates_pending++;
8762                         break;
8763                 }
8764         break;
8765         case IMSM_T_STATE_DEGRADED: /* transition to degraded state */
8766                 dprintf_cont("degraded: ");
8767                 if (map->map_state != map_state && !dev->vol.migr_state) {
8768                         dprintf_cont("mark degraded");
8769                         map->map_state = map_state;
8770                         super->updates_pending++;
8771                         a->last_checkpoint = 0;
8772                         break;
8773                 }
8774                 if (is_rebuilding(dev)) {
8775                         dprintf_cont("while rebuilding ");
8776                         if (state & DS_FAULTY)  {
8777                                 dprintf_cont("removing failed drive ");
8778                                 if (n == map->failed_disk_num) {
8779                                         dprintf_cont("end migration");
8780                                         end_migration(dev, super, map_state);
8781                                         a->last_checkpoint = 0;
8782                                 } else {
8783                                         dprintf_cont("fail detected during rebuild, changing map state");
8784                                         map->map_state = map_state;
8785                                 }
8786                                 super->updates_pending++;
8787                         }
8788
8789                         if (!rebuild_done)
8790                                 break;
8791
8792                         /* check if recovery is really finished */
8793                         for (mdi = a->info.devs; mdi ; mdi = mdi->next)
8794                                 if (mdi->recovery_start != MaxSector) {
8795                                         recovery_not_finished = 1;
8796                                         break;
8797                                 }
8798                         if (recovery_not_finished) {
8799                                 dprintf_cont("\n");
8800                                 dprintf_cont("Rebuild has not finished yet");
8801                                 if (a->last_checkpoint < mdi->recovery_start) {
8802                                         a->last_checkpoint =
8803                                                 mdi->recovery_start;
8804                                         super->updates_pending++;
8805                                 }
8806                                 break;
8807                         }
8808
8809                         dprintf_cont(" Rebuild done, still degraded");
8810                         end_migration(dev, super, map_state);
8811                         a->last_checkpoint = 0;
8812                         super->updates_pending++;
8813
8814                         for (i = 0; i < map->num_members; i++) {
8815                                 int idx = get_imsm_ord_tbl_ent(dev, i, MAP_0);
8816
8817                                 if (idx & IMSM_ORD_REBUILD)
8818                                         map->failed_disk_num = i;
8819                         }
8820                         super->updates_pending++;
8821                         break;
8822                 }
8823                 if (is_gen_migration(dev)) {
8824                         dprintf_cont("while general migration");
8825                         if (a->last_checkpoint >= a->info.component_size)
8826                                 end_migration(dev, super, map_state);
8827                         else {
8828                                 map->map_state = map_state;
8829                                 manage_second_map(super, dev);
8830                         }
8831                         super->updates_pending++;
8832                         break;
8833                 }
8834                 if (is_initializing(dev)) {
8835                         dprintf_cont("while initialization.");
8836                         map->map_state = map_state;
8837                         super->updates_pending++;
8838                         break;
8839                 }
8840         break;
8841         case IMSM_T_STATE_FAILED: /* transition to failed state */
8842                 dprintf_cont("failed: ");
8843                 if (is_gen_migration(dev)) {
8844                         dprintf_cont("while general migration");
8845                         map->map_state = map_state;
8846                         super->updates_pending++;
8847                         break;
8848                 }
8849                 if (map->map_state != map_state) {
8850                         dprintf_cont("mark failed");
8851                         end_migration(dev, super, map_state);
8852                         super->updates_pending++;
8853                         a->last_checkpoint = 0;
8854                         break;
8855                 }
8856         break;
8857         default:
8858                 dprintf_cont("state %i\n", map_state);
8859         }
8860         dprintf_cont("\n");
8861 }
8862
8863 static int store_imsm_mpb(int fd, struct imsm_super *mpb)
8864 {
8865         void *buf = mpb;
8866         __u32 mpb_size = __le32_to_cpu(mpb->mpb_size);
8867         unsigned long long dsize;
8868         unsigned long long sectors;
8869         unsigned int sector_size;
8870
8871         if (!get_dev_sector_size(fd, NULL, &sector_size))
8872                 return 1;
8873         get_dev_size(fd, NULL, &dsize);
8874
8875         if (mpb_size > sector_size) {
8876                 /* -1 to account for anchor */
8877                 sectors = mpb_sectors(mpb, sector_size) - 1;
8878
8879                 /* write the extended mpb to the sectors preceeding the anchor */
8880                 if (lseek64(fd, dsize - (sector_size * (2 + sectors)),
8881                    SEEK_SET) < 0)
8882                         return 1;
8883
8884                 if ((unsigned long long)write(fd, buf + sector_size,
8885                    sector_size * sectors) != sector_size * sectors)
8886                         return 1;
8887         }
8888
8889         /* first block is stored on second to last sector of the disk */
8890         if (lseek64(fd, dsize - (sector_size * 2), SEEK_SET) < 0)
8891                 return 1;
8892
8893         if ((unsigned int)write(fd, buf, sector_size) != sector_size)
8894                 return 1;
8895
8896         return 0;
8897 }
8898
8899 static void imsm_sync_metadata(struct supertype *container)
8900 {
8901         struct intel_super *super = container->sb;
8902
8903         dprintf("sync metadata: %d\n", super->updates_pending);
8904         if (!super->updates_pending)
8905                 return;
8906
8907         write_super_imsm(container, 0);
8908
8909         super->updates_pending = 0;
8910 }
8911
8912 static struct dl *imsm_readd(struct intel_super *super, int idx, struct active_array *a)
8913 {
8914         struct imsm_dev *dev = get_imsm_dev(super, a->info.container_member);
8915         int i = get_imsm_disk_idx(dev, idx, MAP_X);
8916         struct dl *dl;
8917
8918         for (dl = super->disks; dl; dl = dl->next)
8919                 if (dl->index == i)
8920                         break;
8921
8922         if (dl && is_failed(&dl->disk))
8923                 dl = NULL;
8924
8925         if (dl)
8926                 dprintf("found %x:%x\n", dl->major, dl->minor);
8927
8928         return dl;
8929 }
8930
8931 static struct dl *imsm_add_spare(struct intel_super *super, int slot,
8932                                  struct active_array *a, int activate_new,
8933                                  struct mdinfo *additional_test_list)
8934 {
8935         struct imsm_dev *dev = get_imsm_dev(super, a->info.container_member);
8936         int idx = get_imsm_disk_idx(dev, slot, MAP_X);
8937         struct imsm_super *mpb = super->anchor;
8938         struct imsm_map *map;
8939         unsigned long long pos;
8940         struct mdinfo *d;
8941         struct extent *ex;
8942         int i, j;
8943         int found;
8944         __u32 array_start = 0;
8945         __u32 array_end = 0;
8946         struct dl *dl;
8947         struct mdinfo *test_list;
8948
8949         for (dl = super->disks; dl; dl = dl->next) {
8950                 /* If in this array, skip */
8951                 for (d = a->info.devs ; d ; d = d->next)
8952                         if (d->state_fd >= 0 &&
8953                             d->disk.major == dl->major &&
8954                             d->disk.minor == dl->minor) {
8955                                 dprintf("%x:%x already in array\n",
8956                                         dl->major, dl->minor);
8957                                 break;
8958                         }
8959                 if (d)
8960                         continue;
8961                 test_list = additional_test_list;
8962                 while (test_list) {
8963                         if (test_list->disk.major == dl->major &&
8964                             test_list->disk.minor == dl->minor) {
8965                                 dprintf("%x:%x already in additional test list\n",
8966                                         dl->major, dl->minor);
8967                                 break;
8968                         }
8969                         test_list = test_list->next;
8970                 }
8971                 if (test_list)
8972                         continue;
8973
8974                 /* skip in use or failed drives */
8975                 if (is_failed(&dl->disk) || idx == dl->index ||
8976                     dl->index == -2) {
8977                         dprintf("%x:%x status (failed: %d index: %d)\n",
8978                                 dl->major, dl->minor, is_failed(&dl->disk), idx);
8979                         continue;
8980                 }
8981
8982                 /* skip pure spares when we are looking for partially
8983                  * assimilated drives
8984                  */
8985                 if (dl->index == -1 && !activate_new)
8986                         continue;
8987
8988                 if (!drive_validate_sector_size(super, dl))
8989                         continue;
8990
8991                 /* Does this unused device have the requisite free space?
8992                  * It needs to be able to cover all member volumes
8993                  */
8994                 ex = get_extents(super, dl, 1);
8995                 if (!ex) {
8996                         dprintf("cannot get extents\n");
8997                         continue;
8998                 }
8999                 for (i = 0; i < mpb->num_raid_devs; i++) {
9000                         dev = get_imsm_dev(super, i);
9001                         map = get_imsm_map(dev, MAP_0);
9002
9003                         /* check if this disk is already a member of
9004                          * this array
9005                          */
9006                         if (get_imsm_disk_slot(map, dl->index) >= 0)
9007                                 continue;
9008
9009                         found = 0;
9010                         j = 0;
9011                         pos = 0;
9012                         array_start = pba_of_lba0(map);
9013                         array_end = array_start +
9014                                     per_dev_array_size(map) - 1;
9015
9016                         do {
9017                                 /* check that we can start at pba_of_lba0 with
9018                                  * num_data_stripes*blocks_per_stripe of space
9019                                  */
9020                                 if (array_start >= pos && array_end < ex[j].start) {
9021                                         found = 1;
9022                                         break;
9023                                 }
9024                                 pos = ex[j].start + ex[j].size;
9025                                 j++;
9026                         } while (ex[j-1].size);
9027
9028                         if (!found)
9029                                 break;
9030                 }
9031
9032                 free(ex);
9033                 if (i < mpb->num_raid_devs) {
9034                         dprintf("%x:%x does not have %u to %u available\n",
9035                                 dl->major, dl->minor, array_start, array_end);
9036                         /* No room */
9037                         continue;
9038                 }
9039                 return dl;
9040         }
9041
9042         return dl;
9043 }
9044
9045 static int imsm_rebuild_allowed(struct supertype *cont, int dev_idx, int failed)
9046 {
9047         struct imsm_dev *dev2;
9048         struct imsm_map *map;
9049         struct dl *idisk;
9050         int slot;
9051         int idx;
9052         __u8 state;
9053
9054         dev2 = get_imsm_dev(cont->sb, dev_idx);
9055         if (dev2) {
9056                 state = imsm_check_degraded(cont->sb, dev2, failed, MAP_0);
9057                 if (state == IMSM_T_STATE_FAILED) {
9058                         map = get_imsm_map(dev2, MAP_0);
9059                         if (!map)
9060                                 return 1;
9061                         for (slot = 0; slot < map->num_members; slot++) {
9062                                 /*
9063                                  * Check if failed disks are deleted from intel
9064                                  * disk list or are marked to be deleted
9065                                  */
9066                                 idx = get_imsm_disk_idx(dev2, slot, MAP_X);
9067                                 idisk = get_imsm_dl_disk(cont->sb, idx);
9068                                 /*
9069                                  * Do not rebuild the array if failed disks
9070                                  * from failed sub-array are not removed from
9071                                  * container.
9072                                  */
9073                                 if (idisk &&
9074                                     is_failed(&idisk->disk) &&
9075                                     (idisk->action != DISK_REMOVE))
9076                                         return 0;
9077                         }
9078                 }
9079         }
9080         return 1;
9081 }
9082
9083 static struct mdinfo *imsm_activate_spare(struct active_array *a,
9084                                           struct metadata_update **updates)
9085 {
9086         /**
9087          * Find a device with unused free space and use it to replace a
9088          * failed/vacant region in an array.  We replace failed regions one a
9089          * array at a time.  The result is that a new spare disk will be added
9090          * to the first failed array and after the monitor has finished
9091          * propagating failures the remainder will be consumed.
9092          *
9093          * FIXME add a capability for mdmon to request spares from another
9094          * container.
9095          */
9096
9097         struct intel_super *super = a->container->sb;
9098         int inst = a->info.container_member;
9099         struct imsm_dev *dev = get_imsm_dev(super, inst);
9100         struct imsm_map *map = get_imsm_map(dev, MAP_0);
9101         int failed = a->info.array.raid_disks;
9102         struct mdinfo *rv = NULL;
9103         struct mdinfo *d;
9104         struct mdinfo *di;
9105         struct metadata_update *mu;
9106         struct dl *dl;
9107         struct imsm_update_activate_spare *u;
9108         int num_spares = 0;
9109         int i;
9110         int allowed;
9111
9112         for (d = a->info.devs ; d ; d = d->next) {
9113                 if ((d->curr_state & DS_FAULTY) &&
9114                         d->state_fd >= 0)
9115                         /* wait for Removal to happen */
9116                         return NULL;
9117                 if (d->state_fd >= 0)
9118                         failed--;
9119         }
9120
9121         dprintf("imsm: activate spare: inst=%d failed=%d (%d) level=%d\n",
9122                 inst, failed, a->info.array.raid_disks, a->info.array.level);
9123
9124         if (imsm_reshape_blocks_arrays_changes(super))
9125                         return NULL;
9126
9127         /* Cannot activate another spare if rebuild is in progress already
9128          */
9129         if (is_rebuilding(dev)) {
9130                 dprintf("imsm: No spare activation allowed. Rebuild in progress already.\n");
9131                 return NULL;
9132         }
9133
9134         if (a->info.array.level == 4)
9135                 /* No repair for takeovered array
9136                  * imsm doesn't support raid4
9137                  */
9138                 return NULL;
9139
9140         if (imsm_check_degraded(super, dev, failed, MAP_0) !=
9141                         IMSM_T_STATE_DEGRADED)
9142                 return NULL;
9143
9144         if (get_imsm_map(dev, MAP_0)->map_state == IMSM_T_STATE_UNINITIALIZED) {
9145                 dprintf("imsm: No spare activation allowed. Volume is not initialized.\n");
9146                 return NULL;
9147         }
9148
9149         /*
9150          * If there are any failed disks check state of the other volume.
9151          * Block rebuild if the another one is failed until failed disks
9152          * are removed from container.
9153          */
9154         if (failed) {
9155                 dprintf("found failed disks in %.*s, check if there anotherfailed sub-array.\n",
9156                         MAX_RAID_SERIAL_LEN, dev->volume);
9157                 /* check if states of the other volumes allow for rebuild */
9158                 for (i = 0; i <  super->anchor->num_raid_devs; i++) {
9159                         if (i != inst) {
9160                                 allowed = imsm_rebuild_allowed(a->container,
9161                                                                i, failed);
9162                                 if (!allowed)
9163                                         return NULL;
9164                         }
9165                 }
9166         }
9167
9168         /* For each slot, if it is not working, find a spare */
9169         for (i = 0; i < a->info.array.raid_disks; i++) {
9170                 for (d = a->info.devs ; d ; d = d->next)
9171                         if (d->disk.raid_disk == i)
9172                                 break;
9173                 dprintf("found %d: %p %x\n", i, d, d?d->curr_state:0);
9174                 if (d && (d->state_fd >= 0))
9175                         continue;
9176
9177                 /*
9178                  * OK, this device needs recovery.  Try to re-add the
9179                  * previous occupant of this slot, if this fails see if
9180                  * we can continue the assimilation of a spare that was
9181                  * partially assimilated, finally try to activate a new
9182                  * spare.
9183                  */
9184                 dl = imsm_readd(super, i, a);
9185                 if (!dl)
9186                         dl = imsm_add_spare(super, i, a, 0, rv);
9187                 if (!dl)
9188                         dl = imsm_add_spare(super, i, a, 1, rv);
9189                 if (!dl)
9190                         continue;
9191
9192                 /* found a usable disk with enough space */
9193                 di = xcalloc(1, sizeof(*di));
9194
9195                 /* dl->index will be -1 in the case we are activating a
9196                  * pristine spare.  imsm_process_update() will create a
9197                  * new index in this case.  Once a disk is found to be
9198                  * failed in all member arrays it is kicked from the
9199                  * metadata
9200                  */
9201                 di->disk.number = dl->index;
9202
9203                 /* (ab)use di->devs to store a pointer to the device
9204                  * we chose
9205                  */
9206                 di->devs = (struct mdinfo *) dl;
9207
9208                 di->disk.raid_disk = i;
9209                 di->disk.major = dl->major;
9210                 di->disk.minor = dl->minor;
9211                 di->disk.state = 0;
9212                 di->recovery_start = 0;
9213                 di->data_offset = pba_of_lba0(map);
9214                 di->component_size = a->info.component_size;
9215                 di->container_member = inst;
9216                 di->bb.supported = 1;
9217                 if (a->info.consistency_policy == CONSISTENCY_POLICY_PPL) {
9218                         di->ppl_sector = get_ppl_sector(super, inst);
9219                         di->ppl_size = MULTIPLE_PPL_AREA_SIZE_IMSM >> 9;
9220                 }
9221                 super->random = random32();
9222                 di->next = rv;
9223                 rv = di;
9224                 num_spares++;
9225                 dprintf("%x:%x to be %d at %llu\n", dl->major, dl->minor,
9226                         i, di->data_offset);
9227         }
9228
9229         if (!rv)
9230                 /* No spares found */
9231                 return rv;
9232         /* Now 'rv' has a list of devices to return.
9233          * Create a metadata_update record to update the
9234          * disk_ord_tbl for the array
9235          */
9236         mu = xmalloc(sizeof(*mu));
9237         mu->buf = xcalloc(num_spares,
9238                           sizeof(struct imsm_update_activate_spare));
9239         mu->space = NULL;
9240         mu->space_list = NULL;
9241         mu->len = sizeof(struct imsm_update_activate_spare) * num_spares;
9242         mu->next = *updates;
9243         u = (struct imsm_update_activate_spare *) mu->buf;
9244
9245         for (di = rv ; di ; di = di->next) {
9246                 u->type = update_activate_spare;
9247                 u->dl = (struct dl *) di->devs;
9248                 di->devs = NULL;
9249                 u->slot = di->disk.raid_disk;
9250                 u->array = inst;
9251                 u->next = u + 1;
9252                 u++;
9253         }
9254         (u-1)->next = NULL;
9255         *updates = mu;
9256
9257         return rv;
9258 }
9259
9260 static int disks_overlap(struct intel_super *super, int idx, struct imsm_update_create_array *u)
9261 {
9262         struct imsm_dev *dev = get_imsm_dev(super, idx);
9263         struct imsm_map *map = get_imsm_map(dev, MAP_0);
9264         struct imsm_map *new_map = get_imsm_map(&u->dev, MAP_0);
9265         struct disk_info *inf = get_disk_info(u);
9266         struct imsm_disk *disk;
9267         int i;
9268         int j;
9269
9270         for (i = 0; i < map->num_members; i++) {
9271                 disk = get_imsm_disk(super, get_imsm_disk_idx(dev, i, MAP_X));
9272                 for (j = 0; j < new_map->num_members; j++)
9273                         if (serialcmp(disk->serial, inf[j].serial) == 0)
9274                                 return 1;
9275         }
9276
9277         return 0;
9278 }
9279
9280 static struct dl *get_disk_super(struct intel_super *super, int major, int minor)
9281 {
9282         struct dl *dl;
9283
9284         for (dl = super->disks; dl; dl = dl->next)
9285                 if (dl->major == major &&  dl->minor == minor)
9286                         return dl;
9287         return NULL;
9288 }
9289
9290 static int remove_disk_super(struct intel_super *super, int major, int minor)
9291 {
9292         struct dl *prev;
9293         struct dl *dl;
9294
9295         prev = NULL;
9296         for (dl = super->disks; dl; dl = dl->next) {
9297                 if (dl->major == major && dl->minor == minor) {
9298                         /* remove */
9299                         if (prev)
9300                                 prev->next = dl->next;
9301                         else
9302                                 super->disks = dl->next;
9303                         dl->next = NULL;
9304                         __free_imsm_disk(dl);
9305                         dprintf("removed %x:%x\n", major, minor);
9306                         break;
9307                 }
9308                 prev = dl;
9309         }
9310         return 0;
9311 }
9312
9313 static void imsm_delete(struct intel_super *super, struct dl **dlp, unsigned index);
9314
9315 static int add_remove_disk_update(struct intel_super *super)
9316 {
9317         int check_degraded = 0;
9318         struct dl *disk;
9319
9320         /* add/remove some spares to/from the metadata/contrainer */
9321         while (super->disk_mgmt_list) {
9322                 struct dl *disk_cfg;
9323
9324                 disk_cfg = super->disk_mgmt_list;
9325                 super->disk_mgmt_list = disk_cfg->next;
9326                 disk_cfg->next = NULL;
9327
9328                 if (disk_cfg->action == DISK_ADD) {
9329                         disk_cfg->next = super->disks;
9330                         super->disks = disk_cfg;
9331                         check_degraded = 1;
9332                         dprintf("added %x:%x\n",
9333                                 disk_cfg->major, disk_cfg->minor);
9334                 } else if (disk_cfg->action == DISK_REMOVE) {
9335                         dprintf("Disk remove action processed: %x.%x\n",
9336                                 disk_cfg->major, disk_cfg->minor);
9337                         disk = get_disk_super(super,
9338                                               disk_cfg->major,
9339                                               disk_cfg->minor);
9340                         if (disk) {
9341                                 /* store action status */
9342                                 disk->action = DISK_REMOVE;
9343                                 /* remove spare disks only */
9344                                 if (disk->index == -1) {
9345                                         remove_disk_super(super,
9346                                                           disk_cfg->major,
9347                                                           disk_cfg->minor);
9348                                 } else {
9349                                         disk_cfg->fd = disk->fd;
9350                                         disk->fd = -1;
9351                                 }
9352                         }
9353                         /* release allocate disk structure */
9354                         __free_imsm_disk(disk_cfg);
9355                 }
9356         }
9357         return check_degraded;
9358 }
9359
9360 static int apply_reshape_migration_update(struct imsm_update_reshape_migration *u,
9361                                                 struct intel_super *super,
9362                                                 void ***space_list)
9363 {
9364         struct intel_dev *id;
9365         void **tofree = NULL;
9366         int ret_val = 0;
9367
9368         dprintf("(enter)\n");
9369         if (u->subdev < 0 || u->subdev > 1) {
9370                 dprintf("imsm: Error: Wrong subdev: %i\n", u->subdev);
9371                 return ret_val;
9372         }
9373         if (space_list == NULL || *space_list == NULL) {
9374                 dprintf("imsm: Error: Memory is not allocated\n");
9375                 return ret_val;
9376         }
9377
9378         for (id = super->devlist ; id; id = id->next) {
9379                 if (id->index == (unsigned)u->subdev) {
9380                         struct imsm_dev *dev = get_imsm_dev(super, u->subdev);
9381                         struct imsm_map *map;
9382                         struct imsm_dev *new_dev =
9383                                 (struct imsm_dev *)*space_list;
9384                         struct imsm_map *migr_map = get_imsm_map(dev, MAP_1);
9385                         int to_state;
9386                         struct dl *new_disk;
9387
9388                         if (new_dev == NULL)
9389                                 return ret_val;
9390                         *space_list = **space_list;
9391                         memcpy(new_dev, dev, sizeof_imsm_dev(dev, 0));
9392                         map = get_imsm_map(new_dev, MAP_0);
9393                         if (migr_map) {
9394                                 dprintf("imsm: Error: migration in progress");
9395                                 return ret_val;
9396                         }
9397
9398                         to_state = map->map_state;
9399                         if ((u->new_level == 5) && (map->raid_level == 0)) {
9400                                 map->num_members++;
9401                                 /* this should not happen */
9402                                 if (u->new_disks[0] < 0) {
9403                                         map->failed_disk_num =
9404                                                 map->num_members - 1;
9405                                         to_state = IMSM_T_STATE_DEGRADED;
9406                                 } else
9407                                         to_state = IMSM_T_STATE_NORMAL;
9408                         }
9409                         migrate(new_dev, super, to_state, MIGR_GEN_MIGR);
9410                         if (u->new_level > -1)
9411                                 map->raid_level = u->new_level;
9412                         migr_map = get_imsm_map(new_dev, MAP_1);
9413                         if ((u->new_level == 5) &&
9414                             (migr_map->raid_level == 0)) {
9415                                 int ord = map->num_members - 1;
9416                                 migr_map->num_members--;
9417                                 if (u->new_disks[0] < 0)
9418                                         ord |= IMSM_ORD_REBUILD;
9419                                 set_imsm_ord_tbl_ent(map,
9420                                                      map->num_members - 1,
9421                                                      ord);
9422                         }
9423                         id->dev = new_dev;
9424                         tofree = (void **)dev;
9425
9426                         /* update chunk size
9427                          */
9428                         if (u->new_chunksize > 0) {
9429                                 unsigned long long num_data_stripes;
9430                                 struct imsm_map *dest_map =
9431                                         get_imsm_map(dev, MAP_0);
9432                                 int used_disks =
9433                                         imsm_num_data_members(dest_map);
9434
9435                                 if (used_disks == 0)
9436                                         return ret_val;
9437
9438                                 map->blocks_per_strip =
9439                                         __cpu_to_le16(u->new_chunksize * 2);
9440                                 num_data_stripes =
9441                                         imsm_dev_size(dev) / used_disks;
9442                                 num_data_stripes /= map->blocks_per_strip;
9443                                 num_data_stripes /= map->num_domains;
9444                                 set_num_data_stripes(map, num_data_stripes);
9445                         }
9446
9447                         /* ensure blocks_per_member has valid value
9448                          */
9449                         set_blocks_per_member(map,
9450                                               per_dev_array_size(map) +
9451                                               NUM_BLOCKS_DIRTY_STRIPE_REGION);
9452
9453                         /* add disk
9454                          */
9455                         if (u->new_level != 5 || migr_map->raid_level != 0 ||
9456                             migr_map->raid_level == map->raid_level)
9457                                 goto skip_disk_add;
9458
9459                         if (u->new_disks[0] >= 0) {
9460                                 /* use passes spare
9461                                  */
9462                                 new_disk = get_disk_super(super,
9463                                                         major(u->new_disks[0]),
9464                                                         minor(u->new_disks[0]));
9465                                 dprintf("imsm: new disk for reshape is: %i:%i (%p, index = %i)\n",
9466                                         major(u->new_disks[0]),
9467                                         minor(u->new_disks[0]),
9468                                         new_disk, new_disk->index);
9469                                 if (new_disk == NULL)
9470                                         goto error_disk_add;
9471
9472                                 new_disk->index = map->num_members - 1;
9473                                 /* slot to fill in autolayout
9474                                  */
9475                                 new_disk->raiddisk = new_disk->index;
9476                                 new_disk->disk.status |= CONFIGURED_DISK;
9477                                 new_disk->disk.status &= ~SPARE_DISK;
9478                         } else
9479                                 goto error_disk_add;
9480
9481 skip_disk_add:
9482                         *tofree = *space_list;
9483                         /* calculate new size
9484                          */
9485                         imsm_set_array_size(new_dev, -1);
9486
9487                         ret_val = 1;
9488                 }
9489         }
9490
9491         if (tofree)
9492                 *space_list = tofree;
9493         return ret_val;
9494
9495 error_disk_add:
9496         dprintf("Error: imsm: Cannot find disk.\n");
9497         return ret_val;
9498 }
9499
9500 static int apply_size_change_update(struct imsm_update_size_change *u,
9501                 struct intel_super *super)
9502 {
9503         struct intel_dev *id;
9504         int ret_val = 0;
9505
9506         dprintf("(enter)\n");
9507         if (u->subdev < 0 || u->subdev > 1) {
9508                 dprintf("imsm: Error: Wrong subdev: %i\n", u->subdev);
9509                 return ret_val;
9510         }
9511
9512         for (id = super->devlist ; id; id = id->next) {
9513                 if (id->index == (unsigned)u->subdev) {
9514                         struct imsm_dev *dev = get_imsm_dev(super, u->subdev);
9515                         struct imsm_map *map = get_imsm_map(dev, MAP_0);
9516                         int used_disks = imsm_num_data_members(map);
9517                         unsigned long long blocks_per_member;
9518                         unsigned long long num_data_stripes;
9519                         unsigned long long new_size_per_disk;
9520
9521                         if (used_disks == 0)
9522                                 return 0;
9523
9524                         /* calculate new size
9525                          */
9526                         new_size_per_disk = u->new_size / used_disks;
9527                         blocks_per_member = new_size_per_disk +
9528                                             NUM_BLOCKS_DIRTY_STRIPE_REGION;
9529                         num_data_stripes = new_size_per_disk /
9530                                            map->blocks_per_strip;
9531                         num_data_stripes /= map->num_domains;
9532                         dprintf("(size: %llu, blocks per member: %llu, num_data_stipes: %llu)\n",
9533                                 u->new_size, new_size_per_disk,
9534                                 num_data_stripes);
9535                         set_blocks_per_member(map, blocks_per_member);
9536                         set_num_data_stripes(map, num_data_stripes);
9537                         imsm_set_array_size(dev, u->new_size);
9538
9539                         ret_val = 1;
9540                         break;
9541                 }
9542         }
9543
9544         return ret_val;
9545 }
9546
9547 static int prepare_spare_to_activate(struct supertype *st,
9548                                      struct imsm_update_activate_spare *u)
9549 {
9550         struct intel_super *super = st->sb;
9551         int prev_current_vol = super->current_vol;
9552         struct active_array *a;
9553         int ret = 1;
9554
9555         for (a = st->arrays; a; a = a->next)
9556                 /*
9557                  * Additional initialization (adding bitmap header, filling
9558                  * the bitmap area with '1's to force initial rebuild for a whole
9559                  * data-area) is required when adding the spare to the volume
9560                  * with write-intent bitmap.
9561                  */
9562                 if (a->info.container_member == u->array &&
9563                     a->info.consistency_policy == CONSISTENCY_POLICY_BITMAP) {
9564                         struct dl *dl;
9565
9566                         for (dl = super->disks; dl; dl = dl->next)
9567                                 if (dl == u->dl)
9568                                         break;
9569                         if (!dl)
9570                                 break;
9571
9572                         super->current_vol = u->array;
9573                         if (st->ss->write_bitmap(st, dl->fd, NoUpdate))
9574                                 ret = 0;
9575                         super->current_vol = prev_current_vol;
9576                 }
9577         return ret;
9578 }
9579
9580 static int apply_update_activate_spare(struct imsm_update_activate_spare *u,
9581                                        struct intel_super *super,
9582                                        struct active_array *active_array)
9583 {
9584         struct imsm_super *mpb = super->anchor;
9585         struct imsm_dev *dev = get_imsm_dev(super, u->array);
9586         struct imsm_map *map = get_imsm_map(dev, MAP_0);
9587         struct imsm_map *migr_map;
9588         struct active_array *a;
9589         struct imsm_disk *disk;
9590         __u8 to_state;
9591         struct dl *dl;
9592         unsigned int found;
9593         int failed;
9594         int victim;
9595         int i;
9596         int second_map_created = 0;
9597
9598         for (; u; u = u->next) {
9599                 victim = get_imsm_disk_idx(dev, u->slot, MAP_X);
9600
9601                 if (victim < 0)
9602                         return 0;
9603
9604                 for (dl = super->disks; dl; dl = dl->next)
9605                         if (dl == u->dl)
9606                                 break;
9607
9608                 if (!dl) {
9609                         pr_err("error: imsm_activate_spare passed an unknown disk (index: %d)\n",
9610                                 u->dl->index);
9611                         return 0;
9612                 }
9613
9614                 /* count failures (excluding rebuilds and the victim)
9615                  * to determine map[0] state
9616                  */
9617                 failed = 0;
9618                 for (i = 0; i < map->num_members; i++) {
9619                         if (i == u->slot)
9620                                 continue;
9621                         disk = get_imsm_disk(super,
9622                                              get_imsm_disk_idx(dev, i, MAP_X));
9623                         if (!disk || is_failed(disk))
9624                                 failed++;
9625                 }
9626
9627                 /* adding a pristine spare, assign a new index */
9628                 if (dl->index < 0) {
9629                         dl->index = super->anchor->num_disks;
9630                         super->anchor->num_disks++;
9631                 }
9632                 disk = &dl->disk;
9633                 disk->status |= CONFIGURED_DISK;
9634                 disk->status &= ~SPARE_DISK;
9635
9636                 /* mark rebuild */
9637                 to_state = imsm_check_degraded(super, dev, failed, MAP_0);
9638                 if (!second_map_created) {
9639                         second_map_created = 1;
9640                         map->map_state = IMSM_T_STATE_DEGRADED;
9641                         migrate(dev, super, to_state, MIGR_REBUILD);
9642                 } else
9643                         map->map_state = to_state;
9644                 migr_map = get_imsm_map(dev, MAP_1);
9645                 set_imsm_ord_tbl_ent(map, u->slot, dl->index);
9646                 set_imsm_ord_tbl_ent(migr_map, u->slot,
9647                                      dl->index | IMSM_ORD_REBUILD);
9648
9649                 /* update the family_num to mark a new container
9650                  * generation, being careful to record the existing
9651                  * family_num in orig_family_num to clean up after
9652                  * earlier mdadm versions that neglected to set it.
9653                  */
9654                 if (mpb->orig_family_num == 0)
9655                         mpb->orig_family_num = mpb->family_num;
9656                 mpb->family_num += super->random;
9657
9658                 /* count arrays using the victim in the metadata */
9659                 found = 0;
9660                 for (a = active_array; a ; a = a->next) {
9661                         dev = get_imsm_dev(super, a->info.container_member);
9662                         map = get_imsm_map(dev, MAP_0);
9663
9664                         if (get_imsm_disk_slot(map, victim) >= 0)
9665                                 found++;
9666                 }
9667
9668                 /* delete the victim if it is no longer being
9669                  * utilized anywhere
9670                  */
9671                 if (!found) {
9672                         struct dl **dlp;
9673
9674                         /* We know that 'manager' isn't touching anything,
9675                          * so it is safe to delete
9676                          */
9677                         for (dlp = &super->disks; *dlp; dlp = &(*dlp)->next)
9678                                 if ((*dlp)->index == victim)
9679                                         break;
9680
9681                         /* victim may be on the missing list */
9682                         if (!*dlp)
9683                                 for (dlp = &super->missing; *dlp;
9684                                      dlp = &(*dlp)->next)
9685                                         if ((*dlp)->index == victim)
9686                                                 break;
9687                         imsm_delete(super, dlp, victim);
9688                 }
9689         }
9690
9691         return 1;
9692 }
9693
9694 static int apply_reshape_container_disks_update(struct imsm_update_reshape *u,
9695                                                 struct intel_super *super,
9696                                                 void ***space_list)
9697 {
9698         struct dl *new_disk;
9699         struct intel_dev *id;
9700         int i;
9701         int delta_disks = u->new_raid_disks - u->old_raid_disks;
9702         int disk_count = u->old_raid_disks;
9703         void **tofree = NULL;
9704         int devices_to_reshape = 1;
9705         struct imsm_super *mpb = super->anchor;
9706         int ret_val = 0;
9707         unsigned int dev_id;
9708
9709         dprintf("(enter)\n");
9710
9711         /* enable spares to use in array */
9712         for (i = 0; i < delta_disks; i++) {
9713                 new_disk = get_disk_super(super,
9714                                           major(u->new_disks[i]),
9715                                           minor(u->new_disks[i]));
9716                 dprintf("imsm: new disk for reshape is: %i:%i (%p, index = %i)\n",
9717                         major(u->new_disks[i]), minor(u->new_disks[i]),
9718                         new_disk, new_disk->index);
9719                 if (new_disk == NULL ||
9720                     (new_disk->index >= 0 &&
9721                      new_disk->index < u->old_raid_disks))
9722                         goto update_reshape_exit;
9723                 new_disk->index = disk_count++;
9724                 /* slot to fill in autolayout
9725                  */
9726                 new_disk->raiddisk = new_disk->index;
9727                 new_disk->disk.status |=
9728                         CONFIGURED_DISK;
9729                 new_disk->disk.status &= ~SPARE_DISK;
9730         }
9731
9732         dprintf("imsm: volume set mpb->num_raid_devs = %i\n",
9733                 mpb->num_raid_devs);
9734         /* manage changes in volume
9735          */
9736         for (dev_id = 0; dev_id < mpb->num_raid_devs; dev_id++) {
9737                 void **sp = *space_list;
9738                 struct imsm_dev *newdev;
9739                 struct imsm_map *newmap, *oldmap;
9740
9741                 for (id = super->devlist ; id; id = id->next) {
9742                         if (id->index == dev_id)
9743                                 break;
9744                 }
9745                 if (id == NULL)
9746                         break;
9747                 if (!sp)
9748                         continue;
9749                 *space_list = *sp;
9750                 newdev = (void*)sp;
9751                 /* Copy the dev, but not (all of) the map */
9752                 memcpy(newdev, id->dev, sizeof(*newdev));
9753                 oldmap = get_imsm_map(id->dev, MAP_0);
9754                 newmap = get_imsm_map(newdev, MAP_0);
9755                 /* Copy the current map */
9756                 memcpy(newmap, oldmap, sizeof_imsm_map(oldmap));
9757                 /* update one device only
9758                  */
9759                 if (devices_to_reshape) {
9760                         dprintf("imsm: modifying subdev: %i\n",
9761                                 id->index);
9762                         devices_to_reshape--;
9763                         newdev->vol.migr_state = 1;
9764                         set_vol_curr_migr_unit(newdev, 0);
9765                         set_migr_type(newdev, MIGR_GEN_MIGR);
9766                         newmap->num_members = u->new_raid_disks;
9767                         for (i = 0; i < delta_disks; i++) {
9768                                 set_imsm_ord_tbl_ent(newmap,
9769                                                      u->old_raid_disks + i,
9770                                                      u->old_raid_disks + i);
9771                         }
9772                         /* New map is correct, now need to save old map
9773                          */
9774                         newmap = get_imsm_map(newdev, MAP_1);
9775                         memcpy(newmap, oldmap, sizeof_imsm_map(oldmap));
9776
9777                         imsm_set_array_size(newdev, -1);
9778                 }
9779
9780                 sp = (void **)id->dev;
9781                 id->dev = newdev;
9782                 *sp = tofree;
9783                 tofree = sp;
9784
9785                 /* Clear migration record */
9786                 memset(super->migr_rec, 0, sizeof(struct migr_record));
9787         }
9788         if (tofree)
9789                 *space_list = tofree;
9790         ret_val = 1;
9791
9792 update_reshape_exit:
9793
9794         return ret_val;
9795 }
9796
9797 static int apply_takeover_update(struct imsm_update_takeover *u,
9798                                  struct intel_super *super,
9799                                  void ***space_list)
9800 {
9801         struct imsm_dev *dev = NULL;
9802         struct intel_dev *dv;
9803         struct imsm_dev *dev_new;
9804         struct imsm_map *map;
9805         struct dl *dm, *du;
9806         int i;
9807
9808         for (dv = super->devlist; dv; dv = dv->next)
9809                 if (dv->index == (unsigned int)u->subarray) {
9810                         dev = dv->dev;
9811                         break;
9812                 }
9813
9814         if (dev == NULL)
9815                 return 0;
9816
9817         map = get_imsm_map(dev, MAP_0);
9818
9819         if (u->direction == R10_TO_R0) {
9820                 unsigned long long num_data_stripes;
9821
9822                 /* Number of failed disks must be half of initial disk number */
9823                 if (imsm_count_failed(super, dev, MAP_0) !=
9824                                 (map->num_members / 2))
9825                         return 0;
9826
9827                 /* iterate through devices to mark removed disks as spare */
9828                 for (dm = super->disks; dm; dm = dm->next) {
9829                         if (dm->disk.status & FAILED_DISK) {
9830                                 int idx = dm->index;
9831                                 /* update indexes on the disk list */
9832 /* FIXME this loop-with-the-loop looks wrong,  I'm not convinced
9833    the index values will end up being correct.... NB */
9834                                 for (du = super->disks; du; du = du->next)
9835                                         if (du->index > idx)
9836                                                 du->index--;
9837                                 /* mark as spare disk */
9838                                 mark_spare(dm);
9839                         }
9840                 }
9841                 /* update map */
9842                 map->num_members = map->num_members / 2;
9843                 map->map_state = IMSM_T_STATE_NORMAL;
9844                 map->num_domains = 1;
9845                 map->raid_level = 0;
9846                 map->failed_disk_num = -1;
9847                 num_data_stripes = imsm_dev_size(dev) / 2;
9848                 num_data_stripes /= map->blocks_per_strip;
9849                 set_num_data_stripes(map, num_data_stripes);
9850         }
9851
9852         if (u->direction == R0_TO_R10) {
9853                 void **space;
9854                 unsigned long long num_data_stripes;
9855
9856                 /* update slots in current disk list */
9857                 for (dm = super->disks; dm; dm = dm->next) {
9858                         if (dm->index >= 0)
9859                                 dm->index *= 2;
9860                 }
9861                 /* create new *missing* disks */
9862                 for (i = 0; i < map->num_members; i++) {
9863                         space = *space_list;
9864                         if (!space)
9865                                 continue;
9866                         *space_list = *space;
9867                         du = (void *)space;
9868                         memcpy(du, super->disks, sizeof(*du));
9869                         du->fd = -1;
9870                         du->minor = 0;
9871                         du->major = 0;
9872                         du->index = (i * 2) + 1;
9873                         sprintf((char *)du->disk.serial,
9874                                 " MISSING_%d", du->index);
9875                         sprintf((char *)du->serial,
9876                                 "MISSING_%d", du->index);
9877                         du->next = super->missing;
9878                         super->missing = du;
9879                 }
9880                 /* create new dev and map */
9881                 space = *space_list;
9882                 if (!space)
9883                         return 0;
9884                 *space_list = *space;
9885                 dev_new = (void *)space;
9886                 memcpy(dev_new, dev, sizeof(*dev));
9887                 /* update new map */
9888                 map = get_imsm_map(dev_new, MAP_0);
9889                 map->num_members = map->num_members * 2;
9890                 map->map_state = IMSM_T_STATE_DEGRADED;
9891                 map->num_domains = 2;
9892                 map->raid_level = 1;
9893                 num_data_stripes = imsm_dev_size(dev) / 2;
9894                 num_data_stripes /= map->blocks_per_strip;
9895                 num_data_stripes /= map->num_domains;
9896                 set_num_data_stripes(map, num_data_stripes);
9897
9898                 /* replace dev<->dev_new */
9899                 dv->dev = dev_new;
9900         }
9901         /* update disk order table */
9902         for (du = super->disks; du; du = du->next)
9903                 if (du->index >= 0)
9904                         set_imsm_ord_tbl_ent(map, du->index, du->index);
9905         for (du = super->missing; du; du = du->next)
9906                 if (du->index >= 0) {
9907                         set_imsm_ord_tbl_ent(map, du->index, du->index);
9908                         mark_missing(super, dv->dev, &du->disk, du->index);
9909                 }
9910
9911         return 1;
9912 }
9913
9914 static void imsm_process_update(struct supertype *st,
9915                                 struct metadata_update *update)
9916 {
9917         /**
9918          * crack open the metadata_update envelope to find the update record
9919          * update can be one of:
9920          *    update_reshape_container_disks - all the arrays in the container
9921          *      are being reshaped to have more devices.  We need to mark
9922          *      the arrays for general migration and convert selected spares
9923          *      into active devices.
9924          *    update_activate_spare - a spare device has replaced a failed
9925          *      device in an array, update the disk_ord_tbl.  If this disk is
9926          *      present in all member arrays then also clear the SPARE_DISK
9927          *      flag
9928          *    update_create_array
9929          *    update_kill_array
9930          *    update_rename_array
9931          *    update_add_remove_disk
9932          */
9933         struct intel_super *super = st->sb;
9934         struct imsm_super *mpb;
9935         enum imsm_update_type type = *(enum imsm_update_type *) update->buf;
9936
9937         /* update requires a larger buf but the allocation failed */
9938         if (super->next_len && !super->next_buf) {
9939                 super->next_len = 0;
9940                 return;
9941         }
9942
9943         if (super->next_buf) {
9944                 memcpy(super->next_buf, super->buf, super->len);
9945                 free(super->buf);
9946                 super->len = super->next_len;
9947                 super->buf = super->next_buf;
9948
9949                 super->next_len = 0;
9950                 super->next_buf = NULL;
9951         }
9952
9953         mpb = super->anchor;
9954
9955         switch (type) {
9956         case update_general_migration_checkpoint: {
9957                 struct intel_dev *id;
9958                 struct imsm_update_general_migration_checkpoint *u =
9959                                                         (void *)update->buf;
9960
9961                 dprintf("called for update_general_migration_checkpoint\n");
9962
9963                 /* find device under general migration */
9964                 for (id = super->devlist ; id; id = id->next) {
9965                         if (is_gen_migration(id->dev)) {
9966                                 set_vol_curr_migr_unit(id->dev,
9967                                                    u->curr_migr_unit);
9968                                 super->updates_pending++;
9969                         }
9970                 }
9971                 break;
9972         }
9973         case update_takeover: {
9974                 struct imsm_update_takeover *u = (void *)update->buf;
9975                 if (apply_takeover_update(u, super, &update->space_list)) {
9976                         imsm_update_version_info(super);
9977                         super->updates_pending++;
9978                 }
9979                 break;
9980         }
9981
9982         case update_reshape_container_disks: {
9983                 struct imsm_update_reshape *u = (void *)update->buf;
9984                 if (apply_reshape_container_disks_update(
9985                             u, super, &update->space_list))
9986                         super->updates_pending++;
9987                 break;
9988         }
9989         case update_reshape_migration: {
9990                 struct imsm_update_reshape_migration *u = (void *)update->buf;
9991                 if (apply_reshape_migration_update(
9992                             u, super, &update->space_list))
9993                         super->updates_pending++;
9994                 break;
9995         }
9996         case update_size_change: {
9997                 struct imsm_update_size_change *u = (void *)update->buf;
9998                 if (apply_size_change_update(u, super))
9999                         super->updates_pending++;
10000                 break;
10001         }
10002         case update_activate_spare: {
10003                 struct imsm_update_activate_spare *u = (void *) update->buf;
10004
10005                 if (prepare_spare_to_activate(st, u) &&
10006                     apply_update_activate_spare(u, super, st->arrays))
10007                         super->updates_pending++;
10008                 break;
10009         }
10010         case update_create_array: {
10011                 /* someone wants to create a new array, we need to be aware of
10012                  * a few races/collisions:
10013                  * 1/ 'Create' called by two separate instances of mdadm
10014                  * 2/ 'Create' versus 'activate_spare': mdadm has chosen
10015                  *     devices that have since been assimilated via
10016                  *     activate_spare.
10017                  * In the event this update can not be carried out mdadm will
10018                  * (FIX ME) notice that its update did not take hold.
10019                  */
10020                 struct imsm_update_create_array *u = (void *) update->buf;
10021                 struct intel_dev *dv;
10022                 struct imsm_dev *dev;
10023                 struct imsm_map *map, *new_map;
10024                 unsigned long long start, end;
10025                 unsigned long long new_start, new_end;
10026                 int i;
10027                 struct disk_info *inf;
10028                 struct dl *dl;
10029
10030                 /* handle racing creates: first come first serve */
10031                 if (u->dev_idx < mpb->num_raid_devs) {
10032                         dprintf("subarray %d already defined\n", u->dev_idx);
10033                         goto create_error;
10034                 }
10035
10036                 /* check update is next in sequence */
10037                 if (u->dev_idx != mpb->num_raid_devs) {
10038                         dprintf("can not create array %d expected index %d\n",
10039                                 u->dev_idx, mpb->num_raid_devs);
10040                         goto create_error;
10041                 }
10042
10043                 new_map = get_imsm_map(&u->dev, MAP_0);
10044                 new_start = pba_of_lba0(new_map);
10045                 new_end = new_start + per_dev_array_size(new_map);
10046                 inf = get_disk_info(u);
10047
10048                 /* handle activate_spare versus create race:
10049                  * check to make sure that overlapping arrays do not include
10050                  * overalpping disks
10051                  */
10052                 for (i = 0; i < mpb->num_raid_devs; i++) {
10053                         dev = get_imsm_dev(super, i);
10054                         map = get_imsm_map(dev, MAP_0);
10055                         start = pba_of_lba0(map);
10056                         end = start + per_dev_array_size(map);
10057                         if ((new_start >= start && new_start <= end) ||
10058                             (start >= new_start && start <= new_end))
10059                                 /* overlap */;
10060                         else
10061                                 continue;
10062
10063                         if (disks_overlap(super, i, u)) {
10064                                 dprintf("arrays overlap\n");
10065                                 goto create_error;
10066                         }
10067                 }
10068
10069                 /* check that prepare update was successful */
10070                 if (!update->space) {
10071                         dprintf("prepare update failed\n");
10072                         goto create_error;
10073                 }
10074
10075                 /* check that all disks are still active before committing
10076                  * changes.  FIXME: could we instead handle this by creating a
10077                  * degraded array?  That's probably not what the user expects,
10078                  * so better to drop this update on the floor.
10079                  */
10080                 for (i = 0; i < new_map->num_members; i++) {
10081                         dl = serial_to_dl(inf[i].serial, super);
10082                         if (!dl) {
10083                                 dprintf("disk disappeared\n");
10084                                 goto create_error;
10085                         }
10086                 }
10087
10088                 super->updates_pending++;
10089
10090                 /* convert spares to members and fixup ord_tbl */
10091                 for (i = 0; i < new_map->num_members; i++) {
10092                         dl = serial_to_dl(inf[i].serial, super);
10093                         if (dl->index == -1) {
10094                                 dl->index = mpb->num_disks;
10095                                 mpb->num_disks++;
10096                                 dl->disk.status |= CONFIGURED_DISK;
10097                                 dl->disk.status &= ~SPARE_DISK;
10098                         }
10099                         set_imsm_ord_tbl_ent(new_map, i, dl->index);
10100                 }
10101
10102                 dv = update->space;
10103                 dev = dv->dev;
10104                 update->space = NULL;
10105                 imsm_copy_dev(dev, &u->dev);
10106                 dv->index = u->dev_idx;
10107                 dv->next = super->devlist;
10108                 super->devlist = dv;
10109                 mpb->num_raid_devs++;
10110
10111                 imsm_update_version_info(super);
10112                 break;
10113  create_error:
10114                 /* mdmon knows how to release update->space, but not
10115                  * ((struct intel_dev *) update->space)->dev
10116                  */
10117                 if (update->space) {
10118                         dv = update->space;
10119                         free(dv->dev);
10120                 }
10121                 break;
10122         }
10123         case update_kill_array: {
10124                 struct imsm_update_kill_array *u = (void *) update->buf;
10125                 int victim = u->dev_idx;
10126                 struct active_array *a;
10127                 struct intel_dev **dp;
10128                 struct imsm_dev *dev;
10129
10130                 /* sanity check that we are not affecting the uuid of
10131                  * active arrays, or deleting an active array
10132                  *
10133                  * FIXME when immutable ids are available, but note that
10134                  * we'll also need to fixup the invalidated/active
10135                  * subarray indexes in mdstat
10136                  */
10137                 for (a = st->arrays; a; a = a->next)
10138                         if (a->info.container_member >= victim)
10139                                 break;
10140                 /* by definition if mdmon is running at least one array
10141                  * is active in the container, so checking
10142                  * mpb->num_raid_devs is just extra paranoia
10143                  */
10144                 dev = get_imsm_dev(super, victim);
10145                 if (a || !dev || mpb->num_raid_devs == 1) {
10146                         dprintf("failed to delete subarray-%d\n", victim);
10147                         break;
10148                 }
10149
10150                 for (dp = &super->devlist; *dp;)
10151                         if ((*dp)->index == (unsigned)super->current_vol) {
10152                                 *dp = (*dp)->next;
10153                         } else {
10154                                 if ((*dp)->index > (unsigned)victim)
10155                                         (*dp)->index--;
10156                                 dp = &(*dp)->next;
10157                         }
10158                 mpb->num_raid_devs--;
10159                 super->updates_pending++;
10160                 break;
10161         }
10162         case update_rename_array: {
10163                 struct imsm_update_rename_array *u = (void *) update->buf;
10164                 char name[MAX_RAID_SERIAL_LEN+1];
10165                 int target = u->dev_idx;
10166                 struct active_array *a;
10167                 struct imsm_dev *dev;
10168
10169                 /* sanity check that we are not affecting the uuid of
10170                  * an active array
10171                  */
10172                 memset(name, 0, sizeof(name));
10173                 snprintf(name, MAX_RAID_SERIAL_LEN, "%s", (char *) u->name);
10174                 name[MAX_RAID_SERIAL_LEN] = '\0';
10175                 for (a = st->arrays; a; a = a->next)
10176                         if (a->info.container_member == target)
10177                                 break;
10178                 dev = get_imsm_dev(super, u->dev_idx);
10179                 if (a || !dev || !check_name(super, name, 1)) {
10180                         dprintf("failed to rename subarray-%d\n", target);
10181                         break;
10182                 }
10183
10184                 memcpy(dev->volume, name, MAX_RAID_SERIAL_LEN);
10185                 super->updates_pending++;
10186                 break;
10187         }
10188         case update_add_remove_disk: {
10189                 /* we may be able to repair some arrays if disks are
10190                  * being added, check the status of add_remove_disk
10191                  * if discs has been added.
10192                  */
10193                 if (add_remove_disk_update(super)) {
10194                         struct active_array *a;
10195
10196                         super->updates_pending++;
10197                         for (a = st->arrays; a; a = a->next)
10198                                 a->check_degraded = 1;
10199                 }
10200                 break;
10201         }
10202         case update_prealloc_badblocks_mem:
10203                 break;
10204         case update_rwh_policy: {
10205                 struct imsm_update_rwh_policy *u = (void *)update->buf;
10206                 int target = u->dev_idx;
10207                 struct imsm_dev *dev = get_imsm_dev(super, target);
10208                 if (!dev) {
10209                         dprintf("could not find subarray-%d\n", target);
10210                         break;
10211                 }
10212
10213                 if (dev->rwh_policy != u->new_policy) {
10214                         dev->rwh_policy = u->new_policy;
10215                         super->updates_pending++;
10216                 }
10217                 break;
10218         }
10219         default:
10220                 pr_err("error: unsupported process update type:(type: %d)\n",   type);
10221         }
10222 }
10223
10224 static struct mdinfo *get_spares_for_grow(struct supertype *st);
10225
10226 static int imsm_prepare_update(struct supertype *st,
10227                                struct metadata_update *update)
10228 {
10229         /**
10230          * Allocate space to hold new disk entries, raid-device entries or a new
10231          * mpb if necessary.  The manager synchronously waits for updates to
10232          * complete in the monitor, so new mpb buffers allocated here can be
10233          * integrated by the monitor thread without worrying about live pointers
10234          * in the manager thread.
10235          */
10236         enum imsm_update_type type;
10237         struct intel_super *super = st->sb;
10238         unsigned int sector_size = super->sector_size;
10239         struct imsm_super *mpb = super->anchor;
10240         size_t buf_len;
10241         size_t len = 0;
10242
10243         if (update->len < (int)sizeof(type))
10244                 return 0;
10245
10246         type = *(enum imsm_update_type *) update->buf;
10247
10248         switch (type) {
10249         case update_general_migration_checkpoint:
10250                 if (update->len < (int)sizeof(struct imsm_update_general_migration_checkpoint))
10251                         return 0;
10252                 dprintf("called for update_general_migration_checkpoint\n");
10253                 break;
10254         case update_takeover: {
10255                 struct imsm_update_takeover *u = (void *)update->buf;
10256                 if (update->len < (int)sizeof(*u))
10257                         return 0;
10258                 if (u->direction == R0_TO_R10) {
10259                         void **tail = (void **)&update->space_list;
10260                         struct imsm_dev *dev = get_imsm_dev(super, u->subarray);
10261                         struct imsm_map *map = get_imsm_map(dev, MAP_0);
10262                         int num_members = map->num_members;
10263                         void *space;
10264                         int size, i;
10265                         /* allocate memory for added disks */
10266                         for (i = 0; i < num_members; i++) {
10267                                 size = sizeof(struct dl);
10268                                 space = xmalloc(size);
10269                                 *tail = space;
10270                                 tail = space;
10271                                 *tail = NULL;
10272                         }
10273                         /* allocate memory for new device */
10274                         size = sizeof_imsm_dev(super->devlist->dev, 0) +
10275                                 (num_members * sizeof(__u32));
10276                         space = xmalloc(size);
10277                         *tail = space;
10278                         tail = space;
10279                         *tail = NULL;
10280                         len = disks_to_mpb_size(num_members * 2);
10281                 }
10282
10283                 break;
10284         }
10285         case update_reshape_container_disks: {
10286                 /* Every raid device in the container is about to
10287                  * gain some more devices, and we will enter a
10288                  * reconfiguration.
10289                  * So each 'imsm_map' will be bigger, and the imsm_vol
10290                  * will now hold 2 of them.
10291                  * Thus we need new 'struct imsm_dev' allocations sized
10292                  * as sizeof_imsm_dev but with more devices in both maps.
10293                  */
10294                 struct imsm_update_reshape *u = (void *)update->buf;
10295                 struct intel_dev *dl;
10296                 void **space_tail = (void**)&update->space_list;
10297
10298                 if (update->len < (int)sizeof(*u))
10299                         return 0;
10300
10301                 dprintf("for update_reshape\n");
10302
10303                 for (dl = super->devlist; dl; dl = dl->next) {
10304                         int size = sizeof_imsm_dev(dl->dev, 1);
10305                         void *s;
10306                         if (u->new_raid_disks > u->old_raid_disks)
10307                                 size += sizeof(__u32)*2*
10308                                         (u->new_raid_disks - u->old_raid_disks);
10309                         s = xmalloc(size);
10310                         *space_tail = s;
10311                         space_tail = s;
10312                         *space_tail = NULL;
10313                 }
10314
10315                 len = disks_to_mpb_size(u->new_raid_disks);
10316                 dprintf("New anchor length is %llu\n", (unsigned long long)len);
10317                 break;
10318         }
10319         case update_reshape_migration: {
10320                 /* for migration level 0->5 we need to add disks
10321                  * so the same as for container operation we will copy
10322                  * device to the bigger location.
10323                  * in memory prepared device and new disk area are prepared
10324                  * for usage in process update
10325                  */
10326                 struct imsm_update_reshape_migration *u = (void *)update->buf;
10327                 struct intel_dev *id;
10328                 void **space_tail = (void **)&update->space_list;
10329                 int size;
10330                 void *s;
10331                 int current_level = -1;
10332
10333                 if (update->len < (int)sizeof(*u))
10334                         return 0;
10335
10336                 dprintf("for update_reshape\n");
10337
10338                 /* add space for bigger array in update
10339                  */
10340                 for (id = super->devlist; id; id = id->next) {
10341                         if (id->index == (unsigned)u->subdev) {
10342                                 size = sizeof_imsm_dev(id->dev, 1);
10343                                 if (u->new_raid_disks > u->old_raid_disks)
10344                                         size += sizeof(__u32)*2*
10345                                         (u->new_raid_disks - u->old_raid_disks);
10346                                 s = xmalloc(size);
10347                                 *space_tail = s;
10348                                 space_tail = s;
10349                                 *space_tail = NULL;
10350                                 break;
10351                         }
10352                 }
10353                 if (update->space_list == NULL)
10354                         break;
10355
10356                 /* add space for disk in update
10357                  */
10358                 size = sizeof(struct dl);
10359                 s = xmalloc(size);
10360                 *space_tail = s;
10361                 space_tail = s;
10362                 *space_tail = NULL;
10363
10364                 /* add spare device to update
10365                  */
10366                 for (id = super->devlist ; id; id = id->next)
10367                         if (id->index == (unsigned)u->subdev) {
10368                                 struct imsm_dev *dev;
10369                                 struct imsm_map *map;
10370
10371                                 dev = get_imsm_dev(super, u->subdev);
10372                                 map = get_imsm_map(dev, MAP_0);
10373                                 current_level = map->raid_level;
10374                                 break;
10375                         }
10376                 if (u->new_level == 5 && u->new_level != current_level) {
10377                         struct mdinfo *spares;
10378
10379                         spares = get_spares_for_grow(st);
10380                         if (spares) {
10381                                 struct dl *dl;
10382                                 struct mdinfo *dev;
10383
10384                                 dev = spares->devs;
10385                                 if (dev) {
10386                                         u->new_disks[0] =
10387                                                 makedev(dev->disk.major,
10388                                                         dev->disk.minor);
10389                                         dl = get_disk_super(super,
10390                                                             dev->disk.major,
10391                                                             dev->disk.minor);
10392                                         dl->index = u->old_raid_disks;
10393                                         dev = dev->next;
10394                                 }
10395                                 sysfs_free(spares);
10396                         }
10397                 }
10398                 len = disks_to_mpb_size(u->new_raid_disks);
10399                 dprintf("New anchor length is %llu\n", (unsigned long long)len);
10400                 break;
10401         }
10402         case update_size_change: {
10403                 if (update->len < (int)sizeof(struct imsm_update_size_change))
10404                         return 0;
10405                 break;
10406         }
10407         case update_activate_spare: {
10408                 if (update->len < (int)sizeof(struct imsm_update_activate_spare))
10409                         return 0;
10410                 break;
10411         }
10412         case update_create_array: {
10413                 struct imsm_update_create_array *u = (void *) update->buf;
10414                 struct intel_dev *dv;
10415                 struct imsm_dev *dev = &u->dev;
10416                 struct imsm_map *map = get_imsm_map(dev, MAP_0);
10417                 struct dl *dl;
10418                 struct disk_info *inf;
10419                 int i;
10420                 int activate = 0;
10421
10422                 if (update->len < (int)sizeof(*u))
10423                         return 0;
10424
10425                 inf = get_disk_info(u);
10426                 len = sizeof_imsm_dev(dev, 1);
10427                 /* allocate a new super->devlist entry */
10428                 dv = xmalloc(sizeof(*dv));
10429                 dv->dev = xmalloc(len);
10430                 update->space = dv;
10431
10432                 /* count how many spares will be converted to members */
10433                 for (i = 0; i < map->num_members; i++) {
10434                         dl = serial_to_dl(inf[i].serial, super);
10435                         if (!dl) {
10436                                 /* hmm maybe it failed?, nothing we can do about
10437                                  * it here
10438                                  */
10439                                 continue;
10440                         }
10441                         if (count_memberships(dl, super) == 0)
10442                                 activate++;
10443                 }
10444                 len += activate * sizeof(struct imsm_disk);
10445                 break;
10446         }
10447         case update_kill_array: {
10448                 if (update->len < (int)sizeof(struct imsm_update_kill_array))
10449                         return 0;
10450                 break;
10451         }
10452         case update_rename_array: {
10453                 if (update->len < (int)sizeof(struct imsm_update_rename_array))
10454                         return 0;
10455                 break;
10456         }
10457         case update_add_remove_disk:
10458                 /* no update->len needed */
10459                 break;
10460         case update_prealloc_badblocks_mem:
10461                 super->extra_space += sizeof(struct bbm_log) -
10462                         get_imsm_bbm_log_size(super->bbm_log);
10463                 break;
10464         case update_rwh_policy: {
10465                 if (update->len < (int)sizeof(struct imsm_update_rwh_policy))
10466                         return 0;
10467                 break;
10468         }
10469         default:
10470                 return 0;
10471         }
10472
10473         /* check if we need a larger metadata buffer */
10474         if (super->next_buf)
10475                 buf_len = super->next_len;
10476         else
10477                 buf_len = super->len;
10478
10479         if (__le32_to_cpu(mpb->mpb_size) + super->extra_space + len > buf_len) {
10480                 /* ok we need a larger buf than what is currently allocated
10481                  * if this allocation fails process_update will notice that
10482                  * ->next_len is set and ->next_buf is NULL
10483                  */
10484                 buf_len = ROUND_UP(__le32_to_cpu(mpb->mpb_size) +
10485                                    super->extra_space + len, sector_size);
10486                 if (super->next_buf)
10487                         free(super->next_buf);
10488
10489                 super->next_len = buf_len;
10490                 if (posix_memalign(&super->next_buf, sector_size, buf_len) == 0)
10491                         memset(super->next_buf, 0, buf_len);
10492                 else
10493                         super->next_buf = NULL;
10494         }
10495         return 1;
10496 }
10497
10498 /* must be called while manager is quiesced */
10499 static void imsm_delete(struct intel_super *super, struct dl **dlp, unsigned index)
10500 {
10501         struct imsm_super *mpb = super->anchor;
10502         struct dl *iter;
10503         struct imsm_dev *dev;
10504         struct imsm_map *map;
10505         unsigned int i, j, num_members;
10506         __u32 ord, ord_map0;
10507         struct bbm_log *log = super->bbm_log;
10508
10509         dprintf("deleting device[%d] from imsm_super\n", index);
10510
10511         /* shift all indexes down one */
10512         for (iter = super->disks; iter; iter = iter->next)
10513                 if (iter->index > (int)index)
10514                         iter->index--;
10515         for (iter = super->missing; iter; iter = iter->next)
10516                 if (iter->index > (int)index)
10517                         iter->index--;
10518
10519         for (i = 0; i < mpb->num_raid_devs; i++) {
10520                 dev = get_imsm_dev(super, i);
10521                 map = get_imsm_map(dev, MAP_0);
10522                 num_members = map->num_members;
10523                 for (j = 0; j < num_members; j++) {
10524                         /* update ord entries being careful not to propagate
10525                          * ord-flags to the first map
10526                          */
10527                         ord = get_imsm_ord_tbl_ent(dev, j, MAP_X);
10528                         ord_map0 = get_imsm_ord_tbl_ent(dev, j, MAP_0);
10529
10530                         if (ord_to_idx(ord) <= index)
10531                                 continue;
10532
10533                         map = get_imsm_map(dev, MAP_0);
10534                         set_imsm_ord_tbl_ent(map, j, ord_map0 - 1);
10535                         map = get_imsm_map(dev, MAP_1);
10536                         if (map)
10537                                 set_imsm_ord_tbl_ent(map, j, ord - 1);
10538                 }
10539         }
10540
10541         for (i = 0; i < log->entry_count; i++) {
10542                 struct bbm_log_entry *entry = &log->marked_block_entries[i];
10543
10544                 if (entry->disk_ordinal <= index)
10545                         continue;
10546                 entry->disk_ordinal--;
10547         }
10548
10549         mpb->num_disks--;
10550         super->updates_pending++;
10551         if (*dlp) {
10552                 struct dl *dl = *dlp;
10553
10554                 *dlp = (*dlp)->next;
10555                 __free_imsm_disk(dl);
10556         }
10557 }
10558
10559 static int imsm_get_allowed_degradation(int level, int raid_disks,
10560                                         struct intel_super *super,
10561                                         struct imsm_dev *dev)
10562 {
10563         switch (level) {
10564         case 1:
10565         case 10:{
10566                 int ret_val = 0;
10567                 struct imsm_map *map;
10568                 int i;
10569
10570                 ret_val = raid_disks/2;
10571                 /* check map if all disks pairs not failed
10572                  * in both maps
10573                  */
10574                 map = get_imsm_map(dev, MAP_0);
10575                 for (i = 0; i < ret_val; i++) {
10576                         int degradation = 0;
10577                         if (get_imsm_disk(super, i) == NULL)
10578                                 degradation++;
10579                         if (get_imsm_disk(super, i + 1) == NULL)
10580                                 degradation++;
10581                         if (degradation == 2)
10582                                 return 0;
10583                 }
10584                 map = get_imsm_map(dev, MAP_1);
10585                 /* if there is no second map
10586                  * result can be returned
10587                  */
10588                 if (map == NULL)
10589                         return ret_val;
10590                 /* check degradation in second map
10591                  */
10592                 for (i = 0; i < ret_val; i++) {
10593                         int degradation = 0;
10594                 if (get_imsm_disk(super, i) == NULL)
10595                                 degradation++;
10596                         if (get_imsm_disk(super, i + 1) == NULL)
10597                                 degradation++;
10598                         if (degradation == 2)
10599                                 return 0;
10600                 }
10601                 return ret_val;
10602         }
10603         case 5:
10604                 return 1;
10605         case 6:
10606                 return 2;
10607         default:
10608                 return 0;
10609         }
10610 }
10611
10612 /*******************************************************************************
10613  * Function:    validate_container_imsm
10614  * Description: This routine validates container after assemble,
10615  *              eg. if devices in container are under the same controller.
10616  *
10617  * Parameters:
10618  *      info    : linked list with info about devices used in array
10619  * Returns:
10620  *      1 : HBA mismatch
10621  *      0 : Success
10622  ******************************************************************************/
10623 int validate_container_imsm(struct mdinfo *info)
10624 {
10625         if (check_env("IMSM_NO_PLATFORM"))
10626                 return 0;
10627
10628         struct sys_dev *idev;
10629         struct sys_dev *hba = NULL;
10630         struct sys_dev *intel_devices = find_intel_devices();
10631         char *dev_path = devt_to_devpath(makedev(info->disk.major,
10632                                                  info->disk.minor), 1, NULL);
10633
10634         for (idev = intel_devices; idev; idev = idev->next) {
10635                 if (dev_path && strstr(dev_path, idev->path)) {
10636                         hba = idev;
10637                         break;
10638                 }
10639         }
10640         if (dev_path)
10641                 free(dev_path);
10642
10643         if (!hba) {
10644                 pr_err("WARNING - Cannot detect HBA for device %s!\n",
10645                                 devid2kname(makedev(info->disk.major, info->disk.minor)));
10646                 return 1;
10647         }
10648
10649         const struct imsm_orom *orom = get_orom_by_device_id(hba->dev_id);
10650         struct mdinfo *dev;
10651
10652         for (dev = info->next; dev; dev = dev->next) {
10653                 dev_path = devt_to_devpath(makedev(dev->disk.major,
10654                                                    dev->disk.minor), 1, NULL);
10655
10656                 struct sys_dev *hba2 = NULL;
10657                 for (idev = intel_devices; idev; idev = idev->next) {
10658                         if (dev_path && strstr(dev_path, idev->path)) {
10659                                 hba2 = idev;
10660                                 break;
10661                         }
10662                 }
10663                 if (dev_path)
10664                         free(dev_path);
10665
10666                 const struct imsm_orom *orom2 = hba2 == NULL ? NULL :
10667                                 get_orom_by_device_id(hba2->dev_id);
10668
10669                 if (hba2 && hba->type != hba2->type) {
10670                         pr_err("WARNING - HBAs of devices do not match %s != %s\n",
10671                                 get_sys_dev_type(hba->type), get_sys_dev_type(hba2->type));
10672                         return 1;
10673                 }
10674
10675                 if (orom != orom2) {
10676                         pr_err("WARNING - IMSM container assembled with disks under different HBAs!\n"
10677                                 "       This operation is not supported and can lead to data loss.\n");
10678                         return 1;
10679                 }
10680
10681                 if (!orom) {
10682                         pr_err("WARNING - IMSM container assembled with disks under HBAs without IMSM platform support!\n"
10683                                 "       This operation is not supported and can lead to data loss.\n");
10684                         return 1;
10685                 }
10686         }
10687
10688         return 0;
10689 }
10690
10691 /*******************************************************************************
10692 * Function:   imsm_record_badblock
10693 * Description: This routine stores new bad block record in BBM log
10694 *
10695 * Parameters:
10696 *     a         : array containing a bad block
10697 *     slot      : disk number containing a bad block
10698 *     sector    : bad block sector
10699 *     length    : bad block sectors range
10700 * Returns:
10701 *     1 : Success
10702 *     0 : Error
10703 ******************************************************************************/
10704 static int imsm_record_badblock(struct active_array *a, int slot,
10705                           unsigned long long sector, int length)
10706 {
10707         struct intel_super *super = a->container->sb;
10708         int ord;
10709         int ret;
10710
10711         ord = imsm_disk_slot_to_ord(a, slot);
10712         if (ord < 0)
10713                 return 0;
10714
10715         ret = record_new_badblock(super->bbm_log, ord_to_idx(ord), sector,
10716                                    length);
10717         if (ret)
10718                 super->updates_pending++;
10719
10720         return ret;
10721 }
10722 /*******************************************************************************
10723 * Function:   imsm_clear_badblock
10724 * Description: This routine clears bad block record from BBM log
10725 *
10726 * Parameters:
10727 *     a         : array containing a bad block
10728 *     slot      : disk number containing a bad block
10729 *     sector    : bad block sector
10730 *     length    : bad block sectors range
10731 * Returns:
10732 *     1 : Success
10733 *     0 : Error
10734 ******************************************************************************/
10735 static int imsm_clear_badblock(struct active_array *a, int slot,
10736                         unsigned long long sector, int length)
10737 {
10738         struct intel_super *super = a->container->sb;
10739         int ord;
10740         int ret;
10741
10742         ord = imsm_disk_slot_to_ord(a, slot);
10743         if (ord < 0)
10744                 return 0;
10745
10746         ret = clear_badblock(super->bbm_log, ord_to_idx(ord), sector, length);
10747         if (ret)
10748                 super->updates_pending++;
10749
10750         return ret;
10751 }
10752 /*******************************************************************************
10753 * Function:   imsm_get_badblocks
10754 * Description: This routine get list of bad blocks for an array
10755 *
10756 * Parameters:
10757 *     a         : array
10758 *     slot      : disk number
10759 * Returns:
10760 *     bb        : structure containing bad blocks
10761 *     NULL      : error
10762 ******************************************************************************/
10763 static struct md_bb *imsm_get_badblocks(struct active_array *a, int slot)
10764 {
10765         int inst = a->info.container_member;
10766         struct intel_super *super = a->container->sb;
10767         struct imsm_dev *dev = get_imsm_dev(super, inst);
10768         struct imsm_map *map = get_imsm_map(dev, MAP_0);
10769         int ord;
10770
10771         ord = imsm_disk_slot_to_ord(a, slot);
10772         if (ord < 0)
10773                 return NULL;
10774
10775         get_volume_badblocks(super->bbm_log, ord_to_idx(ord), pba_of_lba0(map),
10776                              per_dev_array_size(map), &super->bb);
10777
10778         return &super->bb;
10779 }
10780 /*******************************************************************************
10781 * Function:   examine_badblocks_imsm
10782 * Description: Prints list of bad blocks on a disk to the standard output
10783 *
10784 * Parameters:
10785 *     st        : metadata handler
10786 *     fd        : open file descriptor for device
10787 *     devname   : device name
10788 * Returns:
10789 *     0 : Success
10790 *     1 : Error
10791 ******************************************************************************/
10792 static int examine_badblocks_imsm(struct supertype *st, int fd, char *devname)
10793 {
10794         struct intel_super *super = st->sb;
10795         struct bbm_log *log = super->bbm_log;
10796         struct dl *d = NULL;
10797         int any = 0;
10798
10799         for (d = super->disks; d ; d = d->next) {
10800                 if (strcmp(d->devname, devname) == 0)
10801                         break;
10802         }
10803
10804         if ((d == NULL) || (d->index < 0)) { /* serial mismatch probably */
10805                 pr_err("%s doesn't appear to be part of a raid array\n",
10806                        devname);
10807                 return 1;
10808         }
10809
10810         if (log != NULL) {
10811                 unsigned int i;
10812                 struct bbm_log_entry *entry = &log->marked_block_entries[0];
10813
10814                 for (i = 0; i < log->entry_count; i++) {
10815                         if (entry[i].disk_ordinal == d->index) {
10816                                 unsigned long long sector = __le48_to_cpu(
10817                                         &entry[i].defective_block_start);
10818                                 int cnt = entry[i].marked_count + 1;
10819
10820                                 if (!any) {
10821                                         printf("Bad-blocks on %s:\n", devname);
10822                                         any = 1;
10823                                 }
10824
10825                                 printf("%20llu for %d sectors\n", sector, cnt);
10826                         }
10827                 }
10828         }
10829
10830         if (!any)
10831                 printf("No bad-blocks list configured on %s\n", devname);
10832
10833         return 0;
10834 }
10835 /*******************************************************************************
10836  * Function:    init_migr_record_imsm
10837  * Description: Function inits imsm migration record
10838  * Parameters:
10839  *      super   : imsm internal array info
10840  *      dev     : device under migration
10841  *      info    : general array info to find the smallest device
10842  * Returns:
10843  *      none
10844  ******************************************************************************/
10845 void init_migr_record_imsm(struct supertype *st, struct imsm_dev *dev,
10846                            struct mdinfo *info)
10847 {
10848         struct intel_super *super = st->sb;
10849         struct migr_record *migr_rec = super->migr_rec;
10850         int new_data_disks;
10851         unsigned long long dsize, dev_sectors;
10852         long long unsigned min_dev_sectors = -1LLU;
10853         struct imsm_map *map_dest = get_imsm_map(dev, MAP_0);
10854         struct imsm_map *map_src = get_imsm_map(dev, MAP_1);
10855         unsigned long long num_migr_units;
10856         unsigned long long array_blocks;
10857         struct dl *dl_disk = NULL;
10858
10859         memset(migr_rec, 0, sizeof(struct migr_record));
10860         migr_rec->family_num = __cpu_to_le32(super->anchor->family_num);
10861
10862         /* only ascending reshape supported now */
10863         migr_rec->ascending_migr = __cpu_to_le32(1);
10864
10865         migr_rec->dest_depth_per_unit = GEN_MIGR_AREA_SIZE /
10866                 max(map_dest->blocks_per_strip, map_src->blocks_per_strip);
10867         migr_rec->dest_depth_per_unit *=
10868                 max(map_dest->blocks_per_strip, map_src->blocks_per_strip);
10869         new_data_disks = imsm_num_data_members(map_dest);
10870         migr_rec->blocks_per_unit =
10871                 __cpu_to_le32(migr_rec->dest_depth_per_unit * new_data_disks);
10872         migr_rec->dest_depth_per_unit =
10873                 __cpu_to_le32(migr_rec->dest_depth_per_unit);
10874         array_blocks = info->component_size * new_data_disks;
10875         num_migr_units =
10876                 array_blocks / __le32_to_cpu(migr_rec->blocks_per_unit);
10877
10878         if (array_blocks % __le32_to_cpu(migr_rec->blocks_per_unit))
10879                 num_migr_units++;
10880         set_num_migr_units(migr_rec, num_migr_units);
10881
10882         migr_rec->post_migr_vol_cap =  dev->size_low;
10883         migr_rec->post_migr_vol_cap_hi = dev->size_high;
10884
10885         /* Find the smallest dev */
10886         for (dl_disk =  super->disks; dl_disk ; dl_disk = dl_disk->next) {
10887                 /* ignore spares in container */
10888                 if (dl_disk->index < 0)
10889                         continue;
10890                 get_dev_size(dl_disk->fd, NULL, &dsize);
10891                 dev_sectors = dsize / 512;
10892                 if (dev_sectors < min_dev_sectors)
10893                         min_dev_sectors = dev_sectors;
10894         }
10895         set_migr_chkp_area_pba(migr_rec, min_dev_sectors -
10896                                         RAID_DISK_RESERVED_BLOCKS_IMSM_HI);
10897
10898         write_imsm_migr_rec(st);
10899
10900         return;
10901 }
10902
10903 /*******************************************************************************
10904  * Function:    save_backup_imsm
10905  * Description: Function saves critical data stripes to Migration Copy Area
10906  *              and updates the current migration unit status.
10907  *              Use restore_stripes() to form a destination stripe,
10908  *              and to write it to the Copy Area.
10909  * Parameters:
10910  *      st              : supertype information
10911  *      dev             : imsm device that backup is saved for
10912  *      info            : general array info
10913  *      buf             : input buffer
10914  *      length          : length of data to backup (blocks_per_unit)
10915  * Returns:
10916  *       0 : success
10917  *,     -1 : fail
10918  ******************************************************************************/
10919 int save_backup_imsm(struct supertype *st,
10920                      struct imsm_dev *dev,
10921                      struct mdinfo *info,
10922                      void *buf,
10923                      int length)
10924 {
10925         int rv = -1;
10926         struct intel_super *super = st->sb;
10927         unsigned long long *target_offsets;
10928         int *targets;
10929         int i;
10930         struct imsm_map *map_dest = get_imsm_map(dev, MAP_0);
10931         int new_disks = map_dest->num_members;
10932         int dest_layout = 0;
10933         int dest_chunk;
10934         unsigned long long start;
10935         int data_disks = imsm_num_data_members(map_dest);
10936
10937         targets = xmalloc(new_disks * sizeof(int));
10938
10939         for (i = 0; i < new_disks; i++) {
10940                 struct dl *dl_disk = get_imsm_dl_disk(super, i);
10941
10942                 targets[i] = dl_disk->fd;
10943         }
10944
10945         target_offsets = xcalloc(new_disks, sizeof(unsigned long long));
10946
10947         start = info->reshape_progress * 512;
10948         for (i = 0; i < new_disks; i++) {
10949                 target_offsets[i] = migr_chkp_area_pba(super->migr_rec) * 512;
10950                 /* move back copy area adderss, it will be moved forward
10951                  * in restore_stripes() using start input variable
10952                  */
10953                 target_offsets[i] -= start/data_disks;
10954         }
10955
10956         dest_layout = imsm_level_to_layout(map_dest->raid_level);
10957         dest_chunk = __le16_to_cpu(map_dest->blocks_per_strip) * 512;
10958
10959         if (restore_stripes(targets, /* list of dest devices */
10960                             target_offsets, /* migration record offsets */
10961                             new_disks,
10962                             dest_chunk,
10963                             map_dest->raid_level,
10964                             dest_layout,
10965                             -1,    /* source backup file descriptor */
10966                             0,     /* input buf offset
10967                                     * always 0 buf is already offseted */
10968                             start,
10969                             length,
10970                             buf) != 0) {
10971                 pr_err("Error restoring stripes\n");
10972                 goto abort;
10973         }
10974
10975         rv = 0;
10976
10977 abort:
10978         if (targets) {
10979                 free(targets);
10980         }
10981         free(target_offsets);
10982
10983         return rv;
10984 }
10985
10986 /*******************************************************************************
10987  * Function:    save_checkpoint_imsm
10988  * Description: Function called for current unit status update
10989  *              in the migration record. It writes it to disk.
10990  * Parameters:
10991  *      super   : imsm internal array info
10992  *      info    : general array info
10993  * Returns:
10994  *      0: success
10995  *      1: failure
10996  *      2: failure, means no valid migration record
10997  *                 / no general migration in progress /
10998  ******************************************************************************/
10999 int save_checkpoint_imsm(struct supertype *st, struct mdinfo *info, int state)
11000 {
11001         struct intel_super *super = st->sb;
11002         unsigned long long blocks_per_unit;
11003         unsigned long long curr_migr_unit;
11004
11005         if (load_imsm_migr_rec(super) != 0) {
11006                 dprintf("imsm: ERROR: Cannot read migration record for checkpoint save.\n");
11007                 return 1;
11008         }
11009
11010         blocks_per_unit = __le32_to_cpu(super->migr_rec->blocks_per_unit);
11011         if (blocks_per_unit == 0) {
11012                 dprintf("imsm: no migration in progress.\n");
11013                 return 2;
11014         }
11015         curr_migr_unit = info->reshape_progress / blocks_per_unit;
11016         /* check if array is alligned to copy area
11017          * if it is not alligned, add one to current migration unit value
11018          * this can happend on array reshape finish only
11019          */
11020         if (info->reshape_progress % blocks_per_unit)
11021                 curr_migr_unit++;
11022
11023         set_current_migr_unit(super->migr_rec, curr_migr_unit);
11024         super->migr_rec->rec_status = __cpu_to_le32(state);
11025         set_migr_dest_1st_member_lba(super->migr_rec,
11026                         super->migr_rec->dest_depth_per_unit * curr_migr_unit);
11027
11028         if (write_imsm_migr_rec(st) < 0) {
11029                 dprintf("imsm: Cannot write migration record outside backup area\n");
11030                 return 1;
11031         }
11032
11033         return 0;
11034 }
11035
11036 /*******************************************************************************
11037  * Function:    recover_backup_imsm
11038  * Description: Function recovers critical data from the Migration Copy Area
11039  *              while assembling an array.
11040  * Parameters:
11041  *      super   : imsm internal array info
11042  *      info    : general array info
11043  * Returns:
11044  *      0 : success (or there is no data to recover)
11045  *      1 : fail
11046  ******************************************************************************/
11047 int recover_backup_imsm(struct supertype *st, struct mdinfo *info)
11048 {
11049         struct intel_super *super = st->sb;
11050         struct migr_record *migr_rec = super->migr_rec;
11051         struct imsm_map *map_dest;
11052         struct intel_dev *id = NULL;
11053         unsigned long long read_offset;
11054         unsigned long long write_offset;
11055         unsigned unit_len;
11056         int new_disks, err;
11057         char *buf = NULL;
11058         int retval = 1;
11059         unsigned int sector_size = super->sector_size;
11060         unsigned long long curr_migr_unit = current_migr_unit(migr_rec);
11061         unsigned long long num_migr_units = get_num_migr_units(migr_rec);
11062         char buffer[20];
11063         int skipped_disks = 0;
11064         struct dl *dl_disk;
11065
11066         err = sysfs_get_str(info, NULL, "array_state", (char *)buffer, 20);
11067         if (err < 1)
11068                 return 1;
11069
11070         /* recover data only during assemblation */
11071         if (strncmp(buffer, "inactive", 8) != 0)
11072                 return 0;
11073         /* no data to recover */
11074         if (__le32_to_cpu(migr_rec->rec_status) == UNIT_SRC_NORMAL)
11075                 return 0;
11076         if (curr_migr_unit >= num_migr_units)
11077                 return 1;
11078
11079         /* find device during reshape */
11080         for (id = super->devlist; id; id = id->next)
11081                 if (is_gen_migration(id->dev))
11082                         break;
11083         if (id == NULL)
11084                 return 1;
11085
11086         map_dest = get_imsm_map(id->dev, MAP_0);
11087         new_disks = map_dest->num_members;
11088
11089         read_offset = migr_chkp_area_pba(migr_rec) * 512;
11090
11091         write_offset = (migr_dest_1st_member_lba(migr_rec) +
11092                         pba_of_lba0(map_dest)) * 512;
11093
11094         unit_len = __le32_to_cpu(migr_rec->dest_depth_per_unit) * 512;
11095         if (posix_memalign((void **)&buf, sector_size, unit_len) != 0)
11096                 goto abort;
11097
11098         for (dl_disk = super->disks; dl_disk; dl_disk = dl_disk->next) {
11099                 if (dl_disk->index < 0)
11100                         continue;
11101
11102                 if (dl_disk->fd < 0) {
11103                         skipped_disks++;
11104                         continue;
11105                 }
11106                 if (lseek64(dl_disk->fd, read_offset, SEEK_SET) < 0) {
11107                         pr_err("Cannot seek to block: %s\n",
11108                                strerror(errno));
11109                         skipped_disks++;
11110                         continue;
11111                 }
11112                 if (read(dl_disk->fd, buf, unit_len) != (ssize_t)unit_len) {
11113                         pr_err("Cannot read copy area block: %s\n",
11114                                strerror(errno));
11115                         skipped_disks++;
11116                         continue;
11117                 }
11118                 if (lseek64(dl_disk->fd, write_offset, SEEK_SET) < 0) {
11119                         pr_err("Cannot seek to block: %s\n",
11120                                strerror(errno));
11121                         skipped_disks++;
11122                         continue;
11123                 }
11124                 if (write(dl_disk->fd, buf, unit_len) != (ssize_t)unit_len) {
11125                         pr_err("Cannot restore block: %s\n",
11126                                strerror(errno));
11127                         skipped_disks++;
11128                         continue;
11129                 }
11130         }
11131
11132         if (skipped_disks > imsm_get_allowed_degradation(info->new_level,
11133                                                          new_disks,
11134                                                          super,
11135                                                          id->dev)) {
11136                 pr_err("Cannot restore data from backup. Too many failed disks\n");
11137                 goto abort;
11138         }
11139
11140         if (save_checkpoint_imsm(st, info, UNIT_SRC_NORMAL)) {
11141                 /* ignore error == 2, this can mean end of reshape here
11142                  */
11143                 dprintf("imsm: Cannot write checkpoint to migration record (UNIT_SRC_NORMAL) during restart\n");
11144         } else
11145                 retval = 0;
11146
11147 abort:
11148         free(buf);
11149         return retval;
11150 }
11151
11152 static char disk_by_path[] = "/dev/disk/by-path/";
11153
11154 static const char *imsm_get_disk_controller_domain(const char *path)
11155 {
11156         char disk_path[PATH_MAX];
11157         char *drv=NULL;
11158         struct stat st;
11159
11160         strcpy(disk_path, disk_by_path);
11161         strncat(disk_path, path, PATH_MAX - strlen(disk_path) - 1);
11162         if (stat(disk_path, &st) == 0) {
11163                 struct sys_dev* hba;
11164                 char *path;
11165
11166                 path = devt_to_devpath(st.st_rdev, 1, NULL);
11167                 if (path == NULL)
11168                         return "unknown";
11169                 hba = find_disk_attached_hba(-1, path);
11170                 if (hba && hba->type == SYS_DEV_SAS)
11171                         drv = "isci";
11172                 else if (hba && hba->type == SYS_DEV_SATA)
11173                         drv = "ahci";
11174                 else if (hba && hba->type == SYS_DEV_VMD)
11175                         drv = "vmd";
11176                 else if (hba && hba->type == SYS_DEV_NVME)
11177                         drv = "nvme";
11178                 else
11179                         drv = "unknown";
11180                 dprintf("path: %s hba: %s attached: %s\n",
11181                         path, (hba) ? hba->path : "NULL", drv);
11182                 free(path);
11183         }
11184         return drv;
11185 }
11186
11187 static char *imsm_find_array_devnm_by_subdev(int subdev, char *container)
11188 {
11189         static char devnm[32];
11190         char subdev_name[20];
11191         struct mdstat_ent *mdstat;
11192
11193         sprintf(subdev_name, "%d", subdev);
11194         mdstat = mdstat_by_subdev(subdev_name, container);
11195         if (!mdstat)
11196                 return NULL;
11197
11198         strcpy(devnm, mdstat->devnm);
11199         free_mdstat(mdstat);
11200         return devnm;
11201 }
11202
11203 static int imsm_reshape_is_allowed_on_container(struct supertype *st,
11204                                                 struct geo_params *geo,
11205                                                 int *old_raid_disks,
11206                                                 int direction)
11207 {
11208         /* currently we only support increasing the number of devices
11209          * for a container.  This increases the number of device for each
11210          * member array.  They must all be RAID0 or RAID5.
11211          */
11212         int ret_val = 0;
11213         struct mdinfo *info, *member;
11214         int devices_that_can_grow = 0;
11215
11216         dprintf("imsm: imsm_reshape_is_allowed_on_container(ENTER): st->devnm = (%s)\n", st->devnm);
11217
11218         if (geo->size > 0 ||
11219             geo->level != UnSet ||
11220             geo->layout != UnSet ||
11221             geo->chunksize != 0 ||
11222             geo->raid_disks == UnSet) {
11223                 dprintf("imsm: Container operation is allowed for raid disks number change only.\n");
11224                 return ret_val;
11225         }
11226
11227         if (direction == ROLLBACK_METADATA_CHANGES) {
11228                 dprintf("imsm: Metadata changes rollback is not supported for container operation.\n");
11229                 return ret_val;
11230         }
11231
11232         info = container_content_imsm(st, NULL);
11233         for (member = info; member; member = member->next) {
11234                 char *result;
11235
11236                 dprintf("imsm: checking device_num: %i\n",
11237                         member->container_member);
11238
11239                 if (geo->raid_disks <= member->array.raid_disks) {
11240                         /* we work on container for Online Capacity Expansion
11241                          * only so raid_disks has to grow
11242                          */
11243                         dprintf("imsm: for container operation raid disks increase is required\n");
11244                         break;
11245                 }
11246
11247                 if (info->array.level != 0 && info->array.level != 5) {
11248                         /* we cannot use this container with other raid level
11249                          */
11250                         dprintf("imsm: for container operation wrong raid level (%i) detected\n",
11251                                 info->array.level);
11252                         break;
11253                 } else {
11254                         /* check for platform support
11255                          * for this raid level configuration
11256                          */
11257                         struct intel_super *super = st->sb;
11258                         if (!is_raid_level_supported(super->orom,
11259                                                      member->array.level,
11260                                                      geo->raid_disks)) {
11261                                 dprintf("platform does not support raid%d with %d disk%s\n",
11262                                          info->array.level,
11263                                          geo->raid_disks,
11264                                          geo->raid_disks > 1 ? "s" : "");
11265                                 break;
11266                         }
11267                         /* check if component size is aligned to chunk size
11268                          */
11269                         if (info->component_size %
11270                             (info->array.chunk_size/512)) {
11271                                 dprintf("Component size is not aligned to chunk size\n");
11272                                 break;
11273                         }
11274                 }
11275
11276                 if (*old_raid_disks &&
11277                     info->array.raid_disks != *old_raid_disks)
11278                         break;
11279                 *old_raid_disks = info->array.raid_disks;
11280
11281                 /* All raid5 and raid0 volumes in container
11282                  * have to be ready for Online Capacity Expansion
11283                  * so they need to be assembled.  We have already
11284                  * checked that no recovery etc is happening.
11285                  */
11286                 result = imsm_find_array_devnm_by_subdev(member->container_member,
11287                                                          st->container_devnm);
11288                 if (result == NULL) {
11289                         dprintf("imsm: cannot find array\n");
11290                         break;
11291                 }
11292                 devices_that_can_grow++;
11293         }
11294         sysfs_free(info);
11295         if (!member && devices_that_can_grow)
11296                 ret_val = 1;
11297
11298         if (ret_val)
11299                 dprintf("Container operation allowed\n");
11300         else
11301                 dprintf("Error: %i\n", ret_val);
11302
11303         return ret_val;
11304 }
11305
11306 /* Function: get_spares_for_grow
11307  * Description: Allocates memory and creates list of spare devices
11308  *              avaliable in container. Checks if spare drive size is acceptable.
11309  * Parameters: Pointer to the supertype structure
11310  * Returns: Pointer to the list of spare devices (mdinfo structure) on success,
11311  *              NULL if fail
11312  */
11313 static struct mdinfo *get_spares_for_grow(struct supertype *st)
11314 {
11315         struct spare_criteria sc;
11316
11317         get_spare_criteria_imsm(st, &sc);
11318         return container_choose_spares(st, &sc, NULL, NULL, NULL, 0);
11319 }
11320
11321 /******************************************************************************
11322  * function: imsm_create_metadata_update_for_reshape
11323  * Function creates update for whole IMSM container.
11324  *
11325  ******************************************************************************/
11326 static int imsm_create_metadata_update_for_reshape(
11327         struct supertype *st,
11328         struct geo_params *geo,
11329         int old_raid_disks,
11330         struct imsm_update_reshape **updatep)
11331 {
11332         struct intel_super *super = st->sb;
11333         struct imsm_super *mpb = super->anchor;
11334         int update_memory_size;
11335         struct imsm_update_reshape *u;
11336         struct mdinfo *spares;
11337         int i;
11338         int delta_disks;
11339         struct mdinfo *dev;
11340
11341         dprintf("(enter) raid_disks = %i\n", geo->raid_disks);
11342
11343         delta_disks = geo->raid_disks - old_raid_disks;
11344
11345         /* size of all update data without anchor */
11346         update_memory_size = sizeof(struct imsm_update_reshape);
11347
11348         /* now add space for spare disks that we need to add. */
11349         update_memory_size += sizeof(u->new_disks[0]) * (delta_disks - 1);
11350
11351         u = xcalloc(1, update_memory_size);
11352         u->type = update_reshape_container_disks;
11353         u->old_raid_disks = old_raid_disks;
11354         u->new_raid_disks = geo->raid_disks;
11355
11356         /* now get spare disks list
11357          */
11358         spares = get_spares_for_grow(st);
11359
11360         if (spares == NULL || delta_disks > spares->array.spare_disks) {
11361                 pr_err("imsm: ERROR: Cannot get spare devices for %s.\n", geo->dev_name);
11362                 i = -1;
11363                 goto abort;
11364         }
11365
11366         /* we have got spares
11367          * update disk list in imsm_disk list table in anchor
11368          */
11369         dprintf("imsm: %i spares are available.\n\n",
11370                 spares->array.spare_disks);
11371
11372         dev = spares->devs;
11373         for (i = 0; i < delta_disks; i++) {
11374                 struct dl *dl;
11375
11376                 if (dev == NULL)
11377                         break;
11378                 u->new_disks[i] = makedev(dev->disk.major,
11379                                           dev->disk.minor);
11380                 dl = get_disk_super(super, dev->disk.major, dev->disk.minor);
11381                 dl->index = mpb->num_disks;
11382                 mpb->num_disks++;
11383                 dev = dev->next;
11384         }
11385
11386 abort:
11387         /* free spares
11388          */
11389         sysfs_free(spares);
11390
11391         dprintf("imsm: reshape update preparation :");
11392         if (i == delta_disks) {
11393                 dprintf_cont(" OK\n");
11394                 *updatep = u;
11395                 return update_memory_size;
11396         }
11397         free(u);
11398         dprintf_cont(" Error\n");
11399
11400         return 0;
11401 }
11402
11403 /******************************************************************************
11404  * function: imsm_create_metadata_update_for_size_change()
11405  *           Creates update for IMSM array for array size change.
11406  *
11407  ******************************************************************************/
11408 static int imsm_create_metadata_update_for_size_change(
11409                                 struct supertype *st,
11410                                 struct geo_params *geo,
11411                                 struct imsm_update_size_change **updatep)
11412 {
11413         struct intel_super *super = st->sb;
11414         int update_memory_size;
11415         struct imsm_update_size_change *u;
11416
11417         dprintf("(enter) New size = %llu\n", geo->size);
11418
11419         /* size of all update data without anchor */
11420         update_memory_size = sizeof(struct imsm_update_size_change);
11421
11422         u = xcalloc(1, update_memory_size);
11423         u->type = update_size_change;
11424         u->subdev = super->current_vol;
11425         u->new_size = geo->size;
11426
11427         dprintf("imsm: reshape update preparation : OK\n");
11428         *updatep = u;
11429
11430         return update_memory_size;
11431 }
11432
11433 /******************************************************************************
11434  * function: imsm_create_metadata_update_for_migration()
11435  *           Creates update for IMSM array.
11436  *
11437  ******************************************************************************/
11438 static int imsm_create_metadata_update_for_migration(
11439                                         struct supertype *st,
11440                                         struct geo_params *geo,
11441                                         struct imsm_update_reshape_migration **updatep)
11442 {
11443         struct intel_super *super = st->sb;
11444         int update_memory_size;
11445         struct imsm_update_reshape_migration *u;
11446         struct imsm_dev *dev;
11447         int previous_level = -1;
11448
11449         dprintf("(enter) New Level = %i\n", geo->level);
11450
11451         /* size of all update data without anchor */
11452         update_memory_size = sizeof(struct imsm_update_reshape_migration);
11453
11454         u = xcalloc(1, update_memory_size);
11455         u->type = update_reshape_migration;
11456         u->subdev = super->current_vol;
11457         u->new_level = geo->level;
11458         u->new_layout = geo->layout;
11459         u->new_raid_disks = u->old_raid_disks = geo->raid_disks;
11460         u->new_disks[0] = -1;
11461         u->new_chunksize = -1;
11462
11463         dev = get_imsm_dev(super, u->subdev);
11464         if (dev) {
11465                 struct imsm_map *map;
11466
11467                 map = get_imsm_map(dev, MAP_0);
11468                 if (map) {
11469                         int current_chunk_size =
11470                                 __le16_to_cpu(map->blocks_per_strip) / 2;
11471
11472                         if (geo->chunksize != current_chunk_size) {
11473                                 u->new_chunksize = geo->chunksize / 1024;
11474                                 dprintf("imsm: chunk size change from %i to %i\n",
11475                                         current_chunk_size, u->new_chunksize);
11476                         }
11477                         previous_level = map->raid_level;
11478                 }
11479         }
11480         if (geo->level == 5 && previous_level == 0) {
11481                 struct mdinfo *spares = NULL;
11482
11483                 u->new_raid_disks++;
11484                 spares = get_spares_for_grow(st);
11485                 if (spares == NULL || spares->array.spare_disks < 1) {
11486                         free(u);
11487                         sysfs_free(spares);
11488                         update_memory_size = 0;
11489                         pr_err("cannot get spare device for requested migration\n");
11490                         return 0;
11491                 }
11492                 sysfs_free(spares);
11493         }
11494         dprintf("imsm: reshape update preparation : OK\n");
11495         *updatep = u;
11496
11497         return update_memory_size;
11498 }
11499
11500 static void imsm_update_metadata_locally(struct supertype *st,
11501                                          void *buf, int len)
11502 {
11503         struct metadata_update mu;
11504
11505         mu.buf = buf;
11506         mu.len = len;
11507         mu.space = NULL;
11508         mu.space_list = NULL;
11509         mu.next = NULL;
11510         if (imsm_prepare_update(st, &mu))
11511                 imsm_process_update(st, &mu);
11512
11513         while (mu.space_list) {
11514                 void **space = mu.space_list;
11515                 mu.space_list = *space;
11516                 free(space);
11517         }
11518 }
11519
11520 /***************************************************************************
11521 * Function:     imsm_analyze_change
11522 * Description:  Function analyze change for single volume
11523 *               and validate if transition is supported
11524 * Parameters:   Geometry parameters, supertype structure,
11525 *               metadata change direction (apply/rollback)
11526 * Returns:      Operation type code on success, -1 if fail
11527 ****************************************************************************/
11528 enum imsm_reshape_type imsm_analyze_change(struct supertype *st,
11529                                            struct geo_params *geo,
11530                                            int direction)
11531 {
11532         struct mdinfo info;
11533         int change = -1;
11534         int check_devs = 0;
11535         int chunk;
11536         /* number of added/removed disks in operation result */
11537         int devNumChange = 0;
11538         /* imsm compatible layout value for array geometry verification */
11539         int imsm_layout = -1;
11540         int data_disks;
11541         struct imsm_dev *dev;
11542         struct imsm_map *map;
11543         struct intel_super *super;
11544         unsigned long long current_size;
11545         unsigned long long free_size;
11546         unsigned long long max_size;
11547         int rv;
11548
11549         getinfo_super_imsm_volume(st, &info, NULL);
11550         if (geo->level != info.array.level && geo->level >= 0 &&
11551             geo->level != UnSet) {
11552                 switch (info.array.level) {
11553                 case 0:
11554                         if (geo->level == 5) {
11555                                 change = CH_MIGRATION;
11556                                 if (geo->layout != ALGORITHM_LEFT_ASYMMETRIC) {
11557                                         pr_err("Error. Requested Layout not supported (left-asymmetric layout is supported only)!\n");
11558                                         change = -1;
11559                                         goto analyse_change_exit;
11560                                 }
11561                                 imsm_layout =  geo->layout;
11562                                 check_devs = 1;
11563                                 devNumChange = 1; /* parity disk added */
11564                         } else if (geo->level == 10) {
11565                                 change = CH_TAKEOVER;
11566                                 check_devs = 1;
11567                                 devNumChange = 2; /* two mirrors added */
11568                                 imsm_layout = 0x102; /* imsm supported layout */
11569                         }
11570                         break;
11571                 case 1:
11572                 case 10:
11573                         if (geo->level == 0) {
11574                                 change = CH_TAKEOVER;
11575                                 check_devs = 1;
11576                                 devNumChange = -(geo->raid_disks/2);
11577                                 imsm_layout = 0; /* imsm raid0 layout */
11578                         }
11579                         break;
11580                 }
11581                 if (change == -1) {
11582                         pr_err("Error. Level Migration from %d to %d not supported!\n",
11583                                info.array.level, geo->level);
11584                         goto analyse_change_exit;
11585                 }
11586         } else
11587                 geo->level = info.array.level;
11588
11589         if (geo->layout != info.array.layout &&
11590             (geo->layout != UnSet && geo->layout != -1)) {
11591                 change = CH_MIGRATION;
11592                 if (info.array.layout == 0 && info.array.level == 5 &&
11593                     geo->layout == 5) {
11594                         /* reshape 5 -> 4 */
11595                 } else if (info.array.layout == 5 && info.array.level == 5 &&
11596                            geo->layout == 0) {
11597                         /* reshape 4 -> 5 */
11598                         geo->layout = 0;
11599                         geo->level = 5;
11600                 } else {
11601                         pr_err("Error. Layout Migration from %d to %d not supported!\n",
11602                                info.array.layout, geo->layout);
11603                         change = -1;
11604                         goto analyse_change_exit;
11605                 }
11606         } else {
11607                 geo->layout = info.array.layout;
11608                 if (imsm_layout == -1)
11609                         imsm_layout = info.array.layout;
11610         }
11611
11612         if (geo->chunksize > 0 && geo->chunksize != UnSet &&
11613             geo->chunksize != info.array.chunk_size) {
11614                 if (info.array.level == 10) {
11615                         pr_err("Error. Chunk size change for RAID 10 is not supported.\n");
11616                         change = -1;
11617                         goto analyse_change_exit;
11618                 } else if (info.component_size % (geo->chunksize/512)) {
11619                         pr_err("New chunk size (%dK) does not evenly divide device size (%lluk). Aborting...\n",
11620                                geo->chunksize/1024, info.component_size/2);
11621                         change = -1;
11622                         goto analyse_change_exit;
11623                 }
11624                 change = CH_MIGRATION;
11625         } else {
11626                 geo->chunksize = info.array.chunk_size;
11627         }
11628
11629         chunk = geo->chunksize / 1024;
11630
11631         super = st->sb;
11632         dev = get_imsm_dev(super, super->current_vol);
11633         map = get_imsm_map(dev, MAP_0);
11634         data_disks = imsm_num_data_members(map);
11635         /* compute current size per disk member
11636          */
11637         current_size = info.custom_array_size / data_disks;
11638
11639         if (geo->size > 0 && geo->size != MAX_SIZE) {
11640                 /* align component size
11641                  */
11642                 geo->size = imsm_component_size_alignment_check(
11643                                     get_imsm_raid_level(dev->vol.map),
11644                                     chunk * 1024, super->sector_size,
11645                                     geo->size * 2);
11646                 if (geo->size == 0) {
11647                         pr_err("Error. Size expansion is supported only (current size is %llu, requested size /rounded/ is 0).\n",
11648                                    current_size);
11649                         goto analyse_change_exit;
11650                 }
11651         }
11652
11653         if (current_size != geo->size && geo->size > 0) {
11654                 if (change != -1) {
11655                         pr_err("Error. Size change should be the only one at a time.\n");
11656                         change = -1;
11657                         goto analyse_change_exit;
11658                 }
11659                 if ((super->current_vol + 1) != super->anchor->num_raid_devs) {
11660                         pr_err("Error. The last volume in container can be expanded only (%i/%s).\n",
11661                                super->current_vol, st->devnm);
11662                         goto analyse_change_exit;
11663                 }
11664                 /* check the maximum available size
11665                  */
11666                 rv =  imsm_get_free_size(st, dev->vol.map->num_members,
11667                                          0, chunk, &free_size);
11668                 if (rv == 0)
11669                         /* Cannot find maximum available space
11670                          */
11671                         max_size = 0;
11672                 else {
11673                         max_size = free_size + current_size;
11674                         /* align component size
11675                          */
11676                         max_size = imsm_component_size_alignment_check(
11677                                         get_imsm_raid_level(dev->vol.map),
11678                                         chunk * 1024, super->sector_size,
11679                                         max_size);
11680                 }
11681                 if (geo->size == MAX_SIZE) {
11682                         /* requested size change to the maximum available size
11683                          */
11684                         if (max_size == 0) {
11685                                 pr_err("Error. Cannot find maximum available space.\n");
11686                                 change = -1;
11687                                 goto analyse_change_exit;
11688                         } else
11689                                 geo->size = max_size;
11690                 }
11691
11692                 if (direction == ROLLBACK_METADATA_CHANGES) {
11693                         /* accept size for rollback only
11694                         */
11695                 } else {
11696                         /* round size due to metadata compatibility
11697                         */
11698                         geo->size = (geo->size >> SECT_PER_MB_SHIFT)
11699                                     << SECT_PER_MB_SHIFT;
11700                         dprintf("Prepare update for size change to %llu\n",
11701                                 geo->size );
11702                         if (current_size >= geo->size) {
11703                                 pr_err("Error. Size expansion is supported only (current size is %llu, requested size /rounded/ is %llu).\n",
11704                                        current_size, geo->size);
11705                                 goto analyse_change_exit;
11706                         }
11707                         if (max_size && geo->size > max_size) {
11708                                 pr_err("Error. Requested size is larger than maximum available size (maximum available size is %llu, requested size /rounded/ is %llu).\n",
11709                                        max_size, geo->size);
11710                                 goto analyse_change_exit;
11711                         }
11712                 }
11713                 geo->size *= data_disks;
11714                 geo->raid_disks = dev->vol.map->num_members;
11715                 change = CH_ARRAY_SIZE;
11716         }
11717         if (!validate_geometry_imsm(st,
11718                                     geo->level,
11719                                     imsm_layout,
11720                                     geo->raid_disks + devNumChange,
11721                                     &chunk,
11722                                     geo->size, INVALID_SECTORS,
11723                                     0, 0, info.consistency_policy, 1))
11724                 change = -1;
11725
11726         if (check_devs) {
11727                 struct intel_super *super = st->sb;
11728                 struct imsm_super *mpb = super->anchor;
11729
11730                 if (mpb->num_raid_devs > 1) {
11731                         pr_err("Error. Cannot perform operation on %s- for this operation it MUST be single array in container\n",
11732                                geo->dev_name);
11733                         change = -1;
11734                 }
11735         }
11736
11737 analyse_change_exit:
11738         if (direction == ROLLBACK_METADATA_CHANGES &&
11739             (change == CH_MIGRATION || change == CH_TAKEOVER)) {
11740                 dprintf("imsm: Metadata changes rollback is not supported for migration and takeover operations.\n");
11741                 change = -1;
11742         }
11743         return change;
11744 }
11745
11746 int imsm_takeover(struct supertype *st, struct geo_params *geo)
11747 {
11748         struct intel_super *super = st->sb;
11749         struct imsm_update_takeover *u;
11750
11751         u = xmalloc(sizeof(struct imsm_update_takeover));
11752
11753         u->type = update_takeover;
11754         u->subarray = super->current_vol;
11755
11756         /* 10->0 transition */
11757         if (geo->level == 0)
11758                 u->direction = R10_TO_R0;
11759
11760         /* 0->10 transition */
11761         if (geo->level == 10)
11762                 u->direction = R0_TO_R10;
11763
11764         /* update metadata locally */
11765         imsm_update_metadata_locally(st, u,
11766                                         sizeof(struct imsm_update_takeover));
11767         /* and possibly remotely */
11768         if (st->update_tail)
11769                 append_metadata_update(st, u,
11770                                         sizeof(struct imsm_update_takeover));
11771         else
11772                 free(u);
11773
11774         return 0;
11775 }
11776
11777 /* Flush size update if size calculated by num_data_stripes is higher than
11778  * imsm_dev_size to eliminate differences during reshape.
11779  * Mdmon will recalculate them correctly.
11780  * If subarray index is not set then check whole container.
11781  * Returns:
11782  *      0 - no error occurred
11783  *      1 - error detected
11784  */
11785 static int imsm_fix_size_mismatch(struct supertype *st, int subarray_index)
11786 {
11787         struct intel_super *super = st->sb;
11788         int tmp = super->current_vol;
11789         int ret_val = 1;
11790         int i;
11791
11792         for (i = 0; i < super->anchor->num_raid_devs; i++) {
11793                 if (subarray_index >= 0 && i != subarray_index)
11794                         continue;
11795                 super->current_vol = i;
11796                 struct imsm_dev *dev = get_imsm_dev(super, super->current_vol);
11797                 struct imsm_map *map = get_imsm_map(dev, MAP_0);
11798                 unsigned int disc_count = imsm_num_data_members(map);
11799                 struct geo_params geo;
11800                 struct imsm_update_size_change *update;
11801                 unsigned long long calc_size = per_dev_array_size(map) * disc_count;
11802                 unsigned long long d_size = imsm_dev_size(dev);
11803                 int u_size;
11804
11805                 if (calc_size == d_size || dev->vol.migr_type == MIGR_GEN_MIGR)
11806                         continue;
11807
11808                 /* There is a difference, confirm that imsm_dev_size is
11809                  * smaller and push update.
11810                  */
11811                 if (d_size > calc_size) {
11812                         pr_err("imsm: dev size of subarray %d is incorrect\n",
11813                                 i);
11814                         goto exit;
11815                 }
11816                 memset(&geo, 0, sizeof(struct geo_params));
11817                 geo.size = d_size;
11818                 u_size = imsm_create_metadata_update_for_size_change(st, &geo,
11819                                                                      &update);
11820                 if (u_size < 1) {
11821                         dprintf("imsm: Cannot prepare size change update\n");
11822                         goto exit;
11823                 }
11824                 imsm_update_metadata_locally(st, update, u_size);
11825                 if (st->update_tail) {
11826                         append_metadata_update(st, update, u_size);
11827                         flush_metadata_updates(st);
11828                         st->update_tail = &st->updates;
11829                 } else {
11830                         imsm_sync_metadata(st);
11831                 }
11832         }
11833         ret_val = 0;
11834 exit:
11835         super->current_vol = tmp;
11836         return ret_val;
11837 }
11838
11839 static int imsm_reshape_super(struct supertype *st, unsigned long long size,
11840                               int level,
11841                               int layout, int chunksize, int raid_disks,
11842                               int delta_disks, char *backup, char *dev,
11843                               int direction, int verbose)
11844 {
11845         int ret_val = 1;
11846         struct geo_params geo;
11847
11848         dprintf("(enter)\n");
11849
11850         memset(&geo, 0, sizeof(struct geo_params));
11851
11852         geo.dev_name = dev;
11853         strcpy(geo.devnm, st->devnm);
11854         geo.size = size;
11855         geo.level = level;
11856         geo.layout = layout;
11857         geo.chunksize = chunksize;
11858         geo.raid_disks = raid_disks;
11859         if (delta_disks != UnSet)
11860                 geo.raid_disks += delta_disks;
11861
11862         dprintf("for level      : %i\n", geo.level);
11863         dprintf("for raid_disks : %i\n", geo.raid_disks);
11864
11865         if (strcmp(st->container_devnm, st->devnm) == 0) {
11866                 /* On container level we can only increase number of devices. */
11867                 dprintf("imsm: info: Container operation\n");
11868                 int old_raid_disks = 0;
11869
11870                 if (imsm_reshape_is_allowed_on_container(
11871                             st, &geo, &old_raid_disks, direction)) {
11872                         struct imsm_update_reshape *u = NULL;
11873                         int len;
11874
11875                         if (imsm_fix_size_mismatch(st, -1)) {
11876                                 dprintf("imsm: Cannot fix size mismatch\n");
11877                                 goto exit_imsm_reshape_super;
11878                         }
11879
11880                         len = imsm_create_metadata_update_for_reshape(
11881                                 st, &geo, old_raid_disks, &u);
11882
11883                         if (len <= 0) {
11884                                 dprintf("imsm: Cannot prepare update\n");
11885                                 goto exit_imsm_reshape_super;
11886                         }
11887
11888                         ret_val = 0;
11889                         /* update metadata locally */
11890                         imsm_update_metadata_locally(st, u, len);
11891                         /* and possibly remotely */
11892                         if (st->update_tail)
11893                                 append_metadata_update(st, u, len);
11894                         else
11895                                 free(u);
11896
11897                 } else {
11898                         pr_err("(imsm) Operation is not allowed on this container\n");
11899                 }
11900         } else {
11901                 /* On volume level we support following operations
11902                  * - takeover: raid10 -> raid0; raid0 -> raid10
11903                  * - chunk size migration
11904                  * - migration: raid5 -> raid0; raid0 -> raid5
11905                  */
11906                 struct intel_super *super = st->sb;
11907                 struct intel_dev *dev = super->devlist;
11908                 int change;
11909                 dprintf("imsm: info: Volume operation\n");
11910                 /* find requested device */
11911                 while (dev) {
11912                         char *devnm =
11913                                 imsm_find_array_devnm_by_subdev(
11914                                         dev->index, st->container_devnm);
11915                         if (devnm && strcmp(devnm, geo.devnm) == 0)
11916                                 break;
11917                         dev = dev->next;
11918                 }
11919                 if (dev == NULL) {
11920                         pr_err("Cannot find %s (%s) subarray\n",
11921                                 geo.dev_name, geo.devnm);
11922                         goto exit_imsm_reshape_super;
11923                 }
11924                 super->current_vol = dev->index;
11925                 change = imsm_analyze_change(st, &geo, direction);
11926                 switch (change) {
11927                 case CH_TAKEOVER:
11928                         ret_val = imsm_takeover(st, &geo);
11929                         break;
11930                 case CH_MIGRATION: {
11931                         struct imsm_update_reshape_migration *u = NULL;
11932                         int len =
11933                                 imsm_create_metadata_update_for_migration(
11934                                         st, &geo, &u);
11935                         if (len < 1) {
11936                                 dprintf("imsm: Cannot prepare update\n");
11937                                 break;
11938                         }
11939                         ret_val = 0;
11940                         /* update metadata locally */
11941                         imsm_update_metadata_locally(st, u, len);
11942                         /* and possibly remotely */
11943                         if (st->update_tail)
11944                                 append_metadata_update(st, u, len);
11945                         else
11946                                 free(u);
11947                 }
11948                 break;
11949                 case CH_ARRAY_SIZE: {
11950                         struct imsm_update_size_change *u = NULL;
11951                         int len =
11952                                 imsm_create_metadata_update_for_size_change(
11953                                         st, &geo, &u);
11954                         if (len < 1) {
11955                                 dprintf("imsm: Cannot prepare update\n");
11956                                 break;
11957                         }
11958                         ret_val = 0;
11959                         /* update metadata locally */
11960                         imsm_update_metadata_locally(st, u, len);
11961                         /* and possibly remotely */
11962                         if (st->update_tail)
11963                                 append_metadata_update(st, u, len);
11964                         else
11965                                 free(u);
11966                 }
11967                 break;
11968                 default:
11969                         ret_val = 1;
11970                 }
11971         }
11972
11973 exit_imsm_reshape_super:
11974         dprintf("imsm: reshape_super Exit code = %i\n", ret_val);
11975         return ret_val;
11976 }
11977
/* Classification codes returned by read_completed() */
#define COMPLETED_OK		0
#define COMPLETED_NONE		1
#define COMPLETED_DELAYED	2

/*
 * Read the sysfs attribute open on 'fd' and classify its content.
 *
 * Returns:
 *	COMPLETED_NONE    : the attribute text starts with "none"
 *	COMPLETED_DELAYED : the attribute text starts with "delayed"
 *	COMPLETED_OK      : a number was parsed and stored in *val
 *	-1                : the text is not a number followed by end of
 *			    string, a newline or a space
 *	< 0               : error code handed back by sysfs_fd_get_str()
 *
 * Note that *val is written whenever the numeric branch is taken, even
 * if the parse is subsequently rejected.
 */
static int read_completed(int fd, unsigned long long *val)
{
	char text[50];
	char *end;
	int rc;

	rc = sysfs_fd_get_str(fd, text, sizeof(text));
	if (rc < 0)
		return rc;

	if (!strncmp(text, "none", 4))
		return COMPLETED_NONE;
	if (!strncmp(text, "delayed", 7))
		return COMPLETED_DELAYED;

	/* base 0: accept decimal, octal and hexadecimal notation */
	*val = strtoull(text, &end, 0);
	if (end == text)
		return -1;

	switch (*end) {
	case '\0':
	case '\n':
	case ' ':
		return COMPLETED_OK;
	default:
		return -1;
	}
}
12004
12005 /*******************************************************************************
12006  * Function:    wait_for_reshape_imsm
12007  * Description: Function writes new sync_max value and waits until
12008  *              reshape process reach new position
12009  * Parameters:
12010  *      sra             : general array info
12011  *      ndata           : number of disks in new array's layout
12012  * Returns:
12013  *       0 : success,
12014  *       1 : there is no reshape in progress,
12015  *      -1 : fail
12016  ******************************************************************************/
12017 int wait_for_reshape_imsm(struct mdinfo *sra, int ndata)
12018 {
12019         int fd = sysfs_get_fd(sra, NULL, "sync_completed");
12020         int retry = 3;
12021         unsigned long long completed;
12022         /* to_complete : new sync_max position */
12023         unsigned long long to_complete = sra->reshape_progress;
12024         unsigned long long position_to_set = to_complete / ndata;
12025
12026         if (fd < 0) {
12027                 dprintf("cannot open reshape_position\n");
12028                 return 1;
12029         }
12030
12031         do {
12032                 if (sysfs_fd_get_ll(fd, &completed) < 0) {
12033                         if (!retry) {
12034                                 dprintf("cannot read reshape_position (no reshape in progres)\n");
12035                                 close(fd);
12036                                 return 1;
12037                         }
12038                         usleep(30000);
12039                 } else
12040                         break;
12041         } while (retry--);
12042
12043         if (completed > position_to_set) {
12044                 dprintf("wrong next position to set %llu (%llu)\n",
12045                         to_complete, position_to_set);
12046                 close(fd);
12047                 return -1;
12048         }
12049         dprintf("Position set: %llu\n", position_to_set);
12050         if (sysfs_set_num(sra, NULL, "sync_max",
12051                           position_to_set) != 0) {
12052                 dprintf("cannot set reshape position to %llu\n",
12053                         position_to_set);
12054                 close(fd);
12055                 return -1;
12056         }
12057
12058         do {
12059                 int rc;
12060                 char action[20];
12061                 int timeout = 3000;
12062
12063                 sysfs_wait(fd, &timeout);
12064                 if (sysfs_get_str(sra, NULL, "sync_action",
12065                                   action, 20) > 0 &&
12066                                 strncmp(action, "reshape", 7) != 0) {
12067                         if (strncmp(action, "idle", 4) == 0)
12068                                 break;
12069                         close(fd);
12070                         return -1;
12071                 }
12072
12073                 rc = read_completed(fd, &completed);
12074                 if (rc < 0) {
12075                         dprintf("cannot read reshape_position (in loop)\n");
12076                         close(fd);
12077                         return 1;
12078                 } else if (rc == COMPLETED_NONE)
12079                         break;
12080         } while (completed < position_to_set);
12081
12082         close(fd);
12083         return 0;
12084 }
12085
12086 /*******************************************************************************
12087  * Function:    check_degradation_change
12088  * Description: Check that array hasn't become failed.
12089  * Parameters:
12090  *      info    : for sysfs access
12091  *      sources : source disks descriptors
12092  *      degraded: previous degradation level
12093  * Returns:
12094  *      degradation level
12095  ******************************************************************************/
12096 int check_degradation_change(struct mdinfo *info,
12097                              int *sources,
12098                              int degraded)
12099 {
12100         unsigned long long new_degraded;
12101         int rv;
12102
12103         rv = sysfs_get_ll(info, NULL, "degraded", &new_degraded);
12104         if (rv == -1 || (new_degraded != (unsigned long long)degraded)) {
12105                 /* check each device to ensure it is still working */
12106                 struct mdinfo *sd;
12107                 new_degraded = 0;
12108                 for (sd = info->devs ; sd ; sd = sd->next) {
12109                         if (sd->disk.state & (1<<MD_DISK_FAULTY))
12110                                 continue;
12111                         if (sd->disk.state & (1<<MD_DISK_SYNC)) {
12112                                 char sbuf[100];
12113
12114                                 if (sysfs_get_str(info,
12115                                         sd, "state", sbuf, sizeof(sbuf)) < 0 ||
12116                                         strstr(sbuf, "faulty") ||
12117                                         strstr(sbuf, "in_sync") == NULL) {
12118                                         /* this device is dead */
12119                                         sd->disk.state = (1<<MD_DISK_FAULTY);
12120                                         if (sd->disk.raid_disk >= 0 &&
12121                                             sources[sd->disk.raid_disk] >= 0) {
12122                                                 close(sources[
12123                                                         sd->disk.raid_disk]);
12124                                                 sources[sd->disk.raid_disk] =
12125                                                         -1;
12126                                         }
12127                                         new_degraded++;
12128                                 }
12129                         }
12130                 }
12131         }
12132
12133         return new_degraded;
12134 }
12135
12136 /*******************************************************************************
12137  * Function:    imsm_manage_reshape
12138  * Description: Function finds array under reshape and it manages reshape
12139  *              process. It creates stripes backups (if required) and sets
12140  *              checkpoints.
12141  * Parameters:
12142  *      afd             : Backup handle (nattive) - not used
12143  *      sra             : general array info
12144  *      reshape         : reshape parameters - not used
12145  *      st              : supertype structure
12146  *      blocks          : size of critical section [blocks]
12147  *      fds             : table of source device descriptor
12148  *      offsets         : start of array (offest per devices)
12149  *      dests           : not used
12150  *      destfd          : table of destination device descriptor
12151  *      destoffsets     : table of destination offsets (per device)
12152  * Returns:
12153  *      1 : success, reshape is done
12154  *      0 : fail
12155  ******************************************************************************/
12156 static int imsm_manage_reshape(
12157         int afd, struct mdinfo *sra, struct reshape *reshape,
12158         struct supertype *st, unsigned long backup_blocks,
12159         int *fds, unsigned long long *offsets,
12160         int dests, int *destfd, unsigned long long *destoffsets)
12161 {
12162         int ret_val = 0;
12163         struct intel_super *super = st->sb;
12164         struct intel_dev *dv;
12165         unsigned int sector_size = super->sector_size;
12166         struct imsm_dev *dev = NULL;
12167         struct imsm_map *map_src, *map_dest;
12168         int migr_vol_qan = 0;
12169         int ndata, odata; /* [bytes] */
12170         int chunk; /* [bytes] */
12171         struct migr_record *migr_rec;
12172         char *buf = NULL;
12173         unsigned int buf_size; /* [bytes] */
12174         unsigned long long max_position; /* array size [bytes] */
12175         unsigned long long next_step; /* [blocks]/[bytes] */
12176         unsigned long long old_data_stripe_length;
12177         unsigned long long start_src; /* [bytes] */
12178         unsigned long long start; /* [bytes] */
12179         unsigned long long start_buf_shift; /* [bytes] */
12180         int degraded = 0;
12181         int source_layout = 0;
12182         int subarray_index = -1;
12183
12184         if (!sra)
12185                 return ret_val;
12186
12187         if (!fds || !offsets)
12188                 goto abort;
12189
12190         /* Find volume during the reshape */
12191         for (dv = super->devlist; dv; dv = dv->next) {
12192                 if (dv->dev->vol.migr_type == MIGR_GEN_MIGR &&
12193                     dv->dev->vol.migr_state == 1) {
12194                         dev = dv->dev;
12195                         migr_vol_qan++;
12196                         subarray_index = dv->index;
12197                 }
12198         }
12199         /* Only one volume can migrate at the same time */
12200         if (migr_vol_qan != 1) {
12201                 pr_err("%s", migr_vol_qan ?
12202                         "Number of migrating volumes greater than 1\n" :
12203                         "There is no volume during migrationg\n");
12204                 goto abort;
12205         }
12206
12207         map_dest = get_imsm_map(dev, MAP_0);
12208         map_src = get_imsm_map(dev, MAP_1);
12209         if (map_src == NULL)
12210                 goto abort;
12211
12212         ndata = imsm_num_data_members(map_dest);
12213         odata = imsm_num_data_members(map_src);
12214
12215         chunk = __le16_to_cpu(map_src->blocks_per_strip) * 512;
12216         old_data_stripe_length = odata * chunk;
12217
12218         migr_rec = super->migr_rec;
12219
12220         /* initialize migration record for start condition */
12221         if (sra->reshape_progress == 0)
12222                 init_migr_record_imsm(st, dev, sra);
12223         else {
12224                 if (__le32_to_cpu(migr_rec->rec_status) != UNIT_SRC_NORMAL) {
12225                         dprintf("imsm: cannot restart migration when data are present in copy area.\n");
12226                         goto abort;
12227                 }
12228                 /* Save checkpoint to update migration record for current
12229                  * reshape position (in md). It can be farther than current
12230                  * reshape position in metadata.
12231                  */
12232                 if (save_checkpoint_imsm(st, sra, UNIT_SRC_NORMAL) == 1) {
12233                         /* ignore error == 2, this can mean end of reshape here
12234                          */
12235                         dprintf("imsm: Cannot write checkpoint to migration record (UNIT_SRC_NORMAL, initial save)\n");
12236                         goto abort;
12237                 }
12238         }
12239
12240         /* size for data */
12241         buf_size = __le32_to_cpu(migr_rec->blocks_per_unit) * 512;
12242         /* extend  buffer size for parity disk */
12243         buf_size += __le32_to_cpu(migr_rec->dest_depth_per_unit) * 512;
12244         /* add space for stripe alignment */
12245         buf_size += old_data_stripe_length;
12246         if (posix_memalign((void **)&buf, MAX_SECTOR_SIZE, buf_size)) {
12247                 dprintf("imsm: Cannot allocate checkpoint buffer\n");
12248                 goto abort;
12249         }
12250
12251         max_position = sra->component_size * ndata;
12252         source_layout = imsm_level_to_layout(map_src->raid_level);
12253
12254         while (current_migr_unit(migr_rec) <
12255                get_num_migr_units(migr_rec)) {
12256                 /* current reshape position [blocks] */
12257                 unsigned long long current_position =
12258                         __le32_to_cpu(migr_rec->blocks_per_unit)
12259                         * current_migr_unit(migr_rec);
12260                 unsigned long long border;
12261
12262                 /* Check that array hasn't become failed.
12263                  */
12264                 degraded = check_degradation_change(sra, fds, degraded);
12265                 if (degraded > 1) {
12266                         dprintf("imsm: Abort reshape due to degradation level (%i)\n", degraded);
12267                         goto abort;
12268                 }
12269
12270                 next_step = __le32_to_cpu(migr_rec->blocks_per_unit);
12271
12272                 if ((current_position + next_step) > max_position)
12273                         next_step = max_position - current_position;
12274
12275                 start = current_position * 512;
12276
12277                 /* align reading start to old geometry */
12278                 start_buf_shift = start % old_data_stripe_length;
12279                 start_src = start - start_buf_shift;
12280
12281                 border = (start_src / odata) - (start / ndata);
12282                 border /= 512;
12283                 if (border <= __le32_to_cpu(migr_rec->dest_depth_per_unit)) {
12284                         /* save critical stripes to buf
12285                          * start     - start address of current unit
12286                          *             to backup [bytes]
12287                          * start_src - start address of current unit
12288                          *             to backup alligned to source array
12289                          *             [bytes]
12290                          */
12291                         unsigned long long next_step_filler;
12292                         unsigned long long copy_length = next_step * 512;
12293
12294                         /* allign copy area length to stripe in old geometry */
12295                         next_step_filler = ((copy_length + start_buf_shift)
12296                                             % old_data_stripe_length);
12297                         if (next_step_filler)
12298                                 next_step_filler = (old_data_stripe_length
12299                                                     - next_step_filler);
12300                         dprintf("save_stripes() parameters: start = %llu,\tstart_src = %llu,\tnext_step*512 = %llu,\tstart_in_buf_shift = %llu,\tnext_step_filler = %llu\n",
12301                                 start, start_src, copy_length,
12302                                 start_buf_shift, next_step_filler);
12303
12304                         if (save_stripes(fds, offsets, map_src->num_members,
12305                                          chunk, map_src->raid_level,
12306                                          source_layout, 0, NULL, start_src,
12307                                          copy_length +
12308                                          next_step_filler + start_buf_shift,
12309                                          buf)) {
12310                                 dprintf("imsm: Cannot save stripes to buffer\n");
12311                                 goto abort;
12312                         }
12313                         /* Convert data to destination format and store it
12314                          * in backup general migration area
12315                          */
12316                         if (save_backup_imsm(st, dev, sra,
12317                                 buf + start_buf_shift, copy_length)) {
12318                                 dprintf("imsm: Cannot save stripes to target devices\n");
12319                                 goto abort;
12320                         }
12321                         if (save_checkpoint_imsm(st, sra,
12322                                                  UNIT_SRC_IN_CP_AREA)) {
12323                                 dprintf("imsm: Cannot write checkpoint to migration record (UNIT_SRC_IN_CP_AREA)\n");
12324                                 goto abort;
12325                         }
12326                 } else {
12327                         /* set next step to use whole border area */
12328                         border /= next_step;
12329                         if (border > 1)
12330                                 next_step *= border;
12331                 }
12332                 /* When data backed up, checkpoint stored,
12333                  * kick the kernel to reshape unit of data
12334                  */
12335                 next_step = next_step + sra->reshape_progress;
12336                 /* limit next step to array max position */
12337                 if (next_step > max_position)
12338                         next_step = max_position;
12339                 sysfs_set_num(sra, NULL, "suspend_lo", sra->reshape_progress);
12340                 sysfs_set_num(sra, NULL, "suspend_hi", next_step);
12341                 sra->reshape_progress = next_step;
12342
12343                 /* wait until reshape finish */
12344                 if (wait_for_reshape_imsm(sra, ndata)) {
12345                         dprintf("wait_for_reshape_imsm returned error!\n");
12346                         goto abort;
12347                 }
12348                 if (sigterm)
12349                         goto abort;
12350
12351                 if (save_checkpoint_imsm(st, sra, UNIT_SRC_NORMAL) == 1) {
12352                         /* ignore error == 2, this can mean end of reshape here
12353                          */
12354                         dprintf("imsm: Cannot write checkpoint to migration record (UNIT_SRC_NORMAL)\n");
12355                         goto abort;
12356                 }
12357
12358         }
12359
12360         /* clear migr_rec on disks after successful migration */
12361         struct dl *d;
12362
12363         memset(super->migr_rec_buf, 0, MIGR_REC_BUF_SECTORS*MAX_SECTOR_SIZE);
12364         for (d = super->disks; d; d = d->next) {
12365                 if (d->index < 0 || is_failed(&d->disk))
12366                         continue;
12367                 unsigned long long dsize;
12368
12369                 get_dev_size(d->fd, NULL, &dsize);
12370                 if (lseek64(d->fd, dsize - MIGR_REC_SECTOR_POSITION*sector_size,
12371                             SEEK_SET) >= 0) {
12372                         if ((unsigned int)write(d->fd, super->migr_rec_buf,
12373                             MIGR_REC_BUF_SECTORS*sector_size) !=
12374                             MIGR_REC_BUF_SECTORS*sector_size)
12375                                 perror("Write migr_rec failed");
12376                 }
12377         }
12378
12379         /* return '1' if done */
12380         ret_val = 1;
12381
12382         /* After the reshape eliminate size mismatch in metadata.
12383          * Don't update md/component_size here, volume hasn't
12384          * to take whole space. It is allowed by kernel.
12385          * md/component_size will be set propoperly after next assembly.
12386          */
12387         imsm_fix_size_mismatch(st, subarray_index);
12388
12389 abort:
12390         free(buf);
12391         /* See Grow.c: abort_reshape() for further explanation */
12392         sysfs_set_num(sra, NULL, "suspend_lo", 0x7FFFFFFFFFFFFFFFULL);
12393         sysfs_set_num(sra, NULL, "suspend_hi", 0);
12394         sysfs_set_num(sra, NULL, "suspend_lo", 0);
12395
12396         return ret_val;
12397 }
12398
/*******************************************************************************
 * Function:	calculate_bitmap_min_chunksize
 * Description:	Calculates the minimal valid bitmap chunk size
 *		Starting from 4096, the chunk size is doubled until the
 *		number of bits needed to cover the data area fits in max_bits.
 * Parameters:
 *	max_bits	: indicate how many bits can be used for the bitmap;
 *			  0 is treated as 1 so that the search terminates
 *	data_area_size	: the size of the data area covered by the bitmap
 *
 * Returns:
 *	 The bitmap chunk size
 ******************************************************************************/
static unsigned long long
calculate_bitmap_min_chunksize(unsigned long long max_bits,
			       unsigned long long data_area_size)
{
	unsigned long long min_chunk =
		4096; /* sub-page chunks don't work yet.. */
	unsigned long long bits = data_area_size / min_chunk + 1;

	/* 'bits' never drops below 1, so a zero budget could never be
	 * satisfied and the loop below would not terminate.
	 */
	if (max_bits == 0)
		max_bits = 1;

	while (bits > max_bits) {
		min_chunk *= 2;
		/* halve, rounding up */
		bits = (bits + 1) / 2;
	}
	return min_chunk;
}
12423
12424 /*******************************************************************************
12425  * Function:    calculate_bitmap_chunksize
12426  * Description: Calculates the bitmap chunk size for the given device
12427  * Parameters:
12428  *      st      : supertype information
12429  *      dev     : device for the bitmap
12430  *
12431  * Returns:
12432  *       The bitmap chunk size
12433  ******************************************************************************/
12434 static unsigned long long calculate_bitmap_chunksize(struct supertype *st,
12435                                                      struct imsm_dev *dev)
12436 {
12437         struct intel_super *super = st->sb;
12438         unsigned long long min_chunksize;
12439         unsigned long long result = IMSM_DEFAULT_BITMAP_CHUNKSIZE;
12440         size_t dev_size = imsm_dev_size(dev);
12441
12442         min_chunksize = calculate_bitmap_min_chunksize(
12443                 IMSM_BITMAP_AREA_SIZE * super->sector_size, dev_size);
12444
12445         if (result < min_chunksize)
12446                 result = min_chunksize;
12447
12448         return result;
12449 }
12450
12451 /*******************************************************************************
12452  * Function:    init_bitmap_header
12453  * Description: Initialize the bitmap header structure
12454  * Parameters:
12455  *      st      : supertype information
12456  *      bms     : bitmap header struct to initialize
12457  *      dev     : device for the bitmap
12458  *
12459  * Returns:
12460  *       0 : success
12461  *      -1 : fail
12462  ******************************************************************************/
12463 static int init_bitmap_header(struct supertype *st, struct bitmap_super_s *bms,
12464                               struct imsm_dev *dev)
12465 {
12466         int vol_uuid[4];
12467
12468         if (!bms || !dev)
12469                 return -1;
12470
12471         bms->magic = __cpu_to_le32(BITMAP_MAGIC);
12472         bms->version = __cpu_to_le32(BITMAP_MAJOR_HI);
12473         bms->daemon_sleep = __cpu_to_le32(IMSM_DEFAULT_BITMAP_DAEMON_SLEEP);
12474         bms->sync_size = __cpu_to_le64(IMSM_BITMAP_AREA_SIZE);
12475         bms->write_behind = __cpu_to_le32(0);
12476
12477         uuid_from_super_imsm(st, vol_uuid);
12478         memcpy(bms->uuid, vol_uuid, 16);
12479
12480         bms->chunksize = calculate_bitmap_chunksize(st, dev);
12481
12482         return 0;
12483 }
12484
12485 /*******************************************************************************
12486  * Function:    validate_internal_bitmap_for_drive
12487  * Description: Verify if the bitmap header for a given drive.
12488  * Parameters:
12489  *      st      : supertype information
12490  *      offset  : The offset from the beginning of the drive where to look for
12491  *                the bitmap header.
12492  *      d       : the drive info
12493  *
12494  * Returns:
12495  *       0 : success
12496  *      -1 : fail
12497  ******************************************************************************/
12498 static int validate_internal_bitmap_for_drive(struct supertype *st,
12499                                               unsigned long long offset,
12500                                               struct dl *d)
12501 {
12502         struct intel_super *super = st->sb;
12503         int ret = -1;
12504         int vol_uuid[4];
12505         bitmap_super_t *bms;
12506         int fd;
12507
12508         if (!d)
12509                 return -1;
12510
12511         void *read_buf;
12512
12513         if (posix_memalign(&read_buf, MAX_SECTOR_SIZE, IMSM_BITMAP_HEADER_SIZE))
12514                 return -1;
12515
12516         fd = d->fd;
12517         if (fd < 0) {
12518                 fd = open(d->devname, O_RDONLY, 0);
12519                 if (fd < 0) {
12520                         dprintf("cannot open the device %s\n", d->devname);
12521                         goto abort;
12522                 }
12523         }
12524
12525         if (lseek64(fd, offset * super->sector_size, SEEK_SET) < 0)
12526                 goto abort;
12527         if (read(fd, read_buf, IMSM_BITMAP_HEADER_SIZE) !=
12528             IMSM_BITMAP_HEADER_SIZE)
12529                 goto abort;
12530
12531         uuid_from_super_imsm(st, vol_uuid);
12532
12533         bms = read_buf;
12534         if ((bms->magic != __cpu_to_le32(BITMAP_MAGIC)) ||
12535             (bms->version != __cpu_to_le32(BITMAP_MAJOR_HI)) ||
12536             (!same_uuid((int *)bms->uuid, vol_uuid, st->ss->swapuuid))) {
12537                 dprintf("wrong bitmap header detected\n");
12538                 goto abort;
12539         }
12540
12541         ret = 0;
12542 abort:
12543         if ((d->fd < 0) && (fd >= 0))
12544                 close(fd);
12545         if (read_buf)
12546                 free(read_buf);
12547
12548         return ret;
12549 }
12550
12551 /*******************************************************************************
12552  * Function:    validate_internal_bitmap_imsm
12553  * Description: Verify if the bitmap header is in place and with proper data.
12554  * Parameters:
12555  *      st      : supertype information
12556  *
12557  * Returns:
12558  *       0 : success or device w/o RWH_BITMAP
12559  *      -1 : fail
12560  ******************************************************************************/
12561 static int validate_internal_bitmap_imsm(struct supertype *st)
12562 {
12563         struct intel_super *super = st->sb;
12564         struct imsm_dev *dev = get_imsm_dev(super, super->current_vol);
12565         unsigned long long offset;
12566         struct dl *d;
12567
12568         if (!dev)
12569                 return -1;
12570
12571         if (dev->rwh_policy != RWH_BITMAP)
12572                 return 0;
12573
12574         offset = get_bitmap_header_sector(super, super->current_vol);
12575         for (d = super->disks; d; d = d->next) {
12576                 if (d->index < 0 || is_failed(&d->disk))
12577                         continue;
12578
12579                 if (validate_internal_bitmap_for_drive(st, offset, d)) {
12580                         pr_err("imsm: bitmap validation failed\n");
12581                         return -1;
12582                 }
12583         }
12584         return 0;
12585 }
12586
12587 /*******************************************************************************
12588  * Function:    add_internal_bitmap_imsm
12589  * Description: Mark the volume to use the bitmap and updates the chunk size value.
12590  * Parameters:
12591  *      st              : supertype information
12592  *      chunkp          : bitmap chunk size
12593  *      delay           : not used for imsm
12594  *      write_behind    : not used for imsm
12595  *      size            : not used for imsm
12596  *      may_change      : not used for imsm
12597  *      amajor          : not used for imsm
12598  *
12599  * Returns:
12600  *       0 : success
12601  *      -1 : fail
12602  ******************************************************************************/
12603 static int add_internal_bitmap_imsm(struct supertype *st, int *chunkp,
12604                                     int delay, int write_behind,
12605                                     unsigned long long size, int may_change,
12606                                     int amajor)
12607 {
12608         struct intel_super *super = st->sb;
12609         int vol_idx = super->current_vol;
12610         struct imsm_dev *dev;
12611
12612         if (!super->devlist || vol_idx == -1 || !chunkp)
12613                 return -1;
12614
12615         dev = get_imsm_dev(super, vol_idx);
12616
12617         if (!dev) {
12618                 dprintf("cannot find the device for volume index %d\n",
12619                         vol_idx);
12620                 return -1;
12621         }
12622         dev->rwh_policy = RWH_BITMAP;
12623
12624         *chunkp = calculate_bitmap_chunksize(st, dev);
12625
12626         return 0;
12627 }
12628
12629 /*******************************************************************************
12630  * Function:    locate_bitmap_imsm
12631  * Description: Seek 'fd' to start of write-intent-bitmap.
12632  * Parameters:
12633  *      st              : supertype information
12634  *      fd              : file descriptor for the device
12635  *      node_num        : not used for imsm
12636  *
12637  * Returns:
12638  *       0 : success
12639  *      -1 : fail
12640  ******************************************************************************/
12641 static int locate_bitmap_imsm(struct supertype *st, int fd, int node_num)
12642 {
12643         struct intel_super *super = st->sb;
12644         unsigned long long offset;
12645         int vol_idx = super->current_vol;
12646
12647         if (!super->devlist || vol_idx == -1)
12648                 return -1;
12649
12650         offset = get_bitmap_header_sector(super, super->current_vol);
12651         dprintf("bitmap header offset is %llu\n", offset);
12652
12653         lseek64(fd, offset << 9, 0);
12654
12655         return 0;
12656 }
12657
12658 /*******************************************************************************
12659  * Function:    write_init_bitmap_imsm
12660  * Description: Write a bitmap header and prepares the area for the bitmap.
12661  * Parameters:
12662  *      st      : supertype information
12663  *      fd      : file descriptor for the device
12664  *      update  : not used for imsm
12665  *
12666  * Returns:
12667  *       0 : success
12668  *      -1 : fail
12669  ******************************************************************************/
12670 static int write_init_bitmap_imsm(struct supertype *st, int fd,
12671                                   enum bitmap_update update)
12672 {
12673         struct intel_super *super = st->sb;
12674         int vol_idx = super->current_vol;
12675         int ret = 0;
12676         unsigned long long offset;
12677         bitmap_super_t bms = { 0 };
12678         size_t written = 0;
12679         size_t to_write;
12680         ssize_t rv_num;
12681         void *buf;
12682
12683         if (!super->devlist || !super->sector_size || vol_idx == -1)
12684                 return -1;
12685
12686         struct imsm_dev *dev = get_imsm_dev(super, vol_idx);
12687
12688         /* first clear the space for bitmap header */
12689         unsigned long long bitmap_area_start =
12690                 get_bitmap_header_sector(super, vol_idx);
12691
12692         dprintf("zeroing area start (%llu) and size (%u)\n", bitmap_area_start,
12693                 IMSM_BITMAP_AND_HEADER_SIZE / super->sector_size);
12694         if (zero_disk_range(fd, bitmap_area_start,
12695                             IMSM_BITMAP_HEADER_SIZE / super->sector_size)) {
12696                 pr_err("imsm: cannot zeroing the space for the bitmap\n");
12697                 return -1;
12698         }
12699
12700         /* The bitmap area should be filled with "1"s to perform initial
12701          * synchronization.
12702          */
12703         if (posix_memalign(&buf, MAX_SECTOR_SIZE, MAX_SECTOR_SIZE))
12704                 return -1;
12705         memset(buf, 0xFF, MAX_SECTOR_SIZE);
12706         offset = get_bitmap_sector(super, vol_idx);
12707         lseek64(fd, offset << 9, 0);
12708         while (written < IMSM_BITMAP_AREA_SIZE) {
12709                 to_write = IMSM_BITMAP_AREA_SIZE - written;
12710                 if (to_write > MAX_SECTOR_SIZE)
12711                         to_write = MAX_SECTOR_SIZE;
12712                 rv_num = write(fd, buf, MAX_SECTOR_SIZE);
12713                 if (rv_num != MAX_SECTOR_SIZE) {
12714                         ret = -1;
12715                         dprintf("cannot initialize bitmap area\n");
12716                         goto abort;
12717                 }
12718                 written += rv_num;
12719         }
12720
12721         /* write a bitmap header */
12722         init_bitmap_header(st, &bms, dev);
12723         memset(buf, 0, MAX_SECTOR_SIZE);
12724         memcpy(buf, &bms, sizeof(bitmap_super_t));
12725         if (locate_bitmap_imsm(st, fd, 0)) {
12726                 ret = -1;
12727                 dprintf("cannot locate the bitmap\n");
12728                 goto abort;
12729         }
12730         if (write(fd, buf, MAX_SECTOR_SIZE) != MAX_SECTOR_SIZE) {
12731                 ret = -1;
12732                 dprintf("cannot write the bitmap header\n");
12733                 goto abort;
12734         }
12735         fsync(fd);
12736
12737 abort:
12738         free(buf);
12739
12740         return ret;
12741 }
12742
12743 /*******************************************************************************
12744  * Function:    is_vol_to_setup_bitmap
12745  * Description: Checks if a bitmap should be activated on the dev.
12746  * Parameters:
12747  *      info    : info about the volume to setup the bitmap
12748  *      dev     : the device to check against bitmap creation
12749  *
12750  * Returns:
12751  *       0 : bitmap should be set up on the device
12752  *      -1 : otherwise
12753  ******************************************************************************/
12754 static int is_vol_to_setup_bitmap(struct mdinfo *info, struct imsm_dev *dev)
12755 {
12756         if (!dev || !info)
12757                 return -1;
12758
12759         if ((strcmp((char *)dev->volume, info->name) == 0) &&
12760             (dev->rwh_policy == RWH_BITMAP))
12761                 return -1;
12762
12763         return 0;
12764 }
12765
12766 /*******************************************************************************
12767  * Function:    set_bitmap_sysfs
12768  * Description: Set the sysfs atributes of a given volume to activate the bitmap.
12769  * Parameters:
12770  *      info            : info about the volume where the bitmap should be setup
12771  *      chunksize       : bitmap chunk size
12772  *      location        : location of the bitmap
12773  *
12774  * Returns:
12775  *       0 : success
12776  *      -1 : fail
12777  ******************************************************************************/
12778 static int set_bitmap_sysfs(struct mdinfo *info, unsigned long long chunksize,
12779                             char *location)
12780 {
12781         /* The bitmap/metadata is set to external to allow changing of value for
12782          * bitmap/location. When external is used, the kernel will treat an offset
12783          * related to the device's first lba (in opposition to the "internal" case
12784          * when this value is related to the beginning of the superblock).
12785          */
12786         if (sysfs_set_str(info, NULL, "bitmap/metadata", "external")) {
12787                 dprintf("failed to set bitmap/metadata\n");
12788                 return -1;
12789         }
12790
12791         /* It can only be changed when no bitmap is active.
12792          * Should be bigger than 512 and must be power of 2.
12793          * It is expecting the value in bytes.
12794          */
12795         if (sysfs_set_num(info, NULL, "bitmap/chunksize",
12796                                           __cpu_to_le32(chunksize))) {
12797                 dprintf("failed to set bitmap/chunksize\n");
12798                 return -1;
12799         }
12800
12801         /* It is expecting the value in sectors. */
12802         if (sysfs_set_num(info, NULL, "bitmap/space",
12803                                           __cpu_to_le64(IMSM_BITMAP_AREA_SIZE))) {
12804                 dprintf("failed to set bitmap/space\n");
12805                 return -1;
12806         }
12807
12808         /* Determines the delay between the bitmap updates.
12809          * It is expecting the value in seconds.
12810          */
12811         if (sysfs_set_num(info, NULL, "bitmap/time_base",
12812                                           __cpu_to_le64(IMSM_DEFAULT_BITMAP_DAEMON_SLEEP))) {
12813                 dprintf("failed to set bitmap/time_base\n");
12814                 return -1;
12815         }
12816
12817         /* It is expecting the value in sectors with a sign at the beginning. */
12818         if (sysfs_set_str(info, NULL, "bitmap/location", location)) {
12819                 dprintf("failed to set bitmap/location\n");
12820                 return -1;
12821         }
12822
12823         return 0;
12824 }
12825
12826 /*******************************************************************************
12827  * Function:    set_bitmap_imsm
12828  * Description: Setup the bitmap for the given volume
12829  * Parameters:
12830  *      st      : supertype information
12831  *      info    : info about the volume where the bitmap should be setup
12832  *
12833  * Returns:
12834  *       0 : success
12835  *      -1 : fail
12836  ******************************************************************************/
12837 static int set_bitmap_imsm(struct supertype *st, struct mdinfo *info)
12838 {
12839         struct intel_super *super = st->sb;
12840         int prev_current_vol = super->current_vol;
12841         struct imsm_dev *dev;
12842         int ret = -1;
12843         char location[16] = "";
12844         unsigned long long chunksize;
12845         struct intel_dev *dev_it;
12846
12847         for (dev_it = super->devlist; dev_it; dev_it = dev_it->next) {
12848                 super->current_vol = dev_it->index;
12849                 dev = get_imsm_dev(super, super->current_vol);
12850
12851                 if (is_vol_to_setup_bitmap(info, dev)) {
12852                         if (validate_internal_bitmap_imsm(st)) {
12853                                 dprintf("bitmap header validation failed\n");
12854                                 goto abort;
12855                         }
12856
12857                         chunksize = calculate_bitmap_chunksize(st, dev);
12858                         dprintf("chunk size is %llu\n", chunksize);
12859
12860                         snprintf(location, sizeof(location), "+%llu",
12861                                  get_bitmap_sector(super, super->current_vol));
12862                         dprintf("bitmap offset is %s\n", location);
12863
12864                         if (set_bitmap_sysfs(info, chunksize, location)) {
12865                                 dprintf("cannot setup the bitmap\n");
12866                                 goto abort;
12867                         }
12868                 }
12869         }
12870         ret = 0;
12871 abort:
12872         super->current_vol = prev_current_vol;
12873         return ret;
12874 }
12875
12876 struct superswitch super_imsm = {
12877         .examine_super  = examine_super_imsm,
12878         .brief_examine_super = brief_examine_super_imsm,
12879         .brief_examine_subarrays = brief_examine_subarrays_imsm,
12880         .export_examine_super = export_examine_super_imsm,
12881         .detail_super   = detail_super_imsm,
12882         .brief_detail_super = brief_detail_super_imsm,
12883         .write_init_super = write_init_super_imsm,
12884         .validate_geometry = validate_geometry_imsm,
12885         .add_to_super   = add_to_super_imsm,
12886         .remove_from_super = remove_from_super_imsm,
12887         .detail_platform = detail_platform_imsm,
12888         .export_detail_platform = export_detail_platform_imsm,
12889         .kill_subarray = kill_subarray_imsm,
12890         .update_subarray = update_subarray_imsm,
12891         .load_container = load_container_imsm,
12892         .default_geometry = default_geometry_imsm,
12893         .get_disk_controller_domain = imsm_get_disk_controller_domain,
12894         .reshape_super  = imsm_reshape_super,
12895         .manage_reshape = imsm_manage_reshape,
12896         .recover_backup = recover_backup_imsm,
12897         .examine_badblocks = examine_badblocks_imsm,
12898         .match_home     = match_home_imsm,
12899         .uuid_from_super= uuid_from_super_imsm,
12900         .getinfo_super  = getinfo_super_imsm,
12901         .getinfo_super_disks = getinfo_super_disks_imsm,
12902         .update_super   = update_super_imsm,
12903
12904         .avail_size     = avail_size_imsm,
12905         .get_spare_criteria = get_spare_criteria_imsm,
12906
12907         .compare_super  = compare_super_imsm,
12908
12909         .load_super     = load_super_imsm,
12910         .init_super     = init_super_imsm,
12911         .store_super    = store_super_imsm,
12912         .free_super     = free_super_imsm,
12913         .match_metadata_desc = match_metadata_desc_imsm,
12914         .container_content = container_content_imsm,
12915         .validate_container = validate_container_imsm,
12916
12917         .add_internal_bitmap = add_internal_bitmap_imsm,
12918         .locate_bitmap = locate_bitmap_imsm,
12919         .write_bitmap = write_init_bitmap_imsm,
12920         .set_bitmap = set_bitmap_imsm,
12921
12922         .write_init_ppl = write_init_ppl_imsm,
12923         .validate_ppl   = validate_ppl_imsm,
12924
12925         .external       = 1,
12926         .name = "imsm",
12927
12928 /* for mdmon */
12929         .open_new       = imsm_open_new,
12930         .set_array_state= imsm_set_array_state,
12931         .set_disk       = imsm_set_disk,
12932         .sync_metadata  = imsm_sync_metadata,
12933         .activate_spare = imsm_activate_spare,
12934         .process_update = imsm_process_update,
12935         .prepare_update = imsm_prepare_update,
12936         .record_bad_block = imsm_record_badblock,
12937         .clear_bad_block  = imsm_clear_badblock,
12938         .get_bad_blocks   = imsm_get_badblocks,
12939 };