Grow.c

   1 /*
   2  * mdadm - manage Linux "md" devices aka RAID arrays.
   3  *
   4  * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
   5  *
   6  *
   7  *    This program is free software; you can redistribute it and/or modify
   8  *    it under the terms of the GNU General Public License as published by
   9  *    the Free Software Foundation; either version 2 of the License, or
  10  *    (at your option) any later version.
  11  *
  12  *    This program is distributed in the hope that it will be useful,
  13  *    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15  *    GNU General Public License for more details.
  16  *
  17  *    You should have received a copy of the GNU General Public License
  18  *    along with this program; if not, write to the Free Software
  19  *    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  20  *
  21  *    Author: Neil Brown
  22  *    Email: <neilb@suse.de>
  23  */
  24 #include        "mdadm.h"
  25 #include        "dlink.h"
  26 #include        <sys/mman.h>
  27
  28 #if ! defined(__BIG_ENDIAN) && ! defined(__LITTLE_ENDIAN)
  29 #error no endian defined
  30 #endif
  31 #include        "md_u.h"
  32 #include        "md_p.h"
  33
  34 #ifndef offsetof
  35 #define offsetof(t,f) ((size_t)&(((t*)0)->f))
  36 #endif
  37
  38 int restore_backup(struct supertype *st,
  39                    struct mdinfo *content,
  40                    int working_disks,
  41                    int next_spare,
  42                    char *backup_file,
  43                    int verbose)
  44 {
  45         int i;
  46         int *fdlist;
  47         struct mdinfo *dev;
  48         int err;
  49         int disk_count = next_spare + working_disks;
  50
  51         dprintf("Called restore_backup()\n");
  52         fdlist = malloc(sizeof(int) * disk_count);
  53         if (fdlist == NULL) {
  54                 fprintf(stderr,
  55                         Name ": cannot allocate memory for disk list\n");
  56                 return 1;
  57         }
  58         for (i = 0; i < next_spare; i++)
  59                 fdlist[i] = -1;
  60         for (dev = content->devs; dev; dev = dev->next) {
  61                 char buf[22];
  62                 int fd;
  63                 sprintf(buf, "%d:%d",
  64                         dev->disk.major,
  65                         dev->disk.minor);
  66                 fd = dev_open(buf, O_RDWR);
  67
  68                 if (dev->disk.raid_disk >= 0)
  69                         fdlist[dev->disk.raid_disk] = fd;
  70                 else
  71                         fdlist[next_spare++] = fd;
  72         }
  73
  74         if (st->ss->external && st->ss->recover_backup)
  75                 err = st->ss->recover_backup(st, content);
  76         else
  77                 err = Grow_restart(st, content, fdlist, next_spare,
  78                                    backup_file, verbose > 0);
  79
  80         while (next_spare > 0) {
  81                 next_spare--;
  82                 if (fdlist[next_spare] >= 0)
  83                         close(fdlist[next_spare]);
  84         }
  85         free(fdlist);
  86         if (err) {
  87                 fprintf(stderr, Name ": Failed to restore critical"
  88                         " section for reshape - sorry.\n");
  89                 if (!backup_file)
  90                         fprintf(stderr, Name ":  Possibly you need"
  91                                 " to specify a --backup-file\n");
  92                 return 1;
  93         }
  94
  95         dprintf("restore_backup() returns status OK.\n");
  96         return 0;
  97 }
  98
  99 int Grow_Add_device(char *devname, int fd, char *newdev)
 100 {
 101         /* Add a device to an active array.
 102          * Currently, just extend a linear array.
 103          * This requires writing a new superblock on the
 104          * new device, calling the kernel to add the device,
 105          * and if that succeeds, update the superblock on
 106          * all other devices.
 107          * This means that we need to *find* all other devices.
 108          */
 109         struct mdinfo info;
 110
 111         struct stat stb;
 112         int nfd, fd2;
 113         int d, nd;
 114         struct supertype *st = NULL;
 115         char *subarray = NULL;
 116
 117         if (ioctl(fd, GET_ARRAY_INFO, &info.array) < 0) {
 118                 fprintf(stderr, Name ": cannot get array info for %s\n", devname);
 119                 return 1;
 120         }
 121
 122         if (info.array.level != -1) {
 123                 fprintf(stderr, Name ": can only add devices to linear arrays\n");
 124                 return 1;
 125         }
 126
 127         st = super_by_fd(fd, &subarray);
 128         if (!st) {
 129                 fprintf(stderr, Name ": cannot handle arrays with superblock version %d\n", info.array.major_version);
 130                 return 1;
 131         }
 132
 133         if (subarray) {
 134                 fprintf(stderr, Name ": Cannot grow linear sub-arrays yet\n");
 135                 free(subarray);
 136                 free(st);
 137                 return 1;
 138         }
 139
 140         nfd = open(newdev, O_RDWR|O_EXCL|O_DIRECT);
 141         if (nfd < 0) {
 142                 fprintf(stderr, Name ": cannot open %s\n", newdev);
 143                 free(st);
 144                 return 1;
 145         }
 146         fstat(nfd, &stb);
 147         if ((stb.st_mode & S_IFMT) != S_IFBLK) {
 148                 fprintf(stderr, Name ": %s is not a block device!\n", newdev);
 149                 close(nfd);
 150                 free(st);
 151                 return 1;
 152         }
 153         /* now check out all the devices and make sure we can read the superblock */
 154         for (d=0 ; d < info.array.raid_disks ; d++) {
 155                 mdu_disk_info_t disk;
 156                 char *dv;
 157
 158                 st->ss->free_super(st);
 159
 160                 disk.number = d;
 161                 if (ioctl(fd, GET_DISK_INFO, &disk) < 0) {
 162                         fprintf(stderr, Name ": cannot get device detail for device %d\n",
 163                                 d);
 164                         close(nfd);
 165                         free(st);
 166                         return 1;
 167                 }
 168                 dv = map_dev(disk.major, disk.minor, 1);
 169                 if (!dv) {
 170                         fprintf(stderr, Name ": cannot find device file for device %d\n",
 171                                 d);
 172                         close(nfd);
 173                         free(st);
 174                         return 1;
 175                 }
 176                 fd2 = dev_open(dv, O_RDWR);
 177                 if (fd2 < 0) {
 178                         fprintf(stderr, Name ": cannot open device file %s\n", dv);
 179                         close(nfd);
 180                         free(st);
 181                         return 1;
 182                 }
 183
 184                 if (st->ss->load_super(st, fd2, NULL)) {
 185                         fprintf(stderr, Name ": cannot find super block on %s\n", dv);
 186                         close(nfd);
 187                         close(fd2);
 188                         free(st);
 189                         return 1;
 190                 }
 191                 close(fd2);
 192         }
 193         /* Ok, looks good. Lets update the superblock and write it out to
 194          * newdev.
 195          */
 196
 197         info.disk.number = d;
 198         info.disk.major = major(stb.st_rdev);
 199         info.disk.minor = minor(stb.st_rdev);
 200         info.disk.raid_disk = d;
 201         info.disk.state = (1 << MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE);
 202         st->ss->update_super(st, &info, "linear-grow-new", newdev,
 203                              0, 0, NULL);
 204
 205         if (st->ss->store_super(st, nfd)) {
 206                 fprintf(stderr, Name ": Cannot store new superblock on %s\n",
 207                         newdev);
 208                 close(nfd);
 209                 return 1;
 210         }
 211         close(nfd);
 212
 213         if (ioctl(fd, ADD_NEW_DISK, &info.disk) != 0) {
 214                 fprintf(stderr, Name ": Cannot add new disk to this array\n");
 215                 return 1;
 216         }
 217         /* Well, that seems to have worked.
 218          * Now go through and update all superblocks
 219          */
 220
 221         if (ioctl(fd, GET_ARRAY_INFO, &info.array) < 0) {
 222                 fprintf(stderr, Name ": cannot get array info for %s\n", devname);
 223                 return 1;
 224         }
 225
 226         nd = d;
 227         for (d=0 ; d < info.array.raid_disks ; d++) {
 228                 mdu_disk_info_t disk;
 229                 char *dv;
 230
 231                 disk.number = d;
 232                 if (ioctl(fd, GET_DISK_INFO, &disk) < 0) {
 233                         fprintf(stderr, Name ": cannot get device detail for device %d\n",
 234                                 d);
 235                         return 1;
 236                 }
 237                 dv = map_dev(disk.major, disk.minor, 1);
 238                 if (!dv) {
 239                         fprintf(stderr, Name ": cannot find device file for device %d\n",
 240                                 d);
 241                         return 1;
 242                 }
 243                 fd2 = dev_open(dv, O_RDWR);
 244                 if (fd2 < 0) {
 245                         fprintf(stderr, Name ": cannot open device file %s\n", dv);
 246                         return 1;
 247                 }
 248                 if (st->ss->load_super(st, fd2, NULL)) {
 249                         fprintf(stderr, Name ": cannot find super block on %s\n", dv);
 250                         close(fd);
 251                         return 1;
 252                 }
 253                 info.array.raid_disks = nd+1;
 254                 info.array.nr_disks = nd+1;
 255                 info.array.active_disks = nd+1;
 256                 info.array.working_disks = nd+1;
 257
 258                 st->ss->update_super(st, &info, "linear-grow-update", dv,
 259                                      0, 0, NULL);
 260
 261                 if (st->ss->store_super(st, fd2)) {
 262                         fprintf(stderr, Name ": Cannot store new superblock on %s\n", dv);
 263                         close(fd2);
 264                         return 1;
 265                 }
 266                 close(fd2);
 267         }
 268
 269         return 0;
 270 }
 271
 272 int Grow_addbitmap(char *devname, int fd, char *file, int chunk, int delay, int write_behind, int force)
 273 {
 274         /*
 275          * First check that array doesn't have a bitmap
 276          * Then create the bitmap
 277          * Then add it
 278          *
 279          * For internal bitmaps, we need to check the version,
 280          * find all the active devices, and write the bitmap block
 281          * to all devices
 282          */
 283         mdu_bitmap_file_t bmf;
 284         mdu_array_info_t array;
 285         struct supertype *st;
 286         char *subarray = NULL;
 287         int major = BITMAP_MAJOR_HI;
 288         int vers = md_get_version(fd);
 289         unsigned long long bitmapsize, array_size;
 290
 291         if (vers < 9003) {
 292                 major = BITMAP_MAJOR_HOSTENDIAN;
 293                 fprintf(stderr, Name ": Warning - bitmaps created on this kernel"
 294                         " are not portable\n"
 295                         "  between different architectures.  Consider upgrading"
 296                         " the Linux kernel.\n");
 297         }
 298
 299         if (ioctl(fd, GET_BITMAP_FILE, &bmf) != 0) {
 300                 if (errno == ENOMEM)
 301                         fprintf(stderr, Name ": Memory allocation failure.\n");
 302                 else
 303                         fprintf(stderr, Name ": bitmaps not supported by this kernel.\n");
 304                 return 1;
 305         }
 306         if (bmf.pathname[0]) {
 307                 if (strcmp(file,"none")==0) {
 308                         if (ioctl(fd, SET_BITMAP_FILE, -1)!= 0) {
 309                                 fprintf(stderr, Name ": failed to remove bitmap %s\n",
 310                                         bmf.pathname);
 311                                 return 1;
 312                         }
 313                         return 0;
 314                 }
 315                 fprintf(stderr, Name ": %s already has a bitmap (%s)\n",
 316                         devname, bmf.pathname);
 317                 return 1;
 318         }
 319         if (ioctl(fd, GET_ARRAY_INFO, &array) != 0) {
 320                 fprintf(stderr, Name ": cannot get array status for %s\n", devname);
 321                 return 1;
 322         }
 323         if (array.state & (1<<MD_SB_BITMAP_PRESENT)) {
 324                 if (strcmp(file, "none")==0) {
 325                         array.state &= ~(1<<MD_SB_BITMAP_PRESENT);
 326                         if (ioctl(fd, SET_ARRAY_INFO, &array)!= 0) {
 327                                 fprintf(stderr, Name ": failed to remove internal bitmap.\n");
 328                                 return 1;
 329                         }
 330                         return 0;
 331                 }
 332                 fprintf(stderr, Name ": Internal bitmap already present on %s\n",
 333                         devname);
 334                 return 1;
 335         }
 336
 337         if (strcmp(file, "none") == 0) {
 338                 fprintf(stderr, Name ": no bitmap found on %s\n", devname);
 339                 return 1;
 340         }
 341         if (array.level <= 0) {
 342                 fprintf(stderr, Name ": Bitmaps not meaningful with level %s\n",
 343                         map_num(pers, array.level)?:"of this array");
 344                 return 1;
 345         }
 346         bitmapsize = array.size;
 347         bitmapsize <<= 1;
 348         if (get_dev_size(fd, NULL, &array_size) &&
 349             array_size > (0x7fffffffULL<<9)) {
 350                 /* Array is big enough that we cannot trust array.size
 351                  * try other approaches
 352                  */
 353                 bitmapsize = get_component_size(fd);
 354         }
 355         if (bitmapsize == 0) {
 356                 fprintf(stderr, Name ": Cannot reliably determine size of array to create bitmap - sorry.\n");
 357                 return 1;
 358         }
 359
 360         if (array.level == 10) {
 361                 int ncopies = (array.layout&255)*((array.layout>>8)&255);
 362                 bitmapsize = bitmapsize * array.raid_disks / ncopies;
 363         }
 364
 365         st = super_by_fd(fd, &subarray);
 366         if (!st) {
 367                 fprintf(stderr, Name ": Cannot understand version %d.%d\n",
 368                         array.major_version, array.minor_version);
 369                 return 1;
 370         }
 371         if (subarray) {
 372                 fprintf(stderr, Name ": Cannot add bitmaps to sub-arrays yet\n");
 373                 free(subarray);
 374                 free(st);
 375                 return 1;
 376         }
 377         if (strcmp(file, "internal") == 0) {
 378                 int rv;
 379                 int d;
 380                 int offset_setable = 0;
 381                 struct mdinfo *mdi;
 382                 if (st->ss->add_internal_bitmap == NULL) {
 383                         fprintf(stderr, Name ": Internal bitmaps not supported "
 384                                 "with %s metadata\n", st->ss->name);
 385                         return 1;
 386                 }
 387                 mdi = sysfs_read(fd, -1, GET_BITMAP_LOCATION);
 388                 if (mdi)
 389                         offset_setable = 1;
 390                 for (d=0; d< st->max_devs; d++) {
 391                         mdu_disk_info_t disk;
 392                         char *dv;
 393                         disk.number = d;
 394                         if (ioctl(fd, GET_DISK_INFO, &disk) < 0)
 395                                 continue;
 396                         if (disk.major == 0 &&
 397                             disk.minor == 0)
 398                                 continue;
 399                         if ((disk.state & (1<<MD_DISK_SYNC))==0)
 400                                 continue;
 401                         dv = map_dev(disk.major, disk.minor, 1);
 402                         if (dv) {
 403                                 int fd2 = dev_open(dv, O_RDWR);
 404                                 if (fd2 < 0)
 405                                         continue;
 406                                 if (st->ss->load_super(st, fd2, NULL)==0) {
 407                                         if (st->ss->add_internal_bitmap(
 408                                                     st,
 409                                                     &chunk, delay, write_behind,
 410                                                     bitmapsize, offset_setable,
 411                                                     major)
 412                                                 )
 413                                                 st->ss->write_bitmap(st, fd2);
 414                                         else {
 415                                                 fprintf(stderr, Name ": failed "
 416                                 "to create internal bitmap - chunksize problem.\n");
 417                                                 close(fd2);
 418                                                 return 1;
 419                                         }
 420                                 }
 421                                 close(fd2);
 422                         }
 423                 }
 424                 if (offset_setable) {
 425                         st->ss->getinfo_super(st, mdi, NULL);
 426                         sysfs_init(mdi, fd, -1);
 427                         rv = sysfs_set_num(mdi, NULL, "bitmap/location",
 428                                            mdi->bitmap_offset);
 429                 } else {
 430                         array.state |= (1<<MD_SB_BITMAP_PRESENT);
 431                         rv = ioctl(fd, SET_ARRAY_INFO, &array);
 432                 }
 433                 if (rv < 0) {
 434                         if (errno == EBUSY)
 435                                 fprintf(stderr, Name
 436                                         ": Cannot add bitmap while array is"
 437                                         " resyncing or reshaping etc.\n");
 438                         fprintf(stderr, Name ": failed to set internal bitmap.\n");
 439                         return 1;
 440                 }
 441         } else {
 442                 int uuid[4];
 443                 int bitmap_fd;
 444                 int d;
 445                 int max_devs = st->max_devs;
 446
 447                 /* try to load a superblock */
 448                 for (d=0; d<max_devs; d++) {
 449                         mdu_disk_info_t disk;
 450                         char *dv;
 451                         int fd2;
 452                         disk.number = d;
 453                         if (ioctl(fd, GET_DISK_INFO, &disk) < 0)
 454                                 continue;
 455                         if ((disk.major==0 && disk.minor==0) ||
 456                             (disk.state & (1<<MD_DISK_REMOVED)))
 457                                 continue;
 458                         dv = map_dev(disk.major, disk.minor, 1);
 459                         if (!dv) continue;
 460                         fd2 = dev_open(dv, O_RDONLY);
 461                         if (fd2 >= 0) {
 462                                 if (st->ss->load_super(st, fd2, NULL) == 0) {
 463                                         close(fd2);
 464                                         st->ss->uuid_from_super(st, uuid);
 465                                         break;
 466                                 }
 467                                 close(fd2);
 468                         }
 469                 }
 470                 if (d == max_devs) {
 471                         fprintf(stderr, Name ": cannot find UUID for array!\n");
 472                         return 1;
 473                 }
 474                 if (CreateBitmap(file, force, (char*)uuid, chunk,
 475                                  delay, write_behind, bitmapsize, major)) {
 476                         return 1;
 477                 }
 478                 bitmap_fd = open(file, O_RDWR);
 479                 if (bitmap_fd < 0) {
 480                         fprintf(stderr, Name ": weird: %s cannot be opened\n",
 481                                 file);
 482                         return 1;
 483                 }
 484                 if (ioctl(fd, SET_BITMAP_FILE, bitmap_fd) < 0) {
 485                         int err = errno;
 486                         if (errno == EBUSY)
 487                                 fprintf(stderr, Name
 488                                         ": Cannot add bitmap while array is"
 489                                         " resyncing or reshaping etc.\n");
 490                         fprintf(stderr, Name ": Cannot set bitmap file for %s: %s\n",
 491                                 devname, strerror(err));
 492                         return 1;
 493                 }
 494         }
 495
 496         return 0;
 497 }
 498
  499 /*
  500  * When reshaping an array we might need to back up some data.
  501  * This is written to all spares with a 'super_block' describing it.
  502  * The superblock goes 4K from the end of the used space on the
  503  * device.
  504  * It is written after the backup is complete.
  505  * It has the following structure.
  506  */
  507
  508 static struct mdp_backup_super {
  509         char    magic[16];  /* md_backup_data-1 or -2 */
  510         __u8    set_uuid[16];   /* presumably the UUID of the array - confirm */
  511         __u64   mtime;
  512         /* start/sizes in 512byte sectors */
  513         __u64   devstart;       /* address on backup device/file of data */
  514         __u64   arraystart;
  515         __u64   length;
  516         __u32   sb_csum;        /* csum of preceding bytes. */
  517         __u32   pad1;           /* keeps the __u64 fields below 8-byte aligned */
  518         __u64   devstart2;      /* offset into data of second section */
  519         __u64   arraystart2;
  520         __u64   length2;
  521         __u32   sb_csum2;       /* csum of preceding bytes. */
  522         __u8 pad[512-68-32];    /* 68 bytes up to sb_csum + 32 bytes from pad1 to sb_csum2, padded to 512 */
  523 } __attribute__((aligned(512))) bsb, bsb2; /* two 512-byte-aligned static instances */
 524
 525 static __u32 bsb_csum(char *buf, int len)
 526 {
 527         int i;
 528         int csum = 0;
 529         for (i=0; i<len; i++)
 530                 csum = (csum<<3) + buf[0];
 531         return __cpu_to_le32(csum);
 532 }
 533
 534 static int check_idle(struct supertype *st)
 535 {
 536         /* Check that all member arrays for this container, or the
 537          * container of this array, are idle
 538          */
 539         int container_dev = (st->container_dev != NoMdDev
 540                              ? st->container_dev : st->devnum);
 541         char container[40];
 542         struct mdstat_ent *ent, *e;
 543         int is_idle = 1;
 544
 545         fmt_devname(container, container_dev);
 546         ent = mdstat_read(0, 0);
 547         for (e = ent ; e; e = e->next) {
 548                 if (!is_container_member(e, container))
 549                         continue;
 550                 if (e->percent >= 0) {
 551                         is_idle = 0;
 552                         break;
 553                 }
 554         }
 555         free_mdstat(ent);
 556         return is_idle;
 557 }
 558
 559 static int freeze_container(struct supertype *st)
 560 {
 561         int container_dev = (st->container_dev != NoMdDev
 562                              ? st->container_dev : st->devnum);
 563         char container[40];
 564
 565         if (!check_idle(st))
 566                 return -1;
 567
 568         fmt_devname(container, container_dev);
 569
 570         if (block_monitor(container, 1)) {
 571                 fprintf(stderr, Name ": failed to freeze container\n");
 572                 return -2;
 573         }
 574
 575         return 1;
 576 }
 577
 578 static void unfreeze_container(struct supertype *st)
 579 {
 580         int container_dev = (st->container_dev != NoMdDev
 581                              ? st->container_dev : st->devnum);
 582         char container[40];
 583
 584         fmt_devname(container, container_dev);
 585
 586         unblock_monitor(container, 1);
 587 }
 588
 589 static int freeze(struct supertype *st)
 590 {
 591         /* Try to freeze resync/rebuild on this array/container.
 592          * Return -1 if the array is busy,
 593          * return -2 container cannot be frozen,
 594          * return 0 if this kernel doesn't support 'frozen'
 595          * return 1 if it worked.
 596          */
 597         if (st->ss->external)
 598                 return freeze_container(st);
 599         else {
 600                 struct mdinfo *sra = sysfs_read(-1, st->devnum, GET_VERSION);
 601                 int err;
 602                 char buf[20];
 603
 604                 if (!sra)
 605                         return -1;
 606                 /* Need to clear any 'read-auto' status */
 607                 if (sysfs_get_str(sra, NULL, "array_state", buf, 20) > 0 &&
 608                     strncmp(buf, "read-auto", 9) == 0)
 609                         sysfs_set_str(sra, NULL, "array_state", "clean");
 610
 611                 err = sysfs_freeze_array(sra);
 612                 sysfs_free(sra);
 613                 return err;
 614         }
 615 }
 616
 617 static void unfreeze(struct supertype *st)
 618 {
 619         if (st->ss->external)
 620                 return unfreeze_container(st);
 621         else {
 622                 struct mdinfo *sra = sysfs_read(-1, st->devnum, GET_VERSION);
 623
 624                 if (sra)
 625                         sysfs_set_str(sra, NULL, "sync_action", "idle");
 626                 else
 627                         fprintf(stderr, Name ": failed to unfreeze array\n");
 628                 sysfs_free(sra);
 629         }
 630 }
 631
 632 static void wait_reshape(struct mdinfo *sra)
 633 {
 634         int fd = sysfs_get_fd(sra, NULL, "sync_action");
 635         char action[20];
 636
 637         if (fd < 0)
 638                 return;
 639
 640         while  (sysfs_fd_get_str(fd, action, 20) > 0 &&
 641                 strncmp(action, "reshape", 7) == 0) {
 642                 fd_set rfds;
 643                 FD_ZERO(&rfds);
 644                 FD_SET(fd, &rfds);
 645                 select(fd+1, NULL, NULL, &rfds, NULL);
 646         }
 647         close(fd);
 648 }
 649
 650 static int reshape_super(struct supertype *st, long long size, int level,
 651                          int layout, int chunksize, int raid_disks,
 652                          int delta_disks, char *backup_file, char *dev,
 653                          int verbose)
 654 {
 655         /* nothing extra to check in the native case */
 656         if (!st->ss->external)
 657                 return 0;
 658         if (!st->ss->reshape_super ||
 659             !st->ss->manage_reshape) {
 660                 fprintf(stderr, Name ": %s metadata does not support reshape\n",
 661                         st->ss->name);
 662                 return 1;
 663         }
 664
 665         return st->ss->reshape_super(st, size, level, layout, chunksize,
 666                                      raid_disks, delta_disks, backup_file, dev,
 667                                      verbose);
 668 }
 669
 670 static void sync_metadata(struct supertype *st)
 671 {
 672         if (st->ss->external) {
 673                 if (st->update_tail) {
 674                         flush_metadata_updates(st);
 675                         st->update_tail = &st->updates;
 676                 } else
 677                         st->ss->sync_metadata(st);
 678         }
 679 }
 680
 681 static int subarray_set_num(char *container, struct mdinfo *sra, char *name, int n)
 682 {
 683         /* when dealing with external metadata subarrays we need to be
 684          * prepared to handle EAGAIN.  The kernel may need to wait for
 685          * mdmon to mark the array active so the kernel can handle
 686          * allocations/writeback when preparing the reshape action
 687          * (md_allow_write()).  We temporarily disable safe_mode_delay
 688          * to close a race with the array_state going clean before the
 689          * next write to raid_disks / stripe_cache_size
 690          */
 691         char safe[50];
 692         int rc;
 693
 694         /* only 'raid_disks' and 'stripe_cache_size' trigger md_allow_write */
 695         if (!container ||
 696             (strcmp(name, "raid_disks") != 0 &&
 697              strcmp(name, "stripe_cache_size") != 0))
 698                 return sysfs_set_num(sra, NULL, name, n);
 699
 700         rc = sysfs_get_str(sra, NULL, "safe_mode_delay", safe, sizeof(safe));
 701         if (rc <= 0)
 702                 return -1;
 703         sysfs_set_num(sra, NULL, "safe_mode_delay", 0);
 704         rc = sysfs_set_num(sra, NULL, name, n);
 705         if (rc < 0 && errno == EAGAIN) {
 706                 ping_monitor(container);
 707                 /* if we get EAGAIN here then the monitor is not active
 708                  * so stop trying
 709                  */
 710                 rc = sysfs_set_num(sra, NULL, name, n);
 711         }
 712         sysfs_set_str(sra, NULL, "safe_mode_delay", safe);
 713         return rc;
 714 }
 715
 716 int start_reshape(struct mdinfo *sra, int already_running,
 717                   int before_data_disks, int data_disks)
 718 {
 719         int err;
 720         unsigned long long sync_max_to_set;
 721
 722         sysfs_set_num(sra, NULL, "suspend_lo", 0x7FFFFFFFFFFFFFFFULL);
 723         err = sysfs_set_num(sra, NULL, "suspend_hi", sra->reshape_progress);
 724         err = err ?: sysfs_set_num(sra, NULL, "suspend_lo",
 725                                    sra->reshape_progress);
 726         if (before_data_disks <= data_disks)
 727                 sync_max_to_set = sra->reshape_progress / data_disks;
 728         else
 729                 sync_max_to_set = (sra->component_size * data_disks
 730                                    - sra->reshape_progress) / data_disks;
 731         if (!already_running)
 732                 sysfs_set_num(sra, NULL, "sync_min", sync_max_to_set);
 733         err = err ?: sysfs_set_num(sra, NULL, "sync_max", sync_max_to_set);
 734         if (!already_running)
 735                 err = err ?: sysfs_set_str(sra, NULL, "sync_action", "reshape");
 736
 737         return err;
 738 }
 739
 740 void abort_reshape(struct mdinfo *sra)
 741 {
 742         sysfs_set_str(sra, NULL, "sync_action", "idle");
 743         sysfs_set_num(sra, NULL, "suspend_lo", 0x7FFFFFFFFFFFFFFFULL);
 744         sysfs_set_num(sra, NULL, "suspend_hi", 0);
 745         sysfs_set_num(sra, NULL, "suspend_lo", 0);
 746         sysfs_set_num(sra, NULL, "sync_min", 0);
 747         sysfs_set_str(sra, NULL, "sync_max", "max");
 748 }
 749
 750 int remove_disks_for_takeover(struct supertype *st,
 751                               struct mdinfo *sra,
 752                               int layout)
 753 {
 754         int nr_of_copies;
 755         struct mdinfo *remaining;
 756         int slot;
 757
 758         if (sra->array.level == 10)
 759                 nr_of_copies = layout & 0xff;
 760         else if (sra->array.level == 1)
 761                 nr_of_copies = sra->array.raid_disks;
 762         else
 763                 return 1;
 764
 765         remaining = sra->devs;
 766         sra->devs = NULL;
 767         /* for each 'copy', select one device and remove from the list. */
 768         for (slot = 0; slot < sra->array.raid_disks; slot += nr_of_copies) {
 769                 struct mdinfo **diskp;
 770                 int found = 0;
 771
 772                 /* Find a working device to keep */
 773                 for (diskp =  &remaining; *diskp ; diskp = &(*diskp)->next) {
 774                         struct mdinfo *disk = *diskp;
 775
 776                         if (disk->disk.raid_disk < slot)
 777                                 continue;
 778                         if (disk->disk.raid_disk >= slot + nr_of_copies)
 779                                 continue;
 780                         if (disk->disk.state & (1<<MD_DISK_REMOVED))
 781                                 continue;
 782                         if (disk->disk.state & (1<<MD_DISK_FAULTY))
 783                                 continue;
 784                         if (!(disk->disk.state & (1<<MD_DISK_SYNC)))
 785                                 continue;
 786
 787                         /* We have found a good disk to use! */
 788                         *diskp = disk->next;
 789                         disk->next = sra->devs;
 790                         sra->devs = disk;
 791                         found = 1;
 792                         break;
 793                 }
 794                 if (!found)
 795                         break;
 796         }
 797
 798         if (slot < sra->array.raid_disks) {
 799                 /* didn't find all slots */
 800                 struct mdinfo **e;
 801                 e = &remaining;
 802                 while (*e)
 803                         e = &(*e)->next;
 804                 *e = sra->devs;
 805                 sra->devs = remaining;
 806                 return 1;
 807         }
 808
 809         /* Remove all 'remaining' devices from the array */
 810         while (remaining) {
 811                 struct mdinfo *sd = remaining;
 812                 remaining = sd->next;
 813
 814                 sysfs_set_str(sra, sd, "state", "faulty");
 815                 sysfs_set_str(sra, sd, "slot", "none");
 816                 /* for external metadata disks should be removed in mdmon */
 817                 if (!st->ss->external)
 818                         sysfs_set_str(sra, sd, "state", "remove");
 819                 sd->disk.state |= (1<<MD_DISK_REMOVED);
 820                 sd->disk.state &= ~(1<<MD_DISK_SYNC);
 821                 sd->next = sra->devs;
 822                 sra->devs = sd;
 823         }
 824         return 0;
 825 }
 826
 827 void reshape_free_fdlist(int *fdlist,
 828                          unsigned long long *offsets,
 829                          int size)
 830 {
 831         int i;
 832
 833         for (i = 0; i < size; i++)
 834                 if (fdlist[i] >= 0)
 835                         close(fdlist[i]);
 836
 837         free(fdlist);
 838         free(offsets);
 839 }
 840
 841 int reshape_prepare_fdlist(char *devname,
 842                            struct mdinfo *sra,
 843                            int raid_disks,
 844                            int nrdisks,
 845                            unsigned long blocks,
 846                            char *backup_file,
 847                            int *fdlist,
 848                            unsigned long long *offsets)
 849 {
 850         int d = 0;
 851         struct mdinfo *sd;
 852
 853         for (d = 0; d <= nrdisks; d++)
 854                 fdlist[d] = -1;
 855         d = raid_disks;
 856         for (sd = sra->devs; sd; sd = sd->next) {
 857                 if (sd->disk.state & (1<<MD_DISK_FAULTY))
 858                         continue;
 859                 if (sd->disk.state & (1<<MD_DISK_SYNC)) {
 860                         char *dn = map_dev(sd->disk.major,
 861                                            sd->disk.minor, 1);
 862                         fdlist[sd->disk.raid_disk]
 863                                 = dev_open(dn, O_RDONLY);
 864                         offsets[sd->disk.raid_disk] = sd->data_offset*512;
 865                         if (fdlist[sd->disk.raid_disk] < 0) {
 866                                 fprintf(stderr,
 867                                         Name ": %s: cannot open component %s\n",
 868                                         devname, dn ? dn : "-unknown-");
 869                                 d = -1;
 870                                 goto release;
 871                         }
 872                 } else if (backup_file == NULL) {
 873                         /* spare */
 874                         char *dn = map_dev(sd->disk.major,
 875                                            sd->disk.minor, 1);
 876                                 fdlist[d] = dev_open(dn, O_RDWR);
 877                                 offsets[d] = (sd->data_offset + sra->component_size - blocks - 8)*512;
 878                                 if (fdlist[d] < 0) {
 879                                         fprintf(stderr, Name ": %s: cannot open component %s\n",
 880                                                 devname, dn ? dn : "-unknown-");
 881                                         d = -1;
 882                                         goto release;
 883                                 }
 884                                 d++;
 885                         }
 886                 }
 887 release:
 888         return d;
 889 }
 890
 891 int reshape_open_backup_file(char *backup_file,
 892                              int fd,
 893                              char *devname,
 894                              long blocks,
 895                              int *fdlist,
 896                              unsigned long long *offsets,
 897                              int restart)
 898 {
 899         /* Return 1 on success, 0 on any form of failure */
 900         /* need to check backup file is large enough */
 901         char buf[512];
 902         struct stat stb;
 903         unsigned int dev;
 904         int i;
 905
 906         *fdlist = open(backup_file, O_RDWR|O_CREAT|(restart ? O_TRUNC : O_EXCL),
 907                        S_IRUSR | S_IWUSR);
 908         *offsets = 8 * 512;
 909         if (*fdlist < 0) {
 910                 fprintf(stderr, Name ": %s: cannot create backup file %s: %s\n",
 911                         devname, backup_file, strerror(errno));
 912                 return 0;
 913         }
 914         /* Guard against backup file being on array device.
 915          * If array is partitioned or if LVM etc is in the
 916          * way this will not notice, but it is better than
 917          * nothing.
 918          */
 919         fstat(*fdlist, &stb);
 920         dev = stb.st_dev;
 921         fstat(fd, &stb);
 922         if (stb.st_rdev == dev) {
 923                 fprintf(stderr, Name ": backup file must NOT be"
 924                         " on the array being reshaped.\n");
 925                 close(*fdlist);
 926                 return 0;
 927         }
 928
 929         memset(buf, 0, 512);
 930         for (i=0; i < blocks + 8 ; i++) {
 931                 if (write(*fdlist, buf, 512) != 512) {
 932                         fprintf(stderr, Name ": %s: cannot create"
 933                                 " backup file %s: %s\n",
 934                                 devname, backup_file, strerror(errno));
 935                         return 0;
 936                 }
 937         }
 938         if (fsync(*fdlist) != 0) {
 939                 fprintf(stderr, Name ": %s: cannot create backup file %s: %s\n",
 940                         devname, backup_file, strerror(errno));
 941                 return 0;
 942         }
 943
 944         return 1;
 945 }
 946
 947 unsigned long compute_backup_blocks(int nchunk, int ochunk,
 948                                     unsigned int ndata, unsigned int odata)
 949 {
 950         unsigned long a, b, blocks;
 951         /* So how much do we need to backup.
 952          * We need an amount of data which is both a whole number of
 953          * old stripes and a whole number of new stripes.
 954          * So LCM for (chunksize*datadisks).
 955          */
 956         a = (ochunk/512) * odata;
 957         b = (nchunk/512) * ndata;
 958         /* Find GCD */
 959         while (a != b) {
 960                 if (a < b)
 961                         b -= a;
 962                 if (b < a)
 963                         a -= b;
 964         }
 965         /* LCM == product / GCD */
 966         blocks = (ochunk/512) * (nchunk/512) * odata * ndata / a;
 967
 968         return blocks;
 969 }
 970
 971 char *analyse_change(struct mdinfo *info, struct reshape *re)
 972 {
 973         /* Based on the current array state in info->array and
 974          * the changes in info->new_* etc, determine:
 975          *  - whether the change is possible
 976          *  - Intermediate level/raid_disks/layout
 977          *  - whether a restriping reshape is needed
 978          *  - number of sectors in minimum change unit.  This
 979          *    will cover a whole number of stripes in 'before' and
 980          *    'after'.
 981          *
 982          * Return message if the change should be rejected
 983          *        NULL if the change can be achieved
 984          *
 985          * This can be called as part of starting a reshape, or
 986          * when assembling an array that is undergoing reshape.
 987          */
 988         int new_disks;
 989         /* delta_parity records change in number of devices
 990          * caused by level change
 991          */
 992         int delta_parity = 0;
 993
 994         /* If a new level not explicitly given, we assume no-change */
 995         if (info->new_level == UnSet)
 996                 info->new_level = info->array.level;
 997
 998         if (info->new_chunk)
 999                 switch (info->new_level) {
1000                 case 0:
1001                 case 4:
1002                 case 5:
1003                 case 6:
1004                 case 10:
1005                         /* chunk size is meaningful, must divide component_size
1006                          * evenly
1007                          */
1008                         if (info->component_size % (info->new_chunk/512))
1009                                 return "New chunk size does not"
1010                                         " divide component size";
1011                         break;
1012                 default:
1013                         return "chunk size not meaningful for this level";
1014                 }
1015         else
1016                 info->new_chunk = info->array.chunk_size;
1017
1018         switch (info->array.level) {
1019         default:
1020                 return "Cannot understand this RAID level";
1021         case 1:
1022                 /* RAID1 can convert to RAID1 with different disks, or
1023                  * raid5 with 2 disks, or
1024                  * raid0 with 1 disk
1025                  */
1026                 if (info->new_level == 0) {
1027                         if (info->delta_disks != UnSet &&
1028                             info->delta_disks != 0)
1029                                 return "Cannot change number of disks "
1030                                         "with RAID1->RAID0 conversion";
1031                         re->level = 0;
1032                         re->before.data_disks = 1;
1033                         re->after.data_disks = 1;
1034                         re->before.layout = 0;
1035                         re->backup_blocks = 0;
1036                         re->parity = 0;
1037                         return NULL;
1038                 }
1039                 if (info->new_level == 1) {
1040                         if (info->delta_disks == UnSet)
1041                                 /* Don't know what to do */
1042                                 return "no change requested for Growing RAID1";
1043                         re->level = 1;
1044                         re->backup_blocks = 0;
1045                         re->parity = 0;
1046                         return NULL;
1047                 }
1048                 if (info->array.raid_disks == 2 &&
1049                     info->new_level == 5) {
1050
1051                         re->level = 5;
1052                         re->before.data_disks = 1;
1053                         if (info->delta_disks != UnSet &&
1054                             info->delta_disks != 0)
1055                                 re->after.data_disks = 1 + info->delta_disks;
1056                         else
1057                                 re->after.data_disks = 1;
1058                         if (re->after.data_disks < 1)
1059                                 return "Number of disks too small for RAID5";
1060
1061                         re->before.layout = ALGORITHM_LEFT_SYMMETRIC;
1062                         info->array.chunk_size = 65536;
1063                         break;
1064                 }
1065                 /* Could do some multi-stage conversions, but leave that to
1066                  * later.
1067                  */
1068                 return "Impossibly level change request for RAID1";
1069
1070         case 10:
1071                 /* RAID10 can only be converted from near mode to
1072                  * RAID0 by removing some devices
1073                  */
1074                 if ((info->array.layout & ~0xff) != 0x100)
1075                         return "Cannot Grow RAID10 with far/offset layout";
1076                 /* number of devices must be multiple of number of copies */
1077                 if (info->array.raid_disks % (info->array.layout & 0xff))
1078                         return "RAID10 layout too complex for Grow operation";
1079
1080                 if (info->new_level != 0)
1081                         return "RAID10 can only be changed to RAID0";
1082                 new_disks = (info->array.raid_disks
1083                              / (info->array.layout & 0xff));
1084                 if (info->delta_disks == UnSet)
1085                         info->delta_disks = (new_disks
1086                                              - info->array.raid_disks);
1087
1088                 if (info->delta_disks != new_disks - info->array.raid_disks)
1089                         return "New number of raid-devices impossible for RAID10";
1090                 if (info->new_chunk &&
1091                     info->new_chunk != info->array.chunk_size)
1092                         return "Cannot change chunk-size with RAID10 Grow";
1093
1094                 /* looks good */
1095                 re->level = 0;
1096                 re->parity = 0;
1097                 re->before.data_disks = new_disks;
1098                 re->after.data_disks = re->before.data_disks;
1099                 re->before.layout = 0;
1100                 re->backup_blocks = 0;
1101                 return NULL;
1102
1103         case 0:
1104                 /* RAID0 can be converted to RAID10, or to RAID456 */
1105                 if (info->new_level == 10) {
1106                         if (info->new_layout == UnSet && info->delta_disks == UnSet) {
1107                                 /* Assume near=2 layout */
1108                                 info->new_layout = 0x102;
1109                                 info->delta_disks = info->array.raid_disks;
1110                         }
1111                         if (info->new_layout == UnSet) {
1112                                 int copies = 1 + (info->delta_disks
1113                                                   / info->array.raid_disks);
1114                                 if (info->array.raid_disks * (copies-1)
1115                                     != info->delta_disks)
1116                                         return "Impossible number of devices"
1117                                                 " for RAID0->RAID10";
1118                                 info->new_layout = 0x100 + copies;
1119                         }
1120                         if (info->delta_disks == UnSet) {
1121                                 int copies = info->new_layout & 0xff;
1122                                 if (info->new_layout != 0x100 + copies)
1123                                         return "New layout impossible"
1124                                                 " for RAID0->RAID10";;
1125                                 info->delta_disks = (copies - 1) *
1126                                         info->array.raid_disks;
1127                         }
1128                         if (info->new_chunk &&
1129                             info->new_chunk != info->array.chunk_size)
1130                                 return "Cannot change chunk-size with RAID0->RAID10";
1131                         /* looks good */
1132                         re->level = 10;
1133                         re->parity = 0;
1134                         re->before.data_disks = (info->array.raid_disks +
1135                                                  info->delta_disks);
1136                         re->after.data_disks = re->before.data_disks;
1137                         re->before.layout = info->new_layout;
1138                         re->backup_blocks = 0;
1139                         return NULL;
1140                 }
1141
1142                 /* RAID0 can also covert to RAID0/4/5/6 by first converting to
1143                  * a raid4 style layout of the final level.
1144                  */
1145                 switch (info->new_level) {
1146                 case 4:
1147                         delta_parity = 1;
1148                 case 0:
1149                         re->level = 4;
1150                         re->before.layout = 0;
1151                         break;
1152                 case 5:
1153                         delta_parity = 1;
1154                         re->level = 5;
1155                         re->before.layout = ALGORITHM_PARITY_N;
1156                         break;
1157                 case 6:
1158                         delta_parity = 2;
1159                         re->level = 6;
1160                         re->before.layout = ALGORITHM_PARITY_N;
1161                         break;
1162                 default:
1163                         return "Impossible level change requested";
1164                 }
1165                 re->before.data_disks = info->array.raid_disks;
1166                 /* determining 'after' layout happens outside this 'switch' */
1167                 break;
1168
1169         case 4:
1170                 info->array.layout = ALGORITHM_PARITY_N;
1171         case 5:
1172                 switch (info->new_level) {
1173                 case 0:
1174                         delta_parity = -1;
1175                 case 4:
1176                         re->level = info->array.level;
1177                         re->before.data_disks = info->array.raid_disks - 1;
1178                         re->before.layout = info->array.layout;
1179                         break;
1180                 case 5:
1181                         re->level = 5;
1182                         re->before.data_disks = info->array.raid_disks - 1;
1183                         re->before.layout = info->array.layout;
1184                         break;
1185                 case 6:
1186                         delta_parity = 1;
1187                         re->level = 6;
1188                         re->before.data_disks = info->array.raid_disks - 1;
1189                         switch (info->array.layout) {
1190                         case ALGORITHM_LEFT_ASYMMETRIC:
1191                                 re->before.layout = ALGORITHM_LEFT_ASYMMETRIC_6;
1192                                 break;
1193                         case ALGORITHM_RIGHT_ASYMMETRIC:
1194                                 re->before.layout = ALGORITHM_RIGHT_ASYMMETRIC_6;
1195                                 break;
1196                         case ALGORITHM_LEFT_SYMMETRIC:
1197                                 re->before.layout = ALGORITHM_LEFT_SYMMETRIC_6;
1198                                 break;
1199                         case ALGORITHM_RIGHT_SYMMETRIC:
1200                                 re->before.layout = ALGORITHM_RIGHT_SYMMETRIC_6;
1201                                 break;
1202                         case ALGORITHM_PARITY_0:
1203                                 re->before.layout = ALGORITHM_PARITY_0_6;
1204                                 break;
1205                         case ALGORITHM_PARITY_N:
1206                                 re->before.layout = ALGORITHM_PARITY_N_6;
1207                                 break;
1208                         default:
1209                                 return "Cannot convert an array with this layout";
1210                         }
1211                         break;
1212                 case 1:
1213                         if (info->array.raid_disks != 2)
1214                                 return "Can only convert a 2-device array to RAID1";
1215                         if (info->delta_disks != UnSet &&
1216                             info->delta_disks != 0)
1217                                 return "Cannot set raid_disk when "
1218                                         "converting RAID5->RAID1";
1219                         re->level = 1;
1220                         break;
1221                 default:
1222                         return "Impossible level change requested";
1223                 }
1224                 break;
1225         case 6:
1226                 switch (info->new_level) {
1227                 case 4:
1228                 case 5:
1229                         delta_parity = -1;
1230                 case 6:
1231                         re->level = 6;
1232                         re->before.data_disks = info->array.raid_disks - 2;
1233                         re->before.layout = info->array.layout;
1234                         break;
1235                 default:
1236                         return "Impossible level change requested";
1237                 }
1238                 break;
1239         }
1240
1241         /* If we reached here then it looks like a re-stripe is
1242          * happening.  We have determined the intermediate level
1243          * and initial raid_disks/layout and stored these in 're'.
1244          *
1245          * We need to deduce the final layout that can be atomically
1246          * converted to the end state.
1247          */
1248         switch (info->new_level) {
1249         case 0:
1250                 /* We can only get to RAID0 from RAID4 or RAID5
1251                  * with appropriate layout and one extra device
1252                  */
1253                 if (re->level != 4 && re->level != 5)
1254                         return "Cannot covert to RAID0 from this level";
1255
1256                 switch (re->level) {
1257                 case 4:
1258                         re->after.layout = 0 ; break;
1259                 case 5:
1260                         re->after.layout = ALGORITHM_PARITY_N; break;
1261                 }
1262                 break;
1263
1264         case 4:
1265                 /* We can only get to RAID4 from RAID5 */
1266                 if (re->level != 4 && re->level != 5)
1267                         return "Cannot convert to RAID4 from this level";
1268
1269                 switch (re->level) {
1270                 case 4:
1271                         re->after.layout = 0 ; break;
1272                 case 5:
1273                         re->after.layout = ALGORITHM_PARITY_N; break;
1274                 }
1275                 break;
1276
1277         case 5:
1278                 /* We get to RAID5 for RAID5 or RAID6 */
1279                 if (re->level != 5 && re->level != 6)
1280                         return "Cannot convert to RAID5 from this level";
1281
1282                 switch (re->level) {
1283                 case 5:
1284                         if (info->new_layout == UnSet)
1285                                 re->after.layout = re->before.layout;
1286                         else
1287                                 re->after.layout = info->new_layout;
1288                         break;
1289                 case 6:
1290                         if (info->new_layout == UnSet)
1291                                 info->new_layout = re->before.layout;
1292
1293                         /* after.layout needs to be raid6 version of new_layout */
1294                         if (info->new_layout == ALGORITHM_PARITY_N)
1295                                 re->after.layout = ALGORITHM_PARITY_N;
1296                         else {
1297                                 char layout[40];
1298                                 char *ls = map_num(r5layout, info->new_layout);
1299                                 int l;
1300                                 strcat(strcpy(layout, ls), "-6");
1301                                 l = map_name(r6layout, layout);
1302                                 if (l == UnSet)
1303                                         return "Cannot find RAID6 layout"
1304                                                 " to convert to";
1305                                 re->after.layout = l;
1306                         }
1307                 }
1308                 break;
1309
1310         case 6:
1311                 /* We must already be at level 6 */
1312                 if (re->level != 6)
1313                         return "Impossible level change";
1314                 if (info->new_layout == UnSet)
1315                         re->after.layout = info->array.layout;
1316                 else
1317                         re->after.layout = info->new_layout;
1318                 break;
1319         default:
1320                 return "Impossible level change requested";
1321         }
1322         if (info->delta_disks == UnSet)
1323                 info->delta_disks = delta_parity;
1324
1325         re->after.data_disks = (re->before.data_disks
1326                                 + info->delta_disks
1327                                 - delta_parity);
1328         switch (re->level) {
1329         case 6: re->parity = 2; break;
1330         case 4:
1331         case 5: re->parity = 1; break;
1332         default: re->parity = 0; break;
1333         }
1334         /* So we have a restripe operation, we need to calculate the number
1335          * of blocks per reshape operation.
1336          */
1337         if (info->new_chunk == 0)
1338                 info->new_chunk = info->array.chunk_size;
1339         if (re->after.data_disks == re->before.data_disks &&
1340             re->after.layout == re->before.layout &&
1341             info->new_chunk == info->array.chunk_size) {
1342                 /* Nothing to change */
1343                 re->backup_blocks = 0;
1344                 return NULL;
1345         }
1346         if (re->after.data_disks == 1 && re->before.data_disks == 1) {
1347                 /* chunk and layout changes make no difference */
1348                 re->backup_blocks = 0;
1349                 return NULL;
1350         }
1351
1352         if (re->after.data_disks == re->before.data_disks &&
1353             get_linux_version() < 2006032)
1354                 return "in-place reshape is not safe before 2.6.32 - sorry.";
1355
1356         if (re->after.data_disks < re->before.data_disks &&
1357             get_linux_version() < 2006030)
1358                 return "reshape to fewer devices is not supported before 2.6.30 - sorry.";
1359
1360         re->backup_blocks = compute_backup_blocks(
1361                 info->new_chunk, info->array.chunk_size,
1362                 re->after.data_disks,
1363                 re->before.data_disks);
1364
1365         re->new_size = info->component_size * re->after.data_disks;
1366         return NULL;
1367 }
1368
/* Forward declarations of file-local helpers that are referenced before
 * their definitions; being 'static', both must be defined later in this
 * file.  Both return an int and share the backup_file, quiet, restart
 * and freeze_reshape parameters.
 */
static int reshape_array(char *container, int fd, char *devname,
			 struct supertype *st, struct mdinfo *info,
			 int force, struct mddev_dev *devlist,
			 char *backup_file, int quiet, int forked,
			 int restart, int freeze_reshape);
static int reshape_container(char *container, char *devname,
			     int mdfd,
			     struct supertype *st,
			     struct mdinfo *info,
			     int force,
			     char *backup_file,
			     int quiet, int restart, int freeze_reshape);
1381
1382 int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
1383                  long long size,
1384                  int level, char *layout_str, int chunksize, int raid_disks,
1385                  struct mddev_dev *devlist,
1386                  int assume_clean, int force)
1387 {
1388         /* Make some changes in the shape of an array.
1389          * The kernel must support the change.
1390          *
1391          * There are three different changes.  Each can trigger
1392          * a resync or recovery so we freeze that until we have
1393          * requested everything (if kernel supports freezing - 2.6.30).
1394          * The steps are:
1395          *  - change size (i.e. component_size)
1396          *  - change level
1397          *  - change layout/chunksize/ndisks
1398          *
1399          * The last can require a reshape.  It is different on different
1400          * levels so we need to check the level before actioning it.
1401          * Some times the level change needs to be requested after the
1402          * reshape (e.g. raid6->raid5, raid5->raid0)
1403          *
1404          */
1405         struct mdu_array_info_s array;
1406         int rv = 0;
1407         struct supertype *st;
1408         char *subarray = NULL;
1409
1410         int frozen;
1411         int changed = 0;
1412         char *container = NULL;
1413         char container_buf[20];
1414         int cfd = -1;
1415
1416         struct mddev_dev *dv;
1417         int added_disks;
1418
1419         struct mdinfo info;
1420         struct mdinfo *sra;
1421
1422         if (ioctl(fd, GET_ARRAY_INFO, &array) < 0) {
1423                 fprintf(stderr, Name ": %s is not an active md array - aborting\n",
1424                         devname);
1425                 return 1;
1426         }
1427
1428         if (size >= 0 &&
1429             (chunksize || level!= UnSet || layout_str || raid_disks)) {
1430                 fprintf(stderr, Name ": cannot change component size at the same time "
1431                         "as other changes.\n"
1432                         "   Change size first, then check data is intact before "
1433                         "making other changes.\n");
1434                 return 1;
1435         }
1436
1437         if (raid_disks && raid_disks < array.raid_disks && array.level > 1 &&
1438             get_linux_version() < 2006032 &&
1439             !check_env("MDADM_FORCE_FEWER")) {
1440                 fprintf(stderr, Name ": reducing the number of devices is not safe before Linux 2.6.32\n"
1441                         "       Please use a newer kernel\n");
1442                 return 1;
1443         }
1444
1445         st = super_by_fd(fd, &subarray);
1446         if (!st) {
1447                 fprintf(stderr, Name ": Unable to determine metadata format for %s\n", devname);
1448                 return 1;
1449         }
1450         if (raid_disks > st->max_devs) {
1451                 fprintf(stderr, Name ": Cannot increase raid-disks on this array"
1452                         " beyond %d\n", st->max_devs);
1453                 return 1;
1454         }
1455
1456         /* in the external case we need to check that the requested reshape is
1457          * supported, and perform an initial check that the container holds the
1458          * pre-requisite spare devices (mdmon owns final validation)
1459          */
1460         if (st->ss->external) {
1461                 int container_dev;
1462                 int rv;
1463
1464                 if (subarray) {
1465                         container_dev = st->container_dev;
1466                         cfd = open_dev_excl(st->container_dev);
1467                 } else {
1468                         container_dev = st->devnum;
1469                         close(fd);
1470                         cfd = open_dev_excl(st->devnum);
1471                         fd = cfd;
1472                 }
1473                 if (cfd < 0) {
1474                         fprintf(stderr, Name ": Unable to open container for %s\n",
1475                                 devname);
1476                         free(subarray);
1477                         return 1;
1478                 }
1479
1480                 fmt_devname(container_buf, container_dev);
1481                 container = container_buf;
1482
1483                 rv = st->ss->load_container(st, cfd, NULL);
1484
1485                 if (rv) {
1486                         fprintf(stderr, Name ": Cannot read superblock for %s\n",
1487                                 devname);
1488                         free(subarray);
1489                         return 1;
1490                 }
1491
1492                 /* check if operation is supported for metadata handler */
1493                 if (st->ss->container_content) {
1494                         struct mdinfo *cc = NULL;
1495                         struct mdinfo *content = NULL;
1496
1497                         cc = st->ss->container_content(st, subarray);
1498                         for (content = cc; content ; content = content->next) {
1499                                 int allow_reshape = 1;
1500
1501                                 /* check if reshape is allowed based on metadata
1502                                  * indications stored in content->array.state
1503                                  */
1504                                 if (content->array.state & (1<<MD_SB_BLOCK_VOLUME))
1505                                         allow_reshape = 0;
1506                                 if (content->array.state
1507                                     & (1<<MD_SB_BLOCK_CONTAINER_RESHAPE))
1508                                         allow_reshape = 0;
1509                                 if (!allow_reshape) {
1510                                         fprintf(stderr, Name
1511                                                 " cannot reshape arrays in"
1512                                                 " container with unsupported"
1513                                                 " metadata: %s(%s)\n",
1514                                                 devname, container_buf);
1515                                         sysfs_free(cc);
1516                                         free(subarray);
1517                                         return 1;
1518                                 }
1519                         }
1520                         sysfs_free(cc);
1521                 }
1522                 if (mdmon_running(container_dev))
1523                         st->update_tail = &st->updates;
1524         }
1525
1526         added_disks = 0;
1527         for (dv = devlist; dv; dv = dv->next)
1528                 added_disks++;
1529         if (raid_disks > array.raid_disks &&
1530             array.spare_disks +added_disks < (raid_disks - array.raid_disks) &&
1531             !force) {
1532                 fprintf(stderr,
1533                         Name ": Need %d spare%s to avoid degraded array,"
1534                         " and only have %d.\n"
1535                         "       Use --force to over-ride this check.\n",
1536                         raid_disks - array.raid_disks,
1537                         raid_disks - array.raid_disks == 1 ? "" : "s",
1538                         array.spare_disks + added_disks);
1539                 return 1;
1540         }
1541
1542         sra = sysfs_read(fd, 0, GET_LEVEL | GET_DISKS | GET_DEVS
1543                          | GET_STATE | GET_VERSION);
1544         if (sra) {
1545                 if (st->ss->external && subarray == NULL) {
1546                         array.level = LEVEL_CONTAINER;
1547                         sra->array.level = LEVEL_CONTAINER;
1548                 }
1549         } else {
1550                 fprintf(stderr, Name ": failed to read sysfs parameters for %s\n",
1551                         devname);
1552                 return 1;
1553         }
1554         frozen = freeze(st);
1555         if (frozen < -1) {
1556                 /* freeze() already spewed the reason */
1557                 sysfs_free(sra);
1558                 return 1;
1559         } else if (frozen < 0) {
1560                 fprintf(stderr, Name ": %s is performing resync/recovery and cannot"
1561                         " be reshaped\n", devname);
1562                 sysfs_free(sra);
1563                 return 1;
1564         }
1565
1566         /* ========= set size =============== */
1567         if (size >= 0 && (size == 0 || size != array.size)) {
1568                 long long orig_size = get_component_size(fd)/2;
1569                 long long min_csize;
1570                 struct mdinfo *mdi;
1571
1572                 if (orig_size == 0)
1573                         orig_size = array.size;
1574
1575                 if (reshape_super(st, size, UnSet, UnSet, 0, 0, UnSet, NULL,
1576                                   devname, !quiet)) {
1577                         rv = 1;
1578                         goto release;
1579                 }
1580                 sync_metadata(st);
1581
1582                 /* Update the size of each member device in case
1583                  * they have been resized.  This will never reduce
1584                  * below the current used-size.  The "size" attribute
1585                  * understands '0' to mean 'max'.
1586                  */
1587                 min_csize = 0;
1588                 for (mdi = sra->devs; mdi; mdi = mdi->next) {
1589                         if (sysfs_set_num(sra, mdi, "size", size) < 0)
1590                                 break;
1591                         if (array.not_persistent == 0 &&
1592                             array.major_version == 0 &&
1593                             get_linux_version() < 3001000) {
1594                                 /* Dangerous to allow size to exceed 2TB */
1595                                 unsigned long long csize;
1596                                 if (sysfs_get_ll(sra, mdi, "size", &csize) == 0) {
1597                                         if (csize >= 2ULL*1024*1024*1024)
1598                                                 csize = 2ULL*1024*1024*1024;
1599                                         if ((min_csize == 0 || (min_csize
1600                                                                 > (long long)csize)))
1601                                                 min_csize = csize;
1602                                 }
1603                         }
1604                 }
1605                 if (min_csize && size > min_csize) {
1606                         fprintf(stderr, Name ": Cannot safely make this array "
1607                                 "use more than 2TB per device on this kernel.\n");
1608                         rv = 1;
1609                         goto release;
1610                 }
1611                 if (min_csize && size == 0) {
1612                         /* Don't let the kernel choose a size - it will get
1613                          * it wrong
1614                          */
1615                         fprintf(stderr, Name ": Limited v0.90 array to "
1616                                 "2TB per device\n");
1617                         size = min_csize;
1618                 }
1619
1620                 array.size = size;
1621                 if (array.size != size) {
1622                         /* got truncated to 32bit, write to
1623                          * component_size instead
1624                          */
1625                         if (sra)
1626                                 rv = sysfs_set_num(sra, NULL,
1627                                                    "component_size", size);
1628                         else
1629                                 rv = -1;
1630                 } else
1631                         rv = ioctl(fd, SET_ARRAY_INFO, &array);
1632                 if (rv != 0) {
1633                         int err = errno;
1634
1635                         /* restore metadata */
1636                         if (reshape_super(st, orig_size, UnSet, UnSet, 0, 0,
1637                                           UnSet, NULL, devname, !quiet) == 0)
1638                                 sync_metadata(st);
1639                         fprintf(stderr, Name ": Cannot set device size for %s: %s\n",
1640                                 devname, strerror(err));
1641                         if (err == EBUSY &&
1642                             (array.state & (1<<MD_SB_BITMAP_PRESENT)))
1643                                 fprintf(stderr, "       Bitmap must be removed before size can be changed\n");
1644                         rv = 1;
1645                         goto release;
1646                 }
1647                 if (assume_clean) {
1648                         /* This will fail on kernels older than 3.0 unless
1649                          * a backport has been arranged.
1650                          */
1651                         if (sra == NULL ||
1652                             sysfs_set_str(sra, NULL, "resync_start", "none") < 0)
1653                                 fprintf(stderr, Name ": --assume-clean not support with --grow on this kernel\n");
1654                 }
1655                 ioctl(fd, GET_ARRAY_INFO, &array);
1656                 size = get_component_size(fd)/2;
1657                 if (size == 0)
1658                         size = array.size;
1659                 if (!quiet) {
1660                         if (size == orig_size)
1661                                 fprintf(stderr, Name ": component size of %s "
1662                                         "unchanged at %lluK\n",
1663                                         devname, size);
1664                         else
1665                                 fprintf(stderr, Name ": component size of %s "
1666                                         "has been set to %lluK\n",
1667                                         devname, size);
1668                 }
1669                 changed = 1;
1670         } else if (array.level != LEVEL_CONTAINER) {
1671                 size = get_component_size(fd)/2;
1672                 if (size == 0)
1673                         size = array.size;
1674         }
1675
1676         /* See if there is anything else to do */
1677         if ((level == UnSet || level == array.level) &&
1678             (layout_str == NULL) &&
1679             (chunksize == 0 || chunksize == array.chunk_size) &&
1680             (raid_disks == 0 || raid_disks == array.raid_disks)) {
1681                 /* Nothing more to do */
1682                 if (!changed && !quiet)
1683                         fprintf(stderr, Name ": %s: no change requested\n",
1684                                 devname);
1685                 goto release;
1686         }
1687
1688         /* ========= check for Raid10/Raid1 -> Raid0 conversion ===============
1689          * the current implementation requires the following conditions to be met:
1690          * - RAID10:
1691          *      - far_copies == 1
1692          *      - near_copies == 2
1693          */
1694         if ((level == 0 && array.level == 10 && sra &&
1695              array.layout == ((1 << 8) + 2) && !(array.raid_disks & 1)) ||
1696             (level == 0 && array.level == 1 && sra)) {
1697                 int err;
1698                 err = remove_disks_for_takeover(st, sra, array.layout);
1699                 if (err) {
1700                         dprintf(Name": Array cannot be reshaped\n");
1701                         if (cfd > -1)
1702                                 close(cfd);
1703                         rv = 1;
1704                         goto release;
1705                 }
1706                 /* Make sure mdmon has seen the device removal
1707                  * and updated metadata before we continue with
1708                  * level change
1709                  */
1710                 if (container)
1711                         ping_monitor(container);
1712         }
1713
1714         memset(&info, 0, sizeof(info));
1715         info.array = array;
1716         sysfs_init(&info, fd, NoMdDev);
1717         strcpy(info.text_version, sra->text_version);
1718         info.component_size = size*2;
1719         info.new_level = level;
1720         info.new_chunk = chunksize * 1024;
1721         if (info.array.level == LEVEL_CONTAINER) {
1722                 info.delta_disks = UnSet;
1723                 info.array.raid_disks = raid_disks;
1724         } else if (raid_disks)
1725                 info.delta_disks = raid_disks - info.array.raid_disks;
1726         else
1727                 info.delta_disks = UnSet;
1728         if (layout_str == NULL) {
1729                 info.new_layout = UnSet;
1730                 if (info.array.level == 6 &&
1731                     (info.new_level == 6 || info.new_level == UnSet) &&
1732                     info.array.layout >= 16) {
1733                         fprintf(stderr, Name
1734                                 ": %s has a non-standard layout.  If you"
1735                                 " wish to preserve this\n"
1736                                 "      during the reshape, please specify"
1737                                 " --layout=preserve\n"
1738                                 "      If you want to change it, specify a"
1739                                 " layout or use --layout=normalise\n",
1740                                 devname);
1741                         rv = 1;
1742                         goto release;
1743                 }
1744         } else if (strcmp(layout_str, "normalise") == 0 ||
1745                    strcmp(layout_str, "normalize") == 0) {
1746                 /* If we have a -6 RAID6 layout, remove the '-6'. */
1747                 info.new_layout = UnSet;
1748                 if (info.array.level == 6 && info.new_level == UnSet) {
1749                         char l[40], *h;
1750                         strcpy(l, map_num(r6layout, info.array.layout));
1751                         h = strrchr(l, '-');
1752                         if (h && strcmp(h, "-6") == 0) {
1753                                 *h = 0;
1754                                 info.new_layout = map_name(r6layout, l);
1755                         }
1756                 }
1757         } else if (strcmp(layout_str, "preserve") == 0) {
1758                 info.new_layout = UnSet;
1759         } else {
1760                 int l = info.new_level;
1761                 if (l == UnSet)
1762                         l = info.array.level;
1763                 switch (l) {
1764                 case 5:
1765                         info.new_layout = map_name(r5layout, layout_str);
1766                         break;
1767                 case 6:
1768                         info.new_layout = map_name(r6layout, layout_str);
1769                         break;
1770                 case 10:
1771                         info.new_layout = parse_layout_10(layout_str);
1772                         break;
1773                 case LEVEL_FAULTY:
1774                         info.new_layout = parse_layout_faulty(layout_str);
1775                         break;
1776                 default:
1777                         fprintf(stderr, Name ": layout not meaningful"
1778                                 " with this level\n");
1779                         rv = 1;
1780                         goto release;
1781                 }
1782                 if (info.new_layout == UnSet) {
1783                         fprintf(stderr, Name ": layout %s not understood"
1784                                 " for this level\n",
1785                                 layout_str);
1786                         rv = 1;
1787                         goto release;
1788                 }
1789         }
1790
1791         if (array.level == LEVEL_FAULTY) {
1792                 if (level != UnSet && level != array.level) {
1793                         fprintf(stderr, Name ": cannot change level of Faulty device\n");
1794                         rv =1 ;
1795                 }
1796                 if (chunksize) {
1797                         fprintf(stderr, Name ": cannot set chunksize of Faulty device\n");
1798                         rv =1 ;
1799                 }
1800                 if (raid_disks && raid_disks != 1) {
1801                         fprintf(stderr, Name ": cannot set raid_disks of Faulty device\n");
1802                         rv =1 ;
1803                 }
1804                 if (layout_str) {
1805                         if (ioctl(fd, GET_ARRAY_INFO, &array) != 0) {
1806                                 dprintf("Cannot get array information.\n");
1807                                 goto release;
1808                         }
1809                         array.layout = info.new_layout;
1810                         if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) {
1811                                 fprintf(stderr, Name ": failed to set new layout\n");
1812                                 rv = 1;
1813                         } else if (!quiet)
1814                                 printf("layout for %s set to %d\n",
1815                                        devname, array.layout);
1816                 }
1817         } else if (array.level == LEVEL_CONTAINER) {
1818                 /* This change is to be applied to every array in the
1819                  * container.  This is only needed when the metadata imposes
1820                  * constraints on the various arrays in the container.
1821                  * Currently we only know that IMSM requires all arrays
1822                  * to have the same number of devices so changing the
1823                  * number of devices (On-Line Capacity Expansion) must be
1824                  * performed at the level of the container
1825                  */
1826                 rv = reshape_container(container, devname, -1, st, &info,
1827                                        force, backup_file, quiet, 0, 0);
1828                 frozen = 0;
1829         } else {
1830                 /* get spare devices from external metadata
1831                  */
1832                 if (st->ss->external) {
1833                         struct mdinfo *info2;
1834
1835                         info2 = st->ss->container_content(st, subarray);
1836                         if (info2) {
1837                                 info.array.spare_disks =
1838                                         info2->array.spare_disks;
1839                                 sysfs_free(info2);
1840                         }
1841                 }
1842
1843                 /* Impose these changes on a single array.  First
1844                  * check that the metadata is OK with the change. */
1845
1846                 if (reshape_super(st, info.component_size, info.new_level,
1847                                   info.new_layout, info.new_chunk,
1848                                   info.array.raid_disks, info.delta_disks,
1849                                   backup_file, devname, quiet)) {
1850                         rv = 1;
1851                         goto release;
1852                 }
1853                 sync_metadata(st);
1854                 rv = reshape_array(container, fd, devname, st, &info, force,
1855                                    devlist, backup_file, quiet, 0, 0, 0);
1856                 frozen = 0;
1857         }
1858 release:
1859         sysfs_free(sra);
1860         if (frozen > 0)
1861                 unfreeze(st);
1862         return rv;
1863 }
1864
1865 static int reshape_array(char *container, int fd, char *devname,
1866                          struct supertype *st, struct mdinfo *info,
1867                          int force, struct mddev_dev *devlist,
1868                          char *backup_file, int quiet, int forked,
1869                          int restart, int freeze_reshape)
1870 {
1871         struct reshape reshape;
1872         int spares_needed;
1873         char *msg;
1874         int orig_level = UnSet;
1875         int disks, odisks;
1876
1877         struct mdu_array_info_s array;
1878         char *c;
1879
1880         struct mddev_dev *dv;
1881         int added_disks;
1882
1883         int *fdlist = NULL;
1884         unsigned long long *offsets = NULL;
1885         int d;
1886         int nrdisks;
1887         int err;
1888         unsigned long blocks;
1889         unsigned long cache;
1890         unsigned long long array_size;
1891         int done;
1892         struct mdinfo *sra = NULL;
1893
1894         /* when reshaping a RAID0, the component_size might be zero.
1895          * So try to fix that up.
1896          */
1897         if (ioctl(fd, GET_ARRAY_INFO, &array) != 0) {
1898                 dprintf("Cannot get array information.\n");
1899                 goto release;
1900         }
1901         if (array.level == 0 && info->component_size == 0) {
1902                 get_dev_size(fd, NULL, &array_size);
1903                 info->component_size = array_size / array.raid_disks;
1904         }
1905
1906         if (info->reshape_active) {
1907                 int new_level = info->new_level;
1908                 info->new_level = UnSet;
1909                 if (info->delta_disks > 0)
1910                         info->array.raid_disks -= info->delta_disks;
1911                 msg = analyse_change(info, &reshape);
1912                 info->new_level = new_level;
1913                 if (info->delta_disks > 0)
1914                         info->array.raid_disks += info->delta_disks;
1915                 if (!restart)
1916                         /* Make sure the array isn't read-only */
1917                         ioctl(fd, RESTART_ARRAY_RW, 0);
1918         } else
1919                 msg = analyse_change(info, &reshape);
1920         if (msg) {
1921                 fprintf(stderr, Name ": %s\n", msg);
1922                 goto release;
1923         }
1924         if (restart &&
1925             (reshape.level != info->array.level ||
1926              reshape.before.layout != info->array.layout ||
1927              reshape.before.data_disks + reshape.parity
1928              != info->array.raid_disks - max(0, info->delta_disks))) {
1929                 fprintf(stderr, Name ": reshape info is not in native format -"
1930                         " cannot continue.\n");
1931                 goto release;
1932         }
1933
1934         if (restart) {
1935                 /* Reshape already started; just skip to monitoring the reshape. */
1936                 if (reshape.backup_blocks == 0)
1937                         return 0;
1938                 goto started;
1939         }
1940         /* The container is frozen but the array may not be.
1941          * So freeze the array so spares don't get put to the wrong use.
1942          * FIXME: there should probably be a cleaner separation between
1943          * freeze_array and freeze_container.
1944          */
1945         sysfs_freeze_array(info);
1946         /* Check we have enough spares to not be degraded */
1947         added_disks = 0;
1948         for (dv = devlist; dv ; dv=dv->next)
1949                 added_disks++;
1950         spares_needed = max(reshape.before.data_disks,
1951                             reshape.after.data_disks)
1952                 + reshape.parity - array.raid_disks;
1953
1954         if (!force &&
1955             info->new_level > 1 && info->array.level > 1 &&
1956             spares_needed > info->array.spare_disks + added_disks) {
1957                 fprintf(stderr,
1958                         Name ": Need %d spare%s to avoid degraded array,"
1959                         " and only have %d.\n"
1960                         "       Use --force to over-ride this check.\n",
1961                         spares_needed,
1962                         spares_needed == 1 ? "" : "s",
1963                         info->array.spare_disks + added_disks);
1964                 goto release;
1965         }
1966         /* Check we have enough spares to not fail */
1967         spares_needed = max(reshape.before.data_disks,
1968                             reshape.after.data_disks)
1969                 - array.raid_disks;
1970         if ((info->new_level > 1 || info->new_level == 0) &&
1971             spares_needed > info->array.spare_disks +added_disks) {
1972                 fprintf(stderr,
1973                         Name ": Need %d spare%s to create working array,"
1974                         " and only have %d.\n",
1975                         spares_needed,
1976                         spares_needed == 1 ? "" : "s",
1977                         info->array.spare_disks + added_disks);
1978                 goto release;
1979         }
1980
1981         if (reshape.level != array.level) {
1982                 char *c = map_num(pers, reshape.level);
1983                 int err;
1984                 if (c == NULL)
1985                         goto release;
1986
1987                 err = sysfs_set_str(info, NULL, "level", c);
1988                 if (err) {
1989                         err = errno;
1990                         fprintf(stderr, Name ": %s: could not set level to %s\n",
1991                                 devname, c);
1992                         if (err == EBUSY &&
1993                             (info->array.state & (1<<MD_SB_BITMAP_PRESENT)))
1994                                 fprintf(stderr, "       Bitmap must be removed"
1995                                         " before level can be changed\n");
1996                         goto release;
1997                 }
1998                 if (!quiet)
1999                         fprintf(stderr, Name ": level of %s changed to %s\n",
2000                                 devname, c);
2001                 orig_level = array.level;
2002                 sysfs_freeze_array(info);
2003
2004                 if (reshape.level > 0 && st->ss->external) {
2005                         /* make sure mdmon is aware of the new level */
2006                         if (mdmon_running(st->container_dev))
2007                                 flush_mdmon(container);
2008
2009                         if (!mdmon_running(st->container_dev))
2010                                 start_mdmon(st->container_dev);
2011                         ping_monitor(container);
2012                         if (mdmon_running(st->container_dev) &&
2013                             st->update_tail == NULL)
2014                                 st->update_tail = &st->updates;
2015                 }
2016         }
2017         /* ->reshape_super might have chosen some spares from the
2018          * container that it wants to be part of the new array.
2019          * We can collect them with ->container_content and give
2020          * them to the kernel.
2021          */
2022         if (st->ss->reshape_super && st->ss->container_content) {
2023                 char *subarray = strchr(info->text_version+1, '/')+1;
2024                 struct mdinfo *info2 =
2025                         st->ss->container_content(st, subarray);
2026                 struct mdinfo *d;
2027
2028                 if (info2) {
2029                         sysfs_init(info2, fd, st->devnum);
2030                         /* When increasing number of devices, we need to set
2031                          * new raid_disks before adding these, or they might
2032                          * be rejected.
2033                          */
2034                         if (reshape.backup_blocks &&
2035                             reshape.after.data_disks > reshape.before.data_disks)
2036                                 subarray_set_num(container, info2, "raid_disks",
2037                                                  reshape.after.data_disks +
2038                                                  reshape.parity);
2039                         for (d = info2->devs; d; d = d->next) {
2040                                 if (d->disk.state == 0 &&
2041                                     d->disk.raid_disk >= 0) {
2042                                         /* This is a spare that wants to
2043                                          * be part of the array.
2044                                          */
2045                                         add_disk(fd, st, info2, d);
2046                                 }
2047                         }
2048                         sysfs_free(info2);
2049                 }
2050         }
2051         /* We might have been given some devices to add to the
2052          * array.  Now that the array has been changed to the right
2053          * level and frozen, we can safely add them.
2054          */
2055         if (devlist)
2056                 Manage_subdevs(devname, fd, devlist, !quiet,
2057                                0,NULL, 0);
2058
2059         if (reshape.backup_blocks == 0) {
2060                 /* No restriping needed, but we might need to impose
2061                  * some more changes: layout, raid_disks, chunk_size
2062                  */
2063                 /* read current array info */
2064                 if (ioctl(fd, GET_ARRAY_INFO, &array) != 0) {
2065                         dprintf("Cannot get array information.\n");
2066                         goto release;
2067                 }
2068                 /* compare current array info with the new values and
2069                  * update any setting that differs */
2070                 if (info->new_layout != UnSet &&
2071                     info->new_layout != array.layout) {
2072                         array.layout = info->new_layout;
2073                         if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) {
2074                                 fprintf(stderr, Name ": failed to set new layout\n");
2075                                 goto release;
2076                         } else if (!quiet)
2077                                 printf("layout for %s set to %d\n",
2078                                        devname, array.layout);
2079                 }
2080                 if (info->delta_disks != UnSet &&
2081                     info->delta_disks != 0 &&
2082                     array.raid_disks != (info->array.raid_disks + info->delta_disks)) {
2083                         array.raid_disks += info->delta_disks;
2084                         if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) {
2085                                 fprintf(stderr, Name ": failed to set raid disks\n");
2086                                 goto release;
2087                         } else if (!quiet) {
2088                                 printf("raid_disks for %s set to %d\n",
2089                                        devname, array.raid_disks);
2090                         }
2091                 }
2092                 if (info->new_chunk != 0 &&
2093                     info->new_chunk != array.chunk_size) {
2094                         if (sysfs_set_num(info, NULL,
2095                                           "chunk_size", info->new_chunk) != 0) {
2096                                 fprintf(stderr, Name ": failed to set chunk size\n");
2097                                 goto release;
2098                         } else if (!quiet)
2099                                 printf("chunk size for %s set to %d\n",
2100                                        devname, array.chunk_size);
2101                 }
2102                 unfreeze(st);
2103                 return 0;
2104         }
2105
2106         /*
2107          * There are three possibilities.
2108          * 1/ The array will shrink.
2109          *    We need to ensure the reshape will pause before reaching
2110          *    the 'critical section'.  We also need to fork and wait for
2111          *    that to happen.  When it does we
2112          *       suspend/backup/complete/unfreeze
2113          *
2114          * 2/ The array will not change size.
2115          *    This requires that we keep a backup of a sliding window
2116          *    so that we can restore data after a crash.  So we need
2117          *    to fork and monitor progress.
2118          *    In future we will allow the data_offset to change, so
2119          *    a sliding backup becomes unnecessary.
2120          *
2121          * 3/ The array will grow. This is relatively easy.
2122          *    However the kernel's restripe routines will cheerfully
2123          *    overwrite some early data before it is safe.  So we
2124          *    need to make a backup of the early parts of the array
2125          *    and be ready to restore it if rebuild aborts very early.
2126          *    For externally managed metadata, we still need a forked
2127          *    child to monitor the reshape and suspend IO over the region
2128          *    that is being reshaped.
2129          *
2130          *    We backup data by writing it to one spare, or to a
2131          *    file which was given on command line.
2132          *
2133          * In each case, we first make sure that storage is available
2134          * for the required backup.
2135          * Then we:
2136          *   -  request the shape change.
2137          *   -  fork to handle backup etc.
2138          */
2139 started:
2140         /* Check that we can hold all the data */
2141         get_dev_size(fd, NULL, &array_size);
2142         if (reshape.new_size < (array_size/512)) {
2143                 fprintf(stderr,
2144                         Name ": this change will reduce the size of the array.\n"
2145                         "       use --grow --array-size first to truncate array.\n"
2146                         "       e.g. mdadm --grow %s --array-size %llu\n",
2147                         devname, reshape.new_size/2);
2148                 goto release;
2149         }
2150
2151         sra = sysfs_read(fd, 0,
2152                          GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE|GET_CHUNK|
2153                          GET_CACHE);
2154         if (!sra) {
2155                 fprintf(stderr, Name ": %s: Cannot get array details from sysfs\n",
2156                         devname);
2157                 goto release;
2158         }
2159
2160         /* Decide how many blocks (sectors) for a reshape
2161          * unit.  The number we have so far is just a minimum
2162          */
2163         blocks = reshape.backup_blocks;
2164         if (reshape.before.data_disks ==
2165             reshape.after.data_disks) {
2166                 /* Make 'blocks' bigger for better throughput, but
2167                  * not so big that we reject it below.
2168                  * Try for 16 megabytes
2169                  */
2170                 while (blocks * 32 < sra->component_size &&
2171                        blocks < 16*1024*2)
2172                         blocks *= 2;
2173         } else
2174                 fprintf(stderr, Name ": Need to backup %luK of critical "
2175                         "section..\n", blocks/2);
2176
2177         if (blocks >= sra->component_size/2) {
2178                 fprintf(stderr, Name ": %s: Something wrong"
2179                         " - reshape aborted\n",
2180                         devname);
2181                 goto release;
2182         }
2183
2184         /* Now we need to open all these devices so we can read/write.
2185          */
2186         nrdisks = max(reshape.before.data_disks,
2187                       reshape.after.data_disks) + reshape.parity
2188                 + sra->array.spare_disks;
2189         fdlist = malloc((1+nrdisks) * sizeof(int));
2190         offsets = malloc((1+nrdisks) * sizeof(offsets[0]));
2191         if (!fdlist || !offsets) {
2192                 fprintf(stderr, Name ": malloc failed: grow aborted\n");
2193                 goto release;
2194         }
2195
2196         odisks = reshape.before.data_disks + reshape.parity;
2197         d = reshape_prepare_fdlist(devname, sra, odisks,
2198                                    nrdisks, blocks, backup_file,
2199                                    fdlist, offsets);
2200         if (d < 0) {
2201                 goto release;
2202         }
2203         if ((st->ss->manage_reshape == NULL) ||
2204             (st->ss->recover_backup == NULL)) {
2205                 if (backup_file == NULL) {
2206                         if (reshape.after.data_disks <=
2207                             reshape.before.data_disks) {
2208                                 fprintf(stderr, Name ": %s: Cannot grow - "
2209                                         "need backup-file\n", devname);
2210                                 goto release;
2211                         } else if (sra->array.spare_disks == 0) {
2212                                 fprintf(stderr, Name ": %s: Cannot grow - "
2213                                         "need a spare or backup-file to backup "
2214                                         "critical section\n", devname);
2215                                 goto release;
2216                         }
2217                 } else {
2218                         if (!reshape_open_backup_file(backup_file, fd, devname,
2219                                                       (signed)blocks,
2220                                                       fdlist+d, offsets+d,
2221                                                       restart)) {
2222                                 goto release;
2223                         }
2224                         d++;
2225                 }
2226         }
2227
2228         /* lastly, check that the internal stripe cache is
2229          * large enough, or it won't work.
2230          * It must hold at least 4 stripes of the larger
2231          * chunk size
2232          */
2233         cache = max(info->array.chunk_size, info->new_chunk);
2234         cache *= 4; /* 4 stripes minimum */
2235         cache /= 512; /* convert to sectors */
2236         disks = min(reshape.before.data_disks, reshape.after.data_disks);
2237         /* make sure there is room for 'blocks' with a bit to spare */
2238         if (cache < 16 + blocks / disks)
2239                 cache = 16 + blocks / disks;
2240         cache /= (4096/512); /* Covert from sectors to pages */
2241
2242         if (sra->cache_size < cache)
2243                 subarray_set_num(container, sra, "stripe_cache_size",
2244                                  cache+1);
2245
2246         /* Right, everything seems fine. Let's kick things off.
2247          * If only changing raid_disks, use ioctl, else use
2248          * sysfs.
2249          */
2250         sync_metadata(st);
2251
2252         sra->new_chunk = info->new_chunk;
2253
2254         if (restart)
2255                 sra->reshape_progress = info->reshape_progress;
2256         else {
2257                 sra->reshape_progress = 0;
2258                 if (reshape.after.data_disks < reshape.before.data_disks)
2259                         /* start from the end of the new array */
2260                         sra->reshape_progress = (sra->component_size
2261                                                  * reshape.after.data_disks);
2262         }
2263
2264         if (info->array.chunk_size == info->new_chunk &&
2265             reshape.before.layout == reshape.after.layout &&
2266             st->ss->external == 0) {
2267                 /* use SET_ARRAY_INFO but only if reshape hasn't started */
2268                 ioctl(fd, GET_ARRAY_INFO, &array);
2269                 array.raid_disks = reshape.after.data_disks + reshape.parity;
2270                 if (!restart &&
2271                     ioctl(fd, SET_ARRAY_INFO, &array) != 0) {
2272                         int err = errno;
2273
2274                         fprintf(stderr,
2275                                 Name ": Cannot set device shape for %s: %s\n",
2276                                 devname, strerror(errno));
2277
2278                         if (err == EBUSY &&
2279                             (array.state & (1<<MD_SB_BITMAP_PRESENT)))
2280                                 fprintf(stderr,
2281                                         "       Bitmap must be removed before"
2282                                         " shape can be changed\n");
2283
2284                         goto release;
2285                 }
2286         } else if (!restart) {
2287                 /* set them all just in case some old 'new_*' value
2288                  * persists from some earlier problem.
2289                  */
2290                 int err = 0;
2291                 if (sysfs_set_num(sra, NULL, "chunk_size", info->new_chunk) < 0)
2292                         err = errno;
2293                 if (!err && sysfs_set_num(sra, NULL, "layout",
2294                                           reshape.after.layout) < 0)
2295                         err = errno;
2296                 if (!err && subarray_set_num(container, sra, "raid_disks",
2297                                              reshape.after.data_disks +
2298                                              reshape.parity) < 0)
2299                         err = errno;
2300                 if (err) {
2301                         fprintf(stderr, Name ": Cannot set device shape for %s\n",
2302                                 devname);
2303
2304                         if (err == EBUSY &&
2305                             (array.state & (1<<MD_SB_BITMAP_PRESENT)))
2306                                 fprintf(stderr,
2307                                         "       Bitmap must be removed before"
2308                                         " shape can be changed\n");
2309                         goto release;
2310                 }
2311         }
2312
2313         err = start_reshape(sra, restart, reshape.before.data_disks,
2314                             reshape.after.data_disks);
2315         if (err) {
2316                 fprintf(stderr,
2317                         Name ": Cannot %s reshape for %s\n",
2318                         restart ? "continue" : "start",
2319                         devname);
2320                 goto release;
2321         }
2322         if (restart)
2323                 sysfs_set_str(sra, NULL, "array_state", "active");
2324         if (freeze_reshape) {
2325                 free(fdlist);
2326                 free(offsets);
2327                 sysfs_free(sra);
2328                 fprintf(stderr, Name ": Reshape has to be continued from"
2329                         " location %llu when root filesystem has been mounted.\n",
2330                         sra->reshape_progress);
2331                 return 1;
2332         }
2333
2334         /* Now we just need to kick off the reshape and watch, while
2335          * handling backups of the data...
2336          * This is all done by a forked background process.
2337          */
2338         switch(forked ? 0 : fork()) {
2339         case -1:
2340                 fprintf(stderr, Name ": Cannot run child to monitor reshape: %s\n",
2341                         strerror(errno));
2342                 abort_reshape(sra);
2343                 goto release;
2344         default:
2345                 free(fdlist);
2346                 free(offsets);
2347                 sysfs_free(sra);
2348                 return 0;
2349         case 0:
2350                 map_fork();
2351                 break;
2352         }
2353
2354         close(fd);
2355         if (check_env("MDADM_GROW_VERIFY"))
2356                 fd = open(devname, O_RDONLY | O_DIRECT);
2357         else
2358                 fd = -1;
2359         mlockall(MCL_FUTURE);
2360
2361         if (st->ss->external) {
2362                 /* metadata handler takes it from here */
2363                 done = st->ss->manage_reshape(
2364                         fd, sra, &reshape, st, blocks,
2365                         fdlist, offsets,
2366                         d - odisks, fdlist+odisks,
2367                         offsets+odisks);
2368         } else
2369                 done = child_monitor(
2370                         fd, sra, &reshape, st, blocks,
2371                         fdlist, offsets,
2372                         d - odisks, fdlist+odisks,
2373                         offsets+odisks);
2374
2375         free(fdlist);
2376         free(offsets);
2377
2378         if (backup_file && done)
2379                 unlink(backup_file);
2380         if (!done) {
2381                 abort_reshape(sra);
2382                 goto out;
2383         }
2384
2385         if (!st->ss->external &&
2386             !(reshape.before.data_disks != reshape.after.data_disks
2387               && info->custom_array_size) &&
2388             info->new_level == reshape.level &&
2389             !forked) {
2390                 /* no need to wait for the reshape to finish as
2391                  * there is nothing more to do.
2392                  */
2393                 sysfs_free(sra);
2394                 exit(0);
2395         }
2396         wait_reshape(sra);
2397
2398         if (st->ss->external) {
2399                 /* Re-load the metadata as much could have changed */
2400                 int cfd = open_dev(st->container_dev);
2401                 if (cfd >= 0) {
2402                         flush_mdmon(container);
2403                         st->ss->free_super(st);
2404                         st->ss->load_container(st, cfd, container);
2405                         close(cfd);
2406                 }
2407         }
2408
2409         /* set new array size if required customer_array_size is used
2410          * by this metadata.
2411          */
2412         if (reshape.before.data_disks !=
2413             reshape.after.data_disks &&
2414             info->custom_array_size) {
2415                 struct mdinfo *info2;
2416                 char *subarray = strchr(info->text_version+1, '/')+1;
2417
2418                 info2 = st->ss->container_content(st, subarray);
2419                 if (info2) {
2420                         unsigned long long current_size = 0;
2421                         unsigned long long new_size =
2422                                 info2->custom_array_size/2;
2423
2424                         if (sysfs_get_ll(sra,
2425                                          NULL,
2426                                          "array_size",
2427                                          &current_size) == 0 &&
2428                             new_size > current_size) {
2429                                 if (sysfs_set_num(sra, NULL,
2430                                                   "array_size", new_size)
2431                                     < 0)
2432                                         dprintf("Error: Cannot"
2433                                                 " set array size");
2434                                 else
2435                                         dprintf("Array size "
2436                                                 "changed");
2437                                 dprintf(" from %llu to %llu.\n",
2438                                         current_size, new_size);
2439                         }
2440                         sysfs_free(info2);
2441                 }
2442         }
2443
2444         if (info->new_level != reshape.level) {
2445
2446                 c = map_num(pers, info->new_level);
2447                 if (c) {
2448                         err = sysfs_set_str(sra, NULL, "level", c);
2449                         if (err)
2450                                 fprintf(stderr, Name\
2451                                         ": %s: could not set level "
2452                                         "to %s\n", devname, c);
2453                 }
2454                 if (info->new_level == 0)
2455                         st->update_tail = NULL;
2456         }
2457 out:
2458         sysfs_free(sra);
2459         if (forked)
2460                 return 0;
2461         unfreeze(st);
2462         exit(0);
2463
2464 release:
2465         free(fdlist);
2466         free(offsets);
2467         if (orig_level != UnSet && sra) {
2468                 c = map_num(pers, orig_level);
2469                 if (c && sysfs_set_str(sra, NULL, "level", c) == 0)
2470                         fprintf(stderr, Name ": aborting level change\n");
2471         }
2472         sysfs_free(sra);
2473         if (!forked)
2474                 unfreeze(st);
2475         return 1;
2476 }
2477
2478 /* mdfd handle is passed to be closed in child process (after fork).
2479  */
2480 int reshape_container(char *container, char *devname,
2481                       int mdfd,
2482                       struct supertype *st,
2483                       struct mdinfo *info,
2484                       int force,
2485                       char *backup_file,
2486                       int quiet, int restart, int freeze_reshape)
2487 {
2488         struct mdinfo *cc = NULL;
2489         int rv = restart;
2490         int last_devnum = -1;
2491
2492         /* component_size is not meaningful for a container,
2493          * so pass '-1' meaning 'no change'
2494          */
2495         if (!restart &&
2496             reshape_super(st, -1, info->new_level,
2497                           info->new_layout, info->new_chunk,
2498                           info->array.raid_disks, info->delta_disks,
2499                           backup_file, devname, quiet)) {
2500                 unfreeze(st);
2501                 return 1;
2502         }
2503
2504         sync_metadata(st);
2505
2506         /* ping monitor to be sure that update is on disk
2507          */
2508         ping_monitor(container);
2509
2510         switch (fork()) {
2511         case -1: /* error */
2512                 perror("Cannot fork to complete reshape\n");
2513                 unfreeze(st);
2514                 return 1;
2515         default: /* parent */
2516                 if (!freeze_reshape)
2517                         printf(Name ": multi-array reshape continues"
2518                                " in background\n");
2519                 return 0;
2520         case 0: /* child */
2521                 map_fork();
2522                 break;
2523         }
2524
2525         /* close unused handle in child process
2526          */
2527         if (mdfd > -1)
2528                 close(mdfd);
2529
2530         while(1) {
2531                 /* For each member array with reshape_active,
2532                  * we need to perform the reshape.
2533                  * We pick the first array that needs reshaping and
2534                  * reshape it.  reshape_array() will re-read the metadata
2535                  * so the next time through a different array should be
2536                  * ready for reshape.
2537                  * It is possible that the 'different' array will not
2538                  * be assembled yet.  In that case we simple exit.
2539                  * When it is assembled, the mdadm which assembles it
2540                  * will take over the reshape.
2541                  */
2542                 struct mdinfo *content;
2543                 int fd;
2544                 struct mdstat_ent *mdstat;
2545                 char *adev;
2546
2547                 sysfs_free(cc);
2548
2549                 cc = st->ss->container_content(st, NULL);
2550
2551                 for (content = cc; content ; content = content->next) {
2552                         char *subarray;
2553                         if (!content->reshape_active)
2554                                 continue;
2555
2556                         subarray = strchr(content->text_version+1, '/')+1;
2557                         mdstat = mdstat_by_subdev(subarray,
2558                                                   devname2devnum(container));
2559                         if (!mdstat)
2560                                 continue;
2561                         break;
2562                 }
2563                 if (!content)
2564                         break;
2565
2566                 adev = map_dev(dev2major(mdstat->devnum),
2567                                dev2minor(mdstat->devnum),
2568                                0);
2569                 if (!adev)
2570                         adev = content->text_version;
2571
2572                 fd = open_dev(mdstat->devnum);
2573                 if (fd < 0) {
2574                         printf(Name ": Device %s cannot be opened for reshape.",
2575                                adev);
2576                         break;
2577                 }
2578
2579                 if (last_devnum == mdstat->devnum) {
2580                         /* Do not allow for multiple reshape_array() calls for
2581                          * the same array.
2582                          * It can happen when reshape_array() returns without
2583                          * error, when reshape is not finished (wrong reshape
2584                          * starting/continuation conditions).  Mdmon doesn't
2585                          * switch to next array in container and reentry
2586                          * conditions for the same array occur.
2587                          * This is possibly interim until the behaviour of
2588                          * reshape_array is resolved().
2589                          */
2590                         printf(Name ": Multiple reshape execution detected for "
2591                                "device  %s.", adev);
2592                         close(fd);
2593                         break;
2594                 }
2595                 last_devnum = mdstat->devnum;
2596
2597                 sysfs_init(content, fd, mdstat->devnum);
2598
2599                 if (mdmon_running(devname2devnum(container)))
2600                         flush_mdmon(container);
2601
2602                 rv = reshape_array(container, fd, adev, st,
2603                                    content, force, NULL,
2604                                    backup_file, quiet, 1, restart,
2605                                    freeze_reshape);
2606                 close(fd);
2607
2608                 if (freeze_reshape) {
2609                         sysfs_free(cc);
2610                         exit(0);
2611                 }
2612
2613                 restart = 0;
2614                 if (rv)
2615                         break;
2616
2617                 if (mdmon_running(devname2devnum(container)))
2618                         flush_mdmon(container);
2619         }
2620         if (!rv)
2621                 unfreeze(st);
2622         sysfs_free(cc);
2623         exit(0);
2624 }
2625
2626 /*
2627  * We run a child process in the background which performs the following
2628  * steps:
2629  *   - wait for resync to reach a certain point
2630  *   - suspend io to the following section
2631  *   - backup that section
2632  *   - allow resync to proceed further
2633  *   - resume io
2634  *   - discard the backup.
2635  *
2636  * These are combined in slightly different ways in the three cases.
2637  * Grow:
2638  *   - suspend/backup/allow/wait/resume/discard
2639  * Shrink:
2640  *   - allow/wait/suspend/backup/allow/wait/resume/discard
2641  * same-size:
2642  *   - wait/resume/discard/suspend/backup/allow
2643  *
2644  * suspend/backup/allow always come together
2645  * wait/resume/discard do too.
2646  * For the same-size case we have two backups to improve flow.
2647  *
2648  */
2649
2650 int progress_reshape(struct mdinfo *info, struct reshape *reshape,
2651                      unsigned long long backup_point,
2652                      unsigned long long wait_point,
2653                      unsigned long long *suspend_point,
2654                      unsigned long long *reshape_completed)
2655 {
2656         /* This function is called repeatedly by the reshape manager.
2657          * It determines how much progress can safely be made and allows
2658          * that progress.
2659          * - 'info' identifies the array and particularly records in
2660          *    ->reshape_progress the metadata's knowledge of progress
2661          *      This is a sector offset from the start of the array
2662          *      of the next array block to be relocated.  This number
2663          *      may increase from 0 or decrease from array_size, depending
2664          *      on the type of reshape that is happening.
2665          *    Note that in contrast, 'sync_completed' is a block count of the
2666          *    reshape so far.  It gives the distance between the start point
2667          *    (head or tail of device) and the next place that data will be
2668          *    written.  It always increases.
2669          * - 'reshape' is the structure created by analyse_change
2670          * - 'backup_point' shows how much the metadata manager has backed-up
2671          *   data.  For reshapes with increasing progress, it is the next address
2672          *   to be backed up, previous addresses have been backed-up.  For
2673          *   decreasing progress, it is the earliest address that has been
2674          *   backed up - later address are also backed up.
2675          *   So addresses between reshape_progress and backup_point are
2676          *   backed up providing those are in the 'correct' order.
2677          * - 'wait_point' is an array address.  When reshape_completed
2678          *   passes this point, progress_reshape should return.  It might
2679          *   return earlier if it determines that ->reshape_progress needs
2680          *   to be updated or further backup is needed.
2681          * - suspend_point is maintained by progress_reshape and the caller
2682          *   should not touch it except to initialise to zero.
2683          *   It is an array address and it only increases in 2.6.37 and earlier.
2684          *   This makes it difficult to handle reducing reshapes with
2685          *   external metadata.
2686          *   However:  it is similar to backup_point in that it records the
2687          *     other end of a suspended region from  reshape_progress.
2688          *     it is moved to extend the region that is safe to backup and/or
2689          *     reshape
2690          * - reshape_completed is read from sysfs and returned.  The caller
2691          *   should copy this into ->reshape_progress when it has reason to
2692          *   believe that the metadata knows this, and any backup outside this
2693          *   has been erased.
2694          *
2695          * Return value is:
2696          *   1 if more data from backup_point - but only as far as suspend_point,
2697          *     should be backed up
2698          *   0 if things are progressing smoothly
2699          *  -1 if the reshape is finished because it is all done,
2700          *  -2 if the reshape is finished due to an error.
2701          */
2702
2703         int advancing = (reshape->after.data_disks
2704                          >= reshape->before.data_disks);
2705         unsigned long long need_backup; /* All data between start of array and
2706                                          * here will at some point need to
2707                                          * be backed up.
2708                                          */
2709         unsigned long long read_offset, write_offset;
2710         unsigned long long write_range;
2711         unsigned long long max_progress, target, completed;
2712         unsigned long long array_size = (info->component_size
2713                                          * reshape->before.data_disks);
2714         int fd;
2715         char buf[20];
2716
2717         /* First, we unsuspend any region that is now known to be safe.
2718          * If suspend_point is on the 'wrong' side of reshape_progress, then
2719          * we don't have or need suspension at the moment.  This is true for
2720          * native metadata when we don't need to back-up.
2721          */
2722         if (advancing) {
2723                 if (info->reshape_progress <= *suspend_point)
2724                         sysfs_set_num(info, NULL, "suspend_lo",
2725                                       info->reshape_progress);
2726         } else {
2727                 /* Note: this won't work in 2.6.37 and before.
2728                  * Something somewhere should make sure we don't need it!
2729                  */
2730                 if (info->reshape_progress >= *suspend_point)
2731                         sysfs_set_num(info, NULL, "suspend_hi",
2732                                       info->reshape_progress);
2733         }
2734
2735         /* Now work out how far it is safe to progress.
2736          * If the read_offset for ->reshape_progress is less than
2737          * 'blocks' beyond the write_offset, we can only progress as far
2738          * as a backup.
2739          * Otherwise we can progress until the write_offset for the new location
2740          * reaches (within 'blocks' of) the read_offset at the current location.
2741          * However that region must be suspended unless we are using native
2742          * metadata.
2743          * If we need to suspend more, we limit it to 128M per device, which is
2744          * rather arbitrary and should be some time-based calculation.
2745          */
2746         read_offset = info->reshape_progress / reshape->before.data_disks;
2747         write_offset = info->reshape_progress / reshape->after.data_disks;
2748         write_range = info->new_chunk/512;
2749         if (reshape->before.data_disks == reshape->after.data_disks)
2750                 need_backup = array_size;
2751         else
2752                 need_backup = reshape->backup_blocks;
2753         if (advancing) {
2754                 if (read_offset < write_offset + write_range)
2755                         max_progress = backup_point;
2756                 else
2757                         max_progress =
2758                                 read_offset *
2759                                 reshape->after.data_disks;
2760         } else {
2761                 if (read_offset > write_offset - write_range)
2762                         /* Can only progress as far as has been backed up,
2763                          * which must be suspended */
2764                         max_progress = backup_point;
2765                 else if (info->reshape_progress <= need_backup)
2766                         max_progress = backup_point;
2767                 else {
2768                         if (info->array.major_version >= 0)
2769                                 /* Can progress until backup is needed */
2770                                 max_progress = need_backup;
2771                         else {
2772                                 /* Can progress until metadata update is required */
2773                                 max_progress =
2774                                         read_offset *
2775                                         reshape->after.data_disks;
2776                                 /* but data must be suspended */
2777                                 if (max_progress < *suspend_point)
2778                                         max_progress = *suspend_point;
2779                         }
2780                 }
2781         }
2782
2783         /* We know it is safe to progress to 'max_progress' providing
2784          * it is suspended or we are using native metadata.
2785          * Consider extending suspend_point 128M per device if it
2786          * is less than 64M per device beyond reshape_progress.
2787          * But always do a multiple of 'blocks'
2788          * FIXME this is too big - it takes to long to complete
2789          * this much.
2790          */
2791         target = 64*1024*2 * min(reshape->before.data_disks,
2792                                  reshape->after.data_disks);
2793         target /= reshape->backup_blocks;
2794         if (target < 2)
2795                 target = 2;
2796         target *= reshape->backup_blocks;
2797
2798         /* For externally managed metadata we always need to suspend IO to
2799          * the area being reshaped so we regularly push suspend_point forward.
2800          * For native metadata we only need the suspend if we are going to do
2801          * a backup.
2802          */
2803         if (advancing) {
2804                 if ((need_backup > info->reshape_progress
2805                      || info->array.major_version < 0) &&
2806                     *suspend_point < info->reshape_progress + target) {
2807                         if (need_backup < *suspend_point + 2 * target)
2808                                 *suspend_point = need_backup;
2809                         else if (*suspend_point + 2 * target < array_size)
2810                                 *suspend_point += 2 * target;
2811                         else
2812                                 *suspend_point = array_size;
2813                         sysfs_set_num(info, NULL, "suspend_hi", *suspend_point);
2814                         if (max_progress > *suspend_point)
2815                                 max_progress = *suspend_point;
2816                 }
2817         } else {
2818                 if (info->array.major_version >= 0) {
2819                         /* Only need to suspend when about to backup */
2820                         if (info->reshape_progress < need_backup * 2 &&
2821                             *suspend_point > 0) {
2822                                 *suspend_point = 0;
2823                                 sysfs_set_num(info, NULL, "suspend_lo", 0);
2824                                 sysfs_set_num(info, NULL, "suspend_hi", need_backup);
2825                         }
2826                 } else {
2827                         /* Need to suspend continually */
2828                         if (info->reshape_progress < *suspend_point)
2829                                 *suspend_point = info->reshape_progress;
2830                         if (*suspend_point + target < info->reshape_progress)
2831                                 /* No need to move suspend region yet */;
2832                         else {
2833                                 if (*suspend_point >= 2 * target)
2834                                         *suspend_point -= 2 * target;
2835                                 else
2836                                         *suspend_point = 0;
2837                                 sysfs_set_num(info, NULL, "suspend_lo",
2838                                               *suspend_point);
2839                         }
2840                         if (max_progress < *suspend_point)
2841                                 max_progress = *suspend_point;
2842                 }
2843         }
2844
2845         /* now set sync_max to allow that progress. sync_max, like
2846          * sync_completed is a count of sectors written per device, so
2847          * we find the difference between max_progress and the start point,
2848          * and divide that by after.data_disks to get a sync_max
2849          * number.
2850          * At the same time we convert wait_point to a similar number
2851          * for comparing against sync_completed.
2852          */
2853         /* scale down max_progress to per_disk */
2854         max_progress /= reshape->after.data_disks;
2855         /* Round to chunk size as some kernels give an erroneously high number */
2856         max_progress /= info->new_chunk/512;
2857         max_progress *= info->new_chunk/512;
2858         /* And round to old chunk size as the kernel wants that */
2859         max_progress /= info->array.chunk_size/512;
2860         max_progress *= info->array.chunk_size/512;
2861         /* Limit progress to the whole device */
2862         if (max_progress > info->component_size)
2863                 max_progress = info->component_size;
2864         wait_point /= reshape->after.data_disks;
2865         if (!advancing) {
2866                 /* switch from 'device offset' to 'processed block count' */
2867                 max_progress = info->component_size - max_progress;
2868                 wait_point = info->component_size - wait_point;
2869         }
2870
2871         sysfs_set_num(info, NULL, "sync_max", max_progress);
2872
2873         /* Now wait.  If we have already reached the point that we were
2874          * asked to wait to, don't wait at all, else wait for any change.
2875          * We need to select on 'sync_completed' as that is the place that
2876          * notifications happen, but we are really interested in
2877          * 'reshape_position'
2878          */
2879         fd = sysfs_get_fd(info, NULL, "sync_completed");
2880         if (fd < 0)
2881                 goto check_progress;
2882
2883         if (sysfs_fd_get_ll(fd, &completed) < 0)
2884                 goto check_progress;
2885
2886         while (completed < max_progress && completed < wait_point) {
2887                 /* Check that sync_action is still 'reshape' to avoid
2888                  * waiting forever on a dead array
2889                  */
2890                 char action[20];
2891                 fd_set rfds;
2892                 if (sysfs_get_str(info, NULL, "sync_action",
2893                                   action, 20) <= 0 ||
2894                     strncmp(action, "reshape", 7) != 0)
2895                         break;
2896                 /* Some kernels reset 'sync_completed' to zero
2897                  * before setting 'sync_action' to 'idle'.
2898                  * So we need these extra tests.
2899                  */
2900                 if (completed == 0 && advancing
2901                     && info->reshape_progress > 0)
2902                         break;
2903                 if (completed == 0 && !advancing
2904                     && info->reshape_progress < (info->component_size
2905                                                  * reshape->after.data_disks))
2906                         break;
2907                 FD_ZERO(&rfds);
2908                 FD_SET(fd, &rfds);
2909                 select(fd+1, NULL, NULL, &rfds, NULL);
2910                 if (sysfs_fd_get_ll(fd, &completed) < 0)
2911                         goto check_progress;
2912         }
2913         /* Some kernels reset 'sync_completed' to zero,
2914          * we need to have real point we are in md
2915          */
2916         if (completed == 0)
2917                 completed = max_progress;
2918
2919         /* some kernels can give an incorrectly high 'completed' number */
2920         completed /= (info->new_chunk/512);
2921         completed *= (info->new_chunk/512);
2922         /* Convert 'completed' back in to a 'progress' number */
2923         completed *= reshape->after.data_disks;
2924         if (!advancing) {
2925                 completed = info->component_size * reshape->after.data_disks
2926                         - completed;
2927         }
2928         *reshape_completed = completed;
2929
2930         close(fd);
2931
2932         /* We return the need_backup flag.  Caller will decide
2933          * how much - a multiple of ->backup_blocks up to *suspend_point
2934          */
2935         if (advancing)
2936                 return need_backup > info->reshape_progress;
2937         else
2938                 return need_backup >= info->reshape_progress;
2939
2940 check_progress:
2941         /* if we couldn't read a number from sync_completed, then
2942          * either the reshape did complete, or it aborted.
2943          * We can tell which by checking for 'none' in reshape_position.
2944          * If it did abort, then it might immediately restart if it
2945          * it was just a device failure that leaves us degraded but
2946          * functioning.
2947          */
2948         strcpy(buf, "hi");
2949         if (sysfs_get_str(info, NULL, "reshape_position", buf, sizeof(buf)) < 0
2950             || strncmp(buf, "none", 4) != 0) {
2951                 /* The abort might only be temporary.  Wait up to 10
2952                  * seconds for fd to contain a valid number again.
2953                  */
2954                 struct timeval tv;
2955                 int rv = -2;
2956                 tv.tv_sec = 10;
2957                 tv.tv_usec = 0;
2958                 while (fd >= 0 && rv < 0 && tv.tv_sec > 0) {
2959                         fd_set rfds;
2960                         FD_ZERO(&rfds);
2961                         FD_SET(fd, &rfds);
2962                         if (select(fd+1, NULL, NULL, &rfds, &tv) != 1)
2963                                 break;
2964                         switch (sysfs_fd_get_ll(fd, &completed)) {
2965                         case 0:
2966                                 /* all good again */
2967                                 rv = 1;
2968                                 break;
2969                         case -2: /* read error - abort */
2970                                 tv.tv_sec = 0;
2971                                 break;
2972                         }
2973                 }
2974                 if (fd >= 0)
2975                         close(fd);
2976                 return rv; /* abort */
2977         } else {
2978                 /* Maybe racing with array shutdown - check state */
2979                 if (fd >= 0)
2980                         close(fd);
2981                 if (sysfs_get_str(info, NULL, "array_state", buf, sizeof(buf)) < 0
2982                     || strncmp(buf, "inactive", 8) == 0
2983                     || strncmp(buf, "clear",5) == 0)
2984                         return -2; /* abort */
2985                 return -1; /* complete */
2986         }
2987 }
2988
2989 /* FIXME return status is never checked */
2990 static int grow_backup(struct mdinfo *sra,
2991                 unsigned long long offset, /* per device */
2992                 unsigned long stripes, /* per device, in old chunks */
2993                 int *sources, unsigned long long *offsets,
2994                 int disks, int chunk, int level, int layout,
2995                 int dests, int *destfd, unsigned long long *destoffsets,
2996                 int part, int *degraded,
2997                 char *buf)
2998 {
2999         /* Backup 'blocks' sectors at 'offset' on each device of the array,
3000          * to storage 'destfd' (offset 'destoffsets'), after first
3001          * suspending IO.  Then allow resync to continue
3002          * over the suspended section.
3003          * Use part 'part' of the backup-super-block.
3004          */
3005         int odata = disks;
3006         int rv = 0;
3007         int i;
3008         unsigned long long ll;
3009         int new_degraded;
3010         //printf("offset %llu\n", offset);
3011         if (level >= 4)
3012                 odata--;
3013         if (level == 6)
3014                 odata--;
3015
3016         /* Check that array hasn't become degraded, else we might backup the wrong data */
3017         if (sysfs_get_ll(sra, NULL, "degraded", &ll) < 0)
3018                 return -1; /* FIXME this error is ignored */
3019         new_degraded = (int)ll;
3020         if (new_degraded != *degraded) {
3021                 /* check each device to ensure it is still working */
3022                 struct mdinfo *sd;
3023                 for (sd = sra->devs ; sd ; sd = sd->next) {
3024                         if (sd->disk.state & (1<<MD_DISK_FAULTY))
3025                                 continue;
3026                         if (sd->disk.state & (1<<MD_DISK_SYNC)) {
3027                                 char sbuf[20];
3028                                 if (sysfs_get_str(sra, sd, "state", sbuf, 20) < 0 ||
3029                                     strstr(sbuf, "faulty") ||
3030                                     strstr(sbuf, "in_sync") == NULL) {
3031                                         /* this device is dead */
3032                                         sd->disk.state = (1<<MD_DISK_FAULTY);
3033                                         if (sd->disk.raid_disk >= 0 &&
3034                                             sources[sd->disk.raid_disk] >= 0) {
3035                                                 close(sources[sd->disk.raid_disk]);
3036                                                 sources[sd->disk.raid_disk] = -1;
3037                                         }
3038                                 }
3039                         }
3040                 }
3041                 *degraded = new_degraded;
3042         }
3043         if (part) {
3044                 bsb.arraystart2 = __cpu_to_le64(offset * odata);
3045                 bsb.length2 = __cpu_to_le64(stripes * (chunk/512) * odata);
3046         } else {
3047                 bsb.arraystart = __cpu_to_le64(offset * odata);
3048                 bsb.length = __cpu_to_le64(stripes * (chunk/512) * odata);
3049         }
3050         if (part)
3051                 bsb.magic[15] = '2';
3052         for (i = 0; i < dests; i++)
3053                 if (part)
3054                         lseek64(destfd[i], destoffsets[i] + __le64_to_cpu(bsb.devstart2)*512, 0);
3055                 else
3056                         lseek64(destfd[i], destoffsets[i], 0);
3057
3058         rv = save_stripes(sources, offsets,
3059                           disks, chunk, level, layout,
3060                           dests, destfd,
3061                           offset*512*odata, stripes * chunk * odata,
3062                           buf);
3063
3064         if (rv)
3065                 return rv;
3066         bsb.mtime = __cpu_to_le64(time(0));
3067         for (i = 0; i < dests; i++) {
3068                 bsb.devstart = __cpu_to_le64(destoffsets[i]/512);
3069
3070                 bsb.sb_csum = bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum)-((char*)&bsb));
3071                 if (memcmp(bsb.magic, "md_backup_data-2", 16) == 0)
3072                         bsb.sb_csum2 = bsb_csum((char*)&bsb,
3073                                                 ((char*)&bsb.sb_csum2)-((char*)&bsb));
3074
3075                 rv = -1;
3076                 if ((unsigned long long)lseek64(destfd[i], destoffsets[i] - 4096, 0)
3077                     != destoffsets[i] - 4096)
3078                         break;
3079                 if (write(destfd[i], &bsb, 512) != 512)
3080                         break;
3081                 if (destoffsets[i] > 4096) {
3082                         if ((unsigned long long)lseek64(destfd[i], destoffsets[i]+stripes*chunk*odata, 0) !=
3083                             destoffsets[i]+stripes*chunk*odata)
3084                                 break;
3085                         if (write(destfd[i], &bsb, 512) != 512)
3086                                 break;
3087                 }
3088                 fsync(destfd[i]);
3089                 rv = 0;
3090         }
3091
3092         return rv;
3093 }
3094
3095 /* in 2.6.30, the value reported by sync_completed can be
3096  * less than it should be by one stripe.
3097  * This only happens when reshape hits sync_max and pauses.
3098  * So allow wait_backup to either extend sync_max further
3099  * than strictly necessary, or return before the
3100  * sync has got quite as far as we would really like.
3101  * This is what 'blocks2' is for.
3102  * The various callers give appropriate values so that
3103  * everything works.
3104  */
3105 /* FIXME return value is often ignored */
3106 static int forget_backup(int dests, int *destfd,
3107                          unsigned long long *destoffsets,
3108                          int part)
3109 {
3110         /*
3111          * Erase backup 'part' (which is 0 or 1)
3112          */
3113         int i;
3114         int rv;
3115
3116         if (part) {
3117                 bsb.arraystart2 = __cpu_to_le64(0);
3118                 bsb.length2 = __cpu_to_le64(0);
3119         } else {
3120                 bsb.arraystart = __cpu_to_le64(0);
3121                 bsb.length = __cpu_to_le64(0);
3122         }
3123         bsb.mtime = __cpu_to_le64(time(0));
3124         rv = 0;
3125         for (i = 0; i < dests; i++) {
3126                 bsb.devstart = __cpu_to_le64(destoffsets[i]/512);
3127                 bsb.sb_csum = bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum)-((char*)&bsb));
3128                 if (memcmp(bsb.magic, "md_backup_data-2", 16) == 0)
3129                         bsb.sb_csum2 = bsb_csum((char*)&bsb,
3130                                                 ((char*)&bsb.sb_csum2)-((char*)&bsb));
3131                 if ((unsigned long long)lseek64(destfd[i], destoffsets[i]-4096, 0) !=
3132                     destoffsets[i]-4096)
3133                         rv = -1;
3134                 if (rv == 0 &&
3135                     write(destfd[i], &bsb, 512) != 512)
3136                         rv = -1;
3137                 fsync(destfd[i]);
3138         }
3139         return rv;
3140 }
3141
/*
 * Print 'msg' followed by a newline on file descriptor 2 and terminate
 * the process.  The exit status is 2 when both writes were complete and
 * 1 when either of them was not.
 */
static void fail(char *msg)
{
        int write_failed = 0;

        if (write(2, msg, strlen(msg)) != (int)strlen(msg))
                write_failed = 1;
        /* the newline is attempted even if the message was cut short */
        if (write(2, "\n", 1) != 1)
                write_failed = 1;

        if (write_failed)
                exit(1);
        exit(2);
}
3149
3150 static char *abuf, *bbuf;
3151 static unsigned long long abuflen;
3152 static void validate(int afd, int bfd, unsigned long long offset)
3153 {
3154         /* check that the data in the backup against the array.
3155          * This is only used for regression testing and should not
3156          * be used while the array is active
3157          */
3158         if (afd < 0)
3159                 return;
3160         lseek64(bfd, offset - 4096, 0);
3161         if (read(bfd, &bsb2, 512) != 512)
3162                 fail("cannot read bsb");
3163         if (bsb2.sb_csum != bsb_csum((char*)&bsb2,
3164                                      ((char*)&bsb2.sb_csum)-((char*)&bsb2)))
3165                 fail("first csum bad");
3166         if (memcmp(bsb2.magic, "md_backup_data", 14) != 0)
3167                 fail("magic is bad");
3168         if (memcmp(bsb2.magic, "md_backup_data-2", 16) == 0 &&
3169             bsb2.sb_csum2 != bsb_csum((char*)&bsb2,
3170                                       ((char*)&bsb2.sb_csum2)-((char*)&bsb2)))
3171                 fail("second csum bad");
3172
3173         if (__le64_to_cpu(bsb2.devstart)*512 != offset)
3174                 fail("devstart is wrong");
3175
3176         if (bsb2.length) {
3177                 unsigned long long len = __le64_to_cpu(bsb2.length)*512;
3178
3179                 if (abuflen < len) {
3180                         free(abuf);
3181                         free(bbuf);
3182                         abuflen = len;
3183                         if (posix_memalign((void**)&abuf, 4096, abuflen) ||
3184                             posix_memalign((void**)&bbuf, 4096, abuflen)) {
3185                                 abuflen = 0;
3186                                 /* just stop validating on mem-alloc failure */
3187                                 return;
3188                         }
3189                 }
3190
3191                 lseek64(bfd, offset, 0);
3192                 if ((unsigned long long)read(bfd, bbuf, len) != len) {
3193                         //printf("len %llu\n", len);
3194                         fail("read first backup failed");
3195                 }
3196                 lseek64(afd, __le64_to_cpu(bsb2.arraystart)*512, 0);
3197                 if ((unsigned long long)read(afd, abuf, len) != len)
3198                         fail("read first from array failed");
3199                 if (memcmp(bbuf, abuf, len) != 0) {
3200 #if 0
3201                         int i;
3202                         printf("offset=%llu len=%llu\n",
3203                                (unsigned long long)__le64_to_cpu(bsb2.arraystart)*512, len);
3204                         for (i=0; i<len; i++)
3205                                 if (bbuf[i] != abuf[i]) {
3206                                         printf("first diff byte %d\n", i);
3207                                         break;
3208                                 }
3209 #endif
3210                         fail("data1 compare failed");
3211                 }
3212         }
3213         if (bsb2.length2) {
3214                 unsigned long long len = __le64_to_cpu(bsb2.length2)*512;
3215
3216                 if (abuflen < len) {
3217                         free(abuf);
3218                         free(bbuf);
3219                         abuflen = len;
3220                         abuf = malloc(abuflen);
3221                         bbuf = malloc(abuflen);
3222                 }
3223
3224                 lseek64(bfd, offset+__le64_to_cpu(bsb2.devstart2)*512, 0);
3225                 if ((unsigned long long)read(bfd, bbuf, len) != len)
3226                         fail("read second backup failed");
3227                 lseek64(afd, __le64_to_cpu(bsb2.arraystart2)*512, 0);
3228                 if ((unsigned long long)read(afd, abuf, len) != len)
3229                         fail("read second from array failed");
3230                 if (memcmp(bbuf, abuf, len) != 0)
3231                         fail("data2 compare failed");
3232         }
3233 }
3234
3235 int child_monitor(int afd, struct mdinfo *sra, struct reshape *reshape,
3236                   struct supertype *st, unsigned long blocks,
3237                   int *fds, unsigned long long *offsets,
3238                   int dests, int *destfd, unsigned long long *destoffsets)
3239 {
3240         /* Monitor a reshape where backup is being performed using
3241          * 'native' mechanism - either to a backup file, or
3242          * to some space in a spare.
3243          */
3244         char *buf;
3245         int degraded = -1;
3246         unsigned long long speed;
3247         unsigned long long suspend_point, array_size;
3248         unsigned long long backup_point, wait_point;
3249         unsigned long long reshape_completed;
3250         int done = 0;
3251         int increasing = reshape->after.data_disks >= reshape->before.data_disks;
3252         int part = 0; /* The next part of the backup area to fill.  It may already
3253                        * be full, so we need to check */
3254         int level = reshape->level;
3255         int layout = reshape->before.layout;
3256         int data = reshape->before.data_disks;
3257         int disks = reshape->before.data_disks + reshape->parity;
3258         int chunk = sra->array.chunk_size;
3259         struct mdinfo *sd;
3260         unsigned long stripes;
3261         int uuid[4];
3262
3263         /* set up the backup-super-block.  This requires the
3264          * uuid from the array.
3265          */
3266         /* Find a superblock */
3267         for (sd = sra->devs; sd; sd = sd->next) {
3268                 char *dn;
3269                 int devfd;
3270                 int ok;
3271                 if (sd->disk.state & (1<<MD_DISK_FAULTY))
3272                         continue;
3273                 dn = map_dev(sd->disk.major, sd->disk.minor, 1);
3274                 devfd = dev_open(dn, O_RDONLY);
3275                 if (devfd < 0)
3276                         continue;
3277                 ok = st->ss->load_super(st, devfd, NULL);
3278                 close(devfd);
3279                 if (ok == 0)
3280                         break;
3281         }
3282         if (!sd) {
3283                 fprintf(stderr, Name ": Cannot find a superblock\n");
3284                 return 0;
3285         }
3286
3287         memset(&bsb, 0, 512);
3288         memcpy(bsb.magic, "md_backup_data-1", 16);
3289         st->ss->uuid_from_super(st, uuid);
3290         memcpy(bsb.set_uuid, uuid, 16);
3291         bsb.mtime = __cpu_to_le64(time(0));
3292         bsb.devstart2 = blocks;
3293
3294         stripes = blocks / (sra->array.chunk_size/512) /
3295                 reshape->before.data_disks;
3296
3297         if (posix_memalign((void**)&buf, 4096, disks * chunk))
3298                 /* Don't start the 'reshape' */
3299                 return 0;
3300         if (reshape->before.data_disks == reshape->after.data_disks) {
3301                 sysfs_get_ll(sra, NULL, "sync_speed_min", &speed);
3302                 sysfs_set_num(sra, NULL, "sync_speed_min", 200000);
3303         }
3304
3305         if (increasing) {
3306                 array_size = sra->component_size * reshape->after.data_disks;
3307                 backup_point = sra->reshape_progress;
3308                 suspend_point = 0;
3309         } else {
3310                 array_size = sra->component_size * reshape->before.data_disks;
3311                 backup_point = reshape->backup_blocks;
3312                 suspend_point = array_size;
3313         }
3314
3315         while (!done) {
3316                 int rv;
3317
3318                 /* Want to return as soon the oldest backup slot can
3319                  * be released as that allows us to start backing up
3320                  * some more, providing suspend_point has been
3321                  * advanced, which it should have.
3322                  */
3323                 if (increasing) {
3324                         wait_point = array_size;
3325                         if (part == 0 && __le64_to_cpu(bsb.length) > 0)
3326                                 wait_point = (__le64_to_cpu(bsb.arraystart) +
3327                                               __le64_to_cpu(bsb.length));
3328                         if (part == 1 && __le64_to_cpu(bsb.length2) > 0)
3329                                 wait_point = (__le64_to_cpu(bsb.arraystart2) +
3330                                               __le64_to_cpu(bsb.length2));
3331                 } else {
3332                         wait_point = 0;
3333                         if (part == 0 && __le64_to_cpu(bsb.length) > 0)
3334                                 wait_point = __le64_to_cpu(bsb.arraystart);
3335                         if (part == 1 && __le64_to_cpu(bsb.length2) > 0)
3336                                 wait_point = __le64_to_cpu(bsb.arraystart2);
3337                 }
3338
3339                 rv = progress_reshape(sra, reshape,
3340                                       backup_point, wait_point,
3341                                       &suspend_point, &reshape_completed);
3342                 /* external metadata would need to ping_monitor here */
3343                 sra->reshape_progress = reshape_completed;
3344
3345                 /* Clear any backup region that is before 'here' */
3346                 if (increasing) {
3347                         if (__le64_to_cpu(bsb.length) > 0 &&
3348                             reshape_completed >= (__le64_to_cpu(bsb.arraystart) +
3349                                                   __le64_to_cpu(bsb.length)))
3350                                 forget_backup(dests, destfd,
3351                                               destoffsets, 0);
3352                         if (__le64_to_cpu(bsb.length2) > 0 &&
3353                             reshape_completed >= (__le64_to_cpu(bsb.arraystart2) +
3354                                                   __le64_to_cpu(bsb.length2)))
3355                                 forget_backup(dests, destfd,
3356                                               destoffsets, 1);
3357                 } else {
3358                         if (__le64_to_cpu(bsb.length) > 0 &&
3359                             reshape_completed <= (__le64_to_cpu(bsb.arraystart)))
3360                                 forget_backup(dests, destfd,
3361                                               destoffsets, 0);
3362                         if (__le64_to_cpu(bsb.length2) > 0 &&
3363                             reshape_completed <= (__le64_to_cpu(bsb.arraystart2)))
3364                                 forget_backup(dests, destfd,
3365                                               destoffsets, 1);
3366                 }
3367
3368                 if (rv < 0) {
3369                         if (rv == -1)
3370                                 done = 1;
3371                         break;
3372                 }
3373                 if (rv == 0 && increasing && !st->ss->external) {
3374                         /* No longer need to monitor this reshape */
3375                         done = 1;
3376                         break;
3377                 }
3378
3379                 while (rv) {
3380                         unsigned long long offset;
3381                         unsigned long actual_stripes;
3382                         /* Need to backup some data.
3383                          * If 'part' is not used and the desired
3384                          * backup size is suspended, do a backup,
3385                          * then consider the next part.
3386                          */
3387                         /* Check that 'part' is unused */
3388                         if (part == 0 && __le64_to_cpu(bsb.length) != 0)
3389                                 break;
3390                         if (part == 1 && __le64_to_cpu(bsb.length2) != 0)
3391                                 break;
3392
3393                         offset = backup_point / data;
3394                         actual_stripes = stripes;
3395                         if (increasing) {
3396                                 if (offset + actual_stripes * (chunk/512) >
3397                                     sra->component_size)
3398                                         actual_stripes = ((sra->component_size - offset)
3399                                                           / (chunk/512));
3400                                 if (offset + actual_stripes * (chunk/512) >
3401                                     suspend_point/data)
3402                                         break;
3403                         } else {
3404                                 if (offset < actual_stripes * (chunk/512))
3405                                         actual_stripes = offset / (chunk/512);
3406                                 offset -= actual_stripes * (chunk/512);
3407                                 if (offset < suspend_point/data)
3408                                         break;
3409                         }
3410                         if (actual_stripes == 0)
3411                                 break;
3412                         grow_backup(sra, offset, actual_stripes,
3413                                     fds, offsets,
3414                                     disks, chunk, level, layout,
3415                                     dests, destfd, destoffsets,
3416                                     part, &degraded, buf);
3417                         validate(afd, destfd[0], destoffsets[0]);
3418                         /* record where 'part' is up to */
3419                         part = !part;
3420                         if (increasing)
3421                                 backup_point += actual_stripes * (chunk/512) * data;
3422                         else
3423                                 backup_point -= actual_stripes * (chunk/512) * data;
3424                 }
3425         }
3426
3427         /* FIXME maybe call progress_reshape one more time instead */
3428         abort_reshape(sra); /* remove any remaining suspension */
3429         if (reshape->before.data_disks == reshape->after.data_disks)
3430                 sysfs_set_num(sra, NULL, "sync_speed_min", speed);
3431         free(buf);
3432         return done;
3433 }
3434
3435 /*
3436  * If any spare contains md_back_data-1 which is recent wrt mtime,
3437  * write that data into the array and update the super blocks with
3438  * the new reshape_progress
3439  */
3440 int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt,
3441                  char *backup_file, int verbose)
3442 {
3443         int i, j;
3444         int old_disks;
3445         unsigned long long *offsets;
3446         unsigned long long  nstripe, ostripe;
3447         int ndata, odata;
3448
3449         odata = info->array.raid_disks - info->delta_disks - 1;
3450         if (info->array.level == 6) odata--; /* number of data disks */
3451         ndata = info->array.raid_disks - 1;
3452         if (info->new_level == 6) ndata--;
3453
3454         old_disks = info->array.raid_disks - info->delta_disks;
3455
3456         if (info->delta_disks <= 0)
3457                 /* Didn't grow, so the backup file must have
3458                  * been used
3459                  */
3460                 old_disks = cnt;
3461         for (i=old_disks-(backup_file?1:0); i<cnt; i++) {
3462                 struct mdinfo dinfo;
3463                 int fd;
3464                 int bsbsize;
3465                 char *devname, namebuf[20];
3466                 unsigned long long lo, hi;
3467
3468                 /* This was a spare and may have some saved data on it.
3469                  * Load the superblock, find and load the
3470                  * backup_super_block.
3471                  * If either fail, go on to next device.
3472                  * If the backup contains no new info, just return
3473                  * else restore data and update all superblocks
3474                  */
3475                 if (i == old_disks-1) {
3476                         fd = open(backup_file, O_RDONLY);
3477                         if (fd<0) {
3478                                 fprintf(stderr, Name ": backup file %s inaccessible: %s\n",
3479                                         backup_file, strerror(errno));
3480                                 continue;
3481                         }
3482                         devname = backup_file;
3483                 } else {
3484                         fd = fdlist[i];
3485                         if (fd < 0)
3486                                 continue;
3487                         if (st->ss->load_super(st, fd, NULL))
3488                                 continue;
3489
3490                         st->ss->getinfo_super(st, &dinfo, NULL);
3491                         st->ss->free_super(st);
3492
3493                         if (lseek64(fd,
3494                                     (dinfo.data_offset + dinfo.component_size - 8) <<9,
3495                                     0) < 0) {
3496                                 fprintf(stderr, Name ": Cannot seek on device %d\n", i);
3497                                 continue; /* Cannot seek */
3498                         }
3499                         sprintf(namebuf, "device-%d", i);
3500                         devname = namebuf;
3501                 }
3502                 if (read(fd, &bsb, sizeof(bsb)) != sizeof(bsb)) {
3503                         if (verbose)
3504                                 fprintf(stderr, Name ": Cannot read from %s\n", devname);
3505                         continue; /* Cannot read */
3506                 }
3507                 if (memcmp(bsb.magic, "md_backup_data-1", 16) != 0 &&
3508                     memcmp(bsb.magic, "md_backup_data-2", 16) != 0) {
3509                         if (verbose)
3510                                 fprintf(stderr, Name ": No backup metadata on %s\n", devname);
3511                         continue;
3512                 }
3513                 if (bsb.sb_csum != bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum)-((char*)&bsb))) {
3514                         if (verbose)
3515                                 fprintf(stderr, Name ": Bad backup-metadata checksum on %s\n", devname);
3516                         continue; /* bad checksum */
3517                 }
3518                 if (memcmp(bsb.magic, "md_backup_data-2", 16) == 0 &&
3519                     bsb.sb_csum2 != bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum2)-((char*)&bsb))) {
3520                         if (verbose)
3521                                 fprintf(stderr, Name ": Bad backup-metadata checksum2 on %s\n", devname);
3522                         continue; /* Bad second checksum */
3523                 }
3524                 if (memcmp(bsb.set_uuid,info->uuid, 16) != 0) {
3525                         if (verbose)
3526                                 fprintf(stderr, Name ": Wrong uuid on backup-metadata on %s\n", devname);
3527                         continue; /* Wrong uuid */
3528                 }
3529
3530                 /* array utime and backup-mtime should be updated at much the same time, but it seems that
3531                  * sometimes they aren't... So allow considerable flexability in matching, and allow
3532                  * this test to be overridden by an environment variable.
3533                  */
3534                 if (info->array.utime > (int)__le64_to_cpu(bsb.mtime) + 2*60*60 ||
3535                     info->array.utime < (int)__le64_to_cpu(bsb.mtime) - 10*60) {
3536                         if (check_env("MDADM_GROW_ALLOW_OLD")) {
3537                                 fprintf(stderr, Name ": accepting backup with timestamp %lu "
3538                                         "for array with timestamp %lu\n",
3539                                         (unsigned long)__le64_to_cpu(bsb.mtime),
3540                                         (unsigned long)info->array.utime);
3541                         } else {
3542                                 if (verbose)
3543                                         fprintf(stderr, Name ": too-old timestamp on "
3544                                                 "backup-metadata on %s\n", devname);
3545                                 continue; /* time stamp is too bad */
3546                         }
3547                 }
3548
3549                 if (bsb.magic[15] == '1') {
3550                         if (bsb.length == 0)
3551                                 continue;
3552                         if (info->delta_disks >= 0) {
3553                                 /* reshape_progress is increasing */
3554                                 if (__le64_to_cpu(bsb.arraystart)
3555                                     + __le64_to_cpu(bsb.length)
3556                                     < info->reshape_progress) {
3557                                 nonew:
3558                                         if (verbose)
3559                                                 fprintf(stderr, Name
3560                                                         ": backup-metadata found on %s but is not needed\n", devname);
3561                                         continue; /* No new data here */
3562                                 }
3563                         } else {
3564                                 /* reshape_progress is decreasing */
3565                                 if (__le64_to_cpu(bsb.arraystart) >=
3566                                     info->reshape_progress)
3567                                         goto nonew; /* No new data here */
3568                         }
3569                 } else {
3570                         if (bsb.length == 0 && bsb.length2 == 0)
3571                                 continue;
3572                         if (info->delta_disks >= 0) {
3573                                 /* reshape_progress is increasing */
3574                                 if ((__le64_to_cpu(bsb.arraystart)
3575                                      + __le64_to_cpu(bsb.length)
3576                                      < info->reshape_progress)
3577                                     &&
3578                                     (__le64_to_cpu(bsb.arraystart2)
3579                                      + __le64_to_cpu(bsb.length2)
3580                                      < info->reshape_progress))
3581                                         goto nonew; /* No new data here */
3582                         } else {
3583                                 /* reshape_progress is decreasing */
3584                                 if (__le64_to_cpu(bsb.arraystart) >=
3585                                     info->reshape_progress &&
3586                                     __le64_to_cpu(bsb.arraystart2) >=
3587                                     info->reshape_progress)
3588                                         goto nonew; /* No new data here */
3589                         }
3590                 }
3591                 if (lseek64(fd, __le64_to_cpu(bsb.devstart)*512, 0)< 0) {
3592                 second_fail:
3593                         if (verbose)
3594                                 fprintf(stderr, Name
3595                                         ": Failed to verify secondary backup-metadata block on %s\n",
3596                                         devname);
3597                         continue; /* Cannot seek */
3598                 }
3599                 /* There should be a duplicate backup superblock 4k before here */
3600                 if (lseek64(fd, -4096, 1) < 0 ||
3601                     read(fd, &bsb2, sizeof(bsb2)) != sizeof(bsb2))
3602                         goto second_fail; /* Cannot find leading superblock */
3603                 if (bsb.magic[15] == '1')
3604                         bsbsize = offsetof(struct mdp_backup_super, pad1);
3605                 else
3606                         bsbsize = offsetof(struct mdp_backup_super, pad);
3607                 if (memcmp(&bsb2, &bsb, bsbsize) != 0)
3608                         goto second_fail; /* Cannot find leading superblock */
3609
3610                 /* Now need the data offsets for all devices. */
3611                 offsets = malloc(sizeof(*offsets)*info->array.raid_disks);
3612                 for(j=0; j<info->array.raid_disks; j++) {
3613                         if (fdlist[j] < 0)
3614                                 continue;
3615                         if (st->ss->load_super(st, fdlist[j], NULL))
3616                                 /* FIXME should be this be an error */
3617                                 continue;
3618                         st->ss->getinfo_super(st, &dinfo, NULL);
3619                         st->ss->free_super(st);
3620                         offsets[j] = dinfo.data_offset * 512;
3621                 }
3622                 printf(Name ": restoring critical section\n");
3623
3624                 if (restore_stripes(fdlist, offsets,
3625                                     info->array.raid_disks,
3626                                     info->new_chunk,
3627                                     info->new_level,
3628                                     info->new_layout,
3629                                     fd, __le64_to_cpu(bsb.devstart)*512,
3630                                     __le64_to_cpu(bsb.arraystart)*512,
3631                                     __le64_to_cpu(bsb.length)*512, NULL)) {
3632                         /* didn't succeed, so giveup */
3633                         if (verbose)
3634                                 fprintf(stderr, Name ": Error restoring backup from %s\n",
3635                                         devname);
3636                         free(offsets);
3637                         return 1;
3638                 }
3639
3640                 if (bsb.magic[15] == '2' &&
3641                     restore_stripes(fdlist, offsets,
3642                                     info->array.raid_disks,
3643                                     info->new_chunk,
3644                                     info->new_level,
3645                                     info->new_layout,
3646                                     fd, __le64_to_cpu(bsb.devstart)*512 +
3647                                     __le64_to_cpu(bsb.devstart2)*512,
3648                                     __le64_to_cpu(bsb.arraystart2)*512,
3649                                     __le64_to_cpu(bsb.length2)*512, NULL)) {
3650                         /* didn't succeed, so giveup */
3651                         if (verbose)
3652                                 fprintf(stderr, Name ": Error restoring second backup from %s\n",
3653                                         devname);
3654                         free(offsets);
3655                         return 1;
3656                 }
3657
3658                 free(offsets);
3659
3660                 /* Ok, so the data is restored. Let's update those superblocks. */
3661
3662                 lo = hi = 0;
3663                 if (bsb.length) {
3664                         lo = __le64_to_cpu(bsb.arraystart);
3665                         hi = lo + __le64_to_cpu(bsb.length);
3666                 }
3667                 if (bsb.magic[15] == '2' && bsb.length2) {
3668                         unsigned long long lo1, hi1;
3669                         lo1 = __le64_to_cpu(bsb.arraystart2);
3670                         hi1 = lo1 + __le64_to_cpu(bsb.length2);
3671                         if (lo == hi) {
3672                                 lo = lo1;
3673                                 hi = hi1;
3674                         } else if (lo < lo1)
3675                                 hi = hi1;
3676                         else
3677                                 lo = lo1;
3678                 }
3679                 if (lo < hi &&
3680                     (info->reshape_progress < lo ||
3681                      info->reshape_progress > hi))
3682                         /* backup does not affect reshape_progress*/ ;
3683                 else if (info->delta_disks >= 0) {
3684                         info->reshape_progress = __le64_to_cpu(bsb.arraystart) +
3685                                 __le64_to_cpu(bsb.length);
3686                         if (bsb.magic[15] == '2') {
3687                                 unsigned long long p2 = __le64_to_cpu(bsb.arraystart2) +
3688                                         __le64_to_cpu(bsb.length2);
3689                                 if (p2 > info->reshape_progress)
3690                                         info->reshape_progress = p2;
3691                         }
3692                 } else {
3693                         info->reshape_progress = __le64_to_cpu(bsb.arraystart);
3694                         if (bsb.magic[15] == '2') {
3695                                 unsigned long long p2 = __le64_to_cpu(bsb.arraystart2);
3696                                 if (p2 < info->reshape_progress)
3697                                         info->reshape_progress = p2;
3698                         }
3699                 }
3700                 for (j=0; j<info->array.raid_disks; j++) {
3701                         if (fdlist[j] < 0) continue;
3702                         if (st->ss->load_super(st, fdlist[j], NULL))
3703                                 continue;
3704                         st->ss->getinfo_super(st, &dinfo, NULL);
3705                         dinfo.reshape_progress = info->reshape_progress;
3706                         st->ss->update_super(st, &dinfo,
3707                                              "_reshape_progress",
3708                                              NULL,0, 0, NULL);
3709                         st->ss->store_super(st, fdlist[j]);
3710                         st->ss->free_super(st);
3711                 }
3712                 return 0;
3713         }
3714         /* Didn't find any backup data, try to see if any
3715          * was needed.
3716          */
3717         if (info->delta_disks < 0) {
3718                 /* When shrinking, the critical section is at the end.
3719                  * So see if we are before the critical section.
3720                  */
3721                 unsigned long long first_block;
3722                 nstripe = ostripe = 0;
3723                 first_block = 0;
3724                 while (ostripe >= nstripe) {
3725                         ostripe += info->array.chunk_size / 512;
3726                         first_block = ostripe * odata;
3727                         nstripe = first_block / ndata / (info->new_chunk/512) *
3728                                 (info->new_chunk/512);
3729                 }
3730
3731                 if (info->reshape_progress >= first_block)
3732                         return 0;
3733         }
3734         if (info->delta_disks > 0) {
3735                 /* See if we are beyond the critical section. */
3736                 unsigned long long last_block;
3737                 nstripe = ostripe = 0;
3738                 last_block = 0;
3739                 while (nstripe >= ostripe) {
3740                         nstripe += info->new_chunk / 512;
3741                         last_block = nstripe * ndata;
3742                         ostripe = last_block / odata / (info->array.chunk_size/512) *
3743                                 (info->array.chunk_size/512);
3744                 }
3745
3746                 if (info->reshape_progress >= last_block)
3747                         return 0;
3748         }
3749         /* needed to recover critical section! */
3750         if (verbose)
3751                 fprintf(stderr, Name ": Failed to find backup of critical section\n");
3752         return 1;
3753 }
3754
3755 int Grow_continue_command(char *devname, int fd,
3756                           char *backup_file, int verbose)
3757 {
3758         int ret_val = 0;
3759         struct supertype *st = NULL;
3760         struct mdinfo *content = NULL;
3761         struct mdinfo array;
3762         char *subarray = NULL;
3763         struct mdinfo *cc = NULL;
3764         struct mdstat_ent *mdstat = NULL;
3765         char buf[40];
3766         int cfd = -1;
3767         int fd2 = -1;
3768         char *ep;
3769         unsigned long long position;
3770
3771         dprintf("Grow continue from command line called for %s\n",
3772                 devname);
3773
3774         st = super_by_fd(fd, &subarray);
3775         if (!st || !st->ss) {
3776                 fprintf(stderr,
3777                         Name ": Unable to determine metadata format for %s\n",
3778                         devname);
3779                 return 1;
3780         }
3781         dprintf("Grow continue is run for ");
3782         if (st->ss->external == 0) {
3783                 dprintf("native array (%s)\n", devname);
3784                 if (ioctl(fd, GET_ARRAY_INFO, &array) < 0) {
3785                         fprintf(stderr, Name ": %s is not an active md array -"
3786                                 " aborting\n", devname);
3787                         ret_val = 1;
3788                         goto Grow_continue_command_exit;
3789                 }
3790                 content = &array;
3791                 sysfs_init(content, fd, st->devnum);
3792         } else {
3793                 int container_dev;
3794
3795                 if (subarray) {
3796                         dprintf("subarray (%s)\n", subarray);
3797                         container_dev = st->container_dev;
3798                         cfd = open_dev_excl(st->container_dev);
3799                 } else {
3800                         container_dev = st->devnum;
3801                         close(fd);
3802                         cfd = open_dev_excl(st->devnum);
3803                         dprintf("container (%i)\n", container_dev);
3804                         fd = cfd;
3805                 }
3806                 if (cfd < 0) {
3807                         fprintf(stderr, Name ": Unable to open container "
3808                                 "for %s\n", devname);
3809                         ret_val = 1;
3810                         goto Grow_continue_command_exit;
3811                 }
3812                 fmt_devname(buf, container_dev);
3813
3814                 /* find in container array under reshape
3815                  */
3816                 ret_val = st->ss->load_container(st, cfd, NULL);
3817                 if (ret_val) {
3818                         fprintf(stderr,
3819                                 Name ": Cannot read superblock for %s\n",
3820                                 devname);
3821                         ret_val = 1;
3822                         goto Grow_continue_command_exit;
3823                 }
3824
3825                 cc = st->ss->container_content(st, subarray);
3826                 for (content = cc; content ; content = content->next) {
3827                         char *array;
3828                         int allow_reshape = 1;
3829
3830                         if (content->reshape_active == 0)
3831                                 continue;
3832                         /* The decision about array or container wide
3833                          * reshape is taken in Grow_continue based
3834                          * content->reshape_active state, therefore we
3835                          * need to check_reshape based on
3836                          * reshape_active and subarray name
3837                          */
3838                         if (content->array.state & (1<<MD_SB_BLOCK_VOLUME))
3839                                 allow_reshape = 0;
3840                         if (content->reshape_active == CONTAINER_RESHAPE &&
3841                             (content->array.state
3842                              & (1<<MD_SB_BLOCK_CONTAINER_RESHAPE)))
3843                                 allow_reshape = 0;
3844
3845                         if (!allow_reshape) {
3846                                 fprintf(stderr, Name
3847                                         ": cannot continue reshape of an array"
3848                                         " in container with unsupported"
3849                                         " metadata: %s(%s)\n",
3850                                         devname, buf);
3851                                 ret_val = 1;
3852                                 goto Grow_continue_command_exit;
3853                         }
3854
3855                         array = strchr(content->text_version+1, '/')+1;
3856                         mdstat = mdstat_by_subdev(array, container_dev);
3857                         if (!mdstat)
3858                                 continue;
3859                         break;
3860                 }
3861                 if (!content) {
3862                         fprintf(stderr,
3863                                 Name ": Unable to determine reshaped "
3864                                 "array for %s\n", devname);
3865                         ret_val = 1;
3866                         goto Grow_continue_command_exit;
3867                 }
3868                 fd2 = open_dev(mdstat->devnum);
3869                 if (fd2 < 0) {
3870                         fprintf(stderr, Name ": cannot open (md%i)\n",
3871                                 mdstat->devnum);
3872                         ret_val = 1;
3873                         goto Grow_continue_command_exit;
3874                 }
3875
3876                 sysfs_init(content, fd2, mdstat->devnum);
3877
3878                 /* start mdmon in case it is not running
3879                  */
3880                 if (!mdmon_running(container_dev))
3881                         start_mdmon(container_dev);
3882                 ping_monitor(buf);
3883
3884                 if (mdmon_running(container_dev))
3885                         st->update_tail = &st->updates;
3886                 else {
3887                         fprintf(stderr, Name ":  No mdmon found. "
3888                                 "Grow cannot continue.\n");
3889                         ret_val = 1;
3890                         goto Grow_continue_command_exit;
3891                 }
3892         }
3893
3894         /* verify that array under reshape is started from
3895          * correct position
3896          */
3897         ret_val = sysfs_get_str(content, NULL, "sync_max", buf, 40);
3898         if (ret_val <= 0) {
3899                 fprintf(stderr, Name
3900                         ": cannot open verify reshape progress for %s (%i)\n",
3901                         content->sys_name, ret_val);
3902                 ret_val = 1;
3903                 goto Grow_continue_command_exit;
3904         }
3905         dprintf(Name ": Read sync_max sysfs entry is: %s\n", buf);
3906         position = strtoull(buf, &ep, 0);
3907         if (ep == buf || (*ep != 0 && *ep != '\n' && *ep != ' ')) {
3908                 fprintf(stderr, Name ": Fatal error: array reshape was"
3909                         " not properly frozen\n");
3910                 ret_val = 1;
3911                 goto Grow_continue_command_exit;
3912         }
3913         position *= get_data_disks(map_name(pers, mdstat->level),
3914                                    content->new_layout,
3915                                    content->array.raid_disks);
3916         if (position != content->reshape_progress) {
3917                 fprintf(stderr, Name ": Fatal error: array reshape was"
3918                         " not properly frozen.\n");
3919                 ret_val = 1;
3920                 goto Grow_continue_command_exit;
3921         }
3922
3923         /* continue reshape
3924          */
3925         ret_val = Grow_continue(fd, st, content, backup_file, 0);
3926
3927 Grow_continue_command_exit:
3928         if (fd2 > -1)
3929                 close(fd2);
3930         if (cfd > -1)
3931                 close(cfd);
3932         st->ss->free_super(st);
3933         free_mdstat(mdstat);
3934         sysfs_free(cc);
3935         free(subarray);
3936
3937         return ret_val;
3938 }
3939
3940 int Grow_continue(int mdfd, struct supertype *st, struct mdinfo *info,
3941                   char *backup_file, int freeze_reshape)
3942 {
3943         int ret_val = 2;
3944
3945         if (!info->reshape_active)
3946                 return ret_val;
3947
3948         if (st->ss->external) {
3949                 char container[40];
3950                 int cfd = open_dev(st->container_dev);
3951
3952                 if (cfd < 0)
3953                         return 1;
3954
3955                 fmt_devname(container, st->container_dev);
3956                 st->ss->load_container(st, cfd, container);
3957                 close(cfd);
3958                 ret_val = reshape_container(container, NULL, mdfd,
3959                                             st, info, 0, backup_file,
3960                                             0, 1, freeze_reshape);
3961         } else
3962                 ret_val = reshape_array(NULL, mdfd, "array", st, info, 1,
3963                                         NULL, backup_file, 0, 0, 1,
3964                                         freeze_reshape);
3965
3966         return ret_val;
3967 }