Disk removal support for Raid10->Raid0 takeover
[thirdparty/mdadm.git] / Grow.c
1 /*
2  * mdadm - manage Linux "md" devices aka RAID arrays.
3  *
4  * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
5  *
6  *
7  *    This program is free software; you can redistribute it and/or modify
8  *    it under the terms of the GNU General Public License as published by
9  *    the Free Software Foundation; either version 2 of the License, or
10  *    (at your option) any later version.
11  *
12  *    This program is distributed in the hope that it will be useful,
13  *    but WITHOUT ANY WARRANTY; without even the implied warranty of
14  *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  *    GNU General Public License for more details.
16  *
17  *    You should have received a copy of the GNU General Public License
18  *    along with this program; if not, write to the Free Software
19  *    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
20  *
21  *    Author: Neil Brown
22  *    Email: <neilb@suse.de>
23  */
24 #include        "mdadm.h"
25 #include        "dlink.h"
26 #include        <sys/mman.h>
27
28 #if ! defined(__BIG_ENDIAN) && ! defined(__LITTLE_ENDIAN)
29 #error no endian defined
30 #endif
31 #include        "md_u.h"
32 #include        "md_p.h"
33
34 #ifndef offsetof
35 #define offsetof(t,f) ((size_t)&(((t*)0)->f))
36 #endif
37
38 int Grow_Add_device(char *devname, int fd, char *newdev)
39 {
40         /* Add a device to an active array.
41          * Currently, just extend a linear array.
42          * This requires writing a new superblock on the
43          * new device, calling the kernel to add the device,
44          * and if that succeeds, update the superblock on
45          * all other devices.
46          * This means that we need to *find* all other devices.
47          */
48         struct mdinfo info;
49
50         struct stat stb;
51         int nfd, fd2;
52         int d, nd;
53         struct supertype *st = NULL;
54         char *subarray = NULL;
55
56         if (ioctl(fd, GET_ARRAY_INFO, &info.array) < 0) {
57                 fprintf(stderr, Name ": cannot get array info for %s\n", devname);
58                 return 1;
59         }
60
61         if (info.array.level != -1) {
62                 fprintf(stderr, Name ": can only add devices to linear arrays\n");
63                 return 1;
64         }
65
66         st = super_by_fd(fd, &subarray);
67         if (!st) {
68                 fprintf(stderr, Name ": cannot handle arrays with superblock version %d\n", info.array.major_version);
69                 return 1;
70         }
71
72         if (subarray) {
73                 fprintf(stderr, Name ": Cannot grow linear sub-arrays yet\n");
74                 free(subarray);
75                 free(st);
76         }
77
78         nfd = open(newdev, O_RDWR|O_EXCL|O_DIRECT);
79         if (nfd < 0) {
80                 fprintf(stderr, Name ": cannot open %s\n", newdev);
81                 free(st);
82                 return 1;
83         }
84         fstat(nfd, &stb);
85         if ((stb.st_mode & S_IFMT) != S_IFBLK) {
86                 fprintf(stderr, Name ": %s is not a block device!\n", newdev);
87                 close(nfd);
88                 free(st);
89                 return 1;
90         }
91         /* now check out all the devices and make sure we can read the superblock */
92         for (d=0 ; d < info.array.raid_disks ; d++) {
93                 mdu_disk_info_t disk;
94                 char *dv;
95
96                 st->ss->free_super(st);
97
98                 disk.number = d;
99                 if (ioctl(fd, GET_DISK_INFO, &disk) < 0) {
100                         fprintf(stderr, Name ": cannot get device detail for device %d\n",
101                                 d);
102                         close(nfd);
103                         free(st);
104                         return 1;
105                 }
106                 dv = map_dev(disk.major, disk.minor, 1);
107                 if (!dv) {
108                         fprintf(stderr, Name ": cannot find device file for device %d\n",
109                                 d);
110                         close(nfd);
111                         free(st);
112                         return 1;
113                 }
114                 fd2 = dev_open(dv, O_RDWR);
115                 if (!fd2) {
116                         fprintf(stderr, Name ": cannot open device file %s\n", dv);
117                         close(nfd);
118                         free(st);
119                         return 1;
120                 }
121
122                 if (st->ss->load_super(st, fd2, NULL)) {
123                         fprintf(stderr, Name ": cannot find super block on %s\n", dv);
124                         close(nfd);
125                         close(fd2);
126                         free(st);
127                         return 1;
128                 }
129                 close(fd2);
130         }
131         /* Ok, looks good. Lets update the superblock and write it out to
132          * newdev.
133          */
134
135         info.disk.number = d;
136         info.disk.major = major(stb.st_rdev);
137         info.disk.minor = minor(stb.st_rdev);
138         info.disk.raid_disk = d;
139         info.disk.state = (1 << MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE);
140         st->ss->update_super(st, &info, "linear-grow-new", newdev,
141                              0, 0, NULL);
142
143         if (st->ss->store_super(st, nfd)) {
144                 fprintf(stderr, Name ": Cannot store new superblock on %s\n",
145                         newdev);
146                 close(nfd);
147                 return 1;
148         }
149         close(nfd);
150
151         if (ioctl(fd, ADD_NEW_DISK, &info.disk) != 0) {
152                 fprintf(stderr, Name ": Cannot add new disk to this array\n");
153                 return 1;
154         }
155         /* Well, that seems to have worked.
156          * Now go through and update all superblocks
157          */
158
159         if (ioctl(fd, GET_ARRAY_INFO, &info.array) < 0) {
160                 fprintf(stderr, Name ": cannot get array info for %s\n", devname);
161                 return 1;
162         }
163
164         nd = d;
165         for (d=0 ; d < info.array.raid_disks ; d++) {
166                 mdu_disk_info_t disk;
167                 char *dv;
168
169                 disk.number = d;
170                 if (ioctl(fd, GET_DISK_INFO, &disk) < 0) {
171                         fprintf(stderr, Name ": cannot get device detail for device %d\n",
172                                 d);
173                         return 1;
174                 }
175                 dv = map_dev(disk.major, disk.minor, 1);
176                 if (!dv) {
177                         fprintf(stderr, Name ": cannot find device file for device %d\n",
178                                 d);
179                         return 1;
180                 }
181                 fd2 = dev_open(dv, O_RDWR);
182                 if (fd2 < 0) {
183                         fprintf(stderr, Name ": cannot open device file %s\n", dv);
184                         return 1;
185                 }
186                 if (st->ss->load_super(st, fd2, NULL)) {
187                         fprintf(stderr, Name ": cannot find super block on %s\n", dv);
188                         close(fd);
189                         return 1;
190                 }
191                 info.array.raid_disks = nd+1;
192                 info.array.nr_disks = nd+1;
193                 info.array.active_disks = nd+1;
194                 info.array.working_disks = nd+1;
195
196                 st->ss->update_super(st, &info, "linear-grow-update", dv,
197                                      0, 0, NULL);
198
199                 if (st->ss->store_super(st, fd2)) {
200                         fprintf(stderr, Name ": Cannot store new superblock on %s\n", dv);
201                         close(fd2);
202                         return 1;
203                 }
204                 close(fd2);
205         }
206
207         return 0;
208 }
209
210 int Grow_addbitmap(char *devname, int fd, char *file, int chunk, int delay, int write_behind, int force)
211 {
212         /*
213          * First check that array doesn't have a bitmap
214          * Then create the bitmap
215          * Then add it
216          *
217          * For internal bitmaps, we need to check the version,
218          * find all the active devices, and write the bitmap block
219          * to all devices
220          */
221         mdu_bitmap_file_t bmf;
222         mdu_array_info_t array;
223         struct supertype *st;
224         char *subarray = NULL;
225         int major = BITMAP_MAJOR_HI;
226         int vers = md_get_version(fd);
227         unsigned long long bitmapsize, array_size;
228
229         if (vers < 9003) {
230                 major = BITMAP_MAJOR_HOSTENDIAN;
231 #ifdef __BIG_ENDIAN
232                 fprintf(stderr, Name ": Warning - bitmaps created on this kernel are not portable\n"
233                         "  between different architectured.  Consider upgrading the Linux kernel.\n");
234 #endif
235         }
236
237         if (ioctl(fd, GET_BITMAP_FILE, &bmf) != 0) {
238                 if (errno == ENOMEM)
239                         fprintf(stderr, Name ": Memory allocation failure.\n");
240                 else
241                         fprintf(stderr, Name ": bitmaps not supported by this kernel.\n");
242                 return 1;
243         }
244         if (bmf.pathname[0]) {
245                 if (strcmp(file,"none")==0) {
246                         if (ioctl(fd, SET_BITMAP_FILE, -1)!= 0) {
247                                 fprintf(stderr, Name ": failed to remove bitmap %s\n",
248                                         bmf.pathname);
249                                 return 1;
250                         }
251                         return 0;
252                 }
253                 fprintf(stderr, Name ": %s already has a bitmap (%s)\n",
254                         devname, bmf.pathname);
255                 return 1;
256         }
257         if (ioctl(fd, GET_ARRAY_INFO, &array) != 0) {
258                 fprintf(stderr, Name ": cannot get array status for %s\n", devname);
259                 return 1;
260         }
261         if (array.state & (1<<MD_SB_BITMAP_PRESENT)) {
262                 if (strcmp(file, "none")==0) {
263                         array.state &= ~(1<<MD_SB_BITMAP_PRESENT);
264                         if (ioctl(fd, SET_ARRAY_INFO, &array)!= 0) {
265                                 fprintf(stderr, Name ": failed to remove internal bitmap.\n");
266                                 return 1;
267                         }
268                         return 0;
269                 }
270                 fprintf(stderr, Name ": Internal bitmap already present on %s\n",
271                         devname);
272                 return 1;
273         }
274
275         if (strcmp(file, "none") == 0) {
276                 fprintf(stderr, Name ": no bitmap found on %s\n", devname);
277                 return 1;
278         }
279         if (array.level <= 0) {
280                 fprintf(stderr, Name ": Bitmaps not meaningful with level %s\n",
281                         map_num(pers, array.level)?:"of this array");
282                 return 1;
283         }
284         bitmapsize = array.size;
285         bitmapsize <<= 1;
286         if (get_dev_size(fd, NULL, &array_size) &&
287             array_size > (0x7fffffffULL<<9)) {
288                 /* Array is big enough that we cannot trust array.size
289                  * try other approaches
290                  */
291                 bitmapsize = get_component_size(fd);
292         }
293         if (bitmapsize == 0) {
294                 fprintf(stderr, Name ": Cannot reliably determine size of array to create bitmap - sorry.\n");
295                 return 1;
296         }
297
298         if (array.level == 10) {
299                 int ncopies = (array.layout&255)*((array.layout>>8)&255);
300                 bitmapsize = bitmapsize * array.raid_disks / ncopies;
301         }
302
303         st = super_by_fd(fd, &subarray);
304         if (!st) {
305                 fprintf(stderr, Name ": Cannot understand version %d.%d\n",
306                         array.major_version, array.minor_version);
307                 return 1;
308         }
309         if (subarray) {
310                 fprintf(stderr, Name ": Cannot add bitmaps to sub-arrays yet\n");
311                 free(subarray);
312                 free(st);
313                 return 1;
314         }
315         if (strcmp(file, "internal") == 0) {
316                 int d;
317                 if (st->ss->add_internal_bitmap == NULL) {
318                         fprintf(stderr, Name ": Internal bitmaps not supported "
319                                 "with %s metadata\n", st->ss->name);
320                         return 1;
321                 }
322                 for (d=0; d< st->max_devs; d++) {
323                         mdu_disk_info_t disk;
324                         char *dv;
325                         disk.number = d;
326                         if (ioctl(fd, GET_DISK_INFO, &disk) < 0)
327                                 continue;
328                         if (disk.major == 0 &&
329                             disk.minor == 0)
330                                 continue;
331                         if ((disk.state & (1<<MD_DISK_SYNC))==0)
332                                 continue;
333                         dv = map_dev(disk.major, disk.minor, 1);
334                         if (dv) {
335                                 int fd2 = dev_open(dv, O_RDWR);
336                                 if (fd2 < 0)
337                                         continue;
338                                 if (st->ss->load_super(st, fd2, NULL)==0) {
339                                         if (st->ss->add_internal_bitmap(
340                                                     st,
341                                                     &chunk, delay, write_behind,
342                                                     bitmapsize, 0, major)
343                                                 )
344                                                 st->ss->write_bitmap(st, fd2);
345                                         else {
346                                                 fprintf(stderr, Name ": failed to create internal bitmap - chunksize problem.\n");
347                                                 close(fd2);
348                                                 return 1;
349                                         }
350                                 }
351                                 close(fd2);
352                         }
353                 }
354                 array.state |= (1<<MD_SB_BITMAP_PRESENT);
355                 if (ioctl(fd, SET_ARRAY_INFO, &array)!= 0) {
356                         fprintf(stderr, Name ": failed to set internal bitmap.\n");
357                         return 1;
358                 }
359         } else {
360                 int uuid[4];
361                 int bitmap_fd;
362                 int d;
363                 int max_devs = st->max_devs;
364
365                 /* try to load a superblock */
366                 for (d=0; d<max_devs; d++) {
367                         mdu_disk_info_t disk;
368                         char *dv;
369                         int fd2;
370                         disk.number = d;
371                         if (ioctl(fd, GET_DISK_INFO, &disk) < 0)
372                                 continue;
373                         if ((disk.major==0 && disk.minor==0) ||
374                             (disk.state & (1<<MD_DISK_REMOVED)))
375                                 continue;
376                         dv = map_dev(disk.major, disk.minor, 1);
377                         if (!dv) continue;
378                         fd2 = dev_open(dv, O_RDONLY);
379                         if (fd2 >= 0 &&
380                             st->ss->load_super(st, fd2, NULL) == 0) {
381                                 close(fd2);
382                                 st->ss->uuid_from_super(st, uuid);
383                                 break;
384                         }
385                         close(fd2);
386                 }
387                 if (d == max_devs) {
388                         fprintf(stderr, Name ": cannot find UUID for array!\n");
389                         return 1;
390                 }
391                 if (CreateBitmap(file, force, (char*)uuid, chunk,
392                                  delay, write_behind, bitmapsize, major)) {
393                         return 1;
394                 }
395                 bitmap_fd = open(file, O_RDWR);
396                 if (bitmap_fd < 0) {
397                         fprintf(stderr, Name ": weird: %s cannot be opened\n",
398                                 file);
399                         return 1;
400                 }
401                 if (ioctl(fd, SET_BITMAP_FILE, bitmap_fd) < 0) {
402                         fprintf(stderr, Name ": Cannot set bitmap file for %s: %s\n",
403                                 devname, strerror(errno));
404                         return 1;
405                 }
406         }
407
408         return 0;
409 }
410
411
412 /*
413  * When reshaping an array we might need to backup some data.
414  * This is written to all spares with a 'super_block' describing it.
415  * The superblock goes 4K from the end of the used space on the
416  * device.
417  * It is written after the backup is complete.
418  * It has the following structure.
419  */
420
static struct mdp_backup_super {
	/* identification */
	char	magic[16];  /* md_backup_data-1 or -2 */
	__u8	set_uuid[16];
	__u64	mtime;
	/* first section: start/sizes in 512byte sectors */
	__u64	devstart;	/* address on backup device/file of data */
	__u64	arraystart;
	__u64	length;
	__u32	sb_csum;	/* csum of preceding bytes. */
	__u32	pad1;
	/* second section */
	__u64	devstart2;	/* offset in to data of second section */
	__u64	arraystart2;
	__u64	length2;
	__u32	sb_csum2;	/* csum of preceding bytes. */
	/* filler: the named fields above total 100 bytes, this adds 412 */
	__u8 pad[512-68-32];
} __attribute__((aligned(512))) bsb, bsb2;
437
438 static __u32 bsb_csum(char *buf, int len)
439 {
440         int i;
441         int csum = 0;
442         for (i=0; i<len; i++)
443                 csum = (csum<<3) + buf[0];
444         return __cpu_to_le32(csum);
445 }
446
/* Forward declarations of the file-local child_* helpers.
 * All three take the same arguments, except that child_same_size
 * has an extra 'start' between 'offsets' and 'disks'.
 */
static int child_grow(int afd, struct mdinfo *sra,
		      unsigned long blocks,
		      int *fds, unsigned long long *offsets,
		      int disks, int chunk, int level,
		      int layout, int data,
		      int dests, int *destfd,
		      unsigned long long *destoffsets);

static int child_shrink(int afd, struct mdinfo *sra,
			unsigned long blocks,
			int *fds, unsigned long long *offsets,
			int disks, int chunk, int level,
			int layout, int data,
			int dests, int *destfd,
			unsigned long long *destoffsets);

static int child_same_size(int afd, struct mdinfo *sra,
			   unsigned long blocks,
			   int *fds, unsigned long long *offsets,
			   unsigned long long start,
			   int disks, int chunk, int level,
			   int layout, int data,
			   int dests, int *destfd,
			   unsigned long long *destoffsets);
460
461 static int freeze_container(struct supertype *st)
462 {
463         int container_dev = (st->container_dev != NoMdDev
464                              ? st->container_dev : st->devnum);
465         char *container = devnum2devname(container_dev);
466
467         if (!container) {
468                 fprintf(stderr, Name
469                         ": could not determine container name, freeze aborted\n");
470                 return -2;
471         }
472
473         if (block_monitor(container, 1)) {
474                 fprintf(stderr, Name ": failed to freeze container\n");
475                 return -2;
476         }
477
478         return 1;
479 }
480
481 static void unfreeze_container(struct supertype *st)
482 {
483         int container_dev = (st->container_dev != NoMdDev
484                              ? st->container_dev : st->devnum);
485         char *container = devnum2devname(container_dev);
486
487         if (!container) {
488                 fprintf(stderr, Name
489                         ": could not determine container name, unfreeze aborted\n");
490                 return;
491         }
492
493         unblock_monitor(container, 1);
494 }
495
496 static int freeze(struct supertype *st)
497 {
498         /* Try to freeze resync/rebuild on this array/container.
499          * Return -1 if the array is busy,
500          * return -2 container cannot be frozen,
501          * return 0 if this kernel doesn't support 'frozen'
502          * return 1 if it worked.
503          */
504         if (st->ss->external)
505                 return freeze_container(st);
506         else {
507                 struct mdinfo *sra = sysfs_read(-1, st->devnum, GET_VERSION);
508                 int err;
509
510                 if (!sra)
511                         return -1;
512                 err = sysfs_freeze_array(sra);
513                 sysfs_free(sra);
514                 return err;
515         }
516 }
517
518 static void unfreeze(struct supertype *st, int frozen)
519 {
520         /* If 'frozen' is 1, unfreeze the array */
521         if (frozen <= 0)
522                 return;
523
524         if (st->ss->external)
525                 return unfreeze_container(st);
526         else {
527                 struct mdinfo *sra = sysfs_read(-1, st->devnum, GET_VERSION);
528
529                 if (sra)
530                         sysfs_set_str(sra, NULL, "sync_action", "idle");
531                 else
532                         fprintf(stderr, Name ": failed to unfreeze array\n");
533                 sysfs_free(sra);
534         }
535 }
536
static void wait_reshape(struct mdinfo *sra)
{
	/* Block until the "sync_action" attribute of 'sra' no longer
	 * starts with "reshape", or until it cannot be read.
	 * The descriptor opened here is always closed before returning.
	 */
	int fd = sysfs_get_fd(sra, NULL, "sync_action");
	char action[20];

	/* a negative fd must not be handed to FD_SET()/select() */
	if (fd < 0)
		return;

	do {
		fd_set rfds;
		FD_ZERO(&rfds);
		FD_SET(fd, &rfds);
		/* the set is passed as select()'s exceptfds argument,
		 * not as readfds
		 */
		select(fd+1, NULL, NULL, &rfds, NULL);

		if (sysfs_fd_get_str(fd, action, 20) < 0)
			break;
	} while  (strncmp(action, "reshape", 7) == 0);

	close(fd);
}
554
555 static int reshape_super(struct supertype *st, long long size, int level,
556                          int layout, int chunksize, int raid_disks,
557                          char *backup_file, char *dev, int verbose)
558 {
559         /* nothing extra to check in the native case */
560         if (!st->ss->external)
561                 return 0;
562         if (!st->ss->reshape_super ||
563             !st->ss->manage_reshape) {
564                 fprintf(stderr, Name ": %s metadata does not support reshape\n",
565                         st->ss->name);
566                 return 1;
567         }
568
569         return st->ss->reshape_super(st, size, level, layout, chunksize,
570                                      raid_disks, backup_file, dev, verbose);
571 }
572
573 static void sync_metadata(struct supertype *st)
574 {
575         if (st->ss->external) {
576                 if (st->update_tail)
577                         flush_metadata_updates(st);
578                 else
579                         st->ss->sync_metadata(st);
580         }
581 }
582
static int subarray_set_num(char *container, struct mdinfo *sra, char *name, int n)
{
	/* when dealing with external metadata subarrays we need to be
	 * prepared to handle EAGAIN.  The kernel may need to wait for
	 * mdmon to mark the array active so the kernel can handle
	 * allocations/writeback when preparing the reshape action
	 * (md_allow_write()).  We temporarily disable safe_mode_delay
	 * to close a race with the array_state going clean before the
	 * next write to raid_disks / stripe_cache_size
	 *
	 * Returns the result of writing 'n' to sysfs attribute 'name',
	 * or -1 if safe_mode_delay could not be read.  When the write
	 * fails, errno is the one from that write: it is saved across
	 * the restore of safe_mode_delay so callers can report it.
	 */
	char safe[50];
	int rc;
	int saved_errno;

	/* only 'raid_disks' and 'stripe_cache_size' trigger md_allow_write */
	if (strcmp(name, "raid_disks") != 0 &&
	    strcmp(name, "stripe_cache_size") != 0)
		return sysfs_set_num(sra, NULL, name, n);

	rc = sysfs_get_str(sra, NULL, "safe_mode_delay", safe, sizeof(safe));
	if (rc <= 0)
		return -1;
	sysfs_set_num(sra, NULL, "safe_mode_delay", 0);
	rc = sysfs_set_num(sra, NULL, name, n);
	if (rc < 0 && errno == EAGAIN) {
		ping_monitor(container);
		/* if we get EAGAIN here then the monitor is not active
		 * so stop trying
		 */
		rc = sysfs_set_num(sra, NULL, name, n);
	}
	/* restoring the delay may itself touch errno */
	saved_errno = errno;
	sysfs_set_str(sra, NULL, "safe_mode_delay", safe);
	if (rc)
		errno = saved_errno;
	return rc;
}
616
617 static int reshape_container_raid_disks(char *container, int raid_disks)
618 {
619         /* for each subarray switch to a raid level that can
620          * support the reshape, and set raid disks
621          */
622         struct mdstat_ent *ent, *e;
623         int changed = 0, rv = 0, err = 0;
624
625         ent = mdstat_read(1, 0);
626         if (!ent) {
627                 fprintf(stderr, Name ": unable to read /proc/mdstat\n");
628                 return -1;
629         }
630
631         changed = 0;
632         for (e = ent; e; e = e->next) {
633                 struct mdinfo *sub;
634                 unsigned int cache;
635                 int level, takeover_delta = 0;
636
637                 if (!is_container_member(e, container))
638                         continue;
639
640                 level = map_name(pers, e->level);
641                 if (level == 0) {
642                         sub = sysfs_read(-1, e->devnum, GET_VERSION);
643                         if (!sub)
644                                 break;
645                         /* metadata records 'orig_level' */
646                         rv = sysfs_set_num(sub, NULL, "level", 4);
647                         if (rv < 0) {
648                                 err = errno;
649                                 break;
650                         }
651                         /* we want spares to be used for capacity
652                          * expansion, not rebuild
653                          */
654                         takeover_delta = 1;
655
656                         sysfs_free(sub);
657                         level = 4;
658                 }
659
660                 sub = NULL;
661                 switch (level) {
662                 default:
663                         rv = -1;
664                         break;
665                 case 4:
666                 case 5:
667                 case 6:
668                         sub = sysfs_read(-1, e->devnum, GET_CHUNK|GET_CACHE);
669                         if (!sub)
670                                 break;
671                         cache = (sub->array.chunk_size / 4096) * 4;
672                         if (cache > sub->cache_size)
673                                 rv = subarray_set_num(container, sub,
674                                                       "stripe_cache_size", cache);
675                         if (rv) {
676                                 err = errno;
677                                 break;
678                         }
679                         /* fall through */
680                 case 1:
681                         if (!sub)
682                                 sub = sysfs_read(-1, e->devnum, GET_VERSION);
683                         if (!sub)
684                                 break;
685
686                         rv = subarray_set_num(container, sub, "raid_disks",
687                                               raid_disks + takeover_delta);
688                         if (rv)
689                                 err = errno;
690                         else
691                                 changed++;
692                         break;
693                 }
694                 sysfs_free(sub);
695                 if (rv)
696                         break;
697         }
698         free_mdstat(ent);
699         if (rv) {
700                 fprintf(stderr, Name
701                         ": failed to initiate container reshape%s%s\n",
702                         err ? ": " : "", err ? strerror(err) : "");
703                 return rv;
704         }
705
706         return changed;
707 }
708
709 static void revert_container_raid_disks(struct supertype *st, int fd, char *container)
710 {
711         /* we failed to prepare all subarrays in the container for
712          * reshape, so cancel the changes and restore the nominal raid
713          * level
714          */
715         struct mdstat_ent *ent, *e;
716
717         ent = mdstat_read(0, 0);
718         if (!ent) {
719                 fprintf(stderr, Name
720                         ": failed to read /proc/mdstat while aborting reshape\n");
721                 return;
722         }
723
724         if (st->ss->load_container(st, fd, NULL)) {
725                 fprintf(stderr, Name
726                         ": failed read metadata while aborting reshape\n");
727                 return ;
728         }
729
730
731         for (e = ent; e; e = e->next) {
732                 int level_fixed = 0, disks_fixed = 0;
733                 struct mdinfo *sub, *prev;
734                 char *subarray;
735
736                 if (!is_container_member(e, container))
737                         continue;
738
739                 subarray = to_subarray(e, container);
740                 prev = st->ss->container_content(st, subarray);
741
742                 /* changing level might change raid_disks so we do it
743                  * first and then check if raid_disks still needs fixing
744                  */
745                 if (map_name(pers, e->level) != prev->array.level) {
746                         sub = sysfs_read(-1, e->devnum, GET_VERSION);
747                         if (sub &&
748                             !sysfs_set_num(sub, NULL, "level", prev->array.level))
749                                 level_fixed = 1;
750                         sysfs_free(sub);
751                 } else
752                         level_fixed = 1;
753
754                 sub = sysfs_read(-1, e->devnum, GET_DISKS);
755                 if (sub && sub->array.raid_disks != prev->array.raid_disks) {
756                         if (!subarray_set_num(container, sub, "raid_disks",
757                                               prev->array.raid_disks))
758                                 disks_fixed = 1;
759                 } else if (sub)
760                         disks_fixed = 1;
761                 sysfs_free(sub);
762
763                 if (!disks_fixed || !level_fixed)
764                         fprintf(stderr, Name
765                                 ": failed to restore %s to a %d-disk %s array\n",
766                                 e->dev, prev->array.raid_disks,
767                                 map_num(pers, prev->array.level));
768                 free(prev);
769         }
770         st->ss->free_super(st);
771         free_mdstat(ent);
772 }
773
774 int remove_disks_on_raid10_to_raid0_takeover(struct supertype *st,
775                                              struct mdinfo *sra,
776                                              int layout)
777 {
778         int nr_of_copies;
779         struct mdinfo *remaining;
780         int slot;
781
782         nr_of_copies = layout & 0xff;
783
784         remaining = sra->devs;
785         sra->devs = NULL;
786         /* for each 'copy', select one device and remove from the list. */
787         for (slot = 0; slot < sra->array.raid_disks; slot += nr_of_copies) {
788                 struct mdinfo **diskp;
789                 int found = 0;
790
791                 /* Find a working device to keep */
792                 for (diskp =  &remaining; *diskp ; diskp = &(*diskp)->next) {
793                         struct mdinfo *disk = *diskp;
794
795                         if (disk->disk.raid_disk < slot)
796                                 continue;
797                         if (disk->disk.raid_disk >= slot + nr_of_copies)
798                                 continue;
799                         if (disk->disk.state & (1<<MD_DISK_REMOVED))
800                                 continue;
801                         if (disk->disk.state & (1<<MD_DISK_FAULTY))
802                                 continue;
803                         if (!(disk->disk.state & (1<<MD_DISK_SYNC)))
804                                 continue;
805
806                         /* We have found a good disk to use! */
807                         *diskp = disk->next;
808                         disk->next = sra->devs;
809                         sra->devs = disk;
810                         found = 1;
811                         break;
812                 }
813                 if (!found)
814                         break;
815         }
816
817         if (slot < sra->array.raid_disks) {
818                 /* didn't find all slots */
819                 struct mdinfo **e;
820                 e = &remaining;
821                 while (*e)
822                         e = &(*e)->next;
823                 *e = sra->devs;
824                 sra->devs = remaining;
825                 return 1;
826         }
827
828         /* Remove all 'remaining' devices from the array */
829         while (remaining) {
830                 struct mdinfo *sd = remaining;
831                 remaining = sd->next;
832
833                 sysfs_set_str(sra, sd, "state", "faulty");
834                 sysfs_set_str(sra, sd, "slot", "none");
835                 sysfs_set_str(sra, sd, "state", "remove");
836                 sd->disk.state |= (1<<MD_DISK_REMOVED);
837                 sd->disk.state &= ~(1<<MD_DISK_SYNC);
838                 sd->next = sra->devs;
839                 sra->devs = sd;
840         }
841         return 0;
842 }
843
844 int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
845                  long long size,
846                  int level, char *layout_str, int chunksize, int raid_disks)
847 {
848         /* Make some changes in the shape of an array.
849          * The kernel must support the change.
850          *
851          * There are three different changes.  Each can trigger
852          * a resync or recovery so we freeze that until we have
853          * requested everything (if kernel supports freezing - 2.6.30).
854          * The steps are:
855          *  - change size (i.e. component_size)
856          *  - change level
857          *  - change layout/chunksize/ndisks
858          *
859          * The last can require a reshape.  It is different on different
860          * levels so we need to check the level before actioning it.
861          * Some times the level change needs to be requested after the
862          * reshape (e.g. raid6->raid5, raid5->raid0)
863          *
864          */
865         struct mdu_array_info_s array, orig;
866         char *c;
867         int rv = 0;
868         struct supertype *st;
869         char *subarray = NULL;
870
871         int nchunk, ochunk;
872         int nlayout, olayout;
873         int ndisks, odisks;
874         unsigned int ndata, odata;
875         int orig_level = UnSet;
876         char alt_layout[40];
877         int *fdlist;
878         unsigned long long *offsets;
879         int d, i;
880         int nrdisks;
881         int err;
882         int frozen;
883         unsigned long a,b, blocks, stripes;
884         unsigned long cache;
885         unsigned long long array_size;
886         int changed = 0;
887         char *container = NULL;
888         int cfd = -1;
889         int done;
890
891         struct mdinfo *sra;
892         struct mdinfo *sd;
893
894         if (ioctl(fd, GET_ARRAY_INFO, &array) < 0) {
895                 fprintf(stderr, Name ": %s is not an active md array - aborting\n",
896                         devname);
897                 return 1;
898         }
899
900         if (size >= 0 &&
901             (chunksize || level!= UnSet || layout_str || raid_disks)) {
902                 fprintf(stderr, Name ": cannot change component size at the same time "
903                         "as other changes.\n"
904                         "   Change size first, then check data is intact before "
905                         "making other changes.\n");
906                 return 1;
907         }
908
909         if (raid_disks && raid_disks < array.raid_disks && array.level > 1 &&
910             get_linux_version() < 2006032 &&
911             !check_env("MDADM_FORCE_FEWER")) {
912                 fprintf(stderr, Name ": reducing the number of devices is not safe before Linux 2.6.32\n"
913                         "       Please use a newer kernel\n");
914                 return 1;
915         }
916
917         st = super_by_fd(fd, &subarray);
918         if (!st) {
919                 fprintf(stderr, Name ": Unable to determine metadata format for %s\n", devname);
920                 return 1;
921         }
922
923         /* in the external case we need to check that the requested reshape is
924          * supported, and perform an initial check that the container holds the
925          * pre-requisite spare devices (mdmon owns final validation)
926          */
927         if (st->ss->external) {
928                 int container_dev;
929                 int rv;
930
931                 if (subarray) {
932                         container_dev = st->container_dev;
933                         cfd = open_dev_excl(st->container_dev);
934                 } else if (size >= 0 || layout_str != NULL || chunksize != 0 ||
935                            level != UnSet) {
936                         fprintf(stderr,
937                                 Name ": %s is a container, only 'raid-devices' can be changed\n",
938                                 devname);
939                         return 1;
940                 } else {
941                         container_dev = st->devnum;
942                         close(fd);
943                         cfd = open_dev_excl(st->devnum);
944                         fd = cfd;
945                 }
946                 if (cfd < 0) {
947                         fprintf(stderr, Name ": Unable to open container for %s\n",
948                                 devname);
949                         free(subarray);
950                         return 1;
951                 }
952
953                 container = devnum2devname(st->devnum);
954                 if (!container) {
955                         fprintf(stderr, Name ": Could not determine container name\n");
956                         free(subarray);
957                         return 1;
958                 }
959
960                 if (subarray)
961                         rv = st->ss->load_container(st, cfd, NULL);
962                 else
963                         rv = st->ss->load_super(st, cfd, NULL);
964                 if (rv) {
965                         fprintf(stderr, Name ": Cannot read superblock for %s\n",
966                                 devname);
967                         free(subarray);
968                         return 1;
969                 }
970
971                 if (mdmon_running(container_dev))
972                         st->update_tail = &st->updates;
973         }
974
975         sra = sysfs_read(fd, 0, GET_LEVEL | GET_DISKS | GET_DEVS | GET_STATE);
976         if (sra) {
977                 if (st->ss->external && subarray == NULL) {
978                         array.level = LEVEL_CONTAINER;
979                         sra->array.level = LEVEL_CONTAINER;
980                 }
981         } else {
982                 fprintf(stderr, Name ": failed to read sysfs parameters for %s\n",
983                         devname);
984                 return 1;
985         }
986         frozen = freeze(st);
987         if (frozen < -1) {
988                 /* freeze() already spewed the reason */
989                 return 1;
990         } else if (frozen < 0) {
991                 fprintf(stderr, Name ": %s is performing resync/recovery and cannot"
992                         " be reshaped\n", devname);
993                 return 1;
994         }
995
996         /* ========= set size =============== */
997         if (size >= 0 && (size == 0 || size != array.size)) {
998                 long long orig_size = array.size;
999
1000                 if (reshape_super(st, size, UnSet, UnSet, 0, 0, NULL, devname, !quiet)) {
1001                         rv = 1;
1002                         goto release;
1003                 }
1004                 sync_metadata(st);
1005                 array.size = size;
1006                 if (array.size != size) {
1007                         /* got truncated to 32bit, write to
1008                          * component_size instead
1009                          */
1010                         if (sra)
1011                                 rv = sysfs_set_num(sra, NULL,
1012                                                    "component_size", size);
1013                         else
1014                                 rv = -1;
1015                 } else
1016                         rv = ioctl(fd, SET_ARRAY_INFO, &array);
1017                 if (rv != 0) {
1018                         int err = errno;
1019
1020                         /* restore metadata */
1021                         if (reshape_super(st, orig_size, UnSet, UnSet, 0, 0,
1022                                           NULL, devname, !quiet) == 0)
1023                                 sync_metadata(st);
1024                         fprintf(stderr, Name ": Cannot set device size for %s: %s\n",
1025                                 devname, strerror(err));
1026                         if (err == EBUSY && 
1027                             (array.state & (1<<MD_SB_BITMAP_PRESENT)))
1028                                 fprintf(stderr, "       Bitmap must be removed before size can be changed\n");
1029                         rv = 1;
1030                         goto release;
1031                 }
1032                 ioctl(fd, GET_ARRAY_INFO, &array);
1033                 size = get_component_size(fd)/2;
1034                 if (size == 0)
1035                         size = array.size;
1036                 if (!quiet)
1037                         fprintf(stderr, Name ": component size of %s has been set to %lluK\n",
1038                                 devname, size);
1039                 changed = 1;
1040         } else if (array.level != LEVEL_CONTAINER) {
1041                 size = get_component_size(fd)/2;
1042                 if (size == 0)
1043                         size = array.size;
1044         }
1045
1046         /* ========= check for Raid10 -> Raid0 conversion ===============
1047          * current implemenation assumes that following conditions must be met:
1048          * - far_copies == 1
1049          * - near_copies == 2
1050          */
1051         if (level == 0 && array.level == 10 &&
1052             array.layout == ((1 << 8) + 2) && !(array.raid_disks & 1)) {
1053                 int err;
1054                 err = remove_disks_on_raid10_to_raid0_takeover(st, sra, array.layout);
1055                 if (err) {
1056                         dprintf(Name": Array cannot be reshaped\n");
1057                         if (container)
1058                                 free(container);
1059                         if (cfd > -1)
1060                                 close(cfd);
1061                         return 1;
1062                 }
1063         }
1064
1065         /* ======= set level =========== */
1066         if (level != UnSet && level != array.level) {
1067                 /* Trying to change the level.
1068                  * We might need to change layout first and schedule a
1069                  * level change for later.
1070                  * Level changes that can happen immediately are:
1071                  * 0->4,5,6  1->5  4->5,6  5->1,6
1072                  * Level changes that need a layout change first are:
1073                  * 6->5,4,0 : need a -6 layout, or parity-last
1074                  * 5->4,0   : need parity-last
1075                  */
1076                 if ((array.level == 6 || array.level == 5) &&
1077                     (level == 5 || level == 4 || level == 0)) {
1078                         /* Don't change level yet, but choose intermediate
1079                          * layout
1080                          */
1081                         if (level == 5) {
1082                                 if (layout_str == NULL)
1083                                         switch (array.layout) {
1084                                         case ALGORITHM_LEFT_ASYMMETRIC:
1085                                         case ALGORITHM_LEFT_ASYMMETRIC_6:
1086                                         case ALGORITHM_ROTATING_N_RESTART:
1087                                                 layout_str = "left-asymmetric-6";
1088                                                 break;
1089                                         case ALGORITHM_LEFT_SYMMETRIC:
1090                                         case ALGORITHM_LEFT_SYMMETRIC_6:
1091                                         case ALGORITHM_ROTATING_N_CONTINUE:
1092                                                 layout_str = "left-symmetric-6";
1093                                                 break;
1094                                         case ALGORITHM_RIGHT_ASYMMETRIC:
1095                                         case ALGORITHM_RIGHT_ASYMMETRIC_6:
1096                                         case ALGORITHM_ROTATING_ZERO_RESTART:
1097                                                 layout_str = "right-asymmetric-6";
1098                                                 break;
1099                                         case ALGORITHM_RIGHT_SYMMETRIC:
1100                                         case ALGORITHM_RIGHT_SYMMETRIC_6:
1101                                                 layout_str = "right-symmetric-6";
1102                                                 break;
1103                                         case ALGORITHM_PARITY_0:
1104                                         case ALGORITHM_PARITY_0_6:
1105                                                 layout_str = "parity-first-6";
1106                                                 break;
1107                                         case ALGORITHM_PARITY_N:
1108                                                 layout_str = "parity-last";
1109                                                 break;
1110                                         default:
1111                                                 fprintf(stderr, Name ": %s: cannot"
1112                                                         "convert layout to RAID5 equivalent\n",
1113                                                         devname);
1114                                                 rv = 1;
1115                                                 goto release;
1116                                         }
1117                                 else {
1118                                         int l = map_name(r5layout, layout_str);
1119                                         if (l == UnSet) {
1120                                                 fprintf(stderr, Name ": %s: layout '%s' not recognised\n",
1121                                                         devname, layout_str);
1122                                                 rv = 1;
1123                                                 goto release;
1124                                         }
1125                                         if (l != ALGORITHM_PARITY_N) {
1126                                                 /* need the -6 version */
1127                                                 char *ls = map_num(r5layout, l);
1128                                                 strcat(strcpy(alt_layout, ls),
1129                                                        "-6");
1130                                                 layout_str = alt_layout;
1131                                         }
1132                                 }
1133                                 if (raid_disks)
1134                                         /* The final raid6->raid5 conversion
1135                                          * will reduce the number of disks,
1136                                          * so now we need to aim higher
1137                                          */
1138                                         raid_disks++;
1139                         } else
1140                                 layout_str = "parity-last";
1141                 } else {
1142                         /* Level change is a simple takeover.  In the external
1143                          * case we don't check with the metadata handler until
1144                          * we establish what the final layout will be.  If the
1145                          * level change is disallowed we will revert to
1146                          * orig_level without disturbing the metadata, otherwise
1147                          * we will send an update.
1148                          */
1149                         c = map_num(pers, level);
1150                         if (c == NULL) {
1151                                 rv = 1;/* not possible */
1152                                 goto release;
1153                         }
1154                         err = sysfs_set_str(sra, NULL, "level", c);
1155                         if (err) {
1156                                 err = errno;
1157                                 fprintf(stderr, Name ": %s: could not set level to %s\n",
1158                                         devname, c);
1159                                 if (err == EBUSY && 
1160                                     (array.state & (1<<MD_SB_BITMAP_PRESENT)))
1161                                         fprintf(stderr, "       Bitmap must be removed before level can be changed\n");
1162                                 rv = 1;
1163                                 goto release;
1164                         }
1165                         orig = array;
1166                         orig_level = orig.level;
1167                         ioctl(fd, GET_ARRAY_INFO, &array);
1168                         if (layout_str == NULL &&
1169                             orig.level == 5 && level == 6 &&
1170                             array.layout != orig.layout)
1171                                 layout_str = map_num(r5layout, orig.layout);
1172                         if (!quiet)
1173                                 fprintf(stderr, Name " level of %s changed to %s\n",
1174                                         devname, c);
1175                         changed = 1;
1176                 }
1177         }
1178
1179         /* ========= set shape (chunk_size / layout / ndisks)  ============== */
1180         /* Check if layout change is a no-op */
1181         switch (array.level) {
1182         case 5:
1183                 if (layout_str && array.layout == map_name(r5layout, layout_str))
1184                         layout_str = NULL;
1185                 break;
1186         case 6:
1187                 if (layout_str == NULL &&
1188                     ((chunksize && chunksize * 1024 != array.chunk_size) ||
1189                      (raid_disks && raid_disks != array.raid_disks)) &&
1190                     array.layout >= 16) {
1191                         fprintf(stderr, Name
1192                                 ": %s has a non-standard layout.  If you wish to preserve this\n"
1193                                 "      during the reshape, please specify --layout=preserve\n"
1194                                 "      If you want to change it, specify a layout or use --layout=normalise\n",
1195                                 devname);
1196                         rv = 1;
1197                         goto release;
1198                 }
1199                 if (layout_str &&
1200                     (strcmp(layout_str, "normalise") == 0 ||
1201                      strcmp(layout_str, "normalize") == 0)) {
1202                         char *hyphen;
1203                         strcpy(alt_layout, map_num(r6layout, array.layout));
1204                         hyphen = strrchr(alt_layout, '-');
1205                         if (hyphen && strcmp(hyphen, "-6") == 0) {
1206                                 *hyphen = 0;
1207                                 layout_str = alt_layout;
1208                         }
1209                 }
1210
1211                 if (layout_str && array.layout == map_name(r6layout, layout_str))
1212                         layout_str = NULL;
1213                 if (layout_str && strcmp(layout_str, "preserve") == 0)
1214                         layout_str = NULL;
1215                 break;
1216         }
1217         if (layout_str == NULL
1218             && (chunksize == 0 || chunksize*1024 == array.chunk_size)
1219             && (raid_disks == 0 || raid_disks == array.raid_disks)) {
1220                 if (reshape_super(st, -1, level, UnSet, 0, 0, NULL, devname, !quiet)) {
1221                         rv = 1;
1222                         goto release;
1223                 }
1224                 sync_metadata(st);
1225                 rv = 0;
1226                 if (level != UnSet && level != array.level) {
1227                         /* Looks like this level change doesn't need
1228                          * a reshape after all.
1229                          */
1230                         c = map_num(pers, level);
1231                         if (c) {
1232                                 rv = sysfs_set_str(sra, NULL, "level", c);
1233                                 if (rv) {
1234                                         int err = errno;
1235                                         fprintf(stderr, Name ": %s: could not set level to %s\n",
1236                                                 devname, c);
1237                                         if (err == EBUSY && 
1238                                             (array.state & (1<<MD_SB_BITMAP_PRESENT)))
1239                                                 fprintf(stderr, "       Bitmap must be removed before level can be changed\n");
1240                                         rv = 1;
1241                                 }
1242                         }
1243                 } else if (!changed && !quiet)
1244                         fprintf(stderr, Name ": %s: no change requested\n",
1245                                 devname);
1246
1247                 if (st->ss->external && !mdmon_running(st->container_dev) &&
1248                     level > 0) {
1249                         start_mdmon(st->container_dev);
1250                         ping_monitor(container);
1251                 }
1252                 goto release;
1253         }
1254
1255         c = map_num(pers, array.level);
1256         if (c == NULL) c = "-unknown-";
1257         switch (array.level) {
1258         default: /* raid0, linear, multipath cannot be reconfigured */
1259                 fprintf(stderr, Name ": %s array %s cannot be reshaped.\n",
1260                         c, devname);
1261                 /* TODO raid0 raiddisks can be reshaped via raid4 */
1262                 rv = 1;
1263                 break;
1264         case LEVEL_CONTAINER: {
1265                 int count;
1266
1267                 /* double check that we are not changing anything but raid_disks */
1268                 if (size >= 0 || layout_str != NULL || chunksize != 0 || level != UnSet) {
1269                         fprintf(stderr,
1270                                 Name ": %s is a container, only 'raid-devices' can be changed\n",
1271                                 devname);
1272                         rv = 1;
1273                         goto release;
1274                 }
1275
1276                 st->update_tail = &st->updates;
1277                 if (reshape_super(st, -1, UnSet, UnSet, 0, raid_disks,
1278                                   backup_file, devname, !quiet)) {
1279                         rv = 1;
1280                         goto release;
1281                 }
1282
1283                 count = reshape_container_raid_disks(container, raid_disks);
1284                 if (count < 0) {
1285                         revert_container_raid_disks(st, fd, container);
1286                         rv = 1;
1287                         goto release;
1288                 } else if (count == 0) {
1289                         if (!quiet)
1290                                 fprintf(stderr, Name
1291                                         ": no active subarrays to reshape\n");
1292                         goto release;
1293                 }
1294
1295                 if (!mdmon_running(st->devnum)) {
1296                         start_mdmon(st->devnum);
1297                         ping_monitor(container);
1298                 }
1299                 sync_metadata(st);
1300
1301                 /* give mdmon a chance to allocate spares */
1302                 ping_manager(container);
1303
1304                 /* manage_reshape takes care of releasing the array(s) */
1305                 st->ss->manage_reshape(st, backup_file);
1306                 frozen = 0;
1307                 goto release;
1308         }
1309         case LEVEL_FAULTY: /* only 'layout' change is permitted */
1310
1311                 if (chunksize  || raid_disks) {
1312                         fprintf(stderr, Name ": %s: Cannot change chunksize or disks of a 'faulty' array\n",
1313                                 devname);
1314                         rv = 1;
1315                         break;
1316                 }
1317                 if (layout_str == NULL)
1318                         break; /* nothing to do.... */
1319
1320                 array.layout = parse_layout_faulty(layout_str);
1321                 if (array.layout < 0) {
1322                         fprintf(stderr, Name ": %s: layout %s not understood for 'faulty' array\n",
1323                                 devname, layout_str);
1324                         rv = 1;
1325                         break;
1326                 }
1327                 if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) {
1328                         fprintf(stderr, Name ": Cannot set layout for %s: %s\n",
1329                                 devname, strerror(errno));
1330                         rv = 1;
1331                 } else if (!quiet)
1332                         printf("layout for %s set to %d\n", devname, array.layout);
1333                 break;
1334
1335         case 1: /* only raid_disks can each be changed. */
1336
1337                 if (chunksize || layout_str != NULL) {
1338                         fprintf(stderr, Name ": %s: Cannot change chunk size or layout for a RAID1 array.\n",
1339                                 devname);
1340                         rv = 1;
1341                         break;
1342                 }
1343                 if (raid_disks > 0) {
1344                         if (reshape_super(st, -1, UnSet, UnSet, 0, raid_disks,
1345                                           NULL, devname, !quiet)) {
1346                                 rv = 1;
1347                                 goto release;
1348                         }
1349                         sync_metadata(st);
1350                         array.raid_disks = raid_disks;
1351                         if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) {
1352                                 fprintf(stderr, Name ": Cannot set raid-devices for %s: %s\n",
1353                                         devname, strerror(errno));
1354                                 rv = 1;
1355                         }
1356                 }
1357                 break;
1358
1359         case 4:
1360         case 5:
1361         case 6:
1362
1363                 /*
1364                  * layout/chunksize/raid_disks can be changed
1365                  * though the kernel may not support it all.
1366                  */
1367                 if (subarray) {
1368                         fprintf(stderr, Name ": Cannot reshape subarrays yet\n");
1369                         break;
1370                 }
1371
1372                 /*
1373                  * There are three possibilities.
1374                  * 1/ The array will shrink.
1375                  *    We need to ensure the reshape will pause before reaching
1376                  *    the 'critical section'.  We also need to fork and wait for
1377                  *    that to happen.  When it does we 
1378                  *       suspend/backup/complete/unfreeze
1379                  *
1380                  * 2/ The array will not change size.
1381                  *    This requires that we keep a backup of a sliding window
1382                  *    so that we can restore data after a crash.  So we need
1383                  *    to fork and monitor progress.
1384                  *
1385                  * 3/ The array will grow. This is relatively easy.
1386                  *    However the kernel's restripe routines will cheerfully
1387                  *    overwrite some early data before it is safe.  So we
1388                  *    need to make a backup of the early parts of the array
1389                  *    and be ready to restore it if rebuild aborts very early.
1390                  *
1391                  *    We backup data by writing it to one spare, or to a
1392                  *    file which was given on command line.
1393                  *
1394                  *    [FOLLOWING IS OLD AND PARTLY WRONG]
1395                  *    So: we enumerate the devices in the array and
1396                  *    make sure we can open all of them.
1397                  *    Then we freeze the early part of the array and
1398                  *    backup to the various spares.
1399                  *    Then we request changes and start the reshape.
1400                  *    Monitor progress until it has passed the danger zone.
1401                  *    and finally invalidate the copied data and unfreeze the
1402                  *    start of the array.
1403                  *
1404                  * In each case, we first make sure that storage is available
1405                  * for the required backup.
1406                  * Then we:
1407                  *   -  request the shape change.
1408                  *   -  for to handle backup etc.
1409                  */
1410                 nchunk = ochunk = array.chunk_size;
1411                 nlayout = olayout = array.layout;
1412                 ndisks = odisks = array.raid_disks;
1413
1414                 if (chunksize) {
1415                         nchunk = chunksize * 1024;
1416                         if (size % chunksize) {
1417                                 fprintf(stderr, Name ": component size %lluK is not"
1418                                         " a multiple of chunksize %dK\n",
1419                                         size, chunksize);
1420                                 break;
1421                         }
1422                 }
1423                 if (layout_str != NULL)
1424                         switch(array.level) {
1425                         case 4: /* ignore layout */
1426                                 break;
1427                         case 5:
1428                                 nlayout = map_name(r5layout, layout_str);
1429                                 if (nlayout == UnSet) {
1430                                         fprintf(stderr, Name ": layout %s not understood for raid5.\n",
1431                                                 layout_str);
1432                                         rv = 1;
1433                                         goto release;
1434                                 }
1435                                 break;
1436
1437                         case 6:
1438                                 nlayout = map_name(r6layout, layout_str);
1439                                 if (nlayout == UnSet) {
1440                                         fprintf(stderr, Name ": layout %s not understood for raid6.\n",
1441                                                 layout_str);
1442                                         rv = 1;
1443                                         goto release;
1444                                 }
1445                                 break;
1446                         }
1447                 if (raid_disks) ndisks = raid_disks;
1448
1449                 odata = odisks-1;
1450                 ndata = ndisks-1;
1451                 if (array.level == 6) {
1452                         odata--; /* number of data disks */
1453                         ndata--;
1454                 }
1455
1456                 if (odata == ndata &&
1457                     get_linux_version() < 2006032) {
1458                         fprintf(stderr, Name ": in-place reshape is not safe before 2.6.32, sorry.\n");
1459                         break;
1460                 }
1461
1462                 /* Check that we can hold all the data */
1463                 get_dev_size(fd, NULL, &array_size);
1464                 if (ndata * (unsigned long long)size < (array_size/1024)) {
1465                         fprintf(stderr, Name ": this change will reduce the size of the array.\n"
1466                                 "       use --grow --array-size first to truncate array.\n"
1467                                 "       e.g. mdadm --grow %s --array-size %llu\n",
1468                                 devname, ndata * size);
1469                         rv = 1;
1470                         break;
1471                 }
1472
1473                 /* So how much do we need to backup.
1474                  * We need an amount of data which is both a whole number of
1475                  * old stripes and a whole number of new stripes.
1476                  * So LCM for (chunksize*datadisks).
1477                  */
1478                 a = (ochunk/512) * odata;
1479                 b = (nchunk/512) * ndata;
1480                 /* Find GCD */
1481                 while (a != b) {
1482                         if (a < b)
1483                                 b -= a;
1484                         if (b < a)
1485                                 a -= b;
1486                 }
1487                 /* LCM == product / GCD */
1488                 blocks = (ochunk/512) * (nchunk/512) * odata * ndata / a;
1489
1490                 sysfs_free(sra);
1491                 sra = sysfs_read(fd, 0,
1492                                  GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE|
1493                                  GET_CACHE);
1494
1495                 if (!sra) {
1496                         fprintf(stderr, Name ": %s: Cannot get array details from sysfs\n",
1497                                 devname);
1498                         rv = 1;
1499                         break;
1500                 }
1501
1502                 if (ndata == odata) {
1503                         /* Make 'blocks' bigger for better throughput, but
1504                          * not so big that we reject it below.
1505                          * Try for 16 megabytes
1506                          */
1507                         while (blocks * 32 < sra->component_size &&
1508                                blocks < 16*1024*2)
1509                                blocks *= 2;
1510                 } else
1511                         fprintf(stderr, Name ": Need to backup %luK of critical "
1512                                 "section..\n", blocks/2);
1513
1514                 if (blocks >= sra->component_size/2) {
1515                         fprintf(stderr, Name ": %s: Something wrong - reshape aborted\n",
1516                                 devname);
1517                         rv = 1;
1518                         break;
1519                 }
1520                 nrdisks = array.raid_disks + sra->array.spare_disks;
1521                 /* Now we need to open all these devices so we can read/write.
1522                  */
1523                 fdlist = malloc((1+nrdisks) * sizeof(int));
1524                 offsets = malloc((1+nrdisks) * sizeof(offsets[0]));
1525                 if (!fdlist || !offsets) {
1526                         fprintf(stderr, Name ": malloc failed: grow aborted\n");
1527                         rv = 1;
1528                         break;
1529                 }
1530                 for (d=0; d <= nrdisks; d++)
1531                         fdlist[d] = -1;
1532                 d = array.raid_disks;
1533                 for (sd = sra->devs; sd; sd=sd->next) {
1534                         if (sd->disk.state & (1<<MD_DISK_FAULTY))
1535                                 continue;
1536                         if (sd->disk.state & (1<<MD_DISK_SYNC)) {
1537                                 char *dn = map_dev(sd->disk.major,
1538                                                    sd->disk.minor, 1);
1539                                 fdlist[sd->disk.raid_disk]
1540                                         = dev_open(dn, O_RDONLY);
1541                                 offsets[sd->disk.raid_disk] = sd->data_offset*512;
1542                                 if (fdlist[sd->disk.raid_disk] < 0) {
1543                                         fprintf(stderr, Name ": %s: cannot open component %s\n",
1544                                                 devname, dn?dn:"-unknown-");
1545                                         rv = 1;
1546                                         goto release;
1547                                 }
1548                         } else if (backup_file == NULL) {
1549                                 /* spare */
1550                                 char *dn = map_dev(sd->disk.major,
1551                                                    sd->disk.minor, 1);
1552                                 fdlist[d] = dev_open(dn, O_RDWR);
1553                                 offsets[d] = (sd->data_offset + sra->component_size - blocks - 8)*512;
1554                                 if (fdlist[d]<0) {
1555                                         fprintf(stderr, Name ": %s: cannot open component %s\n",
1556                                                 devname, dn?dn:"-unknown");
1557                                         rv = 1;
1558                                         goto release;
1559                                 }
1560                                 d++;
1561                         }
1562                 }
1563                 if (backup_file == NULL) {
1564                         if (st->ss->external && !st->ss->manage_reshape) {
1565                                 fprintf(stderr, Name ": %s Grow operation not supported by %s metadata\n",
1566                                         devname, st->ss->name);
1567                                 rv = 1;
1568                                 break;
1569                         }
1570                         if (ndata <= odata) {
1571                                 fprintf(stderr, Name ": %s: Cannot grow - need backup-file\n",
1572                                         devname);
1573                                 rv = 1;
1574                                 break;
1575                         } else if (sra->array.spare_disks == 0) {
1576                                 fprintf(stderr, Name ": %s: Cannot grow - need a spare or "
1577                                         "backup-file to backup critical section\n",
1578                                         devname);
1579                                 rv = 1;
1580                                 break;
1581                         }
1582                         if (d == array.raid_disks) {
1583                                 fprintf(stderr, Name ": %s: No spare device for backup\n",
1584                                         devname);
1585                                 rv = 1;
1586                                 break;
1587                         }
1588                 } else {
1589                         /* need to check backup file is large enough */
1590                         char buf[512];
1591                         fdlist[d] = open(backup_file, O_RDWR|O_CREAT|O_EXCL,
1592                                      S_IRUSR | S_IWUSR);
1593                         offsets[d] = 8 * 512;
1594                         if (fdlist[d] < 0) {
1595                                 fprintf(stderr, Name ": %s: cannot create backup file %s: %s\n",
1596                                         devname, backup_file, strerror(errno));
1597                                 rv = 1;
1598                                 break;
1599                         }
1600                         memset(buf, 0, 512);
1601                         for (i=0; i < (signed)blocks + 1 ; i++) {
1602                                 if (write(fdlist[d], buf, 512) != 512) {
1603                                         fprintf(stderr, Name ": %s: cannot create backup file %s: %s\n",
1604                                                 devname, backup_file, strerror(errno));
1605                                         rv = 1;
1606                                         break;
1607                                 }
1608                         }
1609                         if (fsync(fdlist[d]) != 0) {
1610                                 fprintf(stderr, Name ": %s: cannot create backup file %s: %s\n",
1611                                         devname, backup_file, strerror(errno));
1612                                 rv = 1;
1613                                 break;
1614                         }
1615                         d++;
1616                 }
1617
1618                 /* check that the operation is supported by the metadata */
1619                 if (reshape_super(st, -1, level, nlayout, nchunk, ndisks,
1620                                   backup_file, devname, !quiet)) {
1621                         rv = 1;
1622                         break;
1623                 }
1624
1625                 /* lastly, check that the internal stripe cache is
1626                  * large enough, or it won't work.
1627                  */
1628                 
1629                 cache = (nchunk < ochunk) ? ochunk : nchunk;
1630                 cache = cache * 4 / 4096;
1631                 if (cache < blocks / 8 / odisks + 16)
1632                         /* Make it big enough to hold 'blocks' */
1633                         cache = blocks / 8 / odisks + 16;
1634                 if (sra->cache_size < cache)
1635                         sysfs_set_num(sra, NULL, "stripe_cache_size",
1636                                       cache+1);
1637                 /* Right, everything seems fine. Let's kick things off.
1638                  * If only changing raid_disks, use ioctl, else use
1639                  * sysfs.
1640                  */
1641                 sync_metadata(st);
1642                 if (ochunk == nchunk && olayout == nlayout) {
1643                         array.raid_disks = ndisks;
1644                         if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) {
1645                                 int err = errno;
1646                                 rv = 1;
1647                                 fprintf(stderr, Name ": Cannot set device shape for %s: %s\n",
1648                                         devname, strerror(errno));
1649                                 if (ndisks < odisks &&
1650                                     get_linux_version() < 2006030)
1651                                         fprintf(stderr, Name ": linux 2.6.30 or later required\n");
1652                                 if (err == EBUSY && 
1653                                     (array.state & (1<<MD_SB_BITMAP_PRESENT)))
1654                                         fprintf(stderr, "       Bitmap must be removed before shape can be changed\n");
1655
1656                                 break;
1657                         }
1658                 } else {
1659                         /* set them all just in case some old 'new_*' value
1660                          * persists from some earlier problem
1661                          */
1662                         int err = err; /* only used if rv==1, and always set if
1663                                         * rv==1, so initialisation not needed,
1664                                         * despite gcc warning
1665                                         */
1666                         if (sysfs_set_num(sra, NULL, "chunk_size", nchunk) < 0)
1667                                 rv = 1, err = errno;
1668                         if (!rv && sysfs_set_num(sra, NULL, "layout", nlayout) < 0)
1669                                 rv = 1, err = errno;
1670                         if (!rv && sysfs_set_num(sra, NULL, "raid_disks", ndisks) < 0)
1671                                 rv = 1, err = errno;
1672                         if (rv) {
1673                                 fprintf(stderr, Name ": Cannot set device shape for %s\n",
1674                                         devname);
1675                                 if (get_linux_version() < 2006030)
1676                                         fprintf(stderr, Name ": linux 2.6.30 or later required\n");
1677                                 if (err == EBUSY && 
1678                                     (array.state & (1<<MD_SB_BITMAP_PRESENT)))
1679                                         fprintf(stderr, "       Bitmap must be removed before shape can be changed\n");
1680                                 break;
1681                         }
1682                 }
1683
1684                 if (ndisks == 2 && odisks == 2) {
1685                         /* No reshape is needed in this trivial case */
1686                         rv = 0;
1687                         break;
1688                 }
1689
1690                 if (st->ss->external) {
1691                         /* metadata handler takes it from here */
1692                         ping_manager(container);
1693                         st->ss->manage_reshape(st, backup_file);
1694                         frozen = 0;
1695                         break;
1696                 }
1697
1698                 /* set up the backup-super-block.  This requires the
1699                  * uuid from the array.
1700                  */
1701                 /* Find a superblock */
1702                 for (sd = sra->devs; sd; sd = sd->next) {
1703                         char *dn;
1704                         int devfd;
1705                         int ok;
1706                         if (sd->disk.state & (1<<MD_DISK_FAULTY))
1707                                 continue;
1708                         dn = map_dev(sd->disk.major, sd->disk.minor, 1);
1709                         devfd = dev_open(dn, O_RDONLY);
1710                         if (devfd < 0)
1711                                 continue;
1712                         ok = st->ss->load_super(st, devfd, NULL);
1713                         close(devfd);
1714                         if (ok >= 0)
1715                                 break;
1716                 }
1717                 if (!sd) {
1718                         fprintf(stderr, Name ": %s: Cannot find a superblock\n",
1719                                 devname);
1720                         rv = 1;
1721                         break;
1722                 }
1723
1724                 memset(&bsb, 0, 512);
1725                 memcpy(bsb.magic, "md_backup_data-1", 16);
1726                 st->ss->uuid_from_super(st, (int*)&bsb.set_uuid);
1727                 bsb.mtime = __cpu_to_le64(time(0));
1728                 bsb.devstart2 = blocks;
1729                 stripes = blocks / (ochunk/512) / odata;
1730                 /* Now we just need to kick off the reshape and watch, while
1731                  * handling backups of the data...
1732                  * This is all done by a forked background process.
1733                  */
1734                 switch(fork()) {
1735                 case 0:
1736                         close(fd);
1737                         if (check_env("MDADM_GROW_VERIFY"))
1738                                 fd = open(devname, O_RDONLY | O_DIRECT);
1739                         else
1740                                 fd = -1;
1741                         mlockall(MCL_FUTURE);
1742
1743                         if (odata < ndata)
1744                                 done = child_grow(fd, sra, stripes,
1745                                                   fdlist, offsets,
1746                                                   odisks, ochunk, array.level, olayout, odata,
1747                                                   d - odisks, fdlist+odisks, offsets+odisks);
1748                         else if (odata > ndata)
1749                                 done = child_shrink(fd, sra, stripes,
1750                                                     fdlist, offsets,
1751                                                     odisks, ochunk, array.level, olayout, odata,
1752                                                     d - odisks, fdlist+odisks, offsets+odisks);
1753                         else
1754                                 done = child_same_size(fd, sra, stripes,
1755                                                        fdlist, offsets,
1756                                                        0,
1757                                                        odisks, ochunk, array.level, olayout, odata,
1758                                                        d - odisks, fdlist+odisks, offsets+odisks);
1759                         if (backup_file && done)
1760                                 unlink(backup_file);
1761                         if (level != UnSet && level != array.level) {
1762                                 /* We need to wait for the reshape to finish
1763                                  * (which will have happened unless odata < ndata)
1764                                  * and then set the level
1765                                  */
1766
1767                                 c = map_num(pers, level);
1768                                 if (c == NULL)
1769                                         exit(0);/* not possible */
1770
1771                                 if (odata < ndata)
1772                                         wait_reshape(sra);
1773                                 err = sysfs_set_str(sra, NULL, "level", c);
1774                                 if (err)
1775                                         fprintf(stderr, Name ": %s: could not set level to %s\n",
1776                                                 devname, c);
1777                         }
1778                         exit(0);
1779                 case -1:
1780                         fprintf(stderr, Name ": Cannot run child to monitor reshape: %s\n",
1781                                 strerror(errno));
1782                         rv = 1;
1783                         break;
1784                 default:
1785                         /* The child will take care of unfreezing the array */
1786                         frozen = 0;
1787                         break;
1788                 }
1789                 break;
1790
1791         }
1792
1793  release:
1794         if (rv && orig_level != UnSet && sra) {
1795                 c = map_num(pers, orig_level);
1796                 if (c && sysfs_set_str(sra, NULL, "level", c) == 0)
1797                         fprintf(stderr, Name ": aborting level change\n");
1798         }
1799         unfreeze(st, frozen);
1800         return rv;
1801 }
1802
1803 /*
1804  * We run a child process in the background which performs the following
1805  * steps:
1806  *   - wait for resync to reach a certain point
1807  *   - suspend io to the following section
1808  *   - backup that section
1809  *   - allow resync to proceed further
1810  *   - resume io
1811  *   - discard the backup.
1812  *
1813  * These are combined in slightly different ways in the three cases.
1814  * Grow:
1815  *   - suspend/backup/allow/wait/resume/discard
1816  * Shrink:
1817  *   - allow/wait/suspend/backup/allow/wait/resume/discard
1818  * same-size:
1819  *   - wait/resume/discard/suspend/backup/allow
1820  *
1821  * suspend/backup/allow always come together
1822  * wait/resume/discard do too.
1823  * For the same-size case we have two backups to improve flow.
1824  * 
1825  */
1826
1827 /* FIXME return status is never checked */
1828 static int grow_backup(struct mdinfo *sra,
1829                 unsigned long long offset, /* per device */
1830                 unsigned long stripes, /* per device */
1831                 int *sources, unsigned long long *offsets,
1832                 int disks, int chunk, int level, int layout,
1833                 int dests, int *destfd, unsigned long long *destoffsets,
1834                 int part, int *degraded,
1835                 char *buf)
1836 {
1837         /* Backup 'blocks' sectors at 'offset' on each device of the array,
1838          * to storage 'destfd' (offset 'destoffsets'), after first
1839          * suspending IO.  Then allow resync to continue
1840          * over the suspended section.
1841          * Use part 'part' of the backup-super-block.
1842          */
1843         int odata = disks;
1844         int rv = 0;
1845         int i;
1846         unsigned long long ll;
1847         int new_degraded;
1848         //printf("offset %llu\n", offset);
1849         if (level >= 4)
1850                 odata--;
1851         if (level == 6)
1852                 odata--;
1853         sysfs_set_num(sra, NULL, "suspend_hi", (offset + stripes * (chunk/512)) * odata);
1854         /* Check that array hasn't become degraded, else we might backup the wrong data */
1855         sysfs_get_ll(sra, NULL, "degraded", &ll);
1856         new_degraded = (int)ll;
1857         if (new_degraded != *degraded) {
1858                 /* check each device to ensure it is still working */
1859                 struct mdinfo *sd;
1860                 for (sd = sra->devs ; sd ; sd = sd->next) {
1861                         if (sd->disk.state & (1<<MD_DISK_FAULTY))
1862                                 continue;
1863                         if (sd->disk.state & (1<<MD_DISK_SYNC)) {
1864                                 char sbuf[20];
1865                                 if (sysfs_get_str(sra, sd, "state", sbuf, 20) < 0 ||
1866                                     strstr(sbuf, "faulty") ||
1867                                     strstr(sbuf, "in_sync") == NULL) {
1868                                         /* this device is dead */
1869                                         sd->disk.state = (1<<MD_DISK_FAULTY);
1870                                         if (sd->disk.raid_disk >= 0 &&
1871                                             sources[sd->disk.raid_disk] >= 0) {
1872                                                 close(sources[sd->disk.raid_disk]);
1873                                                 sources[sd->disk.raid_disk] = -1;
1874                                         }
1875                                 }
1876                         }
1877                 }
1878                 *degraded = new_degraded;
1879         }
1880         if (part) {
1881                 bsb.arraystart2 = __cpu_to_le64(offset * odata);
1882                 bsb.length2 = __cpu_to_le64(stripes * (chunk/512) * odata);
1883         } else {
1884                 bsb.arraystart = __cpu_to_le64(offset * odata);
1885                 bsb.length = __cpu_to_le64(stripes * (chunk/512) * odata);
1886         }
1887         if (part)
1888                 bsb.magic[15] = '2';
1889         for (i = 0; i < dests; i++)
1890                 if (part)
1891                         lseek64(destfd[i], destoffsets[i] + __le64_to_cpu(bsb.devstart2)*512, 0);
1892                 else
1893                         lseek64(destfd[i], destoffsets[i], 0);
1894
1895         rv = save_stripes(sources, offsets, 
1896                           disks, chunk, level, layout,
1897                           dests, destfd,
1898                           offset*512*odata, stripes * chunk * odata,
1899                           buf);
1900
1901         if (rv)
1902                 return rv;
1903         bsb.mtime = __cpu_to_le64(time(0));
1904         for (i = 0; i < dests; i++) {
1905                 bsb.devstart = __cpu_to_le64(destoffsets[i]/512);
1906
1907                 bsb.sb_csum = bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum)-((char*)&bsb));
1908                 if (memcmp(bsb.magic, "md_backup_data-2", 16) == 0)
1909                         bsb.sb_csum2 = bsb_csum((char*)&bsb,
1910                                                 ((char*)&bsb.sb_csum2)-((char*)&bsb));
1911
1912                 rv = -1;
1913                 if ((unsigned long long)lseek64(destfd[i], destoffsets[i] - 4096, 0)
1914                     != destoffsets[i] - 4096)
1915                         break;
1916                 if (write(destfd[i], &bsb, 512) != 512)
1917                         break;
1918                 if (destoffsets[i] > 4096) {
1919                         if ((unsigned long long)lseek64(destfd[i], destoffsets[i]+stripes*chunk*odata, 0) !=
1920                             destoffsets[i]+stripes*chunk*odata)
1921                                 break;
1922                         if (write(destfd[i], &bsb, 512) != 512)
1923                                 break;
1924                 }
1925                 fsync(destfd[i]);
1926                 rv = 0;
1927         }
1928
1929         return rv;
1930 }
1931
1932 /* in 2.6.30, the value reported by sync_completed can be
1933  * less than it should be by one stripe.
1934  * This only happens when reshape hits sync_max and pauses.
1935  * So allow wait_backup to either extend sync_max further
1936  * than strictly necessary, or return before the
1937  * sync has got quite as far as we would really like.
1938  * This is what 'blocks2' is for.
1939  * The various callers give appropriate values so that
1940  * everything works.
1941  */
1942 /* FIXME return value is often ignored */
1943 static int wait_backup(struct mdinfo *sra,
1944                 unsigned long long offset, /* per device */
1945                 unsigned long long blocks, /* per device */
1946                 unsigned long long blocks2, /* per device - hack */
1947                 int dests, int *destfd, unsigned long long *destoffsets,
1948                 int part)
1949 {
1950         /* Wait for resync to pass the section that was backed up
1951          * then erase the backup and allow IO
1952          */
1953         int fd = sysfs_get_fd(sra, NULL, "sync_completed");
1954         unsigned long long completed;
1955         int i;
1956         int rv;
1957
1958         if (fd < 0)
1959                 return -1;
1960         sysfs_set_num(sra, NULL, "sync_max", offset + blocks + blocks2);
1961         if (offset == 0)
1962                 sysfs_set_str(sra, NULL, "sync_action", "reshape");
1963         do {
1964                 char action[20];
1965                 fd_set rfds;
1966                 FD_ZERO(&rfds);
1967                 FD_SET(fd, &rfds);
1968                 select(fd+1, NULL, NULL, &rfds, NULL);
1969                 if (sysfs_fd_get_ll(fd, &completed) < 0) {
1970                         close(fd);
1971                         return -1;
1972                 }
1973                 if (sysfs_get_str(sra, NULL, "sync_action",
1974                                   action, 20) > 0 &&
1975                     strncmp(action, "reshape", 7) != 0)
1976                         break;
1977         } while (completed < offset + blocks);
1978         close(fd);
1979
1980         if (part) {
1981                 bsb.arraystart2 = __cpu_to_le64(0);
1982                 bsb.length2 = __cpu_to_le64(0);
1983         } else {
1984                 bsb.arraystart = __cpu_to_le64(0);
1985                 bsb.length = __cpu_to_le64(0);
1986         }
1987         bsb.mtime = __cpu_to_le64(time(0));
1988         rv = 0;
1989         for (i = 0; i < dests; i++) {
1990                 bsb.devstart = __cpu_to_le64(destoffsets[i]/512);
1991                 bsb.sb_csum = bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum)-((char*)&bsb));
1992                 if (memcmp(bsb.magic, "md_backup_data-2", 16) == 0)
1993                         bsb.sb_csum2 = bsb_csum((char*)&bsb,
1994                                                 ((char*)&bsb.sb_csum2)-((char*)&bsb));
1995                 if ((unsigned long long)lseek64(destfd[i], destoffsets[i]-4096, 0) !=
1996                     destoffsets[i]-4096)
1997                         rv = -1;
1998                 if (rv == 0 && 
1999                     write(destfd[i], &bsb, 512) != 512)
2000                         rv = -1;
2001                 fsync(destfd[i]);
2002         }
2003         return rv;
2004 }
2005
static void fail(char *msg)
{
        /* Report 'msg' followed by a newline on file descriptor 2 using
         * plain write(), then terminate the process.  The exit status
         * is 2 when both writes were complete and 1 when either was not.
         */
        size_t len = strlen(msg);
        int write_failed = 0;

        if (write(2, msg, len) != (int)len)
                write_failed = 1;
        /* the newline is attempted even if the message write failed */
        if (write(2, "\n", 1) != 1)
                write_failed = 1;

        exit(write_failed ? 1 : 2);
}
2013
2014 static char *abuf, *bbuf;
2015 static unsigned long long abuflen;
2016 static void validate(int afd, int bfd, unsigned long long offset)
2017 {
2018         /* check that the data in the backup against the array.
2019          * This is only used for regression testing and should not
2020          * be used while the array is active
2021          */
2022         if (afd < 0)
2023                 return;
2024         lseek64(bfd, offset - 4096, 0);
2025         if (read(bfd, &bsb2, 512) != 512)
2026                 fail("cannot read bsb");
2027         if (bsb2.sb_csum != bsb_csum((char*)&bsb2,
2028                                      ((char*)&bsb2.sb_csum)-((char*)&bsb2)))
2029                 fail("first csum bad");
2030         if (memcmp(bsb2.magic, "md_backup_data", 14) != 0)
2031                 fail("magic is bad");
2032         if (memcmp(bsb2.magic, "md_backup_data-2", 16) == 0 &&
2033             bsb2.sb_csum2 != bsb_csum((char*)&bsb2,
2034                                      ((char*)&bsb2.sb_csum2)-((char*)&bsb2)))
2035                 fail("second csum bad");
2036
2037         if (__le64_to_cpu(bsb2.devstart)*512 != offset)
2038                 fail("devstart is wrong");
2039
2040         if (bsb2.length) {
2041                 unsigned long long len = __le64_to_cpu(bsb2.length)*512;
2042
2043                 if (abuflen < len) {
2044                         free(abuf);
2045                         free(bbuf);
2046                         abuflen = len;
2047                         if (posix_memalign((void**)&abuf, 4096, abuflen) ||
2048                             posix_memalign((void**)&bbuf, 4096, abuflen)) {
2049                                 abuflen = 0;
2050                                 /* just stop validating on mem-alloc failure */
2051                                 return;
2052                         }
2053                 }
2054
2055                 lseek64(bfd, offset, 0);
2056                 if ((unsigned long long)read(bfd, bbuf, len) != len) {
2057                         //printf("len %llu\n", len);
2058                         fail("read first backup failed");
2059                 }
2060                 lseek64(afd, __le64_to_cpu(bsb2.arraystart)*512, 0);
2061                 if ((unsigned long long)read(afd, abuf, len) != len)
2062                         fail("read first from array failed");
2063                 if (memcmp(bbuf, abuf, len) != 0) {
2064                         #if 0
2065                         int i;
2066                         printf("offset=%llu len=%llu\n",
2067                                (unsigned long long)__le64_to_cpu(bsb2.arraystart)*512, len);
2068                         for (i=0; i<len; i++)
2069                                 if (bbuf[i] != abuf[i]) {
2070                                         printf("first diff byte %d\n", i);
2071                                         break;
2072                                 }
2073                         #endif
2074                         fail("data1 compare failed");
2075                 }
2076         }
2077         if (bsb2.length2) {
2078                 unsigned long long len = __le64_to_cpu(bsb2.length2)*512;
2079
2080                 if (abuflen < len) {
2081                         free(abuf);
2082                         free(bbuf);
2083                         abuflen = len;
2084                         abuf = malloc(abuflen);
2085                         bbuf = malloc(abuflen);
2086                 }
2087
2088                 lseek64(bfd, offset+__le64_to_cpu(bsb2.devstart2)*512, 0);
2089                 if ((unsigned long long)read(bfd, bbuf, len) != len)
2090                         fail("read second backup failed");
2091                 lseek64(afd, __le64_to_cpu(bsb2.arraystart2)*512, 0);
2092                 if ((unsigned long long)read(afd, abuf, len) != len)
2093                         fail("read second from array failed");
2094                 if (memcmp(bbuf, abuf, len) != 0)
2095                         fail("data2 compare failed");
2096         }
2097 }
2098
/*
 * Child-side handling of a reshape where the device count grows.
 *
 * Backs up the first 'stripes' stripes through grow_backup(), validates
 * the backup against 'afd', waits for it with wait_backup(), then moves
 * "suspend_lo" past the backed-up region and sets "sync_max" to "max".
 *
 * Returns 1 when the sequence ran, 0 when the stripe buffer could not be
 * allocated (nothing has been touched in that case).
 */
static int child_grow(int afd, struct mdinfo *sra, unsigned long stripes,
		      int *fds, unsigned long long *offsets,
		      int disks, int chunk, int level, int layout, int data,
		      int dests, int *destfd, unsigned long long *destoffsets)
{
	/* length of the backed-up region, in 512-byte sectors per device */
	unsigned long region = stripes * (chunk / 512);
	int degraded = 0;
	char *stripe_buf;

	if (posix_memalign((void**)&stripe_buf, 4096, disks * chunk) != 0) {
		/* Don't start the 'reshape' */
		return 0;
	}

	sysfs_set_num(sra, NULL, "suspend_hi", 0);
	sysfs_set_num(sra, NULL, "suspend_lo", 0);

	grow_backup(sra, 0, stripes,
		    fds, offsets, disks, chunk, level, layout,
		    dests, destfd, destoffsets,
		    0, &degraded, stripe_buf);
	validate(afd, destfd[0], destoffsets[0]);

	wait_backup(sra, 0, region, region,
		    dests, destfd, destoffsets,
		    0);
	sysfs_set_num(sra, NULL, "suspend_lo", region * data);

	free(stripe_buf);

	/* FIXME this should probably be numeric */
	sysfs_set_str(sra, NULL, "sync_max", "max");
	return 1;
}
2126
2127 static int child_shrink(int afd, struct mdinfo *sra, unsigned long stripes,
2128                         int *fds, unsigned long long *offsets,
2129                         int disks, int chunk, int level, int layout, int data,
2130                         int dests, int *destfd, unsigned long long *destoffsets)
2131 {
2132         char *buf;
2133         unsigned long long start;
2134         int rv;
2135         int degraded = 0;
2136
2137         if (posix_memalign((void**)&buf, 4096, disks * chunk))
2138                 return 0;
2139         start = sra->component_size - stripes * (chunk/512);
2140         sysfs_set_num(sra, NULL, "sync_max", start);
2141         sysfs_set_str(sra, NULL, "sync_action", "reshape");
2142         sysfs_set_num(sra, NULL, "suspend_lo", 0);
2143         sysfs_set_num(sra, NULL, "suspend_hi", 0);
2144         rv = wait_backup(sra, 0, start - stripes * (chunk/512), stripes * (chunk/512),
2145                          dests, destfd, destoffsets, 0);
2146         if (rv < 0)
2147                 return 0;
2148         grow_backup(sra, 0, stripes,
2149                     fds, offsets,
2150                     disks, chunk, level, layout,
2151                     dests, destfd, destoffsets,
2152                     0, &degraded, buf);
2153         validate(afd, destfd[0], destoffsets[0]);
2154         wait_backup(sra, start, stripes*(chunk/512), 0,
2155                     dests, destfd, destoffsets, 0);
2156         sysfs_set_num(sra, NULL, "suspend_lo", (stripes * (chunk/512)) * data);
2157         free(buf);
2158         /* FIXME this should probably be numeric */
2159         sysfs_set_str(sra, NULL, "sync_max", "max");
2160         return 1;
2161 }
2162
2163 static int child_same_size(int afd, struct mdinfo *sra, unsigned long stripes,
2164                            int *fds, unsigned long long *offsets,
2165                            unsigned long long start,
2166                            int disks, int chunk, int level, int layout, int data,
2167                            int dests, int *destfd, unsigned long long *destoffsets)
2168 {
2169         unsigned long long size;
2170         unsigned long tailstripes = stripes;
2171         int part;
2172         char *buf;
2173         unsigned long long speed;
2174         int degraded = 0;
2175
2176
2177         if (posix_memalign((void**)&buf, 4096, disks * chunk))
2178                 return 0;
2179
2180         sysfs_set_num(sra, NULL, "suspend_lo", 0);
2181         sysfs_set_num(sra, NULL, "suspend_hi", 0);
2182
2183         sysfs_get_ll(sra, NULL, "sync_speed_min", &speed);
2184         sysfs_set_num(sra, NULL, "sync_speed_min", 200000);
2185
2186         grow_backup(sra, start, stripes,
2187                     fds, offsets,
2188                     disks, chunk, level, layout,
2189                     dests, destfd, destoffsets,
2190                     0, &degraded, buf);
2191         grow_backup(sra, (start + stripes) * (chunk/512), stripes,
2192                     fds, offsets,
2193                     disks, chunk, level, layout,
2194                     dests, destfd, destoffsets,
2195                     1, &degraded, buf);
2196         validate(afd, destfd[0], destoffsets[0]);
2197         part = 0;
2198         start += stripes * 2; /* where to read next */
2199         size = sra->component_size / (chunk/512);
2200         while (start < size) {
2201                 if (wait_backup(sra, (start-stripes*2)*(chunk/512),
2202                                 stripes*(chunk/512), 0,
2203                                 dests, destfd, destoffsets,
2204                                 part) < 0)
2205                         return 0;
2206                 sysfs_set_num(sra, NULL, "suspend_lo", start*(chunk/512) * data);
2207                 if (start + stripes > size)
2208                         tailstripes = (size - start);
2209
2210                 grow_backup(sra, start*(chunk/512), tailstripes,
2211                             fds, offsets,
2212                             disks, chunk, level, layout,
2213                             dests, destfd, destoffsets,
2214                             part, &degraded, buf);
2215                 start += stripes;
2216                 part = 1 - part;
2217                 validate(afd, destfd[0], destoffsets[0]);
2218         }
2219         if (wait_backup(sra, (start-stripes*2) * (chunk/512), stripes * (chunk/512), 0,
2220                         dests, destfd, destoffsets,
2221                         part) < 0)
2222                 return 0;
2223         sysfs_set_num(sra, NULL, "suspend_lo", ((start-stripes)*(chunk/512)) * data);
2224         wait_backup(sra, (start-stripes) * (chunk/512), tailstripes * (chunk/512), 0,
2225                     dests, destfd, destoffsets,
2226                     1-part);
2227         sysfs_set_num(sra, NULL, "suspend_lo", (size*(chunk/512)) * data);
2228         sysfs_set_num(sra, NULL, "sync_speed_min", speed);
2229         free(buf);
2230         return 1;
2231 }
2232
2233 /*
2234  * If any spare contains md_back_data-1 which is recent wrt mtime,
2235  * write that data into the array and update the super blocks with
2236  * the new reshape_progress
2237  */
2238 int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt,
2239                  char *backup_file, int verbose)
2240 {
2241         int i, j;
2242         int old_disks;
2243         unsigned long long *offsets;
2244         unsigned long long  nstripe, ostripe;
2245         int ndata, odata;
2246
2247         if (info->new_level != info->array.level)
2248                 return 1; /* Cannot handle level changes (they are instantaneous) */
2249
2250         odata = info->array.raid_disks - info->delta_disks - 1;
2251         if (info->array.level == 6) odata--; /* number of data disks */
2252         ndata = info->array.raid_disks - 1;
2253         if (info->new_level == 6) ndata--;
2254
2255         old_disks = info->array.raid_disks - info->delta_disks;
2256
2257         if (info->delta_disks <= 0)
2258                 /* Didn't grow, so the backup file must have
2259                  * been used
2260                  */
2261                 old_disks = cnt;
2262         for (i=old_disks-(backup_file?1:0); i<cnt; i++) {
2263                 struct mdinfo dinfo;
2264                 int fd;
2265                 int bsbsize;
2266                 char *devname, namebuf[20];
2267
2268                 /* This was a spare and may have some saved data on it.
2269                  * Load the superblock, find and load the
2270                  * backup_super_block.
2271                  * If either fail, go on to next device.
2272                  * If the backup contains no new info, just return
2273                  * else restore data and update all superblocks
2274                  */
2275                 if (i == old_disks-1) {
2276                         fd = open(backup_file, O_RDONLY);
2277                         if (fd<0) {
2278                                 fprintf(stderr, Name ": backup file %s inaccessible: %s\n",
2279                                         backup_file, strerror(errno));
2280                                 continue;
2281                         }
2282                         devname = backup_file;
2283                 } else {
2284                         fd = fdlist[i];
2285                         if (fd < 0)
2286                                 continue;
2287                         if (st->ss->load_super(st, fd, NULL))
2288                                 continue;
2289
2290                         st->ss->getinfo_super(st, &dinfo, NULL);
2291                         st->ss->free_super(st);
2292
2293                         if (lseek64(fd,
2294                                     (dinfo.data_offset + dinfo.component_size - 8) <<9,
2295                                     0) < 0) {
2296                                 fprintf(stderr, Name ": Cannot seek on device %d\n", i);
2297                                 continue; /* Cannot seek */
2298                         }
2299                         sprintf(namebuf, "device-%d", i);
2300                         devname = namebuf;
2301                 }
2302                 if (read(fd, &bsb, sizeof(bsb)) != sizeof(bsb)) {
2303                         if (verbose)
2304                                 fprintf(stderr, Name ": Cannot read from %s\n", devname);
2305                         continue; /* Cannot read */
2306                 }
2307                 if (memcmp(bsb.magic, "md_backup_data-1", 16) != 0 &&
2308                     memcmp(bsb.magic, "md_backup_data-2", 16) != 0) {
2309                         if (verbose)
2310                                 fprintf(stderr, Name ": No backup metadata on %s\n", devname);
2311                         continue;
2312                 }
2313                 if (bsb.sb_csum != bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum)-((char*)&bsb))) {
2314                         if (verbose)
2315                                 fprintf(stderr, Name ": Bad backup-metadata checksum on %s\n", devname);
2316                         continue; /* bad checksum */
2317                 }
2318                 if (memcmp(bsb.magic, "md_backup_data-2", 16) == 0 &&
2319                     bsb.sb_csum2 != bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum2)-((char*)&bsb))) {
2320                         if (verbose)
2321                                 fprintf(stderr, Name ": Bad backup-metadata checksum2 on %s\n", devname);
2322                         continue; /* Bad second checksum */
2323                 }
2324                 if (memcmp(bsb.set_uuid,info->uuid, 16) != 0) {
2325                         if (verbose)
2326                                 fprintf(stderr, Name ": Wrong uuid on backup-metadata on %s\n", devname);
2327                         continue; /* Wrong uuid */
2328                 }
2329
2330                 /* array utime and backup-mtime should be updated at much the same time, but it seems that
2331                  * sometimes they aren't... So allow considerable flexability in matching, and allow
2332                  * this test to be overridden by an environment variable.
2333                  */
2334                 if (info->array.utime > (int)__le64_to_cpu(bsb.mtime) + 2*60*60 ||
2335                     info->array.utime < (int)__le64_to_cpu(bsb.mtime) - 10*60) {
2336                         if (check_env("MDADM_GROW_ALLOW_OLD")) {
2337                                 fprintf(stderr, Name ": accepting backup with timestamp %lu "
2338                                         "for array with timestamp %lu\n",
2339                                         (unsigned long)__le64_to_cpu(bsb.mtime),
2340                                         (unsigned long)info->array.utime);
2341                         } else {
2342                                 if (verbose)
2343                                         fprintf(stderr, Name ": too-old timestamp on "
2344                                                 "backup-metadata on %s\n", devname);
2345                                 continue; /* time stamp is too bad */
2346                         }
2347                 }
2348
2349                 if (bsb.magic[15] == '1') {
2350                 if (info->delta_disks >= 0) {
2351                         /* reshape_progress is increasing */
2352                         if (__le64_to_cpu(bsb.arraystart) + __le64_to_cpu(bsb.length) <
2353                             info->reshape_progress) {
2354                         nonew:
2355                                 if (verbose)
2356                                         fprintf(stderr, Name ": backup-metadata found on %s but is not needed\n", devname);
2357                                 continue; /* No new data here */
2358                         }
2359                 } else {
2360                         /* reshape_progress is decreasing */
2361                         if (__le64_to_cpu(bsb.arraystart) >=
2362                             info->reshape_progress)
2363                                 goto nonew; /* No new data here */
2364                 }
2365                 } else {
2366                 if (info->delta_disks >= 0) {
2367                         /* reshape_progress is increasing */
2368                         if (__le64_to_cpu(bsb.arraystart) + __le64_to_cpu(bsb.length) <
2369                             info->reshape_progress &&
2370                             __le64_to_cpu(bsb.arraystart2) + __le64_to_cpu(bsb.length2) <
2371                             info->reshape_progress)
2372                                 goto nonew; /* No new data here */
2373                 } else {
2374                         /* reshape_progress is decreasing */
2375                         if (__le64_to_cpu(bsb.arraystart) >=
2376                             info->reshape_progress &&
2377                             __le64_to_cpu(bsb.arraystart2) >=
2378                             info->reshape_progress)
2379                                 goto nonew; /* No new data here */
2380                 }
2381                 }
2382                 if (lseek64(fd, __le64_to_cpu(bsb.devstart)*512, 0)< 0) {
2383                 second_fail:
2384                         if (verbose)
2385                                 fprintf(stderr, Name ": Failed to verify secondary backup-metadata block on %s\n",
2386                                         devname);
2387                         continue; /* Cannot seek */
2388                 }
2389                 /* There should be a duplicate backup superblock 4k before here */
2390                 if (lseek64(fd, -4096, 1) < 0 ||
2391                     read(fd, &bsb2, sizeof(bsb2)) != sizeof(bsb2))
2392                         goto second_fail; /* Cannot find leading superblock */
2393                 if (bsb.magic[15] == '1')
2394                         bsbsize = offsetof(struct mdp_backup_super, pad1);
2395                 else
2396                         bsbsize = offsetof(struct mdp_backup_super, pad);
2397                 if (memcmp(&bsb2, &bsb, bsbsize) != 0)
2398                         goto second_fail; /* Cannot find leading superblock */
2399
2400                 /* Now need the data offsets for all devices. */
2401                 offsets = malloc(sizeof(*offsets)*info->array.raid_disks);
2402                 for(j=0; j<info->array.raid_disks; j++) {
2403                         if (fdlist[j] < 0)
2404                                 continue;
2405                         if (st->ss->load_super(st, fdlist[j], NULL))
2406                                 /* FIXME should be this be an error */
2407                                 continue;
2408                         st->ss->getinfo_super(st, &dinfo, NULL);
2409                         st->ss->free_super(st);
2410                         offsets[j] = dinfo.data_offset * 512;
2411                 }
2412                 printf(Name ": restoring critical section\n");
2413
2414                 if (restore_stripes(fdlist, offsets,
2415                                     info->array.raid_disks,
2416                                     info->new_chunk,
2417                                     info->new_level,
2418                                     info->new_layout,
2419                                     fd, __le64_to_cpu(bsb.devstart)*512,
2420                                     __le64_to_cpu(bsb.arraystart)*512,
2421                                     __le64_to_cpu(bsb.length)*512)) {
2422                         /* didn't succeed, so giveup */
2423                         if (verbose)
2424                                 fprintf(stderr, Name ": Error restoring backup from %s\n",
2425                                         devname);
2426                         return 1;
2427                 }
2428                 
2429                 if (bsb.magic[15] == '2' &&
2430                     restore_stripes(fdlist, offsets,
2431                                     info->array.raid_disks,
2432                                     info->new_chunk,
2433                                     info->new_level,
2434                                     info->new_layout,
2435                                     fd, __le64_to_cpu(bsb.devstart)*512 +
2436                                     __le64_to_cpu(bsb.devstart2)*512,
2437                                     __le64_to_cpu(bsb.arraystart2)*512,
2438                                     __le64_to_cpu(bsb.length2)*512)) {
2439                         /* didn't succeed, so giveup */
2440                         if (verbose)
2441                                 fprintf(stderr, Name ": Error restoring second backup from %s\n",
2442                                         devname);
2443                         return 1;
2444                 }
2445
2446
2447                 /* Ok, so the data is restored. Let's update those superblocks. */
2448
2449                 if (info->delta_disks >= 0) {
2450                         info->reshape_progress = __le64_to_cpu(bsb.arraystart) +
2451                                 __le64_to_cpu(bsb.length);
2452                         if (bsb.magic[15] == '2') {
2453                                 unsigned long long p2 = __le64_to_cpu(bsb.arraystart2) +
2454                                         __le64_to_cpu(bsb.length2);
2455                                 if (p2 > info->reshape_progress)
2456                                         info->reshape_progress = p2;
2457                         }
2458                 } else {
2459                         info->reshape_progress = __le64_to_cpu(bsb.arraystart);
2460                         if (bsb.magic[15] == '2') {
2461                                 unsigned long long p2 = __le64_to_cpu(bsb.arraystart2);
2462                                 if (p2 < info->reshape_progress)
2463                                         info->reshape_progress = p2;
2464                         }
2465                 }
2466                 for (j=0; j<info->array.raid_disks; j++) {
2467                         if (fdlist[j] < 0) continue;
2468                         if (st->ss->load_super(st, fdlist[j], NULL))
2469                                 continue;
2470                         st->ss->getinfo_super(st, &dinfo, NULL);
2471                         dinfo.reshape_progress = info->reshape_progress;
2472                         st->ss->update_super(st, &dinfo,
2473                                              "_reshape_progress",
2474                                              NULL,0, 0, NULL);
2475                         st->ss->store_super(st, fdlist[j]);
2476                         st->ss->free_super(st);
2477                 }
2478                 return 0;
2479         }
2480         /* Didn't find any backup data, try to see if any
2481          * was needed.
2482          */
2483         if (info->delta_disks < 0) {
2484                 /* When shrinking, the critical section is at the end.
2485                  * So see if we are before the critical section.
2486                  */
2487                 unsigned long long first_block;
2488                 nstripe = ostripe = 0;
2489                 first_block = 0;
2490                 while (ostripe >= nstripe) {
2491                         ostripe += info->array.chunk_size / 512;
2492                         first_block = ostripe * odata;
2493                         nstripe = first_block / ndata / (info->new_chunk/512) *
2494                                 (info->new_chunk/512);
2495                 }
2496
2497                 if (info->reshape_progress >= first_block)
2498                         return 0;
2499         }
2500         if (info->delta_disks > 0) {
2501                 /* See if we are beyond the critical section. */
2502                 unsigned long long last_block;
2503                 nstripe = ostripe = 0;
2504                 last_block = 0;
2505                 while (nstripe >= ostripe) {
2506                         nstripe += info->new_chunk / 512;
2507                         last_block = nstripe * ndata;
2508                         ostripe = last_block / odata / (info->array.chunk_size/512) *
2509                                 (info->array.chunk_size/512);
2510                 }
2511
2512                 if (info->reshape_progress >= last_block)
2513                         return 0;
2514         }
2515         /* needed to recover critical section! */
2516         if (verbose)
2517                 fprintf(stderr, Name ": Failed to find backup of critical section\n");
2518         return 1;
2519 }
2520
2521 int Grow_continue(int mdfd, struct supertype *st, struct mdinfo *info,
2522                   char *backup_file)
2523 {
2524         /* Array is assembled and ready to be started, but
2525          * monitoring is probably required.
2526          * So:
2527          *   - start read-only
2528          *   - set upper bound for resync
2529          *   - initialise the 'suspend' boundaries
2530          *   - switch to read-write
2531          *   - fork and continue monitoring
2532          */
2533         int err;
2534         int backup_list[1];
2535         unsigned long long backup_offsets[1];
2536         int odisks, ndisks, ochunk, nchunk,odata,ndata;
2537         unsigned long a,b,blocks,stripes;
2538         int backup_fd;
2539         int *fds;
2540         unsigned long long *offsets;
2541         int d;
2542         struct mdinfo *sra, *sd;
2543         int rv;
2544         unsigned long cache;
2545         int done = 0;
2546
2547         err = sysfs_set_str(info, NULL, "array_state", "readonly");
2548         if (err)
2549                 return err;
2550
2551         /* make sure reshape doesn't progress until we are ready */
2552         sysfs_set_str(info, NULL, "sync_max", "0");
2553         sysfs_set_str(info, NULL, "array_state", "active"); /* FIXME or clean */
2554
2555         sra = sysfs_read(-1, devname2devnum(info->sys_name),
2556                          GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE|
2557                          GET_CACHE);
2558         if (!sra)
2559                 return 1;
2560
2561         /* ndisks is not growing, so raid_disks is old and +delta is new */
2562         odisks = info->array.raid_disks;
2563         ndisks = odisks + info->delta_disks;
2564         odata = odisks - 1;
2565         ndata = ndisks - 1;
2566         if (info->array.level == 6) {
2567                 odata--;
2568                 ndata--;
2569         }
2570         ochunk = info->array.chunk_size;
2571         nchunk = info->new_chunk;
2572
2573         a = (ochunk/512) * odata;
2574         b = (nchunk/512) * ndata;
2575         /* Find GCD */
2576         while (a != b) {
2577                 if (a < b)
2578                         b -= a;
2579                 if (b < a)
2580                         a -= b;
2581         }
2582         /* LCM == product / GCD */
2583         blocks = (ochunk/512) * (nchunk/512) * odata * ndata / a;
2584
2585         if (ndata == odata)
2586                 while (blocks * 32 < sra->component_size &&
2587                        blocks < 16*1024*2)
2588                         blocks *= 2;
2589         stripes = blocks / (info->array.chunk_size/512) / odata;
2590
2591         /* check that the internal stripe cache is
2592          * large enough, or it won't work.
2593          */
2594         cache = (nchunk < ochunk) ? ochunk : nchunk;
2595         cache = cache * 4 / 4096;
2596         if (cache < blocks / 8 / odisks + 16)
2597                 /* Make it big enough to hold 'blocks' */
2598                 cache = blocks / 8 / odisks + 16;
2599         if (sra->cache_size < cache)
2600                 sysfs_set_num(sra, NULL, "stripe_cache_size",
2601                               cache+1);
2602
2603         memset(&bsb, 0, 512);
2604         memcpy(bsb.magic, "md_backup_data-1", 16);
2605         memcpy(&bsb.set_uuid, info->uuid, 16);
2606         bsb.mtime = __cpu_to_le64(time(0));
2607         bsb.devstart2 = blocks;
2608
2609         backup_fd = open(backup_file, O_RDWR|O_CREAT, S_IRUSR | S_IWUSR);
2610         backup_list[0] = backup_fd;
2611         backup_offsets[0] = 8 * 512;
2612         fds = malloc(odisks * sizeof(fds[0]));
2613         offsets = malloc(odisks * sizeof(offsets[0]));
2614         for (d=0; d<odisks; d++)
2615                 fds[d] = -1;
2616
2617         for (sd = sra->devs; sd; sd = sd->next) {
2618                 if (sd->disk.state & (1<<MD_DISK_FAULTY))
2619                         continue;
2620                 if (sd->disk.state & (1<<MD_DISK_SYNC)) {
2621                         char *dn = map_dev(sd->disk.major,
2622                                            sd->disk.minor, 1);
2623                         fds[sd->disk.raid_disk]
2624                                 = dev_open(dn, O_RDONLY);
2625                         offsets[sd->disk.raid_disk] = sd->data_offset*512;
2626                         if (fds[sd->disk.raid_disk] < 0) {
2627                                 fprintf(stderr, Name ": %s: cannot open component %s\n",
2628                                         info->sys_name, dn?dn:"-unknown-");
2629                                 rv = 1;
2630                                 goto release;
2631                         }
2632                         free(dn);
2633                 }
2634         }
2635
2636         switch(fork()) {
2637         case 0:
2638                 close(mdfd);
2639                 mlockall(MCL_FUTURE);
2640                 if (info->delta_disks < 0)
2641                         done = child_shrink(-1, info, stripes,
2642                                             fds, offsets,
2643                                             info->array.raid_disks,
2644                                             info->array.chunk_size,
2645                                             info->array.level, info->array.layout,
2646                                             odata,
2647                                             1, backup_list, backup_offsets);
2648                 else if (info->delta_disks == 0) {
2649                         /* The 'start' is a per-device stripe number.
2650                          * reshape_progress is a per-array sector number.
2651                          * So divide by ndata * chunk_size
2652                          */
2653                         unsigned long long start = info->reshape_progress / ndata;
2654                         start /= (info->array.chunk_size/512);
2655                         done = child_same_size(-1, info, stripes,
2656                                                fds, offsets,
2657                                                start,
2658                                                info->array.raid_disks,
2659                                                info->array.chunk_size,
2660                                                info->array.level, info->array.layout,
2661                                                odata,
2662                                                1, backup_list, backup_offsets);
2663                 }
2664                 if (backup_file && done)
2665                         unlink(backup_file);
2666                 /* FIXME should I intuit a level change */
2667                 exit(0);
2668         case -1:
2669                 fprintf(stderr, Name ": Cannot run child to continue monitoring reshape: %s\n",
2670                         strerror(errno));
2671                 return 1;
2672         default:
2673                 break;
2674         }
2675 release:
2676         return 0;
2677 }
2678
2679