]> git.ipfire.org Git - thirdparty/mdadm.git/blob - Grow.c
Grow: support restart of new migrations.
[thirdparty/mdadm.git] / Grow.c
1 /*
2 * mdadm - manage Linux "md" devices aka RAID arrays.
3 *
4 * Copyright (C) 2001-2006 Neil Brown <neilb@suse.de>
5 *
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 *
21 * Author: Neil Brown
22 * Email: <neilb@cse.unsw.edu.au>
23 * Paper: Neil Brown
24 * School of Computer Science and Engineering
25 * The University of New South Wales
26 * Sydney, 2052
27 * Australia
28 */
29 #include "mdadm.h"
30 #include "dlink.h"
31 #include <sys/mman.h>
32
33 #if ! defined(__BIG_ENDIAN) && ! defined(__LITTLE_ENDIAN)
34 #error no endian defined
35 #endif
36 #include "md_u.h"
37 #include "md_p.h"
38
39 #ifndef offsetof
40 #define offsetof(t,f) ((size_t)&(((t*)0)->f))
41 #endif
42
43 int Grow_Add_device(char *devname, int fd, char *newdev)
44 {
45 /* Add a device to an active array.
46 * Currently, just extend a linear array.
47 * This requires writing a new superblock on the
48 * new device, calling the kernel to add the device,
49 * and if that succeeds, update the superblock on
50 * all other devices.
51 * This means that we need to *find* all other devices.
52 */
53 struct mdinfo info;
54
55 struct stat stb;
56 int nfd, fd2;
57 int d, nd;
58 struct supertype *st = NULL;
59
60
61 if (ioctl(fd, GET_ARRAY_INFO, &info.array) < 0) {
62 fprintf(stderr, Name ": cannot get array info for %s\n", devname);
63 return 1;
64 }
65
66 st = super_by_fd(fd);
67 if (!st) {
68 fprintf(stderr, Name ": cannot handle arrays with superblock version %d\n", info.array.major_version);
69 return 1;
70 }
71
72 if (info.array.level != -1) {
73 fprintf(stderr, Name ": can only add devices to linear arrays\n");
74 return 1;
75 }
76
77 nfd = open(newdev, O_RDWR|O_EXCL|O_DIRECT);
78 if (nfd < 0) {
79 fprintf(stderr, Name ": cannot open %s\n", newdev);
80 return 1;
81 }
82 fstat(nfd, &stb);
83 if ((stb.st_mode & S_IFMT) != S_IFBLK) {
84 fprintf(stderr, Name ": %s is not a block device!\n", newdev);
85 close(nfd);
86 return 1;
87 }
88 /* now check out all the devices and make sure we can read the superblock */
89 for (d=0 ; d < info.array.raid_disks ; d++) {
90 mdu_disk_info_t disk;
91 char *dv;
92
93 disk.number = d;
94 if (ioctl(fd, GET_DISK_INFO, &disk) < 0) {
95 fprintf(stderr, Name ": cannot get device detail for device %d\n",
96 d);
97 return 1;
98 }
99 dv = map_dev(disk.major, disk.minor, 1);
100 if (!dv) {
101 fprintf(stderr, Name ": cannot find device file for device %d\n",
102 d);
103 return 1;
104 }
105 fd2 = dev_open(dv, O_RDWR);
106 if (!fd2) {
107 fprintf(stderr, Name ": cannot open device file %s\n", dv);
108 return 1;
109 }
110 st->ss->free_super(st);
111
112 if (st->ss->load_super(st, fd2, NULL)) {
113 fprintf(stderr, Name ": cannot find super block on %s\n", dv);
114 close(fd2);
115 return 1;
116 }
117 close(fd2);
118 }
119 /* Ok, looks good. Lets update the superblock and write it out to
120 * newdev.
121 */
122
123 info.disk.number = d;
124 info.disk.major = major(stb.st_rdev);
125 info.disk.minor = minor(stb.st_rdev);
126 info.disk.raid_disk = d;
127 info.disk.state = (1 << MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE);
128 st->ss->update_super(st, &info, "linear-grow-new", newdev,
129 0, 0, NULL);
130
131 if (st->ss->store_super(st, nfd)) {
132 fprintf(stderr, Name ": Cannot store new superblock on %s\n",
133 newdev);
134 close(nfd);
135 return 1;
136 }
137 close(nfd);
138
139 if (ioctl(fd, ADD_NEW_DISK, &info.disk) != 0) {
140 fprintf(stderr, Name ": Cannot add new disk to this array\n");
141 return 1;
142 }
143 /* Well, that seems to have worked.
144 * Now go through and update all superblocks
145 */
146
147 if (ioctl(fd, GET_ARRAY_INFO, &info.array) < 0) {
148 fprintf(stderr, Name ": cannot get array info for %s\n", devname);
149 return 1;
150 }
151
152 nd = d;
153 for (d=0 ; d < info.array.raid_disks ; d++) {
154 mdu_disk_info_t disk;
155 char *dv;
156
157 disk.number = d;
158 if (ioctl(fd, GET_DISK_INFO, &disk) < 0) {
159 fprintf(stderr, Name ": cannot get device detail for device %d\n",
160 d);
161 return 1;
162 }
163 dv = map_dev(disk.major, disk.minor, 1);
164 if (!dv) {
165 fprintf(stderr, Name ": cannot find device file for device %d\n",
166 d);
167 return 1;
168 }
169 fd2 = dev_open(dv, O_RDWR);
170 if (fd2 < 0) {
171 fprintf(stderr, Name ": cannot open device file %s\n", dv);
172 return 1;
173 }
174 if (st->ss->load_super(st, fd2, NULL)) {
175 fprintf(stderr, Name ": cannot find super block on %s\n", dv);
176 close(fd);
177 return 1;
178 }
179 info.array.raid_disks = nd+1;
180 info.array.nr_disks = nd+1;
181 info.array.active_disks = nd+1;
182 info.array.working_disks = nd+1;
183
184 st->ss->update_super(st, &info, "linear-grow-update", dv,
185 0, 0, NULL);
186
187 if (st->ss->store_super(st, fd2)) {
188 fprintf(stderr, Name ": Cannot store new superblock on %s\n", dv);
189 close(fd2);
190 return 1;
191 }
192 close(fd2);
193 }
194
195 return 0;
196 }
197
198 int Grow_addbitmap(char *devname, int fd, char *file, int chunk, int delay, int write_behind, int force)
199 {
200 /*
201 * First check that array doesn't have a bitmap
202 * Then create the bitmap
203 * Then add it
204 *
205 * For internal bitmaps, we need to check the version,
206 * find all the active devices, and write the bitmap block
207 * to all devices
208 */
209 mdu_bitmap_file_t bmf;
210 mdu_array_info_t array;
211 struct supertype *st;
212 int major = BITMAP_MAJOR_HI;
213 int vers = md_get_version(fd);
214 unsigned long long bitmapsize, array_size;
215
216 if (vers < 9003) {
217 major = BITMAP_MAJOR_HOSTENDIAN;
218 #ifdef __BIG_ENDIAN
219 fprintf(stderr, Name ": Warning - bitmaps created on this kernel are not portable\n"
220 " between different architectured. Consider upgrading the Linux kernel.\n");
221 #endif
222 }
223
224 if (ioctl(fd, GET_BITMAP_FILE, &bmf) != 0) {
225 if (errno == ENOMEM)
226 fprintf(stderr, Name ": Memory allocation failure.\n");
227 else
228 fprintf(stderr, Name ": bitmaps not supported by this kernel.\n");
229 return 1;
230 }
231 if (bmf.pathname[0]) {
232 if (strcmp(file,"none")==0) {
233 if (ioctl(fd, SET_BITMAP_FILE, -1)!= 0) {
234 fprintf(stderr, Name ": failed to remove bitmap %s\n",
235 bmf.pathname);
236 return 1;
237 }
238 return 0;
239 }
240 fprintf(stderr, Name ": %s already has a bitmap (%s)\n",
241 devname, bmf.pathname);
242 return 1;
243 }
244 if (ioctl(fd, GET_ARRAY_INFO, &array) != 0) {
245 fprintf(stderr, Name ": cannot get array status for %s\n", devname);
246 return 1;
247 }
248 if (array.state & (1<<MD_SB_BITMAP_PRESENT)) {
249 if (strcmp(file, "none")==0) {
250 array.state &= ~(1<<MD_SB_BITMAP_PRESENT);
251 if (ioctl(fd, SET_ARRAY_INFO, &array)!= 0) {
252 fprintf(stderr, Name ": failed to remove internal bitmap.\n");
253 return 1;
254 }
255 return 0;
256 }
257 fprintf(stderr, Name ": Internal bitmap already present on %s\n",
258 devname);
259 return 1;
260 }
261 if (array.level <= 0) {
262 fprintf(stderr, Name ": Bitmaps not meaningful with level %s\n",
263 map_num(pers, array.level)?:"of this array");
264 return 1;
265 }
266 bitmapsize = array.size;
267 bitmapsize <<= 1;
268 if (get_dev_size(fd, NULL, &array_size) &&
269 array_size > (0x7fffffffULL<<9)) {
270 /* Array is big enough that we cannot trust array.size
271 * try other approaches
272 */
273 bitmapsize = get_component_size(fd);
274 }
275 if (bitmapsize == 0) {
276 fprintf(stderr, Name ": Cannot reliably determine size of array to create bitmap - sorry.\n");
277 return 1;
278 }
279
280 if (array.level == 10) {
281 int ncopies = (array.layout&255)*((array.layout>>8)&255);
282 bitmapsize = bitmapsize * array.raid_disks / ncopies;
283 }
284
285 st = super_by_fd(fd);
286 if (!st) {
287 fprintf(stderr, Name ": Cannot understand version %d.%d\n",
288 array.major_version, array.minor_version);
289 return 1;
290 }
291 if (strcmp(file, "none") == 0) {
292 fprintf(stderr, Name ": no bitmap found on %s\n", devname);
293 return 1;
294 } else if (strcmp(file, "internal") == 0) {
295 int d;
296 for (d=0; d< st->max_devs; d++) {
297 mdu_disk_info_t disk;
298 char *dv;
299 disk.number = d;
300 if (ioctl(fd, GET_DISK_INFO, &disk) < 0)
301 continue;
302 if (disk.major == 0 &&
303 disk.minor == 0)
304 continue;
305 if ((disk.state & (1<<MD_DISK_SYNC))==0)
306 continue;
307 dv = map_dev(disk.major, disk.minor, 1);
308 if (dv) {
309 int fd2 = dev_open(dv, O_RDWR);
310 if (fd2 < 0)
311 continue;
312 if (st->ss->load_super(st, fd2, NULL)==0) {
313 if (st->ss->add_internal_bitmap(
314 st,
315 &chunk, delay, write_behind,
316 bitmapsize, 0, major)
317 )
318 st->ss->write_bitmap(st, fd2);
319 else {
320 fprintf(stderr, Name ": failed to create internal bitmap - chunksize problem.\n");
321 close(fd2);
322 return 1;
323 }
324 }
325 close(fd2);
326 }
327 }
328 array.state |= (1<<MD_SB_BITMAP_PRESENT);
329 if (ioctl(fd, SET_ARRAY_INFO, &array)!= 0) {
330 fprintf(stderr, Name ": failed to set internal bitmap.\n");
331 return 1;
332 }
333 } else {
334 int uuid[4];
335 int bitmap_fd;
336 int d;
337 int max_devs = st->max_devs;
338
339 /* try to load a superblock */
340 for (d=0; d<max_devs; d++) {
341 mdu_disk_info_t disk;
342 char *dv;
343 int fd2;
344 disk.number = d;
345 if (ioctl(fd, GET_DISK_INFO, &disk) < 0)
346 continue;
347 if ((disk.major==0 && disk.minor==0) ||
348 (disk.state & (1<<MD_DISK_REMOVED)))
349 continue;
350 dv = map_dev(disk.major, disk.minor, 1);
351 if (!dv) continue;
352 fd2 = dev_open(dv, O_RDONLY);
353 if (fd2 >= 0 &&
354 st->ss->load_super(st, fd2, NULL) == 0) {
355 close(fd2);
356 st->ss->uuid_from_super(st, uuid);
357 break;
358 }
359 close(fd2);
360 }
361 if (d == max_devs) {
362 fprintf(stderr, Name ": cannot find UUID for array!\n");
363 return 1;
364 }
365 if (CreateBitmap(file, force, (char*)uuid, chunk,
366 delay, write_behind, bitmapsize, major)) {
367 return 1;
368 }
369 bitmap_fd = open(file, O_RDWR);
370 if (bitmap_fd < 0) {
371 fprintf(stderr, Name ": weird: %s cannot be opened\n",
372 file);
373 return 1;
374 }
375 if (ioctl(fd, SET_BITMAP_FILE, bitmap_fd) < 0) {
376 fprintf(stderr, Name ": Cannot set bitmap file for %s: %s\n",
377 devname, strerror(errno));
378 return 1;
379 }
380 }
381
382 return 0;
383 }
384
385
/*
 * When reshaping an array we might need to backup some data.
 * This is written to all spares with a 'super_block' describing it.
 * The superblock goes 1K from the end of the used space on the
 * device.
 * It is written after the backup is complete.
 * It has the following structure.
 */
394
395 struct mdp_backup_super {
396 char magic[16]; /* md_backup_data-1 or -2 */
397 __u8 set_uuid[16];
398 __u64 mtime;
399 /* start/sizes in 512byte sectors */
400 __u64 devstart; /* address on backup device/file of data */
401 __u64 arraystart;
402 __u64 length;
403 __u32 sb_csum; /* csum of preceeding bytes. */
404 __u32 pad1;
405 __u64 devstart2; /* offset in to data of second section */
406 __u64 arraystart2;
407 __u64 length2;
408 __u32 sb_csum2; /* csum of preceeding bytes. */
409 __u8 pad[512-68-32];
410 } __attribute__((aligned(512))) bsb;
411
/*
 * bsb_csum - checksum used for the backup superblock.
 *
 * buf: start of the bytes to be covered.
 * len: number of iterations to run; nothing is read when len <= 0.
 *
 * Returns the accumulated value passed through __cpu_to_le32().
 *
 * The sum is accumulated in unsigned arithmetic so that the repeated
 * shift-and-add wraps modulo 2^32 instead of overflowing a signed int
 * (undefined behaviour).  The resulting bit pattern is the one the signed
 * computation yields on two's-complement machines.
 *
 * NOTE(review): every iteration adds buf[0], not buf[i], so the result only
 * depends on the first byte and on len.  That looks unintentional, but it
 * determines the value stored in backup superblocks, so it is preserved here
 * - confirm compatibility requirements before changing it.
 */
int bsb_csum(char *buf, int len)
{
	int i;
	unsigned int csum = 0;

	for (i = 0; i < len; i++)
		csum = (csum << 3) + (unsigned int)buf[0];
	return __cpu_to_le32((int)csum);
}
420
/*
 * Forward declarations of the three backup workers that Grow_reshape()
 * selects between in its forked child; they are defined elsewhere in the
 * file.  child_same_size() takes one extra argument, 'start'.
 */
static int child_grow(int afd, struct mdinfo *sra,
		      unsigned long blocks,
		      int *fds,
		      unsigned long long *offsets,
		      int disks, int chunk,
		      int level, int layout, int data,
		      int dests, int *destfd,
		      unsigned long long *destoffsets);

static int child_shrink(int afd, struct mdinfo *sra,
			unsigned long blocks,
			int *fds,
			unsigned long long *offsets,
			int disks, int chunk,
			int level, int layout, int data,
			int dests, int *destfd,
			unsigned long long *destoffsets);

static int child_same_size(int afd, struct mdinfo *sra,
			   unsigned long blocks,
			   int *fds,
			   unsigned long long *offsets,
			   unsigned long long start,
			   int disks, int chunk,
			   int level, int layout, int data,
			   int dests, int *destfd,
			   unsigned long long *destoffsets);
434
/*
 * freeze_array - ask the kernel to hold off resync/recovery on an array.
 *
 * Reads the array's "sync_action" attribute and, when it reports "idle" or
 * "frozen", writes "frozen" to it.
 *
 * Returns:
 *   -1  the array is busy (sync_action is neither idle nor frozen),
 *    0  the attribute could not be read or written (kernel presumably
 *       does not support 'frozen'),
 *    1  the array is now frozen.
 */
int freeze_array(struct mdinfo *sra)
{
	char state[20];
	int is_idle;
	int is_frozen;

	if (sysfs_get_str(sra, NULL, "sync_action", state, 20) <= 0)
		return 0;

	/* the attribute text carries its trailing newline */
	is_idle = (strcmp(state, "idle\n") == 0);
	is_frozen = (strcmp(state, "frozen\n") == 0);
	if (!is_idle && !is_frozen)
		return -1;

	return (sysfs_set_str(sra, NULL, "sync_action", "frozen") < 0) ? 0 : 1;
}
452
453 void unfreeze_array(struct mdinfo *sra, int frozen)
454 {
455 /* If 'frozen' is 1, unfreeze the array */
456 if (frozen > 0)
457 sysfs_set_str(sra, NULL, "sync_action", "idle");
458 }
459
/*
 * wait_reshape - block until the array's sync_action no longer starts
 * with "reshape".
 *
 * The sync_action attribute is watched through select()'s exceptional
 * condition set and re-read after every wake-up.  The function returns
 * silently if the attribute cannot be opened or read.
 */
void wait_reshape(struct mdinfo *sra)
{
	int fd = sysfs_get_fd(sra, NULL, "sync_action");
	char action[20];

	/* FD_SET() on a negative descriptor is undefined; nothing to wait on.
	 * (assumes sysfs_get_fd() reports failure with a negative value)
	 */
	if (fd < 0)
		return;

	do {
		fd_set rfds;
		FD_ZERO(&rfds);
		FD_SET(fd, &rfds);
		select(fd+1, NULL, NULL, &rfds, NULL);

		if (sysfs_fd_get_str(fd, action, 20) < 0) {
			close(fd);
			return;
		}
	} while (strncmp(action, "reshape", 7) == 0);

	/* the reshape is over: do not leak the attribute descriptor */
	close(fd);
}
477
478
479 int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
480 long long size,
481 int level, char *layout_str, int chunksize, int raid_disks)
482 {
483 /* Make some changes in the shape of an array.
484 * The kernel must support the change.
485 *
486 * There are three different changes. Each can trigger
487 * a resync or recovery so we freeze that until we have
488 * requested everything (if kernel supports freezing - 2.6.30).
489 * The steps are:
490 * - change size (i.e. component_size)
491 * - change level
492 * - change layout/chunksize/ndisks
493 *
494 * The last can require a reshape. It is different on different
495 * levels so we need to check the level before actioning it.
496 * Some times the level change needs to be requested after the
497 * reshape (e.g. raid6->raid5, raid5->raid0)
498 *
499 */
500 struct mdu_array_info_s array, orig;
501 char *c;
502 int rv = 0;
503 struct supertype *st;
504
505 int nchunk, ochunk;
506 int nlayout, olayout;
507 int ndisks, odisks;
508 int ndata, odata;
509 int orig_level = UnSet;
510 char alt_layout[40];
511 int *fdlist;
512 unsigned long long *offsets;
513 int d, i;
514 int nrdisks;
515 int err;
516 int frozen;
517 unsigned long a,b, blocks, stripes;
518 int cache;
519 unsigned long long array_size;
520 int changed = 0;
521 int done;
522
523 struct mdinfo *sra;
524 struct mdinfo *sd;
525
526 if (ioctl(fd, GET_ARRAY_INFO, &array) < 0) {
527 fprintf(stderr, Name ": %s is not an active md array - aborting\n",
528 devname);
529 return 1;
530 }
531 sra = sysfs_read(fd, 0, GET_LEVEL);
532 frozen = freeze_array(sra);
533 if (frozen < 0) {
534 fprintf(stderr, Name ": %s is performing resync/recovery and cannot"
535 " be reshaped\n", devname);
536 return 1;
537 }
538
539 /* ========= set size =============== */
540 if (size >= 0 && (size == 0 || size != array.size)) {
541 array.size = size;
542 if (array.size != size) {
543 /* got truncated to 32bit, write to
544 * component_size instead
545 */
546 if (sra)
547 rv = sysfs_set_num(sra, NULL,
548 "component_size", size);
549 else
550 rv = -1;
551 } else
552 rv = ioctl(fd, SET_ARRAY_INFO, &array);
553 if (rv != 0) {
554 fprintf(stderr, Name ": Cannot set device size for %s: %s\n",
555 devname, strerror(errno));
556 rv = 1;
557 goto release;
558 }
559 ioctl(fd, GET_ARRAY_INFO, &array);
560 if (!quiet)
561 fprintf(stderr, Name ": component size of %s has been set to %dK\n",
562 devname, array.size);
563 changed = 1;
564 }
565
566 /* ======= set level =========== */
567 if (level != UnSet && level != array.level) {
568 /* Trying to change the level.
569 * We might need to change layout first and schedule a
570 * level change for later.
571 * Level changes that can happen immediately are:
572 * 0->4,5,6 1->5 4->5,6 5->1,6
573 * Level changes that need a layout change first are:
574 * 6->5,4,0 : need a -6 layout, or parity-last
575 * 5->4,0 : need parity-last
576 */
577 if ((array.level == 6 || array.level == 5) &&
578 (level == 5 || level == 4 || level == 0)) {
579 /* Don't change level yet, but choose intermediate
580 * layout
581 */
582 if (level == 5) {
583 if (layout_str == NULL)
584 switch (array.layout) {
585 case ALGORITHM_LEFT_ASYMMETRIC:
586 case ALGORITHM_LEFT_ASYMMETRIC_6:
587 case ALGORITHM_ROTATING_N_RESTART:
588 layout_str = "left-asymmetric-6";
589 break;
590 case ALGORITHM_LEFT_SYMMETRIC:
591 case ALGORITHM_LEFT_SYMMETRIC_6:
592 case ALGORITHM_ROTATING_N_CONTINUE:
593 layout_str = "left-symmetric-6";
594 break;
595 case ALGORITHM_RIGHT_ASYMMETRIC:
596 case ALGORITHM_RIGHT_ASYMMETRIC_6:
597 case ALGORITHM_ROTATING_ZERO_RESTART:
598 layout_str = "right-asymmetric-6";
599 break;
600 case ALGORITHM_RIGHT_SYMMETRIC:
601 case ALGORITHM_RIGHT_SYMMETRIC_6:
602 layout_str = "right-symmetric-6";
603 break;
604 case ALGORITHM_PARITY_0:
605 case ALGORITHM_PARITY_0_6:
606 layout_str = "parity-first-6";
607 break;
608 case ALGORITHM_PARITY_N:
609 layout_str = "parity-last";
610 break;
611 default:
612 fprintf(stderr, Name ": %s: cannot"
613 "convert layout to RAID5 equivalent\n",
614 devname);
615 rv = 1;
616 goto release;
617 }
618 else {
619 int l = map_name(r5layout, layout_str);
620 if (l == UnSet) {
621 fprintf(stderr, Name ": %s: layout '%s' not recognised\n",
622 devname, layout_str);
623 rv = 1;
624 goto release;
625 }
626 if (l != ALGORITHM_PARITY_N) {
627 /* need the -6 version */
628 char *ls = map_num(r5layout, l);
629 strcat(strcpy(alt_layout, ls),
630 "-6");
631 layout_str = alt_layout;
632 }
633 }
634 if (raid_disks)
635 /* The find raid6->raid5 conversion
636 * will reduce the number of disks,
637 * so now we need to aim higher
638 */
639 raid_disks++;
640 } else
641 layout_str = "parity-last";
642 } else {
643 c = map_num(pers, level);
644 if (c == NULL)
645 return 1;/* not possible */
646 err = sysfs_set_str(sra, NULL, "level", c);
647 if (err) {
648 fprintf(stderr, Name ": %s: could not set level to %s\n",
649 devname, c);
650 rv = 1;
651 goto release;
652 }
653 orig = array;
654 orig_level = orig.level;
655 ioctl(fd, GET_ARRAY_INFO, &array);
656 if (layout_str == NULL &&
657 orig.level == 5 && level == 6 &&
658 array.layout != orig.layout)
659 layout_str = map_num(r5layout, orig.layout);
660 if (!quiet)
661 fprintf(stderr, Name " level of %s changed to %s\n",
662 devname, c);
663 changed = 1;
664 }
665 }
666
667 /* ========= set shape (chunk_size / layout / ndisks) ============== */
668 /* Check if layout change is a no-op */
669 if (layout_str) switch(array.level) {
670 case 5:
671 if (array.layout == map_name(r5layout, layout_str))
672 layout_str = NULL;
673 break;
674 case 6:
675 if (layout_str == NULL &&
676 ((chunksize && chunksize * 1024 != array.chunk_size) ||
677 (raid_disks && raid_disks != array.raid_disks)) &&
678 array.layout >= 16) {
679 fprintf(stderr, Name
680 ": %s has a non-standard layout. If you wish to preserve this\n"
681 " during the reshape, please specify --layout=preserve\n"
682 " If you want to change it, specify a layout or use --layout=normalise\n",
683 devname);
684 rv = 1;
685 goto release;
686 }
687 if (strcmp(layout_str, "normalise") == 0 ||
688 strcmp(layout_str, "normalize") == 0) {
689 char *hyphen;
690 strcpy(alt_layout, map_num(r6layout, array.layout));
691 hyphen = strrchr(alt_layout, '-');
692 if (hyphen && strcmp(hyphen, "-6") == 0) {
693 *hyphen = 0;
694 layout_str = alt_layout;
695 }
696 }
697
698 if (array.layout == map_name(r6layout, layout_str))
699 layout_str = NULL;
700 if (layout_str && strcmp(layout_str, "preserve") == 0)
701 layout_str = NULL;
702 break;
703 }
704 if (layout_str == NULL
705 && (chunksize == 0 || chunksize*1024 == array.chunk_size)
706 && (raid_disks == 0 || raid_disks == array.raid_disks)) {
707 rv = 0;
708 if (level != UnSet && level != array.level) {
709 /* Looks like this level change doesn't need
710 * a reshape after all.
711 */
712 c = map_num(pers, level);
713 if (c) {
714 rv = sysfs_set_str(sra, NULL, "level", c);
715 if (rv)
716 fprintf(stderr, Name ": %s: could not set level to %s\n",
717 devname, c);
718 }
719 } else if (!changed && !quiet)
720 fprintf(stderr, Name ": %s: no change requested\n",
721 devname);
722 goto release;
723 }
724
725 c = map_num(pers, array.level);
726 if (c == NULL) c = "-unknown-";
727 switch(array.level) {
728 default: /* raid0, linear, multipath cannot be reconfigured */
729 fprintf(stderr, Name ": %s array %s cannot be reshaped.\n",
730 c, devname);
731 rv = 1;
732 break;
733
734 case LEVEL_FAULTY: /* only 'layout' change is permitted */
735
736 if (chunksize || raid_disks) {
737 fprintf(stderr, Name ": %s: Cannot change chunksize or disks of a 'faulty' array\n",
738 devname);
739 rv = 1;
740 break;
741 }
742 if (layout_str == NULL)
743 break; /* nothing to do.... */
744
745 array.layout = parse_layout_faulty(layout_str);
746 if (array.layout < 0) {
747 int rv;
748 fprintf(stderr, Name ": %s: layout %s not understood for 'faulty' array\n",
749 devname, layout_str);
750 rv = 1;
751 break;
752 }
753 if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) {
754 fprintf(stderr, Name ": Cannot set layout for %s: %s\n",
755 devname, strerror(errno));
756 rv = 1;
757 } else if (!quiet)
758 printf("layout for %s set to %d\n", devname, array.layout);
759 break;
760
761 case 1: /* only raid_disks can each be changed. */
762
763 if (chunksize || layout_str != NULL) {
764 fprintf(stderr, Name ": %s: Cannot change chunk size or layout for a RAID1 array.\n",
765 devname);
766 rv = 1;
767 break;
768 }
769 if (raid_disks > 0) {
770 array.raid_disks = raid_disks;
771 if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) {
772 fprintf(stderr, Name ": Cannot set raid-devices for %s: %s\n",
773 devname, strerror(errno));
774 rv = 1;
775 }
776 }
777 break;
778
779 case 4:
780 case 5:
781 case 6:
782
783 /*
784 * layout/chunksize/raid_disks can be changed
785 * though the kernel may not support it all.
786 */
787 st = super_by_fd(fd);
788
789 /*
790 * There are three possibilities.
791 * 1/ The array will shrink.
792 * We need to ensure the reshape will pause before reaching
793 * the 'critical section'. We also need to fork and wait for
794 * that to happen. When it does we
795 * suspend/backup/complete/unfreeze
796 *
797 * 2/ The array will not change size.
798 * This requires that we keep a backup of a sliding window
799 * so that we can restore data after a crash. So we need
800 * to fork and monitor progress.
801 *
802 * 3/ The array will grow. This is relatively easy.
803 * However the kernel's restripe routines will cheerfully
804 * overwrite some early data before it is safe. So we
805 * need to make a backup of the early parts of the array
806 * and be ready to restore it if rebuild aborts very early.
807 *
808 * We backup data by writing it to one spare, or to a
809 * file which was given on command line.
810 *
811 * [FOLLOWING IS OLD AND PARTLY WRONG]
812 * So: we enumerate the devices in the array and
813 * make sure we can open all of them.
814 * Then we freeze the early part of the array and
815 * backup to the various spares.
816 * Then we request changes and start the reshape.
817 * Monitor progress until it has passed the danger zone.
818 * and finally invalidate the copied data and unfreeze the
819 * start of the array.
820 *
821 * In each case, we first make sure that storage is available
822 * for the required backup.
823 * Then we:
824 * - request the shape change.
825 * - for to handle backup etc.
826 */
827 nchunk = ochunk = array.chunk_size;
828 nlayout = olayout = array.layout;
829 ndisks = odisks = array.raid_disks;
830
831 if (chunksize) {
832 nchunk = chunksize * 1024;
833 if (array.size % chunksize) {
834 fprintf(stderr, Name ": component size %dK is not"
835 " a multiple of chunksize %dK\n",
836 array.size, chunksize);
837 break;
838 }
839 }
840 if (layout_str != NULL)
841 switch(array.level) {
842 case 4: /* ignore layout */
843 break;
844 case 5:
845 nlayout = map_name(r5layout, layout_str);
846 if (nlayout == UnSet) {
847 fprintf(stderr, Name ": layout %s not understood for raid5.\n",
848 layout_str);
849 return 1;
850 }
851 break;
852
853 case 6:
854 nlayout = map_name(r6layout, layout_str);
855 if (nlayout == UnSet) {
856 fprintf(stderr, Name ": layout %s not understood for raid6.\n",
857 layout_str);
858 return 1;
859 }
860 break;
861 }
862 if (raid_disks) ndisks = raid_disks;
863
864 odata = odisks-1;
865 ndata = ndisks-1;
866 if (array.level == 6) {
867 odata--; /* number of data disks */
868 ndata--;
869 }
870
871 /* Check that we can hold all the data */
872 size = ndata * array.size;
873 get_dev_size(fd, NULL, &array_size);
874 if (size < (array_size/1024)) {
875 fprintf(stderr, Name ": this change will reduce the size of the array.\n"
876 " use --grow --array-size first to truncate array.\n"
877 " e.g. mdadm --grow %s --array-size %llu\n",
878 devname, size);
879 rv = 1;
880 break;
881 }
882
883 /* So how much do we need to backup.
884 * We need an amount of data which is both a whole number of
885 * old stripes and a whole number of new stripes.
886 * So LCM for (chunksize*datadisks).
887 */
888 a = ochunk/512 * odata;
889 b = nchunk/512 * ndata;
890 /* Find GCD */
891 while (a != b) {
892 if (a < b)
893 b -= a;
894 if (b < a)
895 a -= b;
896 }
897 /* LCM == product / GCD */
898 blocks = ochunk/512 * nchunk/512 * odata * ndata / a;
899
900 if (ndata == odata)
901 blocks *= 16;
902 else
903 fprintf(stderr, Name ": Need to backup %luK of critical "
904 "section..\n", blocks/2);
905
906 sysfs_free(sra);
907 sra = sysfs_read(fd, 0,
908 GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE|
909 GET_CACHE);
910 if (!sra) {
911 fprintf(stderr, Name ": %s: Cannot get array details from sysfs\n",
912 devname);
913 rv = 1;
914 break;
915 }
916
917 if (blocks >= sra->component_size/2) {
918 fprintf(stderr, Name ": %s: Something wrong - reshape aborted\n",
919 devname);
920 rv = 1;
921 break;
922 }
923 nrdisks = array.nr_disks + sra->array.spare_disks;
924 /* Now we need to open all these devices so we can read/write.
925 */
926 fdlist = malloc((1+nrdisks) * sizeof(int));
927 offsets = malloc((1+nrdisks) * sizeof(offsets[0]));
928 if (!fdlist || !offsets) {
929 fprintf(stderr, Name ": malloc failed: grow aborted\n");
930 rv = 1;
931 break;
932 }
933 for (d=0; d <= nrdisks; d++)
934 fdlist[d] = -1;
935 d = array.raid_disks;
936 for (sd = sra->devs; sd; sd=sd->next) {
937 if (sd->disk.state & (1<<MD_DISK_FAULTY))
938 continue;
939 if (sd->disk.state & (1<<MD_DISK_SYNC)) {
940 char *dn = map_dev(sd->disk.major,
941 sd->disk.minor, 1);
942 fdlist[sd->disk.raid_disk]
943 = dev_open(dn, O_RDONLY);
944 offsets[sd->disk.raid_disk] = sd->data_offset*512;
945 if (fdlist[sd->disk.raid_disk] < 0) {
946 fprintf(stderr, Name ": %s: cannot open component %s\n",
947 devname, dn?dn:"-unknown-");
948 rv = 1;
949 goto release;
950 }
951 } else if (backup_file == NULL) {
952 /* spare */
953 char *dn = map_dev(sd->disk.major,
954 sd->disk.minor, 1);
955 fdlist[d] = dev_open(dn, O_RDWR);
956 offsets[d] = (sra->component_size - blocks - 8)*512;
957 if (fdlist[d]<0) {
958 fprintf(stderr, Name ": %s: cannot open component %s\n",
959 devname, dn?dn:"-unknown");
960 rv = 1;
961 goto release;
962 }
963 d++;
964 }
965 }
966 if (backup_file == NULL) {
967 if (ndata <= odata) {
968 fprintf(stderr, Name ": %s: Cannot grow - need backup-file\n",
969 devname);
970 rv = 1;
971 break;
972 } else if (sra->array.spare_disks == 0) {
973 fprintf(stderr, Name ": %s: Cannot grow - need a spare or "
974 "backup-file to backup critical section\n",
975 devname);
976 rv = 1;
977 break;
978 }
979 if (d == array.raid_disks) {
980 fprintf(stderr, Name ": %s: No spare device for backup\n",
981 devname);
982 rv = 1;
983 break;
984 }
985 } else {
986 /* need to check backup file is large enough */
987 char buf[512];
988 fdlist[d] = open(backup_file, O_RDWR|O_CREAT|O_EXCL,
989 S_IRUSR | S_IWUSR);
990 offsets[d] = 8 * 512;
991 if (fdlist[d] < 0) {
992 fprintf(stderr, Name ": %s: cannot create backup file %s: %s\n",
993 devname, backup_file, strerror(errno));
994 rv = 1;
995 break;
996 }
997 memset(buf, 0, 512);
998 for (i=0; i < blocks + 1 ; i++) {
999 if (write(fdlist[d], buf, 512) != 512) {
1000 fprintf(stderr, Name ": %s: cannot create backup file %s: %s\n",
1001 devname, backup_file, strerror(errno));
1002 rv = 1;
1003 break;
1004 }
1005 }
1006 if (fsync(fdlist[d]) != 0) {
1007 fprintf(stderr, Name ": %s: cannot create backup file %s: %s\n",
1008 devname, backup_file, strerror(errno));
1009 rv = 1;
1010 break;
1011 }
1012 d++;
1013 }
1014
1015 /* lastly, check that the internal stripe cache is
1016 * large enough, or it won't work.
1017 */
1018
1019 cache = (nchunk < ochunk) ? ochunk : nchunk;
1020 cache = cache * 4 / 4096;
1021 if (sra->cache_size < cache)
1022 sysfs_set_num(sra, NULL, "stripe_cache_size",
1023 cache+1);
1024 /* Right, everything seems fine. Let's kick things off.
1025 * If only changing raid_disks, use ioctl, else use
1026 * sysfs.
1027 */
1028 if (ochunk == nchunk && olayout == nlayout) {
1029 array.raid_disks = ndisks;
1030 if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) {
1031 rv = 1;
1032 fprintf(stderr, Name ": Cannot set device shape for %s: %s\n",
1033 devname, strerror(errno));
1034 if (ndisks < odisks &&
1035 get_linux_version() < 2006030)
1036 fprintf(stderr, Name ": linux 2.6.30 or later required\n");
1037
1038 break;
1039 }
1040 } else {
1041 /* set them all just in case some old 'new_*' value
1042 * persists from some earlier problem
1043 */
1044 if (sysfs_set_num(sra, NULL, "chunk_size", nchunk) < 0)
1045 rv = 1;
1046 if (sysfs_set_num(sra, NULL, "layout", nlayout) < 0)
1047 rv = 1;
1048 if (sysfs_set_num(sra, NULL, "raid_disks", ndisks) < 0)
1049 rv = 1;
1050 if (rv) {
1051 fprintf(stderr, Name ": Cannot set device shape for %s\n",
1052 devname);
1053 if (get_linux_version() < 2006030)
1054 fprintf(stderr, Name ": linux 2.6.30 or later required\n");
1055 break;
1056 }
1057 }
1058
1059 if (ndisks == 2 && odisks == 2) {
1060 /* No reshape is needed in this trivial case */
1061 rv = 0;
1062 break;
1063 }
1064
1065 /* set up the backup-super-block. This requires the
1066 * uuid from the array.
1067 */
1068 /* Find a superblock */
1069 for (sd = sra->devs; sd; sd = sd->next) {
1070 char *dn;
1071 int devfd;
1072 int ok;
1073 if (sd->disk.state & (1<<MD_DISK_FAULTY))
1074 continue;
1075 dn = map_dev(sd->disk.major, sd->disk.minor, 1);
1076 devfd = dev_open(dn, O_RDONLY);
1077 if (devfd < 0)
1078 continue;
1079 ok = st->ss->load_super(st, devfd, NULL);
1080 close(devfd);
1081 if (ok >= 0)
1082 break;
1083 }
1084 if (!sd) {
1085 fprintf(stderr, Name ": %s: Cannot find a superblock\n",
1086 devname);
1087 rv = 1;
1088 break;
1089 }
1090
1091 memset(&bsb, 0, 512);
1092 memcpy(bsb.magic, "md_backup_data-1", 16);
1093 st->ss->uuid_from_super(st, (int*)&bsb.set_uuid);
1094 bsb.mtime = __cpu_to_le64(time(0));
1095 bsb.devstart2 = blocks;
1096 stripes = blocks / (ochunk/512) / odata;
1097 /* Now we just need to kick off the reshape and watch, while
1098 * handling backups of the data...
1099 * This is all done by a forked background process.
1100 */
1101 switch(fork()) {
1102 case 0:
1103 close(fd);
1104 if (check_env("MDADM_GROW_VERIFY"))
1105 fd = open(devname, O_RDONLY | O_DIRECT);
1106 else
1107 fd = -1;
1108 mlockall(MCL_FUTURE);
1109
1110 if (odata < ndata)
1111 done = child_grow(fd, sra, stripes,
1112 fdlist, offsets,
1113 odisks, ochunk, array.level, olayout, odata,
1114 d - odisks, fdlist+odisks, offsets+odisks);
1115 else if (odata > ndata)
1116 done = child_shrink(fd, sra, stripes,
1117 fdlist, offsets,
1118 odisks, ochunk, array.level, olayout, odata,
1119 d - odisks, fdlist+odisks, offsets+odisks);
1120 else
1121 done = child_same_size(fd, sra, stripes,
1122 fdlist, offsets,
1123 0,
1124 odisks, ochunk, array.level, olayout, odata,
1125 d - odisks, fdlist+odisks, offsets+odisks);
1126 if (backup_file && done)
1127 unlink(backup_file);
1128 if (level != UnSet && level != array.level) {
1129 /* We need to wait for the reshape to finish
1130 * (which will have happened unless odata < ndata)
1131 * and then set the level
1132 */
1133
1134 c = map_num(pers, level);
1135 if (c == NULL)
1136 exit(0);/* not possible */
1137
1138 if (odata < ndata)
1139 wait_reshape(sra);
1140 err = sysfs_set_str(sra, NULL, "level", c);
1141 if (err)
1142 fprintf(stderr, Name ": %s: could not set level to %s\n",
1143 devname, c);
1144 }
1145 exit(0);
1146 case -1:
1147 fprintf(stderr, Name ": Cannot run child to monitor reshape: %s\n",
1148 strerror(errno));
1149 rv = 1;
1150 break;
1151 default:
1152 /* The child will take care of unfreezing the array */
1153 frozen = 0;
1154 break;
1155 }
1156 break;
1157
1158 }
1159
1160 release:
1161 if (rv && orig_level != UnSet && sra) {
1162 c = map_num(pers, orig_level);
1163 if (c && sysfs_set_str(sra, NULL, "level", c) == 0)
1164 fprintf(stderr, Name ": aborting level change\n");
1165 }
1166 if (sra)
1167 unfreeze_array(sra, frozen);
1168 return rv;
1169 }
1170
1171 /*
1172 * We run a child process in the background which performs the following
1173 * steps:
1174 * - wait for resync to reach a certain point
1175 * - suspend io to the following section
1176 * - backup that section
1177 * - allow resync to proceed further
1178 * - resume io
1179 * - discard the backup.
1180 *
1181 * These are combined in slightly different ways in the three cases.
1182 * Grow:
1183 * - suspend/backup/allow/wait/resume/discard
1184 * Shrink:
1185 * - allow/wait/suspend/backup/allow/wait/resume/discard
1186 * same-size:
1187 * - wait/resume/discard/suspend/backup/allow
1188 *
1189 * suspend/backup/allow always come together
1190 * wait/resume/discard do too.
1191 * For the same-size case we have two backups to improve flow.
1192 *
1193 */
1194
1195 int grow_backup(struct mdinfo *sra,
1196 unsigned long long offset, /* per device */
1197 unsigned long stripes, /* per device */
1198 int *sources, unsigned long long *offsets,
1199 int disks, int chunk, int level, int layout,
1200 int dests, int *destfd, unsigned long long *destoffsets,
1201 int part,
1202 char *buf)
1203 {
1204 /* Backup 'blocks' sectors at 'offset' on each device of the array,
1205 * to storage 'destfd' (offset 'destoffsets'), after first
1206 * suspending IO. Then allow resync to continue
1207 * over the suspended section.
1208 * Use part 'part' of the backup-super-block.
1209 */
1210 int odata = disks;
1211 int rv = 0;
1212 int i;
1213 //printf("offset %llu\n", offset);
1214 if (level >= 4)
1215 odata--;
1216 if (level == 6)
1217 odata--;
1218 sysfs_set_num(sra, NULL, "suspend_hi", (offset + stripes * chunk/512) * odata);
1219 if (part) {
1220 bsb.arraystart2 = __cpu_to_le64(offset * odata);
1221 bsb.length2 = __cpu_to_le64(stripes * chunk/512 * odata);
1222 } else {
1223 bsb.arraystart = __cpu_to_le64(offset * odata);
1224 bsb.length = __cpu_to_le64(stripes * chunk/512 * odata);
1225 }
1226 if (part)
1227 bsb.magic[15] = '2';
1228 for (i = 0; i < dests; i++)
1229 if (part)
1230 lseek64(destfd[i], destoffsets[i] + __le64_to_cpu(bsb.devstart2)*512, 0);
1231 else
1232 lseek64(destfd[i], destoffsets[i], 0);
1233
1234 rv = save_stripes(sources, offsets,
1235 disks, chunk, level, layout,
1236 dests, destfd,
1237 offset*512*odata, stripes * chunk * odata,
1238 buf);
1239
1240 if (rv)
1241 return rv;
1242 for (i = 0; i < dests; i++) {
1243 bsb.devstart = __cpu_to_le64(destoffsets[i]/512);
1244
1245 bsb.sb_csum = bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum)-((char*)&bsb));
1246 if (memcmp(bsb.magic, "md_backup_data-2", 16) == 0)
1247 bsb.sb_csum2 = bsb_csum((char*)&bsb,
1248 ((char*)&bsb.sb_csum2)-((char*)&bsb));
1249
1250 lseek64(destfd[i], destoffsets[i] - 4096, 0);
1251 write(destfd[i], &bsb, 512);
1252 fsync(destfd[i]);
1253 }
1254
1255 return 0;
1256 }
1257
1258 /* In 2.6.30, the value reported by sync_completed can be
1259 * less than it should be by one stripe.
1260 * This only happens when reshape hits sync_max and pauses.
1261 * So allow wait_backup to either extend sync_max further
1262 * than strictly necessary, or return before the
1263 * sync has got quite as far as we would really like.
1264 * This is what 'blocks2' is for.
1265 * The various callers give appropriate values so that
1266 * everything works.
1267 */
1268 int wait_backup(struct mdinfo *sra,
1269 unsigned long long offset, /* per device */
1270 unsigned long long blocks, /* per device */
1271 unsigned long long blocks2, /* per device - hack */
1272 int dests, int *destfd, unsigned long long *destoffsets,
1273 int part)
1274 {
1275 /* Wait for resync to pass the section that was backed up
1276 * then erase the backup and allow IO
1277 */
1278 int fd = sysfs_get_fd(sra, NULL, "sync_completed");
1279 unsigned long long completed;
1280 int i;
1281
1282 if (fd < 0)
1283 return -1;
1284 sysfs_set_num(sra, NULL, "sync_max", offset + blocks + blocks2);
1285 if (offset == 0)
1286 sysfs_set_str(sra, NULL, "sync_action", "reshape");
1287 do {
1288 char action[20];
1289 fd_set rfds;
1290 FD_ZERO(&rfds);
1291 FD_SET(fd, &rfds);
1292 select(fd+1, NULL, NULL, &rfds, NULL);
1293 if (sysfs_fd_get_ll(fd, &completed) < 0) {
1294 close(fd);
1295 return -1;
1296 }
1297 if (sysfs_get_str(sra, NULL, "sync_action",
1298 action, 20) > 0 &&
1299 strncmp(action, "reshape", 7) != 0)
1300 break;
1301 } while (completed < offset + blocks);
1302 close(fd);
1303
1304 if (part) {
1305 bsb.arraystart2 = __cpu_to_le64(0);
1306 bsb.length2 = __cpu_to_le64(0);
1307 } else {
1308 bsb.arraystart = __cpu_to_le64(0);
1309 bsb.length = __cpu_to_le64(0);
1310 }
1311 for (i = 0; i < dests; i++) {
1312 bsb.devstart = __cpu_to_le64(destoffsets[i]/512);
1313 bsb.sb_csum = bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum)-((char*)&bsb));
1314 if (memcmp(bsb.magic, "md_backup_data-2", 16) == 0)
1315 bsb.sb_csum2 = bsb_csum((char*)&bsb,
1316 ((char*)&bsb.sb_csum2)-((char*)&bsb));
1317 lseek64(destfd[i], destoffsets[i]-4096, 0);
1318 write(destfd[i], &bsb, 512);
1319 fsync(destfd[i]);
1320 }
1321 return 0;
1322 }
1323
/* Write 'msg' followed by a newline to file descriptor 2, then exit
 * with status 1.  Does not return.
 */
static void fail(char *msg)
{
	size_t msglen = strlen(msg);

	write(2, msg, msglen);
	write(2, "\n", 1);
	exit(1);
}
1330
1331 static char *abuf, *bbuf;
1332 static int abuflen;
1333 static void validate(int afd, int bfd, unsigned long long offset)
1334 {
1335 /* check that the data in the backup against the array.
1336 * This is only used for regression testing and should not
1337 * be used while the array is active
1338 */
1339 struct mdp_backup_super bsb2;
1340 if (afd < 0)
1341 return;
1342 lseek64(bfd, offset - 4096, 0);
1343 if (read(bfd, &bsb2, 512) != 512)
1344 fail("cannot read bsb");
1345 if (bsb2.sb_csum != bsb_csum((char*)&bsb2,
1346 ((char*)&bsb2.sb_csum)-((char*)&bsb2)))
1347 fail("first csum bad");
1348 if (memcmp(bsb2.magic, "md_backup_data", 14) != 0)
1349 fail("magic is bad");
1350 if (memcmp(bsb2.magic, "md_backup_data-2", 16) == 0 &&
1351 bsb2.sb_csum2 != bsb_csum((char*)&bsb2,
1352 ((char*)&bsb2.sb_csum2)-((char*)&bsb2)))
1353 fail("second csum bad");
1354
1355 if (__le64_to_cpu(bsb2.devstart)*512 != offset)
1356 fail("devstart is wrong");
1357
1358 if (bsb2.length) {
1359 unsigned long long len = __le64_to_cpu(bsb2.length)*512;
1360
1361 if (abuflen < len) {
1362 free(abuf);
1363 free(bbuf);
1364 abuflen = len;
1365 posix_memalign((void**)&abuf, 4096, abuflen);
1366 posix_memalign((void**)&bbuf, 4096, abuflen);
1367 }
1368
1369 lseek64(bfd, offset, 0);
1370 if (read(bfd, bbuf, len) != len) {
1371 printf("len %llu\n", len);
1372 fail("read first backup failed");
1373 }
1374 lseek64(afd, __le64_to_cpu(bsb2.arraystart)*512, 0);
1375 if (read(afd, abuf, len) != len)
1376 fail("read first from array failed");
1377 if (memcmp(bbuf, abuf, len) != 0) {
1378 int i;
1379 printf("offset=%llu len=%llu\n",
1380 __le64_to_cpu(bsb2.arraystart)*512, len);
1381 for (i=0; i<len; i++)
1382 if (bbuf[i] != abuf[i]) {
1383 printf("first diff byte %d\n", i);
1384 break;
1385 }
1386 fail("data1 compare failed");
1387 }
1388 }
1389 if (bsb2.length2) {
1390 unsigned long long len = __le64_to_cpu(bsb2.length2)*512;
1391
1392 if (abuflen < len) {
1393 free(abuf);
1394 free(bbuf);
1395 abuflen = len;
1396 abuf = malloc(abuflen);
1397 bbuf = malloc(abuflen);
1398 }
1399
1400 lseek64(bfd, offset+__le64_to_cpu(bsb2.devstart2)*512, 0);
1401 if (read(bfd, bbuf, len) != len)
1402 fail("read second backup failed");
1403 lseek64(afd, __le64_to_cpu(bsb2.arraystart2)*512, 0);
1404 if (read(afd, abuf, len) != len)
1405 fail("read second from array failed");
1406 if (memcmp(bbuf, abuf, len) != 0)
1407 fail("data2 compare failed");
1408 }
1409 }
1410
/*
 * Monitor a reshape that increases the number of data disks.
 *
 * Backs up the first 'stripes' stripes, lets the reshape run past them
 * (wait_backup starts the reshape because the offset is 0), then moves
 * "suspend_lo" past the section and removes the "sync_max" limit.
 *
 * afd is only used for validate(); pass a negative value to skip that.
 * Returns 1 when the critical section was handled, 0 if the scratch
 * buffer could not be allocated or waiting for the reshape failed.
 */
static int child_grow(int afd, struct mdinfo *sra, unsigned long stripes,
		      int *fds, unsigned long long *offsets,
		      int disks, int chunk, int level, int layout, int data,
		      int dests, int *destfd, unsigned long long *destoffsets)
{
	char *buf;
	int done = 0;

	if (posix_memalign((void**)&buf, 4096, disks * chunk))
		/* no buffer to back up with, so don't start the reshape */
		return 0;
	sysfs_set_num(sra, NULL, "suspend_hi", 0);
	sysfs_set_num(sra, NULL, "suspend_lo", 0);
	grow_backup(sra, 0, stripes,
		    fds, offsets, disks, chunk, level, layout,
		    dests, destfd, destoffsets,
		    0, buf);
	validate(afd, destfd[0], destoffsets[0]);
	if (wait_backup(sra, 0, stripes * chunk / 512, stripes * chunk / 512,
			dests, destfd, destoffsets,
			0) >= 0) {
		sysfs_set_num(sra, NULL, "suspend_lo", (stripes * chunk/512) * data);
		/* FIXME this should probably be numeric */
		sysfs_set_str(sra, NULL, "sync_max", "max");
		done = 1;
	}
	free(buf);
	return done;
}
1436
1437 static int child_shrink(int afd, struct mdinfo *sra, unsigned long stripes,
1438 int *fds, unsigned long long *offsets,
1439 int disks, int chunk, int level, int layout, int data,
1440 int dests, int *destfd, unsigned long long *destoffsets)
1441 {
1442 char *buf;
1443 unsigned long long start;
1444 int rv;
1445
1446 posix_memalign((void**)&buf, 4096, disks * chunk);
1447 start = sra->component_size - stripes * chunk/512;
1448 sysfs_set_num(sra, NULL, "sync_max", start);
1449 sysfs_set_str(sra, NULL, "sync_action", "reshape");
1450 sysfs_set_num(sra, NULL, "suspend_lo", 0);
1451 sysfs_set_num(sra, NULL, "suspend_hi", 0);
1452 rv = wait_backup(sra, 0, start - stripes * chunk/512, stripes * chunk/512,
1453 dests, destfd, destoffsets, 0);
1454 if (rv < 0)
1455 return 0;
1456 grow_backup(sra, 0, stripes,
1457 fds, offsets,
1458 disks, chunk, level, layout,
1459 dests, destfd, destoffsets,
1460 0, buf);
1461 validate(afd, destfd[0], destoffsets[0]);
1462 rv = wait_backup(sra, start, stripes*chunk/512, 0,
1463 dests, destfd, destoffsets, 0);
1464 if (rv < 0)
1465 return 0;
1466 sysfs_set_num(sra, NULL, "suspend_lo", (stripes * chunk/512) * data);
1467 free(buf);
1468 /* FIXME this should probably be numeric */
1469 sysfs_set_str(sra, NULL, "sync_max", "max");
1470 return 1;
1471 }
1472
1473 static int child_same_size(int afd, struct mdinfo *sra, unsigned long stripes,
1474 int *fds, unsigned long long *offsets,
1475 unsigned long long start,
1476 int disks, int chunk, int level, int layout, int data,
1477 int dests, int *destfd, unsigned long long *destoffsets)
1478 {
1479 unsigned long long size;
1480 unsigned long tailstripes = stripes;
1481 int part;
1482 char *buf;
1483 unsigned long long speed;
1484
1485
1486 posix_memalign((void**)&buf, 4096, disks * chunk);
1487
1488 sysfs_set_num(sra, NULL, "suspend_lo", 0);
1489 sysfs_set_num(sra, NULL, "suspend_hi", 0);
1490
1491 sysfs_get_ll(sra, NULL, "sync_speed_min", &speed);
1492 sysfs_set_num(sra, NULL, "sync_speed_min", 200000);
1493
1494 grow_backup(sra, start, stripes,
1495 fds, offsets,
1496 disks, chunk, level, layout,
1497 dests, destfd, destoffsets,
1498 0, buf);
1499 grow_backup(sra, (start + stripes) * chunk/512, stripes,
1500 fds, offsets,
1501 disks, chunk, level, layout,
1502 dests, destfd, destoffsets,
1503 1, buf);
1504 validate(afd, destfd[0], destoffsets[0]);
1505 part = 0;
1506 start += stripes * 2; /* where to read next */
1507 size = sra->component_size / (chunk/512);
1508 while (start < size) {
1509 if (wait_backup(sra, (start-stripes*2)*chunk/512,
1510 stripes*chunk/512, 0,
1511 dests, destfd, destoffsets,
1512 part) < 0)
1513 return 0;
1514 sysfs_set_num(sra, NULL, "suspend_lo", start*chunk/512 * data);
1515 if (start + stripes > size)
1516 tailstripes = (size - start);
1517
1518 grow_backup(sra, start*chunk/512, tailstripes,
1519 fds, offsets,
1520 disks, chunk, level, layout,
1521 dests, destfd, destoffsets,
1522 part, buf);
1523 start += stripes;
1524 part = 1 - part;
1525 validate(afd, destfd[0], destoffsets[0]);
1526 }
1527 if (wait_backup(sra, (start-stripes*2) * chunk/512, stripes * chunk/512, 0,
1528 dests, destfd, destoffsets,
1529 part) < 0)
1530 return 0;
1531 sysfs_set_num(sra, NULL, "suspend_lo", ((start-stripes)*chunk/512) * data);
1532 if (wait_backup(sra, (start-stripes) * chunk/512, tailstripes * chunk/512, 0,
1533 dests, destfd, destoffsets,
1534 1-part) < 0)
1535 return 0;
1536 sysfs_set_num(sra, NULL, "suspend_lo", (size*chunk/512) * data);
1537 sysfs_set_num(sra, NULL, "sync_speed_min", speed);
1538 free(buf);
1539 return 1;
1540 }
1541
1542 /*
1543 * If any spare contains md_backup_data-1 which is recent wrt mtime,
1544 * write that data into the array and update the super blocks with
1545 * the new reshape_progress
1546 */
1547 int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt, char *backup_file)
1548 {
1549 int i, j;
1550 int old_disks;
1551 unsigned long long *offsets;
1552 unsigned long long nstripe, ostripe, last_block;
1553 int ndata, odata;
1554
1555 if (info->new_level != info->array.level)
1556 return 1; /* Cannot handle level changes (they are instantaneous) */
1557
1558 odata = info->array.raid_disks - info->delta_disks - 1;
1559 if (info->array.level == 6) odata--; /* number of data disks */
1560 ndata = info->array.raid_disks - 1;
1561 if (info->new_level == 6) ndata--;
1562
1563 old_disks = info->array.raid_disks - info->delta_disks;
1564
1565 if (info->delta_disks <= 0)
1566 /* Didn't grow, so the backup file must have
1567 * been used
1568 */
1569 old_disks = cnt;
1570 for (i=old_disks-(backup_file?1:0); i<cnt; i++) {
1571 struct mdinfo dinfo;
1572 char buf[4096];
1573 int fd;
1574 int bsbsize;
1575
1576 /* This was a spare and may have some saved data on it.
1577 * Load the superblock, find and load the
1578 * backup_super_block.
1579 * If either fail, go on to next device.
1580 * If the backup contains no new info, just return
1581 * else restore data and update all superblocks
1582 */
1583 if (i == old_disks-1) {
1584 fd = open(backup_file, O_RDONLY);
1585 if (fd<0) {
1586 fprintf(stderr, Name ": backup file %s inaccessible: %s\n",
1587 backup_file, strerror(errno));
1588 continue;
1589 }
1590 } else {
1591 fd = fdlist[i];
1592 if (fd < 0)
1593 continue;
1594 if (st->ss->load_super(st, fd, NULL))
1595 continue;
1596
1597 st->ss->getinfo_super(st, &dinfo);
1598 st->ss->free_super(st);
1599
1600 if (lseek64(fd,
1601 (dinfo.data_offset + dinfo.component_size - 8) <<9,
1602 0) < 0)
1603 continue; /* Cannot seek */
1604 }
1605 if (read(fd, &bsb, sizeof(bsb)) != sizeof(bsb))
1606 continue; /* Cannot read */
1607 if (memcmp(bsb.magic, "md_backup_data-1", 16) != 0 &&
1608 memcmp(bsb.magic, "md_backup_data-2", 16) != 0)
1609 continue;
1610 if (bsb.sb_csum != bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum)-((char*)&bsb)))
1611 continue; /* bad checksum */
1612 if (memcmp(bsb.magic, "md_backup_data-2", 16) == 0 &&
1613 bsb.sb_csum2 != bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum2)-((char*)&bsb)))
1614 if (memcmp(bsb.set_uuid,info->uuid, 16) != 0)
1615 continue; /* Wrong uuid */
1616
1617 if (info->array.utime > __le64_to_cpu(bsb.mtime) + 3600 ||
1618 info->array.utime < __le64_to_cpu(bsb.mtime))
1619 continue; /* time stamp is too bad */
1620
1621 if (bsb.magic[15] == '1') {
1622 if (info->delta_disks >= 0) {
1623 /* reshape_progress is increasing */
1624 if (__le64_to_cpu(bsb.arraystart) + __le64_to_cpu(bsb.length) <
1625 info->reshape_progress)
1626 continue; /* No new data here */
1627 } else {
1628 /* reshape_progress is decreasing */
1629 if (__le64_to_cpu(bsb.arraystart) >=
1630 info->reshape_progress)
1631 continue; /* No new data here */
1632 }
1633 } else {
1634 if (info->delta_disks >= 0) {
1635 /* reshape_progress is increasing */
1636 if (__le64_to_cpu(bsb.arraystart) + __le64_to_cpu(bsb.length) <
1637 info->reshape_progress &&
1638 __le64_to_cpu(bsb.arraystart2) + __le64_to_cpu(bsb.length2) <
1639 info->reshape_progress)
1640 continue; /* No new data here */
1641 } else {
1642 /* reshape_progress is decreasing */
1643 if (__le64_to_cpu(bsb.arraystart) >=
1644 info->reshape_progress &&
1645 __le64_to_cpu(bsb.arraystart2) >=
1646 info->reshape_progress)
1647 continue; /* No new data here */
1648 }
1649 }
1650 if (lseek64(fd, __le64_to_cpu(bsb.devstart)*512, 0)< 0)
1651 continue; /* Cannot seek */
1652 /* There should be a duplicate backup superblock 4k before here */
1653 if (lseek64(fd, -4096, 1) < 0 ||
1654 read(fd, buf, 4096) != 4096)
1655 continue; /* Cannot find leading superblock */
1656 if (bsb.magic[15] == '1')
1657 bsbsize = offsetof(struct mdp_backup_super, pad1);
1658 else
1659 bsbsize = offsetof(struct mdp_backup_super, pad);
1660 if (memcmp(buf, &bsb, bsbsize) != 0)
1661 continue; /* Cannot find leading superblock */
1662
1663 /* Now need the data offsets for all devices. */
1664 offsets = malloc(sizeof(*offsets)*info->array.raid_disks);
1665 for(j=0; j<info->array.raid_disks; j++) {
1666 if (fdlist[j] < 0)
1667 continue;
1668 if (st->ss->load_super(st, fdlist[j], NULL))
1669 /* FIXME should be this be an error */
1670 continue;
1671 st->ss->getinfo_super(st, &dinfo);
1672 st->ss->free_super(st);
1673 offsets[j] = dinfo.data_offset;
1674 }
1675 printf(Name ": restoring critical section\n");
1676
1677 if (restore_stripes(fdlist, offsets,
1678 info->array.raid_disks,
1679 info->new_chunk,
1680 info->new_level,
1681 info->new_layout,
1682 fd, __le64_to_cpu(bsb.devstart)*512,
1683 __le64_to_cpu(bsb.arraystart),
1684 __le64_to_cpu(bsb.length)*512)) {
1685 /* didn't succeed, so giveup */
1686 return 1;
1687 }
1688
1689 if (bsb.magic[15] == '2' &&
1690 restore_stripes(fdlist, offsets,
1691 info->array.raid_disks,
1692 info->new_chunk,
1693 info->new_level,
1694 info->new_layout,
1695 fd, __le64_to_cpu(bsb.devstart)*512 +
1696 __le64_to_cpu(bsb.devstart2)*512,
1697 __le64_to_cpu(bsb.arraystart2),
1698 __le64_to_cpu(bsb.length2)*512)) {
1699 /* didn't succeed, so giveup */
1700 return 1;
1701 }
1702
1703
1704 /* Ok, so the data is restored. Let's update those superblocks. */
1705
1706 if (info->delta_disks >= 0) {
1707 info->reshape_progress = __le64_to_cpu(bsb.arraystart) +
1708 __le64_to_cpu(bsb.length);
1709 if (bsb.magic[15] == '2') {
1710 unsigned long long p2 = __le64_to_cpu(bsb.arraystart2) +
1711 __le64_to_cpu(bsb.length2);
1712 if (p2 > info->reshape_progress)
1713 info->reshape_progress = p2;
1714 }
1715 } else {
1716 info->reshape_progress = __le64_to_cpu(bsb.arraystart);
1717 if (bsb.magic[15] == '2') {
1718 unsigned long long p2 = __le64_to_cpu(bsb.arraystart2);
1719 if (p2 < info->reshape_progress)
1720 info->reshape_progress = p2;
1721 }
1722 }
1723 for (j=0; j<info->array.raid_disks; j++) {
1724 if (fdlist[j] < 0) continue;
1725 if (st->ss->load_super(st, fdlist[j], NULL))
1726 continue;
1727 st->ss->getinfo_super(st, &dinfo);
1728 dinfo.reshape_progress = info->reshape_progress;
1729 st->ss->update_super(st, &dinfo,
1730 "_reshape_progress",
1731 NULL,0, 0, NULL);
1732 st->ss->store_super(st, fdlist[j]);
1733 st->ss->free_super(st);
1734 }
1735 return 0;
1736 }
1737 /* Didn't find any backup data, try to see if any
1738 * was needed.
1739 */
1740 if (info->delta_disks == 0)
1741 /* Alway need backup data when size doesn't change */
1742 return 1;
1743 nstripe = ostripe = 0;
1744 last_block = 0;
1745 while (nstripe >= ostripe) {
1746 nstripe += info->new_chunk / 512;
1747 last_block = nstripe * ndata;
1748 ostripe = last_block / odata / (info->array.chunk_size/512) *
1749 (info->array.chunk_size/512);
1750 }
1751
1752 if (info->reshape_progress >= last_block)
1753 return 0;
1754 /* needed to recover critical section! */
1755 return 1;
1756 }
1757
1758 int Grow_continue(int mdfd, struct supertype *st, struct mdinfo *info,
1759 char *backup_file)
1760 {
1761 /* Array is assembled and ready to be started, but
1762 * monitoring is probably required.
1763 * So:
1764 * - start read-only
1765 * - set upper bound for resync
1766 * - initialise the 'suspend' boundaries
1767 * - switch to read-write
1768 * - fork and continue monitoring
1769 */
1770 int err;
1771 int backup_list[1];
1772 unsigned long long backup_offsets[1];
1773 int odisks, ndisks, ochunk, nchunk,odata,ndata;
1774 unsigned long a,b,blocks,stripes;
1775 int backup_fd;
1776 int *fds;
1777 unsigned long long *offsets;
1778 int d;
1779 struct mdinfo *sra, *sd;
1780 int rv;
1781 int done = 0;
1782
1783 err = sysfs_set_str(info, NULL, "array_state", "readonly");
1784 if (err)
1785 return err;
1786
1787 /* make sure reshape doesn't progress until we are ready */
1788 sysfs_set_str(info, NULL, "sync_max", "0");
1789 sysfs_set_str(info, NULL, "array_state", "active"); /* FIXME or clean */
1790
1791 /* ndisks is not growing, so raid_disks is old and +delta is new */
1792 odisks = info->array.raid_disks;
1793 ndisks = odisks + info->delta_disks;
1794 odata = odisks - 1;
1795 ndata = ndisks - 1;
1796 if (info->array.level == 6) {
1797 odata--;
1798 ndata--;
1799 }
1800 ochunk = info->array.chunk_size;
1801 nchunk = info->new_chunk;
1802
1803
1804 a = ochunk/512 * odata;
1805 b = nchunk/512 * ndata;
1806 /* Find GCD */
1807 while (a != b) {
1808 if (a < b)
1809 b -= a;
1810 if (b < a)
1811 a -= b;
1812 }
1813 /* LCM == product / GCD */
1814 blocks = ochunk/512 * nchunk/512 * odata * ndata / a;
1815
1816 if (ndata == odata)
1817 blocks *= 16;
1818 stripes = blocks / (info->array.chunk_size/512) / odata;
1819
1820
1821 memset(&bsb, 0, 512);
1822 memcpy(bsb.magic, "md_backup_data-1", 16);
1823 memcpy(&bsb.set_uuid, info->uuid, 16);
1824 bsb.mtime = __cpu_to_le64(time(0));
1825 bsb.devstart2 = blocks;
1826
1827 backup_fd = open(backup_file, O_RDWR|O_CREAT, S_IRUSR | S_IWUSR);
1828 backup_list[0] = backup_fd;
1829 backup_offsets[0] = 8 * 512;
1830 fds = malloc(odisks * sizeof(fds[0]));
1831 offsets = malloc(odisks * sizeof(offsets[0]));
1832 for (d=0; d<odisks; d++)
1833 fds[d] = -1;
1834
1835 sra = sysfs_read(-1, devname2devnum(info->sys_name),
1836 GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE|
1837 GET_CACHE);
1838
1839 for (sd = sra->devs; sd; sd = sd->next) {
1840 if (sd->disk.state & (1<<MD_DISK_FAULTY))
1841 continue;
1842 if (sd->disk.state & (1<<MD_DISK_SYNC)) {
1843 char *dn = map_dev(sd->disk.major,
1844 sd->disk.minor, 1);
1845 fds[sd->disk.raid_disk]
1846 = dev_open(dn, O_RDONLY);
1847 offsets[sd->disk.raid_disk] = sd->data_offset*512;
1848 if (fds[sd->disk.raid_disk] < 0) {
1849 fprintf(stderr, Name ": %s: cannot open component %s\n",
1850 info->sys_name, dn?dn:"-unknown-");
1851 rv = 1;
1852 goto release;
1853 }
1854 free(dn);
1855 }
1856 }
1857
1858 switch(fork()) {
1859 case 0:
1860 close(mdfd);
1861 mlockall(MCL_FUTURE);
1862 if (info->delta_disks < 0)
1863 done = child_shrink(-1, info, stripes,
1864 fds, offsets,
1865 info->array.raid_disks,
1866 info->array.chunk_size,
1867 info->array.level, info->array.layout,
1868 odata,
1869 1, backup_list, backup_offsets);
1870 else if (info->delta_disks == 0) {
1871 /* The 'start' is a per-device stripe number.
1872 * reshape_progress is a per-array sector number.
1873 * So divide by ndata * chunk_size
1874 */
1875 unsigned long long start = info->reshape_progress / ndata;
1876 start /= (info->array.chunk_size/512);
1877 done = child_same_size(-1, info, stripes,
1878 fds, offsets,
1879 start,
1880 info->array.raid_disks,
1881 info->array.chunk_size,
1882 info->array.level, info->array.layout,
1883 odata,
1884 1, backup_list, backup_offsets);
1885 }
1886 if (backup_file && done)
1887 unlink(backup_file);
1888 /* FIXME should I intuit a level change */
1889 exit(0);
1890 case -1:
1891 fprintf(stderr, Name ": Cannot run child to continue monitoring reshape: %s\n",
1892 strerror(errno));
1893 return 1;
1894 default:
1895 break;
1896 }
1897 release:
1898 return 0;
1899 }
1900
1901