]> git.ipfire.org Git - thirdparty/mdadm.git/blob - Grow.c
FIX: Pass container name to reshape array for external meta data
[thirdparty/mdadm.git] / Grow.c
1 /*
2 * mdadm - manage Linux "md" devices aka RAID arrays.
3 *
4 * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
5 *
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 *
21 * Author: Neil Brown
22 * Email: <neilb@suse.de>
23 */
24 #include "mdadm.h"
25 #include "dlink.h"
26 #include <sys/mman.h>
27
28 #if ! defined(__BIG_ENDIAN) && ! defined(__LITTLE_ENDIAN)
29 #error no endian defined
30 #endif
31 #include "md_u.h"
32 #include "md_p.h"
33
34 #ifndef offsetof
35 #define offsetof(t,f) ((size_t)&(((t*)0)->f))
36 #endif
37
38 int Grow_Add_device(char *devname, int fd, char *newdev)
39 {
40 /* Add a device to an active array.
41 * Currently, just extend a linear array.
42 * This requires writing a new superblock on the
43 * new device, calling the kernel to add the device,
44 * and if that succeeds, update the superblock on
45 * all other devices.
46 * This means that we need to *find* all other devices.
47 */
48 struct mdinfo info;
49
50 struct stat stb;
51 int nfd, fd2;
52 int d, nd;
53 struct supertype *st = NULL;
54 char *subarray = NULL;
55
56 if (ioctl(fd, GET_ARRAY_INFO, &info.array) < 0) {
57 fprintf(stderr, Name ": cannot get array info for %s\n", devname);
58 return 1;
59 }
60
61 if (info.array.level != -1) {
62 fprintf(stderr, Name ": can only add devices to linear arrays\n");
63 return 1;
64 }
65
66 st = super_by_fd(fd, &subarray);
67 if (!st) {
68 fprintf(stderr, Name ": cannot handle arrays with superblock version %d\n", info.array.major_version);
69 return 1;
70 }
71
72 if (subarray) {
73 fprintf(stderr, Name ": Cannot grow linear sub-arrays yet\n");
74 free(subarray);
75 free(st);
76 }
77
78 nfd = open(newdev, O_RDWR|O_EXCL|O_DIRECT);
79 if (nfd < 0) {
80 fprintf(stderr, Name ": cannot open %s\n", newdev);
81 free(st);
82 return 1;
83 }
84 fstat(nfd, &stb);
85 if ((stb.st_mode & S_IFMT) != S_IFBLK) {
86 fprintf(stderr, Name ": %s is not a block device!\n", newdev);
87 close(nfd);
88 free(st);
89 return 1;
90 }
91 /* now check out all the devices and make sure we can read the superblock */
92 for (d=0 ; d < info.array.raid_disks ; d++) {
93 mdu_disk_info_t disk;
94 char *dv;
95
96 st->ss->free_super(st);
97
98 disk.number = d;
99 if (ioctl(fd, GET_DISK_INFO, &disk) < 0) {
100 fprintf(stderr, Name ": cannot get device detail for device %d\n",
101 d);
102 close(nfd);
103 free(st);
104 return 1;
105 }
106 dv = map_dev(disk.major, disk.minor, 1);
107 if (!dv) {
108 fprintf(stderr, Name ": cannot find device file for device %d\n",
109 d);
110 close(nfd);
111 free(st);
112 return 1;
113 }
114 fd2 = dev_open(dv, O_RDWR);
115 if (!fd2) {
116 fprintf(stderr, Name ": cannot open device file %s\n", dv);
117 close(nfd);
118 free(st);
119 return 1;
120 }
121
122 if (st->ss->load_super(st, fd2, NULL)) {
123 fprintf(stderr, Name ": cannot find super block on %s\n", dv);
124 close(nfd);
125 close(fd2);
126 free(st);
127 return 1;
128 }
129 close(fd2);
130 }
131 /* Ok, looks good. Lets update the superblock and write it out to
132 * newdev.
133 */
134
135 info.disk.number = d;
136 info.disk.major = major(stb.st_rdev);
137 info.disk.minor = minor(stb.st_rdev);
138 info.disk.raid_disk = d;
139 info.disk.state = (1 << MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE);
140 st->ss->update_super(st, &info, "linear-grow-new", newdev,
141 0, 0, NULL);
142
143 if (st->ss->store_super(st, nfd)) {
144 fprintf(stderr, Name ": Cannot store new superblock on %s\n",
145 newdev);
146 close(nfd);
147 return 1;
148 }
149 close(nfd);
150
151 if (ioctl(fd, ADD_NEW_DISK, &info.disk) != 0) {
152 fprintf(stderr, Name ": Cannot add new disk to this array\n");
153 return 1;
154 }
155 /* Well, that seems to have worked.
156 * Now go through and update all superblocks
157 */
158
159 if (ioctl(fd, GET_ARRAY_INFO, &info.array) < 0) {
160 fprintf(stderr, Name ": cannot get array info for %s\n", devname);
161 return 1;
162 }
163
164 nd = d;
165 for (d=0 ; d < info.array.raid_disks ; d++) {
166 mdu_disk_info_t disk;
167 char *dv;
168
169 disk.number = d;
170 if (ioctl(fd, GET_DISK_INFO, &disk) < 0) {
171 fprintf(stderr, Name ": cannot get device detail for device %d\n",
172 d);
173 return 1;
174 }
175 dv = map_dev(disk.major, disk.minor, 1);
176 if (!dv) {
177 fprintf(stderr, Name ": cannot find device file for device %d\n",
178 d);
179 return 1;
180 }
181 fd2 = dev_open(dv, O_RDWR);
182 if (fd2 < 0) {
183 fprintf(stderr, Name ": cannot open device file %s\n", dv);
184 return 1;
185 }
186 if (st->ss->load_super(st, fd2, NULL)) {
187 fprintf(stderr, Name ": cannot find super block on %s\n", dv);
188 close(fd);
189 return 1;
190 }
191 info.array.raid_disks = nd+1;
192 info.array.nr_disks = nd+1;
193 info.array.active_disks = nd+1;
194 info.array.working_disks = nd+1;
195
196 st->ss->update_super(st, &info, "linear-grow-update", dv,
197 0, 0, NULL);
198
199 if (st->ss->store_super(st, fd2)) {
200 fprintf(stderr, Name ": Cannot store new superblock on %s\n", dv);
201 close(fd2);
202 return 1;
203 }
204 close(fd2);
205 }
206
207 return 0;
208 }
209
210 int Grow_addbitmap(char *devname, int fd, char *file, int chunk, int delay, int write_behind, int force)
211 {
212 /*
213 * First check that array doesn't have a bitmap
214 * Then create the bitmap
215 * Then add it
216 *
217 * For internal bitmaps, we need to check the version,
218 * find all the active devices, and write the bitmap block
219 * to all devices
220 */
221 mdu_bitmap_file_t bmf;
222 mdu_array_info_t array;
223 struct supertype *st;
224 char *subarray = NULL;
225 int major = BITMAP_MAJOR_HI;
226 int vers = md_get_version(fd);
227 unsigned long long bitmapsize, array_size;
228
229 if (vers < 9003) {
230 major = BITMAP_MAJOR_HOSTENDIAN;
231 fprintf(stderr, Name ": Warning - bitmaps created on this kernel"
232 " are not portable\n"
233 " between different architectures. Consider upgrading"
234 " the Linux kernel.\n");
235 }
236
237 if (ioctl(fd, GET_BITMAP_FILE, &bmf) != 0) {
238 if (errno == ENOMEM)
239 fprintf(stderr, Name ": Memory allocation failure.\n");
240 else
241 fprintf(stderr, Name ": bitmaps not supported by this kernel.\n");
242 return 1;
243 }
244 if (bmf.pathname[0]) {
245 if (strcmp(file,"none")==0) {
246 if (ioctl(fd, SET_BITMAP_FILE, -1)!= 0) {
247 fprintf(stderr, Name ": failed to remove bitmap %s\n",
248 bmf.pathname);
249 return 1;
250 }
251 return 0;
252 }
253 fprintf(stderr, Name ": %s already has a bitmap (%s)\n",
254 devname, bmf.pathname);
255 return 1;
256 }
257 if (ioctl(fd, GET_ARRAY_INFO, &array) != 0) {
258 fprintf(stderr, Name ": cannot get array status for %s\n", devname);
259 return 1;
260 }
261 if (array.state & (1<<MD_SB_BITMAP_PRESENT)) {
262 if (strcmp(file, "none")==0) {
263 array.state &= ~(1<<MD_SB_BITMAP_PRESENT);
264 if (ioctl(fd, SET_ARRAY_INFO, &array)!= 0) {
265 fprintf(stderr, Name ": failed to remove internal bitmap.\n");
266 return 1;
267 }
268 return 0;
269 }
270 fprintf(stderr, Name ": Internal bitmap already present on %s\n",
271 devname);
272 return 1;
273 }
274
275 if (strcmp(file, "none") == 0) {
276 fprintf(stderr, Name ": no bitmap found on %s\n", devname);
277 return 1;
278 }
279 if (array.level <= 0) {
280 fprintf(stderr, Name ": Bitmaps not meaningful with level %s\n",
281 map_num(pers, array.level)?:"of this array");
282 return 1;
283 }
284 bitmapsize = array.size;
285 bitmapsize <<= 1;
286 if (get_dev_size(fd, NULL, &array_size) &&
287 array_size > (0x7fffffffULL<<9)) {
288 /* Array is big enough that we cannot trust array.size
289 * try other approaches
290 */
291 bitmapsize = get_component_size(fd);
292 }
293 if (bitmapsize == 0) {
294 fprintf(stderr, Name ": Cannot reliably determine size of array to create bitmap - sorry.\n");
295 return 1;
296 }
297
298 if (array.level == 10) {
299 int ncopies = (array.layout&255)*((array.layout>>8)&255);
300 bitmapsize = bitmapsize * array.raid_disks / ncopies;
301 }
302
303 st = super_by_fd(fd, &subarray);
304 if (!st) {
305 fprintf(stderr, Name ": Cannot understand version %d.%d\n",
306 array.major_version, array.minor_version);
307 return 1;
308 }
309 if (subarray) {
310 fprintf(stderr, Name ": Cannot add bitmaps to sub-arrays yet\n");
311 free(subarray);
312 free(st);
313 return 1;
314 }
315 if (strcmp(file, "internal") == 0) {
316 int d;
317 if (st->ss->add_internal_bitmap == NULL) {
318 fprintf(stderr, Name ": Internal bitmaps not supported "
319 "with %s metadata\n", st->ss->name);
320 return 1;
321 }
322 for (d=0; d< st->max_devs; d++) {
323 mdu_disk_info_t disk;
324 char *dv;
325 disk.number = d;
326 if (ioctl(fd, GET_DISK_INFO, &disk) < 0)
327 continue;
328 if (disk.major == 0 &&
329 disk.minor == 0)
330 continue;
331 if ((disk.state & (1<<MD_DISK_SYNC))==0)
332 continue;
333 dv = map_dev(disk.major, disk.minor, 1);
334 if (dv) {
335 int fd2 = dev_open(dv, O_RDWR);
336 if (fd2 < 0)
337 continue;
338 if (st->ss->load_super(st, fd2, NULL)==0) {
339 if (st->ss->add_internal_bitmap(
340 st,
341 &chunk, delay, write_behind,
342 bitmapsize, 0, major)
343 )
344 st->ss->write_bitmap(st, fd2);
345 else {
346 fprintf(stderr, Name ": failed to create internal bitmap - chunksize problem.\n");
347 close(fd2);
348 return 1;
349 }
350 }
351 close(fd2);
352 }
353 }
354 array.state |= (1<<MD_SB_BITMAP_PRESENT);
355 if (ioctl(fd, SET_ARRAY_INFO, &array)!= 0) {
356 if (errno == EBUSY)
357 fprintf(stderr, Name
358 ": Cannot add bitmap while array is"
359 " resyncing or reshaping etc.\n");
360 fprintf(stderr, Name ": failed to set internal bitmap.\n");
361 return 1;
362 }
363 } else {
364 int uuid[4];
365 int bitmap_fd;
366 int d;
367 int max_devs = st->max_devs;
368
369 /* try to load a superblock */
370 for (d=0; d<max_devs; d++) {
371 mdu_disk_info_t disk;
372 char *dv;
373 int fd2;
374 disk.number = d;
375 if (ioctl(fd, GET_DISK_INFO, &disk) < 0)
376 continue;
377 if ((disk.major==0 && disk.minor==0) ||
378 (disk.state & (1<<MD_DISK_REMOVED)))
379 continue;
380 dv = map_dev(disk.major, disk.minor, 1);
381 if (!dv) continue;
382 fd2 = dev_open(dv, O_RDONLY);
383 if (fd2 >= 0 &&
384 st->ss->load_super(st, fd2, NULL) == 0) {
385 close(fd2);
386 st->ss->uuid_from_super(st, uuid);
387 break;
388 }
389 close(fd2);
390 }
391 if (d == max_devs) {
392 fprintf(stderr, Name ": cannot find UUID for array!\n");
393 return 1;
394 }
395 if (CreateBitmap(file, force, (char*)uuid, chunk,
396 delay, write_behind, bitmapsize, major)) {
397 return 1;
398 }
399 bitmap_fd = open(file, O_RDWR);
400 if (bitmap_fd < 0) {
401 fprintf(stderr, Name ": weird: %s cannot be opened\n",
402 file);
403 return 1;
404 }
405 if (ioctl(fd, SET_BITMAP_FILE, bitmap_fd) < 0) {
406 int err = errno;
407 if (errno == EBUSY)
408 fprintf(stderr, Name
409 ": Cannot add bitmap while array is"
410 " resyncing or reshaping etc.\n");
411 fprintf(stderr, Name ": Cannot set bitmap file for %s: %s\n",
412 devname, strerror(err));
413 return 1;
414 }
415 }
416
417 return 0;
418 }
419
420
421 /*
422 * When reshaping an array we might need to backup some data.
423 * This is written to all spares with a 'super_block' describing it.
424 * The superblock goes 4K from the end of the used space on the
425 * device.
426 * It if written after the backup is complete.
427 * It has the following structure.
428 */
429
430 static struct mdp_backup_super {
431 char magic[16]; /* md_backup_data-1 or -2 */
432 __u8 set_uuid[16];
433 __u64 mtime;
434 /* start/sizes in 512byte sectors */
435 __u64 devstart; /* address on backup device/file of data */
436 __u64 arraystart;
437 __u64 length;
438 __u32 sb_csum; /* csum of preceeding bytes. */
439 __u32 pad1;
440 __u64 devstart2; /* offset in to data of second section */
441 __u64 arraystart2;
442 __u64 length2;
443 __u32 sb_csum2; /* csum of preceeding bytes. */
444 __u8 pad[512-68-32];
445 } __attribute__((aligned(512))) bsb, bsb2;
446
447 static __u32 bsb_csum(char *buf, int len)
448 {
449 int i;
450 int csum = 0;
451 for (i=0; i<len; i++)
452 csum = (csum<<3) + buf[0];
453 return __cpu_to_le32(csum);
454 }
455
456 static int check_idle(struct supertype *st)
457 {
458 /* Check that all member arrays for this container, or the
459 * container of this array, are idle
460 */
461 int container_dev = (st->container_dev != NoMdDev
462 ? st->container_dev : st->devnum);
463 char container[40];
464 struct mdstat_ent *ent, *e;
465 int is_idle = 1;
466
467 fmt_devname(container, container_dev);
468 ent = mdstat_read(0, 0);
469 for (e = ent ; e; e = e->next) {
470 if (!is_container_member(e, container))
471 continue;
472 if (e->percent >= 0) {
473 is_idle = 0;
474 break;
475 }
476 }
477 free_mdstat(ent);
478 return is_idle;
479 }
480
481 static int freeze_container(struct supertype *st)
482 {
483 int container_dev = (st->container_dev != NoMdDev
484 ? st->container_dev : st->devnum);
485 char container[40];
486
487 if (!check_idle(st))
488 return -1;
489
490 fmt_devname(container, container_dev);
491
492 if (block_monitor(container, 1)) {
493 fprintf(stderr, Name ": failed to freeze container\n");
494 return -2;
495 }
496
497 return 1;
498 }
499
500 static void unfreeze_container(struct supertype *st)
501 {
502 int container_dev = (st->container_dev != NoMdDev
503 ? st->container_dev : st->devnum);
504 char container[40];
505
506 fmt_devname(container, container_dev);
507
508 unblock_monitor(container, 1);
509 }
510
511 static int freeze(struct supertype *st)
512 {
513 /* Try to freeze resync/rebuild on this array/container.
514 * Return -1 if the array is busy,
515 * return -2 container cannot be frozen,
516 * return 0 if this kernel doesn't support 'frozen'
517 * return 1 if it worked.
518 */
519 if (st->ss->external)
520 return freeze_container(st);
521 else {
522 struct mdinfo *sra = sysfs_read(-1, st->devnum, GET_VERSION);
523 int err;
524
525 if (!sra)
526 return -1;
527 err = sysfs_freeze_array(sra);
528 sysfs_free(sra);
529 return err;
530 }
531 }
532
533 static void unfreeze(struct supertype *st)
534 {
535 if (st->ss->external)
536 return unfreeze_container(st);
537 else {
538 struct mdinfo *sra = sysfs_read(-1, st->devnum, GET_VERSION);
539
540 if (sra)
541 sysfs_set_str(sra, NULL, "sync_action", "idle");
542 else
543 fprintf(stderr, Name ": failed to unfreeze array\n");
544 sysfs_free(sra);
545 }
546 }
547
/*
 * Block until the array's sync_action attribute can no longer be read
 * or no longer starts with "reshape".
 *
 * Between reads the function sleeps in select(), with the attribute's
 * descriptor in the exception set and no timeout.
 */
static void wait_reshape(struct mdinfo *sra)
{
	char action[20];
	int fd;

	fd = sysfs_get_fd(sra, NULL, "sync_action");
	if (fd < 0)
		return;

	for (;;) {
		fd_set rfds;

		if (sysfs_fd_get_str(fd, action, 20) <= 0)
			break;
		if (strncmp(action, "reshape", 7) != 0)
			break;

		FD_ZERO(&rfds);
		FD_SET(fd, &rfds);
		select(fd+1, NULL, NULL, &rfds, NULL);
	}
	close(fd);
}
565
566 static int reshape_super(struct supertype *st, long long size, int level,
567 int layout, int chunksize, int raid_disks,
568 int delta_disks, char *backup_file, char *dev,
569 int verbose)
570 {
571 /* nothing extra to check in the native case */
572 if (!st->ss->external)
573 return 0;
574 if (!st->ss->reshape_super ||
575 !st->ss->manage_reshape) {
576 fprintf(stderr, Name ": %s metadata does not support reshape\n",
577 st->ss->name);
578 return 1;
579 }
580
581 return st->ss->reshape_super(st, size, level, layout, chunksize,
582 raid_disks, delta_disks, backup_file, dev,
583 verbose);
584 }
585
586 static void sync_metadata(struct supertype *st)
587 {
588 if (st->ss->external) {
589 if (st->update_tail) {
590 flush_metadata_updates(st);
591 st->update_tail = &st->updates;
592 } else
593 st->ss->sync_metadata(st);
594 }
595 }
596
/* when dealing with external metadata subarrays we need to be
 * prepared to handle EAGAIN.  The kernel may need to wait for
 * mdmon to mark the array active so the kernel can handle
 * allocations/writeback when preparing the reshape action
 * (md_allow_write()).  We temporarily disable safe_mode_delay
 * to close a race with the array_state going clean before the
 * next write to raid_disks / stripe_cache_size
 *
 * Returns the result of the sysfs write, or -1 if the current
 * safe_mode_delay could not be read.
 */
static int subarray_set_num(char *container, struct mdinfo *sra, char *name, int n)
{
	char saved_delay[50];
	int rc;

	/* no container: plain write */
	if (container == NULL)
		return sysfs_set_num(sra, NULL, name, n);

	/* only 'raid_disks' and 'stripe_cache_size' trigger md_allow_write */
	if (strcmp(name, "raid_disks") != 0 &&
	    strcmp(name, "stripe_cache_size") != 0)
		return sysfs_set_num(sra, NULL, name, n);

	if (sysfs_get_str(sra, NULL, "safe_mode_delay",
			  saved_delay, sizeof(saved_delay)) <= 0)
		return -1;

	sysfs_set_num(sra, NULL, "safe_mode_delay", 0);

	rc = sysfs_set_num(sra, NULL, name, n);
	if (rc < 0 && errno == EAGAIN) {
		ping_monitor(container);
		/* if we get EAGAIN here then the monitor is not active
		 * so stop trying
		 */
		rc = sysfs_set_num(sra, NULL, name, n);
	}

	/* put back the delay that was in force on entry */
	sysfs_set_str(sra, NULL, "safe_mode_delay", saved_delay);
	return rc;
}
631
632 int start_reshape(struct mdinfo *sra, int already_running)
633 {
634 int err;
635 sysfs_set_num(sra, NULL, "suspend_lo", 0x7FFFFFFFFFFFFFFFULL);
636 err = sysfs_set_num(sra, NULL, "suspend_hi", 0);
637 err = err ?: sysfs_set_num(sra, NULL, "suspend_lo", 0);
638 if (!already_running)
639 sysfs_set_num(sra, NULL, "sync_min", 0);
640 err = err ?: sysfs_set_num(sra, NULL, "sync_max", 0);
641 if (!already_running)
642 err = err ?: sysfs_set_str(sra, NULL, "sync_action", "reshape");
643
644 return err;
645 }
646
647 void abort_reshape(struct mdinfo *sra)
648 {
649 sysfs_set_str(sra, NULL, "sync_action", "idle");
650 sysfs_set_num(sra, NULL, "suspend_lo", 0x7FFFFFFFFFFFFFFFULL);
651 sysfs_set_num(sra, NULL, "suspend_hi", 0);
652 sysfs_set_num(sra, NULL, "suspend_lo", 0);
653 sysfs_set_num(sra, NULL, "sync_min", 0);
654 sysfs_set_str(sra, NULL, "sync_max", "max");
655 }
656
657 int remove_disks_for_takeover(struct supertype *st,
658 struct mdinfo *sra,
659 int layout)
660 {
661 int nr_of_copies;
662 struct mdinfo *remaining;
663 int slot;
664
665 if (sra->array.level == 10)
666 nr_of_copies = layout & 0xff;
667 else if (sra->array.level == 1)
668 nr_of_copies = sra->array.raid_disks;
669 else
670 return 1;
671
672 remaining = sra->devs;
673 sra->devs = NULL;
674 /* for each 'copy', select one device and remove from the list. */
675 for (slot = 0; slot < sra->array.raid_disks; slot += nr_of_copies) {
676 struct mdinfo **diskp;
677 int found = 0;
678
679 /* Find a working device to keep */
680 for (diskp = &remaining; *diskp ; diskp = &(*diskp)->next) {
681 struct mdinfo *disk = *diskp;
682
683 if (disk->disk.raid_disk < slot)
684 continue;
685 if (disk->disk.raid_disk >= slot + nr_of_copies)
686 continue;
687 if (disk->disk.state & (1<<MD_DISK_REMOVED))
688 continue;
689 if (disk->disk.state & (1<<MD_DISK_FAULTY))
690 continue;
691 if (!(disk->disk.state & (1<<MD_DISK_SYNC)))
692 continue;
693
694 /* We have found a good disk to use! */
695 *diskp = disk->next;
696 disk->next = sra->devs;
697 sra->devs = disk;
698 found = 1;
699 break;
700 }
701 if (!found)
702 break;
703 }
704
705 if (slot < sra->array.raid_disks) {
706 /* didn't find all slots */
707 struct mdinfo **e;
708 e = &remaining;
709 while (*e)
710 e = &(*e)->next;
711 *e = sra->devs;
712 sra->devs = remaining;
713 return 1;
714 }
715
716 /* Remove all 'remaining' devices from the array */
717 while (remaining) {
718 struct mdinfo *sd = remaining;
719 remaining = sd->next;
720
721 sysfs_set_str(sra, sd, "state", "faulty");
722 sysfs_set_str(sra, sd, "slot", "none");
723 /* for external metadata disks should be removed in mdmon */
724 if (!st->ss->external)
725 sysfs_set_str(sra, sd, "state", "remove");
726 sd->disk.state |= (1<<MD_DISK_REMOVED);
727 sd->disk.state &= ~(1<<MD_DISK_SYNC);
728 sd->next = sra->devs;
729 sra->devs = sd;
730 }
731 return 0;
732 }
733
/*
 * Close every non-negative descriptor in fdlist[0..size-1], then free
 * both the 'fdlist' and 'offsets' arrays.
 */
void reshape_free_fdlist(int *fdlist,
			 unsigned long long *offsets,
			 int size)
{
	int idx = 0;

	while (idx < size) {
		int one = fdlist[idx++];

		if (one >= 0)
			close(one);
	}

	free(fdlist);
	free(offsets);
}
747
748 int reshape_prepare_fdlist(char *devname,
749 struct mdinfo *sra,
750 int raid_disks,
751 int nrdisks,
752 unsigned long blocks,
753 char *backup_file,
754 int *fdlist,
755 unsigned long long *offsets)
756 {
757 int d = 0;
758 struct mdinfo *sd;
759
760 for (d = 0; d <= nrdisks; d++)
761 fdlist[d] = -1;
762 d = raid_disks;
763 for (sd = sra->devs; sd; sd = sd->next) {
764 if (sd->disk.state & (1<<MD_DISK_FAULTY))
765 continue;
766 if (sd->disk.state & (1<<MD_DISK_SYNC)) {
767 char *dn = map_dev(sd->disk.major,
768 sd->disk.minor, 1);
769 fdlist[sd->disk.raid_disk]
770 = dev_open(dn, O_RDONLY);
771 offsets[sd->disk.raid_disk] = sd->data_offset*512;
772 if (fdlist[sd->disk.raid_disk] < 0) {
773 fprintf(stderr,
774 Name ": %s: cannot open component %s\n",
775 devname, dn ? dn : "-unknown-");
776 d = -1;
777 goto release;
778 }
779 } else if (backup_file == NULL) {
780 /* spare */
781 char *dn = map_dev(sd->disk.major,
782 sd->disk.minor, 1);
783 fdlist[d] = dev_open(dn, O_RDWR);
784 offsets[d] = (sd->data_offset + sra->component_size - blocks - 8)*512;
785 if (fdlist[d] < 0) {
786 fprintf(stderr, Name ": %s: cannot open component %s\n",
787 devname, dn ? dn : "-unknown-");
788 d = -1;
789 goto release;
790 }
791 d++;
792 }
793 }
794 release:
795 return d;
796 }
797
798 int reshape_open_backup_file(char *backup_file,
799 int fd,
800 char *devname,
801 long blocks,
802 int *fdlist,
803 unsigned long long *offsets,
804 int restart)
805 {
806 /* Return 1 on success, 0 on any form of failure */
807 /* need to check backup file is large enough */
808 char buf[512];
809 struct stat stb;
810 unsigned int dev;
811 int i;
812
813 *fdlist = open(backup_file, O_RDWR|O_CREAT|(restart ? O_TRUNC : O_EXCL),
814 S_IRUSR | S_IWUSR);
815 *offsets = 8 * 512;
816 if (*fdlist < 0) {
817 fprintf(stderr, Name ": %s: cannot create backup file %s: %s\n",
818 devname, backup_file, strerror(errno));
819 return 0;
820 }
821 /* Guard against backup file being on array device.
822 * If array is partitioned or if LVM etc is in the
823 * way this will not notice, but it is better than
824 * nothing.
825 */
826 fstat(*fdlist, &stb);
827 dev = stb.st_dev;
828 fstat(fd, &stb);
829 if (stb.st_rdev == dev) {
830 fprintf(stderr, Name ": backup file must NOT be"
831 " on the array being reshaped.\n");
832 close(*fdlist);
833 return 0;
834 }
835
836 memset(buf, 0, 512);
837 for (i=0; i < blocks + 8 ; i++) {
838 if (write(*fdlist, buf, 512) != 512) {
839 fprintf(stderr, Name ": %s: cannot create"
840 " backup file %s: %s\n",
841 devname, backup_file, strerror(errno));
842 return 0;
843 }
844 }
845 if (fsync(*fdlist) != 0) {
846 fprintf(stderr, Name ": %s: cannot create backup file %s: %s\n",
847 devname, backup_file, strerror(errno));
848 return 0;
849 }
850
851 return 1;
852 }
853
/*
 * Compute how many 512-byte sectors must be backed up at a time.
 *
 * nchunk, ochunk: new and old chunk sizes in bytes.
 * ndata, odata:   new and old number of data disks.
 *
 * We need an amount of data which is both a whole number of
 * old stripes and a whole number of new stripes.
 * So LCM for (chunksize*datadisks).
 *
 * Returns that LCM in sectors, or 0 when either stripe size works out
 * as zero (chunk smaller than 512 bytes, or no data disks), for which
 * no common multiple exists.
 */
unsigned long compute_backup_blocks(int nchunk, int ochunk,
				    unsigned int ndata, unsigned int odata)
{
	unsigned long a, b, gcd, rem;

	/* stripe sizes in sectors, widened before multiplying */
	a = (unsigned long)(ochunk/512) * odata;
	b = (unsigned long)(nchunk/512) * ndata;

	/* A zero operand would make the GCD search meaningless and the
	 * final division undefined.
	 */
	if (a == 0 || b == 0)
		return 0;

	/* Find GCD (Euclid) */
	gcd = a;
	rem = b;
	while (rem != 0) {
		unsigned long t = gcd % rem;

		gcd = rem;
		rem = t;
	}

	/* LCM == product / GCD; divide first to keep the intermediate small */
	return (a / gcd) * b;
}
877
878 char *analyse_change(struct mdinfo *info, struct reshape *re)
879 {
880 /* Based on the current array state in info->array and
881 * the changes in info->new_* etc, determine:
882 * - whether the change is possible
883 * - Intermediate level/raid_disks/layout
884 * - whether a restriping reshape is needed
885 * - number of sectors in minimum change unit. This
886 * will cover a whole number of stripes in 'before' and
887 * 'after'.
888 *
889 * Return message if the change should be rejected
890 * NULL if the change can be achieved
891 *
892 * This can be called as part of starting a reshape, or
893 * when assembling an array that is undergoing reshape.
894 */
895 int new_disks;
896 /* delta_parity records change in number of devices
897 * caused by level change
898 */
899 int delta_parity = 0;
900
901 /* If a new level not explicitly given, we assume no-change */
902 if (info->new_level == UnSet)
903 info->new_level = info->array.level;
904
905 if (info->new_chunk)
906 switch (info->new_level) {
907 case 0:
908 case 4:
909 case 5:
910 case 6:
911 case 10:
912 /* chunk size is meaningful, must divide component_size
913 * evenly
914 */
915 if (info->component_size % (info->new_chunk/512))
916 return "New chunk size does not"
917 " divide component size";
918 break;
919 default:
920 return "chunk size not meaningful for this level";
921 }
922 else
923 info->new_chunk = info->array.chunk_size;
924
925 switch (info->array.level) {
926 case 1:
927 /* RAID1 can convert to RAID1 with different disks, or
928 * raid5 with 2 disks, or
929 * raid0 with 1 disk
930 */
931 if (info->new_level == 0) {
932 if (info->delta_disks != UnSet &&
933 info->delta_disks != 0)
934 return "Cannot change number of disks "
935 "with RAID1->RAID0 conversion";
936 re->level = 0;
937 re->before.data_disks = 1;
938 re->after.data_disks = 1;
939 re->before.layout = 0;
940 re->backup_blocks = 0;
941 re->parity = 0;
942 return NULL;
943 }
944 if (info->new_level == 1) {
945 if (info->delta_disks == UnSet)
946 /* Don't know what to do */
947 return "no change requested for Growing RAID1";
948 re->level = 1;
949 re->backup_blocks = 0;
950 re->parity = 0;
951 return NULL;
952 }
953 if (info->array.raid_disks == 2 &&
954 info->new_level == 5) {
955 if (info->delta_disks != UnSet &&
956 info->delta_disks != 0)
957 return "Cannot change number of disks "
958 "with RAID1->RAID5 conversion";
959 re->level = 5;
960 re->before.data_disks = 1;
961 re->after.data_disks = 1;
962 re->before.layout = ALGORITHM_LEFT_SYMMETRIC;
963 info->array.chunk_size = 65536;
964 break;
965 }
966 /* Could do some multi-stage conversions, but leave that to
967 * later.
968 */
969 return "Impossibly level change request for RAID1";
970
971 case 10:
972 /* RAID10 can only be converted from near mode to
973 * RAID0 by removing some devices
974 */
975 if ((info->array.layout & ~0xff) != 0x100)
976 return "Cannot Grow RAID10 with far/offset layout";
977 /* number of devices must be multiple of number of copies */
978 if (info->array.raid_disks % (info->array.layout & 0xff))
979 return "RAID10 layout too complex for Grow operation";
980
981 if (info->new_level != 0)
982 return "RAID10 can only be changed to RAID0";
983 new_disks = (info->array.raid_disks
984 / (info->array.layout & 0xff));
985 if (info->delta_disks == UnSet)
986 info->delta_disks = (new_disks
987 - info->array.raid_disks);
988
989 if (info->delta_disks != new_disks - info->array.raid_disks)
990 return "New number of raid-devices impossible for RAID10";
991 if (info->new_chunk &&
992 info->new_chunk != info->array.chunk_size)
993 return "Cannot change chunk-size with RAID10 Grow";
994
995 /* looks good */
996 re->level = 0;
997 re->parity = 0;
998 re->before.data_disks = new_disks;
999 re->after.data_disks = re->before.data_disks;
1000 re->before.layout = 0;
1001 re->backup_blocks = 0;
1002 return NULL;
1003
1004 case 0:
1005 /* RAID0 can be converted to RAID10, or to RAID456 */
1006 if (info->new_level == 10) {
1007 if (info->new_layout == UnSet && info->delta_disks == UnSet) {
1008 /* Assume near=2 layout */
1009 info->new_layout = 0x102;
1010 info->delta_disks = info->array.raid_disks;
1011 }
1012 if (info->new_layout == UnSet) {
1013 int copies = 1 + (info->delta_disks
1014 / info->array.raid_disks);
1015 if (info->array.raid_disks * (copies-1)
1016 != info->delta_disks)
1017 return "Impossible number of devices"
1018 " for RAID0->RAID10";
1019 info->new_layout = 0x100 + copies;
1020 }
1021 if (info->delta_disks == UnSet) {
1022 int copies = info->new_layout & 0xff;
1023 if (info->new_layout != 0x100 + copies)
1024 return "New layout impossible"
1025 " for RAID0->RAID10";;
1026 info->delta_disks = (copies - 1) *
1027 info->array.raid_disks;
1028 }
1029 if (info->new_chunk &&
1030 info->new_chunk != info->array.chunk_size)
1031 return "Cannot change chunk-size with RAID0->RAID10";
1032 /* looks good */
1033 re->level = 10;
1034 re->parity = 0;
1035 re->before.data_disks = (info->array.raid_disks +
1036 info->delta_disks);
1037 re->after.data_disks = re->before.data_disks;
1038 re->before.layout = info->new_layout;
1039 re->backup_blocks = 0;
1040 return NULL;
1041 }
1042
1043 /* RAID0 can also covert to RAID0/4/5/6 by first converting to
1044 * a raid4 style layout of the final level.
1045 */
1046 switch (info->new_level) {
1047 case 4:
1048 delta_parity = 1;
1049 case 0:
1050 re->level = 4;
1051 re->before.layout = 0;
1052 break;
1053 case 5:
1054 delta_parity = 1;
1055 re->level = 5;
1056 re->before.layout = ALGORITHM_PARITY_N;
1057 break;
1058 case 6:
1059 delta_parity = 2;
1060 re->level = 6;
1061 re->before.layout = ALGORITHM_PARITY_N;
1062 break;
1063 default:
1064 return "Impossible level change requested";
1065 }
1066 re->before.data_disks = info->array.raid_disks;
1067 /* determining 'after' layout happens outside this 'switch' */
1068 break;
1069
1070 case 4:
1071 info->array.layout = ALGORITHM_PARITY_N;
1072 case 5:
1073 switch (info->new_level) {
1074 case 0:
1075 delta_parity = -1;
1076 case 4:
1077 re->level = info->array.level;
1078 re->before.data_disks = info->array.raid_disks - 1;
1079 re->before.layout = info->array.layout;
1080 break;
1081 case 5:
1082 re->level = 5;
1083 re->before.data_disks = info->array.raid_disks - 1;
1084 re->before.layout = info->array.layout;
1085 break;
1086 case 6:
1087 delta_parity = 1;
1088 re->level = 6;
1089 re->before.data_disks = info->array.raid_disks - 1;
1090 switch (info->array.layout) {
1091 case ALGORITHM_LEFT_ASYMMETRIC:
1092 re->before.layout = ALGORITHM_LEFT_ASYMMETRIC_6;
1093 break;
1094 case ALGORITHM_RIGHT_ASYMMETRIC:
1095 re->before.layout = ALGORITHM_RIGHT_ASYMMETRIC_6;
1096 break;
1097 case ALGORITHM_LEFT_SYMMETRIC:
1098 re->before.layout = ALGORITHM_LEFT_SYMMETRIC_6;
1099 break;
1100 case ALGORITHM_RIGHT_SYMMETRIC:
1101 re->before.layout = ALGORITHM_RIGHT_SYMMETRIC_6;
1102 break;
1103 case ALGORITHM_PARITY_0:
1104 re->before.layout = ALGORITHM_PARITY_0_6;
1105 break;
1106 case ALGORITHM_PARITY_N:
1107 re->before.layout = ALGORITHM_PARITY_N_6;
1108 break;
1109 default:
1110 return "Cannot convert an array with this layout";
1111 }
1112 break;
1113 case 1:
1114 if (info->array.raid_disks != 2)
1115 return "Can only convert a 2-device array to RAID1";
1116 if (info->delta_disks != UnSet &&
1117 info->delta_disks != 0)
1118 return "Cannot set raid_disk when "
1119 "converting RAID5->RAID1";
1120 re->level = 1;
1121 break;
1122 default:
1123 return "Impossible level change requested";
1124 }
1125 break;
1126 case 6:
1127 switch (info->new_level) {
1128 case 4:
1129 case 5:
1130 delta_parity = -1;
1131 case 6:
1132 re->level = 6;
1133 re->before.data_disks = info->array.raid_disks - 2;
1134 re->before.layout = info->array.layout;
1135 break;
1136 default:
1137 return "Impossible level change requested";
1138 }
1139 break;
1140 }
1141
1142 /* If we reached here then it looks like a re-stripe is
1143 * happening. We have determined the intermediate level
1144 * and initial raid_disks/layout and stored these in 're'.
1145 *
1146 * We need to deduce the final layout that can be atomically
1147 * converted to the end state.
1148 */
1149 switch (info->new_level) {
1150 case 0:
1151 /* We can only get to RAID0 from RAID4 or RAID5
1152 * with appropriate layout and one extra device
1153 */
1154 if (re->level != 4 && re->level != 5)
1155 return "Cannot covert to RAID0 from this level";
1156
1157 switch (re->level) {
1158 case 4:
1159 re->after.layout = 0 ; break;
1160 case 5:
1161 re->after.layout = ALGORITHM_PARITY_N; break;
1162 }
1163 break;
1164
1165 case 4:
1166 /* We can only get to RAID4 from RAID5 */
1167 if (re->level != 4 && re->level != 5)
1168 return "Cannot convert to RAID4 from this level";
1169
1170 switch (re->level) {
1171 case 4:
1172 re->after.layout = 0 ; break;
1173 case 5:
1174 re->after.layout = ALGORITHM_PARITY_N; break;
1175 }
1176 break;
1177
1178 case 5:
1179 /* We get to RAID5 for RAID5 or RAID6 */
1180 if (re->level != 5 && re->level != 6)
1181 return "Cannot convert to RAID5 from this level";
1182
1183 switch (re->level) {
1184 case 5:
1185 if (info->new_layout == UnSet)
1186 re->after.layout = re->before.layout;
1187 else
1188 re->after.layout = info->new_layout;
1189 break;
1190 case 6:
1191 if (info->new_layout == UnSet)
1192 info->new_layout = re->before.layout;
1193
1194 /* after.layout needs to be raid6 version of new_layout */
1195 if (info->new_layout == ALGORITHM_PARITY_N)
1196 re->after.layout = ALGORITHM_PARITY_N;
1197 else {
1198 char layout[40];
1199 char *ls = map_num(r5layout, info->new_layout);
1200 int l;
1201 strcat(strcpy(layout, ls), "-6");
1202 l = map_name(r6layout, layout);
1203 if (l == UnSet)
1204 return "Cannot find RAID6 layout"
1205 " to convert to";
1206 re->after.layout = l;
1207 }
1208 }
1209 break;
1210
1211 case 6:
1212 /* We must already be at level 6 */
1213 if (re->level != 6)
1214 return "Impossible level change";
1215 if (info->new_layout == UnSet)
1216 re->after.layout = info->array.layout;
1217 else
1218 re->after.layout = info->new_layout;
1219 break;
1220 default:
1221 return "Impossible level change requested";
1222 }
1223 if (info->delta_disks == UnSet)
1224 info->delta_disks = delta_parity;
1225
1226 re->after.data_disks = (re->before.data_disks
1227 + info->delta_disks
1228 - delta_parity);
1229 switch (re->level) {
1230 case 6: re->parity = 2; break;
1231 case 4:
1232 case 5: re->parity = 1; break;
1233 default: re->parity = 0; break;
1234 }
1235 /* So we have a restripe operation, we need to calculate the number
1236 * of blocks per reshape operation.
1237 */
1238 if (info->new_chunk == 0)
1239 info->new_chunk = info->array.chunk_size;
1240 if (re->after.data_disks == re->before.data_disks &&
1241 re->after.layout == re->before.layout &&
1242 info->new_chunk == info->array.chunk_size) {
1243 /* Nothing to change */
1244 re->backup_blocks = 0;
1245 return NULL;
1246 }
1247 if (re->after.data_disks == 1 && re->before.data_disks == 1) {
1248 /* chunk and layout changes make no difference */
1249 re->backup_blocks = 0;
1250 return NULL;
1251 }
1252
1253 if (re->after.data_disks == re->before.data_disks &&
1254 get_linux_version() < 2006032)
1255 return "in-place reshape is not safe before 2.6.32 - sorry.";
1256
1257 if (re->after.data_disks < re->before.data_disks &&
1258 get_linux_version() < 2006030)
1259 return "reshape to fewer devices is not supported before 2.6.32 - sorry.";
1260
1261 re->backup_blocks = compute_backup_blocks(
1262 info->new_chunk, info->array.chunk_size,
1263 re->after.data_disks,
1264 re->before.data_disks);
1265
1266 re->new_size = info->component_size * re->after.data_disks;
1267 return NULL;
1268 }
1269
/* Forward declarations of the two reshape workers.  Grow_reshape()
 * calls them before their definitions appear further down this file:
 * reshape_array() handles a single array, reshape_container() handles
 * every member array of a container.
 */
static int reshape_array(char *container, int fd, char *devname,
			 struct supertype *st, struct mdinfo *info,
			 int force, char *backup_file, int quiet,
			 int forked, int restart);
static int reshape_container(char *container, int cfd, char *devname,
			     struct supertype *st, struct mdinfo *info,
			     int force, char *backup_file, int quiet);
1280
1281 int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
1282 long long size,
1283 int level, char *layout_str, int chunksize, int raid_disks,
1284 int force)
1285 {
1286 /* Make some changes in the shape of an array.
1287 * The kernel must support the change.
1288 *
1289 * There are three different changes. Each can trigger
1290 * a resync or recovery so we freeze that until we have
1291 * requested everything (if kernel supports freezing - 2.6.30).
1292 * The steps are:
1293 * - change size (i.e. component_size)
1294 * - change level
1295 * - change layout/chunksize/ndisks
1296 *
1297 * The last can require a reshape. It is different on different
1298 * levels so we need to check the level before actioning it.
1299 * Some times the level change needs to be requested after the
1300 * reshape (e.g. raid6->raid5, raid5->raid0)
1301 *
1302 */
1303 struct mdu_array_info_s array;
1304 int rv = 0;
1305 struct supertype *st;
1306 char *subarray = NULL;
1307
1308 int frozen;
1309 int changed = 0;
1310 char *container = NULL;
1311 char container_buf[20];
1312 int cfd = -1;
1313
1314 struct mdinfo info;
1315 struct mdinfo *sra;
1316
1317 if (ioctl(fd, GET_ARRAY_INFO, &array) < 0) {
1318 fprintf(stderr, Name ": %s is not an active md array - aborting\n",
1319 devname);
1320 return 1;
1321 }
1322
1323 if (size >= 0 &&
1324 (chunksize || level!= UnSet || layout_str || raid_disks)) {
1325 fprintf(stderr, Name ": cannot change component size at the same time "
1326 "as other changes.\n"
1327 " Change size first, then check data is intact before "
1328 "making other changes.\n");
1329 return 1;
1330 }
1331
1332 if (raid_disks && raid_disks < array.raid_disks && array.level > 1 &&
1333 get_linux_version() < 2006032 &&
1334 !check_env("MDADM_FORCE_FEWER")) {
1335 fprintf(stderr, Name ": reducing the number of devices is not safe before Linux 2.6.32\n"
1336 " Please use a newer kernel\n");
1337 return 1;
1338 }
1339
1340 st = super_by_fd(fd, &subarray);
1341 if (!st) {
1342 fprintf(stderr, Name ": Unable to determine metadata format for %s\n", devname);
1343 return 1;
1344 }
1345 if (raid_disks > st->max_devs) {
1346 fprintf(stderr, Name ": Cannot increase raid-disks on this array"
1347 " beyond %d\n", st->max_devs);
1348 return 1;
1349 }
1350
1351 /* in the external case we need to check that the requested reshape is
1352 * supported, and perform an initial check that the container holds the
1353 * pre-requisite spare devices (mdmon owns final validation)
1354 */
1355 if (st->ss->external) {
1356 int container_dev;
1357 int rv;
1358
1359 if (subarray) {
1360 container_dev = st->container_dev;
1361 cfd = open_dev_excl(st->container_dev);
1362 } else {
1363 container_dev = st->devnum;
1364 close(fd);
1365 cfd = open_dev_excl(st->devnum);
1366 fd = cfd;
1367 }
1368 if (cfd < 0) {
1369 fprintf(stderr, Name ": Unable to open container for %s\n",
1370 devname);
1371 free(subarray);
1372 return 1;
1373 }
1374
1375 fmt_devname(container_buf, container_dev);
1376 container = container_buf;
1377
1378 rv = st->ss->load_container(st, cfd, NULL);
1379
1380 if (rv) {
1381 fprintf(stderr, Name ": Cannot read superblock for %s\n",
1382 devname);
1383 free(subarray);
1384 return 1;
1385 }
1386
1387 if (mdmon_running(container_dev))
1388 st->update_tail = &st->updates;
1389 }
1390
1391 if (raid_disks > array.raid_disks &&
1392 array.spare_disks < (raid_disks - array.raid_disks) &&
1393 !force) {
1394 fprintf(stderr,
1395 Name ": Need %d spare%s to avoid degraded array,"
1396 " and only have %d.\n"
1397 " Use --force to over-ride this check.\n",
1398 raid_disks - array.raid_disks,
1399 raid_disks - array.raid_disks == 1 ? "" : "s",
1400 array.spare_disks);
1401 return 1;
1402 }
1403
1404 sra = sysfs_read(fd, 0, GET_LEVEL | GET_DISKS | GET_DEVS
1405 | GET_STATE | GET_VERSION);
1406 if (sra) {
1407 if (st->ss->external && subarray == NULL) {
1408 array.level = LEVEL_CONTAINER;
1409 sra->array.level = LEVEL_CONTAINER;
1410 }
1411 } else {
1412 fprintf(stderr, Name ": failed to read sysfs parameters for %s\n",
1413 devname);
1414 return 1;
1415 }
1416 frozen = freeze(st);
1417 if (frozen < -1) {
1418 /* freeze() already spewed the reason */
1419 return 1;
1420 } else if (frozen < 0) {
1421 fprintf(stderr, Name ": %s is performing resync/recovery and cannot"
1422 " be reshaped\n", devname);
1423 return 1;
1424 }
1425
1426 /* ========= set size =============== */
1427 if (size >= 0 && (size == 0 || size != array.size)) {
1428 long long orig_size = array.size;
1429
1430 if (reshape_super(st, size, UnSet, UnSet, 0, 0, UnSet, NULL,
1431 devname, !quiet)) {
1432 rv = 1;
1433 goto release;
1434 }
1435 sync_metadata(st);
1436 array.size = size;
1437 if (array.size != size) {
1438 /* got truncated to 32bit, write to
1439 * component_size instead
1440 */
1441 if (sra)
1442 rv = sysfs_set_num(sra, NULL,
1443 "component_size", size);
1444 else
1445 rv = -1;
1446 } else
1447 rv = ioctl(fd, SET_ARRAY_INFO, &array);
1448 if (rv != 0) {
1449 int err = errno;
1450
1451 /* restore metadata */
1452 if (reshape_super(st, orig_size, UnSet, UnSet, 0, 0,
1453 UnSet, NULL, devname, !quiet) == 0)
1454 sync_metadata(st);
1455 fprintf(stderr, Name ": Cannot set device size for %s: %s\n",
1456 devname, strerror(err));
1457 if (err == EBUSY &&
1458 (array.state & (1<<MD_SB_BITMAP_PRESENT)))
1459 fprintf(stderr, " Bitmap must be removed before size can be changed\n");
1460 rv = 1;
1461 goto release;
1462 }
1463 ioctl(fd, GET_ARRAY_INFO, &array);
1464 size = get_component_size(fd)/2;
1465 if (size == 0)
1466 size = array.size;
1467 if (!quiet)
1468 fprintf(stderr, Name ": component size of %s has been set to %lluK\n",
1469 devname, size);
1470 changed = 1;
1471 } else if (array.level != LEVEL_CONTAINER) {
1472 size = get_component_size(fd)/2;
1473 if (size == 0)
1474 size = array.size;
1475 }
1476
1477 /* ========= check for Raid10/Raid1 -> Raid0 conversion ===============
1478 * current implementation assumes that following conditions must be met:
1479 * - RAID10:
1480 * - far_copies == 1
1481 * - near_copies == 2
1482 */
1483 if ((level == 0 && array.level == 10 && sra &&
1484 array.layout == ((1 << 8) + 2) && !(array.raid_disks & 1)) ||
1485 (level == 0 && array.level == 1 && sra)) {
1486 int err;
1487 err = remove_disks_for_takeover(st, sra, array.layout);
1488 if (err) {
1489 dprintf(Name": Array cannot be reshaped\n");
1490 if (cfd > -1)
1491 close(cfd);
1492 rv = 1;
1493 goto release;
1494 }
1495 /* FIXME this is added with no justification - why is it here */
1496 ping_monitor(container);
1497 }
1498
1499 info.array = array;
1500 sysfs_init(&info, fd, NoMdDev);
1501 strcpy(info.text_version, sra->text_version);
1502 info.component_size = size*2;
1503 info.new_level = level;
1504 info.new_chunk = chunksize * 1024;
1505 if (raid_disks)
1506 info.delta_disks = raid_disks - info.array.raid_disks;
1507 else
1508 info.delta_disks = UnSet;
1509 if (layout_str == NULL) {
1510 info.new_layout = UnSet;
1511 if (info.array.level == 6 &&
1512 (info.new_level == 6 || info.new_level == UnSet) &&
1513 info.array.layout >= 16) {
1514 fprintf(stderr, Name
1515 ": %s has a non-standard layout. If you"
1516 " wish to preserve this\n"
1517 " during the reshape, please specify"
1518 " --layout=preserve\n"
1519 " If you want to change it, specify a"
1520 " layout or use --layout=normalise\n",
1521 devname);
1522 rv = 1;
1523 goto release;
1524 }
1525 } else if (strcmp(layout_str, "normalise") == 0 ||
1526 strcmp(layout_str, "normalize") == 0) {
1527 /* If we have a -6 RAID6 layout, remove the '-6'. */
1528 info.new_layout = UnSet;
1529 if (info.array.level == 6 && info.new_level == UnSet) {
1530 char l[40], *h;
1531 strcpy(l, map_num(r6layout, info.array.layout));
1532 h = strrchr(l, '-');
1533 if (h && strcmp(h, "-6") == 0) {
1534 *h = 0;
1535 info.new_layout = map_name(r6layout, l);
1536 }
1537 }
1538 } else if (strcmp(layout_str, "preserve") == 0) {
1539 info.new_layout = UnSet;
1540 } else {
1541 int l = info.new_level;
1542 if (l == UnSet)
1543 l = info.array.level;
1544 switch (l) {
1545 case 5:
1546 info.new_layout = map_name(r5layout, layout_str);
1547 break;
1548 case 6:
1549 info.new_layout = map_name(r6layout, layout_str);
1550 break;
1551 case 10:
1552 info.new_layout = parse_layout_10(layout_str);
1553 break;
1554 case LEVEL_FAULTY:
1555 info.new_layout = parse_layout_faulty(layout_str);
1556 break;
1557 default:
1558 fprintf(stderr, Name ": layout not meaningful"
1559 " with this level\n");
1560 rv = 1;
1561 goto release;
1562 }
1563 if (info.new_layout == UnSet) {
1564 fprintf(stderr, Name ": layout %s not understood"
1565 " for this level\n",
1566 layout_str);
1567 rv = 1;
1568 goto release;
1569 }
1570 }
1571
1572 if (array.level == LEVEL_CONTAINER) {
1573 /* This change is to be applied to every array in the
1574 * container. This is only needed when the metadata imposes
1575 * restraints of the various arrays in the container.
1576 * Currently we only know that IMSM requires all arrays
1577 * to have the same number of devices so changing the
1578 * number of devices (On-Line Capacity Expansion) must be
1579 * performed at the level of the container
1580 */
1581 rv = reshape_container(container, fd, devname, st, &info,
1582 force, backup_file, quiet);
1583 frozen = 0;
1584 } else {
1585 /* get spare devices from external metadata
1586 */
1587 if (st->ss->external) {
1588 struct mdinfo *info2;
1589
1590 info2 = st->ss->container_content(st, subarray);
1591 if (info2) {
1592 info.array.spare_disks =
1593 info2->array.spare_disks;
1594 sysfs_free(info2);
1595 }
1596 }
1597
1598 /* Impose these changes on a single array. First
1599 * check that the metadata is OK with the change. */
1600
1601 if (reshape_super(st, info.component_size, info.new_level,
1602 info.new_layout, info.new_chunk,
1603 info.array.raid_disks, info.delta_disks,
1604 backup_file, devname, quiet)) {
1605 rv = 1;
1606 goto release;
1607 }
1608 sync_metadata(st);
1609 rv = reshape_array(container, fd, devname, st, &info, force,
1610 backup_file, quiet, 0, 0);
1611 frozen = 0;
1612 }
1613 release:
1614 if (frozen > 0)
1615 unfreeze(st);
1616 return rv;
1617 }
1618
1619 static int reshape_array(char *container, int fd, char *devname,
1620 struct supertype *st, struct mdinfo *info,
1621 int force,
1622 char *backup_file, int quiet, int forked,
1623 int restart)
1624 {
1625 struct reshape reshape;
1626 int spares_needed;
1627 char *msg;
1628 int orig_level = UnSet;
1629 int disks, odisks;
1630
1631 struct mdu_array_info_s array;
1632 char *c;
1633
1634 int *fdlist;
1635 unsigned long long *offsets;
1636 int d;
1637 int nrdisks;
1638 int err;
1639 unsigned long blocks;
1640 unsigned long cache;
1641 unsigned long long array_size;
1642 int done;
1643 struct mdinfo *sra = NULL;
1644
1645 msg = analyse_change(info, &reshape);
1646 if (msg) {
1647 fprintf(stderr, Name ": %s\n", msg);
1648 goto release;
1649 }
1650 if (ioctl(fd, GET_ARRAY_INFO, &array) != 0) {
1651 dprintf("Cannot get array information.\n");
1652 goto release;
1653 }
1654
1655 if (restart) {
1656 /* reshape already started. just skip to monitoring the reshape */
1657 if (reshape.backup_blocks == 0)
1658 return 0;
1659 goto started;
1660 }
1661 /* The container is frozen but the array may not be.
1662 * So freeze the array so spares don't get put to the wrong use
1663 * FIXME there should probably be a cleaner separation between
1664 * freeze_array and freeze_container.
1665 */
1666 sysfs_freeze_array(info);
1667 spares_needed = max(reshape.before.data_disks,
1668 reshape.after.data_disks)
1669 + reshape.parity - array.raid_disks;
1670
1671 if (!force &&
1672 info->new_level > 1 &&
1673 spares_needed > info->array.spare_disks) {
1674 fprintf(stderr,
1675 Name ": Need %d spare%s to avoid degraded array,"
1676 " and only have %d.\n"
1677 " Use --force to over-ride this check.\n",
1678 spares_needed,
1679 spares_needed == 1 ? "" : "s",
1680 info->array.spare_disks);
1681 goto release;
1682 }
1683
1684 if (reshape.level != info->array.level) {
1685 char *c = map_num(pers, reshape.level);
1686 int err;
1687 if (c == NULL)
1688 goto release;
1689
1690 err = sysfs_set_str(info, NULL, "level", c);
1691 if (err) {
1692 err = errno;
1693 fprintf(stderr, Name ": %s: could not set level to %s\n",
1694 devname, c);
1695 if (err == EBUSY &&
1696 (info->array.state & (1<<MD_SB_BITMAP_PRESENT)))
1697 fprintf(stderr, " Bitmap must be removed"
1698 " before level can be changed\n");
1699 goto release;
1700 }
1701 if (!quiet)
1702 fprintf(stderr, Name ": level of %s changed to %s\n",
1703 devname, c);
1704 orig_level = info->array.level;
1705 sysfs_freeze_array(info);
1706
1707 if (reshape.level > 0 && st->ss->external) {
1708 /* make sure mdmon is aware of the new level */
1709 if (!mdmon_running(st->container_dev))
1710 start_mdmon(st->container_dev);
1711 ping_monitor(container);
1712 }
1713 }
1714 /* ->reshape_super might have chosen some spares from the
1715 * container that it wants to be part of the new array.
1716 * We can collect them with ->container_content and give
1717 * them to the kernel.
1718 */
1719 if (st->ss->reshape_super && st->ss->container_content) {
1720 char *subarray = strchr(info->text_version+1, '/')+1;
1721 struct mdinfo *info2 =
1722 st->ss->container_content(st, subarray);
1723 struct mdinfo *d;
1724
1725 if (info2) {
1726 sysfs_init(info2, fd, st->devnum);
1727 for (d = info2->devs; d; d = d->next) {
1728 if (d->disk.state == 0 &&
1729 d->disk.raid_disk >= 0) {
1730 /* This is a spare that wants to
1731 * be part of the array.
1732 */
1733 add_disk(fd, st, info2, d);
1734 }
1735 }
1736 sysfs_free(info2);
1737 }
1738 }
1739
1740 if (reshape.backup_blocks == 0) {
1741 /* No restriping needed, but we might need to impose
1742 * some more changes: layout, raid_disks, chunk_size
1743 */
1744 /* read current array info */
1745 if (ioctl(fd, GET_ARRAY_INFO, &array) != 0) {
1746 dprintf("Canot get array information.\n");
1747 goto release;
1748 }
1749 /* compare current array info with new values and if
1750 * it is different update them to new */
1751 if (info->new_layout != UnSet &&
1752 info->new_layout != array.layout) {
1753 array.layout = info->new_layout;
1754 if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) {
1755 fprintf(stderr, Name ": failed to set new layout\n");
1756 goto release;
1757 } else if (!quiet)
1758 printf("layout for %s set to %d\n",
1759 devname, array.layout);
1760 }
1761 if (info->delta_disks != UnSet &&
1762 info->delta_disks != 0 &&
1763 array.raid_disks != (info->array.raid_disks + info->delta_disks)) {
1764 array.raid_disks += info->delta_disks;
1765 if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) {
1766 fprintf(stderr, Name ": failed to set raid disks\n");
1767 goto release;
1768 } else if (!quiet) {
1769 printf("raid_disks for %s set to %d\n",
1770 devname, array.raid_disks);
1771 }
1772 }
1773 if (info->new_chunk != 0 &&
1774 info->new_chunk != array.chunk_size) {
1775 if (sysfs_set_num(info, NULL,
1776 "chunk_size", info->new_chunk) != 0) {
1777 fprintf(stderr, Name ": failed to set chunk size\n");
1778 goto release;
1779 } else if (!quiet)
1780 printf("chunk size for %s set to %d\n",
1781 devname, array.chunk_size);
1782 }
1783 unfreeze(st);
1784 return 0;
1785 }
1786
1787 /*
1788 * There are three possibilities.
1789 * 1/ The array will shrink.
1790 * We need to ensure the reshape will pause before reaching
1791 * the 'critical section'. We also need to fork and wait for
1792 * that to happen. When it does we
1793 * suspend/backup/complete/unfreeze
1794 *
1795 * 2/ The array will not change size.
1796 * This requires that we keep a backup of a sliding window
1797 * so that we can restore data after a crash. So we need
1798 * to fork and monitor progress.
1799 * In future we will allow the data_offset to change, so
1800 * a sliding backup becomes unnecessary.
1801 *
1802 * 3/ The array will grow. This is relatively easy.
1803 * However the kernel's restripe routines will cheerfully
1804 * overwrite some early data before it is safe. So we
1805 * need to make a backup of the early parts of the array
1806 * and be ready to restore it if rebuild aborts very early.
1807 * For externally managed metadata, we still need a forked
1808 * child to monitor the reshape and suspend IO over the region
1809 * that is being reshaped.
1810 *
1811 * We backup data by writing it to one spare, or to a
1812 * file which was given on command line.
1813 *
1814 * In each case, we first make sure that storage is available
1815 * for the required backup.
1816 * Then we:
1817 * - request the shape change.
1818 * - fork to handle backup etc.
1819 */
1820 started:
1821 /* Check that we can hold all the data */
1822 get_dev_size(fd, NULL, &array_size);
1823 if (reshape.new_size < (array_size/512)) {
1824 fprintf(stderr,
1825 Name ": this change will reduce the size of the array.\n"
1826 " use --grow --array-size first to truncate array.\n"
1827 " e.g. mdadm --grow %s --array-size %llu\n",
1828 devname, reshape.new_size/2);
1829 goto release;
1830 }
1831
1832 sra = sysfs_read(fd, 0,
1833 GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE|GET_CHUNK|
1834 GET_CACHE);
1835 if (!sra) {
1836 fprintf(stderr, Name ": %s: Cannot get array details from sysfs\n",
1837 devname);
1838 goto release;
1839 }
1840
1841 /* Decide how many blocks (sectors) for a reshape
1842 * unit. The number we have so far is just a minimum
1843 */
1844 blocks = reshape.backup_blocks;
1845 if (reshape.before.data_disks ==
1846 reshape.after.data_disks) {
1847 /* Make 'blocks' bigger for better throughput, but
1848 * not so big that we reject it below.
1849 * Try for 16 megabytes
1850 */
1851 while (blocks * 32 < sra->component_size &&
1852 blocks < 16*1024*2)
1853 blocks *= 2;
1854 } else
1855 fprintf(stderr, Name ": Need to backup %luK of critical "
1856 "section..\n", blocks/2);
1857
1858 if (blocks >= sra->component_size/2) {
1859 fprintf(stderr, Name ": %s: Something wrong"
1860 " - reshape aborted\n",
1861 devname);
1862 goto release;
1863 }
1864
1865 /* Now we need to open all these devices so we can read/write.
1866 */
1867 nrdisks = max(reshape.before.data_disks,
1868 reshape.after.data_disks) + reshape.parity
1869 + sra->array.spare_disks;
1870 fdlist = malloc((1+nrdisks) * sizeof(int));
1871 offsets = malloc((1+nrdisks) * sizeof(offsets[0]));
1872 if (!fdlist || !offsets) {
1873 fprintf(stderr, Name ": malloc failed: grow aborted\n");
1874 goto release;
1875 }
1876
1877 odisks = reshape.before.data_disks + reshape.parity;
1878 d = reshape_prepare_fdlist(devname, sra, odisks,
1879 nrdisks, blocks, backup_file,
1880 fdlist, offsets);
1881 if (d < 0) {
1882 goto release;
1883 }
1884 if (backup_file == NULL) {
1885 if (reshape.after.data_disks <= reshape.before.data_disks) {
1886 fprintf(stderr,
1887 Name ": %s: Cannot grow - need backup-file\n",
1888 devname);
1889 goto release;
1890 } else if (sra->array.spare_disks == 0) {
1891 fprintf(stderr, Name ": %s: Cannot grow - need a spare or "
1892 "backup-file to backup critical section\n",
1893 devname);
1894 goto release;
1895 }
1896 } else {
1897 if (!reshape_open_backup_file(backup_file, fd, devname,
1898 (signed)blocks,
1899 fdlist+d, offsets+d, restart)) {
1900 goto release;
1901 }
1902 d++;
1903 }
1904
1905 /* lastly, check that the internal stripe cache is
1906 * large enough, or it won't work.
1907 * It must hold at least 4 stripes of the larger
1908 * chunk size
1909 */
1910 cache = max(info->array.chunk_size, info->new_chunk);
1911 cache *= 4; /* 4 stripes minimum */
1912 cache /= 512; /* convert to sectors */
1913 disks = min(reshape.before.data_disks, reshape.after.data_disks);
1914 /* make sure there is room for 'blocks' with a bit to spare */
1915 if (cache < 16 + blocks / disks)
1916 cache = 16 + blocks / disks;
1917 cache /= (4096/512); /* Covert from sectors to pages */
1918
1919 if (sra->cache_size < cache)
1920 subarray_set_num(container, sra, "stripe_cache_size",
1921 cache+1);
1922
1923 /* Right, everything seems fine. Let's kick things off.
1924 * If only changing raid_disks, use ioctl, else use
1925 * sysfs.
1926 */
1927 sync_metadata(st);
1928
1929 sra->new_chunk = info->new_chunk;
1930
1931 if (info->reshape_active)
1932 sra->reshape_progress = info->reshape_progress;
1933 else {
1934 sra->reshape_progress = 0;
1935 if (reshape.after.data_disks < reshape.before.data_disks)
1936 /* start from the end of the new array */
1937 sra->reshape_progress = (sra->component_size
1938 * reshape.after.data_disks);
1939 }
1940
1941 if (info->array.chunk_size == info->new_chunk &&
1942 reshape.before.layout == reshape.after.layout &&
1943 st->ss->external == 0) {
1944 /* use SET_ARRAY_INFO but only if reshape hasn't started */
1945 ioctl(fd, GET_ARRAY_INFO, &array);
1946 array.raid_disks = reshape.after.data_disks + reshape.parity;
1947 if (!info->reshape_active &&
1948 ioctl(fd, SET_ARRAY_INFO, &array) != 0) {
1949 int err = errno;
1950
1951 fprintf(stderr,
1952 Name ": Cannot set device shape for %s: %s\n",
1953 devname, strerror(errno));
1954
1955 if (err == EBUSY &&
1956 (array.state & (1<<MD_SB_BITMAP_PRESENT)))
1957 fprintf(stderr,
1958 " Bitmap must be removed before"
1959 " shape can be changed\n");
1960
1961 goto release;
1962 }
1963 } else if (info->reshape_active && !st->ss->external) {
1964 /* We don't need to set anything here for internal
1965 * metadata, and for kernels before 2.6.38 we can
1966 * fail if we try.
1967 */
1968 } else {
1969 /* set them all just in case some old 'new_*' value
1970 * persists from some earlier problem.
1971 * We even set them when restarting in the middle. They will
1972 * already be set in that case so this will be a no-op,
1973 * but it is hard to tell the difference.
1974 */
1975 int err = 0;
1976 if (sysfs_set_num(sra, NULL, "chunk_size", info->new_chunk) < 0)
1977 err = errno;
1978 if (!err && sysfs_set_num(sra, NULL, "layout",
1979 reshape.after.layout) < 0)
1980 err = errno;
1981 if (!err && subarray_set_num(container, sra, "raid_disks",
1982 reshape.after.data_disks +
1983 reshape.parity) < 0)
1984 err = errno;
1985 if (err) {
1986 fprintf(stderr, Name ": Cannot set device shape for %s\n",
1987 devname);
1988
1989 if (err == EBUSY &&
1990 (array.state & (1<<MD_SB_BITMAP_PRESENT)))
1991 fprintf(stderr,
1992 " Bitmap must be removed before"
1993 " shape can be changed\n");
1994 goto release;
1995 }
1996 }
1997
1998 err = start_reshape(sra, (info->reshape_active && !st->ss->external));
1999 if (err) {
2000 fprintf(stderr, Name ": Cannot start reshape for %s\n",
2001 devname);
2002 goto release;
2003 }
2004 if (restart)
2005 sysfs_set_str(sra, NULL, "array_state", "active");
2006
2007 /* Now we just need to kick off the reshape and watch, while
2008 * handling backups of the data...
2009 * This is all done by a forked background process.
2010 */
2011 switch(forked ? 0 : fork()) {
2012 case -1:
2013 fprintf(stderr, Name ": Cannot run child to monitor reshape: %s\n",
2014 strerror(errno));
2015 abort_reshape(sra);
2016 goto release;
2017 default:
2018 return 0;
2019 case 0:
2020 break;
2021 }
2022
2023 close(fd);
2024 if (check_env("MDADM_GROW_VERIFY"))
2025 fd = open(devname, O_RDONLY | O_DIRECT);
2026 else
2027 fd = -1;
2028 mlockall(MCL_FUTURE);
2029
2030 if (st->ss->external) {
2031 /* metadata handler takes it from here */
2032 done = st->ss->manage_reshape(
2033 fd, sra, &reshape, st, blocks,
2034 fdlist, offsets,
2035 d - odisks, fdlist+odisks,
2036 offsets+odisks);
2037 } else
2038 done = child_monitor(
2039 fd, sra, &reshape, st, blocks,
2040 fdlist, offsets,
2041 d - odisks, fdlist+odisks,
2042 offsets+odisks);
2043
2044 if (backup_file && done)
2045 unlink(backup_file);
2046 if (!done) {
2047 abort_reshape(sra);
2048 goto out;
2049 }
2050
2051 if (!st->ss->external &&
2052 !(reshape.before.data_disks != reshape.after.data_disks
2053 && info->custom_array_size) &&
2054 info->new_level == reshape.level &&
2055 !forked) {
2056 /* no need to wait for the reshape to finish as
2057 * there is nothing more to do.
2058 */
2059 exit(0);
2060 }
2061 wait_reshape(sra);
2062
2063 if (st->ss->external) {
2064 /* Re-load the metadata as much could have changed */
2065 int cfd = open_dev(st->container_dev);
2066 if (cfd >= 0) {
2067 ping_monitor(container);
2068 st->ss->free_super(st);
2069 st->ss->load_container(st, cfd, container);
2070 close(cfd);
2071 }
2072 }
2073
2074 /* set new array size if required customer_array_size is used
2075 * by this metadata.
2076 */
2077 if (reshape.before.data_disks !=
2078 reshape.after.data_disks &&
2079 info->custom_array_size) {
2080 struct mdinfo *info2;
2081 char *subarray = strchr(info->text_version+1, '/')+1;
2082
2083 info2 = st->ss->container_content(st, subarray);
2084 if (info2) {
2085 unsigned long long current_size = 0;
2086 unsigned long long new_size =
2087 info2->custom_array_size/2;
2088
2089 if (sysfs_get_ll(sra,
2090 NULL,
2091 "array_size",
2092 &current_size) == 0 &&
2093 new_size > current_size) {
2094 if (sysfs_set_num(sra, NULL,
2095 "array_size", new_size)
2096 < 0)
2097 dprintf("Error: Cannot"
2098 " set array size");
2099 else
2100 dprintf("Array size "
2101 "changed");
2102 dprintf(" from %llu to %llu.\n",
2103 current_size, new_size);
2104 }
2105 sysfs_free(info2);
2106 }
2107 }
2108
2109 if (info->new_level != reshape.level) {
2110
2111 c = map_num(pers, info->new_level);
2112 if (c) {
2113 err = sysfs_set_str(sra, NULL, "level", c);
2114 if (err)
2115 fprintf(stderr, Name\
2116 ": %s: could not set level "
2117 "to %s\n", devname, c);
2118 }
2119 }
2120 out:
2121 if (forked)
2122 return 0;
2123 unfreeze(st);
2124 exit(0);
2125
2126 release:
2127 if (orig_level != UnSet && sra) {
2128 c = map_num(pers, orig_level);
2129 if (c && sysfs_set_str(sra, NULL, "level", c) == 0)
2130 fprintf(stderr, Name ": aborting level change\n");
2131 }
2132 if (!forked)
2133 unfreeze(st);
2134 return 1;
2135 }
2136
2137 int reshape_container(char *container, int cfd, char *devname,
2138 struct supertype *st,
2139 struct mdinfo *info,
2140 int force,
2141 char *backup_file,
2142 int quiet)
2143 {
2144 struct mdinfo *cc = NULL;
2145
2146 /* component_size is not meaningful for a container,
2147 * so pass '-1' meaning 'no change'
2148 */
2149 if (reshape_super(st, -1, info->new_level,
2150 info->new_layout, info->new_chunk,
2151 info->array.raid_disks, info->delta_disks,
2152 backup_file, devname, quiet)) {
2153 unfreeze(st);
2154 return 1;
2155 }
2156
2157 sync_metadata(st);
2158
2159 /* ping monitor to be sure that update is on disk
2160 */
2161 ping_monitor(container);
2162
2163 switch (fork()) {
2164 case -1: /* error */
2165 perror("Cannot fork to complete reshape\n");
2166 unfreeze(st);
2167 return 1;
2168 default: /* parent */
2169 printf(Name ": multi-array reshape continues in background\n");
2170 return 0;
2171 case 0: /* child */
2172 break;
2173 }
2174
2175 while(1) {
2176 /* For each member array with reshape_active,
2177 * we need to perform the reshape.
2178 * We pick the first array that needs reshaping and
2179 * reshape it. reshape_array() will re-read the metadata
2180 * so the next time through a different array should be
2181 * ready for reshape.
2182 */
2183 struct mdinfo *content;
2184 int rv;
2185 int fd;
2186 struct mdstat_ent *mdstat;
2187 char *adev;
2188
2189 sysfs_free(cc);
2190
2191 cc = st->ss->container_content(st, NULL);
2192
2193 for (content = cc; content ; content = content->next) {
2194 char *subarray;
2195 if (!content->reshape_active)
2196 continue;
2197
2198 subarray = strchr(content->text_version+1, '/')+1;
2199 mdstat = mdstat_by_subdev(subarray,
2200 devname2devnum(container));
2201 if (!mdstat)
2202 continue;
2203 break;
2204 }
2205 if (!content)
2206 break;
2207
2208 fd = open_dev(mdstat->devnum);
2209 if (fd < 0)
2210 break;
2211 adev = map_dev(dev2major(mdstat->devnum),
2212 dev2minor(mdstat->devnum),
2213 0);
2214 if (!adev)
2215 adev = content->text_version;
2216
2217 sysfs_init(content, fd, mdstat->devnum);
2218
2219 rv = reshape_array(container, fd, adev, st,
2220 content, force,
2221 backup_file, quiet, 1, 0);
2222 close(fd);
2223 if (rv)
2224 break;
2225 }
2226 unfreeze(st);
2227 sysfs_free(cc);
2228 exit(0);
2229 }
2230
2231 /*
2232 * We run a child process in the background which performs the following
2233 * steps:
2234 * - wait for resync to reach a certain point
2235 * - suspend io to the following section
2236 * - backup that section
2237 * - allow resync to proceed further
2238 * - resume io
2239 * - discard the backup.
2240 *
2241 * These are combined in slightly different ways in the three cases.
2242 * Grow:
2243 * - suspend/backup/allow/wait/resume/discard
2244 * Shrink:
2245 * - allow/wait/suspend/backup/allow/wait/resume/discard
2246 * same-size:
2247 * - wait/resume/discard/suspend/backup/allow
2248 *
2249 * suspend/backup/allow always come together
2250 * wait/resume/discard do too.
2251 * For the same-size case we have two backups to improve flow.
2252 *
2253 */
2254
2255 int progress_reshape(struct mdinfo *info, struct reshape *reshape,
2256 unsigned long long backup_point,
2257 unsigned long long wait_point,
2258 unsigned long long *suspend_point,
2259 unsigned long long *reshape_completed)
2260 {
2261 /* This function is called repeatedly by the reshape manager.
2262 * It determines how much progress can safely be made and allows
2263 * that progress.
2264 * - 'info' identifies the array and particularly records in
2265 * ->reshape_progress the metadata's knowledge of progress
2266 * This is a sector offset from the start of the array
2267 * of the next array block to be relocated. This number
2268 * may increase from 0 or decrease from array_size, depending
2269 * on the type of reshape that is happening.
2270 * Note that in contrast, 'sync_completed' is a block count of the
2271 * reshape so far. It gives the distance between the start point
2272 * (head or tail of device) and the next place that data will be
2273 * written. It always increases.
2274 * - 'reshape' is the structure created by analyse_change
2275 * - 'backup_point' shows how much the metadata manager has backed-up
2276 * data. For reshapes with increasing progress, it is the next address
2277 * to be backed up, previous addresses have been backed-up. For
2278 * decreasing progress, it is the earliest address that has been
2279 * backed up - later address are also backed up.
2280 * So addresses between reshape_progress and backup_point are
2281 * backed up providing those are in the 'correct' order.
2282 * - 'wait_point' is an array address. When reshape_completed
2283 * passes this point, progress_reshape should return. It might
2284 * return earlier if it determines that ->reshape_progress needs
2285 * to be updated or further backup is needed.
2286 * - suspend_point is maintained by progress_reshape and the caller
2287 * should not touch it except to initialise to zero.
2288 * It is an array address and it only increases in 2.6.37 and earlier.
2289 * This makes it difficult to handle reducing reshapes with
2290 * external metadata.
2291 * However: it is similar to backup_point in that it records the
2292 * other end of a suspended region from reshape_progress.
2293 * it is moved to extend the region that is safe to backup and/or
2294 * reshape
2295 * - reshape_completed is read from sysfs and returned. The caller
2296 * should copy this into ->reshape_progress when it has reason to
2297 * believe that the metadata knows this, and any backup outside this
2298 * has been erased.
2299 *
2300 * Return value is:
2301 * 1 if more data from backup_point - but only as far as suspend_point,
2302 * should be backed up
2303 * 0 if things are progressing smoothly
2304 * -1 if the reshape is finished, either because it is all done,
2305 * or due to an error.
2306 */
2307
2308 int advancing = (reshape->after.data_disks
2309 >= reshape->before.data_disks);
2310 unsigned long long need_backup; /* All data between start of array and
2311 * here will at some point need to
2312 * be backed up.
2313 */
2314 unsigned long long read_offset, write_offset;
2315 unsigned long long write_range;
2316 unsigned long long max_progress, target, completed;
2317 unsigned long long array_size = (info->component_size
2318 * reshape->before.data_disks);
2319 int fd;
2320 char buf[20];
2321
2322 /* First, we unsuspend any region that is now known to be safe.
2323 * If suspend_point is on the 'wrong' side of reshape_progress, then
2324 * we don't have or need suspension at the moment. This is true for
2325 * native metadata when we don't need to back-up.
2326 */
2327 if (advancing) {
2328 if (info->reshape_progress <= *suspend_point)
2329 sysfs_set_num(info, NULL, "suspend_lo",
2330 info->reshape_progress);
2331 } else {
2332 /* Note: this won't work in 2.6.37 and before.
2333 * Something somewhere should make sure we don't need it!
2334 */
2335 if (info->reshape_progress >= *suspend_point)
2336 sysfs_set_num(info, NULL, "suspend_hi",
2337 info->reshape_progress);
2338 }
2339
2340 /* Now work out how far it is safe to progress.
2341 * If the read_offset for ->reshape_progress is less than
2342 * 'blocks' beyond the write_offset, we can only progress as far
2343 * as a backup.
2344 * Otherwise we can progress until the write_offset for the new location
2345 * reaches (within 'blocks' of) the read_offset at the current location.
2346 * However that region must be suspended unless we are using native
2347 * metadata.
2348 * If we need to suspend more, we limit it to 128M per device, which is
2349 * rather arbitrary and should be some time-based calculation.
2350 */
2351 read_offset = info->reshape_progress / reshape->before.data_disks;
2352 write_offset = info->reshape_progress / reshape->after.data_disks;
2353 write_range = info->new_chunk/512;
2354 if (reshape->before.data_disks == reshape->after.data_disks)
2355 need_backup = array_size;
2356 else
2357 need_backup = reshape->backup_blocks;
2358 if (advancing) {
2359 if (read_offset < write_offset + write_range)
2360 max_progress = backup_point;
2361 else
2362 max_progress =
2363 read_offset *
2364 reshape->after.data_disks;
2365 } else {
2366 if (read_offset > write_offset - write_range)
2367 /* Can only progress as far as has been backed up,
2368 * which must be suspended */
2369 max_progress = backup_point;
2370 else if (info->reshape_progress <= need_backup)
2371 max_progress = backup_point;
2372 else {
2373 if (info->array.major_version >= 0)
2374 /* Can progress until backup is needed */
2375 max_progress = need_backup;
2376 else {
2377 /* Can progress until metadata update is required */
2378 max_progress =
2379 read_offset *
2380 reshape->after.data_disks;
2381 /* but data must be suspended */
2382 if (max_progress < *suspend_point)
2383 max_progress = *suspend_point;
2384 }
2385 }
2386 }
2387
2388 /* We know it is safe to progress to 'max_progress' providing
2389 * it is suspended or we are using native metadata.
2390 * Consider extending suspend_point 128M per device if it
2391 * is less than 64M per device beyond reshape_progress.
2392 * But always do a multiple of 'blocks'
2393 * FIXME this is too big - it takes to long to complete
2394 * this much.
2395 */
2396 target = 64*1024*2 * min(reshape->before.data_disks,
2397 reshape->after.data_disks);
2398 target /= reshape->backup_blocks;
2399 if (target < 2)
2400 target = 2;
2401 target *= reshape->backup_blocks;
2402
2403 /* For externally managed metadata we always need to suspend IO to
2404 * the area being reshaped so we regularly push suspend_point forward.
2405 * For native metadata we only need the suspend if we are going to do
2406 * a backup.
2407 */
2408 if (advancing) {
2409 if ((need_backup > info->reshape_progress
2410 || info->array.major_version < 0) &&
2411 *suspend_point < info->reshape_progress + target) {
2412 if (need_backup < *suspend_point + 2 * target)
2413 *suspend_point = need_backup;
2414 else if (*suspend_point + 2 * target < array_size)
2415 *suspend_point += 2 * target;
2416 else
2417 *suspend_point = array_size;
2418 sysfs_set_num(info, NULL, "suspend_hi", *suspend_point);
2419 if (max_progress > *suspend_point)
2420 max_progress = *suspend_point;
2421 }
2422 } else {
2423 if (info->array.major_version >= 0) {
2424 /* Only need to suspend when about to backup */
2425 if (info->reshape_progress < need_backup * 2 &&
2426 *suspend_point > 0) {
2427 *suspend_point = 0;
2428 sysfs_set_num(info, NULL, "suspend_lo", 0);
2429 sysfs_set_num(info, NULL, "suspend_hi", need_backup);
2430 }
2431 } else {
2432 /* Need to suspend continually */
2433 if (info->reshape_progress < *suspend_point)
2434 *suspend_point = info->reshape_progress;
2435 if (*suspend_point + target < info->reshape_progress)
2436 /* No need to move suspend region yet */;
2437 else {
2438 if (*suspend_point >= 2 * target)
2439 *suspend_point -= 2 * target;
2440 else
2441 *suspend_point = 0;
2442 sysfs_set_num(info, NULL, "suspend_lo",
2443 *suspend_point);
2444 }
2445 if (max_progress < *suspend_point)
2446 max_progress = *suspend_point;
2447 }
2448 }
2449
2450 /* now set sync_max to allow that progress. sync_max, like
2451 * sync_completed is a count of sectors written per device, so
2452 * we find the difference between max_progress and the start point,
2453 * and divide that by after.data_disks to get a sync_max
2454 * number.
2455 * At the same time we convert wait_point to a similar number
2456 * for comparing against sync_completed.
2457 */
2458 /* scale down max_progress to per_disk */
2459 max_progress /= reshape->after.data_disks;
2460 /* Round to chunk size as some kernels give an erroneously high number */
2461 max_progress /= info->new_chunk/512;
2462 max_progress *= info->new_chunk/512;
2463 /* And round to old chunk size as the kernel wants that */
2464 max_progress /= info->array.chunk_size/512;
2465 max_progress *= info->array.chunk_size/512;
2466 /* Limit progress to the whole device */
2467 if (max_progress > info->component_size)
2468 max_progress = info->component_size;
2469 wait_point /= reshape->after.data_disks;
2470 if (!advancing) {
2471 /* switch from 'device offset' to 'processed block count' */
2472 max_progress = info->component_size - max_progress;
2473 wait_point = info->component_size - wait_point;
2474 }
2475
2476 sysfs_set_num(info, NULL, "sync_max", max_progress);
2477
2478 /* Now wait. If we have already reached the point that we were
2479 * asked to wait to, don't wait at all, else wait for any change.
2480 * We need to select on 'sync_completed' as that is the place that
2481 * notifications happen, but we are really interested in
2482 * 'reshape_position'
2483 */
2484 fd = sysfs_get_fd(info, NULL, "sync_completed");
2485 if (fd < 0)
2486 goto check_progress;
2487
2488 if (sysfs_fd_get_ll(fd, &completed) < 0) {
2489 close(fd);
2490 goto check_progress;
2491 }
2492 while (completed < max_progress && completed < wait_point) {
2493 /* Check that sync_action is still 'reshape' to avoid
2494 * waiting forever on a dead array
2495 */
2496 char action[20];
2497 fd_set rfds;
2498 if (sysfs_get_str(info, NULL, "sync_action",
2499 action, 20) <= 0 ||
2500 strncmp(action, "reshape", 7) != 0)
2501 break;
2502 /* Some kernels reset 'sync_completed' to zero
2503 * before setting 'sync_action' to 'idle'.
2504 * So we need these extra tests.
2505 */
2506 if (completed == 0 && advancing
2507 && info->reshape_progress > 0)
2508 break;
2509 if (completed == 0 && !advancing
2510 && info->reshape_progress < (info->component_size
2511 * reshape->after.data_disks))
2512 break;
2513 FD_ZERO(&rfds);
2514 FD_SET(fd, &rfds);
2515 select(fd+1, NULL, NULL, &rfds, NULL);
2516 if (sysfs_fd_get_ll(fd, &completed) < 0) {
2517 close(fd);
2518 goto check_progress;
2519 }
2520 }
2521 /* Some kernels reset 'sync_completed' to zero,
2522 * we need to have real point we are in md
2523 */
2524 if (completed == 0)
2525 completed = max_progress;
2526
2527 /* some kernels can give an incorrectly high 'completed' number */
2528 completed /= (info->new_chunk/512);
2529 completed *= (info->new_chunk/512);
2530 /* Convert 'completed' back in to a 'progress' number */
2531 completed *= reshape->after.data_disks;
2532 if (!advancing) {
2533 completed = info->component_size * reshape->after.data_disks
2534 - completed;
2535 }
2536 *reshape_completed = completed;
2537
2538 close(fd);
2539
2540 /* We return the need_backup flag. Caller will decide
2541 * how much - a multiple of ->backup_blocks up to *suspend_point
2542 */
2543 if (advancing)
2544 return need_backup > info->reshape_progress;
2545 else
2546 return need_backup >= info->reshape_progress;
2547
2548 check_progress:
2549 /* if we couldn't read a number from sync_completed, then
2550 * either the reshape did complete, or it aborted.
2551 * We can tell which by checking for 'none' in reshape_position.
2552 */
2553 strcpy(buf, "hi");
2554 if (sysfs_get_str(info, NULL, "reshape_position", buf, sizeof(buf)) < 0
2555 || strncmp(buf, "none", 4) != 0)
2556 return -2; /* abort */
2557 else {
2558 /* Maybe racing with array shutdown - check state */
2559 if (sysfs_get_str(info, NULL, "array_state", buf, sizeof(buf)) < 0
2560 || strncmp(buf, "inactive", 8) == 0
2561 || strncmp(buf, "clear",5) == 0)
2562 return -2; /* abort */
2563 return -1; /* complete */
2564 }
2565 }
2566
2567
2568 /* FIXME return status is never checked */
2569 static int grow_backup(struct mdinfo *sra,
2570 unsigned long long offset, /* per device */
2571 unsigned long stripes, /* per device, in old chunks */
2572 int *sources, unsigned long long *offsets,
2573 int disks, int chunk, int level, int layout,
2574 int dests, int *destfd, unsigned long long *destoffsets,
2575 int part, int *degraded,
2576 char *buf)
2577 {
2578 /* Backup 'blocks' sectors at 'offset' on each device of the array,
2579 * to storage 'destfd' (offset 'destoffsets'), after first
2580 * suspending IO. Then allow resync to continue
2581 * over the suspended section.
2582 * Use part 'part' of the backup-super-block.
2583 */
2584 int odata = disks;
2585 int rv = 0;
2586 int i;
2587 unsigned long long ll;
2588 int new_degraded;
2589 //printf("offset %llu\n", offset);
2590 if (level >= 4)
2591 odata--;
2592 if (level == 6)
2593 odata--;
2594
2595 /* Check that array hasn't become degraded, else we might backup the wrong data */
2596 if (sysfs_get_ll(sra, NULL, "degraded", &ll) < 0)
2597 return -1; /* FIXME this error is ignored */
2598 new_degraded = (int)ll;
2599 if (new_degraded != *degraded) {
2600 /* check each device to ensure it is still working */
2601 struct mdinfo *sd;
2602 for (sd = sra->devs ; sd ; sd = sd->next) {
2603 if (sd->disk.state & (1<<MD_DISK_FAULTY))
2604 continue;
2605 if (sd->disk.state & (1<<MD_DISK_SYNC)) {
2606 char sbuf[20];
2607 if (sysfs_get_str(sra, sd, "state", sbuf, 20) < 0 ||
2608 strstr(sbuf, "faulty") ||
2609 strstr(sbuf, "in_sync") == NULL) {
2610 /* this device is dead */
2611 sd->disk.state = (1<<MD_DISK_FAULTY);
2612 if (sd->disk.raid_disk >= 0 &&
2613 sources[sd->disk.raid_disk] >= 0) {
2614 close(sources[sd->disk.raid_disk]);
2615 sources[sd->disk.raid_disk] = -1;
2616 }
2617 }
2618 }
2619 }
2620 *degraded = new_degraded;
2621 }
2622 if (part) {
2623 bsb.arraystart2 = __cpu_to_le64(offset * odata);
2624 bsb.length2 = __cpu_to_le64(stripes * (chunk/512) * odata);
2625 } else {
2626 bsb.arraystart = __cpu_to_le64(offset * odata);
2627 bsb.length = __cpu_to_le64(stripes * (chunk/512) * odata);
2628 }
2629 if (part)
2630 bsb.magic[15] = '2';
2631 for (i = 0; i < dests; i++)
2632 if (part)
2633 lseek64(destfd[i], destoffsets[i] + __le64_to_cpu(bsb.devstart2)*512, 0);
2634 else
2635 lseek64(destfd[i], destoffsets[i], 0);
2636
2637 rv = save_stripes(sources, offsets,
2638 disks, chunk, level, layout,
2639 dests, destfd,
2640 offset*512*odata, stripes * chunk * odata,
2641 buf);
2642
2643 if (rv)
2644 return rv;
2645 bsb.mtime = __cpu_to_le64(time(0));
2646 for (i = 0; i < dests; i++) {
2647 bsb.devstart = __cpu_to_le64(destoffsets[i]/512);
2648
2649 bsb.sb_csum = bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum)-((char*)&bsb));
2650 if (memcmp(bsb.magic, "md_backup_data-2", 16) == 0)
2651 bsb.sb_csum2 = bsb_csum((char*)&bsb,
2652 ((char*)&bsb.sb_csum2)-((char*)&bsb));
2653
2654 rv = -1;
2655 if ((unsigned long long)lseek64(destfd[i], destoffsets[i] - 4096, 0)
2656 != destoffsets[i] - 4096)
2657 break;
2658 if (write(destfd[i], &bsb, 512) != 512)
2659 break;
2660 if (destoffsets[i] > 4096) {
2661 if ((unsigned long long)lseek64(destfd[i], destoffsets[i]+stripes*chunk*odata, 0) !=
2662 destoffsets[i]+stripes*chunk*odata)
2663 break;
2664 if (write(destfd[i], &bsb, 512) != 512)
2665 break;
2666 }
2667 fsync(destfd[i]);
2668 rv = 0;
2669 }
2670
2671 return rv;
2672 }
2673
2674 /* in 2.6.30, the value reported by sync_completed can be
2675 * less than it should be by one stripe.
2676 * This only happens when reshape hits sync_max and pauses.
2677 * So allow wait_backup to either extend sync_max further
2678 * than strictly necessary, or return before the
2679 * sync has got quite as far as we would really like.
2680 * This is what 'blocks2' is for.
2681 * The various callers give appropriate values so that
2682 * everything works.
2683 */
2684 /* FIXME return value is often ignored */
2685 static int forget_backup(
2686 int dests, int *destfd, unsigned long long *destoffsets,
2687 int part)
2688 {
2689 /*
2690 * Erase backup 'part' (which is 0 or 1)
2691 */
2692 int i;
2693 int rv;
2694
2695 if (part) {
2696 bsb.arraystart2 = __cpu_to_le64(0);
2697 bsb.length2 = __cpu_to_le64(0);
2698 } else {
2699 bsb.arraystart = __cpu_to_le64(0);
2700 bsb.length = __cpu_to_le64(0);
2701 }
2702 bsb.mtime = __cpu_to_le64(time(0));
2703 rv = 0;
2704 for (i = 0; i < dests; i++) {
2705 bsb.devstart = __cpu_to_le64(destoffsets[i]/512);
2706 bsb.sb_csum = bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum)-((char*)&bsb));
2707 if (memcmp(bsb.magic, "md_backup_data-2", 16) == 0)
2708 bsb.sb_csum2 = bsb_csum((char*)&bsb,
2709 ((char*)&bsb.sb_csum2)-((char*)&bsb));
2710 if ((unsigned long long)lseek64(destfd[i], destoffsets[i]-4096, 0) !=
2711 destoffsets[i]-4096)
2712 rv = -1;
2713 if (rv == 0 &&
2714 write(destfd[i], &bsb, 512) != 512)
2715 rv = -1;
2716 fsync(destfd[i]);
2717 }
2718 return rv;
2719 }
2720
/*
 * Write 'msg' followed by a newline to file descriptor 2 and terminate
 * the process.  The exit status is 2 when both writes were complete and
 * 1 when either of them fell short.
 */
static void fail(char *msg)
{
	size_t len = strlen(msg);
	int short_write = 0;

	if (write(2, msg, len) != (int)len)
		short_write = 1;
	/* the newline is attempted even if the message write failed */
	if (write(2, "\n", 1) != 1)
		short_write = 1;

	exit(short_write ? 1 : 2);
}
2728
2729 static char *abuf, *bbuf;
2730 static unsigned long long abuflen;
2731 static void validate(int afd, int bfd, unsigned long long offset)
2732 {
2733 /* check that the data in the backup against the array.
2734 * This is only used for regression testing and should not
2735 * be used while the array is active
2736 */
2737 if (afd < 0)
2738 return;
2739 lseek64(bfd, offset - 4096, 0);
2740 if (read(bfd, &bsb2, 512) != 512)
2741 fail("cannot read bsb");
2742 if (bsb2.sb_csum != bsb_csum((char*)&bsb2,
2743 ((char*)&bsb2.sb_csum)-((char*)&bsb2)))
2744 fail("first csum bad");
2745 if (memcmp(bsb2.magic, "md_backup_data", 14) != 0)
2746 fail("magic is bad");
2747 if (memcmp(bsb2.magic, "md_backup_data-2", 16) == 0 &&
2748 bsb2.sb_csum2 != bsb_csum((char*)&bsb2,
2749 ((char*)&bsb2.sb_csum2)-((char*)&bsb2)))
2750 fail("second csum bad");
2751
2752 if (__le64_to_cpu(bsb2.devstart)*512 != offset)
2753 fail("devstart is wrong");
2754
2755 if (bsb2.length) {
2756 unsigned long long len = __le64_to_cpu(bsb2.length)*512;
2757
2758 if (abuflen < len) {
2759 free(abuf);
2760 free(bbuf);
2761 abuflen = len;
2762 if (posix_memalign((void**)&abuf, 4096, abuflen) ||
2763 posix_memalign((void**)&bbuf, 4096, abuflen)) {
2764 abuflen = 0;
2765 /* just stop validating on mem-alloc failure */
2766 return;
2767 }
2768 }
2769
2770 lseek64(bfd, offset, 0);
2771 if ((unsigned long long)read(bfd, bbuf, len) != len) {
2772 //printf("len %llu\n", len);
2773 fail("read first backup failed");
2774 }
2775 lseek64(afd, __le64_to_cpu(bsb2.arraystart)*512, 0);
2776 if ((unsigned long long)read(afd, abuf, len) != len)
2777 fail("read first from array failed");
2778 if (memcmp(bbuf, abuf, len) != 0) {
2779 #if 0
2780 int i;
2781 printf("offset=%llu len=%llu\n",
2782 (unsigned long long)__le64_to_cpu(bsb2.arraystart)*512, len);
2783 for (i=0; i<len; i++)
2784 if (bbuf[i] != abuf[i]) {
2785 printf("first diff byte %d\n", i);
2786 break;
2787 }
2788 #endif
2789 fail("data1 compare failed");
2790 }
2791 }
2792 if (bsb2.length2) {
2793 unsigned long long len = __le64_to_cpu(bsb2.length2)*512;
2794
2795 if (abuflen < len) {
2796 free(abuf);
2797 free(bbuf);
2798 abuflen = len;
2799 abuf = malloc(abuflen);
2800 bbuf = malloc(abuflen);
2801 }
2802
2803 lseek64(bfd, offset+__le64_to_cpu(bsb2.devstart2)*512, 0);
2804 if ((unsigned long long)read(bfd, bbuf, len) != len)
2805 fail("read second backup failed");
2806 lseek64(afd, __le64_to_cpu(bsb2.arraystart2)*512, 0);
2807 if ((unsigned long long)read(afd, abuf, len) != len)
2808 fail("read second from array failed");
2809 if (memcmp(bbuf, abuf, len) != 0)
2810 fail("data2 compare failed");
2811 }
2812 }
2813
2814 int child_monitor(int afd, struct mdinfo *sra, struct reshape *reshape,
2815 struct supertype *st, unsigned long blocks,
2816 int *fds, unsigned long long *offsets,
2817 int dests, int *destfd, unsigned long long *destoffsets)
2818 {
2819 /* Monitor a reshape where backup is being performed using
2820 * 'native' mechanism - either to a backup file, or
2821 * to some space in a spare.
2822 */
2823 char *buf;
2824 int degraded = -1;
2825 unsigned long long speed;
2826 unsigned long long suspend_point, array_size;
2827 unsigned long long backup_point, wait_point;
2828 unsigned long long reshape_completed;
2829 int done = 0;
2830 int increasing = reshape->after.data_disks >= reshape->before.data_disks;
2831 int part = 0; /* The next part of the backup area to fill. It may already
2832 * be full, so we need to check */
2833 int level = reshape->level;
2834 int layout = reshape->before.layout;
2835 int data = reshape->before.data_disks;
2836 int disks = reshape->before.data_disks + reshape->parity;
2837 int chunk = sra->array.chunk_size;
2838 struct mdinfo *sd;
2839 unsigned long stripes;
2840
2841 /* set up the backup-super-block. This requires the
2842 * uuid from the array.
2843 */
2844 /* Find a superblock */
2845 for (sd = sra->devs; sd; sd = sd->next) {
2846 char *dn;
2847 int devfd;
2848 int ok;
2849 if (sd->disk.state & (1<<MD_DISK_FAULTY))
2850 continue;
2851 dn = map_dev(sd->disk.major, sd->disk.minor, 1);
2852 devfd = dev_open(dn, O_RDONLY);
2853 if (devfd < 0)
2854 continue;
2855 ok = st->ss->load_super(st, devfd, NULL);
2856 close(devfd);
2857 if (ok >= 0)
2858 break;
2859 }
2860 if (!sd) {
2861 fprintf(stderr, Name ": Cannot find a superblock\n");
2862 return 0;
2863 }
2864
2865 memset(&bsb, 0, 512);
2866 memcpy(bsb.magic, "md_backup_data-1", 16);
2867 st->ss->uuid_from_super(st, (int*)&bsb.set_uuid);
2868 bsb.mtime = __cpu_to_le64(time(0));
2869 bsb.devstart2 = blocks;
2870
2871 stripes = blocks / (sra->array.chunk_size/512) /
2872 reshape->before.data_disks;
2873
2874 if (posix_memalign((void**)&buf, 4096, disks * chunk))
2875 /* Don't start the 'reshape' */
2876 return 0;
2877 if (reshape->before.data_disks == reshape->after.data_disks) {
2878 sysfs_get_ll(sra, NULL, "sync_speed_min", &speed);
2879 sysfs_set_num(sra, NULL, "sync_speed_min", 200000);
2880 }
2881
2882 if (increasing) {
2883 array_size = sra->component_size * reshape->after.data_disks;
2884 backup_point = sra->reshape_progress;
2885 suspend_point = 0;
2886 } else {
2887 array_size = sra->component_size * reshape->before.data_disks;
2888 backup_point = reshape->backup_blocks;
2889 suspend_point = array_size;
2890 }
2891
2892 while (!done) {
2893 int rv;
2894
2895 /* Want to return as soon the oldest backup slot can
2896 * be released as that allows us to start backing up
2897 * some more, providing suspend_point has been
2898 * advanced, which it should have.
2899 */
2900 if (increasing) {
2901 wait_point = array_size;
2902 if (part == 0 && __le64_to_cpu(bsb.length) > 0)
2903 wait_point = (__le64_to_cpu(bsb.arraystart) +
2904 __le64_to_cpu(bsb.length));
2905 if (part == 1 && __le64_to_cpu(bsb.length2) > 0)
2906 wait_point = (__le64_to_cpu(bsb.arraystart2) +
2907 __le64_to_cpu(bsb.length2));
2908 } else {
2909 wait_point = 0;
2910 if (part == 0 && __le64_to_cpu(bsb.length) > 0)
2911 wait_point = __le64_to_cpu(bsb.arraystart);
2912 if (part == 1 && __le64_to_cpu(bsb.length2) > 0)
2913 wait_point = __le64_to_cpu(bsb.arraystart2);
2914 }
2915
2916 rv = progress_reshape(sra, reshape,
2917 backup_point, wait_point,
2918 &suspend_point, &reshape_completed);
2919 /* external metadata would need to ping_monitor here */
2920 sra->reshape_progress = reshape_completed;
2921
2922 /* Clear any backup region that is before 'here' */
2923 if (increasing) {
2924 if (reshape_completed >= (__le64_to_cpu(bsb.arraystart) +
2925 __le64_to_cpu(bsb.length)))
2926 forget_backup(dests, destfd,
2927 destoffsets, 0);
2928 if (reshape_completed >= (__le64_to_cpu(bsb.arraystart2) +
2929 __le64_to_cpu(bsb.length2)))
2930 forget_backup(dests, destfd,
2931 destoffsets, 1);
2932 } else {
2933 if (reshape_completed <= (__le64_to_cpu(bsb.arraystart)))
2934 forget_backup(dests, destfd,
2935 destoffsets, 0);
2936 if (reshape_completed <= (__le64_to_cpu(bsb.arraystart2)))
2937 forget_backup(dests, destfd,
2938 destoffsets, 1);
2939 }
2940
2941 if (rv < 0) {
2942 if (rv == -1)
2943 done = 1;
2944 break;
2945 }
2946
2947 while (rv) {
2948 unsigned long long offset;
2949 unsigned long actual_stripes;
2950 /* Need to backup some data.
2951 * If 'part' is not used and the desired
2952 * backup size is suspended, do a backup,
2953 * then consider the next part.
2954 */
2955 /* Check that 'part' is unused */
2956 if (part == 0 && __le64_to_cpu(bsb.length) != 0)
2957 break;
2958 if (part == 1 && __le64_to_cpu(bsb.length2) != 0)
2959 break;
2960
2961 offset = backup_point / data;
2962 actual_stripes = stripes;
2963 if (increasing) {
2964 if (offset + actual_stripes * (chunk/512) >
2965 sra->component_size)
2966 actual_stripes = ((sra->component_size - offset)
2967 / (chunk/512));
2968 if (offset + actual_stripes * (chunk/512) >
2969 suspend_point/data)
2970 break;
2971 } else {
2972 if (offset < actual_stripes * (chunk/512))
2973 actual_stripes = offset / (chunk/512);
2974 offset -= actual_stripes * (chunk/512);
2975 if (offset < suspend_point/data)
2976 break;
2977 }
2978 if (actual_stripes == 0)
2979 break;
2980 grow_backup(sra, offset, actual_stripes,
2981 fds, offsets,
2982 disks, chunk, level, layout,
2983 dests, destfd, destoffsets,
2984 part, &degraded, buf);
2985 validate(afd, destfd[0], destoffsets[0]);
2986 /* record where 'part' is up to */
2987 part = !part;
2988 if (increasing)
2989 backup_point += actual_stripes * (chunk/512) * data;
2990 else
2991 backup_point -= actual_stripes * (chunk/512) * data;
2992 }
2993 }
2994
2995 /* FIXME maybe call progress_reshape one more time instead */
2996 abort_reshape(sra); /* remove any remaining suspension */
2997 if (reshape->before.data_disks == reshape->after.data_disks)
2998 sysfs_set_num(sra, NULL, "sync_speed_min", speed);
2999 free(buf);
3000 return done;
3001 }
3002
3003 /*
3004 * If any spare contains md_backup_data-1 which is recent wrt mtime,
3005 * write that data into the array and update the super blocks with
3006 * the new reshape_progress
3007 */
3008 int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt,
3009 char *backup_file, int verbose)
3010 {
3011 int i, j;
3012 int old_disks;
3013 unsigned long long *offsets;
3014 unsigned long long nstripe, ostripe;
3015 int ndata, odata;
3016
3017 if (info->new_level != info->array.level)
3018 return 1; /* Cannot handle level changes (they are instantaneous) */
3019
3020 odata = info->array.raid_disks - info->delta_disks - 1;
3021 if (info->array.level == 6) odata--; /* number of data disks */
3022 ndata = info->array.raid_disks - 1;
3023 if (info->new_level == 6) ndata--;
3024
3025 old_disks = info->array.raid_disks - info->delta_disks;
3026
3027 if (info->delta_disks <= 0)
3028 /* Didn't grow, so the backup file must have
3029 * been used
3030 */
3031 old_disks = cnt;
3032 for (i=old_disks-(backup_file?1:0); i<cnt; i++) {
3033 struct mdinfo dinfo;
3034 int fd;
3035 int bsbsize;
3036 char *devname, namebuf[20];
3037 unsigned long long lo, hi;
3038
3039 /* This was a spare and may have some saved data on it.
3040 * Load the superblock, find and load the
3041 * backup_super_block.
3042 * If either fail, go on to next device.
3043 * If the backup contains no new info, just return
3044 * else restore data and update all superblocks
3045 */
3046 if (i == old_disks-1) {
3047 fd = open(backup_file, O_RDONLY);
3048 if (fd<0) {
3049 fprintf(stderr, Name ": backup file %s inaccessible: %s\n",
3050 backup_file, strerror(errno));
3051 continue;
3052 }
3053 devname = backup_file;
3054 } else {
3055 fd = fdlist[i];
3056 if (fd < 0)
3057 continue;
3058 if (st->ss->load_super(st, fd, NULL))
3059 continue;
3060
3061 st->ss->getinfo_super(st, &dinfo, NULL);
3062 st->ss->free_super(st);
3063
3064 if (lseek64(fd,
3065 (dinfo.data_offset + dinfo.component_size - 8) <<9,
3066 0) < 0) {
3067 fprintf(stderr, Name ": Cannot seek on device %d\n", i);
3068 continue; /* Cannot seek */
3069 }
3070 sprintf(namebuf, "device-%d", i);
3071 devname = namebuf;
3072 }
3073 if (read(fd, &bsb, sizeof(bsb)) != sizeof(bsb)) {
3074 if (verbose)
3075 fprintf(stderr, Name ": Cannot read from %s\n", devname);
3076 continue; /* Cannot read */
3077 }
3078 if (memcmp(bsb.magic, "md_backup_data-1", 16) != 0 &&
3079 memcmp(bsb.magic, "md_backup_data-2", 16) != 0) {
3080 if (verbose)
3081 fprintf(stderr, Name ": No backup metadata on %s\n", devname);
3082 continue;
3083 }
3084 if (bsb.sb_csum != bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum)-((char*)&bsb))) {
3085 if (verbose)
3086 fprintf(stderr, Name ": Bad backup-metadata checksum on %s\n", devname);
3087 continue; /* bad checksum */
3088 }
3089 if (memcmp(bsb.magic, "md_backup_data-2", 16) == 0 &&
3090 bsb.sb_csum2 != bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum2)-((char*)&bsb))) {
3091 if (verbose)
3092 fprintf(stderr, Name ": Bad backup-metadata checksum2 on %s\n", devname);
3093 continue; /* Bad second checksum */
3094 }
3095 if (memcmp(bsb.set_uuid,info->uuid, 16) != 0) {
3096 if (verbose)
3097 fprintf(stderr, Name ": Wrong uuid on backup-metadata on %s\n", devname);
3098 continue; /* Wrong uuid */
3099 }
3100
3101 /* array utime and backup-mtime should be updated at much the same time, but it seems that
3102 * sometimes they aren't... So allow considerable flexability in matching, and allow
3103 * this test to be overridden by an environment variable.
3104 */
3105 if (info->array.utime > (int)__le64_to_cpu(bsb.mtime) + 2*60*60 ||
3106 info->array.utime < (int)__le64_to_cpu(bsb.mtime) - 10*60) {
3107 if (check_env("MDADM_GROW_ALLOW_OLD")) {
3108 fprintf(stderr, Name ": accepting backup with timestamp %lu "
3109 "for array with timestamp %lu\n",
3110 (unsigned long)__le64_to_cpu(bsb.mtime),
3111 (unsigned long)info->array.utime);
3112 } else {
3113 if (verbose)
3114 fprintf(stderr, Name ": too-old timestamp on "
3115 "backup-metadata on %s\n", devname);
3116 continue; /* time stamp is too bad */
3117 }
3118 }
3119
3120 if (bsb.magic[15] == '1') {
3121 if (bsb.length == 0)
3122 continue;
3123 if (info->delta_disks >= 0) {
3124 /* reshape_progress is increasing */
3125 if (__le64_to_cpu(bsb.arraystart)
3126 + __le64_to_cpu(bsb.length)
3127 < info->reshape_progress) {
3128 nonew:
3129 if (verbose)
3130 fprintf(stderr, Name
3131 ": backup-metadata found on %s but is not needed\n", devname);
3132 continue; /* No new data here */
3133 }
3134 } else {
3135 /* reshape_progress is decreasing */
3136 if (__le64_to_cpu(bsb.arraystart) >=
3137 info->reshape_progress)
3138 goto nonew; /* No new data here */
3139 }
3140 } else {
3141 if (bsb.length == 0 && bsb.length2 == 0)
3142 continue;
3143 if (info->delta_disks >= 0) {
3144 /* reshape_progress is increasing */
3145 if ((__le64_to_cpu(bsb.arraystart)
3146 + __le64_to_cpu(bsb.length)
3147 < info->reshape_progress)
3148 &&
3149 (__le64_to_cpu(bsb.arraystart2)
3150 + __le64_to_cpu(bsb.length2)
3151 < info->reshape_progress))
3152 goto nonew; /* No new data here */
3153 } else {
3154 /* reshape_progress is decreasing */
3155 if (__le64_to_cpu(bsb.arraystart) >=
3156 info->reshape_progress &&
3157 __le64_to_cpu(bsb.arraystart2) >=
3158 info->reshape_progress)
3159 goto nonew; /* No new data here */
3160 }
3161 }
3162 if (lseek64(fd, __le64_to_cpu(bsb.devstart)*512, 0)< 0) {
3163 second_fail:
3164 if (verbose)
3165 fprintf(stderr, Name
3166 ": Failed to verify secondary backup-metadata block on %s\n",
3167 devname);
3168 continue; /* Cannot seek */
3169 }
3170 /* There should be a duplicate backup superblock 4k before here */
3171 if (lseek64(fd, -4096, 1) < 0 ||
3172 read(fd, &bsb2, sizeof(bsb2)) != sizeof(bsb2))
3173 goto second_fail; /* Cannot find leading superblock */
3174 if (bsb.magic[15] == '1')
3175 bsbsize = offsetof(struct mdp_backup_super, pad1);
3176 else
3177 bsbsize = offsetof(struct mdp_backup_super, pad);
3178 if (memcmp(&bsb2, &bsb, bsbsize) != 0)
3179 goto second_fail; /* Cannot find leading superblock */
3180
3181 /* Now need the data offsets for all devices. */
3182 offsets = malloc(sizeof(*offsets)*info->array.raid_disks);
3183 for(j=0; j<info->array.raid_disks; j++) {
3184 if (fdlist[j] < 0)
3185 continue;
3186 if (st->ss->load_super(st, fdlist[j], NULL))
3187 /* FIXME should be this be an error */
3188 continue;
3189 st->ss->getinfo_super(st, &dinfo, NULL);
3190 st->ss->free_super(st);
3191 offsets[j] = dinfo.data_offset * 512;
3192 }
3193 printf(Name ": restoring critical section\n");
3194
3195 if (restore_stripes(fdlist, offsets,
3196 info->array.raid_disks,
3197 info->new_chunk,
3198 info->new_level,
3199 info->new_layout,
3200 fd, __le64_to_cpu(bsb.devstart)*512,
3201 __le64_to_cpu(bsb.arraystart)*512,
3202 __le64_to_cpu(bsb.length)*512)) {
3203 /* didn't succeed, so giveup */
3204 if (verbose)
3205 fprintf(stderr, Name ": Error restoring backup from %s\n",
3206 devname);
3207 return 1;
3208 }
3209
3210 if (bsb.magic[15] == '2' &&
3211 restore_stripes(fdlist, offsets,
3212 info->array.raid_disks,
3213 info->new_chunk,
3214 info->new_level,
3215 info->new_layout,
3216 fd, __le64_to_cpu(bsb.devstart)*512 +
3217 __le64_to_cpu(bsb.devstart2)*512,
3218 __le64_to_cpu(bsb.arraystart2)*512,
3219 __le64_to_cpu(bsb.length2)*512)) {
3220 /* didn't succeed, so giveup */
3221 if (verbose)
3222 fprintf(stderr, Name ": Error restoring second backup from %s\n",
3223 devname);
3224 return 1;
3225 }
3226
3227
3228 /* Ok, so the data is restored. Let's update those superblocks. */
3229
3230 lo = hi = 0;
3231 if (bsb.length) {
3232 lo = __le64_to_cpu(bsb.arraystart);
3233 hi = lo + __le64_to_cpu(bsb.length);
3234 }
3235 if (bsb.magic[15] == '2' && bsb.length2) {
3236 unsigned long long lo1, hi1;
3237 lo1 = __le64_to_cpu(bsb.arraystart2);
3238 hi1 = lo1 + __le64_to_cpu(bsb.length2);
3239 if (lo == hi) {
3240 lo = lo1;
3241 hi = hi1;
3242 } else if (lo < lo1)
3243 hi = hi1;
3244 else
3245 lo = lo1;
3246 }
3247 if (lo < hi &&
3248 (info->reshape_progress < lo ||
3249 info->reshape_progress > hi))
3250 /* backup does not affect reshape_progress*/ ;
3251 else if (info->delta_disks >= 0) {
3252 info->reshape_progress = __le64_to_cpu(bsb.arraystart) +
3253 __le64_to_cpu(bsb.length);
3254 if (bsb.magic[15] == '2') {
3255 unsigned long long p2 = __le64_to_cpu(bsb.arraystart2) +
3256 __le64_to_cpu(bsb.length2);
3257 if (p2 > info->reshape_progress)
3258 info->reshape_progress = p2;
3259 }
3260 } else {
3261 info->reshape_progress = __le64_to_cpu(bsb.arraystart);
3262 if (bsb.magic[15] == '2') {
3263 unsigned long long p2 = __le64_to_cpu(bsb.arraystart2);
3264 if (p2 < info->reshape_progress)
3265 info->reshape_progress = p2;
3266 }
3267 }
3268 for (j=0; j<info->array.raid_disks; j++) {
3269 if (fdlist[j] < 0) continue;
3270 if (st->ss->load_super(st, fdlist[j], NULL))
3271 continue;
3272 st->ss->getinfo_super(st, &dinfo, NULL);
3273 dinfo.reshape_progress = info->reshape_progress;
3274 st->ss->update_super(st, &dinfo,
3275 "_reshape_progress",
3276 NULL,0, 0, NULL);
3277 st->ss->store_super(st, fdlist[j]);
3278 st->ss->free_super(st);
3279 }
3280 return 0;
3281 }
3282 /* Didn't find any backup data, try to see if any
3283 * was needed.
3284 */
3285 if (info->delta_disks < 0) {
3286 /* When shrinking, the critical section is at the end.
3287 * So see if we are before the critical section.
3288 */
3289 unsigned long long first_block;
3290 nstripe = ostripe = 0;
3291 first_block = 0;
3292 while (ostripe >= nstripe) {
3293 ostripe += info->array.chunk_size / 512;
3294 first_block = ostripe * odata;
3295 nstripe = first_block / ndata / (info->new_chunk/512) *
3296 (info->new_chunk/512);
3297 }
3298
3299 if (info->reshape_progress >= first_block)
3300 return 0;
3301 }
3302 if (info->delta_disks > 0) {
3303 /* See if we are beyond the critical section. */
3304 unsigned long long last_block;
3305 nstripe = ostripe = 0;
3306 last_block = 0;
3307 while (nstripe >= ostripe) {
3308 nstripe += info->new_chunk / 512;
3309 last_block = nstripe * ndata;
3310 ostripe = last_block / odata / (info->array.chunk_size/512) *
3311 (info->array.chunk_size/512);
3312 }
3313
3314 if (info->reshape_progress >= last_block)
3315 return 0;
3316 }
3317 /* needed to recover critical section! */
3318 if (verbose)
3319 fprintf(stderr, Name ": Failed to find backup of critical section\n");
3320 return 1;
3321 }
3322
3323 int Grow_continue(int mdfd, struct supertype *st, struct mdinfo *info,
3324 char *backup_file)
3325 {
3326 char buf[40];
3327 char *container = NULL;
3328 int err = sysfs_set_str(info, NULL, "array_state", "readonly");
3329 if (err)
3330 return err;
3331
3332 if (st->ss->external) {
3333 fmt_devname(buf, st->container_dev);
3334 container = buf;
3335 }
3336 return reshape_array(container, mdfd, "array", st, info, 1,
3337 backup_file, 0, 0, 1);
3338 }
3339
3340