]> git.ipfire.org Git - thirdparty/mdadm.git/blob - Create.c
mdadm: drop get_required_spare_criteria()
[thirdparty/mdadm.git] / Create.c
1 /*
2 * mdadm - manage Linux "md" devices aka RAID arrays.
3 *
4 * Copyright (C) 2001-2013 Neil Brown <neilb@suse.de>
5 *
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 *
21 * Author: Neil Brown
22 * Email: <neilb@suse.de>
23 */
24
25 #include "mdadm.h"
26 #include "udev.h"
27 #include "md_u.h"
28 #include "md_p.h"
29 #include <ctype.h>
30 #include <fcntl.h>
31 #include <signal.h>
32 #include <sys/signalfd.h>
33 #include <sys/wait.h>
34
/**
 * round_size_and_verify() - Round a size down to a multiple of the chunk.
 * @size: in/out value to round; a value of 0 is left untouched.
 * @chunk: chunk size, in the same unit as @size. The rounding is done by
 *	   masking with (chunk - 1), so it is only exact for powers of two.
 *
 * Return: 0 if @size was 0 or is still non-zero after rounding,
 *	   1 (with an error printed) if rounding reduced it to 0.
 */
static int round_size_and_verify(unsigned long long *size, int chunk)
{
	unsigned long long rounded;

	/* Zero is passed through unchanged and is not an error. */
	if (!*size)
		return 0;

	rounded = *size & ~(unsigned long long)(chunk - 1);
	*size = rounded;

	if (rounded)
		return 0;

	pr_err("Size cannot be smaller than chunk.\n");
	return 1;
}
46
47 /**
48 * default_layout() - Get default layout for level.
49 * @st: metadata requested, could be NULL.
50 * @level: raid level requested.
51 * @verbose: verbose level.
52 *
53 * Try to ask metadata handler first, otherwise use global defaults.
54 *
55 * Return: Layout or &UnSet, return value meaning depends of level used.
56 */
57 int default_layout(struct supertype *st, int level, int verbose)
58 {
59 int layout = UnSet;
60 mapping_t *layout_map = NULL;
61 char *layout_name = NULL;
62
63 if (st && st->ss->default_geometry)
64 st->ss->default_geometry(st, &level, &layout, NULL);
65
66 if (layout != UnSet)
67 return layout;
68
69 switch (level) {
70 default: /* no layout */
71 layout = 0;
72 break;
73 case 0:
74 layout = RAID0_ORIG_LAYOUT;
75 break;
76 case 10:
77 layout = 0x102; /* near=2, far=1 */
78 layout_name = "n2";
79 break;
80 case 5:
81 case 6:
82 layout_map = r5layout;
83 break;
84 case LEVEL_FAULTY:
85 layout_map = faultylayout;
86 break;
87 }
88
89 if (layout_map) {
90 layout = map_name(layout_map, "default");
91 layout_name = map_num_s(layout_map, layout);
92 }
93 if (layout_name && verbose > 0)
94 pr_err("layout defaults to %s\n", layout_name);
95
96 return layout;
97 }
98
99 static pid_t write_zeroes_fork(int fd, struct shape *s, struct supertype *st,
100 struct mddev_dev *dv)
101
102 {
103 const unsigned long long req_size = 1 << 30;
104 unsigned long long offset_bytes, size_bytes, sz;
105 sigset_t sigset;
106 int ret = 0;
107 pid_t pid;
108
109 size_bytes = KIB_TO_BYTES(s->size);
110
111 /*
112 * If size_bytes is zero, this is a zoned raid array where
113 * each disk is of a different size and uses its full
114 * disk. Thus zero the entire disk.
115 */
116 if (!size_bytes && !get_dev_size(fd, dv->devname, &size_bytes))
117 return -1;
118
119 if (dv->data_offset != INVALID_SECTORS)
120 offset_bytes = SEC_TO_BYTES(dv->data_offset);
121 else
122 offset_bytes = SEC_TO_BYTES(st->data_offset);
123
124 pr_info("zeroing data from %lld to %lld on: %s\n",
125 offset_bytes, size_bytes, dv->devname);
126
127 pid = fork();
128 if (pid < 0) {
129 pr_err("Could not fork to zero disks: %s\n", strerror(errno));
130 return pid;
131 } else if (pid != 0) {
132 return pid;
133 }
134
135 sigemptyset(&sigset);
136 sigaddset(&sigset, SIGINT);
137 sigprocmask(SIG_UNBLOCK, &sigset, NULL);
138
139 while (size_bytes) {
140 /*
141 * Split requests to the kernel into 1GB chunks seeing the
142 * fallocate() call is not interruptible and blocking a
143 * ctrl-c for several minutes is not desirable.
144 *
145 * 1GB is chosen as a compromise: the user may still have
146 * to wait several seconds if they ctrl-c on devices that
147 * zero slowly, but will reduce the number of requests
148 * required and thus the overhead on devices that perform
149 * better.
150 */
151 sz = size_bytes;
152 if (sz >= req_size)
153 sz = req_size;
154
155 if (fallocate(fd, FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE,
156 offset_bytes, sz)) {
157 pr_err("zeroing %s failed: %s\n", dv->devname,
158 strerror(errno));
159 ret = 1;
160 break;
161 }
162
163 offset_bytes += sz;
164 size_bytes -= sz;
165 }
166
167 exit(ret);
168 }
169
170 static int wait_for_zero_forks(int *zero_pids, int count)
171 {
172 int wstatus, ret = 0, i, sfd, wait_count = 0;
173 struct signalfd_siginfo fdsi;
174 bool interrupted = false;
175 sigset_t sigset;
176 ssize_t s;
177
178 for (i = 0; i < count; i++)
179 if (zero_pids[i])
180 wait_count++;
181 if (!wait_count)
182 return 0;
183
184 sigemptyset(&sigset);
185 sigaddset(&sigset, SIGINT);
186 sigaddset(&sigset, SIGCHLD);
187 sigprocmask(SIG_BLOCK, &sigset, NULL);
188
189 sfd = signalfd(-1, &sigset, 0);
190 if (sfd < 0) {
191 pr_err("Unable to create signalfd: %s\n", strerror(errno));
192 return 1;
193 }
194
195 while (1) {
196 s = read(sfd, &fdsi, sizeof(fdsi));
197 if (s != sizeof(fdsi)) {
198 pr_err("Invalid signalfd read: %s\n", strerror(errno));
199 close(sfd);
200 return 1;
201 }
202
203 if (fdsi.ssi_signo == SIGINT) {
204 printf("\n");
205 pr_info("Interrupting zeroing processes, please wait...\n");
206 interrupted = true;
207 } else if (fdsi.ssi_signo == SIGCHLD) {
208 if (!--wait_count)
209 break;
210 }
211 }
212
213 close(sfd);
214
215 for (i = 0; i < count; i++) {
216 if (!zero_pids[i])
217 continue;
218
219 waitpid(zero_pids[i], &wstatus, 0);
220 zero_pids[i] = 0;
221 if (!WIFEXITED(wstatus) || WEXITSTATUS(wstatus))
222 ret = 1;
223 }
224
225 if (interrupted) {
226 pr_err("zeroing interrupted!\n");
227 return 1;
228 }
229
230 if (ret)
231 pr_err("zeroing failed!\n");
232 else
233 pr_info("zeroing finished\n");
234
235 return ret;
236 }
237
238 static int add_disk_to_super(int mdfd, struct shape *s, struct context *c,
239 struct supertype *st, struct mddev_dev *dv,
240 struct mdinfo *info, int have_container, int major_num,
241 int *zero_pid)
242 {
243 dev_t rdev;
244 int fd;
245
246 if (dv->disposition == 'j') {
247 info->disk.raid_disk = MD_DISK_ROLE_JOURNAL;
248 info->disk.state = (1<<MD_DISK_JOURNAL);
249 } else if (info->disk.raid_disk < s->raiddisks) {
250 info->disk.state = (1<<MD_DISK_ACTIVE) |
251 (1<<MD_DISK_SYNC);
252 } else {
253 info->disk.state = 0;
254 }
255
256 if (dv->writemostly == FlagSet) {
257 if (major_num == BITMAP_MAJOR_CLUSTERED) {
258 pr_err("Can not set %s --write-mostly with a clustered bitmap\n",dv->devname);
259 return 1;
260 } else {
261 info->disk.state |= (1<<MD_DISK_WRITEMOSTLY);
262 }
263
264 }
265
266 if (dv->failfast == FlagSet)
267 info->disk.state |= (1<<MD_DISK_FAILFAST);
268
269 if (have_container) {
270 fd = -1;
271 } else {
272 if (st->ss->external && st->container_devnm[0])
273 fd = open(dv->devname, O_RDWR);
274 else
275 fd = open(dv->devname, O_RDWR|O_EXCL);
276
277 if (fd < 0) {
278 pr_err("failed to open %s after earlier success - aborting\n",
279 dv->devname);
280 return 1;
281 }
282 if (!fstat_is_blkdev(fd, dv->devname, &rdev)) {
283 close(fd);
284 return 1;
285 }
286 info->disk.major = major(rdev);
287 info->disk.minor = minor(rdev);
288 }
289 if (fd >= 0)
290 remove_partitions(fd);
291 if (st->ss->add_to_super(st, &info->disk, fd, dv->devname,
292 dv->data_offset)) {
293 ioctl(mdfd, STOP_ARRAY, NULL);
294 close(fd);
295 return 1;
296 }
297 st->ss->getinfo_super(st, info, NULL);
298
299 if (fd >= 0 && s->write_zeroes) {
300 *zero_pid = write_zeroes_fork(fd, s, st, dv);
301 if (*zero_pid <= 0) {
302 ioctl(mdfd, STOP_ARRAY, NULL);
303 close(fd);
304 return 1;
305 }
306 }
307
308 if (have_container && c->verbose > 0)
309 pr_err("Using %s for device %d\n",
310 map_dev(info->disk.major, info->disk.minor, 0),
311 info->disk.number);
312
313 if (!have_container) {
314 /* getinfo_super might have lost these ... */
315 info->disk.major = major(rdev);
316 info->disk.minor = minor(rdev);
317 }
318
319 return 0;
320 }
321
322 static int update_metadata(int mdfd, struct shape *s, struct supertype *st,
323 struct map_ent **map, struct mdinfo *info,
324 char *chosen_name)
325 {
326 struct mdinfo info_new;
327 struct map_ent *me = NULL;
328
329 /* check to see if the uuid has changed due to these
330 * metadata changes, and if so update the member array
331 * and container uuid. Note ->write_init_super clears
332 * the subarray cursor such that ->getinfo_super once
333 * again returns container info.
334 */
335 st->ss->getinfo_super(st, &info_new, NULL);
336 if (st->ss->external && !is_container(s->level) &&
337 !same_uuid(info_new.uuid, info->uuid, 0)) {
338 map_update(map, fd2devnm(mdfd),
339 info_new.text_version,
340 info_new.uuid, chosen_name);
341 me = map_by_devnm(map, st->container_devnm);
342 }
343
344 if (st->ss->write_init_super(st)) {
345 st->ss->free_super(st);
346 return 1;
347 }
348
349 /*
350 * Before activating the array, perform extra steps
351 * required to configure the internal write-intent
352 * bitmap.
353 */
354 if (info_new.consistency_policy == CONSISTENCY_POLICY_BITMAP &&
355 st->ss->set_bitmap && st->ss->set_bitmap(st, info)) {
356 st->ss->free_super(st);
357 return 1;
358 }
359
360 /* update parent container uuid */
361 if (me) {
362 char *path = xstrdup(me->path);
363
364 st->ss->getinfo_super(st, &info_new, NULL);
365 map_update(map, st->container_devnm, info_new.text_version,
366 info_new.uuid, path);
367 free(path);
368 }
369
370 flush_metadata_updates(st);
371 st->ss->free_super(st);
372
373 return 0;
374 }
375
376 static int add_disks(int mdfd, struct mdinfo *info, struct shape *s,
377 struct context *c, struct supertype *st,
378 struct map_ent **map, struct mddev_dev *devlist,
379 int total_slots, int have_container, int insert_point,
380 int major_num, char *chosen_name)
381 {
382 struct mddev_dev *moved_disk = NULL;
383 int pass, raid_disk_num, dnum;
384 int zero_pids[total_slots];
385 struct mddev_dev *dv;
386 struct mdinfo *infos;
387 sigset_t sigset, orig_sigset;
388 int ret = 0;
389
390 /*
391 * Block SIGINT so the main thread will always wait for the
392 * zeroing processes when being interrupted. Otherwise the
393 * zeroing processes will finish their work in the background
394 * keeping the disk busy.
395 */
396 sigemptyset(&sigset);
397 sigaddset(&sigset, SIGINT);
398 sigprocmask(SIG_BLOCK, &sigset, &orig_sigset);
399 memset(zero_pids, 0, sizeof(zero_pids));
400
401 infos = xmalloc(sizeof(*infos) * total_slots);
402 enable_fds(total_slots);
403 for (pass = 1; pass <= 2; pass++) {
404 for (dnum = 0, raid_disk_num = 0, dv = devlist; dv;
405 dv = (dv->next) ? (dv->next) : moved_disk, dnum++) {
406 if (dnum >= total_slots)
407 abort();
408 if (dnum == insert_point) {
409 raid_disk_num += 1;
410 moved_disk = dv;
411 continue;
412 }
413 if (strcasecmp(dv->devname, "missing") == 0) {
414 raid_disk_num += 1;
415 continue;
416 }
417 if (have_container)
418 moved_disk = NULL;
419 if (have_container && dnum < total_slots - 1)
420 /* repeatedly use the container */
421 moved_disk = dv;
422
423 switch(pass) {
424 case 1:
425 infos[dnum] = *info;
426 infos[dnum].disk.number = dnum;
427 infos[dnum].disk.raid_disk = raid_disk_num++;
428
429 if (dv->disposition == 'j')
430 raid_disk_num--;
431
432 ret = add_disk_to_super(mdfd, s, c, st, dv,
433 &infos[dnum], have_container,
434 major_num, &zero_pids[dnum]);
435 if (ret)
436 goto out;
437
438 break;
439 case 2:
440 infos[dnum].errors = 0;
441
442 ret = add_disk(mdfd, st, info, &infos[dnum]);
443 if (ret) {
444 pr_err("ADD_NEW_DISK for %s failed: %s\n",
445 dv->devname, strerror(errno));
446 if (errno == EINVAL &&
447 info->array.level == 0) {
448 pr_err("Possibly your kernel doesn't support RAID0 layouts.\n");
449 pr_err("Either upgrade, or use --layout=dangerous\n");
450 }
451 goto out;
452 }
453 break;
454 }
455 if (!have_container &&
456 dv == moved_disk && dnum != insert_point) break;
457 }
458
459 if (pass == 1) {
460 ret = wait_for_zero_forks(zero_pids, total_slots);
461 if (ret)
462 goto out;
463
464 ret = update_metadata(mdfd, s, st, map, info,
465 chosen_name);
466 if (ret)
467 goto out;
468 }
469 }
470
471 out:
472 if (ret)
473 wait_for_zero_forks(zero_pids, total_slots);
474 free(infos);
475 sigprocmask(SIG_SETMASK, &orig_sigset, NULL);
476 return ret;
477 }
478
479 int Create(struct supertype *st, struct mddev_ident *ident, int subdevs,
480 struct mddev_dev *devlist, struct shape *s, struct context *c)
481 {
482 /*
483 * Create a new raid array.
484 *
485 * First check that necessary details are available
486 * (i.e. level, raid-disks)
487 *
488 * Then check each disk to see what might be on it
489 * and report anything interesting.
490 *
491 * If anything looks odd, and runstop not set,
492 * abort.
493 *
494 * SET_ARRAY_INFO and ADD_NEW_DISK, and
495 * if runstop==run, or raiddisks disks were used,
496 * RUN_ARRAY
497 */
498 int mdfd;
499 unsigned long long minsize = 0, maxsize = 0;
500 char *mindisc = NULL;
501 char *maxdisc = NULL;
502 char *name = ident->name;
503 int *uuid = ident->uuid_set == 1 ? ident->uuid : NULL;
504 int dnum;
505 struct mddev_dev *dv;
506 dev_t rdev;
507 int fail = 0, warn = 0;
508 int first_missing = subdevs * 2;
509 int second_missing = subdevs * 2;
510 int missing_disks = 0;
511 int insert_point = subdevs * 2; /* where to insert a missing drive */
512 int total_slots;
513 int rv;
514 int bitmap_fd;
515 int have_container = 0;
516 int container_fd = -1;
517 int need_mdmon = 0;
518 unsigned long long bitmapsize;
519 struct mdinfo info;
520 int did_default = 0;
521 int do_default_layout = 0;
522 int do_default_chunk = 0;
523 char chosen_name[1024];
524 struct map_ent *map = NULL;
525 unsigned long long newsize;
526 mdu_array_info_t inf;
527
528 int major_num = BITMAP_MAJOR_HI;
529 if (s->bitmap_file && strcmp(s->bitmap_file, "clustered") == 0) {
530 major_num = BITMAP_MAJOR_CLUSTERED;
531 if (c->nodes <= 1) {
532 pr_err("At least 2 nodes are needed for cluster-md\n");
533 return 1;
534 }
535 }
536
537 memset(&info, 0, sizeof(info));
538 if (s->level == UnSet && st && st->ss->default_geometry)
539 st->ss->default_geometry(st, &s->level, NULL, NULL);
540 if (s->level == UnSet) {
541 pr_err("a RAID level is needed to create an array.\n");
542 return 1;
543 }
544 if (s->raiddisks < 4 && s->level == 6) {
545 pr_err("at least 4 raid-devices needed for level 6\n");
546 return 1;
547 }
548 if (s->raiddisks > 256 && s->level == 6) {
549 pr_err("no more than 256 raid-devices supported for level 6\n");
550 return 1;
551 }
552 if (s->raiddisks < 2 && s->level >= 4) {
553 pr_err("at least 2 raid-devices needed for level %d\n", s->level);
554 return 1;
555 }
556 if (s->level <= 0 && s->sparedisks) {
557 pr_err("This level does not support spare devices\n");
558 return 1;
559 }
560
561 if (subdevs == 1 && strcmp(devlist->devname, "missing") != 0) {
562 /* If given a single device, it might be a container, and we can
563 * extract a device list from there
564 */
565 int fd;
566
567 memset(&inf, 0, sizeof(inf));
568 fd = open(devlist->devname, O_RDONLY);
569 if (fd >= 0 &&
570 md_get_array_info(fd, &inf) == 0 && inf.raid_disks == 0) {
571 /* yep, looks like a container */
572 if (st) {
573 rv = st->ss->load_container(st, fd,
574 devlist->devname);
575 if (rv == 0)
576 have_container = 1;
577 } else {
578 st = super_by_fd(fd, NULL);
579 if (st && !(rv = st->ss->
580 load_container(st, fd,
581 devlist->devname)))
582 have_container = 1;
583 else
584 st = NULL;
585 }
586 if (have_container) {
587 subdevs = s->raiddisks;
588 first_missing = subdevs * 2;
589 second_missing = subdevs * 2;
590 insert_point = subdevs * 2;
591 }
592 }
593 if (fd >= 0)
594 close(fd);
595 }
596 if (st && st->ss->external && s->sparedisks) {
597 pr_err("This metadata type does not support spare disks at create time\n");
598 return 1;
599 }
600 if (subdevs > s->raiddisks+s->sparedisks+s->journaldisks) {
601 pr_err("You have listed more devices (%d) than are in the array(%d)!\n", subdevs, s->raiddisks+s->sparedisks);
602 return 1;
603 }
604 if (!have_container && subdevs < s->raiddisks+s->sparedisks+s->journaldisks) {
605 pr_err("You haven't given enough devices (real or missing) to create this array\n");
606 return 1;
607 }
608 if (s->bitmap_file && s->level <= 0) {
609 pr_err("bitmaps not meaningful with level %s\n",
610 map_num(pers, s->level)?:"given");
611 return 1;
612 }
613
614 /* now set some defaults */
615
616 if (s->layout == UnSet) {
617 do_default_layout = 1;
618 s->layout = default_layout(st, s->level, c->verbose);
619 }
620
621 if (s->level == 10)
622 /* check layout fits in array*/
623 if ((s->layout&255) * ((s->layout>>8)&255) > s->raiddisks) {
624 pr_err("that layout requires at least %d devices\n",
625 (s->layout&255) * ((s->layout>>8)&255));
626 return 1;
627 }
628
629 switch(s->level) {
630 case 4:
631 case 5:
632 case 10:
633 case 6:
634 case 0:
635 if (s->chunk == 0 || s->chunk == UnSet) {
636 s->chunk = UnSet;
637 do_default_chunk = 1;
638 /* chunk will be set later */
639 }
640 break;
641 case LEVEL_LINEAR:
642 /* a chunksize of zero 0s perfectly valid (and preferred) since 2.6.16 */
643 break;
644 case 1:
645 case LEVEL_FAULTY:
646 case LEVEL_MULTIPATH:
647 case LEVEL_CONTAINER:
648 if (s->chunk) {
649 pr_err("specifying chunk size is forbidden for this level\n");
650 return 1;
651 }
652 break;
653 default:
654 pr_err("unknown level %d\n", s->level);
655 return 1;
656 }
657
658 if (s->size == MAX_SIZE)
659 /* use '0' to mean 'max' now... */
660 s->size = 0;
661 if (s->size && s->chunk && s->chunk != UnSet)
662 if (round_size_and_verify(&s->size, s->chunk))
663 return 1;
664
665 newsize = s->size * 2;
666 if (st && ! st->ss->validate_geometry(st, s->level, s->layout, s->raiddisks,
667 &s->chunk, s->size*2,
668 s->data_offset, NULL,
669 &newsize, s->consistency_policy,
670 c->verbose >= 0))
671 return 1;
672
673 if (s->chunk && s->chunk != UnSet) {
674 newsize &= ~(unsigned long long)(s->chunk*2 - 1);
675 if (do_default_chunk) {
676 /* default chunk was just set */
677 if (c->verbose > 0)
678 pr_err("chunk size defaults to %dK\n", s->chunk);
679 if (round_size_and_verify(&s->size, s->chunk))
680 return 1;
681 do_default_chunk = 0;
682 }
683 }
684
685 if (s->size == 0) {
686 s->size = newsize / 2;
687 if (s->level == 1)
688 /* If this is ever reshaped to RAID5, we will
689 * need a chunksize. So round it off a bit
690 * now just to be safe
691 */
692 s->size &= ~(64ULL-1);
693
694 if (s->size && c->verbose > 0)
695 pr_err("setting size to %lluK\n", s->size);
696 }
697
698 /* now look at the subdevs */
699 info.array.active_disks = 0;
700 info.array.working_disks = 0;
701 dnum = 0;
702 for (dv = devlist; dv; dv = dv->next)
703 if (s->data_offset == VARIABLE_OFFSET)
704 dv->data_offset = INVALID_SECTORS;
705 else
706 dv->data_offset = s->data_offset;
707
708 for (dv=devlist; dv && !have_container; dv=dv->next, dnum++) {
709 char *dname = dv->devname;
710 unsigned long long freesize;
711 int dfd;
712 char *doff;
713
714 if (strcasecmp(dname, "missing") == 0) {
715 if (first_missing > dnum)
716 first_missing = dnum;
717 if (second_missing > dnum && dnum > first_missing)
718 second_missing = dnum;
719 missing_disks ++;
720 continue;
721 }
722 if (s->data_offset == VARIABLE_OFFSET) {
723 doff = strchr(dname, ':');
724 if (doff) {
725 *doff++ = 0;
726 dv->data_offset = parse_size(doff);
727 } else
728 dv->data_offset = INVALID_SECTORS;
729 } else
730 dv->data_offset = s->data_offset;
731
732 dfd = open(dname, O_RDONLY);
733 if (dfd < 0) {
734 pr_err("cannot open %s: %s\n",
735 dname, strerror(errno));
736 exit(2);
737 }
738 if (!fstat_is_blkdev(dfd, dname, NULL)) {
739 close(dfd);
740 exit(2);
741 }
742 close(dfd);
743 info.array.working_disks++;
744 if (dnum < s->raiddisks && dv->disposition != 'j')
745 info.array.active_disks++;
746 if (st == NULL) {
747 struct createinfo *ci = conf_get_create_info();
748 if (ci)
749 st = ci->supertype;
750 }
751 if (st == NULL) {
752 /* Need to choose a default metadata, which is different
753 * depending on geometry of array.
754 */
755 int i;
756 char *name = "default";
757 for(i = 0; !st && superlist[i]; i++) {
758 st = superlist[i]->match_metadata_desc(name);
759 if (!st)
760 continue;
761 if (do_default_layout)
762 s->layout = default_layout(st, s->level, c->verbose);
763 switch (st->ss->validate_geometry(
764 st, s->level, s->layout, s->raiddisks,
765 &s->chunk, s->size*2,
766 dv->data_offset, dname,
767 &freesize, s->consistency_policy,
768 c->verbose > 0)) {
769 case -1: /* Not valid, message printed, and not
770 * worth checking any further */
771 exit(2);
772 break;
773 case 0: /* Geometry not valid */
774 free(st);
775 st = NULL;
776 s->chunk = do_default_chunk ? UnSet : s->chunk;
777 break;
778 case 1: /* All happy */
779 break;
780 }
781 }
782
783 if (!st) {
784 int dfd = open(dname, O_RDONLY|O_EXCL);
785 if (dfd < 0) {
786 pr_err("cannot open %s: %s\n",
787 dname, strerror(errno));
788 exit(2);
789 }
790 pr_err("device %s not suitable for any style of array\n",
791 dname);
792 exit(2);
793 }
794 if (st->ss != &super0 ||
795 st->minor_version != 90)
796 did_default = 1;
797 } else {
798 if (do_default_layout)
799 s->layout = default_layout(st, s->level, 0);
800 if (!st->ss->validate_geometry(st, s->level, s->layout,
801 s->raiddisks,
802 &s->chunk, s->size*2,
803 dv->data_offset,
804 dname, &freesize,
805 s->consistency_policy,
806 c->verbose >= 0)) {
807
808 pr_err("%s is not suitable for this array.\n",
809 dname);
810 fail = 1;
811 continue;
812 }
813 }
814
815 if (dv->disposition == 'j')
816 goto skip_size_check; /* skip write journal for size check */
817
818 freesize /= 2; /* convert to K */
819 if (s->chunk && s->chunk != UnSet) {
820 /* round to chunk size */
821 freesize = freesize & ~(s->chunk-1);
822 if (do_default_chunk) {
823 /* default chunk was just set */
824 if (c->verbose > 0)
825 pr_err("chunk size defaults to %dK\n", s->chunk);
826 if (round_size_and_verify(&s->size, s->chunk))
827 return 1;
828 do_default_chunk = 0;
829 }
830 }
831 if (!freesize) {
832 pr_err("no free space left on %s\n", dname);
833 fail = 1;
834 continue;
835 }
836
837 if (s->size && freesize < s->size) {
838 pr_err("%s is smaller than given size. %lluK < %lluK + metadata\n",
839 dname, freesize, s->size);
840 fail = 1;
841 continue;
842 }
843 if (maxdisc == NULL || (maxdisc && freesize > maxsize)) {
844 maxdisc = dname;
845 maxsize = freesize;
846 }
847 if (mindisc ==NULL || (mindisc && freesize < minsize)) {
848 mindisc = dname;
849 minsize = freesize;
850 }
851 skip_size_check:
852 if (c->runstop != 1 || c->verbose >= 0) {
853 int fd = open(dname, O_RDONLY);
854 if (fd < 0) {
855 pr_err("Cannot open %s: %s\n",
856 dname, strerror(errno));
857 fail = 1;
858 continue;
859 }
860 warn |= check_ext2(fd, dname);
861 warn |= check_reiser(fd, dname);
862 warn |= check_raid(fd, dname);
863 if (strcmp(st->ss->name, "1.x") == 0 &&
864 st->minor_version >= 1)
865 /* metadata at front */
866 warn |= check_partitions(fd, dname, 0, 0);
867 else if (s->level == 1 || is_container(s->level) ||
868 (s->level == 0 && s->raiddisks == 1))
869 /* partitions could be meaningful */
870 warn |= check_partitions(fd, dname, freesize*2, s->size*2);
871 else
872 /* partitions cannot be meaningful */
873 warn |= check_partitions(fd, dname, 0, 0);
874 if (strcmp(st->ss->name, "1.x") == 0 &&
875 st->minor_version >= 1 &&
876 did_default &&
877 s->level == 1 &&
878 (warn & 1024) == 0) {
879 warn |= 1024;
880 pr_err("Note: this array has metadata at the start and\n"
881 " may not be suitable as a boot device. If you plan to\n"
882 " store '/boot' on this device please ensure that\n"
883 " your boot-loader understands md/v1.x metadata, or use\n"
884 " --metadata=0.90\n");
885 }
886 close(fd);
887 }
888 }
889 if (missing_disks == dnum && !have_container) {
890 pr_err("Subdevs can't be all missing\n");
891 return 1;
892 }
893 if (s->raiddisks + s->sparedisks > st->max_devs) {
894 pr_err("Too many devices: %s metadata only supports %d\n",
895 st->ss->name, st->max_devs);
896 return 1;
897 }
898 if (have_container)
899 info.array.working_disks = s->raiddisks;
900 if (fail) {
901 pr_err("create aborted\n");
902 return 1;
903 }
904 if (s->size == 0) {
905 if (mindisc == NULL && !have_container) {
906 pr_err("no size and no drives given - aborting create.\n");
907 return 1;
908 }
909 if (s->level > 0 || s->level == LEVEL_MULTIPATH ||
910 s->level == LEVEL_FAULTY || st->ss->external) {
911 /* size is meaningful */
912 if (!st->ss->validate_geometry(st, s->level, s->layout,
913 s->raiddisks,
914 &s->chunk, minsize*2,
915 s->data_offset,
916 NULL, NULL,
917 s->consistency_policy, 0)) {
918 pr_err("devices too large for RAID level %d\n", s->level);
919 return 1;
920 }
921 s->size = minsize;
922 if (s->level == 1)
923 /* If this is ever reshaped to RAID5, we will
924 * need a chunksize. So round it off a bit
925 * now just to be safe
926 */
927 s->size &= ~(64ULL-1);
928 if (c->verbose > 0)
929 pr_err("size set to %lluK\n", s->size);
930 }
931 }
932
933 if (!s->bitmap_file &&
934 !st->ss->external &&
935 s->level >= 1 &&
936 st->ss->add_internal_bitmap &&
937 s->journaldisks == 0 &&
938 (s->consistency_policy != CONSISTENCY_POLICY_RESYNC &&
939 s->consistency_policy != CONSISTENCY_POLICY_PPL) &&
940 (s->write_behind || s->size > 100*1024*1024ULL)) {
941 if (c->verbose > 0)
942 pr_err("automatically enabling write-intent bitmap on large array\n");
943 s->bitmap_file = "internal";
944 }
945 if (s->bitmap_file && str_is_none(s->bitmap_file) == true)
946 s->bitmap_file = NULL;
947
948 if (s->consistency_policy == CONSISTENCY_POLICY_PPL &&
949 !st->ss->write_init_ppl) {
950 pr_err("%s metadata does not support PPL\n", st->ss->name);
951 return 1;
952 }
953
954 if (!have_container && s->level > 0 && ((maxsize-s->size)*100 > maxsize)) {
955 if (c->runstop != 1 || c->verbose >= 0)
956 pr_err("largest drive (%s) exceeds size (%lluK) by more than 1%%\n",
957 maxdisc, s->size);
958 warn = 1;
959 }
960
961 if (st->ss->detail_platform && st->ss->detail_platform(0, 1, NULL) != 0) {
962 if (c->runstop != 1 || c->verbose >= 0)
963 pr_err("%s unable to enumerate platform support\n"
964 " array may not be compatible with hardware/firmware\n",
965 st->ss->name);
966 warn = 1;
967 }
968 st->nodes = c->nodes;
969 st->cluster_name = c->homecluster;
970
971 if (warn) {
972 if (c->runstop!= 1) {
973 if (!ask("Continue creating array? ")) {
974 pr_err("create aborted.\n");
975 return 1;
976 }
977 } else {
978 if (c->verbose > 0)
979 pr_err("creation continuing despite oddities due to --run\n");
980 }
981 }
982
983 /* If this is raid4/5, we want to configure the last active slot
984 * as missing, so that a reconstruct happens (faster than re-parity)
985 * FIX: Can we do this for raid6 as well?
986 */
987 if (st->ss->external == 0 && s->assume_clean == 0 &&
988 c->force == 0 && first_missing >= s->raiddisks) {
989 switch (s->level) {
990 case 4:
991 case 5:
992 insert_point = s->raiddisks-1;
993 s->sparedisks++;
994 info.array.active_disks--;
995 missing_disks++;
996 break;
997 default:
998 break;
999 }
1000 }
1001 /* For raid6, if creating with 1 missing drive, make a good drive
1002 * into a spare, else the create will fail
1003 */
1004 if (s->assume_clean == 0 && c->force == 0 && first_missing < s->raiddisks &&
1005 st->ss->external == 0 &&
1006 second_missing >= s->raiddisks && s->level == 6) {
1007 insert_point = s->raiddisks - 1;
1008 if (insert_point == first_missing)
1009 insert_point--;
1010 s->sparedisks ++;
1011 info.array.active_disks--;
1012 missing_disks++;
1013 }
1014
1015 if (s->level <= 0 && first_missing < subdevs * 2) {
1016 pr_err("This level does not support missing devices\n");
1017 return 1;
1018 }
1019
1020 /* We need to create the device */
1021 map_lock(&map);
1022 mdfd = create_mddev(ident->devname, ident->name, c->autof, LOCAL, chosen_name, 1);
1023 if (mdfd < 0) {
1024 map_unlock(&map);
1025 return 1;
1026 }
1027 /* verify if chosen_name is not in use,
1028 * it could be in conflict with already existing device
1029 * e.g. container, array
1030 */
1031 if (strncmp(chosen_name, DEV_MD_DIR, DEV_MD_DIR_LEN) == 0 &&
1032 map_by_name(&map, chosen_name + DEV_MD_DIR_LEN)) {
1033 pr_err("Array name %s is in use already.\n", chosen_name);
1034 close(mdfd);
1035 map_unlock(&map);
1036 udev_unblock();
1037 return 1;
1038 }
1039
1040 memset(&inf, 0, sizeof(inf));
1041 md_get_array_info(mdfd, &inf);
1042 if (inf.working_disks != 0) {
1043 pr_err("another array by this name is already running.\n");
1044 goto abort_locked;
1045 }
1046
1047 /* Ok, lets try some ioctls */
1048
1049 info.array.level = s->level;
1050 info.array.size = s->size;
1051 info.array.raid_disks = s->raiddisks;
1052 /* The kernel should *know* what md_minor we are dealing
1053 * with, but it chooses to trust me instead. Sigh
1054 */
1055 info.array.md_minor = 0;
1056 if (fstat_is_blkdev(mdfd, chosen_name, &rdev))
1057 info.array.md_minor = minor(rdev);
1058 info.array.not_persistent = 0;
1059
1060 if (((s->level == 4 || s->level == 5) &&
1061 (insert_point < s->raiddisks || first_missing < s->raiddisks)) ||
1062 (s->level == 6 && (insert_point < s->raiddisks ||
1063 second_missing < s->raiddisks)) ||
1064 (s->level <= 0) || s->assume_clean) {
1065 info.array.state = 1; /* clean, but one+ drive will be missing*/
1066 info.resync_start = MaxSector;
1067 } else {
1068 info.array.state = 0; /* not clean, but no errors */
1069 info.resync_start = 0;
1070 }
1071 if (s->level == 10) {
1072 /* for raid10, the bitmap size is the capacity of the array,
1073 * which is array.size * raid_disks / ncopies;
1074 * .. but convert to sectors.
1075 */
1076 int ncopies = ((s->layout>>8) & 255) * (s->layout & 255);
1077 bitmapsize = s->size * s->raiddisks / ncopies * 2;
1078 /* printf("bms=%llu as=%d rd=%d nc=%d\n", bitmapsize, s->size, s->raiddisks, ncopies);*/
1079 } else
1080 bitmapsize = s->size * 2;
1081
1082 /* There is lots of redundancy in these disk counts,
1083 * raid_disks is the most meaningful value
1084 * it describes the geometry of the array
1085 * it is constant
1086 * nr_disks is total number of used slots.
1087 * it should be raid_disks+spare_disks
1088 * spare_disks is the number of extra disks present
1089 * see above
1090 * active_disks is the number of working disks in
1091 * active slots. (With raid_disks)
1092 * working_disks is the total number of working disks,
1093 * including spares
1094 * failed_disks is the number of disks marked failed
1095 *
1096 * Ideally, the kernel would keep these (except raid_disks)
1097 * up-to-date as we ADD_NEW_DISK, but it doesn't (yet).
1098 * So for now, we assume that all raid and spare
1099 * devices will be given.
1100 */
1101 info.array.spare_disks=s->sparedisks;
1102 info.array.failed_disks=missing_disks;
1103 info.array.nr_disks = info.array.working_disks
1104 + info.array.failed_disks;
1105 info.array.layout = s->layout;
1106 info.array.chunk_size = s->chunk*1024;
1107
1108 if (*name == 0) {
1109 /* base name on devname */
1110 /* /dev/md0 -> 0
1111 * /dev/md_d0 -> d0
1112 * /dev/md_foo -> foo
1113 * /dev/md/1 -> 1
1114 * /dev/md/d1 -> d1
1115 * /dev/md/home -> home
1116 * /dev/mdhome -> home
1117 */
1118 /* FIXME compare this with rules in create_mddev */
1119 name = strrchr(chosen_name, '/');
1120
1121 if (name) {
1122 name++;
1123 if (strncmp(name, "md_", 3) == 0 &&
1124 strlen(name) > 3 && (name - chosen_name) == 5 /* /dev/ */)
1125 name += 3;
1126 else if (strncmp(name, "md", 2) == 0 &&
1127 strlen(name) > 2 && isdigit(name[2]) &&
1128 (name - chosen_name) == 5 /* /dev/ */)
1129 name += 2;
1130 }
1131 }
1132 if (!st->ss->init_super(st, &info.array, s, name, c->homehost, uuid,
1133 s->data_offset))
1134 goto abort_locked;
1135
1136 total_slots = info.array.nr_disks;
1137 st->ss->getinfo_super(st, &info, NULL);
1138 if (sysfs_init(&info, mdfd, NULL)) {
1139 pr_err("unable to initialize sysfs\n");
1140 goto abort_locked;
1141 }
1142
1143 if (did_default && c->verbose >= 0) {
1144 if (is_subarray(info.text_version)) {
1145 char devnm[32];
1146 char *ep;
1147 struct mdinfo *mdi;
1148
1149 strncpy(devnm, info.text_version+1, 32);
1150 devnm[31] = 0;
1151 ep = strchr(devnm, '/');
1152 if (ep)
1153 *ep = 0;
1154
1155 mdi = sysfs_read(-1, devnm, GET_VERSION);
1156
1157 pr_info("Creating array inside %s container %s\n",
1158 mdi?mdi->text_version:"managed", devnm);
1159 sysfs_free(mdi);
1160 } else
1161 pr_info("Defaulting to version %s metadata\n",
1162 info.text_version);
1163 }
1164
1165 map_update(&map, fd2devnm(mdfd), info.text_version,
1166 info.uuid, chosen_name);
1167 /* Keep map locked until devices have been added to array
1168 * to stop another mdadm from finding and using those devices.
1169 */
1170
1171 if (s->bitmap_file && (strcmp(s->bitmap_file, "internal") == 0 ||
1172 strcmp(s->bitmap_file, "clustered") == 0)) {
1173 if (!st->ss->add_internal_bitmap) {
1174 pr_err("internal bitmaps not supported with %s metadata\n",
1175 st->ss->name);
1176 goto abort_locked;
1177 }
1178 if (st->ss->add_internal_bitmap(st, &s->bitmap_chunk,
1179 c->delay, s->write_behind,
1180 bitmapsize, 1, major_num)) {
1181 pr_err("Given bitmap chunk size not supported.\n");
1182 goto abort_locked;
1183 }
1184 s->bitmap_file = NULL;
1185 }
1186
1187 if (sysfs_init(&info, mdfd, NULL)) {
1188 pr_err("unable to initialize sysfs\n");
1189 goto abort_locked;
1190 }
1191
1192 if (st->ss->external && st->container_devnm[0]) {
1193 /* member */
1194
1195 /* When creating a member, we need to be careful
1196 * to negotiate with mdmon properly.
1197 * If it is already running, we cannot write to
1198 * the devices and must ask it to do that part.
1199 * If it isn't running, we write to the devices,
1200 * and then start it.
1201 * We hold an exclusive open on the container
1202 * device to make sure mdmon doesn't exit after
1203 * we checked that it is running.
1204 *
1205 * For now, fail if it is already running.
1206 */
1207 container_fd = open_dev_excl(st->container_devnm);
1208 if (container_fd < 0) {
1209 pr_err("Cannot get exclusive open on container - weird.\n");
1210 goto abort_locked;
1211 }
1212 if (mdmon_running(st->container_devnm)) {
1213 if (c->verbose)
1214 pr_err("reusing mdmon for %s.\n",
1215 st->container_devnm);
1216 st->update_tail = &st->updates;
1217 } else
1218 need_mdmon = 1;
1219 }
1220 rv = set_array_info(mdfd, st, &info);
1221 if (rv) {
1222 pr_err("failed to set array info for %s: %s\n", chosen_name, strerror(errno));
1223 goto abort_locked;
1224 }
1225
1226 if (s->bitmap_file) {
1227 int uuid[4];
1228
1229 st->ss->uuid_from_super(st, uuid);
1230 if (CreateBitmap(s->bitmap_file, c->force, (char*)uuid, s->bitmap_chunk,
1231 c->delay, s->write_behind,
1232 bitmapsize,
1233 major_num)) {
1234 goto abort_locked;
1235 }
1236 bitmap_fd = open(s->bitmap_file, O_RDWR);
1237 if (bitmap_fd < 0) {
1238 pr_err("weird: %s cannot be opened\n",
1239 s->bitmap_file);
1240 goto abort_locked;
1241 }
1242 if (ioctl(mdfd, SET_BITMAP_FILE, bitmap_fd) < 0) {
1243 pr_err("Cannot set bitmap file for %s: %s\n", chosen_name, strerror(errno));
1244 goto abort_locked;
1245 }
1246 }
1247
1248 if (add_disks(mdfd, &info, s, c, st, &map, devlist, total_slots,
1249 have_container, insert_point, major_num, chosen_name))
1250 goto abort_locked;
1251
1252 map_unlock(&map);
1253
1254 if (is_container(s->level)) {
1255 /* No need to start. But we should signal udev to
1256 * create links */
1257 sysfs_uevent(&info, "change");
1258 if (c->verbose >= 0)
1259 pr_err("container %s prepared.\n", chosen_name);
1260 wait_for(chosen_name, mdfd);
1261 } else if (c->runstop == 1 || subdevs >= s->raiddisks) {
1262 if (st->ss->external) {
1263 int err;
1264 switch(s->level) {
1265 case LEVEL_LINEAR:
1266 case LEVEL_MULTIPATH:
1267 case 0:
1268 err = sysfs_set_str(&info, NULL, "array_state",
1269 c->readonly
1270 ? "readonly"
1271 : "active");
1272 need_mdmon = 0;
1273 break;
1274 default:
1275 err = sysfs_set_str(&info, NULL, "array_state",
1276 "readonly");
1277 break;
1278 }
1279 sysfs_set_safemode(&info, info.safe_mode_delay);
1280 if (err) {
1281 pr_err("failed to activate array.\n");
1282 ioctl(mdfd, STOP_ARRAY, NULL);
1283 goto abort;
1284 }
1285 } else if (c->readonly &&
1286 sysfs_attribute_available(
1287 &info, NULL, "array_state")) {
1288 if (sysfs_set_str(&info, NULL,
1289 "array_state", "readonly") < 0) {
1290 pr_err("Failed to start array: %s\n",
1291 strerror(errno));
1292 ioctl(mdfd, STOP_ARRAY, NULL);
1293 goto abort;
1294 }
1295 } else {
1296 /* param is not actually used */
1297 mdu_param_t param;
1298 if (ioctl(mdfd, RUN_ARRAY, &param)) {
1299 pr_err("RUN_ARRAY failed: %s\n",
1300 strerror(errno));
1301 if (errno == 524 /* ENOTSUP */ &&
1302 info.array.level == 0)
1303 cont_err("Please use --layout=original or --layout=alternate\n");
1304 if (info.array.chunk_size & (info.array.chunk_size-1)) {
1305 cont_err("Problem may be that chunk size is not a power of 2\n");
1306 }
1307 ioctl(mdfd, STOP_ARRAY, NULL);
1308 goto abort;
1309 }
1310 /* if start_ro module parameter is set, array is
1311 * auto-read-only, which is bad as the resync won't
1312 * start. So lets make it read-write now.
1313 */
1314 ioctl(mdfd, RESTART_ARRAY_RW, NULL);
1315 }
1316 if (c->verbose >= 0)
1317 pr_info("array %s started.\n", chosen_name);
1318 if (st->ss->external && st->container_devnm[0]) {
1319 if (need_mdmon)
1320 start_mdmon(st->container_devnm);
1321
1322 ping_monitor(st->container_devnm);
1323 close(container_fd);
1324 }
1325 wait_for(chosen_name, mdfd);
1326 } else {
1327 pr_err("not starting array - not enough devices.\n");
1328 }
1329 udev_unblock();
1330 close(mdfd);
1331 sysfs_uevent(&info, "change");
1332 return 0;
1333
1334 abort:
1335 udev_unblock();
1336 map_lock(&map);
1337 abort_locked:
1338 map_remove(&map, fd2devnm(mdfd));
1339 map_unlock(&map);
1340
1341 if (mdfd >= 0)
1342 close(mdfd);
1343 return 1;
1344 }