]> git.ipfire.org Git - thirdparty/mdadm.git/blob - Create.c
mdadm/ddf: Abort when raid disk is smaller in getinfo_super_ddf
[thirdparty/mdadm.git] / Create.c
1 /*
2 * mdadm - manage Linux "md" devices aka RAID arrays.
3 *
4 * Copyright (C) 2001-2013 Neil Brown <neilb@suse.de>
5 *
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 *
21 * Author: Neil Brown
22 * Email: <neilb@suse.de>
23 */
24
25 #include "mdadm.h"
26 #include "md_u.h"
27 #include "md_p.h"
28 #include <ctype.h>
29 #include <fcntl.h>
30 #include <signal.h>
31 #include <sys/signalfd.h>
32 #include <sys/wait.h>
33
/*
 * Round *size down with a mask built from (chunk - 1) and make sure
 * something is left afterwards.
 *
 * A *size of zero is passed through untouched.  Because the rounding is a
 * plain bit mask, it only yields a multiple of chunk when chunk is a power
 * of two.
 *
 * Returns 0 on success, or 1 (after printing an error) when the rounded
 * size is zero.
 */
static int round_size_and_verify(unsigned long long *size, int chunk)
{
	unsigned long long rounded;

	if (!*size)
		return 0;

	rounded = *size & ~(unsigned long long)(chunk - 1);
	*size = rounded;
	if (rounded)
		return 0;

	pr_err("Size cannot be smaller than chunk.\n");
	return 1;
}
45
46 /**
47 * default_layout() - Get default layout for level.
48 * @st: metadata requested, could be NULL.
49 * @level: raid level requested.
50 * @verbose: verbose level.
51 *
52 * Try to ask metadata handler first, otherwise use global defaults.
53 *
54 * Return: Layout or &UnSet, return value meaning depends of level used.
55 */
56 int default_layout(struct supertype *st, int level, int verbose)
57 {
58 int layout = UnSet;
59 mapping_t *layout_map = NULL;
60 char *layout_name = NULL;
61
62 if (st && st->ss->default_geometry)
63 st->ss->default_geometry(st, &level, &layout, NULL);
64
65 if (layout != UnSet)
66 return layout;
67
68 switch (level) {
69 default: /* no layout */
70 layout = 0;
71 break;
72 case 0:
73 layout = RAID0_ORIG_LAYOUT;
74 break;
75 case 10:
76 layout = 0x102; /* near=2, far=1 */
77 layout_name = "n2";
78 break;
79 case 5:
80 case 6:
81 layout_map = r5layout;
82 break;
83 case LEVEL_FAULTY:
84 layout_map = faultylayout;
85 break;
86 }
87
88 if (layout_map) {
89 layout = map_name(layout_map, "default");
90 layout_name = map_num_s(layout_map, layout);
91 }
92 if (layout_name && verbose > 0)
93 pr_err("layout defaults to %s\n", layout_name);
94
95 return layout;
96 }
97
98 static pid_t write_zeroes_fork(int fd, struct shape *s, struct supertype *st,
99 struct mddev_dev *dv)
100
101 {
102 const unsigned long long req_size = 1 << 30;
103 unsigned long long offset_bytes, size_bytes, sz;
104 sigset_t sigset;
105 int ret = 0;
106 pid_t pid;
107
108 size_bytes = KIB_TO_BYTES(s->size);
109
110 /*
111 * If size_bytes is zero, this is a zoned raid array where
112 * each disk is of a different size and uses its full
113 * disk. Thus zero the entire disk.
114 */
115 if (!size_bytes && !get_dev_size(fd, dv->devname, &size_bytes))
116 return -1;
117
118 if (dv->data_offset != INVALID_SECTORS)
119 offset_bytes = SEC_TO_BYTES(dv->data_offset);
120 else
121 offset_bytes = SEC_TO_BYTES(st->data_offset);
122
123 pr_info("zeroing data from %lld to %lld on: %s\n",
124 offset_bytes, size_bytes, dv->devname);
125
126 pid = fork();
127 if (pid < 0) {
128 pr_err("Could not fork to zero disks: %s\n", strerror(errno));
129 return pid;
130 } else if (pid != 0) {
131 return pid;
132 }
133
134 sigemptyset(&sigset);
135 sigaddset(&sigset, SIGINT);
136 sigprocmask(SIG_UNBLOCK, &sigset, NULL);
137
138 while (size_bytes) {
139 /*
140 * Split requests to the kernel into 1GB chunks seeing the
141 * fallocate() call is not interruptible and blocking a
142 * ctrl-c for several minutes is not desirable.
143 *
144 * 1GB is chosen as a compromise: the user may still have
145 * to wait several seconds if they ctrl-c on devices that
146 * zero slowly, but will reduce the number of requests
147 * required and thus the overhead on devices that perform
148 * better.
149 */
150 sz = size_bytes;
151 if (sz >= req_size)
152 sz = req_size;
153
154 if (fallocate(fd, FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE,
155 offset_bytes, sz)) {
156 pr_err("zeroing %s failed: %s\n", dv->devname,
157 strerror(errno));
158 ret = 1;
159 break;
160 }
161
162 offset_bytes += sz;
163 size_bytes -= sz;
164 }
165
166 exit(ret);
167 }
168
/**
 * wait_for_zero_forks() - Wait until every zeroing child has terminated.
 * @zero_pids: child pids; entries that are not a valid pid (<= 0) are
 *	       ignored, reaped entries are reset to 0.
 * @count: number of entries in @zero_pids.
 *
 * SIGINT and SIGCHLD are blocked and consumed through a signalfd. The
 * signalfd is only used as a wake-up; which children are done is decided by
 * polling each pid with waitpid(WNOHANG). Counting SIGCHLD deliveries is
 * not reliable: standard signals are not queued, so several children
 * exiting together produce a single SIGCHLD, and a child that exited
 * before SIGCHLD was blocked here produces none at all.
 *
 * The signals blocked here are left blocked on return.
 *
 * Return: 0 when all children exited with status 0; 1 when a child failed,
 * the wait was interrupted by SIGINT, or the signalfd could not be used.
 */
static int wait_for_zero_forks(int *zero_pids, int count)
{
	int wstatus, ret = 0, i, sfd, wait_count = 0;
	struct signalfd_siginfo fdsi;
	bool interrupted = false;
	sigset_t sigset;
	ssize_t s;
	pid_t pid;

	/* Only positive entries are children; a failed fork leaves -1. */
	for (i = 0; i < count; i++)
		if (zero_pids[i] > 0)
			wait_count++;
	if (!wait_count)
		return 0;

	sigemptyset(&sigset);
	sigaddset(&sigset, SIGINT);
	sigaddset(&sigset, SIGCHLD);
	sigprocmask(SIG_BLOCK, &sigset, NULL);

	sfd = signalfd(-1, &sigset, 0);
	if (sfd < 0) {
		pr_err("Unable to create signalfd: %s\n", strerror(errno));
		return 1;
	}

	while (1) {
		/*
		 * Reap whatever has terminated so far. SIGCHLD is already
		 * blocked, so a child exiting after this poll leaves the
		 * signal pending and the read() below returns promptly.
		 */
		for (i = 0; i < count; i++) {
			if (zero_pids[i] <= 0)
				continue;

			pid = waitpid(zero_pids[i], &wstatus, WNOHANG);
			if (pid == 0)
				continue; /* still running */

			/* A waitpid() error is treated as a failed child. */
			if (pid < 0 || !WIFEXITED(wstatus) ||
			    WEXITSTATUS(wstatus))
				ret = 1;
			zero_pids[i] = 0;
			wait_count--;
		}
		if (!wait_count)
			break;

		s = read(sfd, &fdsi, sizeof(fdsi));
		if (s != sizeof(fdsi)) {
			pr_err("Invalid signalfd read: %s\n", strerror(errno));
			close(sfd);
			return 1;
		}

		if (fdsi.ssi_signo == SIGINT) {
			printf("\n");
			pr_info("Interrupting zeroing processes, please wait...\n");
			interrupted = true;
		}
		/* SIGCHLD needs no handling: the next iteration reaps. */
	}

	close(sfd);

	if (interrupted) {
		pr_err("zeroing interrupted!\n");
		return 1;
	}

	if (ret)
		pr_err("zeroing failed!\n");
	else
		pr_info("zeroing finished\n");

	return ret;
}
236
237 static int add_disk_to_super(int mdfd, struct shape *s, struct context *c,
238 struct supertype *st, struct mddev_dev *dv,
239 struct mdinfo *info, int have_container, int major_num,
240 int *zero_pid)
241 {
242 dev_t rdev;
243 int fd;
244
245 if (dv->disposition == 'j') {
246 info->disk.raid_disk = MD_DISK_ROLE_JOURNAL;
247 info->disk.state = (1<<MD_DISK_JOURNAL);
248 } else if (info->disk.raid_disk < s->raiddisks) {
249 info->disk.state = (1<<MD_DISK_ACTIVE) |
250 (1<<MD_DISK_SYNC);
251 } else {
252 info->disk.state = 0;
253 }
254
255 if (dv->writemostly == FlagSet) {
256 if (major_num == BITMAP_MAJOR_CLUSTERED) {
257 pr_err("Can not set %s --write-mostly with a clustered bitmap\n",dv->devname);
258 return 1;
259 } else {
260 info->disk.state |= (1<<MD_DISK_WRITEMOSTLY);
261 }
262
263 }
264
265 if (dv->failfast == FlagSet)
266 info->disk.state |= (1<<MD_DISK_FAILFAST);
267
268 if (have_container) {
269 fd = -1;
270 } else {
271 if (st->ss->external && st->container_devnm[0])
272 fd = open(dv->devname, O_RDWR);
273 else
274 fd = open(dv->devname, O_RDWR|O_EXCL);
275
276 if (fd < 0) {
277 pr_err("failed to open %s after earlier success - aborting\n",
278 dv->devname);
279 return 1;
280 }
281 if (!fstat_is_blkdev(fd, dv->devname, &rdev))
282 return 1;
283 info->disk.major = major(rdev);
284 info->disk.minor = minor(rdev);
285 }
286 if (fd >= 0)
287 remove_partitions(fd);
288 if (st->ss->add_to_super(st, &info->disk, fd, dv->devname,
289 dv->data_offset)) {
290 ioctl(mdfd, STOP_ARRAY, NULL);
291 return 1;
292 }
293 st->ss->getinfo_super(st, info, NULL);
294
295 if (fd >= 0 && s->write_zeroes) {
296 *zero_pid = write_zeroes_fork(fd, s, st, dv);
297 if (*zero_pid <= 0) {
298 ioctl(mdfd, STOP_ARRAY, NULL);
299 return 1;
300 }
301 }
302
303 if (have_container && c->verbose > 0)
304 pr_err("Using %s for device %d\n",
305 map_dev(info->disk.major, info->disk.minor, 0),
306 info->disk.number);
307
308 if (!have_container) {
309 /* getinfo_super might have lost these ... */
310 info->disk.major = major(rdev);
311 info->disk.minor = minor(rdev);
312 }
313
314 return 0;
315 }
316
317 static int update_metadata(int mdfd, struct shape *s, struct supertype *st,
318 struct map_ent **map, struct mdinfo *info,
319 char *chosen_name)
320 {
321 struct mdinfo info_new;
322 struct map_ent *me = NULL;
323
324 /* check to see if the uuid has changed due to these
325 * metadata changes, and if so update the member array
326 * and container uuid. Note ->write_init_super clears
327 * the subarray cursor such that ->getinfo_super once
328 * again returns container info.
329 */
330 st->ss->getinfo_super(st, &info_new, NULL);
331 if (st->ss->external && !is_container(s->level) &&
332 !same_uuid(info_new.uuid, info->uuid, 0)) {
333 map_update(map, fd2devnm(mdfd),
334 info_new.text_version,
335 info_new.uuid, chosen_name);
336 me = map_by_devnm(map, st->container_devnm);
337 }
338
339 if (st->ss->write_init_super(st)) {
340 st->ss->free_super(st);
341 return 1;
342 }
343
344 /*
345 * Before activating the array, perform extra steps
346 * required to configure the internal write-intent
347 * bitmap.
348 */
349 if (info_new.consistency_policy == CONSISTENCY_POLICY_BITMAP &&
350 st->ss->set_bitmap && st->ss->set_bitmap(st, info)) {
351 st->ss->free_super(st);
352 return 1;
353 }
354
355 /* update parent container uuid */
356 if (me) {
357 char *path = xstrdup(me->path);
358
359 st->ss->getinfo_super(st, &info_new, NULL);
360 map_update(map, st->container_devnm, info_new.text_version,
361 info_new.uuid, path);
362 free(path);
363 }
364
365 flush_metadata_updates(st);
366 st->ss->free_super(st);
367
368 return 0;
369 }
370
371 static int add_disks(int mdfd, struct mdinfo *info, struct shape *s,
372 struct context *c, struct supertype *st,
373 struct map_ent **map, struct mddev_dev *devlist,
374 int total_slots, int have_container, int insert_point,
375 int major_num, char *chosen_name)
376 {
377 struct mddev_dev *moved_disk = NULL;
378 int pass, raid_disk_num, dnum;
379 int zero_pids[total_slots];
380 struct mddev_dev *dv;
381 struct mdinfo *infos;
382 sigset_t sigset, orig_sigset;
383 int ret = 0;
384
385 /*
386 * Block SIGINT so the main thread will always wait for the
387 * zeroing processes when being interrupted. Otherwise the
388 * zeroing processes will finish their work in the background
389 * keeping the disk busy.
390 */
391 sigemptyset(&sigset);
392 sigaddset(&sigset, SIGINT);
393 sigprocmask(SIG_BLOCK, &sigset, &orig_sigset);
394 memset(zero_pids, 0, sizeof(zero_pids));
395
396 infos = xmalloc(sizeof(*infos) * total_slots);
397 enable_fds(total_slots);
398 for (pass = 1; pass <= 2; pass++) {
399 for (dnum = 0, raid_disk_num = 0, dv = devlist; dv;
400 dv = (dv->next) ? (dv->next) : moved_disk, dnum++) {
401 if (dnum >= total_slots)
402 abort();
403 if (dnum == insert_point) {
404 raid_disk_num += 1;
405 moved_disk = dv;
406 continue;
407 }
408 if (strcasecmp(dv->devname, "missing") == 0) {
409 raid_disk_num += 1;
410 continue;
411 }
412 if (have_container)
413 moved_disk = NULL;
414 if (have_container && dnum < total_slots - 1)
415 /* repeatedly use the container */
416 moved_disk = dv;
417
418 switch(pass) {
419 case 1:
420 infos[dnum] = *info;
421 infos[dnum].disk.number = dnum;
422 infos[dnum].disk.raid_disk = raid_disk_num++;
423
424 if (dv->disposition == 'j')
425 raid_disk_num--;
426
427 ret = add_disk_to_super(mdfd, s, c, st, dv,
428 &infos[dnum], have_container,
429 major_num, &zero_pids[dnum]);
430 if (ret)
431 goto out;
432
433 break;
434 case 2:
435 infos[dnum].errors = 0;
436
437 ret = add_disk(mdfd, st, info, &infos[dnum]);
438 if (ret) {
439 pr_err("ADD_NEW_DISK for %s failed: %s\n",
440 dv->devname, strerror(errno));
441 if (errno == EINVAL &&
442 info->array.level == 0) {
443 pr_err("Possibly your kernel doesn't support RAID0 layouts.\n");
444 pr_err("Either upgrade, or use --layout=dangerous\n");
445 }
446 goto out;
447 }
448 break;
449 }
450 if (!have_container &&
451 dv == moved_disk && dnum != insert_point) break;
452 }
453
454 if (pass == 1) {
455 ret = wait_for_zero_forks(zero_pids, total_slots);
456 if (ret)
457 goto out;
458
459 ret = update_metadata(mdfd, s, st, map, info,
460 chosen_name);
461 if (ret)
462 goto out;
463 }
464 }
465
466 out:
467 if (ret)
468 wait_for_zero_forks(zero_pids, total_slots);
469 free(infos);
470 sigprocmask(SIG_SETMASK, &orig_sigset, NULL);
471 return ret;
472 }
473
474 int Create(struct supertype *st, struct mddev_ident *ident, int subdevs,
475 struct mddev_dev *devlist, struct shape *s, struct context *c)
476 {
477 /*
478 * Create a new raid array.
479 *
480 * First check that necessary details are available
481 * (i.e. level, raid-disks)
482 *
483 * Then check each disk to see what might be on it
484 * and report anything interesting.
485 *
486 * If anything looks odd, and runstop not set,
487 * abort.
488 *
489 * SET_ARRAY_INFO and ADD_NEW_DISK, and
490 * if runstop==run, or raiddisks disks were used,
491 * RUN_ARRAY
492 */
493 int mdfd;
494 unsigned long long minsize = 0, maxsize = 0;
495 char *mindisc = NULL;
496 char *maxdisc = NULL;
497 char *name = ident->name;
498 int *uuid = ident->uuid_set == 1 ? ident->uuid : NULL;
499 int dnum;
500 struct mddev_dev *dv;
501 dev_t rdev;
502 int fail = 0, warn = 0;
503 int first_missing = subdevs * 2;
504 int second_missing = subdevs * 2;
505 int missing_disks = 0;
506 int insert_point = subdevs * 2; /* where to insert a missing drive */
507 int total_slots;
508 int rv;
509 int bitmap_fd;
510 int have_container = 0;
511 int container_fd = -1;
512 int need_mdmon = 0;
513 unsigned long long bitmapsize;
514 struct mdinfo info;
515 int did_default = 0;
516 int do_default_layout = 0;
517 int do_default_chunk = 0;
518 char chosen_name[1024];
519 struct map_ent *map = NULL;
520 unsigned long long newsize;
521 mdu_array_info_t inf;
522
523 int major_num = BITMAP_MAJOR_HI;
524 if (s->bitmap_file && strcmp(s->bitmap_file, "clustered") == 0) {
525 major_num = BITMAP_MAJOR_CLUSTERED;
526 if (c->nodes <= 1) {
527 pr_err("At least 2 nodes are needed for cluster-md\n");
528 return 1;
529 }
530 }
531
532 memset(&info, 0, sizeof(info));
533 if (s->level == UnSet && st && st->ss->default_geometry)
534 st->ss->default_geometry(st, &s->level, NULL, NULL);
535 if (s->level == UnSet) {
536 pr_err("a RAID level is needed to create an array.\n");
537 return 1;
538 }
539 if (s->raiddisks < 4 && s->level == 6) {
540 pr_err("at least 4 raid-devices needed for level 6\n");
541 return 1;
542 }
543 if (s->raiddisks > 256 && s->level == 6) {
544 pr_err("no more than 256 raid-devices supported for level 6\n");
545 return 1;
546 }
547 if (s->raiddisks < 2 && s->level >= 4) {
548 pr_err("at least 2 raid-devices needed for level %d\n", s->level);
549 return 1;
550 }
551 if (s->level <= 0 && s->sparedisks) {
552 pr_err("This level does not support spare devices\n");
553 return 1;
554 }
555
556 if (subdevs == 1 && strcmp(devlist->devname, "missing") != 0) {
557 /* If given a single device, it might be a container, and we can
558 * extract a device list from there
559 */
560 int fd;
561
562 memset(&inf, 0, sizeof(inf));
563 fd = open(devlist->devname, O_RDONLY);
564 if (fd >= 0 &&
565 md_get_array_info(fd, &inf) == 0 && inf.raid_disks == 0) {
566 /* yep, looks like a container */
567 if (st) {
568 rv = st->ss->load_container(st, fd,
569 devlist->devname);
570 if (rv == 0)
571 have_container = 1;
572 } else {
573 st = super_by_fd(fd, NULL);
574 if (st && !(rv = st->ss->
575 load_container(st, fd,
576 devlist->devname)))
577 have_container = 1;
578 else
579 st = NULL;
580 }
581 if (have_container) {
582 subdevs = s->raiddisks;
583 first_missing = subdevs * 2;
584 second_missing = subdevs * 2;
585 insert_point = subdevs * 2;
586 }
587 }
588 if (fd >= 0)
589 close(fd);
590 }
591 if (st && st->ss->external && s->sparedisks) {
592 pr_err("This metadata type does not support spare disks at create time\n");
593 return 1;
594 }
595 if (subdevs > s->raiddisks+s->sparedisks+s->journaldisks) {
596 pr_err("You have listed more devices (%d) than are in the array(%d)!\n", subdevs, s->raiddisks+s->sparedisks);
597 return 1;
598 }
599 if (!have_container && subdevs < s->raiddisks+s->sparedisks+s->journaldisks) {
600 pr_err("You haven't given enough devices (real or missing) to create this array\n");
601 return 1;
602 }
603 if (s->bitmap_file && s->level <= 0) {
604 pr_err("bitmaps not meaningful with level %s\n",
605 map_num(pers, s->level)?:"given");
606 return 1;
607 }
608
609 /* now set some defaults */
610
611 if (s->layout == UnSet) {
612 do_default_layout = 1;
613 s->layout = default_layout(st, s->level, c->verbose);
614 }
615
616 if (s->level == 10)
617 /* check layout fits in array*/
618 if ((s->layout&255) * ((s->layout>>8)&255) > s->raiddisks) {
619 pr_err("that layout requires at least %d devices\n",
620 (s->layout&255) * ((s->layout>>8)&255));
621 return 1;
622 }
623
624 switch(s->level) {
625 case 4:
626 case 5:
627 case 10:
628 case 6:
629 case 0:
630 if (s->chunk == 0 || s->chunk == UnSet) {
631 s->chunk = UnSet;
632 do_default_chunk = 1;
633 /* chunk will be set later */
634 }
635 break;
636 case LEVEL_LINEAR:
637 /* a chunksize of zero 0s perfectly valid (and preferred) since 2.6.16 */
638 break;
639 case 1:
640 case LEVEL_FAULTY:
641 case LEVEL_MULTIPATH:
642 case LEVEL_CONTAINER:
643 if (s->chunk) {
644 pr_err("specifying chunk size is forbidden for this level\n");
645 return 1;
646 }
647 break;
648 default:
649 pr_err("unknown level %d\n", s->level);
650 return 1;
651 }
652
653 if (s->size == MAX_SIZE)
654 /* use '0' to mean 'max' now... */
655 s->size = 0;
656 if (s->size && s->chunk && s->chunk != UnSet)
657 if (round_size_and_verify(&s->size, s->chunk))
658 return 1;
659
660 newsize = s->size * 2;
661 if (st && ! st->ss->validate_geometry(st, s->level, s->layout, s->raiddisks,
662 &s->chunk, s->size*2,
663 s->data_offset, NULL,
664 &newsize, s->consistency_policy,
665 c->verbose >= 0))
666 return 1;
667
668 if (s->chunk && s->chunk != UnSet) {
669 newsize &= ~(unsigned long long)(s->chunk*2 - 1);
670 if (do_default_chunk) {
671 /* default chunk was just set */
672 if (c->verbose > 0)
673 pr_err("chunk size defaults to %dK\n", s->chunk);
674 if (round_size_and_verify(&s->size, s->chunk))
675 return 1;
676 do_default_chunk = 0;
677 }
678 }
679
680 if (s->size == 0) {
681 s->size = newsize / 2;
682 if (s->level == 1)
683 /* If this is ever reshaped to RAID5, we will
684 * need a chunksize. So round it off a bit
685 * now just to be safe
686 */
687 s->size &= ~(64ULL-1);
688
689 if (s->size && c->verbose > 0)
690 pr_err("setting size to %lluK\n", s->size);
691 }
692
693 /* now look at the subdevs */
694 info.array.active_disks = 0;
695 info.array.working_disks = 0;
696 dnum = 0;
697 for (dv = devlist; dv; dv = dv->next)
698 if (s->data_offset == VARIABLE_OFFSET)
699 dv->data_offset = INVALID_SECTORS;
700 else
701 dv->data_offset = s->data_offset;
702
703 for (dv=devlist; dv && !have_container; dv=dv->next, dnum++) {
704 char *dname = dv->devname;
705 unsigned long long freesize;
706 int dfd;
707 char *doff;
708
709 if (strcasecmp(dname, "missing") == 0) {
710 if (first_missing > dnum)
711 first_missing = dnum;
712 if (second_missing > dnum && dnum > first_missing)
713 second_missing = dnum;
714 missing_disks ++;
715 continue;
716 }
717 if (s->data_offset == VARIABLE_OFFSET) {
718 doff = strchr(dname, ':');
719 if (doff) {
720 *doff++ = 0;
721 dv->data_offset = parse_size(doff);
722 } else
723 dv->data_offset = INVALID_SECTORS;
724 } else
725 dv->data_offset = s->data_offset;
726
727 dfd = open(dname, O_RDONLY);
728 if (dfd < 0) {
729 pr_err("cannot open %s: %s\n",
730 dname, strerror(errno));
731 exit(2);
732 }
733 if (!fstat_is_blkdev(dfd, dname, NULL)) {
734 close(dfd);
735 exit(2);
736 }
737 close(dfd);
738 info.array.working_disks++;
739 if (dnum < s->raiddisks && dv->disposition != 'j')
740 info.array.active_disks++;
741 if (st == NULL) {
742 struct createinfo *ci = conf_get_create_info();
743 if (ci)
744 st = ci->supertype;
745 }
746 if (st == NULL) {
747 /* Need to choose a default metadata, which is different
748 * depending on geometry of array.
749 */
750 int i;
751 char *name = "default";
752 for(i = 0; !st && superlist[i]; i++) {
753 st = superlist[i]->match_metadata_desc(name);
754 if (!st)
755 continue;
756 if (do_default_layout)
757 s->layout = default_layout(st, s->level, c->verbose);
758 switch (st->ss->validate_geometry(
759 st, s->level, s->layout, s->raiddisks,
760 &s->chunk, s->size*2,
761 dv->data_offset, dname,
762 &freesize, s->consistency_policy,
763 c->verbose > 0)) {
764 case -1: /* Not valid, message printed, and not
765 * worth checking any further */
766 exit(2);
767 break;
768 case 0: /* Geometry not valid */
769 free(st);
770 st = NULL;
771 s->chunk = do_default_chunk ? UnSet : s->chunk;
772 break;
773 case 1: /* All happy */
774 break;
775 }
776 }
777
778 if (!st) {
779 int dfd = open(dname, O_RDONLY|O_EXCL);
780 if (dfd < 0) {
781 pr_err("cannot open %s: %s\n",
782 dname, strerror(errno));
783 exit(2);
784 }
785 pr_err("device %s not suitable for any style of array\n",
786 dname);
787 exit(2);
788 }
789 if (st->ss != &super0 ||
790 st->minor_version != 90)
791 did_default = 1;
792 } else {
793 if (do_default_layout)
794 s->layout = default_layout(st, s->level, 0);
795 if (!st->ss->validate_geometry(st, s->level, s->layout,
796 s->raiddisks,
797 &s->chunk, s->size*2,
798 dv->data_offset,
799 dname, &freesize,
800 s->consistency_policy,
801 c->verbose >= 0)) {
802
803 pr_err("%s is not suitable for this array.\n",
804 dname);
805 fail = 1;
806 continue;
807 }
808 }
809
810 if (dv->disposition == 'j')
811 goto skip_size_check; /* skip write journal for size check */
812
813 freesize /= 2; /* convert to K */
814 if (s->chunk && s->chunk != UnSet) {
815 /* round to chunk size */
816 freesize = freesize & ~(s->chunk-1);
817 if (do_default_chunk) {
818 /* default chunk was just set */
819 if (c->verbose > 0)
820 pr_err("chunk size defaults to %dK\n", s->chunk);
821 if (round_size_and_verify(&s->size, s->chunk))
822 return 1;
823 do_default_chunk = 0;
824 }
825 }
826 if (!freesize) {
827 pr_err("no free space left on %s\n", dname);
828 fail = 1;
829 continue;
830 }
831
832 if (s->size && freesize < s->size) {
833 pr_err("%s is smaller than given size. %lluK < %lluK + metadata\n",
834 dname, freesize, s->size);
835 fail = 1;
836 continue;
837 }
838 if (maxdisc == NULL || (maxdisc && freesize > maxsize)) {
839 maxdisc = dname;
840 maxsize = freesize;
841 }
842 if (mindisc ==NULL || (mindisc && freesize < minsize)) {
843 mindisc = dname;
844 minsize = freesize;
845 }
846 skip_size_check:
847 if (c->runstop != 1 || c->verbose >= 0) {
848 int fd = open(dname, O_RDONLY);
849 if (fd < 0) {
850 pr_err("Cannot open %s: %s\n",
851 dname, strerror(errno));
852 fail = 1;
853 continue;
854 }
855 warn |= check_ext2(fd, dname);
856 warn |= check_reiser(fd, dname);
857 warn |= check_raid(fd, dname);
858 if (strcmp(st->ss->name, "1.x") == 0 &&
859 st->minor_version >= 1)
860 /* metadata at front */
861 warn |= check_partitions(fd, dname, 0, 0);
862 else if (s->level == 1 || is_container(s->level) ||
863 (s->level == 0 && s->raiddisks == 1))
864 /* partitions could be meaningful */
865 warn |= check_partitions(fd, dname, freesize*2, s->size*2);
866 else
867 /* partitions cannot be meaningful */
868 warn |= check_partitions(fd, dname, 0, 0);
869 if (strcmp(st->ss->name, "1.x") == 0 &&
870 st->minor_version >= 1 &&
871 did_default &&
872 s->level == 1 &&
873 (warn & 1024) == 0) {
874 warn |= 1024;
875 pr_err("Note: this array has metadata at the start and\n"
876 " may not be suitable as a boot device. If you plan to\n"
877 " store '/boot' on this device please ensure that\n"
878 " your boot-loader understands md/v1.x metadata, or use\n"
879 " --metadata=0.90\n");
880 }
881 close(fd);
882 }
883 }
884 if (missing_disks == dnum && !have_container) {
885 pr_err("Subdevs can't be all missing\n");
886 return 1;
887 }
888 if (s->raiddisks + s->sparedisks > st->max_devs) {
889 pr_err("Too many devices: %s metadata only supports %d\n",
890 st->ss->name, st->max_devs);
891 return 1;
892 }
893 if (have_container)
894 info.array.working_disks = s->raiddisks;
895 if (fail) {
896 pr_err("create aborted\n");
897 return 1;
898 }
899 if (s->size == 0) {
900 if (mindisc == NULL && !have_container) {
901 pr_err("no size and no drives given - aborting create.\n");
902 return 1;
903 }
904 if (s->level > 0 || s->level == LEVEL_MULTIPATH ||
905 s->level == LEVEL_FAULTY || st->ss->external) {
906 /* size is meaningful */
907 if (!st->ss->validate_geometry(st, s->level, s->layout,
908 s->raiddisks,
909 &s->chunk, minsize*2,
910 s->data_offset,
911 NULL, NULL,
912 s->consistency_policy, 0)) {
913 pr_err("devices too large for RAID level %d\n", s->level);
914 return 1;
915 }
916 s->size = minsize;
917 if (s->level == 1)
918 /* If this is ever reshaped to RAID5, we will
919 * need a chunksize. So round it off a bit
920 * now just to be safe
921 */
922 s->size &= ~(64ULL-1);
923 if (c->verbose > 0)
924 pr_err("size set to %lluK\n", s->size);
925 }
926 }
927
928 if (!s->bitmap_file &&
929 !st->ss->external &&
930 s->level >= 1 &&
931 st->ss->add_internal_bitmap &&
932 s->journaldisks == 0 &&
933 (s->consistency_policy != CONSISTENCY_POLICY_RESYNC &&
934 s->consistency_policy != CONSISTENCY_POLICY_PPL) &&
935 (s->write_behind || s->size > 100*1024*1024ULL)) {
936 if (c->verbose > 0)
937 pr_err("automatically enabling write-intent bitmap on large array\n");
938 s->bitmap_file = "internal";
939 }
940 if (s->bitmap_file && strcmp(s->bitmap_file, "none") == 0)
941 s->bitmap_file = NULL;
942
943 if (s->consistency_policy == CONSISTENCY_POLICY_PPL &&
944 !st->ss->write_init_ppl) {
945 pr_err("%s metadata does not support PPL\n", st->ss->name);
946 return 1;
947 }
948
949 if (!have_container && s->level > 0 && ((maxsize-s->size)*100 > maxsize)) {
950 if (c->runstop != 1 || c->verbose >= 0)
951 pr_err("largest drive (%s) exceeds size (%lluK) by more than 1%%\n",
952 maxdisc, s->size);
953 warn = 1;
954 }
955
956 if (st->ss->detail_platform && st->ss->detail_platform(0, 1, NULL) != 0) {
957 if (c->runstop != 1 || c->verbose >= 0)
958 pr_err("%s unable to enumerate platform support\n"
959 " array may not be compatible with hardware/firmware\n",
960 st->ss->name);
961 warn = 1;
962 }
963 st->nodes = c->nodes;
964 st->cluster_name = c->homecluster;
965
966 if (warn) {
967 if (c->runstop!= 1) {
968 if (!ask("Continue creating array? ")) {
969 pr_err("create aborted.\n");
970 return 1;
971 }
972 } else {
973 if (c->verbose > 0)
974 pr_err("creation continuing despite oddities due to --run\n");
975 }
976 }
977
978 /* If this is raid4/5, we want to configure the last active slot
979 * as missing, so that a reconstruct happens (faster than re-parity)
980 * FIX: Can we do this for raid6 as well?
981 */
982 if (st->ss->external == 0 && s->assume_clean == 0 &&
983 c->force == 0 && first_missing >= s->raiddisks) {
984 switch (s->level) {
985 case 4:
986 case 5:
987 insert_point = s->raiddisks-1;
988 s->sparedisks++;
989 info.array.active_disks--;
990 missing_disks++;
991 break;
992 default:
993 break;
994 }
995 }
996 /* For raid6, if creating with 1 missing drive, make a good drive
997 * into a spare, else the create will fail
998 */
999 if (s->assume_clean == 0 && c->force == 0 && first_missing < s->raiddisks &&
1000 st->ss->external == 0 &&
1001 second_missing >= s->raiddisks && s->level == 6) {
1002 insert_point = s->raiddisks - 1;
1003 if (insert_point == first_missing)
1004 insert_point--;
1005 s->sparedisks ++;
1006 info.array.active_disks--;
1007 missing_disks++;
1008 }
1009
1010 if (s->level <= 0 && first_missing < subdevs * 2) {
1011 pr_err("This level does not support missing devices\n");
1012 return 1;
1013 }
1014
1015 /* We need to create the device */
1016 map_lock(&map);
1017 mdfd = create_mddev(ident->devname, ident->name, c->autof, LOCAL, chosen_name, 1);
1018 if (mdfd < 0) {
1019 map_unlock(&map);
1020 return 1;
1021 }
1022 /* verify if chosen_name is not in use,
1023 * it could be in conflict with already existing device
1024 * e.g. container, array
1025 */
1026 if (strncmp(chosen_name, DEV_MD_DIR, DEV_MD_DIR_LEN) == 0 &&
1027 map_by_name(&map, chosen_name + DEV_MD_DIR_LEN)) {
1028 pr_err("Array name %s is in use already.\n", chosen_name);
1029 close(mdfd);
1030 map_unlock(&map);
1031 udev_unblock();
1032 return 1;
1033 }
1034
1035 memset(&inf, 0, sizeof(inf));
1036 md_get_array_info(mdfd, &inf);
1037 if (inf.working_disks != 0) {
1038 pr_err("another array by this name is already running.\n");
1039 goto abort_locked;
1040 }
1041
1042 /* Ok, lets try some ioctls */
1043
1044 info.array.level = s->level;
1045 info.array.size = s->size;
1046 info.array.raid_disks = s->raiddisks;
1047 /* The kernel should *know* what md_minor we are dealing
1048 * with, but it chooses to trust me instead. Sigh
1049 */
1050 info.array.md_minor = 0;
1051 if (fstat_is_blkdev(mdfd, chosen_name, &rdev))
1052 info.array.md_minor = minor(rdev);
1053 info.array.not_persistent = 0;
1054
1055 if (((s->level == 4 || s->level == 5) &&
1056 (insert_point < s->raiddisks || first_missing < s->raiddisks)) ||
1057 (s->level == 6 && (insert_point < s->raiddisks ||
1058 second_missing < s->raiddisks)) ||
1059 (s->level <= 0) || s->assume_clean) {
1060 info.array.state = 1; /* clean, but one+ drive will be missing*/
1061 info.resync_start = MaxSector;
1062 } else {
1063 info.array.state = 0; /* not clean, but no errors */
1064 info.resync_start = 0;
1065 }
1066 if (s->level == 10) {
1067 /* for raid10, the bitmap size is the capacity of the array,
1068 * which is array.size * raid_disks / ncopies;
1069 * .. but convert to sectors.
1070 */
1071 int ncopies = ((s->layout>>8) & 255) * (s->layout & 255);
1072 bitmapsize = s->size * s->raiddisks / ncopies * 2;
1073 /* printf("bms=%llu as=%d rd=%d nc=%d\n", bitmapsize, s->size, s->raiddisks, ncopies);*/
1074 } else
1075 bitmapsize = s->size * 2;
1076
1077 /* There is lots of redundancy in these disk counts,
1078 * raid_disks is the most meaningful value
1079 * it describes the geometry of the array
1080 * it is constant
1081 * nr_disks is total number of used slots.
1082 * it should be raid_disks+spare_disks
1083 * spare_disks is the number of extra disks present
1084 * see above
1085 * active_disks is the number of working disks in
1086 * active slots. (With raid_disks)
1087 * working_disks is the total number of working disks,
1088 * including spares
1089 * failed_disks is the number of disks marked failed
1090 *
1091 * Ideally, the kernel would keep these (except raid_disks)
1092 * up-to-date as we ADD_NEW_DISK, but it doesn't (yet).
1093 * So for now, we assume that all raid and spare
1094 * devices will be given.
1095 */
1096 info.array.spare_disks=s->sparedisks;
1097 info.array.failed_disks=missing_disks;
1098 info.array.nr_disks = info.array.working_disks
1099 + info.array.failed_disks;
1100 info.array.layout = s->layout;
1101 info.array.chunk_size = s->chunk*1024;
1102
1103 if (*name == 0) {
1104 /* base name on devname */
1105 /* /dev/md0 -> 0
1106 * /dev/md_d0 -> d0
1107 * /dev/md_foo -> foo
1108 * /dev/md/1 -> 1
1109 * /dev/md/d1 -> d1
1110 * /dev/md/home -> home
1111 * /dev/mdhome -> home
1112 */
1113 /* FIXME compare this with rules in create_mddev */
1114 name = strrchr(chosen_name, '/');
1115
1116 if (name) {
1117 name++;
1118 if (strncmp(name, "md_", 3) == 0 &&
1119 strlen(name) > 3 && (name - chosen_name) == 5 /* /dev/ */)
1120 name += 3;
1121 else if (strncmp(name, "md", 2) == 0 &&
1122 strlen(name) > 2 && isdigit(name[2]) &&
1123 (name - chosen_name) == 5 /* /dev/ */)
1124 name += 2;
1125 }
1126 }
1127 if (!st->ss->init_super(st, &info.array, s, name, c->homehost, uuid,
1128 s->data_offset))
1129 goto abort_locked;
1130
1131 total_slots = info.array.nr_disks;
1132 st->ss->getinfo_super(st, &info, NULL);
1133 if (sysfs_init(&info, mdfd, NULL)) {
1134 pr_err("unable to initialize sysfs\n");
1135 goto abort_locked;
1136 }
1137
1138 if (did_default && c->verbose >= 0) {
1139 if (is_subarray(info.text_version)) {
1140 char devnm[32];
1141 char *ep;
1142 struct mdinfo *mdi;
1143
1144 strncpy(devnm, info.text_version+1, 32);
1145 devnm[31] = 0;
1146 ep = strchr(devnm, '/');
1147 if (ep)
1148 *ep = 0;
1149
1150 mdi = sysfs_read(-1, devnm, GET_VERSION);
1151
1152 pr_info("Creating array inside %s container %s\n",
1153 mdi?mdi->text_version:"managed", devnm);
1154 sysfs_free(mdi);
1155 } else
1156 pr_info("Defaulting to version %s metadata\n",
1157 info.text_version);
1158 }
1159
1160 map_update(&map, fd2devnm(mdfd), info.text_version,
1161 info.uuid, chosen_name);
1162 /* Keep map locked until devices have been added to array
1163 * to stop another mdadm from finding and using those devices.
1164 */
1165
1166 if (s->bitmap_file && (strcmp(s->bitmap_file, "internal") == 0 ||
1167 strcmp(s->bitmap_file, "clustered") == 0)) {
1168 if (!st->ss->add_internal_bitmap) {
1169 pr_err("internal bitmaps not supported with %s metadata\n",
1170 st->ss->name);
1171 goto abort_locked;
1172 }
1173 if (st->ss->add_internal_bitmap(st, &s->bitmap_chunk,
1174 c->delay, s->write_behind,
1175 bitmapsize, 1, major_num)) {
1176 pr_err("Given bitmap chunk size not supported.\n");
1177 goto abort_locked;
1178 }
1179 s->bitmap_file = NULL;
1180 }
1181
1182 if (sysfs_init(&info, mdfd, NULL)) {
1183 pr_err("unable to initialize sysfs\n");
1184 goto abort_locked;
1185 }
1186
1187 if (st->ss->external && st->container_devnm[0]) {
1188 /* member */
1189
1190 /* When creating a member, we need to be careful
1191 * to negotiate with mdmon properly.
1192 * If it is already running, we cannot write to
1193 * the devices and must ask it to do that part.
1194 * If it isn't running, we write to the devices,
1195 * and then start it.
1196 * We hold an exclusive open on the container
1197 * device to make sure mdmon doesn't exit after
1198 * we checked that it is running.
1199 *
1200 * For now, fail if it is already running.
1201 */
1202 container_fd = open_dev_excl(st->container_devnm);
1203 if (container_fd < 0) {
1204 pr_err("Cannot get exclusive open on container - weird.\n");
1205 goto abort_locked;
1206 }
1207 if (mdmon_running(st->container_devnm)) {
1208 if (c->verbose)
1209 pr_err("reusing mdmon for %s.\n",
1210 st->container_devnm);
1211 st->update_tail = &st->updates;
1212 } else
1213 need_mdmon = 1;
1214 }
1215 rv = set_array_info(mdfd, st, &info);
1216 if (rv) {
1217 pr_err("failed to set array info for %s: %s\n", chosen_name, strerror(errno));
1218 goto abort_locked;
1219 }
1220
1221 if (s->bitmap_file) {
1222 int uuid[4];
1223
1224 st->ss->uuid_from_super(st, uuid);
1225 if (CreateBitmap(s->bitmap_file, c->force, (char*)uuid, s->bitmap_chunk,
1226 c->delay, s->write_behind,
1227 bitmapsize,
1228 major_num)) {
1229 goto abort_locked;
1230 }
1231 bitmap_fd = open(s->bitmap_file, O_RDWR);
1232 if (bitmap_fd < 0) {
1233 pr_err("weird: %s cannot be opened\n",
1234 s->bitmap_file);
1235 goto abort_locked;
1236 }
1237 if (ioctl(mdfd, SET_BITMAP_FILE, bitmap_fd) < 0) {
1238 pr_err("Cannot set bitmap file for %s: %s\n", chosen_name, strerror(errno));
1239 goto abort_locked;
1240 }
1241 }
1242
1243 if (add_disks(mdfd, &info, s, c, st, &map, devlist, total_slots,
1244 have_container, insert_point, major_num, chosen_name))
1245 goto abort_locked;
1246
1247 map_unlock(&map);
1248
1249 if (is_container(s->level)) {
1250 /* No need to start. But we should signal udev to
1251 * create links */
1252 sysfs_uevent(&info, "change");
1253 if (c->verbose >= 0)
1254 pr_err("container %s prepared.\n", chosen_name);
1255 wait_for(chosen_name, mdfd);
1256 } else if (c->runstop == 1 || subdevs >= s->raiddisks) {
1257 if (st->ss->external) {
1258 int err;
1259 switch(s->level) {
1260 case LEVEL_LINEAR:
1261 case LEVEL_MULTIPATH:
1262 case 0:
1263 err = sysfs_set_str(&info, NULL, "array_state",
1264 c->readonly
1265 ? "readonly"
1266 : "active");
1267 need_mdmon = 0;
1268 break;
1269 default:
1270 err = sysfs_set_str(&info, NULL, "array_state",
1271 "readonly");
1272 break;
1273 }
1274 sysfs_set_safemode(&info, info.safe_mode_delay);
1275 if (err) {
1276 pr_err("failed to activate array.\n");
1277 ioctl(mdfd, STOP_ARRAY, NULL);
1278 goto abort;
1279 }
1280 } else if (c->readonly &&
1281 sysfs_attribute_available(
1282 &info, NULL, "array_state")) {
1283 if (sysfs_set_str(&info, NULL,
1284 "array_state", "readonly") < 0) {
1285 pr_err("Failed to start array: %s\n",
1286 strerror(errno));
1287 ioctl(mdfd, STOP_ARRAY, NULL);
1288 goto abort;
1289 }
1290 } else {
1291 /* param is not actually used */
1292 mdu_param_t param;
1293 if (ioctl(mdfd, RUN_ARRAY, &param)) {
1294 pr_err("RUN_ARRAY failed: %s\n",
1295 strerror(errno));
1296 if (errno == 524 /* ENOTSUP */ &&
1297 info.array.level == 0)
1298 cont_err("Please use --layout=original or --layout=alternate\n");
1299 if (info.array.chunk_size & (info.array.chunk_size-1)) {
1300 cont_err("Problem may be that chunk size is not a power of 2\n");
1301 }
1302 ioctl(mdfd, STOP_ARRAY, NULL);
1303 goto abort;
1304 }
1305 /* if start_ro module parameter is set, array is
1306 * auto-read-only, which is bad as the resync won't
1307 * start. So lets make it read-write now.
1308 */
1309 ioctl(mdfd, RESTART_ARRAY_RW, NULL);
1310 }
1311 if (c->verbose >= 0)
1312 pr_info("array %s started.\n", chosen_name);
1313 if (st->ss->external && st->container_devnm[0]) {
1314 if (need_mdmon)
1315 start_mdmon(st->container_devnm);
1316
1317 ping_monitor(st->container_devnm);
1318 close(container_fd);
1319 }
1320 wait_for(chosen_name, mdfd);
1321 } else {
1322 pr_err("not starting array - not enough devices.\n");
1323 }
1324 udev_unblock();
1325 close(mdfd);
1326 sysfs_uevent(&info, "change");
1327 return 0;
1328
1329 abort:
1330 udev_unblock();
1331 map_lock(&map);
1332 abort_locked:
1333 map_remove(&map, fd2devnm(mdfd));
1334 map_unlock(&map);
1335
1336 if (mdfd >= 0)
1337 close(mdfd);
1338 return 1;
1339 }