]> git.ipfire.org Git - thirdparty/mdadm.git/blob - Create.c
c84c1ac811d01b2f80dd93f5b5d729c833210aeb
[thirdparty/mdadm.git] / Create.c
1 /*
2 * mdadm - manage Linux "md" devices aka RAID arrays.
3 *
4 * Copyright (C) 2001-2013 Neil Brown <neilb@suse.de>
5 *
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 *
21 * Author: Neil Brown
22 * Email: <neilb@suse.de>
23 */
24
25 #include "mdadm.h"
26 #include "md_u.h"
27 #include "md_p.h"
28 #include <ctype.h>
29
/**
 * round_size_and_verify() - Round a size down to a whole number of chunks.
 * @size: in/out, size to round; 0 is left untouched.
 * @chunk: chunk size, in the same unit as @size.
 *
 * The size is rounded down to the nearest multiple of @chunk.  Plain modulo
 * arithmetic is used rather than a bit mask so that the result is a true
 * multiple of @chunk even when @chunk is not a power of two; for power of
 * two chunks both forms give the same answer.
 *
 * A non-positive @chunk cannot hold any data, so the size rounds to 0 and
 * is reported as an error.
 *
 * Return: 0 if *@size was 0 on entry or is still non-zero after rounding,
 * 1 (after printing an error) if rounding left nothing.
 */
static int round_size_and_verify(unsigned long long *size, int chunk)
{
	if (*size == 0)
		return 0;

	if (chunk > 0)
		*size -= *size % (unsigned long long)chunk;
	else
		*size = 0;

	if (*size == 0) {
		pr_err("Size cannot be smaller than chunk.\n");
		return 1;
	}
	return 0;
}
41
42 /**
43 * default_layout() - Get default layout for level.
44 * @st: metadata requested, could be NULL.
45 * @level: raid level requested.
46 * @verbose: verbose level.
47 *
48 * Try to ask metadata handler first, otherwise use global defaults.
49 *
50 * Return: Layout or &UnSet, return value meaning depends of level used.
51 */
52 int default_layout(struct supertype *st, int level, int verbose)
53 {
54 int layout = UnSet;
55 mapping_t *layout_map = NULL;
56 char *layout_name = NULL;
57
58 if (st && st->ss->default_geometry)
59 st->ss->default_geometry(st, &level, &layout, NULL);
60
61 if (layout != UnSet)
62 return layout;
63
64 switch (level) {
65 default: /* no layout */
66 layout = 0;
67 break;
68 case 0:
69 layout = RAID0_ORIG_LAYOUT;
70 break;
71 case 10:
72 layout = 0x102; /* near=2, far=1 */
73 layout_name = "n2";
74 break;
75 case 5:
76 case 6:
77 layout_map = r5layout;
78 break;
79 case LEVEL_FAULTY:
80 layout_map = faultylayout;
81 break;
82 }
83
84 if (layout_map) {
85 layout = map_name(layout_map, "default");
86 layout_name = map_num_s(layout_map, layout);
87 }
88 if (layout_name && verbose > 0)
89 pr_err("layout defaults to %s\n", layout_name);
90
91 return layout;
92 }
93
94 int Create(struct supertype *st, char *mddev,
95 char *name, int *uuid,
96 int subdevs, struct mddev_dev *devlist,
97 struct shape *s,
98 struct context *c, unsigned long long data_offset)
99 {
100 /*
101 * Create a new raid array.
102 *
103 * First check that necessary details are available
104 * (i.e. level, raid-disks)
105 *
106 * Then check each disk to see what might be on it
107 * and report anything interesting.
108 *
109 * If anything looks odd, and runstop not set,
110 * abort.
111 *
112 * SET_ARRAY_INFO and ADD_NEW_DISK, and
113 * if runstop==run, or raiddisks disks were used,
114 * RUN_ARRAY
115 */
116 int mdfd;
117 unsigned long long minsize = 0, maxsize = 0;
118 char *mindisc = NULL;
119 char *maxdisc = NULL;
120 int dnum, raid_disk_num;
121 struct mddev_dev *dv;
122 dev_t rdev;
123 int fail = 0, warn = 0;
124 int first_missing = subdevs * 2;
125 int second_missing = subdevs * 2;
126 int missing_disks = 0;
127 int insert_point = subdevs * 2; /* where to insert a missing drive */
128 int total_slots;
129 int pass;
130 int rv;
131 int bitmap_fd;
132 int have_container = 0;
133 int container_fd = -1;
134 int need_mdmon = 0;
135 unsigned long long bitmapsize;
136 struct mdinfo info, *infos;
137 int did_default = 0;
138 int do_default_layout = 0;
139 int do_default_chunk = 0;
140 unsigned long safe_mode_delay = 0;
141 char chosen_name[1024];
142 struct map_ent *map = NULL;
143 unsigned long long newsize;
144 mdu_array_info_t inf;
145
146 int major_num = BITMAP_MAJOR_HI;
147 if (s->bitmap_file && strcmp(s->bitmap_file, "clustered") == 0) {
148 major_num = BITMAP_MAJOR_CLUSTERED;
149 if (c->nodes <= 1) {
150 pr_err("At least 2 nodes are needed for cluster-md\n");
151 return 1;
152 }
153 }
154
155 memset(&info, 0, sizeof(info));
156 if (s->level == UnSet && st && st->ss->default_geometry)
157 st->ss->default_geometry(st, &s->level, NULL, NULL);
158 if (s->level == UnSet) {
159 pr_err("a RAID level is needed to create an array.\n");
160 return 1;
161 }
162 if (s->raiddisks < 4 && s->level == 6) {
163 pr_err("at least 4 raid-devices needed for level 6\n");
164 return 1;
165 }
166 if (s->raiddisks > 256 && s->level == 6) {
167 pr_err("no more than 256 raid-devices supported for level 6\n");
168 return 1;
169 }
170 if (s->raiddisks < 2 && s->level >= 4) {
171 pr_err("at least 2 raid-devices needed for level %d\n", s->level);
172 return 1;
173 }
174 if (s->level <= 0 && s->sparedisks) {
175 pr_err("This level does not support spare devices\n");
176 return 1;
177 }
178
179 if (subdevs == 1 && strcmp(devlist->devname, "missing") != 0) {
180 /* If given a single device, it might be a container, and we can
181 * extract a device list from there
182 */
183 int fd;
184
185 memset(&inf, 0, sizeof(inf));
186 fd = open(devlist->devname, O_RDONLY);
187 if (fd >= 0 &&
188 md_get_array_info(fd, &inf) == 0 && inf.raid_disks == 0) {
189 /* yep, looks like a container */
190 if (st) {
191 rv = st->ss->load_container(st, fd,
192 devlist->devname);
193 if (rv == 0)
194 have_container = 1;
195 } else {
196 st = super_by_fd(fd, NULL);
197 if (st && !(rv = st->ss->
198 load_container(st, fd,
199 devlist->devname)))
200 have_container = 1;
201 else
202 st = NULL;
203 }
204 if (have_container) {
205 subdevs = s->raiddisks;
206 first_missing = subdevs * 2;
207 second_missing = subdevs * 2;
208 insert_point = subdevs * 2;
209 }
210 }
211 if (fd >= 0)
212 close(fd);
213 }
214 if (st && st->ss->external && s->sparedisks) {
215 pr_err("This metadata type does not support spare disks at create time\n");
216 return 1;
217 }
218 if (subdevs > s->raiddisks+s->sparedisks+s->journaldisks) {
219 pr_err("You have listed more devices (%d) than are in the array(%d)!\n", subdevs, s->raiddisks+s->sparedisks);
220 return 1;
221 }
222 if (!have_container && subdevs < s->raiddisks+s->sparedisks+s->journaldisks) {
223 pr_err("You haven't given enough devices (real or missing) to create this array\n");
224 return 1;
225 }
226 if (s->bitmap_file && s->level <= 0) {
227 pr_err("bitmaps not meaningful with level %s\n",
228 map_num(pers, s->level)?:"given");
229 return 1;
230 }
231
232 /* now set some defaults */
233
234 if (s->layout == UnSet) {
235 do_default_layout = 1;
236 s->layout = default_layout(st, s->level, c->verbose);
237 }
238
239 if (s->level == 10)
240 /* check layout fits in array*/
241 if ((s->layout&255) * ((s->layout>>8)&255) > s->raiddisks) {
242 pr_err("that layout requires at least %d devices\n",
243 (s->layout&255) * ((s->layout>>8)&255));
244 return 1;
245 }
246
247 switch(s->level) {
248 case 4:
249 case 5:
250 case 10:
251 case 6:
252 case 0:
253 if (s->chunk == 0 || s->chunk == UnSet) {
254 s->chunk = UnSet;
255 do_default_chunk = 1;
256 /* chunk will be set later */
257 }
258 break;
259 case LEVEL_LINEAR:
260 /* a chunksize of zero 0s perfectly valid (and preferred) since 2.6.16 */
261 if (get_linux_version() < 2006016 && s->chunk == 0) {
262 s->chunk = 64;
263 if (c->verbose > 0)
264 pr_err("chunk size defaults to 64K\n");
265 }
266 break;
267 case 1:
268 case LEVEL_FAULTY:
269 case LEVEL_MULTIPATH:
270 case LEVEL_CONTAINER:
271 if (s->chunk) {
272 pr_err("specifying chunk size is forbidden for this level\n");
273 return 1;
274 }
275 break;
276 default:
277 pr_err("unknown level %d\n", s->level);
278 return 1;
279 }
280
281 if (s->size == MAX_SIZE)
282 /* use '0' to mean 'max' now... */
283 s->size = 0;
284 if (s->size && s->chunk && s->chunk != UnSet)
285 if (round_size_and_verify(&s->size, s->chunk))
286 return 1;
287
288 newsize = s->size * 2;
289 if (st && ! st->ss->validate_geometry(st, s->level, s->layout, s->raiddisks,
290 &s->chunk, s->size*2,
291 data_offset, NULL,
292 &newsize, s->consistency_policy,
293 c->verbose >= 0))
294 return 1;
295
296 if (s->chunk && s->chunk != UnSet) {
297 newsize &= ~(unsigned long long)(s->chunk*2 - 1);
298 if (do_default_chunk) {
299 /* default chunk was just set */
300 if (c->verbose > 0)
301 pr_err("chunk size defaults to %dK\n", s->chunk);
302 if (round_size_and_verify(&s->size, s->chunk))
303 return 1;
304 do_default_chunk = 0;
305 }
306 }
307
308 if (s->size == 0) {
309 s->size = newsize / 2;
310 if (s->level == 1)
311 /* If this is ever reshaped to RAID5, we will
312 * need a chunksize. So round it off a bit
313 * now just to be safe
314 */
315 s->size &= ~(64ULL-1);
316
317 if (s->size && c->verbose > 0)
318 pr_err("setting size to %lluK\n", s->size);
319 }
320
321 /* now look at the subdevs */
322 info.array.active_disks = 0;
323 info.array.working_disks = 0;
324 dnum = 0;
325 for (dv = devlist; dv; dv = dv->next)
326 if (data_offset == VARIABLE_OFFSET)
327 dv->data_offset = INVALID_SECTORS;
328 else
329 dv->data_offset = data_offset;
330
331 for (dv=devlist; dv && !have_container; dv=dv->next, dnum++) {
332 char *dname = dv->devname;
333 unsigned long long freesize;
334 int dfd;
335 char *doff;
336
337 if (strcasecmp(dname, "missing") == 0) {
338 if (first_missing > dnum)
339 first_missing = dnum;
340 if (second_missing > dnum && dnum > first_missing)
341 second_missing = dnum;
342 missing_disks ++;
343 continue;
344 }
345 if (data_offset == VARIABLE_OFFSET) {
346 doff = strchr(dname, ':');
347 if (doff) {
348 *doff++ = 0;
349 dv->data_offset = parse_size(doff);
350 } else
351 dv->data_offset = INVALID_SECTORS;
352 } else
353 dv->data_offset = data_offset;
354
355 dfd = open(dname, O_RDONLY);
356 if (dfd < 0) {
357 pr_err("cannot open %s: %s\n",
358 dname, strerror(errno));
359 exit(2);
360 }
361 if (!fstat_is_blkdev(dfd, dname, NULL)) {
362 close(dfd);
363 exit(2);
364 }
365 close(dfd);
366 info.array.working_disks++;
367 if (dnum < s->raiddisks && dv->disposition != 'j')
368 info.array.active_disks++;
369 if (st == NULL) {
370 struct createinfo *ci = conf_get_create_info();
371 if (ci)
372 st = ci->supertype;
373 }
374 if (st == NULL) {
375 /* Need to choose a default metadata, which is different
376 * depending on geometry of array.
377 */
378 int i;
379 char *name = "default";
380 for(i = 0; !st && superlist[i]; i++) {
381 st = superlist[i]->match_metadata_desc(name);
382 if (!st)
383 continue;
384 if (do_default_layout)
385 s->layout = default_layout(st, s->level, c->verbose);
386 switch (st->ss->validate_geometry(
387 st, s->level, s->layout, s->raiddisks,
388 &s->chunk, s->size*2,
389 dv->data_offset, dname,
390 &freesize, s->consistency_policy,
391 c->verbose > 0)) {
392 case -1: /* Not valid, message printed, and not
393 * worth checking any further */
394 exit(2);
395 break;
396 case 0: /* Geometry not valid */
397 free(st);
398 st = NULL;
399 s->chunk = do_default_chunk ? UnSet : s->chunk;
400 break;
401 case 1: /* All happy */
402 break;
403 }
404 }
405
406 if (!st) {
407 int dfd = open(dname, O_RDONLY|O_EXCL);
408 if (dfd < 0) {
409 pr_err("cannot open %s: %s\n",
410 dname, strerror(errno));
411 exit(2);
412 }
413 pr_err("device %s not suitable for any style of array\n",
414 dname);
415 exit(2);
416 }
417 if (st->ss != &super0 ||
418 st->minor_version != 90)
419 did_default = 1;
420 } else {
421 if (do_default_layout)
422 s->layout = default_layout(st, s->level, 0);
423 if (!st->ss->validate_geometry(st, s->level, s->layout,
424 s->raiddisks,
425 &s->chunk, s->size*2,
426 dv->data_offset,
427 dname, &freesize,
428 s->consistency_policy,
429 c->verbose >= 0)) {
430
431 pr_err("%s is not suitable for this array.\n",
432 dname);
433 fail = 1;
434 continue;
435 }
436 }
437
438 if (dv->disposition == 'j')
439 goto skip_size_check; /* skip write journal for size check */
440
441 freesize /= 2; /* convert to K */
442 if (s->chunk && s->chunk != UnSet) {
443 /* round to chunk size */
444 freesize = freesize & ~(s->chunk-1);
445 if (do_default_chunk) {
446 /* default chunk was just set */
447 if (c->verbose > 0)
448 pr_err("chunk size defaults to %dK\n", s->chunk);
449 if (round_size_and_verify(&s->size, s->chunk))
450 return 1;
451 do_default_chunk = 0;
452 }
453 }
454 if (!freesize) {
455 pr_err("no free space left on %s\n", dname);
456 fail = 1;
457 continue;
458 }
459
460 if (s->size && freesize < s->size) {
461 pr_err("%s is smaller than given size. %lluK < %lluK + metadata\n",
462 dname, freesize, s->size);
463 fail = 1;
464 continue;
465 }
466 if (maxdisc == NULL || (maxdisc && freesize > maxsize)) {
467 maxdisc = dname;
468 maxsize = freesize;
469 }
470 if (mindisc ==NULL || (mindisc && freesize < minsize)) {
471 mindisc = dname;
472 minsize = freesize;
473 }
474 skip_size_check:
475 if (c->runstop != 1 || c->verbose >= 0) {
476 int fd = open(dname, O_RDONLY);
477 if (fd < 0) {
478 pr_err("Cannot open %s: %s\n",
479 dname, strerror(errno));
480 fail = 1;
481 continue;
482 }
483 warn |= check_ext2(fd, dname);
484 warn |= check_reiser(fd, dname);
485 warn |= check_raid(fd, dname);
486 if (strcmp(st->ss->name, "1.x") == 0 &&
487 st->minor_version >= 1)
488 /* metadata at front */
489 warn |= check_partitions(fd, dname, 0, 0);
490 else if (s->level == 1 || s->level == LEVEL_CONTAINER ||
491 (s->level == 0 && s->raiddisks == 1))
492 /* partitions could be meaningful */
493 warn |= check_partitions(fd, dname, freesize*2, s->size*2);
494 else
495 /* partitions cannot be meaningful */
496 warn |= check_partitions(fd, dname, 0, 0);
497 if (strcmp(st->ss->name, "1.x") == 0 &&
498 st->minor_version >= 1 &&
499 did_default &&
500 s->level == 1 &&
501 (warn & 1024) == 0) {
502 warn |= 1024;
503 pr_err("Note: this array has metadata at the start and\n"
504 " may not be suitable as a boot device. If you plan to\n"
505 " store '/boot' on this device please ensure that\n"
506 " your boot-loader understands md/v1.x metadata, or use\n"
507 " --metadata=0.90\n");
508 }
509 close(fd);
510 }
511 }
512 if (missing_disks == dnum && !have_container) {
513 pr_err("Subdevs can't be all missing\n");
514 return 1;
515 }
516 if (s->raiddisks + s->sparedisks > st->max_devs) {
517 pr_err("Too many devices: %s metadata only supports %d\n",
518 st->ss->name, st->max_devs);
519 return 1;
520 }
521 if (have_container)
522 info.array.working_disks = s->raiddisks;
523 if (fail) {
524 pr_err("create aborted\n");
525 return 1;
526 }
527 if (s->size == 0) {
528 if (mindisc == NULL && !have_container) {
529 pr_err("no size and no drives given - aborting create.\n");
530 return 1;
531 }
532 if (s->level > 0 || s->level == LEVEL_MULTIPATH ||
533 s->level == LEVEL_FAULTY || st->ss->external) {
534 /* size is meaningful */
535 if (!st->ss->validate_geometry(st, s->level, s->layout,
536 s->raiddisks,
537 &s->chunk, minsize*2,
538 data_offset,
539 NULL, NULL,
540 s->consistency_policy, 0)) {
541 pr_err("devices too large for RAID level %d\n", s->level);
542 return 1;
543 }
544 s->size = minsize;
545 if (s->level == 1)
546 /* If this is ever reshaped to RAID5, we will
547 * need a chunksize. So round it off a bit
548 * now just to be safe
549 */
550 s->size &= ~(64ULL-1);
551 if (c->verbose > 0)
552 pr_err("size set to %lluK\n", s->size);
553 }
554 }
555
556 if (!s->bitmap_file &&
557 !st->ss->external &&
558 s->level >= 1 &&
559 st->ss->add_internal_bitmap &&
560 s->journaldisks == 0 &&
561 (s->consistency_policy != CONSISTENCY_POLICY_RESYNC &&
562 s->consistency_policy != CONSISTENCY_POLICY_PPL) &&
563 (s->write_behind || s->size > 100*1024*1024ULL)) {
564 if (c->verbose > 0)
565 pr_err("automatically enabling write-intent bitmap on large array\n");
566 s->bitmap_file = "internal";
567 }
568 if (s->bitmap_file && strcmp(s->bitmap_file, "none") == 0)
569 s->bitmap_file = NULL;
570
571 if (s->consistency_policy == CONSISTENCY_POLICY_PPL &&
572 !st->ss->write_init_ppl) {
573 pr_err("%s metadata does not support PPL\n", st->ss->name);
574 return 1;
575 }
576
577 if (!have_container && s->level > 0 && ((maxsize-s->size)*100 > maxsize)) {
578 if (c->runstop != 1 || c->verbose >= 0)
579 pr_err("largest drive (%s) exceeds size (%lluK) by more than 1%%\n",
580 maxdisc, s->size);
581 warn = 1;
582 }
583
584 if (st->ss->detail_platform && st->ss->detail_platform(0, 1, NULL) != 0) {
585 if (c->runstop != 1 || c->verbose >= 0)
586 pr_err("%s unable to enumerate platform support\n"
587 " array may not be compatible with hardware/firmware\n",
588 st->ss->name);
589 warn = 1;
590 }
591 st->nodes = c->nodes;
592 st->cluster_name = c->homecluster;
593
594 if (warn) {
595 if (c->runstop!= 1) {
596 if (!ask("Continue creating array? ")) {
597 pr_err("create aborted.\n");
598 return 1;
599 }
600 } else {
601 if (c->verbose > 0)
602 pr_err("creation continuing despite oddities due to --run\n");
603 }
604 }
605
606 /* If this is raid4/5, we want to configure the last active slot
607 * as missing, so that a reconstruct happens (faster than re-parity)
608 * FIX: Can we do this for raid6 as well?
609 */
610 if (st->ss->external == 0 && s->assume_clean == 0 &&
611 c->force == 0 && first_missing >= s->raiddisks) {
612 switch (s->level) {
613 case 4:
614 case 5:
615 insert_point = s->raiddisks-1;
616 s->sparedisks++;
617 info.array.active_disks--;
618 missing_disks++;
619 break;
620 default:
621 break;
622 }
623 }
624 /* For raid6, if creating with 1 missing drive, make a good drive
625 * into a spare, else the create will fail
626 */
627 if (s->assume_clean == 0 && c->force == 0 && first_missing < s->raiddisks &&
628 st->ss->external == 0 &&
629 second_missing >= s->raiddisks && s->level == 6) {
630 insert_point = s->raiddisks - 1;
631 if (insert_point == first_missing)
632 insert_point--;
633 s->sparedisks ++;
634 info.array.active_disks--;
635 missing_disks++;
636 }
637
638 if (s->level <= 0 && first_missing < subdevs * 2) {
639 pr_err("This level does not support missing devices\n");
640 return 1;
641 }
642
643 /* We need to create the device */
644 map_lock(&map);
645 mdfd = create_mddev(mddev, name, c->autof, LOCAL, chosen_name, 1);
646 if (mdfd < 0) {
647 map_unlock(&map);
648 return 1;
649 }
650 /* verify if chosen_name is not in use,
651 * it could be in conflict with already existing device
652 * e.g. container, array
653 */
654 if (strncmp(chosen_name, "/dev/md/", 8) == 0 &&
655 map_by_name(&map, chosen_name+8) != NULL) {
656 pr_err("Array name %s is in use already.\n",
657 chosen_name);
658 close(mdfd);
659 map_unlock(&map);
660 udev_unblock();
661 return 1;
662 }
663 mddev = chosen_name;
664
665 memset(&inf, 0, sizeof(inf));
666 md_get_array_info(mdfd, &inf);
667 if (inf.working_disks != 0) {
668 pr_err("another array by this name is already running.\n");
669 goto abort_locked;
670 }
671
672 /* Ok, lets try some ioctls */
673
674 info.array.level = s->level;
675 info.array.size = s->size;
676 info.array.raid_disks = s->raiddisks;
677 /* The kernel should *know* what md_minor we are dealing
678 * with, but it chooses to trust me instead. Sigh
679 */
680 info.array.md_minor = 0;
681 if (fstat_is_blkdev(mdfd, mddev, &rdev))
682 info.array.md_minor = minor(rdev);
683 info.array.not_persistent = 0;
684
685 if (((s->level == 4 || s->level == 5) &&
686 (insert_point < s->raiddisks || first_missing < s->raiddisks)) ||
687 (s->level == 6 && (insert_point < s->raiddisks ||
688 second_missing < s->raiddisks)) ||
689 (s->level <= 0) || s->assume_clean) {
690 info.array.state = 1; /* clean, but one+ drive will be missing*/
691 info.resync_start = MaxSector;
692 } else {
693 info.array.state = 0; /* not clean, but no errors */
694 info.resync_start = 0;
695 }
696 if (s->level == 10) {
697 /* for raid10, the bitmap size is the capacity of the array,
698 * which is array.size * raid_disks / ncopies;
699 * .. but convert to sectors.
700 */
701 int ncopies = ((s->layout>>8) & 255) * (s->layout & 255);
702 bitmapsize = s->size * s->raiddisks / ncopies * 2;
703 /* printf("bms=%llu as=%d rd=%d nc=%d\n", bitmapsize, s->size, s->raiddisks, ncopies);*/
704 } else
705 bitmapsize = s->size * 2;
706
707 /* There is lots of redundancy in these disk counts,
708 * raid_disks is the most meaningful value
709 * it describes the geometry of the array
710 * it is constant
711 * nr_disks is total number of used slots.
712 * it should be raid_disks+spare_disks
713 * spare_disks is the number of extra disks present
714 * see above
715 * active_disks is the number of working disks in
716 * active slots. (With raid_disks)
717 * working_disks is the total number of working disks,
718 * including spares
719 * failed_disks is the number of disks marked failed
720 *
721 * Ideally, the kernel would keep these (except raid_disks)
722 * up-to-date as we ADD_NEW_DISK, but it doesn't (yet).
723 * So for now, we assume that all raid and spare
724 * devices will be given.
725 */
726 info.array.spare_disks=s->sparedisks;
727 info.array.failed_disks=missing_disks;
728 info.array.nr_disks = info.array.working_disks
729 + info.array.failed_disks;
730 info.array.layout = s->layout;
731 info.array.chunk_size = s->chunk*1024;
732
733 if (name == NULL || *name == 0) {
734 /* base name on mddev */
735 /* /dev/md0 -> 0
736 * /dev/md_d0 -> d0
737 * /dev/md_foo -> foo
738 * /dev/md/1 -> 1
739 * /dev/md/d1 -> d1
740 * /dev/md/home -> home
741 * /dev/mdhome -> home
742 */
743 /* FIXME compare this with rules in create_mddev */
744 name = strrchr(mddev, '/');
745 if (name) {
746 name++;
747 if (strncmp(name, "md_", 3) == 0 &&
748 strlen(name) > 3 && (name-mddev) == 5 /* /dev/ */)
749 name += 3;
750 else if (strncmp(name, "md", 2) == 0 &&
751 strlen(name) > 2 && isdigit(name[2]) &&
752 (name-mddev) == 5 /* /dev/ */)
753 name += 2;
754 }
755 }
756 if (!st->ss->init_super(st, &info.array, s, name, c->homehost, uuid,
757 data_offset))
758 goto abort_locked;
759
760 total_slots = info.array.nr_disks;
761 st->ss->getinfo_super(st, &info, NULL);
762 if (sysfs_init(&info, mdfd, NULL)) {
763 pr_err("unable to initialize sysfs\n");
764 goto abort_locked;
765 }
766
767 if (did_default && c->verbose >= 0) {
768 if (is_subarray(info.text_version)) {
769 char devnm[32];
770 char *ep;
771 struct mdinfo *mdi;
772
773 strncpy(devnm, info.text_version+1, 32);
774 devnm[31] = 0;
775 ep = strchr(devnm, '/');
776 if (ep)
777 *ep = 0;
778
779 mdi = sysfs_read(-1, devnm, GET_VERSION);
780
781 pr_err("Creating array inside %s container %s\n",
782 mdi?mdi->text_version:"managed", devnm);
783 sysfs_free(mdi);
784 } else
785 pr_err("Defaulting to version %s metadata\n", info.text_version);
786 }
787
788 map_update(&map, fd2devnm(mdfd), info.text_version,
789 info.uuid, chosen_name);
790 /* Keep map locked until devices have been added to array
791 * to stop another mdadm from finding and using those devices.
792 */
793
794 if (s->bitmap_file && (strcmp(s->bitmap_file, "internal") == 0 ||
795 strcmp(s->bitmap_file, "clustered") == 0)) {
796 if (!st->ss->add_internal_bitmap) {
797 pr_err("internal bitmaps not supported with %s metadata\n",
798 st->ss->name);
799 goto abort_locked;
800 }
801 if (st->ss->add_internal_bitmap(st, &s->bitmap_chunk,
802 c->delay, s->write_behind,
803 bitmapsize, 1, major_num)) {
804 pr_err("Given bitmap chunk size not supported.\n");
805 goto abort_locked;
806 }
807 s->bitmap_file = NULL;
808 }
809
810 if (sysfs_init(&info, mdfd, NULL)) {
811 pr_err("unable to initialize sysfs\n");
812 goto abort_locked;
813 }
814
815 if (st->ss->external && st->container_devnm[0]) {
816 /* member */
817
818 /* When creating a member, we need to be careful
819 * to negotiate with mdmon properly.
820 * If it is already running, we cannot write to
821 * the devices and must ask it to do that part.
822 * If it isn't running, we write to the devices,
823 * and then start it.
824 * We hold an exclusive open on the container
825 * device to make sure mdmon doesn't exit after
826 * we checked that it is running.
827 *
828 * For now, fail if it is already running.
829 */
830 container_fd = open_dev_excl(st->container_devnm);
831 if (container_fd < 0) {
832 pr_err("Cannot get exclusive open on container - weird.\n");
833 goto abort_locked;
834 }
835 if (mdmon_running(st->container_devnm)) {
836 if (c->verbose)
837 pr_err("reusing mdmon for %s.\n",
838 st->container_devnm);
839 st->update_tail = &st->updates;
840 } else
841 need_mdmon = 1;
842 }
843 rv = set_array_info(mdfd, st, &info);
844 if (rv) {
845 pr_err("failed to set array info for %s: %s\n",
846 mddev, strerror(errno));
847 goto abort_locked;
848 }
849
850 if (s->bitmap_file) {
851 int uuid[4];
852
853 st->ss->uuid_from_super(st, uuid);
854 if (CreateBitmap(s->bitmap_file, c->force, (char*)uuid, s->bitmap_chunk,
855 c->delay, s->write_behind,
856 bitmapsize,
857 major_num)) {
858 goto abort_locked;
859 }
860 bitmap_fd = open(s->bitmap_file, O_RDWR);
861 if (bitmap_fd < 0) {
862 pr_err("weird: %s cannot be opened\n",
863 s->bitmap_file);
864 goto abort_locked;
865 }
866 if (ioctl(mdfd, SET_BITMAP_FILE, bitmap_fd) < 0) {
867 pr_err("Cannot set bitmap file for %s: %s\n",
868 mddev, strerror(errno));
869 goto abort_locked;
870 }
871 }
872
873 infos = xmalloc(sizeof(*infos) * total_slots);
874 enable_fds(total_slots);
875 for (pass = 1; pass <= 2; pass++) {
876 struct mddev_dev *moved_disk = NULL; /* the disk that was moved out of the insert point */
877
878 for (dnum = 0, raid_disk_num = 0, dv = devlist; dv;
879 dv = (dv->next) ? (dv->next) : moved_disk, dnum++) {
880 int fd;
881 struct mdinfo *inf = &infos[dnum];
882
883 if (dnum >= total_slots)
884 abort();
885 if (dnum == insert_point) {
886 raid_disk_num += 1;
887 moved_disk = dv;
888 continue;
889 }
890 if (strcasecmp(dv->devname, "missing") == 0) {
891 raid_disk_num += 1;
892 continue;
893 }
894 if (have_container)
895 moved_disk = NULL;
896 if (have_container && dnum < info.array.raid_disks - 1)
897 /* repeatedly use the container */
898 moved_disk = dv;
899
900 switch(pass) {
901 case 1:
902 *inf = info;
903
904 inf->disk.number = dnum;
905 inf->disk.raid_disk = raid_disk_num++;
906
907 if (dv->disposition == 'j') {
908 inf->disk.raid_disk = MD_DISK_ROLE_JOURNAL;
909 inf->disk.state = (1<<MD_DISK_JOURNAL);
910 raid_disk_num--;
911 } else if (inf->disk.raid_disk < s->raiddisks)
912 inf->disk.state = (1<<MD_DISK_ACTIVE) |
913 (1<<MD_DISK_SYNC);
914 else
915 inf->disk.state = 0;
916
917 if (dv->writemostly == FlagSet) {
918 if (major_num == BITMAP_MAJOR_CLUSTERED) {
919 pr_err("Can not set %s --write-mostly with a clustered bitmap\n",dv->devname);
920 goto abort_locked;
921 } else
922 inf->disk.state |= (1<<MD_DISK_WRITEMOSTLY);
923 }
924 if (dv->failfast == FlagSet)
925 inf->disk.state |= (1<<MD_DISK_FAILFAST);
926
927 if (have_container)
928 fd = -1;
929 else {
930 if (st->ss->external &&
931 st->container_devnm[0])
932 fd = open(dv->devname, O_RDWR);
933 else
934 fd = open(dv->devname, O_RDWR|O_EXCL);
935
936 if (fd < 0) {
937 pr_err("failed to open %s after earlier success - aborting\n",
938 dv->devname);
939 goto abort_locked;
940 }
941 if (!fstat_is_blkdev(fd, dv->devname, &rdev))
942 return 1;
943 inf->disk.major = major(rdev);
944 inf->disk.minor = minor(rdev);
945 }
946 if (fd >= 0)
947 remove_partitions(fd);
948 if (st->ss->add_to_super(st, &inf->disk,
949 fd, dv->devname,
950 dv->data_offset)) {
951 ioctl(mdfd, STOP_ARRAY, NULL);
952 goto abort_locked;
953 }
954 st->ss->getinfo_super(st, inf, NULL);
955 safe_mode_delay = inf->safe_mode_delay;
956
957 if (have_container && c->verbose > 0)
958 pr_err("Using %s for device %d\n",
959 map_dev(inf->disk.major,
960 inf->disk.minor,
961 0), dnum);
962
963 if (!have_container) {
964 /* getinfo_super might have lost these ... */
965 inf->disk.major = major(rdev);
966 inf->disk.minor = minor(rdev);
967 }
968 break;
969 case 2:
970 inf->errors = 0;
971
972 rv = add_disk(mdfd, st, &info, inf);
973
974 if (rv) {
975 pr_err("ADD_NEW_DISK for %s failed: %s\n",
976 dv->devname, strerror(errno));
977 if (errno == EINVAL &&
978 info.array.level == 0) {
979 pr_err("Possibly your kernel doesn't support RAID0 layouts.\n");
980 pr_err("Either upgrade, or use --layout=dangerous\n");
981 }
982 goto abort_locked;
983 }
984 break;
985 }
986 if (!have_container &&
987 dv == moved_disk && dnum != insert_point) break;
988 }
989 if (pass == 1) {
990 struct mdinfo info_new;
991 struct map_ent *me = NULL;
992
993 /* check to see if the uuid has changed due to these
994 * metadata changes, and if so update the member array
995 * and container uuid. Note ->write_init_super clears
996 * the subarray cursor such that ->getinfo_super once
997 * again returns container info.
998 */
999 st->ss->getinfo_super(st, &info_new, NULL);
1000 if (st->ss->external && s->level != LEVEL_CONTAINER &&
1001 !same_uuid(info_new.uuid, info.uuid, 0)) {
1002 map_update(&map, fd2devnm(mdfd),
1003 info_new.text_version,
1004 info_new.uuid, chosen_name);
1005 me = map_by_devnm(&map, st->container_devnm);
1006 }
1007
1008 if (st->ss->write_init_super(st)) {
1009 st->ss->free_super(st);
1010 goto abort_locked;
1011 }
1012 /*
1013 * Before activating the array, perform extra steps
1014 * required to configure the internal write-intent
1015 * bitmap.
1016 */
1017 if (info_new.consistency_policy ==
1018 CONSISTENCY_POLICY_BITMAP &&
1019 st->ss->set_bitmap &&
1020 st->ss->set_bitmap(st, &info)) {
1021 st->ss->free_super(st);
1022 goto abort_locked;
1023 }
1024
1025 /* update parent container uuid */
1026 if (me) {
1027 char *path = xstrdup(me->path);
1028
1029 st->ss->getinfo_super(st, &info_new, NULL);
1030 map_update(&map, st->container_devnm,
1031 info_new.text_version,
1032 info_new.uuid, path);
1033 free(path);
1034 }
1035
1036 flush_metadata_updates(st);
1037 st->ss->free_super(st);
1038 }
1039 }
1040 map_unlock(&map);
1041 free(infos);
1042
1043 if (s->level == LEVEL_CONTAINER) {
1044 /* No need to start. But we should signal udev to
1045 * create links */
1046 sysfs_uevent(&info, "change");
1047 if (c->verbose >= 0)
1048 pr_err("container %s prepared.\n", mddev);
1049 wait_for(chosen_name, mdfd);
1050 } else if (c->runstop == 1 || subdevs >= s->raiddisks) {
1051 if (st->ss->external) {
1052 int err;
1053 switch(s->level) {
1054 case LEVEL_LINEAR:
1055 case LEVEL_MULTIPATH:
1056 case 0:
1057 err = sysfs_set_str(&info, NULL, "array_state",
1058 c->readonly
1059 ? "readonly"
1060 : "active");
1061 need_mdmon = 0;
1062 break;
1063 default:
1064 err = sysfs_set_str(&info, NULL, "array_state",
1065 "readonly");
1066 break;
1067 }
1068 sysfs_set_safemode(&info, safe_mode_delay);
1069 if (err) {
1070 pr_err("failed to activate array.\n");
1071 ioctl(mdfd, STOP_ARRAY, NULL);
1072 goto abort;
1073 }
1074 } else if (c->readonly &&
1075 sysfs_attribute_available(
1076 &info, NULL, "array_state")) {
1077 if (sysfs_set_str(&info, NULL,
1078 "array_state", "readonly") < 0) {
1079 pr_err("Failed to start array: %s\n",
1080 strerror(errno));
1081 ioctl(mdfd, STOP_ARRAY, NULL);
1082 goto abort;
1083 }
1084 } else {
1085 /* param is not actually used */
1086 mdu_param_t param;
1087 if (ioctl(mdfd, RUN_ARRAY, &param)) {
1088 pr_err("RUN_ARRAY failed: %s\n",
1089 strerror(errno));
1090 if (errno == 524 /* ENOTSUP */ &&
1091 info.array.level == 0)
1092 cont_err("Please use --layout=original or --layout=alternate\n");
1093 if (info.array.chunk_size & (info.array.chunk_size-1)) {
1094 cont_err("Problem may be that chunk size is not a power of 2\n");
1095 }
1096 ioctl(mdfd, STOP_ARRAY, NULL);
1097 goto abort;
1098 }
1099 /* if start_ro module parameter is set, array is
1100 * auto-read-only, which is bad as the resync won't
1101 * start. So lets make it read-write now.
1102 */
1103 ioctl(mdfd, RESTART_ARRAY_RW, NULL);
1104 }
1105 if (c->verbose >= 0)
1106 pr_err("array %s started.\n", mddev);
1107 if (st->ss->external && st->container_devnm[0]) {
1108 if (need_mdmon)
1109 start_mdmon(st->container_devnm);
1110
1111 ping_monitor(st->container_devnm);
1112 close(container_fd);
1113 }
1114 wait_for(chosen_name, mdfd);
1115 } else {
1116 pr_err("not starting array - not enough devices.\n");
1117 }
1118 udev_unblock();
1119 close(mdfd);
1120 sysfs_uevent(&info, "change");
1121 return 0;
1122
1123 abort:
1124 udev_unblock();
1125 map_lock(&map);
1126 abort_locked:
1127 map_remove(&map, fd2devnm(mdfd));
1128 map_unlock(&map);
1129
1130 if (mdfd >= 0)
1131 close(mdfd);
1132 return 1;
1133 }