af094952e834f568224cd5c84bef55bb5879a326
[thirdparty/mdadm.git] / Create.c
1 /*
2  * mdadm - manage Linux "md" devices aka RAID arrays.
3  *
4  * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
5  *
6  *
7  *    This program is free software; you can redistribute it and/or modify
8  *    it under the terms of the GNU General Public License as published by
9  *    the Free Software Foundation; either version 2 of the License, or
10  *    (at your option) any later version.
11  *
12  *    This program is distributed in the hope that it will be useful,
13  *    but WITHOUT ANY WARRANTY; without even the implied warranty of
14  *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  *    GNU General Public License for more details.
16  *
17  *    You should have received a copy of the GNU General Public License
18  *    along with this program; if not, write to the Free Software
19  *    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
20  *
21  *    Author: Neil Brown
22  *    Email: <neilb@suse.de>
23  */
24
25 #include "mdadm.h"
26 #include        "md_u.h"
27 #include        "md_p.h"
28 #include        <ctype.h>
29
30 static int default_layout(struct supertype *st, int level, int verbose)
31 {
32         int layout = UnSet;
33
34         if (st && st->ss->default_geometry)
35                 st->ss->default_geometry(st, &level, &layout, NULL);
36
37         if (layout == UnSet)
38                 switch(level) {
39                 default: /* no layout */
40                         layout = 0;
41                         break;
42                 case 10:
43                         layout = 0x102; /* near=2, far=1 */
44                         if (verbose > 0)
45                                 pr_err("layout defaults to n2\n");
46                         break;
47                 case 5:
48                 case 6:
49                         layout = map_name(r5layout, "default");
50                         if (verbose > 0)
51                                 pr_err("layout defaults to %s\n", map_num(r5layout, layout));
52                         break;
53                 case LEVEL_FAULTY:
54                         layout = map_name(faultylayout, "default");
55
56                         if (verbose > 0)
57                                 pr_err("layout defaults to %s\n", map_num(faultylayout, layout));
58                         break;
59                 }
60
61         return layout;
62 }
63
64
65 int Create(struct supertype *st, char *mddev,
66            int chunk, int level, int layout, unsigned long long size,
67            int raiddisks, int sparedisks,
68            char *name, char *homehost, int *uuid,
69            int subdevs, struct mddev_dev *devlist,
70            int runstop, int verbose, int force, int assume_clean,
71            char *bitmap_file, int bitmap_chunk, int write_behind,
72            int delay, int autof)
73 {
74         /*
75          * Create a new raid array.
76          *
77          * First check that necessary details are available
78          * (i.e. level, raid-disks)
79          *
80          * Then check each disk to see what might be on it
81          * and report anything interesting.
82          *
83          * If anything looks odd, and runstop not set,
84          * abort.
85          *
86          * SET_ARRAY_INFO and ADD_NEW_DISK, and
87          * if runstop==run, or raiddisks disks were used,
88          * RUN_ARRAY
89          */
90         int mdfd;
91         unsigned long long minsize=0, maxsize=0;
92         char *mindisc = NULL;
93         char *maxdisc = NULL;
94         int dnum;
95         struct mddev_dev *dv;
96         int fail=0, warn=0;
97         struct stat stb;
98         int first_missing = subdevs * 2;
99         int second_missing = subdevs * 2;
100         int missing_disks = 0;
101         int insert_point = subdevs * 2; /* where to insert a missing drive */
102         int total_slots;
103         int pass;
104         int vers;
105         int rv;
106         int bitmap_fd;
107         int have_container = 0;
108         int container_fd = -1;
109         int need_mdmon = 0;
110         unsigned long long bitmapsize;
111         struct mdinfo info, *infos;
112         int did_default = 0;
113         int do_default_layout = 0;
114         int do_default_chunk = 0;
115         unsigned long safe_mode_delay = 0;
116         char chosen_name[1024];
117         struct map_ent *map = NULL;
118         unsigned long long newsize;
119
120         int major_num = BITMAP_MAJOR_HI;
121
122         memset(&info, 0, sizeof(info));
123         if (level == UnSet && st && st->ss->default_geometry)
124                 st->ss->default_geometry(st, &level, NULL, NULL);
125         if (level == UnSet) {
126                 pr_err("a RAID level is needed to create an array.\n");
127                 return 1;
128         }
129         if (raiddisks < 4 && level == 6) {
130                 pr_err("at least 4 raid-devices needed for level 6\n");
131                 return 1;
132         }
133         if (raiddisks > 256 && level == 6) {
134                 pr_err("no more than 256 raid-devices supported for level 6\n");
135                 return 1;
136         }
137         if (raiddisks < 2 && level >= 4) {
138                 pr_err("at least 2 raid-devices needed for level 4 or 5\n");
139                 return 1;
140         }
141         if (level <= 0 && sparedisks) {
142                 pr_err("This level does not support spare devices\n");
143                 return 1;
144         }
145
146         if (subdevs == 1 && strcmp(devlist->devname, "missing") != 0) {
147                 /* If given a single device, it might be a container, and we can
148                  * extract a device list from there
149                  */
150                 mdu_array_info_t inf;
151                 int fd;
152
153                 memset(&inf, 0, sizeof(inf));
154                 fd = open(devlist->devname, O_RDONLY);
155                 if (fd >= 0 &&
156                     ioctl(fd, GET_ARRAY_INFO, &inf) == 0 &&
157                     inf.raid_disks == 0) {
158                         /* yep, looks like a container */
159                         if (st) {
160                                 rv = st->ss->load_container(st, fd,
161                                                             devlist->devname);
162                                 if (rv == 0)
163                                         have_container = 1;
164                         } else {
165                                 st = super_by_fd(fd, NULL);
166                                 if (st && !(rv = st->ss->
167                                             load_container(st, fd,
168                                                            devlist->devname)))
169                                         have_container = 1;
170                                 else
171                                         st = NULL;
172                         }
173                         if (have_container) {
174                                 subdevs = raiddisks;
175                                 first_missing = subdevs * 2;
176                                 second_missing = subdevs * 2;
177                                 insert_point = subdevs * 2;
178                         }
179                 }
180                 if (fd >= 0)
181                         close(fd);
182         }
183         if (st && st->ss->external && sparedisks) {
184                 pr_err("This metadata type does not support "
185                        "spare disks at create time\n");
186                 return 1;
187         }
188         if (subdevs > raiddisks+sparedisks) {
189                 pr_err("You have listed more devices (%d) than are in the array(%d)!\n", subdevs, raiddisks+sparedisks);
190                 return 1;
191         }
192         if (!have_container && subdevs < raiddisks+sparedisks) {
193                 pr_err("You haven't given enough devices (real or missing) to create this array\n");
194                 return 1;
195         }
196         if (bitmap_file && level <= 0) {
197                 pr_err("bitmaps not meaningful with level %s\n",
198                         map_num(pers, level)?:"given");
199                 return 1;
200         }
201
202         /* now set some defaults */
203
204
205         if (layout == UnSet) {
206                 do_default_layout = 1;
207                 layout = default_layout(st, level, verbose);
208         }
209
210         if (level == 10)
211                 /* check layout fits in array*/
212                 if ((layout&255) * ((layout>>8)&255) > raiddisks) {
213                         pr_err("that layout requires at least %d devices\n",
214                                 (layout&255) * ((layout>>8)&255));
215                         return 1;
216                 }
217
218         switch(level) {
219         case 4:
220         case 5:
221         case 10:
222         case 6:
223         case 0:
224                 if (chunk == 0 || chunk == UnSet) {
225                         chunk = UnSet;
226                         do_default_chunk = 1;
227                         /* chunk will be set later */
228                 }
229                 break;
230         case LEVEL_LINEAR:
231                 /* a chunksize of zero 0s perfectly valid (and preferred) since 2.6.16 */
232                 if (get_linux_version() < 2006016 && chunk == 0) {
233                         chunk = 64;
234                         if (verbose > 0)
235                                 pr_err("chunk size defaults to 64K\n");
236                 }
237                 break;
238         case 1:
239         case LEVEL_FAULTY:
240         case LEVEL_MULTIPATH:
241         case LEVEL_CONTAINER:
242                 if (chunk) {
243                         chunk = 0;
244                         if (verbose > 0)
245                                 pr_err("chunk size ignored for this level\n");
246                 }
247                 break;
248         default:
249                 pr_err("unknown level %d\n", level);
250                 return 1;
251         }
252         
253         if (size && chunk && chunk != UnSet)
254                 size &= ~(unsigned long long)(chunk - 1);
255         newsize = size * 2;
256         if (st && ! st->ss->validate_geometry(st, level, layout, raiddisks,
257                                               &chunk, size*2, NULL, &newsize, verbose>=0))
258                 return 1;
259
260         if (chunk && chunk != UnSet) {
261                 newsize &= ~(unsigned long long)(chunk*2 - 1);
262                 if (do_default_chunk) {
263                         /* default chunk was just set */
264                         if (verbose > 0)
265                                 pr_err("chunk size "
266                                         "defaults to %dK\n", chunk);
267                         size &= ~(unsigned long long)(chunk - 1);
268                         do_default_chunk = 0;
269                 }
270         }
271
272         if (size == 0) {
273                 size = newsize / 2;
274                 if (level == 1)
275                         /* If this is ever reshaped to RAID5, we will
276                          * need a chunksize.  So round it off a bit
277                          * now just to be safe
278                          */
279                         size &= ~(64ULL-1);
280
281                 if (size && verbose > 0)
282                         pr_err("setting size to %lluK\n",
283                                 (unsigned long long)size);
284         }
285
286         /* now look at the subdevs */
287         info.array.active_disks = 0;
288         info.array.working_disks = 0;
289         dnum = 0;
290         for (dv=devlist; dv && !have_container; dv=dv->next, dnum++) {
291                 char *dname = dv->devname;
292                 unsigned long long freesize;
293                 int dfd;
294
295                 if (strcasecmp(dname, "missing")==0) {
296                         if (first_missing > dnum)
297                                 first_missing = dnum;
298                         if (second_missing > dnum && dnum > first_missing)
299                                 second_missing = dnum;
300                         missing_disks ++;
301                         continue;
302                 }
303                 dfd = open(dname, O_RDONLY);
304                 if (dfd < 0) {
305                         pr_err("cannot open %s: %s\n",
306                                 dname, strerror(errno));
307                         exit(2);
308                 }
309                 if (fstat(dfd, &stb) != 0 ||
310                     (stb.st_mode & S_IFMT) != S_IFBLK) {
311                         close(dfd);
312                         pr_err("%s is not a block device\n",
313                                 dname);
314                         exit(2);
315                 }
316                 close(dfd);
317                 info.array.working_disks++;
318                 if (dnum < raiddisks)
319                         info.array.active_disks++;
320                 if (st == NULL) {
321                         struct createinfo *ci = conf_get_create_info();
322                         if (ci)
323                                 st = ci->supertype;
324                 }
325                 if (st == NULL) {
326                         /* Need to choose a default metadata, which is different
327                          * depending on geometry of array.
328                          */
329                         int i;
330                         char *name = "default";
331                         for(i=0; !st && superlist[i]; i++) {
332                                 st = superlist[i]->match_metadata_desc(name);
333                                 if (!st)
334                                         continue;
335                                 if (do_default_layout)
336                                         layout = default_layout(st, level, verbose);
337                                 switch (st->ss->validate_geometry(
338                                                 st, level, layout, raiddisks,
339                                                 &chunk, size*2, dname, &freesize,
340                                                 verbose > 0)) {
341                                 case -1: /* Not valid, message printed, and not
342                                           * worth checking any further */
343                                         exit(2);
344                                         break;
345                                 case 0: /* Geometry not valid */
346                                         free(st);
347                                         st = NULL;
348                                         chunk = do_default_chunk ? UnSet : chunk;
349                                         break;
350                                 case 1: /* All happy */
351                                         break;
352                                 }
353                         }
354
355                         if (!st) {
356                                 int dfd = open(dname, O_RDONLY|O_EXCL);
357                                 if (dfd < 0) {
358                                         pr_err("cannot open %s: %s\n",
359                                                 dname, strerror(errno));
360                                         exit(2);
361                                 }
362                                 pr_err("device %s not suitable "
363                                         "for any style of array\n",
364                                         dname);
365                                 exit(2);
366                         }
367                         if (st->ss != &super0 ||
368                             st->minor_version != 90)
369                                 did_default = 1;
370                 } else {
371                         if (do_default_layout)
372                                 layout = default_layout(st, level, 0);
373                         if (!st->ss->validate_geometry(st, level, layout,
374                                                        raiddisks,
375                                                        &chunk, size*2, dname,
376                                                        &freesize,
377                                                        verbose >= 0)) {
378
379                                 pr_err("%s is not suitable for "
380                                        "this array.\n",
381                                        dname);
382                                 fail = 1;
383                                 continue;
384                         }
385                 }
386
387                 freesize /= 2; /* convert to K */
388                 if (chunk && chunk != UnSet) {
389                         /* round to chunk size */
390                         freesize = freesize & ~(chunk-1);
391                         if (do_default_chunk) {
392                                 /* default chunk was just set */
393                                 if (verbose > 0)
394                                         pr_err("chunk size "
395                                                 "defaults to %dK\n", chunk);
396                                 size &= ~(unsigned long long)(chunk - 1);
397                                 do_default_chunk = 0;
398                         }
399                 }
400
401                 if (size && freesize < size) {
402                         pr_err("%s is smaller than given size."
403                                 " %lluK < %lluK + metadata\n",
404                                 dname, freesize, size);
405                         fail = 1;
406                         continue;
407                 }
408                 if (maxdisc == NULL || (maxdisc && freesize > maxsize)) {
409                         maxdisc = dname;
410                         maxsize = freesize;
411                 }
412                 if (mindisc ==NULL || (mindisc && freesize < minsize)) {
413                         mindisc = dname;
414                         minsize = freesize;
415                 }
416                 if (runstop != 1 || verbose >= 0) {
417                         int fd = open(dname, O_RDONLY);
418                         if (fd <0 ) {
419                                 pr_err("Cannot open %s: %s\n",
420                                         dname, strerror(errno));
421                                 fail=1;
422                                 continue;
423                         }
424                         warn |= check_ext2(fd, dname);
425                         warn |= check_reiser(fd, dname);
426                         warn |= check_raid(fd, dname);
427                         if (strcmp(st->ss->name, "1.x") == 0 &&
428                             st->minor_version >= 1)
429                                 /* metadata at front */
430                                 warn |= check_partitions(fd, dname, 0, 0);
431                         else if (level == 1 || level == LEVEL_CONTAINER
432                                     || (level == 0 && raiddisks == 1))
433                                 /* partitions could be meaningful */
434                                 warn |= check_partitions(fd, dname, freesize*2, size*2);
435                         else
436                                 /* partitions cannot be meaningful */
437                                 warn |= check_partitions(fd, dname, 0, 0);
438                         if (strcmp(st->ss->name, "1.x") == 0 &&
439                             st->minor_version >= 1 &&
440                             did_default &&
441                             level == 1 &&
442                             (warn & 1024) == 0) {
443                                 warn |= 1024;
444                                 pr_err("Note: this array has metadata at the start and\n"
445                                         "    may not be suitable as a boot device.  If you plan to\n"
446                                         "    store '/boot' on this device please ensure that\n"
447                                         "    your boot-loader understands md/v1.x metadata, or use\n"
448                                         "    --metadata=0.90\n");
449                         }
450                         close(fd);
451                 }
452         }
453         if (raiddisks + sparedisks > st->max_devs) {
454                 pr_err("Too many devices:"
455                         " %s metadata only supports %d\n",
456                         st->ss->name, st->max_devs);
457                 return 1;
458         }
459         if (have_container)
460                 info.array.working_disks = raiddisks;
461         if (fail) {
462                 pr_err("create aborted\n");
463                 return 1;
464         }
465         if (size == 0) {
466                 if (mindisc == NULL && !have_container) {
467                         pr_err("no size and no drives given - aborting create.\n");
468                         return 1;
469                 }
470                 if (level > 0 || level == LEVEL_MULTIPATH
471                     || level == LEVEL_FAULTY
472                     || st->ss->external ) {
473                         /* size is meaningful */
474                         if (!st->ss->validate_geometry(st, level, layout,
475                                                        raiddisks,
476                                                        &chunk, minsize*2,
477                                                        NULL, NULL, 0)) {
478                                 pr_err("devices too large for RAID level %d\n", level);
479                                 return 1;
480                         }
481                         size = minsize;
482                         if (level == 1)
483                                 /* If this is ever reshaped to RAID5, we will
484                                  * need a chunksize.  So round it off a bit
485                                  * now just to be safe
486                                  */
487                                 size &= ~(64ULL-1);
488                         if (verbose > 0)
489                                 pr_err("size set to %lluK\n", size);
490                 }
491         }
492         if (!have_container && level > 0 && ((maxsize-size)*100 > maxsize)) {
493                 if (runstop != 1 || verbose >= 0)
494                         pr_err("largest drive (%s) exceeds size (%lluK) by more than 1%%\n",
495                                 maxdisc, size);
496                 warn = 1;
497         }
498
499         if (st->ss->detail_platform && st->ss->detail_platform(0, 1) != 0) {
500                 if (runstop != 1 || verbose >= 0)
501                         pr_err("%s unable to enumerate platform support\n"
502                                 "    array may not be compatible with hardware/firmware\n",
503                                 st->ss->name);
504                 warn = 1;
505         }
506
507         if (warn) {
508                 if (runstop!= 1) {
509                         if (!ask("Continue creating array? ")) {
510                                 pr_err("create aborted.\n");
511                                 return 1;
512                         }
513                 } else {
514                         if (verbose > 0)
515                                 pr_err("creation continuing despite oddities due to --run\n");
516                 }
517         }
518
519         /* If this is raid4/5, we want to configure the last active slot
520          * as missing, so that a reconstruct happens (faster than re-parity)
521          * FIX: Can we do this for raid6 as well?
522          */
523         if (st->ss->external == 0 &&
524             assume_clean==0 && force == 0 && first_missing >= raiddisks) {
525                 switch ( level ) {
526                 case 4:
527                 case 5:
528                         insert_point = raiddisks-1;
529                         sparedisks++;
530                         info.array.active_disks--;
531                         missing_disks++;
532                         break;
533                 default:
534                         break;
535                 }
536         }
537         /* For raid6, if creating with 1 missing drive, make a good drive
538          * into a spare, else the create will fail
539          */
540         if (assume_clean == 0 && force == 0 && first_missing < raiddisks &&
541             st->ss->external == 0 &&
542             second_missing >= raiddisks && level == 6) {
543                 insert_point = raiddisks - 1;
544                 if (insert_point == first_missing)
545                         insert_point--;
546                 sparedisks ++;
547                 info.array.active_disks--;
548                 missing_disks++;
549         }
550
551         if (level <= 0 && first_missing < subdevs * 2) {
552                 pr_err("This level does not support missing devices\n");
553                 return 1;
554         }
555
556         /* We need to create the device */
557         map_lock(&map);
558         mdfd = create_mddev(mddev, name, autof, LOCAL, chosen_name);
559         if (mdfd < 0) {
560                 map_unlock(&map);
561                 return 1;
562         }
563         /* verify if chosen_name is not in use,
564          * it could be in conflict with already existing device
565          * e.g. container, array
566          */
567         if (strncmp(chosen_name, "/dev/md/", 8) == 0
568             && map_by_name(&map, chosen_name+8) != NULL) {
569                 pr_err("Array name %s is in use already.\n",
570                         chosen_name);
571                 close(mdfd);
572                 map_unlock(&map);
573                 return 1;
574         }
575         mddev = chosen_name;
576
577         vers = md_get_version(mdfd);
578         if (vers < 9000) {
579                 pr_err("Create requires md driver version 0.90.0 or later\n");
580                 goto abort_locked;
581         } else {
582                 mdu_array_info_t inf;
583                 memset(&inf, 0, sizeof(inf));
584                 ioctl(mdfd, GET_ARRAY_INFO, &inf);
585                 if (inf.working_disks != 0) {
586                         pr_err("another array by this name"
587                                 " is already running.\n");
588                         goto abort_locked;
589                 }
590         }
591
592         /* Ok, lets try some ioctls */
593
594         info.array.level = level;
595         info.array.size = size;
596         info.array.raid_disks = raiddisks;
597         /* The kernel should *know* what md_minor we are dealing
598          * with, but it chooses to trust me instead. Sigh
599          */
600         info.array.md_minor = 0;
601         if (fstat(mdfd, &stb)==0)
602                 info.array.md_minor = minor(stb.st_rdev);
603         info.array.not_persistent = 0;
604
605         if ( ( (level == 4 || level == 5) &&
606                (insert_point < raiddisks || first_missing < raiddisks) )
607              ||
608              ( level == 6 && (insert_point < raiddisks
609                               || second_missing < raiddisks))
610              ||
611              ( level <= 0 )
612              ||
613              assume_clean
614                 ) {
615                 info.array.state = 1; /* clean, but one+ drive will be missing*/
616                 info.resync_start = MaxSector;
617         } else {
618                 info.array.state = 0; /* not clean, but no errors */
619                 info.resync_start = 0;
620         }
621         if (level == 10) {
622                 /* for raid10, the bitmap size is the capacity of the array,
623                  * which is array.size * raid_disks / ncopies;
624                  * .. but convert to sectors.
625                  */
626                 int ncopies = ((layout>>8) & 255) * (layout & 255);
627                 bitmapsize = (unsigned long long)size * raiddisks / ncopies * 2;
628 /*              printf("bms=%llu as=%d rd=%d nc=%d\n", bitmapsize, size, raiddisks, ncopies);*/
629         } else
630                 bitmapsize = (unsigned long long)size * 2;
631
632         /* There is lots of redundancy in these disk counts,
633          * raid_disks is the most meaningful value
634          *          it describes the geometry of the array
635          *          it is constant
636          * nr_disks is total number of used slots.
637          *          it should be raid_disks+spare_disks
638          * spare_disks is the number of extra disks present
639          *          see above
640          * active_disks is the number of working disks in
641          *          active slots. (With raid_disks)
642          * working_disks is the total number of working disks,
643          *          including spares
644          * failed_disks is the number of disks marked failed
645          *
646          * Ideally, the kernel would keep these (except raid_disks)
647          * up-to-date as we ADD_NEW_DISK, but it doesn't (yet).
648          * So for now, we assume that all raid and spare
649          * devices will be given.
650          */
651         info.array.spare_disks=sparedisks;
652         info.array.failed_disks=missing_disks;
653         info.array.nr_disks = info.array.working_disks
654                 + info.array.failed_disks;
655         info.array.layout = layout;
656         info.array.chunk_size = chunk*1024;
657
658         if (name == NULL || *name == 0) {
659                 /* base name on mddev */
660                 /*  /dev/md0 -> 0
661                  *  /dev/md_d0 -> d0
662                  *  /dev/md/1 -> 1
663                  *  /dev/md/d1 -> d1
664                  *  /dev/md/home -> home
665                  *  /dev/mdhome -> home
666                  */
667                 /* FIXME compare this with rules in create_mddev */
668                 name = strrchr(mddev, '/');
669                 if (name) {
670                         name++;
671                         if (strncmp(name, "md_d", 4)==0 &&
672                             strlen(name) > 4 &&
673                             isdigit(name[4]) &&
674                             (name-mddev) == 5 /* /dev/ */)
675                                 name += 3;
676                         else if (strncmp(name, "md", 2)==0 &&
677                                  strlen(name) > 2 &&
678                                  isdigit(name[2]) &&
679                                  (name-mddev) == 5 /* /dev/ */)
680                                 name += 2;
681                 }
682         }
683         if (!st->ss->init_super(st, &info.array, size, name, homehost, uuid))
684                 goto abort_locked;
685
686         total_slots = info.array.nr_disks;
687         st->ss->getinfo_super(st, &info, NULL);
688         sysfs_init(&info, mdfd, 0);
689
690         if (did_default && verbose >= 0) {
691                 if (is_subarray(info.text_version)) {
692                         int dnum = devname2devnum(info.text_version+1);
693                         char *path;
694                         int mdp = get_mdp_major();
695                         struct mdinfo *mdi;
696                         if (dnum > 0)
697                                 path = map_dev(MD_MAJOR, dnum, 1);
698                         else
699                                 path = map_dev(mdp, (-1-dnum)<< 6, 1);
700
701                         mdi = sysfs_read(-1, dnum, GET_VERSION);
702
703                         pr_err("Creating array inside "
704                                 "%s container %s\n", 
705                                 mdi?mdi->text_version:"managed", path);
706                         sysfs_free(mdi);
707                 } else
708                         pr_err("Defaulting to version"
709                                 " %s metadata\n", info.text_version);
710         }
711
712         map_update(&map, fd2devnum(mdfd), info.text_version,
713                    info.uuid, chosen_name);
714         map_unlock(&map);
715
716         if (bitmap_file && vers < 9003) {
717                 major_num = BITMAP_MAJOR_HOSTENDIAN;
718 #ifdef __BIG_ENDIAN
719                 pr_err("Warning - bitmaps created on this kernel are not portable\n"
720                         "  between different architectured.  Consider upgrading the Linux kernel.\n");
721 #endif
722         }
723
724         if (bitmap_file && strcmp(bitmap_file, "internal")==0) {
725                 if ((vers%100) < 2) {
726                         pr_err("internal bitmaps not supported by this kernel.\n");
727                         goto abort;
728                 }
729                 if (!st->ss->add_internal_bitmap) {
730                         pr_err("internal bitmaps not supported with %s metadata\n",
731                                 st->ss->name);
732                         goto abort;
733                 }
734                 if (!st->ss->add_internal_bitmap(st, &bitmap_chunk,
735                                                  delay, write_behind,
736                                                  bitmapsize, 1, major_num)) {
737                         pr_err("Given bitmap chunk size not supported.\n");
738                         goto abort;
739                 }
740                 bitmap_file = NULL;
741         }
742
743
744         sysfs_init(&info, mdfd, 0);
745
746         if (st->ss->external && st->container_dev != NoMdDev) {
747                 /* member */
748
749                 /* When creating a member, we need to be careful
750                  * to negotiate with mdmon properly.
751                  * If it is already running, we cannot write to
752                  * the devices and must ask it to do that part.
753                  * If it isn't running, we write to the devices,
754                  * and then start it.
755                  * We hold an exclusive open on the container
756                  * device to make sure mdmon doesn't exit after
757                  * we checked that it is running.
758                  *
759                  * If mdmon is already running it is reused.
760                  */
761                 container_fd = open_dev_excl(st->container_dev);
762                 if (container_fd < 0) {
763                         pr_err("Cannot get exclusive "
764                                 "open on container - weird.\n");
765                         goto abort;
766                 }
767                 if (mdmon_running(st->container_dev)) {
768                         if (verbose)
769                                 pr_err("reusing mdmon "
770                                         "for %s.\n",
771                                         devnum2devname(st->container_dev));
772                         st->update_tail = &st->updates;
773                 } else
774                         need_mdmon = 1;
775         }
776         rv = set_array_info(mdfd, st, &info);
777         if (rv) {
778                 pr_err("failed to set array info for %s: %s\n",
779                         mddev, strerror(errno));
780                 goto abort;
781         }
782
783         if (bitmap_file) {
784                 int uuid[4];
785
786                 st->ss->uuid_from_super(st, uuid);
787                 if (CreateBitmap(bitmap_file, force, (char*)uuid, bitmap_chunk,
788                                  delay, write_behind,
789                                  bitmapsize,
790                                  major_num)) {
791                         goto abort;
792                 }
793                 bitmap_fd = open(bitmap_file, O_RDWR);
794                 if (bitmap_fd < 0) {
795                         pr_err("weird: %s cannot be openned\n",
796                                 bitmap_file);
797                         goto abort;
798                 }
799                 if (ioctl(mdfd, SET_BITMAP_FILE, bitmap_fd) < 0) {
800                         pr_err("Cannot set bitmap file for %s: %s\n",
801                                 mddev, strerror(errno));
802                         goto abort;
803                 }
804         }
805
806         infos = xmalloc(sizeof(*infos) * total_slots);
807
808         for (pass=1; pass <=2 ; pass++) {
809                 struct mddev_dev *moved_disk = NULL; /* the disk that was moved out of the insert point */
810
811                 for (dnum=0, dv = devlist ; dv ;
812                      dv=(dv->next)?(dv->next):moved_disk, dnum++) {
813                         int fd;
814                         struct stat stb;
815                         struct mdinfo *inf = &infos[dnum];
816
817                         if (dnum >= total_slots)
818                                 abort();
819                         if (dnum == insert_point) {
820                                 moved_disk = dv;
821                                 continue;
822                         }
823                         if (strcasecmp(dv->devname, "missing")==0)
824                                 continue;
825                         if (have_container)
826                                 moved_disk = NULL;
827                         if (have_container && dnum < info.array.raid_disks - 1)
828                                 /* repeatedly use the container */
829                                 moved_disk = dv;
830
831                         switch(pass) {
832                         case 1:
833                                 *inf = info;
834
835                                 inf->disk.number = dnum;
836                                 inf->disk.raid_disk = dnum;
837                                 if (inf->disk.raid_disk < raiddisks)
838                                         inf->disk.state = (1<<MD_DISK_ACTIVE) |
839                                                 (1<<MD_DISK_SYNC);
840                                 else
841                                         inf->disk.state = 0;
842
843                                 if (dv->writemostly == 1)
844                                         inf->disk.state |= (1<<MD_DISK_WRITEMOSTLY);
845
846                                 if (have_container)
847                                         fd = -1;
848                                 else {
849                                         if (st->ss->external &&
850                                             st->container_dev != NoMdDev)
851                                                 fd = open(dv->devname, O_RDWR);
852                                         else
853                                                 fd = open(dv->devname, O_RDWR|O_EXCL);
854
855                                         if (fd < 0) {
856                                                 pr_err("failed to open %s "
857                                                         "after earlier success - aborting\n",
858                                                         dv->devname);
859                                                 goto abort;
860                                         }
861                                         fstat(fd, &stb);
862                                         inf->disk.major = major(stb.st_rdev);
863                                         inf->disk.minor = minor(stb.st_rdev);
864                                 }
865                                 if (fd >= 0)
866                                         remove_partitions(fd);
867                                 if (st->ss->add_to_super(st, &inf->disk,
868                                                          fd, dv->devname)) {
869                                         ioctl(mdfd, STOP_ARRAY, NULL);
870                                         goto abort;
871                                 }
872                                 st->ss->getinfo_super(st, inf, NULL);
873                                 safe_mode_delay = inf->safe_mode_delay;
874
875                                 if (have_container && verbose > 0)
876                                         pr_err("Using %s for device %d\n",
877                                                 map_dev(inf->disk.major,
878                                                         inf->disk.minor,
879                                                         0), dnum);
880
881                                 if (!have_container) {
882                                         /* getinfo_super might have lost these ... */
883                                         inf->disk.major = major(stb.st_rdev);
884                                         inf->disk.minor = minor(stb.st_rdev);
885                                 }
886                                 break;
887                         case 2:
888                                 inf->errors = 0;
889
890                                 rv = add_disk(mdfd, st, &info, inf);
891
892                                 if (rv) {
893                                         pr_err("ADD_NEW_DISK for %s "
894                                                "failed: %s\n",
895                                                dv->devname, strerror(errno));
896                                         goto abort;
897                                 }
898                                 break;
899                         }
900                         if (!have_container &&
901                             dv == moved_disk && dnum != insert_point) break;
902                 }
903                 if (pass == 1) {
904                         struct mdinfo info_new;
905                         struct map_ent *me = NULL;
906
907                         /* check to see if the uuid has changed due to these
908                          * metadata changes, and if so update the member array
909                          * and container uuid.  Note ->write_init_super clears
910                          * the subarray cursor such that ->getinfo_super once
911                          * again returns container info.
912                          */
913                         map_lock(&map);
914                         st->ss->getinfo_super(st, &info_new, NULL);
915                         if (st->ss->external && level != LEVEL_CONTAINER &&
916                             !same_uuid(info_new.uuid, info.uuid, 0)) {
917                                 map_update(&map, fd2devnum(mdfd),
918                                            info_new.text_version,
919                                            info_new.uuid, chosen_name);
920                                 me = map_by_devnum(&map, st->container_dev);
921                         }
922
923                         if (st->ss->write_init_super(st)) {
924                                 st->ss->free_super(st);
925                                 goto abort_locked;
926                         }
927
928                         /* update parent container uuid */
929                         if (me) {
930                                 char *path = xstrdup(me->path);
931
932                                 st->ss->getinfo_super(st, &info_new, NULL);
933                                 map_update(&map, st->container_dev,
934                                            info_new.text_version,
935                                            info_new.uuid, path);
936                                 free(path);
937                         }
938                         map_unlock(&map);
939
940                         flush_metadata_updates(st);
941                         st->ss->free_super(st);
942                 }
943         }
944         free(infos);
945
946         if (level == LEVEL_CONTAINER) {
947                 /* No need to start.  But we should signal udev to
948                  * create links */
949                 sysfs_uevent(&info, "change");
950                 if (verbose >= 0)
951                         pr_err("container %s prepared.\n", mddev);
952                 wait_for(chosen_name, mdfd);
953         } else if (runstop == 1 || subdevs >= raiddisks) {
954                 if (st->ss->external) {
955                         int err;
956                         switch(level) {
957                         case LEVEL_LINEAR:
958                         case LEVEL_MULTIPATH:
959                         case 0:
960                                 err = sysfs_set_str(&info, NULL, "array_state",
961                                                     "active");
962                                 need_mdmon = 0;
963                                 break;
964                         default:
965                                 err = sysfs_set_str(&info, NULL, "array_state",
966                                                     "readonly");
967                                 break;
968                         }
969                         sysfs_set_safemode(&info, safe_mode_delay);
970                         if (err) {
971                                 pr_err("failed to"
972                                        " activate array.\n");
973                                 ioctl(mdfd, STOP_ARRAY, NULL);
974                                 goto abort;
975                         }
976                 } else {
977                         /* param is not actually used */
978                         mdu_param_t param;
979                         if (ioctl(mdfd, RUN_ARRAY, &param)) {
980                                 pr_err("RUN_ARRAY failed: %s\n",
981                                         strerror(errno));
982                                 if (info.array.chunk_size & (info.array.chunk_size-1)) {
983                                         cont_err("Problem may be that "
984                                                  "chunk size is not a power of 2\n");
985                                 }
986                                 ioctl(mdfd, STOP_ARRAY, NULL);
987                                 goto abort;
988                         }
989                 }
990                 if (verbose >= 0)
991                         pr_err("array %s started.\n", mddev);
992                 if (st->ss->external && st->container_dev != NoMdDev) {
993                         if (need_mdmon)
994                                 start_mdmon(st->container_dev);
995
996                         ping_monitor_by_id(st->container_dev);
997                         close(container_fd);
998                 }
999                 wait_for(chosen_name, mdfd);
1000         } else {
1001                 pr_err("not starting array - not enough devices.\n");
1002         }
1003         close(mdfd);
1004         return 0;
1005
1006  abort:
1007         map_lock(&map);
1008  abort_locked:
1009         map_remove(&map, fd2devnum(mdfd));
1010         map_unlock(&map);
1011
1012         if (mdfd >= 0)
1013                 close(mdfd);
1014         return 1;
1015 }