Create: support --readonly flag.
[thirdparty/mdadm.git] / Create.c
1 /*
2  * mdadm - manage Linux "md" devices aka RAID arrays.
3  *
4  * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
5  *
6  *
7  *    This program is free software; you can redistribute it and/or modify
8  *    it under the terms of the GNU General Public License as published by
9  *    the Free Software Foundation; either version 2 of the License, or
10  *    (at your option) any later version.
11  *
12  *    This program is distributed in the hope that it will be useful,
13  *    but WITHOUT ANY WARRANTY; without even the implied warranty of
14  *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  *    GNU General Public License for more details.
16  *
17  *    You should have received a copy of the GNU General Public License
18  *    along with this program; if not, write to the Free Software
19  *    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
20  *
21  *    Author: Neil Brown
22  *    Email: <neilb@suse.de>
23  */
24
25 #include "mdadm.h"
26 #include        "md_u.h"
27 #include        "md_p.h"
28 #include        <ctype.h>
29
30 static int default_layout(struct supertype *st, int level, int verbose)
31 {
32         int layout = UnSet;
33
34         if (st && st->ss->default_geometry)
35                 st->ss->default_geometry(st, &level, &layout, NULL);
36
37         if (layout == UnSet)
38                 switch(level) {
39                 default: /* no layout */
40                         layout = 0;
41                         break;
42                 case 10:
43                         layout = 0x102; /* near=2, far=1 */
44                         if (verbose > 0)
45                                 pr_err("layout defaults to n2\n");
46                         break;
47                 case 5:
48                 case 6:
49                         layout = map_name(r5layout, "default");
50                         if (verbose > 0)
51                                 pr_err("layout defaults to %s\n", map_num(r5layout, layout));
52                         break;
53                 case LEVEL_FAULTY:
54                         layout = map_name(faultylayout, "default");
55
56                         if (verbose > 0)
57                                 pr_err("layout defaults to %s\n", map_num(faultylayout, layout));
58                         break;
59                 }
60
61         return layout;
62 }
63
64
65 int Create(struct supertype *st, char *mddev,
66            int chunk, int level, int layout, unsigned long long size,
67            int raiddisks, int sparedisks,
68            char *name, char *homehost, int *uuid,
69            int subdevs, struct mddev_dev *devlist,
70            int runstop, int readonly, int verbose,
71            int force, int assume_clean,
72            char *bitmap_file, int bitmap_chunk, int write_behind,
73            int delay, int autof)
74 {
75         /*
76          * Create a new raid array.
77          *
78          * First check that necessary details are available
79          * (i.e. level, raid-disks)
80          *
81          * Then check each disk to see what might be on it
82          * and report anything interesting.
83          *
84          * If anything looks odd, and runstop not set,
85          * abort.
86          *
87          * SET_ARRAY_INFO and ADD_NEW_DISK, and
88          * if runstop==run, or raiddisks disks were used,
89          * RUN_ARRAY
90          */
91         int mdfd;
92         unsigned long long minsize=0, maxsize=0;
93         char *mindisc = NULL;
94         char *maxdisc = NULL;
95         int dnum;
96         struct mddev_dev *dv;
97         int fail=0, warn=0;
98         struct stat stb;
99         int first_missing = subdevs * 2;
100         int second_missing = subdevs * 2;
101         int missing_disks = 0;
102         int insert_point = subdevs * 2; /* where to insert a missing drive */
103         int total_slots;
104         int pass;
105         int vers;
106         int rv;
107         int bitmap_fd;
108         int have_container = 0;
109         int container_fd = -1;
110         int need_mdmon = 0;
111         unsigned long long bitmapsize;
112         struct mdinfo info, *infos;
113         int did_default = 0;
114         int do_default_layout = 0;
115         int do_default_chunk = 0;
116         unsigned long safe_mode_delay = 0;
117         char chosen_name[1024];
118         struct map_ent *map = NULL;
119         unsigned long long newsize;
120
121         int major_num = BITMAP_MAJOR_HI;
122
123         memset(&info, 0, sizeof(info));
124         if (level == UnSet && st && st->ss->default_geometry)
125                 st->ss->default_geometry(st, &level, NULL, NULL);
126         if (level == UnSet) {
127                 pr_err("a RAID level is needed to create an array.\n");
128                 return 1;
129         }
130         if (raiddisks < 4 && level == 6) {
131                 pr_err("at least 4 raid-devices needed for level 6\n");
132                 return 1;
133         }
134         if (raiddisks > 256 && level == 6) {
135                 pr_err("no more than 256 raid-devices supported for level 6\n");
136                 return 1;
137         }
138         if (raiddisks < 2 && level >= 4) {
139                 pr_err("at least 2 raid-devices needed for level 4 or 5\n");
140                 return 1;
141         }
142         if (level <= 0 && sparedisks) {
143                 pr_err("This level does not support spare devices\n");
144                 return 1;
145         }
146
147         if (subdevs == 1 && strcmp(devlist->devname, "missing") != 0) {
148                 /* If given a single device, it might be a container, and we can
149                  * extract a device list from there
150                  */
151                 mdu_array_info_t inf;
152                 int fd;
153
154                 memset(&inf, 0, sizeof(inf));
155                 fd = open(devlist->devname, O_RDONLY);
156                 if (fd >= 0 &&
157                     ioctl(fd, GET_ARRAY_INFO, &inf) == 0 &&
158                     inf.raid_disks == 0) {
159                         /* yep, looks like a container */
160                         if (st) {
161                                 rv = st->ss->load_container(st, fd,
162                                                             devlist->devname);
163                                 if (rv == 0)
164                                         have_container = 1;
165                         } else {
166                                 st = super_by_fd(fd, NULL);
167                                 if (st && !(rv = st->ss->
168                                             load_container(st, fd,
169                                                            devlist->devname)))
170                                         have_container = 1;
171                                 else
172                                         st = NULL;
173                         }
174                         if (have_container) {
175                                 subdevs = raiddisks;
176                                 first_missing = subdevs * 2;
177                                 second_missing = subdevs * 2;
178                                 insert_point = subdevs * 2;
179                         }
180                 }
181                 if (fd >= 0)
182                         close(fd);
183         }
184         if (st && st->ss->external && sparedisks) {
185                 pr_err("This metadata type does not support "
186                        "spare disks at create time\n");
187                 return 1;
188         }
189         if (subdevs > raiddisks+sparedisks) {
190                 pr_err("You have listed more devices (%d) than are in the array(%d)!\n", subdevs, raiddisks+sparedisks);
191                 return 1;
192         }
193         if (!have_container && subdevs < raiddisks+sparedisks) {
194                 pr_err("You haven't given enough devices (real or missing) to create this array\n");
195                 return 1;
196         }
197         if (bitmap_file && level <= 0) {
198                 pr_err("bitmaps not meaningful with level %s\n",
199                         map_num(pers, level)?:"given");
200                 return 1;
201         }
202
203         /* now set some defaults */
204
205
206         if (layout == UnSet) {
207                 do_default_layout = 1;
208                 layout = default_layout(st, level, verbose);
209         }
210
211         if (level == 10)
212                 /* check layout fits in array*/
213                 if ((layout&255) * ((layout>>8)&255) > raiddisks) {
214                         pr_err("that layout requires at least %d devices\n",
215                                 (layout&255) * ((layout>>8)&255));
216                         return 1;
217                 }
218
219         switch(level) {
220         case 4:
221         case 5:
222         case 10:
223         case 6:
224         case 0:
225                 if (chunk == 0 || chunk == UnSet) {
226                         chunk = UnSet;
227                         do_default_chunk = 1;
228                         /* chunk will be set later */
229                 }
230                 break;
231         case LEVEL_LINEAR:
232                 /* a chunksize of zero is perfectly valid (and preferred) since 2.6.16 */
233                 if (get_linux_version() < 2006016 && chunk == 0) {
234                         chunk = 64;
235                         if (verbose > 0)
236                                 pr_err("chunk size defaults to 64K\n");
237                 }
238                 break;
239         case 1:
240         case LEVEL_FAULTY:
241         case LEVEL_MULTIPATH:
242         case LEVEL_CONTAINER:
243                 if (chunk) {
244                         chunk = 0;
245                         if (verbose > 0)
246                                 pr_err("chunk size ignored for this level\n");
247                 }
248                 break;
249         default:
250                 pr_err("unknown level %d\n", level);
251                 return 1;
252         }
253         
254         if (size && chunk && chunk != UnSet)
255                 size &= ~(unsigned long long)(chunk - 1);
256         newsize = size * 2;
257         if (st && ! st->ss->validate_geometry(st, level, layout, raiddisks,
258                                               &chunk, size*2, NULL, &newsize, verbose>=0))
259                 return 1;
260
261         if (chunk && chunk != UnSet) {
262                 newsize &= ~(unsigned long long)(chunk*2 - 1);
263                 if (do_default_chunk) {
264                         /* default chunk was just set */
265                         if (verbose > 0)
266                                 pr_err("chunk size "
267                                         "defaults to %dK\n", chunk);
268                         size &= ~(unsigned long long)(chunk - 1);
269                         do_default_chunk = 0;
270                 }
271         }
272
273         if (size == 0) {
274                 size = newsize / 2;
275                 if (level == 1)
276                         /* If this is ever reshaped to RAID5, we will
277                          * need a chunksize.  So round it off a bit
278                          * now just to be safe
279                          */
280                         size &= ~(64ULL-1);
281
282                 if (size && verbose > 0)
283                         pr_err("setting size to %lluK\n",
284                                 (unsigned long long)size);
285         }
286
287         /* now look at the subdevs */
288         info.array.active_disks = 0;
289         info.array.working_disks = 0;
290         dnum = 0;
291         for (dv=devlist; dv && !have_container; dv=dv->next, dnum++) {
292                 char *dname = dv->devname;
293                 unsigned long long freesize;
294                 int dfd;
295
296                 if (strcasecmp(dname, "missing")==0) {
297                         if (first_missing > dnum)
298                                 first_missing = dnum;
299                         if (second_missing > dnum && dnum > first_missing)
300                                 second_missing = dnum;
301                         missing_disks ++;
302                         continue;
303                 }
304                 dfd = open(dname, O_RDONLY);
305                 if (dfd < 0) {
306                         pr_err("cannot open %s: %s\n",
307                                 dname, strerror(errno));
308                         exit(2);
309                 }
310                 if (fstat(dfd, &stb) != 0 ||
311                     (stb.st_mode & S_IFMT) != S_IFBLK) {
312                         close(dfd);
313                         pr_err("%s is not a block device\n",
314                                 dname);
315                         exit(2);
316                 }
317                 close(dfd);
318                 info.array.working_disks++;
319                 if (dnum < raiddisks)
320                         info.array.active_disks++;
321                 if (st == NULL) {
322                         struct createinfo *ci = conf_get_create_info();
323                         if (ci)
324                                 st = ci->supertype;
325                 }
326                 if (st == NULL) {
327                         /* Need to choose a default metadata, which is different
328                          * depending on geometry of array.
329                          */
330                         int i;
331                         char *name = "default";
332                         for(i=0; !st && superlist[i]; i++) {
333                                 st = superlist[i]->match_metadata_desc(name);
334                                 if (!st)
335                                         continue;
336                                 if (do_default_layout)
337                                         layout = default_layout(st, level, verbose);
338                                 switch (st->ss->validate_geometry(
339                                                 st, level, layout, raiddisks,
340                                                 &chunk, size*2, dname, &freesize,
341                                                 verbose > 0)) {
342                                 case -1: /* Not valid, message printed, and not
343                                           * worth checking any further */
344                                         exit(2);
345                                         break;
346                                 case 0: /* Geometry not valid */
347                                         free(st);
348                                         st = NULL;
349                                         chunk = do_default_chunk ? UnSet : chunk;
350                                         break;
351                                 case 1: /* All happy */
352                                         break;
353                                 }
354                         }
355
356                         if (!st) {
357                                 int dfd = open(dname, O_RDONLY|O_EXCL);
358                                 if (dfd < 0) {
359                                         pr_err("cannot open %s: %s\n",
360                                                 dname, strerror(errno));
361                                         exit(2);
362                                 }
363                                 pr_err("device %s not suitable "
364                                         "for any style of array\n",
365                                         dname);
366                                 exit(2);
367                         }
368                         if (st->ss != &super0 ||
369                             st->minor_version != 90)
370                                 did_default = 1;
371                 } else {
372                         if (do_default_layout)
373                                 layout = default_layout(st, level, 0);
374                         if (!st->ss->validate_geometry(st, level, layout,
375                                                        raiddisks,
376                                                        &chunk, size*2, dname,
377                                                        &freesize,
378                                                        verbose >= 0)) {
379
380                                 pr_err("%s is not suitable for "
381                                        "this array.\n",
382                                        dname);
383                                 fail = 1;
384                                 continue;
385                         }
386                 }
387
388                 freesize /= 2; /* convert to K */
389                 if (chunk && chunk != UnSet) {
390                         /* round to chunk size */
391                         freesize = freesize & ~(chunk-1);
392                         if (do_default_chunk) {
393                                 /* default chunk was just set */
394                                 if (verbose > 0)
395                                         pr_err("chunk size "
396                                                 "defaults to %dK\n", chunk);
397                                 size &= ~(unsigned long long)(chunk - 1);
398                                 do_default_chunk = 0;
399                         }
400                 }
401
402                 if (size && freesize < size) {
403                         pr_err("%s is smaller than given size."
404                                 " %lluK < %lluK + metadata\n",
405                                 dname, freesize, size);
406                         fail = 1;
407                         continue;
408                 }
409                 if (maxdisc == NULL || (maxdisc && freesize > maxsize)) {
410                         maxdisc = dname;
411                         maxsize = freesize;
412                 }
413                 if (mindisc ==NULL || (mindisc && freesize < minsize)) {
414                         mindisc = dname;
415                         minsize = freesize;
416                 }
417                 if (runstop != 1 || verbose >= 0) {
418                         int fd = open(dname, O_RDONLY);
419                         if (fd <0 ) {
420                                 pr_err("Cannot open %s: %s\n",
421                                         dname, strerror(errno));
422                                 fail=1;
423                                 continue;
424                         }
425                         warn |= check_ext2(fd, dname);
426                         warn |= check_reiser(fd, dname);
427                         warn |= check_raid(fd, dname);
428                         if (strcmp(st->ss->name, "1.x") == 0 &&
429                             st->minor_version >= 1)
430                                 /* metadata at front */
431                                 warn |= check_partitions(fd, dname, 0, 0);
432                         else if (level == 1 || level == LEVEL_CONTAINER
433                                     || (level == 0 && raiddisks == 1))
434                                 /* partitions could be meaningful */
435                                 warn |= check_partitions(fd, dname, freesize*2, size*2);
436                         else
437                                 /* partitions cannot be meaningful */
438                                 warn |= check_partitions(fd, dname, 0, 0);
439                         if (strcmp(st->ss->name, "1.x") == 0 &&
440                             st->minor_version >= 1 &&
441                             did_default &&
442                             level == 1 &&
443                             (warn & 1024) == 0) {
444                                 warn |= 1024;
445                                 pr_err("Note: this array has metadata at the start and\n"
446                                         "    may not be suitable as a boot device.  If you plan to\n"
447                                         "    store '/boot' on this device please ensure that\n"
448                                         "    your boot-loader understands md/v1.x metadata, or use\n"
449                                         "    --metadata=0.90\n");
450                         }
451                         close(fd);
452                 }
453         }
454         if (raiddisks + sparedisks > st->max_devs) {
455                 pr_err("Too many devices:"
456                         " %s metadata only supports %d\n",
457                         st->ss->name, st->max_devs);
458                 return 1;
459         }
460         if (have_container)
461                 info.array.working_disks = raiddisks;
462         if (fail) {
463                 pr_err("create aborted\n");
464                 return 1;
465         }
466         if (size == 0) {
467                 if (mindisc == NULL && !have_container) {
468                         pr_err("no size and no drives given - aborting create.\n");
469                         return 1;
470                 }
471                 if (level > 0 || level == LEVEL_MULTIPATH
472                     || level == LEVEL_FAULTY
473                     || st->ss->external ) {
474                         /* size is meaningful */
475                         if (!st->ss->validate_geometry(st, level, layout,
476                                                        raiddisks,
477                                                        &chunk, minsize*2,
478                                                        NULL, NULL, 0)) {
479                                 pr_err("devices too large for RAID level %d\n", level);
480                                 return 1;
481                         }
482                         size = minsize;
483                         if (level == 1)
484                                 /* If this is ever reshaped to RAID5, we will
485                                  * need a chunksize.  So round it off a bit
486                                  * now just to be safe
487                                  */
488                                 size &= ~(64ULL-1);
489                         if (verbose > 0)
490                                 pr_err("size set to %lluK\n", size);
491                 }
492         }
493         if (!have_container && level > 0 && ((maxsize-size)*100 > maxsize)) {
494                 if (runstop != 1 || verbose >= 0)
495                         pr_err("largest drive (%s) exceeds size (%lluK) by more than 1%%\n",
496                                 maxdisc, size);
497                 warn = 1;
498         }
499
500         if (st->ss->detail_platform && st->ss->detail_platform(0, 1) != 0) {
501                 if (runstop != 1 || verbose >= 0)
502                         pr_err("%s unable to enumerate platform support\n"
503                                 "    array may not be compatible with hardware/firmware\n",
504                                 st->ss->name);
505                 warn = 1;
506         }
507
508         if (warn) {
509                 if (runstop!= 1) {
510                         if (!ask("Continue creating array? ")) {
511                                 pr_err("create aborted.\n");
512                                 return 1;
513                         }
514                 } else {
515                         if (verbose > 0)
516                                 pr_err("creation continuing despite oddities due to --run\n");
517                 }
518         }
519
520         /* If this is raid4/5, we want to configure the last active slot
521          * as missing, so that a reconstruct happens (faster than re-parity)
522          * FIX: Can we do this for raid6 as well?
523          */
524         if (st->ss->external == 0 &&
525             assume_clean==0 && force == 0 && first_missing >= raiddisks) {
526                 switch ( level ) {
527                 case 4:
528                 case 5:
529                         insert_point = raiddisks-1;
530                         sparedisks++;
531                         info.array.active_disks--;
532                         missing_disks++;
533                         break;
534                 default:
535                         break;
536                 }
537         }
538         /* For raid6, if creating with 1 missing drive, make a good drive
539          * into a spare, else the create will fail
540          */
541         if (assume_clean == 0 && force == 0 && first_missing < raiddisks &&
542             st->ss->external == 0 &&
543             second_missing >= raiddisks && level == 6) {
544                 insert_point = raiddisks - 1;
545                 if (insert_point == first_missing)
546                         insert_point--;
547                 sparedisks ++;
548                 info.array.active_disks--;
549                 missing_disks++;
550         }
551
552         if (level <= 0 && first_missing < subdevs * 2) {
553                 pr_err("This level does not support missing devices\n");
554                 return 1;
555         }
556
557         /* We need to create the device */
558         map_lock(&map);
559         mdfd = create_mddev(mddev, name, autof, LOCAL, chosen_name);
560         if (mdfd < 0) {
561                 map_unlock(&map);
562                 return 1;
563         }
564         /* verify if chosen_name is not in use,
565          * it could be in conflict with already existing device
566          * e.g. container, array
567          */
568         if (strncmp(chosen_name, "/dev/md/", 8) == 0
569             && map_by_name(&map, chosen_name+8) != NULL) {
570                 pr_err("Array name %s is in use already.\n",
571                         chosen_name);
572                 close(mdfd);
573                 map_unlock(&map);
574                 return 1;
575         }
576         mddev = chosen_name;
577
578         vers = md_get_version(mdfd);
579         if (vers < 9000) {
580                 pr_err("Create requires md driver version 0.90.0 or later\n");
581                 goto abort_locked;
582         } else {
583                 mdu_array_info_t inf;
584                 memset(&inf, 0, sizeof(inf));
585                 ioctl(mdfd, GET_ARRAY_INFO, &inf);
586                 if (inf.working_disks != 0) {
587                         pr_err("another array by this name"
588                                 " is already running.\n");
589                         goto abort_locked;
590                 }
591         }
592
593         /* Ok, lets try some ioctls */
594
595         info.array.level = level;
596         info.array.size = size;
597         info.array.raid_disks = raiddisks;
598         /* The kernel should *know* what md_minor we are dealing
599          * with, but it chooses to trust me instead. Sigh
600          */
601         info.array.md_minor = 0;
602         if (fstat(mdfd, &stb)==0)
603                 info.array.md_minor = minor(stb.st_rdev);
604         info.array.not_persistent = 0;
605
606         if ( ( (level == 4 || level == 5) &&
607                (insert_point < raiddisks || first_missing < raiddisks) )
608              ||
609              ( level == 6 && (insert_point < raiddisks
610                               || second_missing < raiddisks))
611              ||
612              ( level <= 0 )
613              ||
614              assume_clean
615                 ) {
616                 info.array.state = 1; /* clean, but one+ drive will be missing*/
617                 info.resync_start = MaxSector;
618         } else {
619                 info.array.state = 0; /* not clean, but no errors */
620                 info.resync_start = 0;
621         }
622         if (level == 10) {
623                 /* for raid10, the bitmap size is the capacity of the array,
624                  * which is array.size * raid_disks / ncopies;
625                  * .. but convert to sectors.
626                  */
627                 int ncopies = ((layout>>8) & 255) * (layout & 255);
628                 bitmapsize = (unsigned long long)size * raiddisks / ncopies * 2;
629 /*              printf("bms=%llu as=%d rd=%d nc=%d\n", bitmapsize, size, raiddisks, ncopies);*/
630         } else
631                 bitmapsize = (unsigned long long)size * 2;
632
633         /* There is lots of redundancy in these disk counts,
634          * raid_disks is the most meaningful value
635          *          it describes the geometry of the array
636          *          it is constant
637          * nr_disks is total number of used slots.
638          *          it should be raid_disks+spare_disks
639          * spare_disks is the number of extra disks present
640          *          see above
641          * active_disks is the number of working disks in
642          *          active slots. (With raid_disks)
643          * working_disks is the total number of working disks,
644          *          including spares
645          * failed_disks is the number of disks marked failed
646          *
647          * Ideally, the kernel would keep these (except raid_disks)
648          * up-to-date as we ADD_NEW_DISK, but it doesn't (yet).
649          * So for now, we assume that all raid and spare
650          * devices will be given.
651          */
652         info.array.spare_disks=sparedisks;
653         info.array.failed_disks=missing_disks;
654         info.array.nr_disks = info.array.working_disks
655                 + info.array.failed_disks;
656         info.array.layout = layout;
657         info.array.chunk_size = chunk*1024;
658
659         if (name == NULL || *name == 0) {
660                 /* base name on mddev */
661                 /*  /dev/md0 -> 0
662                  *  /dev/md_d0 -> d0
663                  *  /dev/md/1 -> 1
664                  *  /dev/md/d1 -> d1
665                  *  /dev/md/home -> home
666                  *  /dev/mdhome -> home
667                  */
668                 /* FIXME compare this with rules in create_mddev */
669                 name = strrchr(mddev, '/');
670                 if (name) {
671                         name++;
672                         if (strncmp(name, "md_d", 4)==0 &&
673                             strlen(name) > 4 &&
674                             isdigit(name[4]) &&
675                             (name-mddev) == 5 /* /dev/ */)
676                                 name += 3;
677                         else if (strncmp(name, "md", 2)==0 &&
678                                  strlen(name) > 2 &&
679                                  isdigit(name[2]) &&
680                                  (name-mddev) == 5 /* /dev/ */)
681                                 name += 2;
682                 }
683         }
684         if (!st->ss->init_super(st, &info.array, size, name, homehost, uuid))
685                 goto abort_locked;
686
687         total_slots = info.array.nr_disks;
688         st->ss->getinfo_super(st, &info, NULL);
689         sysfs_init(&info, mdfd, 0);
690
691         if (did_default && verbose >= 0) {
692                 if (is_subarray(info.text_version)) {
693                         int dnum = devname2devnum(info.text_version+1);
694                         char *path;
695                         int mdp = get_mdp_major();
696                         struct mdinfo *mdi;
697                         if (dnum > 0)
698                                 path = map_dev(MD_MAJOR, dnum, 1);
699                         else
700                                 path = map_dev(mdp, (-1-dnum)<< 6, 1);
701
702                         mdi = sysfs_read(-1, dnum, GET_VERSION);
703
704                         pr_err("Creating array inside "
705                                 "%s container %s\n", 
706                                 mdi?mdi->text_version:"managed", path);
707                         sysfs_free(mdi);
708                 } else
709                         pr_err("Defaulting to version"
710                                 " %s metadata\n", info.text_version);
711         }
712
713         map_update(&map, fd2devnum(mdfd), info.text_version,
714                    info.uuid, chosen_name);
715         map_unlock(&map);
716
717         if (bitmap_file && vers < 9003) {
718                 major_num = BITMAP_MAJOR_HOSTENDIAN;
719 #ifdef __BIG_ENDIAN
720                 pr_err("Warning - bitmaps created on this kernel are not portable\n"
721                         "  between different architectured.  Consider upgrading the Linux kernel.\n");
722 #endif
723         }
724
725         if (bitmap_file && strcmp(bitmap_file, "internal")==0) {
726                 if ((vers%100) < 2) {
727                         pr_err("internal bitmaps not supported by this kernel.\n");
728                         goto abort;
729                 }
730                 if (!st->ss->add_internal_bitmap) {
731                         pr_err("internal bitmaps not supported with %s metadata\n",
732                                 st->ss->name);
733                         goto abort;
734                 }
735                 if (!st->ss->add_internal_bitmap(st, &bitmap_chunk,
736                                                  delay, write_behind,
737                                                  bitmapsize, 1, major_num)) {
738                         pr_err("Given bitmap chunk size not supported.\n");
739                         goto abort;
740                 }
741                 bitmap_file = NULL;
742         }
743
744
745         sysfs_init(&info, mdfd, 0);
746
747         if (st->ss->external && st->container_dev != NoMdDev) {
748                 /* member */
749
750                 /* When creating a member, we need to be careful
751                  * to negotiate with mdmon properly.
752                  * If it is already running, we cannot write to
753                  * the devices and must ask it to do that part.
754                  * If it isn't running, we write to the devices,
755                  * and then start it.
756                  * We hold an exclusive open on the container
757                  * device to make sure mdmon doesn't exit after
758                  * we checked that it is running.
759                  *
760                  * For now, fail if it is already running.
761                  */
762                 container_fd = open_dev_excl(st->container_dev);
763                 if (container_fd < 0) {
764                         pr_err("Cannot get exclusive "
765                                 "open on container - weird.\n");
766                         goto abort;
767                 }
768                 if (mdmon_running(st->container_dev)) {
769                         if (verbose)
770                                 pr_err("reusing mdmon "
771                                         "for %s.\n",
772                                         devnum2devname(st->container_dev));
773                         st->update_tail = &st->updates;
774                 } else
775                         need_mdmon = 1;
776         }
777         rv = set_array_info(mdfd, st, &info);
778         if (rv) {
779                 pr_err("failed to set array info for %s: %s\n",
780                         mddev, strerror(errno));
781                 goto abort;
782         }
783
784         if (bitmap_file) {
785                 int uuid[4];
786
787                 st->ss->uuid_from_super(st, uuid);
788                 if (CreateBitmap(bitmap_file, force, (char*)uuid, bitmap_chunk,
789                                  delay, write_behind,
790                                  bitmapsize,
791                                  major_num)) {
792                         goto abort;
793                 }
794                 bitmap_fd = open(bitmap_file, O_RDWR);
795                 if (bitmap_fd < 0) {
796                         pr_err("weird: %s cannot be openned\n",
797                                 bitmap_file);
798                         goto abort;
799                 }
800                 if (ioctl(mdfd, SET_BITMAP_FILE, bitmap_fd) < 0) {
801                         pr_err("Cannot set bitmap file for %s: %s\n",
802                                 mddev, strerror(errno));
803                         goto abort;
804                 }
805         }
806
807         infos = xmalloc(sizeof(*infos) * total_slots);
808
809         for (pass=1; pass <=2 ; pass++) {
810                 struct mddev_dev *moved_disk = NULL; /* the disk that was moved out of the insert point */
811
812                 for (dnum=0, dv = devlist ; dv ;
813                      dv=(dv->next)?(dv->next):moved_disk, dnum++) {
814                         int fd;
815                         struct stat stb;
816                         struct mdinfo *inf = &infos[dnum];
817
818                         if (dnum >= total_slots)
819                                 abort();
820                         if (dnum == insert_point) {
821                                 moved_disk = dv;
822                                 continue;
823                         }
824                         if (strcasecmp(dv->devname, "missing")==0)
825                                 continue;
826                         if (have_container)
827                                 moved_disk = NULL;
828                         if (have_container && dnum < info.array.raid_disks - 1)
829                                 /* repeatedly use the container */
830                                 moved_disk = dv;
831
832                         switch(pass) {
833                         case 1:
834                                 *inf = info;
835
836                                 inf->disk.number = dnum;
837                                 inf->disk.raid_disk = dnum;
838                                 if (inf->disk.raid_disk < raiddisks)
839                                         inf->disk.state = (1<<MD_DISK_ACTIVE) |
840                                                 (1<<MD_DISK_SYNC);
841                                 else
842                                         inf->disk.state = 0;
843
844                                 if (dv->writemostly == 1)
845                                         inf->disk.state |= (1<<MD_DISK_WRITEMOSTLY);
846
847                                 if (have_container)
848                                         fd = -1;
849                                 else {
850                                         if (st->ss->external &&
851                                             st->container_dev != NoMdDev)
852                                                 fd = open(dv->devname, O_RDWR);
853                                         else
854                                                 fd = open(dv->devname, O_RDWR|O_EXCL);
855
856                                         if (fd < 0) {
857                                                 pr_err("failed to open %s "
858                                                         "after earlier success - aborting\n",
859                                                         dv->devname);
860                                                 goto abort;
861                                         }
862                                         fstat(fd, &stb);
863                                         inf->disk.major = major(stb.st_rdev);
864                                         inf->disk.minor = minor(stb.st_rdev);
865                                 }
866                                 if (fd >= 0)
867                                         remove_partitions(fd);
868                                 if (st->ss->add_to_super(st, &inf->disk,
869                                                          fd, dv->devname)) {
870                                         ioctl(mdfd, STOP_ARRAY, NULL);
871                                         goto abort;
872                                 }
873                                 st->ss->getinfo_super(st, inf, NULL);
874                                 safe_mode_delay = inf->safe_mode_delay;
875
876                                 if (have_container && verbose > 0)
877                                         pr_err("Using %s for device %d\n",
878                                                 map_dev(inf->disk.major,
879                                                         inf->disk.minor,
880                                                         0), dnum);
881
882                                 if (!have_container) {
883                                         /* getinfo_super might have lost these ... */
884                                         inf->disk.major = major(stb.st_rdev);
885                                         inf->disk.minor = minor(stb.st_rdev);
886                                 }
887                                 break;
888                         case 2:
889                                 inf->errors = 0;
890
891                                 rv = add_disk(mdfd, st, &info, inf);
892
893                                 if (rv) {
894                                         pr_err("ADD_NEW_DISK for %s "
895                                                "failed: %s\n",
896                                                dv->devname, strerror(errno));
897                                         goto abort;
898                                 }
899                                 break;
900                         }
901                         if (!have_container &&
902                             dv == moved_disk && dnum != insert_point) break;
903                 }
904                 if (pass == 1) {
905                         struct mdinfo info_new;
906                         struct map_ent *me = NULL;
907
908                         /* check to see if the uuid has changed due to these
909                          * metadata changes, and if so update the member array
910                          * and container uuid.  Note ->write_init_super clears
911                          * the subarray cursor such that ->getinfo_super once
912                          * again returns container info.
913                          */
914                         map_lock(&map);
915                         st->ss->getinfo_super(st, &info_new, NULL);
916                         if (st->ss->external && level != LEVEL_CONTAINER &&
917                             !same_uuid(info_new.uuid, info.uuid, 0)) {
918                                 map_update(&map, fd2devnum(mdfd),
919                                            info_new.text_version,
920                                            info_new.uuid, chosen_name);
921                                 me = map_by_devnum(&map, st->container_dev);
922                         }
923
924                         if (st->ss->write_init_super(st)) {
925                                 st->ss->free_super(st);
926                                 goto abort_locked;
927                         }
928
929                         /* update parent container uuid */
930                         if (me) {
931                                 char *path = xstrdup(me->path);
932
933                                 st->ss->getinfo_super(st, &info_new, NULL);
934                                 map_update(&map, st->container_dev,
935                                            info_new.text_version,
936                                            info_new.uuid, path);
937                                 free(path);
938                         }
939                         map_unlock(&map);
940
941                         flush_metadata_updates(st);
942                         st->ss->free_super(st);
943                 }
944         }
945         free(infos);
946
947         if (level == LEVEL_CONTAINER) {
948                 /* No need to start.  But we should signal udev to
949                  * create links */
950                 sysfs_uevent(&info, "change");
951                 if (verbose >= 0)
952                         pr_err("container %s prepared.\n", mddev);
953                 wait_for(chosen_name, mdfd);
954         } else if (runstop == 1 || subdevs >= raiddisks) {
955                 if (st->ss->external) {
956                         int err;
957                         switch(level) {
958                         case LEVEL_LINEAR:
959                         case LEVEL_MULTIPATH:
960                         case 0:
961                                 err = sysfs_set_str(&info, NULL, "array_state",
962                                                     readonly
963                                                     ? "readonly"
964                                                     : "active");
965                                 need_mdmon = 0;
966                                 break;
967                         default:
968                                 err = sysfs_set_str(&info, NULL, "array_state",
969                                                     "readonly");
970                                 break;
971                         }
972                         sysfs_set_safemode(&info, safe_mode_delay);
973                         if (err) {
974                                 pr_err("failed to"
975                                        " activate array.\n");
976                                 ioctl(mdfd, STOP_ARRAY, NULL);
977                                 goto abort;
978                         }
979                 } else if (readonly &&
980                            sysfs_attribute_available(
981                                    &info, NULL, "array_state")) {
982                         if (sysfs_set_str(&info, NULL,
983                                           "array_state", "readonly") < 0) {
984                                 pr_err("Failed to start array: %s\n",
985                                        strerror(errno));
986                                 ioctl(mdfd, STOP_ARRAY, NULL);
987                                 goto abort;
988                         }
989                 } else {
990                         /* param is not actually used */
991                         mdu_param_t param;
992                         if (ioctl(mdfd, RUN_ARRAY, &param)) {
993                                 pr_err("RUN_ARRAY failed: %s\n",
994                                        strerror(errno));
995                                 if (info.array.chunk_size & (info.array.chunk_size-1)) {
996                                         cont_err("Problem may be that "
997                                                  "chunk size is not a power of 2\n");
998                                 }
999                                 ioctl(mdfd, STOP_ARRAY, NULL);
1000                                 goto abort;
1001                         }
1002                 }
1003                 if (verbose >= 0)
1004                         pr_err("array %s started.\n", mddev);
1005                 if (st->ss->external && st->container_dev != NoMdDev) {
1006                         if (need_mdmon)
1007                                 start_mdmon(st->container_dev);
1008
1009                         ping_monitor_by_id(st->container_dev);
1010                         close(container_fd);
1011                 }
1012                 wait_for(chosen_name, mdfd);
1013         } else {
1014                 pr_err("not starting array - not enough devices.\n");
1015         }
1016         close(mdfd);
1017         return 0;
1018
1019  abort:
1020         map_lock(&map);
1021  abort_locked:
1022         map_remove(&map, fd2devnum(mdfd));
1023         map_unlock(&map);
1024
1025         if (mdfd >= 0)
1026                 close(mdfd);
1027         return 1;
1028 }