int second_missing = subdevs * 2;
int missing_disks = 0;
int insert_point = subdevs * 2; /* where to insert a missing drive */
+ int total_slots;
int pass;
int vers;
int rv;
int bitmap_fd;
+ int have_container = 0;
+ int container_fd;
+ int need_mdmon = 0;
unsigned long long bitmapsize;
- struct mdinfo info;
+ struct mdinfo *sra;
+ struct mdinfo info, *infos;
+ int did_default = 0;
int major_num = BITMAP_MAJOR_HI;
return 1;
}
}
+ if (level == UnSet) {
+ /* "ddf" and "imsm" metadata only supports one level - should possibly
+ * push this into metadata handler??
+ */
+ if (st && (st->ss == &super_ddf || st->ss == &super_imsm))
+ level = LEVEL_CONTAINER;
+ }
+
if (level == UnSet) {
fprintf(stderr,
Name ": a RAID level is needed to create an array.\n");
Name ": This level does not support spare devices\n");
return 1;
}
- fd = open(devlist->devname, O_RDONLY, 0);
+
+ if (subdevs == 1 && strcmp(devlist->devname, "missing") != 0) {
+ /* If given a single device, it might be a container, and we can
+ * extract a device list from there
+ */
+ mdu_array_info_t inf;
+ int fd;
+
+ memset(&inf, 0, sizeof(inf));
++ fd = open(devlist->devname, O_RDONLY);
+ if (fd >= 0 &&
+ ioctl(fd, GET_ARRAY_INFO, &inf) == 0 &&
+ inf.raid_disks == 0) {
+ /* yep, looks like a container */
+ if (st) {
+ rv = st->ss->load_super(st, fd,
+ devlist->devname);
+ if (rv == 0)
+ have_container = 1;
+ } else {
+ st = guess_super(fd);
+ if (st && !(rv = st->ss->
+ load_super(st, fd,
+ devlist->devname)))
+ have_container = 1;
+ else
+ st = NULL;
+ }
+ }
+ if (fd >= 0)
+ close(fd);
+ if (have_container) {
+ subdevs = 0;
+ devlist = NULL;
+ }
+ }
if (subdevs > raiddisks+sparedisks) {
fprintf(stderr, Name ": You have listed more devices (%d) than are in the array(%d)!\n", subdevs, raiddisks+sparedisks);
return 1;
}
- if (subdevs < raiddisks+sparedisks) {
+ if (!have_container && subdevs < raiddisks+sparedisks) {
fprintf(stderr, Name ": You haven't given enough devices (real or missing) to create this array\n");
return 1;
}
case 1:
case LEVEL_FAULTY:
case LEVEL_MULTIPATH:
+ case LEVEL_CONTAINER:
if (chunk) {
chunk = 0;
if (verbose > 0)
return 1;
}
+ if (st && ! st->ss->validate_geometry(st, level, layout, raiddisks,
+ chunk, size, NULL, NULL, verbose>=0))
+ return 1;
+
/* now look at the subdevs */
info.array.active_disks = 0;
info.array.working_disks = 0;
dnum = 0;
for (dv=devlist; dv; dv=dv->next, dnum++) {
char *dname = dv->devname;
- unsigned long long ldsize, freesize;
- int fd;
+ unsigned long long freesize;
if (strcasecmp(dname, "missing")==0) {
if (first_missing > dnum)
first_missing = dnum;
info.array.working_disks++;
if (dnum < raiddisks)
info.array.active_disks++;
- fd = open(dname, O_RDONLY|O_EXCL);
- if (fd <0 ) {
- fprintf(stderr, Name ": Cannot open %s: %s\n",
- dname, strerror(errno));
- fail=1;
- continue;
- }
- if (!get_dev_size(fd, dname, &ldsize)) {
- fail = 1;
- close(fd);
- continue;
- }
if (st == NULL) {
struct createinfo *ci = conf_get_create_info();
if (ci)
}
if (st == NULL) {
/* Need to choose a default metadata, which is different
- * depending on the sizes of devices
+ * depending on geometry of array.
*/
int i;
char *name = "default";
- if (level >= 1 && ldsize > (0x7fffffffULL<<10))
- name = "default/large";
- for(i=0; !st && superlist[i]; i++)
+ for(i=0; !st && superlist[i]; i++) {
st = superlist[i]->match_metadata_desc(name);
+ if (st && !st->ss->validate_geometry
+ (st, level, layout, raiddisks,
+ chunk, size, dname, &freesize,
+ verbose > 0))
+ st = NULL;
+ }
if (!st) {
- fprintf(stderr, Name ": internal error - no default metadata style\n");
+ fprintf(stderr, Name ": device %s not suitable "
+ "for any style of array\n",
+ dname);
exit(2);
}
- if (st->ss->major != 0 ||
+ if (st->ss != &super0 ||
st->minor_version != 90)
- fprintf(stderr, Name ": Defaulting to version"
- " %d.%d metadata\n",
- st->ss->major,
- st->minor_version);
- }
- freesize = st->ss->avail_size(st, ldsize >> 9);
- if (freesize == 0) {
- fprintf(stderr, Name ": %s is too small: %luK\n",
- dname, (unsigned long)(ldsize>>10));
- fail = 1;
- close(fd);
- continue;
+ did_default = 1;
+ } else {
+ if (!st->ss->validate_geometry(st, level, layout,
+ raiddisks,
+ chunk, size, dname,
+ &freesize,
+ verbose > 0)) {
+
+ fprintf(stderr,
+ Name ": %s is not suitable for "
+ "this array.\n",
+ dname);
+ fail = 1;
+ continue;
+ }
}
freesize /= 2; /* convert to K */
if (size && freesize < size) {
fprintf(stderr, Name ": %s is smaller that given size."
- " %lluK < %lluK + superblock\n", dname, freesize, size);
+ " %lluK < %lluK + metadata\n",
+ dname, freesize, size);
fail = 1;
- close(fd);
continue;
}
if (maxdisc == NULL || (maxdisc && freesize > maxsize)) {
minsize = freesize;
}
if (runstop != 1 || verbose >= 0) {
- int fd = open(dname, O_RDONLY, 0);
++ int fd = open(dname, O_RDONLY);
+ if (fd <0 ) {
+ fprintf(stderr, Name ": Cannot open %s: %s\n",
+ dname, strerror(errno));
+ fail=1;
+ continue;
+ }
warn |= check_ext2(fd, dname);
warn |= check_reiser(fd, dname);
warn |= check_raid(fd, dname);
+ close(fd);
}
- close(fd);
}
if (fail) {
fprintf(stderr, Name ": create aborted\n");
return 1;
}
if (size == 0) {
- if (mindisc == NULL) {
+ if (mindisc == NULL && !have_container) {
fprintf(stderr, Name ": no size and no drives given - aborting create.\n");
return 1;
}
- if (level > 0 || level == LEVEL_MULTIPATH || level == LEVEL_FAULTY) {
+ if (level > 0 || level == LEVEL_MULTIPATH
+ || level == LEVEL_FAULTY
+ || st->ss->external ) {
/* size is meaningful */
- if (minsize > 0x100000000ULL && st->ss->major == 0) {
+ if (!st->ss->validate_geometry(st, level, layout,
+ raiddisks,
+ chunk, minsize,
+ NULL, NULL, 0)) {
fprintf(stderr, Name ": devices too large for RAID level %d\n", level);
return 1;
}
missing_disks++;
}
- if (level <= 0 && first_missing != subdevs * 2) {
+ if (level <= 0 && first_missing < subdevs * 2) {
fprintf(stderr,
Name ": This level does not support missing devices\n");
return 1;
( level == 6 && (insert_point < raiddisks
|| second_missing < raiddisks))
||
+ ( level <= 0 )
+ ||
assume_clean
- )
+ ) {
info.array.state = 1; /* clean, but one+ drive will be missing*/
- else
+ info.resync_start = ~0ULL;
+ } else {
info.array.state = 0; /* not clean, but no errors */
-
+ info.resync_start = 0;
+ }
if (level == 10) {
/* for raid10, the bitmap size is the capacity of the array,
* which is array.size * raid_disks / ncopies;
+ info.array.failed_disks;
info.array.layout = layout;
info.array.chunk_size = chunk*1024;
- info.array.major_version = st->ss->major;
if (name == NULL || *name == 0) {
/* base name on mddev */
if (!st->ss->init_super(st, &info.array, size, name, homehost, uuid))
return 1;
+ total_slots = info.array.nr_disks;
+ st->ss->getinfo_super(st, &info);
+
+ if (did_default && verbose >= 0) {
+ if (info.text_version[0] == '/') {
+ int dnum = devname2devnum(info.text_version+1);
+ char *path;
+ int mdp = get_mdp_major();
+ struct mdinfo *mdi;
+ if (dnum > 0)
+ path = map_dev(MD_MAJOR, dnum, 1);
+ else
+ path = map_dev(mdp, (-1-dnum)<< 6, 1);
+
+ mdi = sysfs_read(-1, dnum, GET_VERSION);
+
+ fprintf(stderr, Name ": Creating array inside "
+ "%s container %s\n",
+ mdi?mdi->text_version:"managed", path);
+ sysfs_free(mdi);
+ } else
+ fprintf(stderr, Name ": Defaulting to version"
+ " %s metadata\n", info.text_version);
+ }
+
if (bitmap_file && vers < 9003) {
major_num = BITMAP_MAJOR_HOSTENDIAN;
#ifdef __BIG_ENDIAN
}
-
- if ((vers % 100) >= 1) { /* can use different versions */
+ sra = sysfs_read(mdfd, 0, 0);
+
+ if (st->ss->external) {
+ char ver[100];
+ strcat(strcpy(ver, "external:"),
+ info.text_version);
+ if (st->ss->external && st->subarray[0]) {
+ /* member */
+
+ /* When creating a member, we need to be careful
+ * to negotiate with mdmon properly.
+ * If it is already running, we cannot write to
+ * the devices and must ask it to do that part.
+ * If it isn't running, we write to the devices,
+ * and then start it.
+ * We hold an exclusive open on the container
+ * device to make sure mdmon doesn't exit after
+ * we checked that it is running.
+ *
+ * For now, fail if it is already running.
+ */
+ container_fd = open_dev_excl(st->container_dev);
+ if (container_fd < 0) {
+ fprintf(stderr, Name ": Cannot get exclusive "
+ "open on container - weird.\n");
+ return 1;
+ }
+ if (mdmon_running(st->container_dev)) {
+ if (verbose)
+ fprintf(stderr, Name ": reusing mdmon "
+ "for %s.\n",
+ devnum2devname(st->container_dev));
+ st->update_tail = &st->updates;
+ } else
+ need_mdmon = 1;
+ }
+ if ((vers % 100) < 2 ||
+ sra == NULL ||
+ sysfs_set_str(sra, NULL, "metadata_version",
+ ver) < 0) {
+ fprintf(stderr, Name ": This kernel does not "
+ "support external metadata.\n");
+ return 1;
+ }
+ rv = sysfs_set_array(sra, &info);
+ } else if ((vers % 100) >= 1) { /* can use different versions */
mdu_array_info_t inf;
memset(&inf, 0, sizeof(inf));
- inf.major_version = st->ss->major;
- inf.minor_version = st->minor_version;
+ inf.major_version = info.array.major_version;
+ inf.minor_version = info.array.minor_version;
rv = ioctl(mdfd, SET_ARRAY_INFO, &inf);
} else
rv = ioctl(mdfd, SET_ARRAY_INFO, NULL);
}
}
-
+ infos = malloc(sizeof(*infos) * total_slots);
for (pass=1; pass <=2 ; pass++) {
mddev_dev_t moved_disk = NULL; /* the disk that was moved out of the insert point */
dv=(dv->next)?(dv->next):moved_disk, dnum++) {
int fd;
struct stat stb;
+ struct mdinfo *inf = &infos[dnum];
- info.disk.number = dnum;
+ if (dnum >= total_slots)
+ abort();
if (dnum == insert_point) {
moved_disk = dv;
}
- info.disk.raid_disk = info.disk.number;
- if (info.disk.raid_disk < raiddisks)
- info.disk.state = (1<<MD_DISK_ACTIVE) |
+ if (dnum == insert_point ||
+ strcasecmp(dv->devname, "missing")==0)
+ continue;
+
+ switch(pass) {
+ case 1:
+ *inf = info;
+
+ inf->disk.number = dnum;
+ inf->disk.raid_disk = dnum;
+ if (inf->disk.raid_disk < raiddisks)
+ inf->disk.state = (1<<MD_DISK_ACTIVE) |
(1<<MD_DISK_SYNC);
- else
- info.disk.state = 0;
- if (dv->writemostly)
- info.disk.state |= (1<<MD_DISK_WRITEMOSTLY);
+ else
+ inf->disk.state = 0;
+
+ if (dv->writemostly)
+ inf->disk.state |= (1<<MD_DISK_WRITEMOSTLY);
+
+ if (st->ss->external && st->subarray[0])
- fd = open(dv->devname, O_RDWR, 0);
++ fd = open(dv->devname, O_RDWR);
+ else
- fd = open(dv->devname, O_RDWR|O_EXCL,0);
++ fd = open(dv->devname, O_RDWR|O_EXCL);
- if (dnum == insert_point ||
- strcasecmp(dv->devname, "missing")==0) {
- info.disk.major = 0;
- info.disk.minor = 0;
- info.disk.state = (1<<MD_DISK_FAULTY);
- } else {
- fd = open(dv->devname, O_RDONLY|O_EXCL);
if (fd < 0) {
- fprintf(stderr, Name ": failed to open %s after earlier success - aborting\n",
+ fprintf(stderr, Name ": failed to open %s "
+ "after earlier success - aborting\n",
dv->devname);
return 1;
}
fstat(fd, &stb);
- info.disk.major = major(stb.st_rdev);
- info.disk.minor = minor(stb.st_rdev);
+ inf->disk.major = major(stb.st_rdev);
+ inf->disk.minor = minor(stb.st_rdev);
+
remove_partitions(fd);
- close(fd);
- }
- switch(pass){
- case 1:
- st->ss->add_to_super(st, &info.disk);
+ st->ss->add_to_super(st, &inf->disk,
+ fd, dv->devname);
+ st->ss->getinfo_super(st, inf);
+
+ /* getinfo_super might have lost these ... */
+ inf->disk.major = major(stb.st_rdev);
+ inf->disk.minor = minor(stb.st_rdev);
break;
case 2:
- if (info.disk.state == 1) break;
- Kill(dv->devname, 0, 1); /* Just be sure it is clean */
- Kill(dv->devname, 0, 1); /* and again, there could be two superblocks */
- st->ss->write_init_super(st, &info.disk,
- dv->devname);
-
- if (ioctl(mdfd, ADD_NEW_DISK, &info.disk)) {
- fprintf(stderr, Name ": ADD_NEW_DISK for %s failed: %s\n",
+ inf->errors = 0;
+ rv = 0;
+
+ if (st->ss->external)
+ rv = sysfs_add_disk(sra, inf);
+ else
+ rv = ioctl(mdfd, ADD_NEW_DISK,
+ &inf->disk);
+
+ if (rv) {
+ fprintf(stderr,
+ Name ": ADD_NEW_DISK for %s "
+ "failed: %s\n",
dv->devname, strerror(errno));
st->ss->free_super(st);
return 1;
}
-
break;
}
if (dv == moved_disk && dnum != insert_point) break;
}
+ if (pass == 1) {
+ st->ss->write_init_super(st);
+ flush_metadata_updates(st);
+ }
}
+ free(infos);
st->ss->free_super(st);
/* param is not actually used */
- if (runstop == 1 || subdevs >= raiddisks) {
- mdu_param_t param;
- if (ioctl(mdfd, RUN_ARRAY, ¶m)) {
- fprintf(stderr, Name ": RUN_ARRAY failed: %s\n",
- strerror(errno));
- Manage_runstop(mddev, mdfd, -1, 0);
- return 1;
+ if (level == LEVEL_CONTAINER)
+ /* No need to start */
+ ;
+ else if (runstop == 1 || subdevs >= raiddisks) {
+ if (st->ss->external) {
+ switch(level) {
+ case LEVEL_LINEAR:
+ case LEVEL_MULTIPATH:
+ case 0:
+ sysfs_set_str(sra, NULL, "array_state",
+ "active");
+ need_mdmon = 0;
+ break;
+ default:
+ sysfs_set_str(sra, NULL, "array_state",
+ "readonly");
+ break;
+ }
+ } else {
+ mdu_param_t param;
+ if (ioctl(mdfd, RUN_ARRAY, ¶m)) {
+ fprintf(stderr, Name ": RUN_ARRAY failed: %s\n",
+ strerror(errno));
+ Manage_runstop(mddev, mdfd, -1, 0);
+ return 1;
+ }
}
if (verbose >= 0)
fprintf(stderr, Name ": array %s started.\n", mddev);
+ if (st->ss->external && st->subarray[0]) {
+ if (need_mdmon)
+ start_mdmon(st->container_dev);
+
+ ping_monitor(devnum2devname(st->container_dev));
+ close(container_fd);
+ }
} else {
fprintf(stderr, Name ": not starting array - not enough devices.\n");
}
return 1;
}
- nfd = open(newdev, O_RDWR|O_EXCL);
+ nfd = open(newdev, O_RDWR|O_EXCL|O_DIRECT);
if (nfd < 0) {
fprintf(stderr, Name ": cannot open %s\n", newdev);
return 1;
last_block = nstripe * ndata;
ostripe = last_block / odata / (ochunk/512) * (ochunk/512);
}
- printf("mdadm: Need to backup %lluK of critical section..\n", last_block/2);
+ fprintf(stderr, Name ": Need to backup %lluK of critical "
+ "section..\n", last_block/2);
sra = sysfs_read(fd, 0,
GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE|
}
spares = sra->array.spare_disks;
if (backup_file) {
- fdlist[d] = open(backup_file, O_RDWR|O_CREAT|O_EXCL, 0600);
+ fdlist[d] = open(backup_file, O_RDWR|O_CREAT|O_EXCL, S_IRUSR | S_IWUSR);
if (fdlist[d] < 0) {
fprintf(stderr, Name ": %s: cannot create backup file %s: %s\n",
devname, backup_file, strerror(errno));
if (backup_file)
unlink(backup_file);
- printf(Name ": ... critical section passed.\n");
+ fprintf(stderr, Name ": ... critical section passed.\n");
break;
}
return 0;
OBJS = mdadm.o config.o mdstat.o ReadMe.o util.o Manage.o Assemble.o Build.o \
Create.o Detail.o Examine.o Grow.o Monitor.o dlink.o Kill.o Query.o \
Incremental.o \
- mdopen.o super0.o super1.o bitmap.o restripe.o sysfs.o sha1.o \
- mapfile.o
+ mdopen.o super0.o super1.o super-ddf.o super-intel.o bitmap.o \
+ restripe.o sysfs.o sha1.o mapfile.o crc32.o sg_io.o msg.o
SRCS = mdadm.c config.c mdstat.c ReadMe.c util.c Manage.c Assemble.c Build.c \
Create.c Detail.c Examine.c Grow.c Monitor.c dlink.c Kill.c Query.c \
Incremental.c \
- mdopen.c super0.c super1.c bitmap.c restripe.c sysfs.c sha1.c \
- mapfile.c
+ mdopen.c super0.c super1.c super-ddf.c super-intel.c bitmap.c \
+ restripe.c sysfs.c sha1.c mapfile.c crc32.c sg_io.c msg.c
+
+MON_OBJS = mdmon.o monitor.o managemon.o util.o mdstat.o sysfs.o config.o \
+ Kill.o sg_io.o dlink.o ReadMe.o super0.o super1.o super-intel.o \
+ super-ddf.o sha1.o crc32.o msg.o
+
STATICSRC = pwgr.c
STATICOBJS = pwgr.o
ASSEMBLE_SRCS := mdassemble.c Assemble.c Manage.c config.c dlink.c util.c \
- super0.c super1.c sha1.c
+ super0.c super1.c super-ddf.c super-intel.c sha1.c crc32.c sg_io.c
ASSEMBLE_AUTO_SRCS := mdopen.c mdstat.c sysfs.c
ASSEMBLE_FLAGS:= $(CFLAGS) -DMDASSEMBLE
ifdef MDASSEMBLE_AUTO
ASSEMBLE_FLAGS += -DMDASSEMBLE_AUTO
endif
-all : mdadm mdadm.man md.man mdadm.conf.man
+all : mdadm mdmon mdadm.man md.man mdadm.conf.man
everything: all mdadm.static swap_super test_stripe \
mdassemble mdassemble.auto mdassemble.static mdassemble.man \
mdadm.O2 : $(SRCS) mdadm.h
gcc -o mdadm.O2 $(CFLAGS) -DHAVE_STDINT_H -O2 $(SRCS)
+mdmon : $(MON_OBJS)
+ $(CC) $(LDFLAGS) -o mdmon $(MON_OBJS) $(LDLIBS)
+msg.o: msg.c msg.h
+
test_stripe : restripe.c mdadm.h
$(CC) $(CXFLAGS) $(LDFLAGS) -o test_stripe -DMAIN restripe.c
sha1.o : sha1.c sha1.h md5.h
$(CC) $(CFLAGS) -DHAVE_STDINT_H -o sha1.o -c sha1.c
-install : mdadm install-man
+install : mdadm mdmon install-man
$(INSTALL) -D $(STRIP) -m 755 mdadm $(DESTDIR)$(BINDIR)/mdadm
+ $(INSTALL) -D $(STRIP) -m 755 mdmon $(DESTDIR)$(BINDIR)/mdmon
install-static : mdadm.static install-man
$(INSTALL) -D $(STRIP) -m 755 mdadm.static $(DESTDIR)$(BINDIR)/mdadm
@echo "Please run 'sh ./test' as root"
clean :
- rm -f mdadm $(OBJS) $(STATICOBJS) core *.man mdadm.tcc mdadm.uclibc mdadm.static *.orig *.porig *.rej *.alt \
+ rm -f mdadm mdmon $(OBJS) $(MON_OBJS) $(STATICOBJS) core *.man \
+ mdadm.tcc mdadm.uclibc mdadm.static *.orig *.porig *.rej *.alt \
mdadm.Os mdadm.O2 \
mdassemble mdassemble.static mdassemble.auto mdassemble.uclibc \
mdassemble.klibc swap_super \
TAGS :
etags *.h *.c
+
+ DISTRO_MAKEFILE := $(wildcard distropkg/Makefile)
+ ifdef DISTRO_MAKEFILE
+ include $(DISTRO_MAKEFILE)
+ endif
+
{
/* Run or stop the array. array must already be configured
* required >= 0.90.0
+ * Only print failure messages if quiet == 0;
+ * quiet > 0 means really be quiet
+ * quiet < 0 means we will try again if it fails.
*/
mdu_param_t param; /* unused */
if (runstop == -1 && md_get_version(fd) < 9000) {
if (ioctl(fd, STOP_MD, 0)) {
- if (!quiet) fprintf(stderr, Name ": stopping device %s failed: %s\n",
- devname, strerror(errno));
+ if (quiet == 0) fprintf(stderr,
+ Name ": stopping device %s "
+ "failed: %s\n",
+ devname, strerror(errno));
return 1;
}
}
} else if (runstop < 0){
struct map_ent *map = NULL;
struct stat stb;
- if (ioctl(fd, STOP_ARRAY, NULL)) {
- if (quiet==0) {
- fprintf(stderr, Name ": fail to stop array %s: %s\n",
+ struct mdinfo *mdi;
+ /* If this is an mdmon managed array, just write 'inactive'
+ * to the array state and let mdmon clear up.
+ */
+ mdi = sysfs_read(fd, -1, GET_LEVEL|GET_VERSION);
+ if (mdi &&
+ mdi->array.level > 0 &&
+ mdi->text_version[0] == '/') {
+ char *cp;
+
+ /* This is mdmon managed. */
+ close(fd);
+ if (sysfs_set_str(mdi, NULL,
+ "array_state", "inactive") < 0) {
+ if (quiet == 0)
+ fprintf(stderr, Name
+ ": failed to stop array %s: %s\n",
+ devname, strerror(errno));
+ return 1;
+ }
+
+ /* Give monitor a chance to act */
+ cp = strchr(mdi->text_version+1, '/');
+ if (*cp)
+ *cp = 0;
+ ping_monitor(mdi->text_version+1);
+
+ fd = open(devname, O_RDONLY);
+ } else if (mdi &&
+ mdi->array.major_version == -1 &&
+ mdi->array.minor_version == -2 &&
+ mdi->text_version[0] != '/') {
+ /* container, possibly mdmon-managed.
+ * Make sure mdmon isn't opening it, which
+ * would interfere with the 'stop'
+ */
+ ping_monitor(mdi->sys_name);
+ }
+ if (mdi)
+ sysfs_free(mdi);
+
+ if (fd >= 0 && ioctl(fd, STOP_ARRAY, NULL)) {
- if (quiet == 0)
++ if (quiet == 0) {
+ fprintf(stderr, Name
+ ": failed to stop array %s: %s\n",
devname, strerror(errno));
+ if (errno == EBUSY)
+ fprintf(stderr, "Perhaps a running "
+ "process, mounted filesystem "
+ "or active volume group?\n");
+ }
return 1;
}
+
if (quiet <= 0)
fprintf(stderr, Name ": stopped %s\n", devname);
- if (fstat(fd, &stb) == 0) {
+ if (fd >= 0 && fstat(fd, &stb) == 0) {
int devnum;
if (major(stb.st_rdev) == MD_MAJOR)
devnum = minor(stb.st_rdev);
struct supertype *st, *tst;
int duuid[4];
int ouuid[4];
+ int lfd = -1;
if (ioctl(fd, GET_ARRAY_INFO, &array)) {
fprintf(stderr, Name ": cannot get array info for %s\n",
unsigned long long ldsize;
char dvname[20];
char *dnprintable = dv->devname;
+ int err;
next = dv->next;
jnext = 0;
return 1;
case 'a':
/* add the device */
-
+ if (tst->subarray[0]) {
+ fprintf(stderr, Name ": Cannot add disks to a"
+ " \'member\' array, perform this"
+ " operation on the parent container\n");
+ return 1;
+ }
/* Make sure it isn't in use (in 2.6 or later) */
- tfd = open(dv->devname, O_RDONLY|O_EXCL);
+ tfd = open(dv->devname, O_RDONLY|O_EXCL|O_DIRECT);
if (tfd < 0) {
fprintf(stderr, Name ": Cannot open %s: %s\n",
dv->devname, strerror(errno));
}
close(tfd);
- if (array.major_version == 0 &&
+
+ if (!tst->ss->external &&
+ array.major_version == 0 &&
md_get_version(fd)%100 < 2) {
if (ioctl(fd, HOT_ADD_DISK,
(unsigned long)stb.st_rdev)==0) {
disc.number =j;
disc.state = 0;
if (array.not_persistent==0) {
+ int dfd;
if (dv->writemostly)
disc.state |= 1 << MD_DISK_WRITEMOSTLY;
- tst->ss->add_to_super(tst, &disc);
- if (tst->ss->write_init_super(tst, &disc,
- dv->devname))
+ dfd = open(dv->devname, O_RDWR | O_EXCL|O_DIRECT);
+ tst->ss->add_to_super(tst, &disc, dfd,
+ dv->devname);
+ /* write_init_super will close 'dfd' */
+ if (tst->ss->write_init_super(tst))
return 1;
} else if (dv->re_add) {
/* this had better be raid1.
case 'r':
/* hot remove */
+ if (tst->subarray[0]) {
+ fprintf(stderr, Name ": Cannot remove disks from a"
+ " \'member\' array, perform this"
+ " operation on the parent container\n");
+ return 1;
+ }
+ if (tst->ss->external) {
+ /* To remove a device from a container, we must
+ * check that it isn't in use in an array.
+ * This involves looking in the 'holders'
+ * directory - there must be just one entry,
+ * the container.
+ * To ensure that it doesn't get used as a
+ * hot spare while we are checking, we
+ * get an O_EXCL open on the container
+ */
+ int dnum = fd2devnum(fd);
+ lfd = open_dev_excl(dnum);
+ if (lfd < 0) {
+ fprintf(stderr, Name
+ ": Cannot get exclusive access "
+ " to container - odd\n");
+ return 1;
+ }
+ if (!sysfs_unique_holder(dnum, stb.st_rdev)) {
+ fprintf(stderr, Name
+ ": %s is %s, cannot remove.\n",
+ dnprintable,
+ errno == EEXIST ? "still in use":
+ "not a member");
+ close(lfd);
+ return 1;
+ }
+ }
/* FIXME check that it is a current member */
- if (ioctl(fd, HOT_REMOVE_DISK, (unsigned long)stb.st_rdev)) {
+ err = ioctl(fd, HOT_REMOVE_DISK, (unsigned long)stb.st_rdev);
+ if (err && errno == ENODEV) {
+ /* Old kernels rejected this if no personality
+ * registered */
+ struct mdinfo *sra = sysfs_read(fd, 0, GET_DEVS);
+ struct mdinfo *dv = NULL;
+ if (sra)
+ dv = sra->devs;
+ for ( ; dv ; dv=dv->next)
+ if (dv->disk.major == major(stb.st_rdev) &&
+ dv->disk.minor == minor(stb.st_rdev))
+ break;
+ if (dv)
+ err = sysfs_set_str(sra, dv,
+ "state", "remove");
+ else
+ err = -1;
+ if (sra)
+ sysfs_free(sra);
+ }
+ if (err) {
fprintf(stderr, Name ": hot remove failed "
"for %s: %s\n", dnprintable,
strerror(errno));
+ if (lfd >= 0)
+ close(lfd);
return 1;
}
+ close(lfd);
if (verbose >= 0)
fprintf(stderr, Name ": hot removed %s\n",
dnprintable);
* whether it is an md device and whether it has
* a superblock
*/
- int fd = open(dev, O_RDONLY, 0);
+ int fd = open(dev, O_RDONLY);
int vers;
int ioctlerr;
int superror, superrno;
if (superror == 0) {
/* array might be active... */
st->ss->getinfo_super(st, &info);
- if (st->ss->major == 0) {
+ if (st->ss == &super0) {
mddev = get_md_name(info.array.md_minor);
disc.number = info.disk.number;
activity = "undetected";
activity,
map_num(pers, info.array.level),
mddev);
- if (st->ss->major == 0)
+ if (st->ss == &super0)
put_md_name(mddev);
}
return 0;
" --size= -z : Change the active size of devices in an array.\n"
" : This is useful if all devices have been replaced\n"
" : with larger devices.\n"
- " --raid-disks= -n : Change the number of active devices in an array.\n"
- " : array.\n"
+ " --raid-devices= -n : Change the number of active devices in an array.\n"
" --bitmap= -b : Add or remove a write-intent bitmap.\n"
+ " --backup-file= file : A file on a differt device to store data for a\n"
+ " : short time while increasing raid-devices on a\n"
+ " : RAID4/5/6 array. Not needed when a spare is present.\n"
;
char Help_incr[] =
{ "raid10", 10},
{ "10", 10},
{ "faulty", LEVEL_FAULTY},
+ { "container", LEVEL_CONTAINER},
{ NULL, 0}
};
of any given block are on different drives.
The 'far' arrangement can give sequential read performance equal to
- that of a RAID0 array, but at the cost of degraded write performance.
+ that of a RAID0 array, but at the cost of reduced write performance.
When 'offset' replicas are chosen, the multiple copies of a given
chunk are laid out on consecutive drives and at consecutive offsets.
.B md/stripe_cache_size
This is only available on RAID5 and RAID6. It records the size (in
pages per device) of the stripe cache which is used for synchronising
-all read and write operations to the array. The default is 128.
+all write operations to the array and all read operations if the array
+is degraded. The default is 256. Valid values are 17 to 32768.
Increasing this number can increase performance in some situations, at
-some cost in system memory.
+some cost in system memory. Note, setting this value too high can
+result in an "out of memory" condition for the system.
+memory_consumed = system_page_size * nr_disks * stripe_cache_size
+
+.TP
+.B md/preread_bypass_threshold
+This is only available on RAID5 and RAID6. This variable sets the
+number of times MD will service a full-stripe-write before servicing a
+stripe that requires some "prereading". For fairness this defaults to
+1. Valid values are 0 to stripe_cache_size. Setting this to 0
+maximizes sequential-write throughput at the cost of fairness to threads
+doing small or random writes.
.SS KERNEL PARAMETERS
is also not true RAID, and it only involves one device. It
provides a layer over a true device that can be used to inject faults.
- .\".B mdadm
+ .\".I mdadm
.\"is a program that can be used to create, manage, and monitor
.\"MD devices. As
.\"such it provides a similar set of functionality to the
.\".B raidtools
.\"packages.
.\"The key differences between
- .\".B mdadm
+ .\".I mdadm
.\"and
.\".B raidtools
.\"are:
.\".IP \(bu 4
- .\".B mdadm
+ .\".I mdadm
.\"is a single program and not a collection of programs.
.\".IP \(bu 4
- .\".B mdadm
+ .\".I mdadm
.\"can perform (almost) all of its functions without having a
.\"configuration file and does not use one by default. Also
- .\".B mdadm
+ .\".I mdadm
.\"helps with management of the configuration
.\"file.
.\".IP \(bu 4
- .\".B mdadm
+ .\".I mdadm
.\"can provide information about your arrays (through Query, Detail, and Examine)
.\"that
.\".B raidtools
Assemble the components of a previously created
array into an active array. Components can be explicitly given
or can be searched for.
- .B mdadm
+ .I mdadm
checks that the components
do form a bona fide array, and can, on request, fiddle superblock
information so as to assemble a faulty array.
.TP
.BR \-q ", " \-\-quiet
Avoid printing purely informative messages. With this,
- .B mdadm
+ .I mdadm
will be silent unless there is something really important to report.
.TP
.B /proc/mdstat
for missing information.
In general, this option gives
- .B mdadm
+ .I mdadm
permission to get any missing information (like component devices,
array devices, array identities, and alert destination) from the
configuration file (see previous option);
to use the minor number of the md device that is being assembled.
e.g. when assembling
.BR /dev/md0 ,
- .M \-\-super\-minor=dev
+ .B \-\-super\-minor=dev
will look for super blocks with a minor number of 0.
.TP
.TP
.BR \-d ", " \-\-delay
Give a delay in seconds.
- .B mdadm
+ .I mdadm
polls the md arrays and then waits this many seconds before polling
again. The default is 60 seconds.
.TP
.BR \-f ", " \-\-daemonise
Tell
- .B mdadm
+ .I mdadm
to run as a background daemon if it decides to monitor anything. This
causes it to fork and run in the child, and to disconnect from the
terminal. The process id of the child is written to stdout.
.TP
.BR \-i ", " \-\-pid\-file
When
- .B mdadm
+ .I mdadm
is running in daemon mode, write the pid of the daemon process to
the specified file, instead of printing it on standard output.
To create a "degraded" array in which some devices are missing, simply
give the word "\fBmissing\fP"
in place of a device name. This will cause
- .B mdadm
+ .I mdadm
to leave the corresponding slot in the array empty.
For a RAID4 or RAID5 array at most one slot can be
"\fBmissing\fP"; for a RAID6 array at most two slots.
"\fBmissing\fP".
When creating a RAID5 array,
- .B mdadm
+ .I mdadm
will automatically create a degraded array with an extra spare drive.
This is because building the spare into a degraded array is in general faster than resyncing
the parity on a non-degraded, but not clean, array. This feature can
.B \-\-force
option.
- When creating an array with version-1 metadata a name for the host is
+ When creating an array with version-1 metadata a name for the array is
required.
If this is not given with the
.B \-\-name
option,
.I mdadm
- will chose a name based on the last component of the name of the
+ will choose a name based on the last component of the name of the
device being created. So if
.B /dev/md3
is being created, then the name
.B home
will be used.
+ When creating a partition based array, using
+ .I mdadm
+ with version-1.x metadata, the partition type should be set to
+ .B 0xDA
+ (non fs-data). This type selection allows for greater precision since
+ using any other [RAID auto-detect (0xFD) or a GNU/Linux partition (0x83)],
+ might create problems in the event of array recovery through a live cdrom.
+
A new array will normally get a randomly assigned 128bit UUID which is
very likely to be unique. If you have a specific need, you can choose
a UUID for the array by giving the
.TP
.B \-\-examine
The device should be a component of an md array.
- .B mdadm
+ .I mdadm
will read the md superblock of the device and display the contents.
If
.B \-\-brief
.PP
This usage causes
- .B mdadm
+ .I mdadm
to periodically poll a number of md arrays and to report on any events
noticed.
- .B mdadm
+ .I mdadm
will never exit once it decides that there are arrays to be checked,
so it should normally be run in the background.
As well as reporting events,
- .B mdadm
+ .I mdadm
may move a spare drive from one array to another if they are in the
same
.B spare-group
and if the destination array has a failed drive but no spares.
If any devices are listed on the command line,
- .B mdadm
+ .I mdadm
will only monitor those devices. Otherwise all arrays listed in the
configuration file will be monitored. Further, if
.B \-\-scan
.B \-\-scan
is given, then a program or an E-mail address must be specified on the
command line or in the config file. If neither are available, then
- .B mdadm
+ .I mdadm
will not monitor anything.
Without
.B \-\-scan,
- .B mdadm
+ .I mdadm
will continue monitoring as long as something was found to monitor. If
no program or email is given, then each event is reported to
.BR stdout .
the second device is the array that the spare was moved from.
For
- .B mdadm
+ .I mdadm
to move spares from one array to another, the different arrays need to
be labeled with the same
.B spare-group
groups use different names.
When
- .B mdadm
+ .I mdadm
detects that an array in a spare group has fewer active
devices than necessary for the complete array, and has no spare
devices, it will look for another array in the same spare group that
change the "size" attribute
for RAID1, RAID5 and RAID6.
.IP \(bu 4
- increase the "raid-disks" attribute of RAID1, RAID5, and RAID6.
+ increase the "raid\-devices" attribute of RAID1, RAID5, and RAID6.
.IP \(bu 4
add a write-intent bitmap to any array which supports these bitmaps, or
remove a write-intent bitmap from such an array.
As an alternative,
.B \-\-run
may be passed to
- .B mdadm
+ .I mdadm
in which case the array will be run as soon as there are enough
devices present for the data to be accessible. For a raid1, that
means one device will start the array. For a clean raid5, the array
happens. Further devices that are found before the first write can
still be added safely.
+
+.SH ENVIRONMENT
+This section describes environment variables that affect how mdadm
+operates.
+
+.TP
+.B MDADM_NO_MDMON
+Setting this value to 1 will prevent mdadm from automatically launching
+mdmon. This variable is intended primarily for debugging mdadm/mdmon.
+
.SH EXAMPLES
.B " mdadm \-\-query /dev/name-of-device"
filesystem,
.B /proc/mdstat
lists all active md devices with information about them.
- .B mdadm
+ .I mdadm
uses this to find arrays when
.B \-\-scan
is given in Misc mode, and to monitor array reconstruction
Partition numbers should be indicated by adding "pMM" to these, thus "/dev/md/d1p2".
.SH NOTE
- .B mdadm
+ .I mdadm
was previously known as
- .BR mdctl .
+ .IR mdctl .
.P
- .B mdadm
+ .I mdadm
is completely separate from the
- .B raidtools
+ .I raidtools
package, and does not use the
.I /etc/raidtab
configuration file at all.
break;
}
if (raiddisks == 0) {
- fprintf(stderr, Name ": no raid-disks specified.\n");
+ fprintf(stderr, Name ": no raid-devices specified.\n");
rv = 1;
break;
}
break;
}
if (raiddisks == 0) {
- fprintf(stderr, Name ": no raid-disks specified.\n");
+ fprintf(stderr, Name ": no raid-devices specified.\n");
rv = 1;
break;
}
export, test, homehost);
continue;
case 'K': /* Zero superblock */
- rv |= Kill(dv->devname, force, quiet); continue;
+ rv |= Kill(dv->devname, force, quiet,0);
+ continue;
case 'Q':
rv |= Query(dv->devname); continue;
case 'X':
path0=$dev6
path1=$dev7
+ echo 2000 > /proc/sys/dev/raid/speed_limit_max
+ echo 0 > /sys/module/md_mod/parameters/start_ro
+
if [ " $1" = " setup" ]
then trap 0 ; exit 0
fi
grep -s "active $1 " /proc/mdstat > /dev/null || {
echo >&2 "ERROR active $1 not found" ; cat /proc/mdstat ; exit 1;}
;;
- resync | recovery )
+ resync | recovery | reshape)
sleep 0.5
grep -s $1 /proc/mdstat > /dev/null || {
echo >&2 ERROR no $1 happening; cat /proc/mdstat; exit 1; }
nosync )
sleep 0.5
- if grep -s 're[synccovery]* =' > /dev/null /proc/mdstat ; then
+ if grep -s -E '(resync|recovery|reshape) =' > /dev/null /proc/mdstat ; then
echo >&2 "ERROR resync or recovery is happening!"; cat /proc/mdstat ; exit 1;
fi
;;
wait )
sleep 0.1
- while grep 're[synccovery]* =' > /dev/null /proc/mdstat
+ while grep -E '(resync|recovery|reshape|check|repair) =' > /dev/null /proc/mdstat
do sleep 2;
done
;;
if [ -f "$script" ]
then
rm -f $targetdir/stderr
+ # stop all arrays, just in case some script left an array active.
+ mdadm -Ssq
# source script in a subshell, so it has access to our
# namespace, but cannot change it.
if ( set -ex ; . $script ) 2> $targetdir/log