X-Git-Url: http://git.ipfire.org/?p=thirdparty%2Fmdadm.git;a=blobdiff_plain;f=util.c;h=4adbbff0295e30c984f59b5c5a5c4bfe024e1d60;hp=7e2bbad14ed89a3035d0969eab6ad64d204b8612;hb=1b7eb962db2cf9179d097e06cce74b84ac80e49d;hpb=5a23a06ea472460ae3beddf9140923570268e3dd diff --git a/util.c b/util.c index 7e2bbad1..4adbbff0 100644 --- a/util.c +++ b/util.c @@ -1,7 +1,7 @@ /* * mdadm - manage Linux "md" devices aka RAID arrays. * - * Copyright (C) 2001-2012 Neil Brown + * Copyright (C) 2001-2013 Neil Brown * * * This program is free software; you can redistribute it and/or modify @@ -28,9 +28,16 @@ #include #include #include +#include +#include +#include +#include +#include #include #include #include +#include + /* * following taken from linux/blkpg.h because they aren't @@ -76,6 +83,229 @@ struct blkpg_partition { aren't permitted). */ #define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:-!!(e); })) +static int is_dlm_hooks_ready = 0; + +int dlm_funs_ready(void) +{ + return is_dlm_hooks_ready ? 1 : 0; +} + +static struct dlm_hooks *dlm_hooks = NULL; +struct dlm_lock_resource *dlm_lock_res = NULL; +static int ast_called = 0; + +struct dlm_lock_resource { + dlm_lshandle_t *ls; + struct dlm_lksb lksb; +}; + +/* Using poll(2) to wait for and dispatch ASTs */ +static int poll_for_ast(dlm_lshandle_t ls) +{ + struct pollfd pfd; + + pfd.fd = dlm_hooks->ls_get_fd(ls); + pfd.events = POLLIN; + + while (!ast_called) + { + if (poll(&pfd, 1, 0) < 0) + { + perror("poll"); + return -1; + } + dlm_hooks->dispatch(dlm_hooks->ls_get_fd(ls)); + } + ast_called = 0; + + return 0; +} + +static void dlm_ast(void *arg) +{ + ast_called = 1; +} + +static char *cluster_name = NULL; +/* Create the lockspace, take bitmapXXX locks on all the bitmaps. 
*/ +int cluster_get_dlmlock(void) +{ + int ret = -1; + char str[64]; + int flags = LKF_NOQUEUE; + int retry_count = 0; + + if (!dlm_funs_ready()) { + pr_err("Something wrong with dlm library\n"); + return -1; + } + + ret = get_cluster_name(&cluster_name); + if (ret) { + pr_err("The md can't get cluster name\n"); + return -1; + } + + dlm_lock_res = xmalloc(sizeof(struct dlm_lock_resource)); + dlm_lock_res->ls = dlm_hooks->open_lockspace(cluster_name); + if (!dlm_lock_res->ls) { + dlm_lock_res->ls = dlm_hooks->create_lockspace(cluster_name, O_RDWR); + if (!dlm_lock_res->ls) { + pr_err("%s failed to create lockspace\n", cluster_name); + return -ENOMEM; + } + } else { + pr_err("open existed %s lockspace\n", cluster_name); + } + + snprintf(str, 64, "bitmap%s", cluster_name); +retry: + ret = dlm_hooks->ls_lock(dlm_lock_res->ls, LKM_PWMODE, + &dlm_lock_res->lksb, flags, str, strlen(str), + 0, dlm_ast, dlm_lock_res, NULL, NULL); + if (ret) { + pr_err("error %d when get PW mode on lock %s\n", errno, str); + /* let's try several times if EAGAIN happened */ + if (dlm_lock_res->lksb.sb_status == EAGAIN && retry_count < 10) { + sleep(10); + retry_count++; + goto retry; + } + dlm_hooks->release_lockspace(cluster_name, dlm_lock_res->ls, 1); + return ret; + } + + /* Wait for it to complete */ + poll_for_ast(dlm_lock_res->ls); + + if (dlm_lock_res->lksb.sb_status) { + pr_err("failed to lock cluster\n"); + return -1; + } + return 1; +} + +int cluster_release_dlmlock(void) +{ + int ret = -1; + + if (!cluster_name) + goto out; + + if (!dlm_lock_res->lksb.sb_lkid) + goto out; + + ret = dlm_hooks->ls_unlock_wait(dlm_lock_res->ls, + dlm_lock_res->lksb.sb_lkid, 0, + &dlm_lock_res->lksb); + if (ret) { + pr_err("error %d happened when unlock\n", errno); + /* XXX make sure the lock is unlocked eventually */ + goto out; + } + + /* Wait for it to complete */ + poll_for_ast(dlm_lock_res->ls); + + errno = dlm_lock_res->lksb.sb_status; + if (errno != EUNLOCK) { + pr_err("error %d happened in ast 
when unlock lockspace\n", + errno); + /* XXX make sure the lockspace is unlocked eventually */ + goto out; + } + + ret = dlm_hooks->release_lockspace(cluster_name, dlm_lock_res->ls, 1); + if (ret) { + pr_err("error %d happened when release lockspace\n", errno); + /* XXX make sure the lockspace is released eventually */ + goto out; + } + free(dlm_lock_res); + +out: + return ret; +} + +int md_array_valid(int fd) +{ + struct mdinfo *sra; + int ret; + + sra = sysfs_read(fd, NULL, GET_ARRAY_STATE); + if (sra) { + if (sra->array_state != ARRAY_UNKNOWN_STATE) + ret = 0; + else + ret = -ENODEV; + + free(sra); + } else { + /* + * GET_ARRAY_INFO doesn't provide access to the proper state + * information, so fallback to a basic check for raid_disks != 0 + */ + ret = ioctl(fd, RAID_VERSION); + } + + return !ret; +} + +int md_array_active(int fd) +{ + struct mdinfo *sra; + struct mdu_array_info_s array; + int ret = 0; + + sra = sysfs_read(fd, NULL, GET_ARRAY_STATE); + if (sra) { + if (!md_array_is_active(sra)) + ret = -ENODEV; + + free(sra); + } else { + /* + * GET_ARRAY_INFO doesn't provide access to the proper state + * information, so fallback to a basic check for raid_disks != 0 + */ + ret = ioctl(fd, GET_ARRAY_INFO, &array); + } + + return !ret; +} + +int md_array_is_active(struct mdinfo *info) +{ + return (info->array_state != ARRAY_CLEAR && + info->array_state != ARRAY_INACTIVE && + info->array_state != ARRAY_UNKNOWN_STATE); +} + +/* + * Get array info from the kernel. Longer term we want to deprecate the + * ioctl and get it from sysfs. + */ +int md_get_array_info(int fd, struct mdu_array_info_s *array) +{ + return ioctl(fd, GET_ARRAY_INFO, array); +} + +/* + * Set array info + */ +int md_set_array_info(int fd, struct mdu_array_info_s *array) +{ + return ioctl(fd, SET_ARRAY_INFO, array); +} + +/* + * Get disk info from the kernel. 
+ */ +int md_get_disk_info(int fd, struct mdu_disk_info_s *disk) +{ + return ioctl(fd, GET_DISK_INFO, disk); +} + /* * Parse a 128 bit uuid in 4 integers * format is 32 hexx nibbles with options :. separator @@ -113,35 +343,6 @@ int parse_uuid(char *str, int uuid[4]) return 0; } -/* - * Get the md version number. - * We use the RAID_VERSION ioctl if it is supported - * If not, but we have a block device with major '9', we assume - * 0.36.0 - * - * Return version number as 24 but number - assume version parts - * always < 255 - */ - -int md_get_version(int fd) -{ - struct stat stb; - mdu_version_t vers; - - if (fstat(fd, &stb)<0) - return -1; - if ((S_IFMT&stb.st_mode) != S_IFBLK) - return -1; - - if (ioctl(fd, RAID_VERSION, &vers) == 0) - return (vers.major*10000) + (vers.minor*100) + vers.patchlevel; - if (errno == EACCES) - return -1; - if (major(stb.st_rdev) == MD_MAJOR) - return (3600); - return -1; -} - int get_linux_version() { struct utsname name; @@ -160,7 +361,6 @@ int get_linux_version() return (a*1000000)+(b*1000)+c; } -#ifndef MDASSEMBLE int mdadm_version(char *version) { int a, b, c; @@ -222,6 +422,17 @@ unsigned long long parse_size(char *size) return s; } +int is_near_layout_10(int layout) +{ + int fc, fo; + + fc = (layout >> 8) & 255; + fo = layout & (1 << 16); + if (fc > 1 || fo > 0) + return 0; + return 1; +} + int parse_layout_10(char *layout) { int copies, rv; @@ -266,7 +477,16 @@ long parse_num(char *num) else return rv; } -#endif + +int parse_cluster_confirm_arg(char *input, char **devname, int *slot) +{ + char *dev; + *slot = strtoul(input, &dev, 10); + if (dev == input || dev[0] != ':') + return -1; + *devname = dev+1; + return 0; +} void remove_partitions(int fd) { @@ -304,7 +524,7 @@ int test_partition(int fd) if (ioctl(fd, BLKPG, &a) == 0) /* Very unlikely, but not a partition */ return 0; - if (errno == ENXIO) + if (errno == ENXIO || errno == ENOTTY) /* not a partition */ return 0; @@ -365,6 +585,13 @@ int enough(int level, int 
raid_disks, int layout, int clean, char *avail) case 1: return avail_disks >= 1; case 4: + if (avail_disks == raid_disks - 1 && + !avail[raid_disks - 1]) + /* If just the parity device is missing, then we + * have enough, even if not clean + */ + return 1; + /* FALL THROUGH */ case 5: if (clean) return avail_disks >= raid_disks-1; @@ -380,40 +607,6 @@ int enough(int level, int raid_disks, int layout, int clean, char *avail) } } -int enough_fd(int fd) -{ - struct mdu_array_info_s array; - struct mdu_disk_info_s disk; - int avail_disks = 0; - int i, rv; - char *avail; - - if (ioctl(fd, GET_ARRAY_INFO, &array) != 0 || - array.raid_disks <= 0) - return 0; - avail = xcalloc(array.raid_disks, 1); - for (i = 0; i < MAX_DISKS && array.nr_disks > 0; i++) { - disk.number = i; - if (ioctl(fd, GET_DISK_INFO, &disk) != 0) - continue; - if (disk.major == 0 && disk.minor == 0) - continue; - array.nr_disks--; - - if (! (disk.state & (1<= array.raid_disks) - continue; - avail_disks++; - avail[disk.raid_disk] = 1; - } - /* This is used on an active array, so assume it is clean */ - rv = enough(array.level, array.raid_disks, array.layout, - 1, avail); - free(avail); - return rv; -} - const int uuid_zero[4] = { 0, 0, 0, 0 }; int same_uuid(int a[4], int b[4], int swapuuid) @@ -484,17 +677,18 @@ char *__fname_from_uuid(int id[4], int swap, char *buf, char sep) } -char *fname_from_uuid(struct supertype *st, struct mdinfo *info, char *buf, char sep) +char *fname_from_uuid(struct supertype *st, struct mdinfo *info, + char *buf, char sep) { // dirty hack to work around an issue with super1 superblocks... // super1 superblocks need swapuuid set in order for assembly to // work, but can't have it set if we want this printout to match // all the other uuid printouts in super1.c, so we force swapuuid // to 1 to make our printout match the rest of super1 - return __fname_from_uuid(info->uuid, (st->ss == &super1) ? 
1 : st->ss->swapuuid, buf, sep); + return __fname_from_uuid(info->uuid, (st->ss == &super1) ? 1 : + st->ss->swapuuid, buf, sep); } -#ifndef MDASSEMBLE int check_ext2(int fd, char *name) { /* @@ -508,7 +702,8 @@ int check_ext2(int fd, char *name) */ unsigned char sb[1024]; time_t mtime; - int size, bsize; + unsigned long long size; + int bsize; if (lseek(fd, 1024,0)!= 1024) return 0; if (read(fd, sb, 1024)!= 1024) @@ -519,10 +714,10 @@ int check_ext2(int fd, char *name) mtime = sb[44]|(sb[45]|(sb[46]|sb[47]<<8)<<8)<<8; bsize = sb[24]|(sb[25]|(sb[26]|sb[27]<<8)<<8)<<8; size = sb[4]|(sb[5]|(sb[6]|sb[7]<<8)<<8)<<8; + size <<= bsize; pr_err("%s appears to contain an ext2fs file system\n", name); - fprintf(stderr," size=%dK mtime=%s", - size*(1<ss->load_super(st, fd, name); - /* Looks like a raid array .. */ - pr_err("%s appears to be part of a raid array:\n", - name); - st->ss->getinfo_super(st, &info, NULL); - st->ss->free_super(st); - crtime = info.array.ctime; - level = map_num(pers, info.array.level); - if (!level) level = "-unknown-"; - fprintf(stderr, " level=%s devices=%d ctime=%s", - level, info.array.raid_disks, ctime(&crtime)); + if (st->ss->add_to_super != NULL) { + st->ss->load_super(st, fd, name); + /* Looks like a raid array .. 
*/ + pr_err("%s appears to be part of a raid array:\n", name); + st->ss->getinfo_super(st, &info, NULL); + st->ss->free_super(st); + crtime = info.array.ctime; + level = map_num(pers, info.array.level); + if (!level) + level = "-unknown-"; + cont_err("level=%s devices=%d ctime=%s", + level, info.array.raid_disks, ctime(&crtime)); + } else { + /* Looks like GPT or MBR */ + pr_err("partition table exists on %s\n", name); + } + return 1; +} + +int fstat_is_blkdev(int fd, char *devname, dev_t *rdev) +{ + struct stat stb; + + if (fstat(fd, &stb) != 0) { + pr_err("fstat failed for %s: %s\n", devname, strerror(errno)); + return 0; + } + if ((S_IFMT & stb.st_mode) != S_IFBLK) { + pr_err("%s is not a block device.\n", devname); + return 0; + } + if (rdev) + *rdev = stb.st_rdev; + return 1; +} + +int stat_is_blkdev(char *devname, dev_t *rdev) +{ + struct stat stb; + + if (stat(devname, &stb) != 0) { + pr_err("stat failed for %s: %s\n", devname, strerror(errno)); + return 0; + } + if ((S_IFMT & stb.st_mode) != S_IFBLK) { + pr_err("%s is not a block device.\n", devname); + return 0; + } + if (rdev) + *rdev = stb.st_rdev; return 1; } @@ -592,7 +826,6 @@ int ask(char *mesg) pr_err("assuming 'no'\n"); return 0; } -#endif /* MDASSEMBLE */ int is_standard(char *dev, int *nump) { @@ -652,10 +885,9 @@ unsigned long calc_csum(void *super, int bytes) return csum; } -#ifndef MDASSEMBLE char *human_size(long long bytes) { - static char buf[30]; + static char buf[47]; /* We convert bytes to either centi-M{ega,ibi}bytes or * centi-G{igi,ibi}bytes, with appropriate rounding, @@ -669,17 +901,15 @@ char *human_size(long long bytes) if (bytes < 5000*1024) buf[0] = 0; else if (bytes < 2*1024LL*1024LL*1024LL) { - long cMiB = (bytes / ( (1LL<<20) / 200LL ) +1) /2; + long cMiB = (bytes * 200LL / (1LL<<20) + 1) / 2; long cMB = (bytes / ( 1000000LL / 200LL ) +1) /2; snprintf(buf, sizeof(buf), " (%ld.%02ld MiB %ld.%02ld MB)", - cMiB/100 , cMiB % 100, - cMB/100, cMB % 100); + cMiB/100, cMiB % 100, 
cMB/100, cMB % 100); } else { - long cGiB = (bytes / ( (1LL<<30) / 200LL ) +1) /2; + long cGiB = (bytes * 200LL / (1LL<<30) +1) / 2; long cGB = (bytes / (1000000000LL/200LL ) +1) /2; snprintf(buf, sizeof(buf), " (%ld.%02ld GiB %ld.%02ld GB)", - cGiB/100 , cGiB % 100, - cGB/100, cGB % 100); + cGiB/100, cGiB % 100, cGB/100, cGB % 100); } return buf; } @@ -704,24 +934,24 @@ char *human_size_brief(long long bytes, int prefix) buf[0] = 0; else if (prefix == IEC) { if (bytes < 2*1024LL*1024LL*1024LL) { - long cMiB = (bytes / ( (1LL<<20) / 200LL ) +1) /2; + long cMiB = (bytes * 200LL / (1LL<<20) +1) /2; snprintf(buf, sizeof(buf), "%ld.%02ldMiB", - cMiB/100 , cMiB % 100); + cMiB/100, cMiB % 100); } else { - long cGiB = (bytes / ( (1LL<<30) / 200LL ) +1) /2; + long cGiB = (bytes * 200LL / (1LL<<30) +1) /2; snprintf(buf, sizeof(buf), "%ld.%02ldGiB", - cGiB/100 , cGiB % 100); + cGiB/100, cGiB % 100); } } else if (prefix == JEDEC) { if (bytes < 2*1024LL*1024LL*1024LL) { long cMB = (bytes / ( 1000000LL / 200LL ) +1) /2; snprintf(buf, sizeof(buf), "%ld.%02ldMB", - cMB/100, cMB % 100); + cMB/100, cMB % 100); } else { long cGB = (bytes / (1000000000LL/200LL ) +1) /2; snprintf(buf, sizeof(buf), "%ld.%02ldGB", - cGB/100 , cGB % 100); + cGB/100, cGB % 100); } } else @@ -746,7 +976,6 @@ void print_r10_layout(int layout) if (near*far == 1) printf("NO REDUNDANCY"); } -#endif unsigned long long calc_array_size(int level, int raid_disks, int layout, int chunksize, unsigned long long devsize) @@ -777,8 +1006,7 @@ int get_data_disks(int level, int layout, int raid_disks) return data_disks; } - -int devnm2devid(char *devnm) +dev_t devnm2devid(char *devnm) { /* First look in /sys/block/$DEVNM/dev for %d:%d * If that fails, try parsing out a number @@ -814,7 +1042,6 @@ int devnm2devid(char *devnm) return 0; } -#if !defined(MDASSEMBLE) || defined(MDASSEMBLE) && defined(MDASSEMBLE_AUTO) char *get_md_name(char *devnm) { /* find /dev/md%d or /dev/md/%d or make a device /dev/.tmp.md%d */ @@ -830,21 
+1057,18 @@ char *get_md_name(char *devnm) if (strncmp(devnm, "md_", 3) == 0) { snprintf(devname, sizeof(devname), "/dev/md/%s", devnm + 3); - if (stat(devname, &stb) == 0 - && (S_IFMT&stb.st_mode) == S_IFBLK - && (stb.st_rdev == rdev)) + if (stat(devname, &stb) == 0 && + (S_IFMT&stb.st_mode) == S_IFBLK && (stb.st_rdev == rdev)) return devname; } snprintf(devname, sizeof(devname), "/dev/%s", devnm); - if (stat(devname, &stb) == 0 - && (S_IFMT&stb.st_mode) == S_IFBLK - && (stb.st_rdev == rdev)) + if (stat(devname, &stb) == 0 && (S_IFMT&stb.st_mode) == S_IFBLK && + (stb.st_rdev == rdev)) return devname; snprintf(devname, sizeof(devname), "/dev/md/%s", devnm+2); - if (stat(devname, &stb) == 0 - && (S_IFMT&stb.st_mode) == S_IFBLK - && (stb.st_rdev == rdev)) + if (stat(devname, &stb) == 0 && (S_IFMT&stb.st_mode) == S_IFBLK && + (stb.st_rdev == rdev)) return devname; dn = map_dev(major(rdev), minor(rdev), 0); @@ -855,9 +1079,8 @@ char *get_md_name(char *devnm) if (errno != EEXIST) return NULL; - if (stat(devname, &stb) == 0 - && (S_IFMT&stb.st_mode) == S_IFBLK - && (stb.st_rdev == rdev)) + if (stat(devname, &stb) == 0 && (S_IFMT&stb.st_mode) == S_IFBLK && + (stb.st_rdev == rdev)) return devname; unlink(devname); return NULL; @@ -869,58 +1092,30 @@ void put_md_name(char *name) unlink(name); } -char *find_free_devnm(int use_partitions) +int get_maj_min(char *dev, int *major, int *minor) { - static char devnm[32]; - int devnum; - for (devnum = 127; devnum != 128; - devnum = devnum ? devnum-1 : (1<<20)-1) { - - if (use_partitions) - sprintf(devnm, "md_d%d", devnum); - else - sprintf(devnm, "md%d", devnum); - if (mddev_busy(devnm)) - continue; - if (!conf_name_is_free(devnm)) - continue; - if (!use_udev()) { - /* make sure it is new to /dev too, at least as a - * non-standard */ - int devid = devnm2devid(devnm); - if (devid) { - char *dn = map_dev(major(devid), - minor(devid), 0); - if (dn && ! 
is_standard(dn, NULL)) - continue; - } - } - break; - } - if (devnum == 128) - return NULL; - return devnm; + char *e; + *major = strtoul(dev, &e, 0); + return (e > dev && *e == ':' && e[1] && + (*minor = strtoul(e+1, &e, 0)) >= 0 && + *e == 0); } -#endif /* !defined(MDASSEMBLE) || defined(MDASSEMBLE) && defined(MDASSEMBLE_AUTO) */ int dev_open(char *dev, int flags) { /* like 'open', but if 'dev' matches %d:%d, create a temp * block device and open that */ - char *e; int fd = -1; char devname[32]; int major; int minor; - if (!dev) return -1; + if (!dev) + return -1; flags |= O_DIRECT; - major = strtoul(dev, &e, 0); - if (e > dev && *e == ':' && e[1] && - (minor = strtoul(e+1, &e, 0)) >= 0 && - *e == 0) { + if (get_maj_min(dev, &major, &minor)) { snprintf(devname, sizeof(devname), "/dev/.tmp.md.%d:%d:%d", (int)getpid(), major, minor); if (mknod(devname, S_IFBLK|0600, makedev(major, minor)) == 0) { @@ -929,9 +1124,11 @@ int dev_open(char *dev, int flags) } if (fd < 0) { /* Try /tmp as /dev appear to be read-only */ - snprintf(devname, sizeof(devname), "/tmp/.tmp.md.%d:%d:%d", + snprintf(devname, sizeof(devname), + "/tmp/.tmp.md.%d:%d:%d", (int)getpid(), major, minor); - if (mknod(devname, S_IFBLK|0600, makedev(major, minor)) == 0) { + if (mknod(devname, S_IFBLK|0600, + makedev(major, minor)) == 0) { fd = open(devname, flags); unlink(devname); } @@ -943,7 +1140,7 @@ int dev_open(char *dev, int flags) int open_dev_flags(char *devnm, int flags) { - int devid; + dev_t devid; char buf[20]; devid = devnm2devid(devnm); @@ -961,10 +1158,11 @@ int open_dev_excl(char *devnm) char buf[20]; int i; int flags = O_RDWR; - int devid = devnm2devid(devnm); + dev_t devid = devnm2devid(devnm); + long delay = 1000; sprintf(buf, "%d:%d", major(devid), minor(devid)); - for (i = 0 ; i < 25 ; i++) { + for (i = 0; i < 25; i++) { int fd = dev_open(buf, flags|O_EXCL); if (fd >= 0) return fd; @@ -974,7 +1172,9 @@ int open_dev_excl(char *devnm) } if (errno != EBUSY) return fd; - usleep(200000); + 
usleep(delay); + if (delay < 200000) + delay *= 2; } return -1; } @@ -997,21 +1197,24 @@ void wait_for(char *dev, int fd) { int i; struct stat stb_want; + long delay = 1000; if (fstat(fd, &stb_want) != 0 || (stb_want.st_mode & S_IFMT) != S_IFBLK) return; - for (i = 0 ; i < 25 ; i++) { + for (i = 0; i < 25; i++) { struct stat stb; if (stat(dev, &stb) == 0 && (stb.st_mode & S_IFMT) == S_IFBLK && (stb.st_rdev == stb_want.st_rdev)) return; - usleep(200000); + usleep(delay); + if (delay < 200000) + delay *= 2; } if (i == 25) - dprintf("%s: timeout waiting for %s\n", __func__, dev); + pr_err("timeout waiting for %s\n", dev); } struct superswitch *superlist[] = @@ -1019,9 +1222,8 @@ struct superswitch *superlist[] = &super0, &super1, &super_ddf, &super_imsm, &mbr, &gpt, - NULL }; - -#if !defined(MDASSEMBLE) || defined(MDASSEMBLE) && defined(MDASSEMBLE_AUTO) + NULL +}; struct supertype *super_by_fd(int fd, char **subarrayp) { @@ -1043,7 +1245,7 @@ struct supertype *super_by_fd(int fd, char **subarrayp) minor = sra->array.minor_version; verstr = sra->text_version; } else { - if (ioctl(fd, GET_ARRAY_INFO, &array)) + if (md_get_array_info(fd, &array)) array.major_version = array.minor_version = 0; vers = array.major_version; minor = array.minor_version; @@ -1063,8 +1265,7 @@ struct supertype *super_by_fd(int fd, char **subarrayp) subarray = xstrdup(subarray); } strcpy(container, dev); - if (sra) - sysfs_free(sra); + sysfs_free(sra); sra = sysfs_read(-1, container, GET_VERSION); if (sra && sra->text_version[0]) verstr = sra->text_version; @@ -1072,11 +1273,10 @@ struct supertype *super_by_fd(int fd, char **subarrayp) verstr = "-no-metadata-"; } - for (i = 0; st == NULL && superlist[i] ; i++) + for (i = 0; st == NULL && superlist[i]; i++) st = superlist[i]->match_metadata_desc(verstr); - if (sra) - sysfs_free(sra); + sysfs_free(sra); if (st) { st->sb = NULL; if (subarrayp) @@ -1088,7 +1288,6 @@ struct supertype *super_by_fd(int fd, char **subarrayp) return st; } -#endif /* 
!defined(MDASSEMBLE) || defined(MDASSEMBLE) && defined(MDASSEMBLE_AUTO) */ int dev_size_from_id(dev_t id, unsigned long long *size) { @@ -1107,6 +1306,23 @@ int dev_size_from_id(dev_t id, unsigned long long *size) return 0; } +int dev_sector_size_from_id(dev_t id, unsigned int *size) +{ + char buf[20]; + int fd; + + sprintf(buf, "%d:%d", major(id), minor(id)); + fd = dev_open(buf, O_RDONLY); + if (fd < 0) + return 0; + if (get_dev_sector_size(fd, NULL, size)) { + close(fd); + return 1; + } + close(fd); + return 0; +} + struct supertype *dup_super(struct supertype *orig) { struct supertype *st; @@ -1118,6 +1334,7 @@ struct supertype *dup_super(struct supertype *orig) st->max_devs = orig->max_devs; st->minor_version = orig->minor_version; st->ignore_hw_compat = orig->ignore_hw_compat; + st->data_offset = orig->data_offset; st->sb = NULL; st->info = NULL; return st; @@ -1130,14 +1347,14 @@ struct supertype *guess_super_type(int fd, enum guess_types guess_type) */ struct superswitch *ss; struct supertype *st; - time_t besttime = 0; + unsigned int besttime = 0; int bestsuper = -1; int i; st = xcalloc(1, sizeof(*st)); st->container_devnm[0] = 0; - for (i = 0 ; superlist[i]; i++) { + for (i = 0; superlist[i]; i++) { int rv; ss = superlist[i]; if (guess_type == guess_array && ss->add_to_super == NULL) @@ -1191,7 +1408,7 @@ int get_dev_size(int fd, char *dname, unsigned long long *sizep) ldsize <<= 9; } else { if (dname) - pr_err("Cannot get size of %s: %s\b", + pr_err("Cannot get size of %s: %s\n", dname, strerror(errno)); return 0; } @@ -1200,14 +1417,35 @@ int get_dev_size(int fd, char *dname, unsigned long long *sizep) return 1; } +/* Return sector size of device in bytes */ +int get_dev_sector_size(int fd, char *dname, unsigned int *sectsizep) +{ + unsigned int sectsize; + + if (ioctl(fd, BLKSSZGET, &sectsize) != 0) { + if (dname) + pr_err("Cannot get sector size of %s: %s\n", + dname, strerror(errno)); + return 0; + } + + *sectsizep = sectsize; + return 1; +} + /* Return 
true if this can only be a container, not a member device. * i.e. is and md device and size is zero */ int must_be_container(int fd) { + struct mdinfo *mdi; unsigned long long size; - if (md_get_version(fd) < 0) + + mdi = sysfs_read(fd, NULL, GET_VERSION); + if (!mdi) return 0; + sysfs_free(mdi); + if (get_dev_size(fd, NULL, &size) == 0) return 1; if (size == 0) @@ -1229,12 +1467,15 @@ static int get_gpt_last_partition_end(int fd, unsigned long long *endofpart) unsigned long long curr_part_end; unsigned all_partitions, entry_size; unsigned part_nr; + unsigned int sector_size = 0; *endofpart = 0; BUILD_BUG_ON(sizeof(gpt) != 512); /* skip protective MBR */ - lseek(fd, 512, SEEK_SET); + if (!get_dev_sector_size(fd, NULL, &sector_size)) + return 0; + lseek(fd, sector_size, SEEK_SET); /* read GPT header */ if (read(fd, &gpt, 512) != 512) return 0; @@ -1254,6 +1495,8 @@ static int get_gpt_last_partition_end(int fd, unsigned long long *endofpart) part = (struct GPT_part_entry *)buf; + /* set offset to third block (GPT entries) */ + lseek(fd, sector_size*2, SEEK_SET); for (part_nr = 0; part_nr < all_partitions; part_nr++) { /* read partition entry */ if (read(fd, buf, entry_size) != (ssize_t)entry_size) @@ -1279,9 +1522,9 @@ static int get_gpt_last_partition_end(int fd, unsigned long long *endofpart) static int get_last_partition_end(int fd, unsigned long long *endofpart) { struct MBR boot_sect; - struct MBR_part_record *part; unsigned long long curr_part_end; unsigned part_nr; + unsigned int sector_size; int retval = 0; *endofpart = 0; @@ -1296,26 +1539,34 @@ static int get_last_partition_end(int fd, unsigned long long *endofpart) if (boot_sect.magic == MBR_SIGNATURE_MAGIC) { retval = 1; /* found the correct signature */ - part = boot_sect.parts; for (part_nr = 0; part_nr < MBR_PARTITIONS; part_nr++) { + /* + * Have to make every access through boot_sect rather + * than using a pointer to the partition table (or an + * entry), since the entries are not properly aligned. 
+ */ + /* check for GPT type */ - if (part->part_type == MBR_GPT_PARTITION_TYPE) { + if (boot_sect.parts[part_nr].part_type == + MBR_GPT_PARTITION_TYPE) { retval = get_gpt_last_partition_end(fd, endofpart); break; } /* check the last used lba for the current partition */ - curr_part_end = __le32_to_cpu(part->first_sect_lba) + - __le32_to_cpu(part->blocks_num); + curr_part_end = + __le32_to_cpu(boot_sect.parts[part_nr].first_sect_lba) + + __le32_to_cpu(boot_sect.parts[part_nr].blocks_num); if (curr_part_end > *endofpart) *endofpart = curr_part_end; - - part++; } } else { /* Unknown partition table */ retval = -1; } + /* calculate number of 512-byte blocks */ + if (get_dev_sector_size(fd, NULL, &sector_size)) + *endofpart *= (sector_size / 512); abort: return retval; } @@ -1327,9 +1578,8 @@ int check_partitions(int fd, char *dname, unsigned long long freesize, * Check where the last partition ends */ unsigned long long endofpart; - int ret; - if ((ret = get_last_partition_end(fd, &endofpart)) > 0) { + if (get_last_partition_end(fd, &endofpart) > 0) { /* There appears to be a partition table here */ if (freesize == 0) { /* partitions will not be visible in new device */ @@ -1594,7 +1844,7 @@ int add_disk(int mdfd, struct supertype *st, { /* Add a device to an array, in one of 2 ways. 
*/ int rv; -#ifndef MDASSEMBLE + if (st->ss->external) { if (info->disk.state & (1<recovery_start = MaxSector; @@ -1614,7 +1864,6 @@ int add_disk(int mdfd, struct supertype *st, } } } else -#endif rv = ioctl(mdfd, ADD_NEW_DISK, &info->disk); return rv; } @@ -1623,39 +1872,63 @@ int remove_disk(int mdfd, struct supertype *st, struct mdinfo *sra, struct mdinfo *info) { int rv; + /* Remove the disk given by 'info' from the array */ -#ifndef MDASSEMBLE if (st->ss->external) rv = sysfs_set_str(sra, info, "slot", "none"); else -#endif rv = ioctl(mdfd, HOT_REMOVE_DISK, makedev(info->disk.major, info->disk.minor)); return rv; } +int hot_remove_disk(int mdfd, unsigned long dev, int force) +{ + int cnt = force ? 500 : 5; + int ret; + + /* HOT_REMOVE_DISK can fail with EBUSY if there are + * outstanding IO requests to the device. + * In this case, it can be helpful to wait a little while, + * up to 5 seconds if 'force' is set, or 50 msec if not. + */ + while ((ret = ioctl(mdfd, HOT_REMOVE_DISK, dev)) == -1 && + errno == EBUSY && + cnt-- > 0) + usleep(10000); + + return ret; +} + +int sys_hot_remove_disk(int statefd, int force) +{ + int cnt = force ? 500 : 5; + int ret; + + while ((ret = write(statefd, "remove", 6)) == -1 && + errno == EBUSY && + cnt-- > 0) + usleep(10000); + return ret == 6 ? 0 : -1; +} + int set_array_info(int mdfd, struct supertype *st, struct mdinfo *info) { /* Initialise kernel's knowledge of array. 
* This varies between externally managed arrays * and older kernels */ - int vers = md_get_version(mdfd); + mdu_array_info_t inf; int rv; -#ifndef MDASSEMBLE if (st->ss->external) - rv = sysfs_set_array(info, vers); - else -#endif - if ((vers % 100) >= 1) { /* can use different versions */ - mdu_array_info_t inf; - memset(&inf, 0, sizeof(inf)); - inf.major_version = info->array.major_version; - inf.minor_version = info->array.minor_version; - rv = ioctl(mdfd, SET_ARRAY_INFO, &inf); - } else - rv = ioctl(mdfd, SET_ARRAY_INFO, NULL); + return sysfs_set_array(info, 9003); + + memset(&inf, 0, sizeof(inf)); + inf.major_version = info->array.major_version; + inf.minor_version = info->array.minor_version; + rv = md_set_array_info(mdfd, &inf); + return rv; } @@ -1713,8 +1986,8 @@ int start_mdmon(char *devnm) char pathbuf[1024]; char *paths[4] = { pathbuf, - "/sbin/mdmon", - "mdmon", + BINDIR "/mdmon", + "./mdmon", NULL }; @@ -1735,31 +2008,38 @@ int start_mdmon(char *devnm) pathbuf[0] = '\0'; /* First try to run systemctl */ - switch(fork()) { - case 0: - /* FIXME yuk. CLOSE_EXEC?? */ - skipped = 0; - for (i = 3; skipped < 20; i++) - if (close(i) < 0) - skipped++; - else - skipped = 0; - - snprintf(pathbuf, sizeof(pathbuf), "mdmon@%s.service", - devnm); - status = execl("/usr/bin/systemctl", "systemctl", "start", - pathbuf, NULL); - status = execl("/bin/systemctl", "systemctl", "start", - pathbuf, NULL); - exit(1); - case -1: pr_err("cannot run mdmon. " - "Array remains readonly\n"); - return -1; - default: /* parent - good */ - pid = wait(&status); - if (pid >= 0 && status == 0) - return 0; - } + if (!check_env("MDADM_NO_SYSTEMCTL")) + switch(fork()) { + case 0: + /* FIXME yuk. CLOSE_EXEC?? */ + skipped = 0; + for (i = 3; skipped < 20; i++) + if (close(i) < 0) + skipped++; + else + skipped = 0; + + /* Don't want to see error messages from + * systemctl. If the service doesn't exist, + * we start mdmon ourselves. 
+ */ + close(2); + open("/dev/null", O_WRONLY); + snprintf(pathbuf, sizeof(pathbuf), "mdmon@%s.service", + devnm); + status = execl("/usr/bin/systemctl", "systemctl", + "start", + pathbuf, NULL); + status = execl("/bin/systemctl", "systemctl", "start", + pathbuf, NULL); + exit(1); + case -1: pr_err("cannot run mdmon. Array remains readonly\n"); + return -1; + default: /* parent - good */ + pid = wait(&status); + if (pid >= 0 && status == 0) + return 0; + } /* That failed, try running mdmon directly */ switch(fork()) { @@ -1774,34 +2054,22 @@ int start_mdmon(char *devnm) for (i = 0; paths[i]; i++) if (paths[i][0]) { - execl(paths[i], "mdmon", + execl(paths[i], paths[i], devnm, NULL); } exit(1); - case -1: pr_err("cannot run mdmon. " - "Array remains readonly\n"); + case -1: pr_err("cannot run mdmon. Array remains readonly\n"); return -1; default: /* parent - good */ pid = wait(&status); if (pid < 0 || status != 0) { - pr_err("failed to launch mdmon. " - "Array remains readonly\n"); + pr_err("failed to launch mdmon. Array remains readonly\n"); return -1; } } return 0; } -int check_env(char *name) -{ - char *val = getenv(name); - - if (val && atoi(val) == 1) - return 1; - - return 0; -} - __u32 random32(void) { __u32 rv; @@ -1813,7 +2081,27 @@ __u32 random32(void) return rv; } -#ifndef MDASSEMBLE +void random_uuid(__u8 *buf) +{ + int fd, i, len; + __u32 r[4]; + + fd = open("/dev/urandom", O_RDONLY); + if (fd < 0) + goto use_random; + len = read(fd, buf, 16); + close(fd); + if (len != 16) + goto use_random; + + return; + +use_random: + for (i = 0; i < 4; i++) + r[i] = random(); + memcpy(buf, r, 16); +} + int flush_metadata_updates(struct supertype *st) { int sfd; @@ -1855,7 +2143,6 @@ void append_metadata_update(struct supertype *st, void *buf, int len) *st->update_tail = mu; st->update_tail = &mu->next; } -#endif /* MDASSEMBLE */ #ifdef __TINYC__ /* tinyc doesn't optimize this check in ioctl.h out ... 
*/ @@ -1867,8 +2154,7 @@ int experimental(void) if (check_env("MDADM_EXPERIMENTAL")) return 1; else { - pr_err("To use this feature MDADM_EXPERIMENTAL" - " environment variable has to be defined.\n"); + pr_err("To use this feature MDADM_EXPERIMENTAL environment variable has to be defined.\n"); return 0; } } @@ -1879,7 +2165,7 @@ int experimental(void) * if spare_group given add it to domains of each spare * metadata allows to test domains using metadata of destination array */ struct mdinfo *container_choose_spares(struct supertype *st, - unsigned long long min_size, + struct spare_criteria *criteria, struct domainlist *domlist, char *spare_group, const char *metadata, int get_one) @@ -1901,12 +2187,24 @@ struct mdinfo *container_choose_spares(struct supertype *st, if (d->disk.state == 0) { /* check if size is acceptable */ unsigned long long dev_size; + unsigned int dev_sector_size; + int size_valid = 0; + int sector_size_valid = 0; + dev_t dev = makedev(d->disk.major,d->disk.minor); - if (!min_size || + if (!criteria->min_size || (dev_size_from_id(dev, &dev_size) && - dev_size >= min_size)) - found = 1; + dev_size >= criteria->min_size)) + size_valid = 1; + + if (!criteria->sector_size || + (dev_sector_size_from_id(dev, &dev_sector_size) && + criteria->sector_size == dev_sector_size)) + sector_size_valid = 1; + + found = size_valid && sector_size_valid; + /* check if domain matches */ if (found && domlist) { struct dev_policy *pol = devid_policy(dev); @@ -1954,3 +2252,174 @@ int compare_paths (char* path1, char* path2) return 0; return 1; } + +/* Make sure we can open as many devices as needed */ +void enable_fds(int devices) +{ + unsigned int fds = 20 + devices; + struct rlimit lim; + if (getrlimit(RLIMIT_NOFILE, &lim) != 0 || lim.rlim_cur >= fds) + return; + if (lim.rlim_max < fds) + lim.rlim_max = fds; + lim.rlim_cur = fds; + setrlimit(RLIMIT_NOFILE, &lim); +} + +int in_initrd(void) +{ + /* This is based on similar function in systemd. 
*/ + struct statfs s; + /* statfs.f_type is signed long on s390x and MIPS, causing all + sorts of sign extension problems with RAMFS_MAGIC being + defined as 0x858458f6 */ + return statfs("/", &s) >= 0 && + ((unsigned long)s.f_type == TMPFS_MAGIC || + ((unsigned long)s.f_type & 0xFFFFFFFFUL) == + ((unsigned long)RAMFS_MAGIC & 0xFFFFFFFFUL)); +} + +void reopen_mddev(int mdfd) +{ + /* Re-open without any O_EXCL, but keep + * the same fd + */ + char *devnm; + int fd; + devnm = fd2devnm(mdfd); + close(mdfd); + fd = open_dev(devnm); + if (fd >= 0 && fd != mdfd) + dup2(fd, mdfd); +} + +static struct cmap_hooks *cmap_hooks = NULL; +static int is_cmap_hooks_ready = 0; + +void set_cmap_hooks(void) +{ + cmap_hooks = xmalloc(sizeof(struct cmap_hooks)); + cmap_hooks->cmap_handle = dlopen("libcmap.so.4", RTLD_NOW | RTLD_LOCAL); + if (!cmap_hooks->cmap_handle) + return; + + cmap_hooks->initialize = + dlsym(cmap_hooks->cmap_handle, "cmap_initialize"); + cmap_hooks->get_string = + dlsym(cmap_hooks->cmap_handle, "cmap_get_string"); + cmap_hooks->finalize = dlsym(cmap_hooks->cmap_handle, "cmap_finalize"); + + if (!cmap_hooks->initialize || !cmap_hooks->get_string || + !cmap_hooks->finalize) + dlclose(cmap_hooks->cmap_handle); + else + is_cmap_hooks_ready = 1; +} + +int get_cluster_name(char **cluster_name) +{ + int rv = -1; + cmap_handle_t handle; + + if (!is_cmap_hooks_ready) + return rv; + + rv = cmap_hooks->initialize(&handle); + if (rv != CS_OK) + goto out; + + rv = cmap_hooks->get_string(handle, "totem.cluster_name", cluster_name); + if (rv != CS_OK) { + free(*cluster_name); + rv = -1; + goto name_err; + } + + rv = 0; +name_err: + cmap_hooks->finalize(handle); +out: + return rv; +} + +void set_dlm_hooks(void) +{ + dlm_hooks = xmalloc(sizeof(struct dlm_hooks)); + dlm_hooks->dlm_handle = dlopen("libdlm_lt.so.3", RTLD_NOW | RTLD_LOCAL); + if (!dlm_hooks->dlm_handle) + return; + + dlm_hooks->open_lockspace = + dlsym(dlm_hooks->dlm_handle, "dlm_open_lockspace"); + 
dlm_hooks->create_lockspace = + dlsym(dlm_hooks->dlm_handle, "dlm_create_lockspace"); + dlm_hooks->release_lockspace = + dlsym(dlm_hooks->dlm_handle, "dlm_release_lockspace"); + dlm_hooks->ls_lock = dlsym(dlm_hooks->dlm_handle, "dlm_ls_lock"); + dlm_hooks->ls_unlock_wait = + dlsym(dlm_hooks->dlm_handle, "dlm_ls_unlock_wait"); + dlm_hooks->ls_get_fd = dlsym(dlm_hooks->dlm_handle, "dlm_ls_get_fd"); + dlm_hooks->dispatch = dlsym(dlm_hooks->dlm_handle, "dlm_dispatch"); + + if (!dlm_hooks->open_lockspace || !dlm_hooks->create_lockspace || + !dlm_hooks->ls_lock || !dlm_hooks->ls_unlock_wait || + !dlm_hooks->release_lockspace || !dlm_hooks->ls_get_fd || + !dlm_hooks->dispatch) + dlclose(dlm_hooks->dlm_handle); + else + is_dlm_hooks_ready = 1; +} + +void set_hooks(void) +{ + set_dlm_hooks(); + set_cmap_hooks(); +} + +int zero_disk_range(int fd, unsigned long long sector, size_t count) +{ + int ret = 0; + int fd_zero; + void *addr = NULL; + size_t written = 0; + size_t len = count * 512; + ssize_t n; + + fd_zero = open("/dev/zero", O_RDONLY); + if (fd_zero < 0) { + pr_err("Cannot open /dev/zero\n"); + return -1; + } + + if (lseek64(fd, sector * 512, SEEK_SET) < 0) { + ret = -errno; + pr_err("Failed to seek offset for zeroing\n"); + goto out; + } + + addr = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd_zero, 0); + + if (addr == MAP_FAILED) { + ret = -errno; + pr_err("Mapping /dev/zero failed\n"); + goto out; + } + + do { + n = write(fd, addr + written, len - written); + if (n < 0) { + if (errno == EINTR) + continue; + ret = -errno; + pr_err("Zeroing disk range failed\n"); + break; + } + written += n; + } while (written != len); + + munmap(addr, len); + +out: + close(fd_zero); + return ret; +}