X-Git-Url: http://git.ipfire.org/?p=thirdparty%2Fmdadm.git;a=blobdiff_plain;f=sysfs.c;h=8fdb52998409303e3874f84b6fe5a1eea8820aee;hp=ea7d52f6c025be17a47a1774e3bd5bec843fe261;hb=d23534e4646313a67296b295666d165a87bb2c92;hpb=355726fa014fb9d656e6af92f0305e55af58de09 diff --git a/sysfs.c b/sysfs.c index ea7d52f6..8fdb5299 100644 --- a/sysfs.c +++ b/sysfs.c @@ -2,7 +2,7 @@ * sysfs - extract md related information from sysfs. Part of: * mdadm - manage Linux "md" devices aka RAID arrays. * - * Copyright (C) 2006 Neil Brown + * Copyright (C) 2006-2009 Neil Brown * * * This program is free software; you can redistribute it and/or modify @@ -25,6 +25,7 @@ #include "mdadm.h" #include +#include int load_sys(char *path, char *buf) { @@ -37,7 +38,7 @@ int load_sys(char *path, char *buf) if (n <0 || n >= 1024) return -1; buf[n] = 0; - if (buf[n-1] == '\n') + if (n && buf[n-1] == '\n') buf[n-1] = 0; return 0; } @@ -59,26 +60,44 @@ void sysfs_free(struct mdinfo *sra) int sysfs_open(int devnum, char *devname, char *attr) { char fname[50]; - char sys_name[16]; int fd; - if (devnum >= 0) - sprintf(sys_name, "md%d", devnum); - else - sprintf(sys_name, "md_d%d", - -1-devnum); + char *mdname = devnum2devname(devnum); - sprintf(fname, "/sys/block/%s/md/", sys_name); + if (!mdname) + return -1; + + sprintf(fname, "/sys/block/%s/md/", mdname); if (devname) { strcat(fname, devname); strcat(fname, "/"); } strcat(fname, attr); fd = open(fname, O_RDWR); - if (fd < 0 && errno == -EACCES) + if (fd < 0 && errno == EACCES) fd = open(fname, O_RDONLY); + free(mdname); return fd; } +void sysfs_init(struct mdinfo *mdi, int fd, int devnum) +{ + mdi->sys_name[0] = 0; + if (fd >= 0) { + mdu_version_t vers; + if (ioctl(fd, RAID_VERSION, &vers) != 0) + return; + devnum = fd2devnum(fd); + } + if (devnum == NoMdDev) + return; + if (devnum >= 0) + sprintf(mdi->sys_name, "md%d", devnum); + else + sprintf(mdi->sys_name, "md_d%d", + -1-devnum); +} + + struct mdinfo *sysfs_read(int fd, int devnum, unsigned long options) { /* Longest possible name in sysfs, mounted at /sys, is @@ -98,26 +117,13 @@ struct mdinfo *sysfs_read(int fd, int devnum, unsigned long options) sra = malloc(sizeof(*sra)); if (sra == NULL) return sra; - sra->next = NULL; - - if (fd >= 0) { - struct stat stb; - mdu_version_t vers; - if (fstat(fd, &stb)) return NULL; - if (ioctl(fd, RAID_VERSION, &vers) != 0) - return NULL; - if (major(stb.st_rdev)==9) - sprintf(sra->sys_name, "md%d", (int)minor(stb.st_rdev)); - else - sprintf(sra->sys_name, "md_d%d", - (int)minor(stb.st_rdev)>>MdpMinorShift); - } else { - if (devnum >= 0) - sprintf(sra->sys_name, "md%d", devnum); - else - sprintf(sra->sys_name, "md_d%d", - -1-devnum); + memset(sra, 0, sizeof(*sra)); + sysfs_init(sra, fd, devnum); + if (sra->sys_name[0] == 0) { + free(sra); + return NULL; } + sprintf(fname, "/sys/block/%s/md/", sra->sys_name); base = fname + strlen(fname); @@ -134,10 +140,12 @@ struct mdinfo *sysfs_read(int fd, int devnum, unsigned long options) sra->array.major_version = -1; sra->array.minor_version = -2; strcpy(sra->text_version, buf+9); - } else + } else { sscanf(buf, "%d.%d", &sra->array.major_version, &sra->array.minor_version); + strcpy(sra->text_version, buf); + } } if (options & GET_LEVEL) { strcpy(base, "level"); @@ -157,6 +165,12 @@ struct mdinfo *sysfs_read(int fd, int devnum, unsigned long options) goto abort; sra->array.raid_disks = strtoul(buf, NULL, 0); } + if (options & GET_DEGRADED) { + strcpy(base, "degraded"); + if (load_sys(fname, buf)) + goto abort; + sra->array.failed_disks = strtoul(buf, NULL, 0); + } if (options & GET_COMPONENT) { strcpy(base, "component_size"); if (load_sys(fname, buf)) @@ -183,6 +197,35 @@ struct mdinfo *sysfs_read(int fd, int devnum, unsigned long options) goto abort; sra->mismatch_cnt = strtoul(buf, NULL, 0); } + if (options & GET_SAFEMODE) { + int scale = 1; + int dot = 0; + int i; + unsigned long msec; + size_t len; + + strcpy(base, "safe_mode_delay"); + if (load_sys(fname, buf)) + goto abort; + + /* remove a period, and count digits after it */ + len = strlen(buf); + for (i = 0; i < len; i++) { + if (dot) { + if (isdigit(buf[i])) { + buf[i-1] = buf[i]; + scale *= 10; + } + buf[i] = 0; + } else if (buf[i] == '.') { + dot=1; + buf[i] = 0; + } + } + msec = strtoul(buf, NULL, 10); + msec = (msec * 1000) / scale; + sra->safe_mode_delay = msec; + } if (! (options & GET_DEVS)) return sra; @@ -206,22 +249,57 @@ struct mdinfo *sysfs_read(int fd, int devnum, unsigned long options) dev = malloc(sizeof(*dev)); if (!dev) goto abort; - dev->next = sra->devs; - sra->devs = dev; - strcpy(dev->sys_name, de->d_name); /* Always get slot, major, minor */ strcpy(dbase, "slot"); - if (load_sys(fname, buf)) - goto abort; + if (load_sys(fname, buf)) { + /* hmm... unable to read 'slot' maybe the device + * is going away? + */ + strcpy(dbase, "block"); + if (readlink(fname, buf, sizeof(buf)) < 0 && + errno != ENAMETOOLONG) { + /* ...yup device is gone */ + free(dev); + continue; + } else { + /* slot is unreadable but 'block' link + * still intact... something bad is happening + * so abort + */ + free(dev); + goto abort; + } + + } + strcpy(dev->sys_name, de->d_name); dev->disk.raid_disk = strtoul(buf, &ep, 10); if (*ep) dev->disk.raid_disk = -1; strcpy(dbase, "block/dev"); - if (load_sys(fname, buf)) - goto abort; + if (load_sys(fname, buf)) { + free(dev); + if (options & SKIP_GONE_DEVS) + continue; + else + goto abort; + } sscanf(buf, "%d:%d", &dev->disk.major, &dev->disk.minor); + /* special case check for block devices that can go 'offline' */ + if (options & SKIP_GONE_DEVS) { + strcpy(dbase, "block/device/state"); + if (load_sys(fname, buf) == 0 && + strncmp(buf, "offline", 7) == 0) { + free(dev); + continue; + } + } + + /* finally add this disk to the array */ + dev->next = sra->devs; + sra->devs = dev; + if (options & GET_OFFSET) { strcpy(dbase, "offset"); if (load_sys(fname, buf)) @@ -232,7 +310,7 @@ struct mdinfo *sysfs_read(int fd, int devnum, unsigned long options) strcpy(dbase, "size"); if (load_sys(fname, buf)) goto abort; - dev->component_size = strtoull(buf, NULL, 0); + dev->component_size = strtoull(buf, NULL, 0) * 2; } if (options & GET_STATE) { dev->disk.state = 0; @@ -263,6 +341,31 @@ struct mdinfo *sysfs_read(int fd, int devnum, unsigned long options) return NULL; } +int sysfs_attr_match(const char *attr, const char *str) +{ + /* See if attr, read from a sysfs file, matches + * str. They must either be the same, or attr can + * have a trailing newline or comma + */ + while (*attr && *str && *attr == *str) { + attr++; + str++; + } + + if (*str || (*attr && *attr != ',' && *attr != '\n')) + return 0; + return 1; +} + +int sysfs_match_word(const char *word, char **list) +{ + int n; + for (n=0; list[n]; n++) + if (sysfs_attr_match(word, list[n])) + break; + return n; +} + unsigned long long get_component_size(int fd) { /* Find out the component size of the array. @@ -276,7 +379,7 @@ unsigned long long get_component_size(int fd) char fname[50]; int n; if (fstat(fd, &stb)) return 0; - if (major(stb.st_rdev) == 9) + if (major(stb.st_rdev) != get_mdp_major()) sprintf(fname, "/sys/block/md%d/md/component_size", (int)minor(stb.st_rdev)); else @@ -299,6 +402,7 @@ int sysfs_set_str(struct mdinfo *sra, struct mdinfo *dev, char fname[50]; int n; int fd; + sprintf(fname, "/sys/block/%s/md/%s/%s", sra->sys_name, dev?dev->sys_name:"", name); fd = open(fname, O_WRONLY); @@ -306,8 +410,11 @@ int sysfs_set_str(struct mdinfo *sra, struct mdinfo *dev, return -1; n = write(fd, val, strlen(val)); close(fd); - if (n != strlen(val)) + if (n != strlen(val)) { + dprintf(Name ": failed to write '%s' to '%s' (%s)\n", + val, fname, strerror(errno)); return -1; + } return 0; } @@ -319,21 +426,44 @@ int sysfs_set_num(struct mdinfo *sra, struct mdinfo *dev, return sysfs_set_str(sra, dev, name, valstr); } -int sysfs_get_ll(struct mdinfo *sra, struct mdinfo *dev, - char *name, unsigned long long *val) +int sysfs_uevent(struct mdinfo *sra, char *event) { char fname[50]; - char buf[50]; int n; int fd; - char *ep; + + sprintf(fname, "/sys/block/%s/uevent", + sra->sys_name); + fd = open(fname, O_WRONLY); + if (fd < 0) + return -1; + n = write(fd, event, strlen(event)); + close(fd); + return 0; +} + +int sysfs_get_fd(struct mdinfo *sra, struct mdinfo *dev, + char *name) +{ + char fname[50]; + int fd; + sprintf(fname, "/sys/block/%s/md/%s/%s", sra->sys_name, dev?dev->sys_name:"", name); - fd = open(fname, O_RDONLY); + fd = open(fname, O_RDWR); if (fd < 0) - return -1; + fd = open(fname, O_RDONLY); + return fd; +} + +int sysfs_fd_get_ll(int fd, unsigned long long *val) +{ + char buf[50]; + int n; + char *ep; + + lseek(fd, 0, 0); n = read(fd, buf, sizeof(buf)); - close(fd); if (n <= 0) return -1; buf[n] = 0; @@ -343,22 +473,102 @@ int sysfs_get_ll(struct mdinfo *sra, struct mdinfo *dev, return 0; } -int sysfs_set_array(struct mdinfo *sra, - struct mdinfo *info) +int sysfs_get_ll(struct mdinfo *sra, struct mdinfo *dev, + char *name, unsigned long long *val) +{ + int n; + int fd; + + fd = sysfs_get_fd(sra, dev, name); + if (fd < 0) + return -1; + n = sysfs_fd_get_ll(fd, val); + close(fd); + return n; +} + +int sysfs_fd_get_str(int fd, char *val, int size) +{ + int n; + + lseek(fd, 0, 0); + n = read(fd, val, size); + if (n <= 0) + return -1; + val[n] = 0; + return n; +} + +int sysfs_get_str(struct mdinfo *sra, struct mdinfo *dev, + char *name, char *val, int size) +{ + int n; + int fd; + + fd = sysfs_get_fd(sra, dev, name); + if (fd < 0) + return -1; + n = sysfs_fd_get_str(fd, val, size); + close(fd); + return n; +} + +int sysfs_set_safemode(struct mdinfo *sra, unsigned long ms) +{ + unsigned long sec; + unsigned long msec; + char delay[30]; + + sec = ms / 1000; + msec = ms % 1000; + + sprintf(delay, "%ld.%03ld\n", sec, msec); + /* this '\n' ^ needed for kernels older than 2.6.28 */ + return sysfs_set_str(sra, NULL, "safe_mode_delay", delay); +} + +int sysfs_set_array(struct mdinfo *info, int vers) { int rv = 0; - sra->array = info->array; + char ver[100]; + ver[0] = 0; + if (info->array.major_version == -1 && + info->array.minor_version == -2) { + strcat(strcpy(ver, "external:"), info->text_version); + + if ((vers % 100) < 2 || + sysfs_set_str(info, NULL, "metadata_version", + ver) < 0) { + fprintf(stderr, Name ": This kernel does not " + "support external metadata.\n"); + return 1; + } + } if (info->array.level < 0) return 0; /* FIXME */ - rv |= sysfs_set_str(sra, NULL, "level", + rv |= sysfs_set_str(info, NULL, "level", map_num(pers, info->array.level)); - rv |= sysfs_set_num(sra, NULL, "raid_disks", info->array.raid_disks); - rv |= sysfs_set_num(sra, NULL, "chunk_size", info->array.chunk_size); - rv |= sysfs_set_num(sra, NULL, "layout", info->array.layout); - rv |= sysfs_set_num(sra, NULL, "component_size", info->component_size); - rv |= sysfs_set_num(sra, NULL, "resync_start", info->resync_start); - sra->array = info->array; + rv |= sysfs_set_num(info, NULL, "raid_disks", info->array.raid_disks); + rv |= sysfs_set_num(info, NULL, "chunk_size", info->array.chunk_size); + rv |= sysfs_set_num(info, NULL, "layout", info->array.layout); + rv |= sysfs_set_num(info, NULL, "component_size", info->component_size/2); + if (info->custom_array_size) { + int rc; + + rc = sysfs_set_num(info, NULL, "array_size", + info->custom_array_size/2); + if (rc && errno == ENOENT) { + fprintf(stderr, Name ": This kernel does not " + "have the md/array_size attribute, " + "the array may be larger than expected\n"); + rc = 0; + } + rv |= rc; + } + + if (info->array.level > 0) + rv |= sysfs_set_num(info, NULL, "resync_start", info->resync_start); return rv; } @@ -366,7 +576,6 @@ int sysfs_add_disk(struct mdinfo *sra, struct mdinfo *sd) { char dv[100]; char nm[100]; - struct mdinfo *sd2; char *dname; int rv; @@ -377,23 +586,309 @@ int sysfs_add_disk(struct mdinfo *sra, struct mdinfo *sd) memset(nm, 0, sizeof(nm)); sprintf(dv, "/sys/dev/block/%d:%d", sd->disk.major, sd->disk.minor); - if (readlink(dv, nm, sizeof(nm)) < 0) + rv = readlink(dv, nm, sizeof(nm)); + if (rv <= 0) return -1; + nm[rv] = '\0'; dname = strrchr(nm, '/'); if (dname) dname++; strcpy(sd->sys_name, "dev-"); strcpy(sd->sys_name+4, dname); - rv |= sysfs_set_num(sra, sd, "offset", sd->data_offset); + rv = sysfs_set_num(sra, sd, "offset", sd->data_offset); rv |= sysfs_set_num(sra, sd, "size", (sd->component_size+1) / 2); if (sra->array.level != LEVEL_CONTAINER) { + if (sd->recovery_start == MaxSector) + /* This can correctly fail if array isn't started, + * yet, so just ignore status for now. + */ + sysfs_set_str(sra, sd, "state", "insync"); rv |= sysfs_set_num(sra, sd, "slot", sd->disk.raid_disk); -// rv |= sysfs_set_str(sra, sd, "state", "in_sync"); } - sd2 = malloc(sizeof(*sd2)); - *sd2 = *sd; - sd2->next = sra->devs; - sra->devs = sd2; + return rv; +} + +#if 0 +int sysfs_disk_to_sg(int fd) +{ + /* from an open block device, try find and open its corresponding + * scsi_generic interface + */ + struct stat st; + char path[256]; + char sg_path[256]; + char sg_major_minor[8]; + char *c; + DIR *dir; + struct dirent *de; + int major, minor, rv; + + if (fstat(fd, &st)) + return -1; + + snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/device", + major(st.st_rdev), minor(st.st_rdev)); + + dir = opendir(path); + if (!dir) + return -1; + + de = readdir(dir); + while (de) { + if (strncmp("scsi_generic:", de->d_name, + strlen("scsi_generic:")) == 0) + break; + de = readdir(dir); + } + closedir(dir); + + if (!de) + return -1; + + snprintf(sg_path, sizeof(sg_path), "%s/%s/dev", path, de->d_name); + fd = open(sg_path, O_RDONLY); + if (fd < 0) + return fd; + + rv = read(fd, sg_major_minor, sizeof(sg_major_minor)); + close(fd); + if (rv < 0) + return -1; + else + sg_major_minor[rv - 1] = '\0'; + + c = strchr(sg_major_minor, ':'); + *c = '\0'; + c++; + major = strtol(sg_major_minor, NULL, 10); + minor = strtol(c, NULL, 10); + snprintf(path, sizeof(path), "/dev/.tmp.md.%d:%d:%d", + (int) getpid(), major, minor); + if (mknod(path, S_IFCHR|0600, makedev(major, minor))==0) { + fd = open(path, O_RDONLY); + unlink(path); + return fd; + } + + return -1; +} +#endif + +int sysfs_disk_to_scsi_id(int fd, __u32 *id) +{ + /* from an open block device, try to retrieve it scsi_id */ + struct stat st; + char path[256]; + char *c1, *c2; + DIR *dir; + struct dirent *de; + + if (fstat(fd, &st)) + return 1; + + snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/device", + major(st.st_rdev), minor(st.st_rdev)); + + dir = opendir(path); + if (!dir) + return 1; + + de = readdir(dir); + while (de) { + if (strncmp("scsi_disk:", de->d_name, + strlen("scsi_disk:")) == 0) + break; + de = readdir(dir); + } + closedir(dir); + + if (!de) + return 1; + + c1 = strchr(de->d_name, ':'); + c1++; + c2 = strchr(c1, ':'); + *c2 = '\0'; + *id = strtol(c1, NULL, 10) << 24; /* host */ + c1 = c2 + 1; + c2 = strchr(c1, ':'); + *c2 = '\0'; + *id |= strtol(c1, NULL, 10) << 16; /* channel */ + c1 = c2 + 1; + c2 = strchr(c1, ':'); + *c2 = '\0'; + *id |= strtol(c1, NULL, 10) << 8; /* lun */ + c1 = c2 + 1; + *id |= strtol(c1, NULL, 10); /* id */ + + return 0; +} + + +int sysfs_unique_holder(int devnum, long rdev) +{ + /* Check that devnum is a holder of rdev, + * and is the only holder. + * we should be locked against races by + * an O_EXCL on devnum + */ + DIR *dir; + struct dirent *de; + char dirname[100]; + char l; + int found = 0; + sprintf(dirname, "/sys/dev/block/%d:%d/holders", + major(rdev), minor(rdev)); + dir = opendir(dirname); + errno = ENOENT; + if (!dir) + return 0; + l = strlen(dirname); + while ((de = readdir(dir)) != NULL) { + char buf[10]; + int n; + int mj, mn; + char c; + int fd; + + if (de->d_ino == 0) + continue; + if (de->d_name[0] == '.') + continue; + strcpy(dirname+l, "/"); + strcat(dirname+l, de->d_name); + strcat(dirname+l, "/dev"); + fd = open(dirname, O_RDONLY); + if (fd < 0) { + errno = ENOENT; + break; + } + n = read(fd, buf, sizeof(buf)-1); + close(fd); + buf[n] = 0; + if (sscanf(buf, "%d:%d%c", &mj, &mn, &c) != 3 || + c != '\n') { + errno = ENOENT; + break; + } + if (mj != MD_MAJOR) + mn = -1-(mn>>6); + + if (devnum != mn) { + errno = EEXIST; + break; + } + found = 1; + } + closedir(dir); + if (de) + return 0; + else + return found; +} + +#ifndef MDASSEMBLE + +static char *clean_states[] = { + "clear", "inactive", "readonly", "read-auto", "clean", NULL }; + +int WaitClean(char *dev, int sock, int verbose) +{ + int fd; + struct mdinfo *mdi; + int rv = 1; + int devnum; + + fd = open(dev, O_RDONLY); + if (fd < 0) { + if (verbose) + fprintf(stderr, Name ": Couldn't open %s: %s\n", dev, strerror(errno)); + return 1; + } + + devnum = fd2devnum(fd); + mdi = sysfs_read(fd, devnum, GET_VERSION|GET_LEVEL|GET_SAFEMODE); + if (!mdi) { + if (verbose) + fprintf(stderr, Name ": Failed to read sysfs attributes for " + "%s\n", dev); + close(fd); + return 0; + } + + switch(mdi->array.level) { + case LEVEL_LINEAR: + case LEVEL_MULTIPATH: + case 0: + /* safemode delay is irrelevant for these levels */ + rv = 0; + + } + + /* for internal metadata the kernel handles the final clean + * transition, containers can never be dirty + */ + if (!is_subarray(mdi->text_version)) + rv = 0; + + /* safemode disabled ? */ + if (mdi->safe_mode_delay == 0) + rv = 0; + + if (rv) { + int state_fd = sysfs_open(fd2devnum(fd), NULL, "array_state"); + char buf[20]; + fd_set fds; + struct timeval tm; + + /* minimize the safe_mode_delay and prepare to wait up to 5s + * for writes to quiesce + */ + sysfs_set_safemode(mdi, 1); + tm.tv_sec = 5; + tm.tv_usec = 0; + + /* give mdmon a chance to checkpoint resync */ + sysfs_set_str(mdi, NULL, "sync_action", "idle"); + + FD_ZERO(&fds); + + /* wait for array_state to be clean */ + while (1) { + rv = read(state_fd, buf, sizeof(buf)); + if (rv < 0) + break; + if (sysfs_match_word(buf, clean_states) <= 4) + break; + FD_SET(state_fd, &fds); + rv = select(state_fd + 1, NULL, NULL, &fds, &tm); + if (rv < 0 && errno != EINTR) + break; + lseek(state_fd, 0, SEEK_SET); + } + if (rv < 0) + rv = 1; + else if (fping_monitor(sock) == 0 || + ping_monitor(mdi->text_version) == 0) { + /* we need to ping to close the window between array + * state transitioning to clean and the metadata being + * marked clean + */ + rv = 0; + } else + rv = 1; + if (rv && verbose) + fprintf(stderr, Name ": Error waiting for %s to be clean\n", + dev); + + /* restore the original safe_mode_delay */ + sysfs_set_safemode(mdi, mdi->safe_mode_delay); + close(state_fd); + } + + sysfs_free(mdi); + close(fd); return rv; } +#endif /* MDASSEMBLE */