* sysfs - extract md related information from sysfs. Part of:
* mdadm - manage Linux "md" devices aka RAID arrays.
*
- * Copyright (C) 2006 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2006-2009 Neil Brown <neilb@suse.de>
*
*
* This program is free software; you can redistribute it and/or modify
#include "mdadm.h"
#include <dirent.h>
+#include <ctype.h>
int load_sys(char *path, char *buf)
{
{
char fname[50];
int fd;
+ char *mdname = devnum2devname(devnum);
- sprintf(fname, "/sys/block/%s/md/", devnum2devname(devnum));
+ if (!mdname)
+ return -1;
+
+ sprintf(fname, "/sys/block/%s/md/", mdname);
if (devname) {
strcat(fname, devname);
strcat(fname, "/");
fd = open(fname, O_RDWR);
if (fd < 0 && errno == EACCES)
fd = open(fname, O_RDONLY);
+ free(mdname);
return fd;
}
+/* Fill in mdi->sys_name with the sysfs name of an md array.
+ *
+ * When fd is non-negative it must answer the RAID_VERSION ioctl, and the
+ * device number is then taken from fd2devnum(fd) instead of the devnum
+ * argument.  A non-negative device number N yields "mdN"; a negative one
+ * yields "md_d<-1-N>".  If the ioctl fails or the number is NoMdDev,
+ * sys_name is left as an empty string.
+ */
+void sysfs_init(struct mdinfo *mdi, int fd, int devnum)
+{
+	/* start out with "no name"; callers test sys_name[0] for failure */
+	mdi->sys_name[0] = 0;
+
+	if (fd >= 0) {
+		mdu_version_t vers;
+
+		if (ioctl(fd, RAID_VERSION, &vers) != 0)
+			return;
+		devnum = fd2devnum(fd);
+	}
+
+	if (devnum == NoMdDev)
+		return;
+
+	if (devnum < 0)
+		sprintf(mdi->sys_name, "md_d%d", -1-devnum);
+	else
+		sprintf(mdi->sys_name, "md%d", devnum);
+}
+
+
struct mdinfo *sysfs_read(int fd, int devnum, unsigned long options)
{
- /* Longest possible name in sysfs, mounted at /sys, is
- * /sys/block/md_dXXX/md/dev-XXXXX/block/dev
- * /sys/block/md_dXXX/md/metadata_version
- * which is about 41 characters. 50 should do for now
- */
- char fname[50];
- char buf[1024];
+ char fname[PATH_MAX];
+ char buf[PATH_MAX];
char *base;
char *dbase;
struct mdinfo *sra;
sra = malloc(sizeof(*sra));
if (sra == NULL)
return sra;
- sra->next = NULL;
-
- if (fd >= 0) {
- struct stat stb;
- mdu_version_t vers;
- if (fstat(fd, &stb)) return NULL;
- if (ioctl(fd, RAID_VERSION, &vers) != 0)
- return NULL;
- if (major(stb.st_rdev)==9)
- sprintf(sra->sys_name, "md%d", (int)minor(stb.st_rdev));
- else
- sprintf(sra->sys_name, "md_d%d",
- (int)minor(stb.st_rdev)>>MdpMinorShift);
- } else {
- if (devnum >= 0)
- sprintf(sra->sys_name, "md%d", devnum);
- else
- sprintf(sra->sys_name, "md_d%d",
- -1-devnum);
+ memset(sra, 0, sizeof(*sra));
+ sysfs_init(sra, fd, devnum);
+ if (sra->sys_name[0] == 0) {
+ free(sra);
+ return NULL;
}
+
sprintf(fname, "/sys/block/%s/md/", sra->sys_name);
base = fname + strlen(fname);
goto abort;
sra->array.raid_disks = strtoul(buf, NULL, 0);
}
+ if (options & GET_DEGRADED) {
+ strcpy(base, "degraded");
+ if (load_sys(fname, buf))
+ goto abort;
+ sra->array.failed_disks = strtoul(buf, NULL, 0);
+ }
if (options & GET_COMPONENT) {
strcpy(base, "component_size");
if (load_sys(fname, buf))
goto abort;
sra->mismatch_cnt = strtoul(buf, NULL, 0);
}
+ if (options & GET_SAFEMODE) {
+ int scale = 1;
+ int dot = 0;
+ unsigned i;
+ unsigned long msec;
+ size_t len;
+
+ strcpy(base, "safe_mode_delay");
+ if (load_sys(fname, buf))
+ goto abort;
+
+ /* remove a period, and count digits after it */
+ len = strlen(buf);
+ for (i = 0; i < len; i++) {
+ if (dot) {
+ if (isdigit(buf[i])) {
+ buf[i-1] = buf[i];
+ scale *= 10;
+ }
+ buf[i] = 0;
+ } else if (buf[i] == '.') {
+ dot=1;
+ buf[i] = 0;
+ }
+ }
+ msec = strtoul(buf, NULL, 10);
+ msec = (msec * 1000) / scale;
+ sra->safe_mode_delay = msec;
+ }
if (! (options & GET_DEVS))
return sra;
dev = malloc(sizeof(*dev));
if (!dev)
goto abort;
- dev->next = sra->devs;
- sra->devs = dev;
- strcpy(dev->sys_name, de->d_name);
/* Always get slot, major, minor */
strcpy(dbase, "slot");
- if (load_sys(fname, buf))
- goto abort;
+ if (load_sys(fname, buf)) {
+ /* hmm... unable to read 'slot' maybe the device
+ * is going away?
+ */
+ strcpy(dbase, "block");
+ if (readlink(fname, buf, sizeof(buf)) < 0 &&
+ errno != ENAMETOOLONG) {
+ /* ...yup device is gone */
+ free(dev);
+ continue;
+ } else {
+ /* slot is unreadable but 'block' link
+ * still intact... something bad is happening
+ * so abort
+ */
+ free(dev);
+ goto abort;
+ }
+
+ }
+ strcpy(dev->sys_name, de->d_name);
dev->disk.raid_disk = strtoul(buf, &ep, 10);
if (*ep) dev->disk.raid_disk = -1;
strcpy(dbase, "block/dev");
- if (load_sys(fname, buf))
- goto abort;
+ if (load_sys(fname, buf)) {
+ /* assume this is a stale reference to a hot
+ * removed device
+ */
+ free(dev);
+ continue;
+ }
sscanf(buf, "%d:%d", &dev->disk.major, &dev->disk.minor);
+ /* special case check for block devices that can go 'offline' */
+ strcpy(dbase, "block/device/state");
+ if (load_sys(fname, buf) == 0 &&
+ strncmp(buf, "offline", 7) == 0) {
+ free(dev);
+ continue;
+ }
+
+ /* finally add this disk to the array */
+ dev->next = sra->devs;
+ sra->devs = dev;
+
if (options & GET_OFFSET) {
strcpy(dbase, "offset");
if (load_sys(fname, buf))
return NULL;
}
+/* Compare a value read from a sysfs file (attr) with an expected word (str).
+ *
+ * Returns 1 when attr is exactly str, or when attr starts with str and the
+ * character that follows is a newline or a comma.  Returns 0 otherwise.
+ */
+int sysfs_attr_match(const char *attr, const char *str)
+{
+	/* walk both strings for as long as they agree */
+	for (; *attr && *attr == *str; attr++, str++)
+		;
+
+	if (*str)
+		return 0;	/* str was not matched completely */
+
+	switch (*attr) {
+	case '\0':
+	case ',':
+	case '\n':
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+/* Look 'word' up in a NULL-terminated list of strings.
+ *
+ * Returns the index of the first entry that sysfs_attr_match() accepts, or
+ * the number of entries in the list when none of them matches.
+ */
+int sysfs_match_word(const char *word, char **list)
+{
+	int idx = 0;
+
+	while (list[idx] && !sysfs_attr_match(word, list[idx]))
+		idx++;
+	return idx;
+}
+
unsigned long long get_component_size(int fd)
{
/* Find out the component size of the array.
char fname[50];
int n;
if (fstat(fd, &stb)) return 0;
- if (major(stb.st_rdev) == 9)
+ if (major(stb.st_rdev) != (unsigned)get_mdp_major())
sprintf(fname, "/sys/block/md%d/md/component_size",
(int)minor(stb.st_rdev));
else
char *name, char *val)
{
char fname[50];
- int n;
+ unsigned int n;
int fd;
sprintf(fname, "/sys/block/%s/md/%s/%s",
return -1;
n = write(fd, val, strlen(val));
close(fd);
- if (n != strlen(val))
+ if (n != strlen(val)) {
+ dprintf(Name ": failed to write '%s' to '%s' (%s)\n",
+ val, fname, strerror(errno));
return -1;
+ }
return 0;
}
return sysfs_set_str(sra, dev, name, valstr);
}
-int sysfs_get_ll(struct mdinfo *sra, struct mdinfo *dev,
- char *name, unsigned long long *val)
+/* Write the string 'event' to /sys/block/<sra->sys_name>/uevent.
+ *
+ * Returns -1 if the uevent file cannot be opened for writing, 0 otherwise.
+ * NOTE(review): the result of write() is stored in n but never examined, so
+ * a failed or short write still returns 0 -- confirm this is intentional.
+ */
+int sysfs_uevent(struct mdinfo *sra, char *event)
{
char fname[50];
- char buf[50];
int n;
int fd;
- char *ep;
+
+ sprintf(fname, "/sys/block/%s/uevent",
+ sra->sys_name);
+ fd = open(fname, O_WRONLY);
+ if (fd < 0)
+ return -1;
+ n = write(fd, event, strlen(event));
+ close(fd);
+ return 0;
+}
+
+/* Test whether a sysfs attribute exists.
+ *
+ * The path checked is /sys/block/<sra->sys_name>/md/<dev->sys_name>/<name>,
+ * with the device component left empty when dev is NULL.
+ * Returns 1 if stat() on that path succeeds, 0 otherwise.
+ */
+int sysfs_attribute_available(struct mdinfo *sra, struct mdinfo *dev, char *name)
+{
+ char fname[50];
+ struct stat st;
+
sprintf(fname, "/sys/block/%s/md/%s/%s",
sra->sys_name, dev?dev->sys_name:"", name);
- fd = open(fname, O_RDONLY);
+
+ return stat(fname, &st) == 0;
+}
+
+/* Open the sysfs attribute /sys/block/<sra->sys_name>/md/<dev->sys_name>/<name>
+ * (the device component is empty when dev is NULL).
+ *
+ * The file is opened read-write first; if that fails for any reason a
+ * read-only open is attempted.  Returns the descriptor from whichever open
+ * was tried last, i.e. a negative value when both fail.  The caller is
+ * responsible for closing the descriptor.
+ */
+int sysfs_get_fd(struct mdinfo *sra, struct mdinfo *dev,
+ char *name)
+{
+ char fname[50];
+ int fd;
+
+ sprintf(fname, "/sys/block/%s/md/%s/%s",
+ sra->sys_name, dev?dev->sys_name:"", name);
+ fd = open(fname, O_RDWR);
if (fd < 0)
- return -1;
+ fd = open(fname, O_RDONLY);
+ return fd;
+}
+
+int sysfs_fd_get_ll(int fd, unsigned long long *val)
+{
+ char buf[50];
+ int n;
+ char *ep;
+
+ lseek(fd, 0, 0);
n = read(fd, buf, sizeof(buf));
- close(fd);
if (n <= 0)
return -1;
buf[n] = 0;
return 0;
}
-int sysfs_set_array(struct mdinfo *sra,
- struct mdinfo *info)
+/* Read a numeric sysfs attribute of an array (dev == NULL) or of one of its
+ * component devices into *val.
+ *
+ * Opens the attribute with sysfs_get_fd(), parses it with sysfs_fd_get_ll()
+ * and closes the descriptor again.  Returns -1 if the attribute cannot be
+ * opened, otherwise whatever sysfs_fd_get_ll() returned.
+ */
+int sysfs_get_ll(struct mdinfo *sra, struct mdinfo *dev,
+ char *name, unsigned long long *val)
{
- int rv = 0;
- sra->array = info->array;
+ int n;
+ int fd;
+
+ fd = sysfs_get_fd(sra, dev, name);
+ if (fd < 0)
+ return -1;
+ n = sysfs_fd_get_ll(fd, val);
+ close(fd);
+ return n;
+}
+
+/* Read the current contents of an already-open sysfs attribute as a string.
+ *
+ * fd:   descriptor of the attribute; it is rewound to offset 0 first so the
+ *       same descriptor can be read repeatedly.
+ * val:  destination buffer.
+ * size: capacity of val in bytes, including room for the terminating NUL.
+ *
+ * Returns the number of bytes read, with val NUL-terminated.  Returns -1 if
+ * size is not positive, if the read fails or yields no data, or if the data
+ * fills the whole buffer so that no room is left for the terminator.
+ */
+int sysfs_fd_get_str(int fd, char *val, int size)
+{
+	int n;
+
+	if (size <= 0)
+		return -1;
+
+	lseek(fd, 0, SEEK_SET);
+	n = read(fd, val, size);
+	/* n == size would make val[n] land one byte past the buffer */
+	if (n <= 0 || n >= size)
+		return -1;
+	val[n] = 0;
+	return n;
+}
+
+/* Read a sysfs attribute of an array (dev == NULL) or of one of its
+ * component devices into the buffer val of 'size' bytes.
+ *
+ * Returns -1 if the attribute cannot be opened, otherwise the result of
+ * sysfs_fd_get_str() on the opened attribute.
+ */
+int sysfs_get_str(struct mdinfo *sra, struct mdinfo *dev,
+		  char *name, char *val, int size)
+{
+	int len;
+	int attr_fd = sysfs_get_fd(sra, dev, name);
+
+	if (attr_fd < 0)
+		return -1;
+
+	len = sysfs_fd_get_str(attr_fd, val, size);
+	close(attr_fd);
+	return len;
+}
+
+/* Set the array's safe_mode_delay attribute.
+ *
+ * ms is the delay in milliseconds; it is written as "<seconds>.<millis>"
+ * with the fractional part zero-padded to three digits.
+ * Returns the result of sysfs_set_str() on "safe_mode_delay".
+ */
+int sysfs_set_safemode(struct mdinfo *sra, unsigned long ms)
+{
+	unsigned long sec = ms / 1000;
+	unsigned long msec = ms % 1000;
+	char delay[30];
+
+	/* both values are unsigned long, so the conversion must be %lu */
+	snprintf(delay, sizeof(delay), "%lu.%03lu\n", sec, msec);
+	/* the trailing '\n' is needed for kernels older than 2.6.28 */
+	return sysfs_set_str(sra, NULL, "safe_mode_delay", delay);
+}
+/* Configure an array through sysfs from the description in 'info'.
+ *
+ * info: array description; its values are written with sysfs_set_str() and
+ *       sysfs_set_num() using info itself as the sysfs handle.
+ * vers: version number; external metadata is refused when (vers % 100) < 2.
+ *       NOTE(review): presumably the md driver version -- confirm with callers.
+ *
+ * Returns 1 if external metadata was requested but could not be set up,
+ * 0 if the level is negative (nothing further is written), otherwise the
+ * bitwise OR of the results of the individual attribute writes.
+ */
+int sysfs_set_array(struct mdinfo *info, int vers)
+{
+ int rv = 0;
+ char ver[100];
+
+ ver[0] = 0;
+ /* major_version -1 / minor_version -2 selects the "external:" metadata path */
+ if (info->array.major_version == -1 &&
+ info->array.minor_version == -2) {
+ strcat(strcpy(ver, "external:"), info->text_version);
+
+ if ((vers % 100) < 2 ||
+ sysfs_set_str(info, NULL, "metadata_version",
+ ver) < 0) {
+ fprintf(stderr, Name ": This kernel does not "
+ "support external metadata.\n");
+ return 1;
+ }
+ }
if (info->array.level < 0)
return 0; /* FIXME */
- rv |= sysfs_set_str(sra, NULL, "level",
+ rv |= sysfs_set_str(info, NULL, "level",
map_num(pers, info->array.level));
- rv |= sysfs_set_num(sra, NULL, "raid_disks", info->array.raid_disks);
- rv |= sysfs_set_num(sra, NULL, "chunk_size", info->array.chunk_size);
- rv |= sysfs_set_num(sra, NULL, "layout", info->array.layout);
- rv |= sysfs_set_num(sra, NULL, "component_size", info->component_size/2);
- rv |= sysfs_set_num(sra, NULL, "resync_start", info->resync_start);
- sra->array = info->array;
+ rv |= sysfs_set_num(info, NULL, "raid_disks", info->array.raid_disks);
+ rv |= sysfs_set_num(info, NULL, "chunk_size", info->array.chunk_size);
+ rv |= sysfs_set_num(info, NULL, "layout", info->array.layout);
+ rv |= sysfs_set_num(info, NULL, "component_size", info->component_size/2);
+ if (info->custom_array_size) {
+ int rc;
+
+ rc = sysfs_set_num(info, NULL, "array_size",
+ info->custom_array_size/2);
+ /* a missing array_size attribute only earns a warning, not a failure */
+ if (rc && errno == ENOENT) {
+ fprintf(stderr, Name ": This kernel does not "
+ "have the md/array_size attribute, "
+ "the array may be larger than expected\n");
+ rc = 0;
+ }
+ rv |= rc;
+ }
+
+ /* resync_start is only written for levels above 0 */
+ if (info->array.level > 0)
+ rv |= sysfs_set_num(info, NULL, "resync_start", info->resync_start);
return rv;
}
-int sysfs_add_disk(struct mdinfo *sra, struct mdinfo *sd)
+int sysfs_add_disk(struct mdinfo *sra, struct mdinfo *sd, int resume)
{
- char dv[100];
- char nm[100];
- struct mdinfo *sd2;
+ char dv[PATH_MAX];
+ char nm[PATH_MAX];
char *dname;
int rv;
strcpy(sd->sys_name, "dev-");
strcpy(sd->sys_name+4, dname);
+ /* test write to see if 'recovery_start' is available */
+ if (resume && sd->recovery_start < MaxSector &&
+ sysfs_set_num(sra, sd, "recovery_start", 0)) {
+ sysfs_set_str(sra, sd, "state", "remove");
+ return -1;
+ }
+
rv = sysfs_set_num(sra, sd, "offset", sd->data_offset);
rv |= sysfs_set_num(sra, sd, "size", (sd->component_size+1) / 2);
if (sra->array.level != LEVEL_CONTAINER) {
+ if (sd->recovery_start == MaxSector)
+ /* This can correctly fail if array isn't started,
+ * yet, so just ignore status for now.
+ */
+ sysfs_set_str(sra, sd, "state", "insync");
rv |= sysfs_set_num(sra, sd, "slot", sd->disk.raid_disk);
-// rv |= sysfs_set_str(sra, sd, "state", "in_sync");
- }
- if (! rv) {
- sd2 = malloc(sizeof(*sd2));
- *sd2 = *sd;
- sd2->next = sra->devs;
- sra->devs = sd2;
+ if (resume)
+ sysfs_set_num(sra, sd, "recovery_start", sd->recovery_start);
}
return rv;
}
else
return found;
}
+
+/* Try to freeze resync/rebuild on an array or container by writing
+ * "frozen" to its sync_action attribute.
+ *
+ * Returns:
+ *    1  the array is frozen, or it has no sync_action attribute at all
+ *       (which is treated as already frozen);
+ *    0  sync_action could not be read, or writing "frozen" failed
+ *       (this kernel doesn't support 'frozen');
+ *   -1  the array is busy: sync_action is neither "idle" nor "frozen".
+ * (-2, "container cannot be frozen", is part of the documented contract
+ * but is not produced by this function itself.)
+ */
+int sysfs_freeze_array(struct mdinfo *sra)
+{
+	char action[20];
+	int quiescent;
+
+	if (!sysfs_attribute_available(sra, NULL, "sync_action"))
+		return 1; /* no sync_action == frozen */
+
+	if (sysfs_get_str(sra, NULL, "sync_action", action, 20) <= 0)
+		return 0;
+
+	quiescent = strcmp(action, "idle\n") == 0 ||
+		    strcmp(action, "frozen\n") == 0;
+	if (!quiescent)
+		return -1;
+
+	return sysfs_set_str(sra, NULL, "sync_action", "frozen") < 0 ? 0 : 1;
+}
+
+#ifndef MDASSEMBLE
+
+/* array_state values that WaitClean() accepts as "clean".  WaitClean()
+ * compares the index returned by sysfs_match_word() against 4, the index
+ * of the last real entry, so the entry count matters if this list changes.
+ */
+static char *clean_states[] = {
+ "clear", "inactive", "readonly", "read-auto", "clean", NULL };
+
+/* Wait for the array behind device node 'dev' to be marked clean.
+ *
+ * dev:     path of the md device node.
+ * sock:    passed to fping_monitor() once the array state is clean.
+ * verbose: when non-zero, failures are reported on stderr.
+ *
+ * No waiting is done (and 0 is returned) for linear, multipath and level-0
+ * arrays, for arrays whose text_version is not a subarray, and when the
+ * safe_mode_delay is 0.  Otherwise the safe_mode_delay is temporarily set
+ * to 1ms, array_state is polled for up to 5 seconds, and the original
+ * delay is restored afterwards.
+ *
+ * Returns 0 on success or when no wait was needed, 1 if the device could
+ * not be opened or the array did not become clean.  Note that a failure to
+ * read the sysfs attributes also returns 0.
+ */
+int WaitClean(char *dev, int sock, int verbose)
+{
+	int fd;
+	struct mdinfo *mdi;
+	int rv = 1;
+	int devnum;
+
+	fd = open(dev, O_RDONLY);
+	if (fd < 0) {
+		if (verbose)
+			fprintf(stderr, Name ": Couldn't open %s: %s\n", dev, strerror(errno));
+		return 1;
+	}
+
+	devnum = fd2devnum(fd);
+	mdi = sysfs_read(fd, devnum, GET_VERSION|GET_LEVEL|GET_SAFEMODE);
+	if (!mdi) {
+		if (verbose)
+			fprintf(stderr, Name ": Failed to read sysfs attributes for "
+				"%s\n", dev);
+		close(fd);
+		return 0;
+	}
+
+	switch(mdi->array.level) {
+	case LEVEL_LINEAR:
+	case LEVEL_MULTIPATH:
+	case 0:
+		/* safemode delay is irrelevant for these levels */
+		rv = 0;
+		break;
+	default:
+		break;
+	}
+
+	/* for internal metadata the kernel handles the final clean
+	 * transition, containers can never be dirty
+	 */
+	if (!is_subarray(mdi->text_version))
+		rv = 0;
+
+	/* safemode disabled ? */
+	if (mdi->safe_mode_delay == 0)
+		rv = 0;
+
+	if (rv) {
+		int state_fd = sysfs_open(devnum, NULL, "array_state");
+		char buf[20];
+		fd_set fds;
+		struct timeval tm;
+
+		/* minimize the safe_mode_delay and prepare to wait up to 5s
+		 * for writes to quiesce
+		 */
+		sysfs_set_safemode(mdi, 1);
+		tm.tv_sec = 5;
+		tm.tv_usec = 0;
+
+		FD_ZERO(&fds);
+
+		/* wait for array_state to be clean */
+		while (1) {
+			/* leave room for a terminator: the matcher expects
+			 * a NUL-terminated string
+			 */
+			rv = read(state_fd, buf, sizeof(buf) - 1);
+			if (rv < 0)
+				break;
+			buf[rv] = 0;
+			if (sysfs_match_word(buf, clean_states) <= 4)
+				break;
+			FD_SET(state_fd, &fds);
+			rv = select(state_fd + 1, NULL, NULL, &fds, &tm);
+			if (rv < 0 && errno != EINTR)
+				break;
+			if (rv == 0) {
+				/* the 5s budget is used up; select() may have
+				 * zeroed tm, so looping again would spin
+				 */
+				rv = -1;
+				break;
+			}
+			lseek(state_fd, 0, SEEK_SET);
+		}
+		if (rv < 0)
+			rv = 1;
+		else if (fping_monitor(sock) == 0 ||
+			 ping_monitor(mdi->text_version) == 0) {
+			/* we need to ping to close the window between array
+			 * state transitioning to clean and the metadata being
+			 * marked clean
+			 */
+			rv = 0;
+		} else
+			rv = 1;
+		if (rv && verbose)
+			fprintf(stderr, Name ": Error waiting for %s to be clean\n",
+				dev);
+
+		/* restore the original safe_mode_delay */
+		sysfs_set_safemode(mdi, mdi->safe_mode_delay);
+		if (state_fd >= 0)
+			close(state_fd);
+	}
+
+	sysfs_free(mdi);
+	close(fd);
+
+	return rv;
+}
+#endif /* MDASSEMBLE */