From 549e9569c6006433512801ae76b34abc0d3e1ac0 Mon Sep 17 00:00:00 2001 From: Neil Brown Date: Thu, 15 May 2008 16:48:37 +1000 Subject: [PATCH] Merge mdmon --- Makefile | 13 +- managemon.c | 309 +++++++++++++++++++++++++++++++++++++++++++ mdadm.h | 35 +++++ mdmon.c | 222 +++++++++++++++++++++++++++++++ mdmon.h | 41 ++++++ mdstat.c | 60 ++++++--- monitor.c | 372 ++++++++++++++++++++++++++++++++++++++++++++++++++++ super-ddf.c | 39 +++++- sysfs.c | 29 ++++ 9 files changed, 1101 insertions(+), 19 deletions(-) create mode 100644 managemon.c create mode 100644 mdmon.c create mode 100644 mdmon.h create mode 100644 monitor.c diff --git a/Makefile b/Makefile index 46d75946..b2087d0c 100644 --- a/Makefile +++ b/Makefile @@ -77,6 +77,11 @@ SRCS = mdadm.c config.c mdstat.c ReadMe.c util.c Manage.c Assemble.c Build.c \ mdopen.c super0.c super1.c super-ddf.c super-intel.c bitmap.c \ restripe.c sysfs.c sha1.c mapfile.c crc32.c sg_io.c msg.c +MON_OBJS = mdmon.o monitor.o managemon.o util.o mdstat.o sysfs.o config.o \ + Kill.o sg_io.o dlink.o ReadMe.o super0.o super1.o super-intel.o \ + super-ddf.o sha1.o crc32.o + + STATICSRC = pwgr.c STATICOBJS = pwgr.o @@ -88,7 +93,7 @@ ASSEMBLE_SRCS += mdopen.c mdstat.c ASSEMBLE_FLAGS += -DMDASSEMBLE_AUTO endif -all : mdadm mdadm.man md.man mdadm.conf.man +all : mdadm mdmon mdadm.man md.man mdadm.conf.man everything: all mdadm.static swap_super test_stripe \ mdassemble mdassemble.static mdassemble.man \ @@ -118,6 +123,9 @@ mdadm.Os : $(SRCS) mdadm.h mdadm.O2 : $(SRCS) mdadm.h gcc -o mdadm.O2 $(CFLAGS) -DHAVE_STDINT_H -O2 $(SRCS) +mdmon : $(MON_OBJS) + $(CC) $(LDFLAGS) -o mdmon $(MON_OBJS) $(LDLIBS) + test_stripe : restripe.c mdadm.h $(CC) $(CXFLAGS) $(LDFLAGS) -o test_stripe -DMAIN restripe.c @@ -182,7 +190,8 @@ test: mdadm test_stripe swap_super @echo "Please run 'sh ./test' as root" clean : - rm -f mdadm $(OBJS) $(STATICOBJS) core *.man mdadm.tcc mdadm.uclibc mdadm.static *.orig *.porig *.rej *.alt \ + rm -f mdadm mdmon $(OBJS) $(MON_OBJS) 
$(STATICOBJS) core *.man \ + mdadm.tcc mdadm.uclibc mdadm.static *.orig *.porig *.rej *.alt \ mdadm.Os mdadm.O2 \ mdassemble mdassemble.static mdassemble.uclibc mdassemble.klibc swap_super \ init.cpio.gz mdadm.uclibc.static test_stripe diff --git a/managemon.c b/managemon.c new file mode 100644 index 00000000..ee4ee2b9 --- /dev/null +++ b/managemon.c @@ -0,0 +1,309 @@ + +/* + * The management thread for monitoring active md arrays. + * This thread does things which might block such as memory + * allocation. + * In particular: + * + * - Find out about new arrays in this container. + * Allocate the data structures and open the files. + * + * For this we watch /proc/mdstat and find new arrays with + * metadata type that confirms sharing. e.g. "md4" + * When we find a new array we slip it into the list of + * arrays and signal 'monitor' by writing to a pipe. + * + * - Respond to reshape requests by allocating new data structures + * and opening new files. + * + * These come as a change to raid_disks. We allocate a new + * version of the data structures and slip it into the list. + * 'monitor' will notice and release the old version. + * Changes to level, chunksize, layout.. do not need re-allocation. + * Reductions in raid_disks don't really either, but we handle + * them the same way for consistency. + * + * - When a device is added to the container, we add it to the metadata + * as a spare. + * + * - assist with activating spares by opening relevant sysfs file. + * + * - Pass on metadata updates from external programs such as + * mdadm creating a new array. + * + * This is most-messy. + * It might involve adding a new array or changing the status of + * a spare, or any reconfig that the kernel doesn't get involved in. + * + * The required updates are received via a named pipe. There will + * be one named pipe for each container. Each message contains a + * sync marker: 0x5a5aa5a5, A byte count, and the message. 
This is + * passed to the metadata handler which will interpret and process it. + * For 'DDF' messages are internal data blocks with the leading + * 'magic number' signifying what sort of data it is. + * + */ + +/* + * We select on /proc/mdstat and the named pipe. + * We create new arrays or updated version of arrays and slip + * them into the head of the list, then signal 'monitor' via a pipe write. + * 'monitor' will notice and place the old array on a return list. + * Metadata updates are placed on a queue just like they arrive + * from the named pipe. + * + * When new arrays are found based on correct metadata string, we + * need to identify them with an entry in the metadata. Maybe we require + * the metadata to be mdX/NN when NN is the index into an appropriate table. + * + */ + +/* + * List of tasks: + * - Watch for spares to be added to the container, and write updated + * metadata to them. + * - Watch for new arrays using this container, confirm they match metadata + * and if so, start monitoring them + * - Watch for spares being added to monitored arrays. This shouldn't + * happen, as we should do all the adding. Just remove them. + * - Watch for change in raid-disks, chunk-size, etc. Update metadata and + * start a reshape. + */ +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include "mdadm.h" +#include "mdmon.h" +#include + + +static void free_aa(struct active_array *aa) +{ + /* Note that this doesn't close fds, as they may be in used + * by a clone. Use close_aa for that. + */ + while (aa->info.devs) { + struct mdinfo *d = aa->info.devs; + aa->info.devs = d->next; + free(d); + } + free(aa); +} + +static void replace_array(struct supertype *container, + struct active_array *old, + struct active_array *new) +{ + /* To replace an array, we add it to the top of the list + * marked with ->replaces to point to the original. + * 'monitor' will take the original out of the list + * and put it on 'discard_this'. We take it from there + * and discard it. 
+ */ + + while (pending_discard) { + while (discard_this == NULL) + sleep(1); + if (discard_this != pending_discard) + abort(); + discard_this->next = NULL; + free_aa(discard_this); + discard_this = NULL; + pending_discard = NULL; + } + pending_discard = old; + new->replaces = old; + new->next = container->arrays; + container->arrays = new; +} + + +static void manage_container(struct mdstat_ent *mdstat, + struct supertype *container) +{ + /* The only thing of interest here is if a new device + * has been added to the container. We add it to the + * array ignoring any metadata on it. + * FIXME should we look for compatible metadata and take hints + * about spare assignment.... probably not. + * + */ + if (mdstat->devcnt != container->devcnt) { + /* read /sys/block/NAME/md/dev-??/block/dev to find out + * what is there, and compare with container->info.devs + * To see what is removed and what is added. + * These need to be remove from, or added to, the array + */ + // FIXME + container->devcnt = mdstat->devcnt; + } +} + +static void manage_member(struct mdstat_ent *mdstat, + struct active_array *a) +{ + /* Compare mdstat info with known state of member array. + * We do not need to look for device state changes here, that + * is dealt with by the monitor. + * + * We just look for changes which suggest that a reshape is + * being requested. + * Unfortunately decreases in raid_disks don't show up in + * mdstat until the reshape completes FIXME. + */ + // FIXME + a->info.array.raid_disks = mdstat->raid_disks; + a->info.array.chunk_size = mdstat->chunk_size; + // MORE + +} + +static void write_wakeup(struct supertype *c) +{ + write(c->pipe[1], "PING", 4); +} + +static void manage_new(struct mdstat_ent *mdstat, + struct supertype *container) +{ + /* A new array has appeared in this container. + * Hopefully it is already recorded in the metadata. + * Check, then create the new array to report it to + * the monitor. 
+ */ + + struct active_array *new; + struct mdinfo *mdi, *di; + char *n; + int inst; + int i; + + new = malloc(sizeof(*new)); + + new->devnum = mdstat->devnum; + + new->prev_state = new->curr_state = new->next_state = inactive; + new->prev_action= new->curr_action= new->next_action= idle; + + new->container = container; + + n = &mdstat->metadata_version[10+strlen(container->devname)+1]; + inst = atoi(n); + if (inst < 0) + abort();//FIXME + + mdi = sysfs_read(-1, new->devnum, + GET_LEVEL|GET_CHUNK|GET_DISKS| + GET_DEVS|GET_OFFSET|GET_SIZE|GET_STATE); + if (!mdi) { + /* Eeek. Cannot monitor this array. + * Mark it to be ignored by setting container to NULL + */ + new->container = NULL; + replace_array(container, NULL, new); + return; + } + + new->info.array = mdi->array; + + for (i = 0; i < new->info.array.raid_disks; i++) { + struct mdinfo *newd = malloc(sizeof(*newd)); + + for (di = mdi->devs; di; di = di->next) + if (i == di->disk.raid_disk) + break; + + if (di) { + memcpy(newd, di, sizeof(*newd)); + + sprintf(newd->sys_name, "rd%d", i); + + newd->state_fd = sysfs_open(new->devnum, + newd->sys_name, + "state"); + + newd->prev_state = read_dev_state(newd->state_fd); + newd->curr_state = newd->curr_state; + } else { + newd->state_fd = -1; + } + newd->next = new->info.devs; + new->info.devs = newd; + } + new->action_fd = sysfs_open(new->devnum, NULL, "sync_action"); + new->info.state_fd = sysfs_open(new->devnum, NULL, "array_state"); + new->sync_pos_fd = sysfs_open(new->devnum, NULL, "sync_completed"); + new->sync_pos = 0; + + // finds and compares. 
+ if (container->ss->open_new(container, new, inst) < 0) { + // FIXME close all those files + new->container = NULL; + replace_array(container, NULL, new); + return; + } + replace_array(container, NULL, new); + write_wakeup(container); + return; +} + +void manage(struct mdstat_ent *mdstat, struct active_array *aa, + struct supertype *container) +{ + /* We have just read mdstat and need to compare it with + * the known active arrays. + * Arrays with the wrong metadata are ignored. + */ + + for ( ; mdstat ; mdstat = mdstat->next) { + struct active_array *a; + if (mdstat->devnum == container->devnum) { + manage_container(mdstat, container); + continue; + } + if (mdstat->metadata_version == NULL || + strncmp(mdstat->metadata_version, "external:/", 10) != 0 || + strncmp(mdstat->metadata_version+10, container->devname, + strlen(container->devname)) != 0 || + mdstat->metadata_version[10+strlen(container->devname)] + != '/') + /* Not for this array */ + continue; + /* Looks like a member of this container */ + for (a = aa; a; a = a->next) { + if (mdstat->devnum == a->devnum) { + if (a->container) + manage_member(mdstat, a); + break; + } + } + if (a == NULL) + manage_new(mdstat, container); + } +} + +void read_sock(int pfd) +{ + int fd; + + // FIXME set non-blocking + fd = accept(pfd, NULL, NULL); + if (fd < 0) + return; + // FIXME do something useful + close(fd); +} +void do_manager(struct supertype *container) +{ + struct mdstat_ent *mdstat; + + do { + mdstat = mdstat_read(1, 0); + + manage(mdstat, array_list, container); + + read_sock(container->sock); + + mdstat_wait_fd(container->sock); + } while(1); +} diff --git a/mdadm.h b/mdadm.h index 64f41fd0..3f778f1c 100644 --- a/mdadm.h +++ b/mdadm.h @@ -159,6 +159,11 @@ struct mdinfo { char sys_name[20]; struct mdinfo *devs; struct mdinfo *next; + + /* Device info for mdmon: */ + int state_fd; + int prev_state, curr_state, next_state; + }; struct createinfo { @@ -271,12 +276,17 @@ struct mdstat_ent { char *pattern; /* U or 
up, _ for down */ int percent; /* -1 if no resync */ int resync; /* 1 if resync, 0 if recovery */ + int devcnt; + int raid_disks; + int chunk_size; + char * metadata_version; struct mdstat_ent *next; }; extern struct mdstat_ent *mdstat_read(int hold, int start); extern void free_mdstat(struct mdstat_ent *ms); extern void mdstat_wait(int seconds); +extern void mdstat_wait_fd(int fd); extern int mddev_busy(int devnum); struct map_ent { @@ -304,6 +314,7 @@ extern void map_add(struct map_ent **melp, #define GET_CACHE 16 #define GET_MISMATCH 32 #define GET_VERSION 64 +#define GET_DISKS 128 #define GET_DEVS 1024 /* gets role, major, minor */ #define GET_OFFSET 2048 @@ -314,6 +325,7 @@ extern void map_add(struct map_ent **melp, /* If fd >= 0, get the array it is open on, * else use devnum. >=0 -> major9. <0..... */ +extern int sysfs_open(int devnum, char *devname, char *attr); extern void sysfs_free(struct mdinfo *sra); extern struct mdinfo *sysfs_read(int fd, int devnum, unsigned long options); extern int sysfs_set_str(struct mdinfo *sra, struct mdinfo *dev, @@ -350,6 +362,7 @@ extern mapping_t r5layout[], pers[], modes[], faultylayout[]; extern char *map_dev(int major, int minor, int create); +struct active_array; extern struct superswitch { void (*examine_super)(struct supertype *st, char *homehost); @@ -390,6 +403,14 @@ extern struct superswitch { struct mdinfo *(*container_content)(struct supertype *st); +/* for mdmon */ + int (*open_new)(struct supertype *c, struct active_array *a, int inst); + void (*mark_clean)(struct active_array *a, unsigned long long sync_pos); + void (*mark_dirty)(struct active_array *a); + void (*set_disk)(struct active_array *a, int n); + void (*sync_metadata)(struct active_array *a); + + int major; char *text_version; int swapuuid; /* true if uuid is bigending rather than hostendian */ @@ -406,6 +427,20 @@ struct supertype { int container_member; /* numerical position in container */ void *sb; void *info; + + /* extra stuff used by mdmon */ 
+ struct active_array *arrays; + int devfd; + int sock; /* listen to external programs */ + int pipe[2]; /* communicate between threads */ + int devnum; + char *devname; /* e.g. md0. This appears in metadata_verison: + * external:/md0/12 + */ + int devcnt; + + struct mdinfo *devs; + }; extern struct supertype supertype_container_member; diff --git a/mdmon.c b/mdmon.c new file mode 100644 index 00000000..1284a124 --- /dev/null +++ b/mdmon.c @@ -0,0 +1,222 @@ + +/* + * md array manager. + * When md arrays have user-space managed metadata, this is the program + * that does the managing. + * + * Given one argument: the name of the array (e.g. /dev/md0) that is + * the container. + * We fork off a helper that runs high priority and mlocked. It responds to + * device failures and other events that might stop writeout, or that are + * trivial to deal with. + * The main thread then watches for new arrays being created in the container + * and starts monitoring them too ... along with a few other tasks. + * + * The main thread communicates with the priority thread by writing over + * a pipe. + * Separate programs can communicate with the main thread via Unix-domain + * socket. + * The two threads share address space and open file table. 
+ * + */ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "mdadm.h" +#include "mdmon.h" + +struct active_array *array_list; +struct active_array *discard_this; +struct active_array *pending_discard; + +int run_child(void *v) +{ + struct supertype *c = v; + do_monitor(c); + return 0; +} + +int clone_monitor(struct supertype *container) +{ + int pfd[2]; + static char stack[4096]; + int rv; + + pipe(container->pipe); + + rv = clone(run_child, stack+4096-64, + CLONE_FS|CLONE_FILES|CLONE_VM|CLONE_SIGHAND|CLONE_THREAD, + container); + + if (rv < 0) + return rv; + return pfd[1]; +} + +static struct superswitch *find_metadata_methods(char *vers) +{ + if (strcmp(vers, "ddf") == 0) + return &super_ddf; + return NULL; +} + + +static int make_pidfile(char *devname) +{ + char path[100]; + char pid[10]; + int fd; + sprintf(path, "/var/run/mdadm/%s.pid", devname); + + fd = open(path, O_RDWR|O_CREAT|O_EXCL, 0600); + if (fd < 0) + return -1; + sprintf(pid, "%d\n", getpid()); + write(fd, pid, strlen(pid)); + close(fd); + return 0; +} + +static int make_control_sock(char *devname) +{ + char path[100]; + int sfd; + long fl; + struct sockaddr_un addr; + + sprintf(path, "/var/run/mdadm/%s.sock", devname); + unlink(path); + sfd = socket(PF_LOCAL, SOCK_STREAM, 0); + if (sfd < 0) + return -1; + + addr.sun_family = PF_LOCAL; + strcpy(addr.sun_path, path); + if (bind(sfd, &addr, sizeof(addr)) < 0) { + close(sfd); + return -1; + } + listen(sfd, 10); + fl = fcntl(sfd, F_GETFL, 0); + fl |= O_NONBLOCK; + fcntl(sfd, F_SETFL, fl); + return sfd; +} + +int main(int argc, char *argv[]) +{ + int mdfd; + int pipefd; + struct mdinfo *mdi, *di; + struct supertype *container; + if (argc != 2) { + fprintf(stderr, "Usage: md-manage /device/name/for/container\n"); + exit(2); + } + mdfd = open(argv[1], O_RDWR); + if (mdfd < 0) { + fprintf(stderr, "md-manage: %s: %s\n", argv[1], + 
strerror(errno)); + exit(1); + } + if (md_get_version(mdfd) < 0) { + fprintf(stderr, "md-manage: %s: Not an md device\n", + argv[1]); + exit(1); + } + + /* hopefully it is a container - we'll check later */ + + container = malloc(sizeof(*container)); + container->devfd = mdfd; + container->devnum = fd2devnum(mdfd); + container->devname = devnum2devname(container->devnum); + + /* If this fails, we hope it already exists */ + mkdir("/var/run/mdadm", 0600); + /* pid file lives in /var/run/mdadm/mdXX.pid */ + if (make_pidfile(container->devname) < 0) { + fprintf(stderr, "md-manage: %s already managed\n", + container->devname); + exit(3); + } + + container->sock = make_control_sock(container->devname); + if (container->sock < 0) { + fprintf(stderr, "mdmon: Cannot create socket in /var/run/mdadm\n"); + exit(3); + } + container->arrays = NULL; + + mdi = sysfs_read(mdfd, container->devnum, + GET_VERSION|GET_LEVEL|GET_DEVS); + + if (!mdi) { + fprintf(stderr, "mdmon: failed to load sysfs info for %s\n", + container->devname); + exit(3); + } + if (mdi->array.level != UnSet) { + fprintf(stderr, "mdmon: %s is not a container - cannot monitor\n", + argv[1]); + exit(3); + } + if (mdi->array.major_version != -1 || + mdi->array.minor_version != -2) { + fprintf(stderr, "mdmon: %s does not use external metadata - cannot monitor\n", + argv[1]); + exit(3); + } + + container->ss = find_metadata_methods(mdi->text_version); + if (container->ss == NULL) { + fprintf(stderr, "mdmon: %s uses unknown metadata: %s\n", + argv[1], mdi->text_version); + exit(3); + } + + container->devs = NULL; + for (di = mdi->devs; di; di = di->next) { + struct mdinfo *cd = malloc(sizeof(*cd)); + cd = di; + cd->next = container->devs; + container->devs = cd; + } + sysfs_free(mdi); + + + if (container->ss->load_super(container, mdfd, argv[1])) { + fprintf(stderr, "mdmon: Cannot load metadata for %s\n", + argv[1]); + exit(3); + } + + + mlockall(MCL_FUTURE); + + pipefd = clone_monitor(container); + if (pipefd < 0) { 
+ fprintf(stderr, "md-manage: failed to start monitor process: %s\n", + strerror(errno)); + exit(2); + } + + do_manager(container); + + exit(0); +} diff --git a/mdmon.h b/mdmon.h new file mode 100644 index 00000000..497bbec2 --- /dev/null +++ b/mdmon.h @@ -0,0 +1,41 @@ + +enum array_state { clear, inactive, suspended, readonly, read_auto, + clean, active, write_pending, active_idle, bad_word}; + +enum sync_action { idle, reshape, resync, recover, check, repair, bad_action }; + + +struct active_array { + struct mdinfo info; + struct supertype *container; + struct active_array *next, *replaces; + + int action_fd; + int sync_pos_fd; + + enum array_state prev_state, curr_state, next_state; + enum sync_action prev_action, curr_action, next_action; + + int devnum; + + unsigned long long sync_pos; +}; + + + +#define MD_MAJOR 9 + +extern struct active_array *container; +extern struct active_array *array_list; +extern struct active_array *discard_this; +extern struct active_array *pending_discard; + + +void do_monitor(struct supertype *container); +void do_manager(struct supertype *container); + +int read_dev_state(int fd); + +struct mdstat_ent *mdstat_read(int hold, int start); + +extern struct superswitch super_ddf, super_ddf_bvd, super_ddf_svd; diff --git a/mdstat.c b/mdstat.c index a8f7ce75..c75260e8 100644 --- a/mdstat.c +++ b/mdstat.c @@ -86,6 +86,7 @@ #include "mdadm.h" #include "dlink.h" #include +#include void free_mdstat(struct mdstat_ent *ms) { @@ -158,6 +159,10 @@ struct mdstat_ent *mdstat_read(int hold, int start) ent->percent = -1; ent->active = -1; ent->resync = 0; + ent->metadata_version = NULL; + ent->raid_disks = 0; + ent->chunk_size = 0; + ent->devcnt = 0; ent->dev = strdup(line); ent->devnum = devnum; @@ -176,22 +181,32 @@ struct mdstat_ent *mdstat_read(int hold, int start) in_devs = 1; } else if (in_devs && strcmp(w, "blocks")==0) in_devs = 0; - else if (in_devs && strncmp(w, "md", 2)==0) { - /* This has an md device as a component. 
- * If that device is already in the list, - * make sure we insert before there. - */ - struct mdstat_ent **ih; - int dn2; - if (strncmp(w, "md_d", 4)==0) - dn2 = -1-strtoul(w+4, &ep, 10); - else - dn2 = strtoul(w+2, &ep, 10); - ih = &all; - while (ih != insert_here && *ih && - (*ih)->devnum != dn2) - ih = & (*ih)->next; - insert_here = ih; + else if (in_devs) { + ent->devcnt++; + if (strncmp(w, "md", 2)==0) { + /* This has an md device as a component. + * If that device is already in the + * list, make sure we insert before + * there. + */ + struct mdstat_ent **ih; + int dn2; + if (strncmp(w, "md_d", 4)==0) + dn2 = -1-strtoul(w+4, &ep, 10); + else + dn2 = strtoul(w+2, &ep, 10); + ih = &all; + while (ih != insert_here && *ih && + (*ih)->devnum != dn2) + ih = & (*ih)->next; + insert_here = ih; + } + } else if (strcmp(w, "super") == 0 && + dl_next(w) != line) { + w = dl_next(w); + ent->metadata_version = strdup(w); + } else if (w[0] == '[' && isdigit(w[1])) { + ent->raid_disks = atoi(w+1); } else if (!ent->pattern && w[0] == '[' && (w[1] == 'U' || w[1] == '_')) { @@ -256,6 +271,19 @@ void mdstat_wait(int seconds) select(mdstat_fd >2 ? mdstat_fd+1:3, NULL, NULL, &fds, &tm); } +void mdstat_wait_fd(int fd) +{ + fd_set fds, rfds; + + FD_ZERO(&fds); + FD_ZERO(&rfds); + if (mdstat_fd >= 0) + FD_SET(mdstat_fd, &fds); + FD_SET(fd, &rfds); + + select(mdstat_fd >2 ? 
mdstat_fd+1:3, &rfds, NULL, &fds, NULL); +} + int mddev_busy(int devnum) { struct mdstat_ent *mdstat = mdstat_read(0, 0); diff --git a/monitor.c b/monitor.c new file mode 100644 index 00000000..38725d18 --- /dev/null +++ b/monitor.c @@ -0,0 +1,372 @@ + +#include "mdadm.h" +#include "mdmon.h" + +#include + + +static char *array_states[] = { + "clear", "inactive", "suspended", "readonly", "read-auto", + "clean", "active", "write-pending", "active-idle", NULL }; +static char *sync_actions[] = { + "idle", "reshape", "resync", "recover", "check", "repair", NULL +}; + +static int write_attr(char *attr, int fd) +{ + return write(fd, attr, strlen(attr)); +} + +static void add_fd(fd_set *fds, int *maxfd, int fd) +{ + if (fd < 0) + return; + if (fd > *maxfd) + *maxfd = fd; + FD_SET(fd, fds); +} + +static int read_attr(char *buf, int len, int fd) +{ + int n; + + if (fd < 0) { + buf[0] = 0; + return 0; + } + lseek(fd, 0, 0); + n = read(fd, buf, len - 1); + + if (n <= 0) { + buf[0] = 0; + return 0; + } + buf[n] = 0; + if (buf[n-1] == '\n') + buf[n-1] = 0; + return n; +} + +static int get_sync_pos(struct active_array *a) +{ + char buf[30]; + int n; + + n = read_attr(buf, 30, a->sync_pos_fd); + if (n <= 0) + return n; + + if (strncmp(buf, "max", 3) == 0) { + a->sync_pos = ~(unsigned long long)0; + return 1; + } + a->sync_pos = strtoull(buf, NULL, 10); + return 1; +} + + +static int attr_match(const char *attr, const char *str) +{ + /* See if attr, read from a sysfs file, matches + * str. 
They must either be the same, or attr can + * have a trailing newline or comma + */ + while (*attr && *str && *attr == *str) { + attr++; + str++; + } + + if (*str || (*attr && *attr != ',' && *attr != '\n')) + return 0; + return 1; +} + +static int match_word(const char *word, char **list) +{ + int n; + for (n=0; list[n]; n++) + if (attr_match(word, list[n])) + break; + return n; +} + +static enum array_state read_state(int fd) +{ + char buf[20]; + int n = read_attr(buf, 20, fd); + + if (n <= 0) + return bad_word; + return (enum array_state) match_word(buf, array_states); +} + +static enum sync_action read_action( int fd) +{ + char buf[20]; + int n = read_attr(buf, 20, fd); + + if (n <= 0) + return bad_action; + return (enum sync_action) match_word(buf, sync_actions); +} + +#define DS_FAULTY 1 +#define DS_INSYNC 2 +#define DS_WRITE_MOSTLY 4 +#define DS_SPARE 8 +#define DS_REMOVE 1024 + +int read_dev_state(int fd) +{ + char buf[60]; + int n = read_attr(buf, 60, fd); + char *cp; + int rv = 0; + + if (n <= 0) + return 0; + + cp = buf; + while (cp) { + if (attr_match("faulty", cp)) + rv |= DS_FAULTY; + if (attr_match("in_sync", cp)) + rv |= DS_INSYNC; + if (attr_match("write_mostly", cp)) + rv |= DS_WRITE_MOSTLY; + if (attr_match("spare", cp)) + rv |= DS_SPARE; + cp = strchr(cp, ','); + if (cp) + cp++; + } + return rv; +} + + +/* Monitor a set of active md arrays - all of which share the + * same metadata - and respond to events that require + * metadata update. + * + * New arrays are detected by another thread which allocates + * required memory and attaches the data structure to our list. + * + * Events: + * Array stops. + * This is detected by array_state going to 'clear' or 'inactive'. + * while we thought it was active. + * Response is to mark metadata as clean and 'clear' the array(??) + * write-pending + * array_state if 'write-pending' + * We mark metadata as 'dirty' then set array to 'active'. 
+ * active_idle + * Either ignore, or mark clean, then mark metadata as clean. + * + * device fails + * detected by rd-N/state reporting "faulty" + * mark device as 'failed' in metadata, then remove the device + * by writing 'remove' to rd-N/state. + * + * sync completes + * sync_action was 'resync' and becomes 'idle' and resync_start becomes + * MaxSector + * Notify metadata that sync is complete. + * "Deal with Degraded" + * + * recovery completes + * sync_action changes from 'recover' to 'idle' + * Check each device state and mark metadata if 'faulty' or 'in_sync'. + * "Deal with Degraded" + * + * deal with degraded array + * We only do this when first noticing the array is degraded. + * This can be when we first see the array, when sync completes or + * when recovery completes. + * + * Check if number of failed devices suggests recovery is needed, and + * skip if not. + * Ask metadata for a spare device + * Add device as not in_sync and give a role + * Update metadata. + * Start recovery. + * + * deal with resync + * This only happens on finding a new array.... + * Maybe this is done by mdadm before passing the array to us? + * + * If array is 'clean' but metadata is 'dirty', start a resync + * and mark array as 'dirty'. + * + * + * + * + * We wait for a change (poll/select) on array_state, sync_action, and + * each rd-X/state file. + * When we get any change, we check everything. So read each state file, + * then decide what to do. + * + * The core action is to write new metadata to all devices in the array. + * This is done at most once on any wakeup. + * After that we might: + * - update the array_state + * - set the role of some devices. 
+ * - request a sync_action + * + */ + +static int read_and_act(struct active_array *a) +{ + int check_degraded; + struct mdinfo *mdi; + + a->next_state = bad_word; + a->next_action = bad_action; + + a->curr_state = read_state(a->info.state_fd); + a->curr_action = read_action(a->action_fd); + for (mdi = a->info.devs; mdi ; mdi = mdi->next) { + mdi->next_state = 0; + mdi->curr_state = read_dev_state(mdi->state_fd); + } + + if (a->curr_state <= inactive && + a->prev_state > inactive) { + /* array has been stopped */ + get_sync_pos(a); + a->container->ss->mark_clean(a, a->sync_pos); + a->next_state = clear; + } + if (a->curr_state == write_pending) { + a->container->ss->mark_dirty(a); + a->next_state = active; + } + if (a->curr_state == active_idle) { + /* Set array to 'clean' FIRST, then + * a->ss->mark_clean(a); + * just ignore for now. + */ + } + + if (a->curr_state == readonly) { + /* Well, I'm ready to handle things, so + * read-auto is OK. FIXME what if we really want + * readonly ??? + */ + a->next_state = read_auto; + } + + if (a->curr_action == idle && + a->prev_action == resync) { + /* check resync_start to see if it is 'max'. + * Do I open here, or have it open the whole time? + */ + get_sync_pos(a); + check_degraded = 1; + } + + if (a->curr_action == idle && + a->prev_action == recover) { + for (mdi = a->info.devs ; mdi ; mdi = mdi->next) { + a->container->ss->set_disk(a, mdi->disk.raid_disk); + if (! 
(mdi->curr_state & DS_INSYNC)) + check_degraded = 1; + } + } + + + for (mdi = a->info.devs ; mdi ; mdi = mdi->next) { + if (mdi->curr_state & DS_FAULTY) { + a->container->ss->set_disk(a, mdi->disk.raid_disk); + check_degraded = 1; + mdi->next_state = DS_REMOVE; + } + } + + if (check_degraded) { + // FIXME; + } + + a->container->ss->sync_metadata(a); + + /* Effect state changes in the array */ + if (a->next_state != bad_word) + write_attr(array_states[a->next_state], a->info.state_fd); + if (a->next_action != bad_action) + write_attr(sync_actions[a->next_action], a->action_fd); + for (mdi = a->info.devs; mdi ; mdi = mdi->next) { + if (mdi->next_state == DS_REMOVE) + write_attr("remove", mdi->state_fd); + if (mdi->next_state & DS_INSYNC) + write_attr("+in_sync", mdi->state_fd); + } + + /* move curr_ to prev_ */ + a->prev_state = a->curr_state; + + a->prev_action = a->curr_action; + + for (mdi = a->info.devs; mdi ; mdi = mdi->next) { + mdi->prev_state = mdi->curr_state; + mdi->next_state = 0; + } + + return 1; +} + +static int wait_and_act(struct active_array *aa, int pfd, int nowait) +{ + fd_set rfds; + int maxfd = 0; + struct active_array *a; + int rv; + + FD_ZERO(&rfds); + + add_fd(&rfds, &maxfd, pfd); + for (a = aa ; a ; a = a->next) { + struct mdinfo *mdi; + + add_fd(&rfds, &maxfd, a->info.state_fd); + add_fd(&rfds, &maxfd, a->action_fd); + for (mdi = a->info.devs ; mdi ; mdi = mdi->next) + add_fd(&rfds, &maxfd, mdi->state_fd); + } + + if (!nowait) { + rv = select(maxfd+1, &rfds, NULL, NULL, NULL); + + if (rv <= 0) + return rv; + + if (FD_ISSET(pfd, &rfds)) { + char buf[4]; + read(pfd, buf, 4); + ; // FIXME read from the pipe + } + } + + for (a = aa; a ; a = a->next) { + if (a->replaces) { + struct active_array **ap; + for (ap = &a->next; *ap && *ap != a->replaces; + ap = & (*ap)->next) + ; + if (*ap) + *ap = (*ap)->next; + discard_this = a->replaces; + a->replaces = NULL; + } + rv += read_and_act(a); + } + return rv; +} + +void do_monitor(struct supertype 
/*
 * Open an attribute file below /sys/block/<md device>/md/.
 *
 * devnum:  md device number; >= 0 selects "mdN", a negative value
 *          selects "md_dN" with N = -1-devnum.
 * devname: optional sub-directory of the md directory (e.g. "rd0"),
 *          or NULL for an attribute of the array itself.
 * attr:    name of the attribute file.
 *
 * The file is opened read/write if possible; if that is refused with
 * EACCES it is opened read-only instead.
 *
 * Returns the open file descriptor, or -1 on failure.  If the path
 * would not fit in the local buffer, -1 is returned with errno set to
 * ENAMETOOLONG and nothing is opened.
 */
int sysfs_open(int devnum, char *devname, char *attr)
{
	char fname[128];
	char sys_name[16];
	int len;
	int fd;

	if (devnum >= 0)
		snprintf(sys_name, sizeof(sys_name), "md%d", devnum);
	else
		snprintf(sys_name, sizeof(sys_name), "md_d%d",
			 -1-devnum);

	/* Build the whole path in one bounded step rather than with
	 * sprintf/strcat, so an over-long name cannot overrun fname.
	 */
	len = snprintf(fname, sizeof(fname), "/sys/block/%s/md/%s%s%s",
		       sys_name,
		       devname ? devname : "",
		       devname ? "/" : "",
		       attr);
	if (len < 0 || (size_t)len >= sizeof(fname)) {
		errno = ENAMETOOLONG;
		return -1;
	}

	fd = open(fname, O_RDWR);
	/* errno holds positive error numbers, so the test is against
	 * EACCES, not -EACCES.
	 */
	if (fd < 0 && errno == EACCES)
		fd = open(fname, O_RDONLY);
	return fd;
}