X-Git-Url: http://git.ipfire.org/?p=thirdparty%2Fmdadm.git;a=blobdiff_plain;f=Monitor.c;h=32181e45ed63100c672e5dc32ddcbd84f6400f29;hp=bdab9427bd87f5226a00a73202688a8fc319de34;hb=e0d1903663dac9307a37646c26abf7991b0a9593;hpb=11a3e71da434939895cc504e20e735eb656b1c74 diff --git a/Monitor.c b/Monitor.c index bdab9427..32181e45 100644 --- a/Monitor.c +++ b/Monitor.c @@ -30,13 +30,22 @@ #include "mdadm.h" #include "md_p.h" #include "md_u.h" +#include #include static void alert(char *event, char *dev, char *disc, char *mailaddr, char *cmd); +static char *percentalerts[] = { + "RebuildStarted", + "Rebuild20", + "Rebuild40", + "Rebuild60", + "Rebuild80", +}; + int Monitor(mddev_dev_t devlist, char *mailaddr, char *alert_cmd, - int period, + int period, int scan, char *config) { /* @@ -48,13 +57,27 @@ int Monitor(mddev_dev_t devlist, * Update time * active/working/failed/spare drives * State of each device. + * %rebuilt if rebuilding * * If the update time changes, check out all the data again * It is possible that we cannot get the state of each device * due to bugs in the md kernel module. + * We also read /proc/mdstat to get rebuild percent, + * and to get state on all active devices incase of kernel bug. * - * if active_drives decreases, generate a "Fail" event - * if active_drives increases, generate a "SpareActive" event + * Events are: + * Fail + * An active device had Faulty set or Active/Sync removed + * FailSpare + * A spare device had Faulty set + * SpareActive + * An active device had a reverse transition + * RebuildStarted + * percent went from -1 to +ve + * Rebuild20 Rebuild40 Rebuild60 Rebuild80 + * percent went from below to not-below that number + * DeviceDisappeared + * Couldn't access a device which was previously visible * * if we detect an array with activenext) { + struct state *st = malloc(sizeof *st); + if (st == NULL) + continue; + st->devname = strdup(mdlist->devname); + st->utime = 0; + st->next = statelist; + st->err = 1; + st->devnum = -1; + st->percent = -2; + if (mdlist->spare_group) + st->spare_group = strdup(mdlist->spare_group); + else + st->spare_group = NULL; + statelist = st; + } + } else { mddev_dev_t dv; - int dnum=0; - if (devlist== NULL) - mdlist = conf_get_ident(config, NULL); - dv = devlist; - while (dv || mdlist) { - mddev_ident_t mdident; - struct state *st; + for (dv=devlist ; dv; dv=dv->next) { + struct state *st = malloc(sizeof *st); + if (st == NULL) + continue; + st->devname = strdup(dv->devname); + st->utime = 0; + st->next = statelist; + st->err = 1; + st->devnum = -1; + st->percent = -2; + st->spare_group = NULL; + statelist = st; + } + } + + + while (! finished) { + struct state *st; + + if (mdstat) + free_mdstat(mdstat); + mdstat = mdstat_read(); + + for (st=statelist; st; st=st->next) { mdu_array_info_t array; - char *dev; + struct mdstat_ent *mse; + char *dev = st->devname; int fd; - char *event = NULL; int i; - char *event_disc = NULL; - if (dv) { - dev = dv->devname; - mdident = conf_get_ident(config, dev); - dv = dv->next; - } else { - mdident = mdlist; - dev = mdident->devname; - mdlist = mdlist->next; - } - for (st=statelist; st ; st=st->next) - if (strcmp(st->devname, dev)==0) - break; - if (!st) { - st =malloc(sizeof *st); - if (st == NULL) - continue; - st->devname = strdup(dev); - st->utime = 0; - st->next = statelist; - st->err = 0; - statelist = st; - } + fd = open(dev, O_RDONLY); if (fd < 0) { if (!st->err) - fprintf(stderr, Name ": cannot open %s: %s\n", + alert("DeviceDisappeared", dev, NULL, + mailaddr, alert_cmd); +/* fprintf(stderr, Name ": cannot open %s: %s\n", dev, strerror(errno)); - st->err=1; +*/ st->err=1; continue; } if (ioctl(fd, GET_ARRAY_INFO, &array)<0) { if (!st->err) - fprintf(stderr, Name ": cannot get array info for %s: %s\n", + alert("DeviceDisappeared", dev, NULL, + mailaddr, alert_cmd); +/* fprintf(stderr, Name ": cannot get array info for %s: %s\n", dev, strerror(errno)); - st->err=1; +*/ st->err=1; close(fd); continue; } - st->err = 0; - + if (st->devnum < 0) { + struct stat stb; + if (fstat(fd, &stb) == 0 && + (S_IFMT&stb.st_mode)==S_IFBLK) + st->devnum = MINOR(stb.st_rdev); + } + + for (mse = mdstat ; mse ; mse=mse->next) + if (mse->devnum == st->devnum) { + mse->devnum = -1; /* flag it as "used" */ + break; + } + if (st->utime == array.utime && - st->failed == array.failed_disks) { + st->failed == array.failed_disks && + st->working == array.working_disks && + st->spare == array.spare_disks && + (mse == NULL || ( + mse->percent == st->percent + ))) { close(fd); + st->err = 0; continue; } - event = NULL; - if (st->utime) { - int i; - if (st->active > array.active_disks) - event = "Fail"; - else if (st->working > array.working_disks) - event = "FailSpare"; - else if (st->active < array.active_disks) - event = "ActiveSpare"; - } - for (i=0; ipercent == -1 && + mse->percent >= 0) + alert("RebuildStarted", dev, NULL, mailaddr, alert_cmd); + if (mse && + st->percent >= 0 && + mse->percent >= 0 && + (mse->percent / 20) > (st->percent / 20)) + alert(percentalerts[mse->percent/20], + dev, NULL, mailaddr, alert_cmd); + + if (mse) + st->percent = mse->percent; + + for (i=0; i= 0) { - if (event && event_disc == NULL && - st->devstate[i] != disc.state) { - char * dv = map_dev(disc.major, disc.minor); - if (dv) - event_disc = strdup(dv); + newstate = disc.state; + dv = map_dev(disc.major, disc.minor); + } else if (mse && i < strlen(mse->pattern)) + switch(mse->pattern[i]) { + case 'U': newstate = 6 /* ACTIVE/SYNC */; break; + case '_': newstate = 0; break; } - st->devstate[i] = disc.state; + change = newstate ^ st->devstate[i]; + if (st->utime && change && !st->err) { + if (i < array.raid_disks && + (((newstate&change)&(1<devstate[i]&change)&(1<devstate[i]&change)&(1<=array.raid_disks && + (disc.major || disc.minor) && + st->devid[i] == MKDEV(disc.major, disc.minor) && + ((newstate&change)&(1<devstate[i]&change)&(1<devstate[i] = disc.state; + st->devid[i] = MKDEV(disc.major, disc.minor); } close(fd); st->active = array.active_disks; @@ -163,9 +264,78 @@ int Monitor(mddev_dev_t devlist, st->spare = array.spare_disks; st->failed = array.failed_disks; st->utime = array.utime; - if (event) - alert(event, dev, event_disc, mailaddr, alert_cmd); + st->raid = array.raid_disks; + st->err = 0; } + /* now check if there are any new devices found in mdstat */ + if (scan) { + struct mdstat_ent *mse; + for (mse=mdstat; mse; mse=mse->next) + if (mse->devnum > 0) { + struct state *st = malloc(sizeof *st); + if (st == NULL) + continue; + st->devname = strdup(get_md_name(mse->devnum)); + st->utime = 0; + st->next = statelist; + st->err = 1; + st->devnum = mse->devnum; + st->percent = -2; + st->spare_group = NULL; + statelist = st; + alert("NewArray", st->devname, NULL, mailaddr, alert_cmd); + } + } + /* If an array has active < raid && spare == 0 && spare_group != NULL + * Look for another array with spare > 0 and active == raid and same spare_group + * if found, choose a device and hotremove/hotadd + */ + for (st = statelist; st; st=st->next) + if (st->active < st->raid && + st->spare == 0 && + st->spare_group != NULL) { + struct state *st2; + for (st2=statelist ; st2 ; st2=st2->next) + if (st2 != st && + st2->spare > 0 && + st2->active == st2->raid && + st2->spare_group != NULL && + strcmp(st->spare_group, st2->spare_group) == 0) { + /* try to remove and add */ + int fd1 = open(st->devname, O_RDONLY); + int fd2 = open(st2->devname, O_RDONLY); + int dev = -1; + int d; + if (fd1 < 0 || fd2 < 0) { + if (fd1>=0) close(fd1); + if (fd2>=0) close(fd2); + continue; + } + for (d=st2->raid; ddevid[d] > 0 && + st2->devstate[d] == 0) { + dev = st2->devid[d]; + break; + } + } + if (dev > 0) { + if (ioctl(fd2, HOT_REMOVE_DISK, + (unsigned long)dev) == 0) { + if (ioctl(fd1, HOT_ADD_DISK, + (unsigned long)dev) == 0) { + alert("MoveSpare", st->devname, st2->devname, mailaddr, alert_cmd); + close(fd1); + close(fd2); + break; + } + else ioctl(fd2, HOT_ADD_DISK, (unsigned long) dev); + } + } + close(fd1); + close(fd2); + } + } + sleep(period); } return 0; @@ -177,7 +347,7 @@ static void alert(char *event, char *dev, char *disc, char *mailaddr, char *cmd) if (!cmd && !mailaddr) { time_t now = time(0); - printf("%0.15s: %s on %s %s\n", ctime(&now)+4, event, dev, disc?disc:"unknown device"); + printf("%1.15s: %s on %s %s\n", ctime(&now)+4, event, dev, disc?disc:"unknown device"); } if (cmd) { int pid = fork();