]> git.ipfire.org Git - thirdparty/mdadm.git/blame - Monitor.c
mdadm-0.7.2
[thirdparty/mdadm.git] / Monitor.c
CommitLineData
/*
 * mdadm - manage Linux "md" devices aka RAID arrays.
 *
 * Copyright (C) 2001-2002 Neil Brown <neilb@cse.unsw.edu.au>
 *
 *
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 *    Author: Neil Brown
 *    Email: <neilb@cse.unsw.edu.au>
 *    Paper: Neil Brown
 *           School of Computer Science and Engineering
 *           The University of New South Wales
 *           Sydney, 2052
 *           Australia
 */
29
9a9dab36 30#include "mdadm.h"
52826846
NB
31#include "md_p.h"
32#include "md_u.h"
33#include <sys/signal.h>
34
/* Report one event for array 'dev'; 'disc' may be NULL when no sub-device is known. */
static void alert(char *event, char *dev, char *disc, char *mailaddr, char *cmd);
36
cd29a5c8 37int Monitor(mddev_dev_t devlist,
52826846
NB
38 char *mailaddr, char *alert_cmd,
39 int period,
40 char *config)
41{
42 /*
43 * Every few seconds, scan every md device looking for changes
44 * When a change is found, log it, possibly run the alert command,
45 * and possibly send Email
46 *
47 * For each array, we record:
48 * Update time
49 * active/working/failed/spare drives
50 * State of each device.
51 *
52 * If the update time changes, check out all the data again
53 * It is possible that we cannot get the state of each device
54 * due to bugs in the md kernel module.
55 *
56 * if active_drives decreases, generate a "Fail" event
57 * if active_drives increases, generate a "SpareActive" event
58 *
59 * if we detect an array with active<raid and spare==0
60 * we look at other arrays that have same spare-group
61 * If we find one with active==raid and spare>0,
62 * and if we can get_disk_info and find a name
63 * Then we hot-remove and hot-add to the other array
64 *
65 */
66
67 struct state {
68 char *devname;
69 long utime;
70 int err;
71 int active, working, failed, spare;
72 int devstate[MD_SB_DISKS];
73 struct state *next;
74 } *statelist = NULL;
75 int finished = 0;
76 while (! finished) {
77 mddev_ident_t mdlist = NULL;
cd29a5c8 78 mddev_dev_t dv;
52826846 79 int dnum=0;
cd29a5c8 80 if (devlist== NULL)
52826846 81 mdlist = conf_get_ident(config, NULL);
cd29a5c8
NB
82 dv = devlist;
83 while (dv || mdlist) {
52826846
NB
84 mddev_ident_t mdident;
85 struct state *st;
86 mdu_array_info_t array;
87 char *dev;
88 int fd;
89 char *event = NULL;
90 int i;
91 char *event_disc = NULL;
cd29a5c8
NB
92 if (dv) {
93 dev = dv->devname;
52826846 94 mdident = conf_get_ident(config, dev);
cd29a5c8 95 dv = dv->next;
52826846
NB
96 } else {
97 mdident = mdlist;
98 dev = mdident->devname;
99 mdlist = mdlist->next;
100 }
101 for (st=statelist; st ; st=st->next)
102 if (strcmp(st->devname, dev)==0)
103 break;
104 if (!st) {
105 st =malloc(sizeof *st);
106 if (st == NULL)
107 continue;
108 st->devname = strdup(dev);
109 st->utime = 0;
110 st->next = statelist;
111 st->err = 0;
112 statelist = st;
113 }
114 fd = open(dev, O_RDONLY);
115 if (fd < 0) {
116 if (!st->err)
117 fprintf(stderr, Name ": cannot open %s: %s\n",
118 dev, strerror(errno));
119 st->err=1;
120 continue;
121 }
122 if (ioctl(fd, GET_ARRAY_INFO, &array)<0) {
123 if (!st->err)
124 fprintf(stderr, Name ": cannot get array info for %s: %s\n",
125 dev, strerror(errno));
126 st->err=1;
127 close(fd);
128 continue;
129 }
130 st->err = 0;
131
132 if (st->utime == array.utime &&
133 st->failed == array.failed_disks) {
134 close(fd);
135 continue;
136 }
137 event = NULL;
138 if (st->utime) {
139 int i;
140 if (st->active > array.active_disks)
141 event = "Fail";
142 else if (st->working > array.working_disks)
143 event = "FailSpare";
144 else if (st->active < array.active_disks)
145 event = "ActiveSpare";
146 }
147 for (i=0; i<array.raid_disks+array.spare_disks; i++) {
148 mdu_disk_info_t disc;
149 disc.number = i;
150 if (ioctl(fd, GET_DISK_INFO, &disc)>= 0) {
151 if (event && event_disc == NULL &&
152 st->devstate[i] != disc.state) {
153 char * dv = map_dev(disc.major, disc.minor);
154 if (dv)
155 event_disc = strdup(dv);
156 }
157 st->devstate[i] = disc.state;
158 }
159 }
160 close(fd);
161 st->active = array.active_disks;
162 st->working = array.working_disks;
163 st->spare = array.spare_disks;
164 st->failed = array.failed_disks;
165 st->utime = array.utime;
166 if (event)
167 alert(event, dev, event_disc, mailaddr, alert_cmd);
168 }
169 sleep(period);
170 }
171 return 0;
172}
173
174
175static void alert(char *event, char *dev, char *disc, char *mailaddr, char *cmd)
176{
cd29a5c8
NB
177 if (!cmd && !mailaddr) {
178 time_t now = time(0);
179
180 printf("%0.15s: %s on %s %s\n", ctime(&now)+4, event, dev, disc?disc:"unknown device");
181 }
52826846
NB
182 if (cmd) {
183 int pid = fork();
184 switch(pid) {
185 default:
186 waitpid(pid, NULL, 0);
187 break;
188 case -1:
189 break;
190 case 0:
191 execl(cmd, cmd, event, dev, disc, NULL);
192 exit(2);
193 }
194 }
195 if (mailaddr && strncmp(event, "Fail", 4)==0) {
196 FILE *mp = popen(Sendmail, "w");
197 if (mp) {
198 char hname[256];
199 gethostname(hname, sizeof(hname));
200 signal(SIGPIPE, SIG_IGN);
201 fprintf(mp, "From: " Name " monitoring <root>\n");
202 fprintf(mp, "To: %s\n", mailaddr);
203 fprintf(mp, "Subject: %s event on %s:%s\n\n", event, dev, hname);
204
205 fprintf(mp, "This is an automatically generated mail message from " Name "\n");
206 fprintf(mp, "running on %s\n\n", hname);
207
208 fprintf(mp, "A %s event had been detected on md device %s.\n\n", event, dev);
209
210 if (disc)
211 fprintf(mp, "It could be related to sub-device %s.\n\n", disc);
212
213 fprintf(mp, "Faithfully yours, etc.\n");
214 fclose(mp);
215 }
216
217 }
218 /* FIXME log the event to syslog maybe */
219}