]>
git.ipfire.org Git - thirdparty/mdadm.git/blob - Monitor.c
2 * mdadm - manage Linux "md" devices aka RAID arrays.
4 * Copyright (C) 2001-2002 Neil Brown <neilb@cse.unsw.edu.au>
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 * Email: <neilb@cse.unsw.edu.au>
24 * School of Computer Science and Engineering
25 * The University of New South Wales
34 #include <sys/signal.h>
36 static void alert(char *event
, char *dev
, char *disc
, char *mailaddr
, char *cmd
);
38 static char *percentalerts
[] = {
46 int Monitor(mddev_dev_t devlist
,
47 char *mailaddr
, char *alert_cmd
,
48 int period
, int daemonise
, int scan
,
52 * Every few seconds, scan every md device looking for changes
53 * When a change is found, log it, possibly run the alert command,
54 * and possibly send Email
56 * For each array, we record:
58 * active/working/failed/spare drives
59 * State of each device.
60 * %rebuilt if rebuilding
62 * If the update time changes, check out all the data again
63 * It is possible that we cannot get the state of each device
64 * due to bugs in the md kernel module.
65 * We also read /proc/mdstat to get rebuild percent,
66 * and to get state on all active devices in case of kernel bug.
70 * An active device had Faulty set or Active/Sync removed
72 * A spare device had Faulty set
74 * An active device had a reverse transition
76 * percent went from -1 to +ve
77 * Rebuild20 Rebuild40 Rebuild60 Rebuild80
78 * percent went from below to not-below that number
80 * Couldn't access a device which was previously visible
82 * if we detect an array with active<raid and spare==0
83 * we look at other arrays that have same spare-group
84 * If we find one with active==raid and spare>0,
85 * and if we can get_disk_info and find a name
86 * Then we hot-remove and hot-add to the other array
88 * If devlist is NULL, then we can monitor everything because --scan
89 * was given. We get an initial list from config file and add anything
90 * that appears in /proc/mdstat
95 int devnum
; /* to sync with mdstat info */
99 int active
, working
, failed
, spare
, raid
;
100 int devstate
[MD_SB_DISKS
];
101 int devid
[MD_SB_DISKS
];
106 struct mdstat_ent
*mdstat
= NULL
;
109 mailaddr
= conf_get_mailaddr(config
);
110 if (mailaddr
&& ! scan
)
111 fprintf(stderr
, Name
": Monitor using email address \"%s\" from config file\n",
115 alert_cmd
= conf_get_program(config
);
116 if (alert_cmd
&& ! scan
)
117 fprintf(stderr
, Name
": Monitor using program \"%s\" from config file\n",
120 if (scan
&& !mailaddr
&& !alert_cmd
) {
121 fprintf(stderr
, Name
": No mail address or alert command - not monitoring.\n");
136 open("/dev/null", 3);
142 if (devlist
== NULL
) {
143 mddev_ident_t mdlist
= conf_get_ident(config
, NULL
);
144 for (; mdlist
; mdlist
=mdlist
->next
) {
145 struct state
*st
= malloc(sizeof *st
);
148 st
->devname
= strdup(mdlist
->devname
);
150 st
->next
= statelist
;
154 if (mdlist
->spare_group
)
155 st
->spare_group
= strdup(mdlist
->spare_group
);
157 st
->spare_group
= NULL
;
162 for (dv
=devlist
; dv
; dv
=dv
->next
) {
163 struct state
*st
= malloc(sizeof *st
);
166 st
->devname
= strdup(dv
->devname
);
168 st
->next
= statelist
;
172 st
->spare_group
= NULL
;
183 mdstat
= mdstat_read();
185 for (st
=statelist
; st
; st
=st
->next
) {
186 mdu_array_info_t array
;
187 struct mdstat_ent
*mse
;
188 char *dev
= st
->devname
;
192 fd
= open(dev
, O_RDONLY
);
195 alert("DeviceDisappeared", dev
, NULL
,
196 mailaddr
, alert_cmd
);
197 /* fprintf(stderr, Name ": cannot open %s: %s\n",
198 dev, strerror(errno));
202 if (ioctl(fd
, GET_ARRAY_INFO
, &array
)<0) {
204 alert("DeviceDisappeared", dev
, NULL
,
205 mailaddr
, alert_cmd
);
206 /* fprintf(stderr, Name ": cannot get array info for %s: %s\n",
207 dev, strerror(errno));
212 if (array
.level
!= 1 && array
.level
!= 5 && array
.level
!= -4) {
214 alert("DeviceDisappeared", dev
, "Wrong-Level",
215 mailaddr
, alert_cmd
);
220 if (st
->devnum
< 0) {
222 if (fstat(fd
, &stb
) == 0 &&
223 (S_IFMT
&stb
.st_mode
)==S_IFBLK
)
224 st
->devnum
= MINOR(stb
.st_rdev
);
227 for (mse
= mdstat
; mse
; mse
=mse
->next
)
228 if (mse
->devnum
== st
->devnum
) {
229 mse
->devnum
= -1; /* flag it as "used" */
233 if (st
->utime
== array
.utime
&&
234 st
->failed
== array
.failed_disks
&&
235 st
->working
== array
.working_disks
&&
236 st
->spare
== array
.spare_disks
&&
238 mse
->percent
== st
->percent
247 alert("RebuildStarted", dev
, NULL
, mailaddr
, alert_cmd
);
251 (mse
->percent
/ 20) > (st
->percent
/ 20))
252 alert(percentalerts
[mse
->percent
/20],
253 dev
, NULL
, mailaddr
, alert_cmd
);
256 st
->percent
= mse
->percent
;
258 for (i
=0; i
<MD_SB_DISKS
; i
++) {
259 mdu_disk_info_t disc
;
264 if (ioctl(fd
, GET_DISK_INFO
, &disc
)>= 0) {
265 newstate
= disc
.state
;
266 dv
= map_dev(disc
.major
, disc
.minor
);
267 } else if (mse
&& mse
->pattern
&& i
< strlen(mse
->pattern
))
268 switch(mse
->pattern
[i
]) {
269 case 'U': newstate
= 6 /* ACTIVE/SYNC */; break;
270 case '_': newstate
= 0; break;
272 change
= newstate
^ st
->devstate
[i
];
273 if (st
->utime
&& change
&& !st
->err
) {
274 if (i
< array
.raid_disks
&&
275 (((newstate
&change
)&(1<<MD_DISK_FAULTY
)) ||
276 ((st
->devstate
[i
]&change
)&(1<<MD_DISK_ACTIVE
)) ||
277 ((st
->devstate
[i
]&change
)&(1<<MD_DISK_SYNC
)))
279 alert("Fail", dev
, dv
, mailaddr
, alert_cmd
);
280 else if (i
>=array
.raid_disks
&&
281 (disc
.major
|| disc
.minor
) &&
282 st
->devid
[i
] == MKDEV(disc
.major
, disc
.minor
) &&
283 ((newstate
&change
)&(1<<MD_DISK_FAULTY
))
285 alert("FailSpare", dev
, dv
, mailaddr
, alert_cmd
);
286 else if (i
< array
.raid_disks
&&
287 (((st
->devstate
[i
]&change
)&(1<<MD_DISK_FAULTY
)) ||
288 ((newstate
&change
)&(1<<MD_DISK_ACTIVE
)) ||
289 ((newstate
&change
)&(1<<MD_DISK_SYNC
)))
291 alert("SpareActive", dev
, dv
, mailaddr
, alert_cmd
);
293 st
->devstate
[i
] = disc
.state
;
294 st
->devid
[i
] = MKDEV(disc
.major
, disc
.minor
);
297 st
->active
= array
.active_disks
;
298 st
->working
= array
.working_disks
;
299 st
->spare
= array
.spare_disks
;
300 st
->failed
= array
.failed_disks
;
301 st
->utime
= array
.utime
;
302 st
->raid
= array
.raid_disks
;
305 /* now check if there are any new devices found in mdstat */
307 struct mdstat_ent
*mse
;
308 for (mse
=mdstat
; mse
; mse
=mse
->next
)
309 if (mse
->devnum
>= 0 &&
310 (strcmp(mse
->level
, "raid1")==0 ||
311 strcmp(mse
->level
, "raid5")==0 ||
312 strcmp(mse
->level
, "multipath")==0)
314 struct state
*st
= malloc(sizeof *st
);
317 st
->devname
= strdup(get_md_name(mse
->devnum
));
319 st
->next
= statelist
;
321 st
->devnum
= mse
->devnum
;
323 st
->spare_group
= NULL
;
325 alert("NewArray", st
->devname
, NULL
, mailaddr
, alert_cmd
);
328 /* If an array has active < raid && spare == 0 && spare_group != NULL
329 * Look for another array with spare > 0 and active == raid and same spare_group
330 * if found, choose a device and hotremove/hotadd
332 for (st
= statelist
; st
; st
=st
->next
)
333 if (st
->active
< st
->raid
&&
335 st
->spare_group
!= NULL
) {
337 for (st2
=statelist
; st2
; st2
=st2
->next
)
340 st2
->active
== st2
->raid
&&
341 st2
->spare_group
!= NULL
&&
342 strcmp(st
->spare_group
, st2
->spare_group
) == 0) {
343 /* try to remove and add */
344 int fd1
= open(st
->devname
, O_RDONLY
);
345 int fd2
= open(st2
->devname
, O_RDONLY
);
348 if (fd1
< 0 || fd2
< 0) {
349 if (fd1
>=0) close(fd1
);
350 if (fd2
>=0) close(fd2
);
353 for (d
=st2
->raid
; d
<MD_SB_DISKS
; d
++) {
354 if (st2
->devid
[d
] > 0 &&
355 st2
->devstate
[d
] == 0) {
361 if (ioctl(fd2
, HOT_REMOVE_DISK
,
362 (unsigned long)dev
) == 0) {
363 if (ioctl(fd1
, HOT_ADD_DISK
,
364 (unsigned long)dev
) == 0) {
365 alert("MoveSpare", st
->devname
, st2
->devname
, mailaddr
, alert_cmd
);
370 else ioctl(fd2
, HOT_ADD_DISK
, (unsigned long) dev
);
384 static void alert(char *event
, char *dev
, char *disc
, char *mailaddr
, char *cmd
)
386 if (!cmd
&& !mailaddr
) {
387 time_t now
= time(0);
389 printf("%1.15s: %s on %s %s\n", ctime(&now
)+4, event
, dev
, disc
?disc
:"unknown device");
395 waitpid(pid
, NULL
, 0);
400 execl(cmd
, cmd
, event
, dev
, disc
, NULL
);
404 if (mailaddr
&& strncmp(event
, "Fail", 4)==0) {
405 FILE *mp
= popen(Sendmail
, "w");
408 gethostname(hname
, sizeof(hname
));
409 signal(SIGPIPE
, SIG_IGN
);
410 fprintf(mp
, "From: " Name
" monitoring <root>\n");
411 fprintf(mp
, "To: %s\n", mailaddr
);
412 fprintf(mp
, "Subject: %s event on %s:%s\n\n", event
, dev
, hname
);
414 fprintf(mp
, "This is an automatically generated mail message from " Name
"\n");
415 fprintf(mp
, "running on %s\n\n", hname
);
417 fprintf(mp
, "A %s event had been detected on md device %s.\n\n", event
, dev
);
420 fprintf(mp
, "It could be related to component device %s.\n\n", disc
);
422 fprintf(mp
, "Faithfully yours, etc.\n");
427 /* FIXME log the event to syslog maybe */