]> git.ipfire.org Git - thirdparty/mdadm.git/blob - Monitor.c
968e4b3e4a4fe844feba4941ccd85520d1461c19
[thirdparty/mdadm.git] / Monitor.c
1 /*
2 * mdctl - manage Linux "md" devices aka RAID arrays.
3 *
4 * Copyright (C) 2001 Neil Brown <neilb@cse.unsw.edu.au>
5 *
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 *
21 * Author: Neil Brown
22 * Email: <neilb@cse.unsw.edu.au>
23 * Paper: Neil Brown
24 * School of Computer Science and Engineering
25 * The University of New South Wales
26 * Sydney, 2052
27 * Australia
28 */
29
30 #include "mdctl.h"
31 #include "md_p.h"
32 #include "md_u.h"
33 #include <sys/signal.h>
34
35 static void alert(char *event, char *dev, char *disc, char *mailaddr, char *cmd);
36
37 int Monitor(int num_devs, char *devlist[],
38 char *mailaddr, char *alert_cmd,
39 int period,
40 char *config)
41 {
42 /*
43 * Every few seconds, scan every md device looking for changes
44 * When a change is found, log it, possibly run the alert command,
45 * and possibly send Email
46 *
47 * For each array, we record:
48 * Update time
49 * active/working/failed/spare drives
50 * State of each device.
51 *
52 * If the update time changes, check out all the data again
53 * It is possible that we cannot get the state of each device
54 * due to bugs in the md kernel module.
55 *
56 * if active_drives decreases, generate a "Fail" event
57 * if active_drives increases, generate a "SpareActive" event
58 *
59 * if we detect an array with active<raid and spare==0
60 * we look at other arrays that have same spare-group
61 * If we find one with active==raid and spare>0,
62 * and if we can get_disk_info and find a name
63 * Then we hot-remove and hot-add to the other array
64 *
65 */
66
67 struct state {
68 char *devname;
69 long utime;
70 int err;
71 int active, working, failed, spare;
72 int devstate[MD_SB_DISKS];
73 struct state *next;
74 } *statelist = NULL;
75 int finished = 0;
76 while (! finished) {
77 mddev_ident_t mdlist = NULL;
78 int dnum=0;
79 if (num_devs == 0)
80 mdlist = conf_get_ident(config, NULL);
81 while (dnum < num_devs || mdlist) {
82 mddev_ident_t mdident;
83 struct state *st;
84 mdu_array_info_t array;
85 char *dev;
86 int fd;
87 char *event = NULL;
88 int i;
89 char *event_disc = NULL;
90 if (num_devs) {
91 dev = devlist[dnum++];
92 mdident = conf_get_ident(config, dev);
93 } else {
94 mdident = mdlist;
95 dev = mdident->devname;
96 mdlist = mdlist->next;
97 }
98 for (st=statelist; st ; st=st->next)
99 if (strcmp(st->devname, dev)==0)
100 break;
101 if (!st) {
102 st =malloc(sizeof *st);
103 if (st == NULL)
104 continue;
105 st->devname = strdup(dev);
106 st->utime = 0;
107 st->next = statelist;
108 st->err = 0;
109 statelist = st;
110 }
111 fd = open(dev, O_RDONLY);
112 if (fd < 0) {
113 if (!st->err)
114 fprintf(stderr, Name ": cannot open %s: %s\n",
115 dev, strerror(errno));
116 st->err=1;
117 continue;
118 }
119 if (ioctl(fd, GET_ARRAY_INFO, &array)<0) {
120 if (!st->err)
121 fprintf(stderr, Name ": cannot get array info for %s: %s\n",
122 dev, strerror(errno));
123 st->err=1;
124 close(fd);
125 continue;
126 }
127 st->err = 0;
128
129 if (st->utime == array.utime &&
130 st->failed == array.failed_disks) {
131 close(fd);
132 continue;
133 }
134 event = NULL;
135 if (st->utime) {
136 int i;
137 if (st->active > array.active_disks)
138 event = "Fail";
139 else if (st->working > array.working_disks)
140 event = "FailSpare";
141 else if (st->active < array.active_disks)
142 event = "ActiveSpare";
143 }
144 for (i=0; i<array.raid_disks+array.spare_disks; i++) {
145 mdu_disk_info_t disc;
146 disc.number = i;
147 if (ioctl(fd, GET_DISK_INFO, &disc)>= 0) {
148 if (event && event_disc == NULL &&
149 st->devstate[i] != disc.state) {
150 char * dv = map_dev(disc.major, disc.minor);
151 if (dv)
152 event_disc = strdup(dv);
153 }
154 st->devstate[i] = disc.state;
155 }
156 }
157 close(fd);
158 st->active = array.active_disks;
159 st->working = array.working_disks;
160 st->spare = array.spare_disks;
161 st->failed = array.failed_disks;
162 st->utime = array.utime;
163 if (event)
164 alert(event, dev, event_disc, mailaddr, alert_cmd);
165 }
166 sleep(period);
167 }
168 return 0;
169 }
170
171
172 static void alert(char *event, char *dev, char *disc, char *mailaddr, char *cmd)
173 {
174 if (cmd) {
175 int pid = fork();
176 switch(pid) {
177 default:
178 waitpid(pid, NULL, 0);
179 break;
180 case -1:
181 break;
182 case 0:
183 execl(cmd, cmd, event, dev, disc, NULL);
184 exit(2);
185 }
186 }
187 if (mailaddr && strncmp(event, "Fail", 4)==0) {
188 FILE *mp = popen(Sendmail, "w");
189 if (mp) {
190 char hname[256];
191 gethostname(hname, sizeof(hname));
192 signal(SIGPIPE, SIG_IGN);
193 fprintf(mp, "From: " Name " monitoring <root>\n");
194 fprintf(mp, "To: %s\n", mailaddr);
195 fprintf(mp, "Subject: %s event on %s:%s\n\n", event, dev, hname);
196
197 fprintf(mp, "This is an automatically generated mail message from " Name "\n");
198 fprintf(mp, "running on %s\n\n", hname);
199
200 fprintf(mp, "A %s event had been detected on md device %s.\n\n", event, dev);
201
202 if (disc)
203 fprintf(mp, "It could be related to sub-device %s.\n\n", disc);
204
205 fprintf(mp, "Faithfully yours, etc.\n");
206 fclose(mp);
207 }
208
209 }
210 /* FIXME log the event to syslog maybe */
211 }