]>
Commit | Line | Data |
---|---|---|
52826846 | 1 | /* |
9a9dab36 | 2 | * mdadm - manage Linux "md" devices aka RAID arrays. |
52826846 | 3 | * |
cd29a5c8 | 4 | * Copyright (C) 2001-2002 Neil Brown <neilb@cse.unsw.edu.au> |
52826846 NB |
5 | * |
6 | * | |
7 | * This program is free software; you can redistribute it and/or modify | |
8 | * it under the terms of the GNU General Public License as published by | |
9 | * the Free Software Foundation; either version 2 of the License, or | |
10 | * (at your option) any later version. | |
11 | * | |
12 | * This program is distributed in the hope that it will be useful, | |
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
15 | * GNU General Public License for more details. | |
16 | * | |
17 | * You should have received a copy of the GNU General Public License | |
18 | * along with this program; if not, write to the Free Software | |
19 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
20 | * | |
21 | * Author: Neil Brown | |
22 | * Email: <neilb@cse.unsw.edu.au> | |
23 | * Paper: Neil Brown | |
24 | * School of Computer Science and Engineering | |
25 | * The University of New South Wales | |
26 | * Sydney, 2052 | |
27 | * Australia | |
28 | */ | |
29 | ||
9a9dab36 | 30 | #include "mdadm.h" |
52826846 NB |
31 | #include "md_p.h" |
32 | #include "md_u.h" | |
33 | #include <sys/signal.h> | |
34 | ||
35 | static void alert(char *event, char *dev, char *disc, char *mailaddr, char *cmd); | |
36 | ||
cd29a5c8 | 37 | int Monitor(mddev_dev_t devlist, |
52826846 NB |
38 | char *mailaddr, char *alert_cmd, |
39 | int period, | |
40 | char *config) | |
41 | { | |
42 | /* | |
43 | * Every few seconds, scan every md device looking for changes | |
44 | * When a change is found, log it, possibly run the alert command, | |
45 | * and possibly send Email | |
46 | * | |
47 | * For each array, we record: | |
48 | * Update time | |
49 | * active/working/failed/spare drives | |
50 | * State of each device. | |
51 | * | |
52 | * If the update time changes, check out all the data again | |
53 | * It is possible that we cannot get the state of each device | |
54 | * due to bugs in the md kernel module. | |
55 | * | |
56 | * if active_drives decreases, generate a "Fail" event | |
57 | * if active_drives increases, generate a "SpareActive" event | |
58 | * | |
59 | * if we detect an array with active<raid and spare==0 | |
60 | * we look at other arrays that have same spare-group | |
61 | * If we find one with active==raid and spare>0, | |
62 | * and if we can get_disk_info and find a name | |
63 | * Then we hot-remove and hot-add to the other array | |
64 | * | |
65 | */ | |
66 | ||
67 | struct state { | |
68 | char *devname; | |
69 | long utime; | |
70 | int err; | |
71 | int active, working, failed, spare; | |
72 | int devstate[MD_SB_DISKS]; | |
73 | struct state *next; | |
74 | } *statelist = NULL; | |
75 | int finished = 0; | |
76 | while (! finished) { | |
77 | mddev_ident_t mdlist = NULL; | |
cd29a5c8 | 78 | mddev_dev_t dv; |
52826846 | 79 | int dnum=0; |
cd29a5c8 | 80 | if (devlist== NULL) |
52826846 | 81 | mdlist = conf_get_ident(config, NULL); |
cd29a5c8 NB |
82 | dv = devlist; |
83 | while (dv || mdlist) { | |
52826846 NB |
84 | mddev_ident_t mdident; |
85 | struct state *st; | |
86 | mdu_array_info_t array; | |
87 | char *dev; | |
88 | int fd; | |
89 | char *event = NULL; | |
90 | int i; | |
91 | char *event_disc = NULL; | |
cd29a5c8 NB |
92 | if (dv) { |
93 | dev = dv->devname; | |
52826846 | 94 | mdident = conf_get_ident(config, dev); |
cd29a5c8 | 95 | dv = dv->next; |
52826846 NB |
96 | } else { |
97 | mdident = mdlist; | |
98 | dev = mdident->devname; | |
99 | mdlist = mdlist->next; | |
100 | } | |
101 | for (st=statelist; st ; st=st->next) | |
102 | if (strcmp(st->devname, dev)==0) | |
103 | break; | |
104 | if (!st) { | |
105 | st =malloc(sizeof *st); | |
106 | if (st == NULL) | |
107 | continue; | |
108 | st->devname = strdup(dev); | |
109 | st->utime = 0; | |
110 | st->next = statelist; | |
111 | st->err = 0; | |
112 | statelist = st; | |
113 | } | |
114 | fd = open(dev, O_RDONLY); | |
115 | if (fd < 0) { | |
116 | if (!st->err) | |
117 | fprintf(stderr, Name ": cannot open %s: %s\n", | |
118 | dev, strerror(errno)); | |
119 | st->err=1; | |
120 | continue; | |
121 | } | |
122 | if (ioctl(fd, GET_ARRAY_INFO, &array)<0) { | |
123 | if (!st->err) | |
124 | fprintf(stderr, Name ": cannot get array info for %s: %s\n", | |
125 | dev, strerror(errno)); | |
126 | st->err=1; | |
127 | close(fd); | |
128 | continue; | |
129 | } | |
130 | st->err = 0; | |
131 | ||
132 | if (st->utime == array.utime && | |
133 | st->failed == array.failed_disks) { | |
134 | close(fd); | |
135 | continue; | |
136 | } | |
137 | event = NULL; | |
138 | if (st->utime) { | |
139 | int i; | |
140 | if (st->active > array.active_disks) | |
141 | event = "Fail"; | |
142 | else if (st->working > array.working_disks) | |
143 | event = "FailSpare"; | |
144 | else if (st->active < array.active_disks) | |
145 | event = "ActiveSpare"; | |
146 | } | |
147 | for (i=0; i<array.raid_disks+array.spare_disks; i++) { | |
148 | mdu_disk_info_t disc; | |
149 | disc.number = i; | |
150 | if (ioctl(fd, GET_DISK_INFO, &disc)>= 0) { | |
151 | if (event && event_disc == NULL && | |
152 | st->devstate[i] != disc.state) { | |
153 | char * dv = map_dev(disc.major, disc.minor); | |
154 | if (dv) | |
155 | event_disc = strdup(dv); | |
156 | } | |
157 | st->devstate[i] = disc.state; | |
158 | } | |
159 | } | |
160 | close(fd); | |
161 | st->active = array.active_disks; | |
162 | st->working = array.working_disks; | |
163 | st->spare = array.spare_disks; | |
164 | st->failed = array.failed_disks; | |
165 | st->utime = array.utime; | |
166 | if (event) | |
167 | alert(event, dev, event_disc, mailaddr, alert_cmd); | |
168 | } | |
169 | sleep(period); | |
170 | } | |
171 | return 0; | |
172 | } | |
173 | ||
174 | ||
175 | static void alert(char *event, char *dev, char *disc, char *mailaddr, char *cmd) | |
176 | { | |
cd29a5c8 NB |
177 | if (!cmd && !mailaddr) { |
178 | time_t now = time(0); | |
179 | ||
180 | printf("%0.15s: %s on %s %s\n", ctime(&now)+4, event, dev, disc?disc:"unknown device"); | |
181 | } | |
52826846 NB |
182 | if (cmd) { |
183 | int pid = fork(); | |
184 | switch(pid) { | |
185 | default: | |
186 | waitpid(pid, NULL, 0); | |
187 | break; | |
188 | case -1: | |
189 | break; | |
190 | case 0: | |
191 | execl(cmd, cmd, event, dev, disc, NULL); | |
192 | exit(2); | |
193 | } | |
194 | } | |
195 | if (mailaddr && strncmp(event, "Fail", 4)==0) { | |
196 | FILE *mp = popen(Sendmail, "w"); | |
197 | if (mp) { | |
198 | char hname[256]; | |
199 | gethostname(hname, sizeof(hname)); | |
200 | signal(SIGPIPE, SIG_IGN); | |
201 | fprintf(mp, "From: " Name " monitoring <root>\n"); | |
202 | fprintf(mp, "To: %s\n", mailaddr); | |
203 | fprintf(mp, "Subject: %s event on %s:%s\n\n", event, dev, hname); | |
204 | ||
205 | fprintf(mp, "This is an automatically generated mail message from " Name "\n"); | |
206 | fprintf(mp, "running on %s\n\n", hname); | |
207 | ||
208 | fprintf(mp, "A %s event had been detected on md device %s.\n\n", event, dev); | |
209 | ||
210 | if (disc) | |
211 | fprintf(mp, "It could be related to sub-device %s.\n\n", disc); | |
212 | ||
213 | fprintf(mp, "Faithfully yours, etc.\n"); | |
214 | fclose(mp); | |
215 | } | |
216 | ||
217 | } | |
218 | /* FIXME log the event to syslog maybe */ | |
219 | } |