]> git.ipfire.org Git - thirdparty/mdadm.git/blame_incremental - Monitor.c
Create.c: fix uclibc build
[thirdparty/mdadm.git] / Monitor.c
... / ...
CommitLineData
1/*
2 * mdadm - manage Linux "md" devices aka RAID arrays.
3 *
4 * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
5 *
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 *
21 * Author: Neil Brown
22 * Email: <neilb@suse.de>
23 */
24
25#include "mdadm.h"
26#include "udev.h"
27#include "md_p.h"
28#include "md_u.h"
29#include <sys/wait.h>
30#include <limits.h>
31#include <syslog.h>
32
33#define TASK_COMM_LEN 16
34#define EVENT_NAME_MAX 32
35#define AUTOREBUILD_PID_PATH MDMON_DIR "/autorebuild.pid"
36#define FALLBACK_DELAY 5
37
38/**
39 * struct state - external array or container properties.
40 * @devname: has length of %DEV_MD_DIR + device name + terminating byte
41 * @devnm: to sync with mdstat info
42 * @parent_devnm: or subarray, devnm of parent, for others, ""
43 * @subarray: for a container it is a link to first subarray, for a subarray it is a link to next
44 * subarray in the same container
45 * @parent: for a subarray it is a link to its container
46 */
47struct state {
48 char devname[MD_NAME_MAX + sizeof(DEV_MD_DIR)];
49 char devnm[MD_NAME_MAX];
50 unsigned int utime;
51 int err;
52 char *spare_group;
53 int active, working, failed, spare, raid;
54 int from_config;
55 int from_auto;
56 int expected_spares;
57 int devstate[MAX_DISKS];
58 dev_t devid[MAX_DISKS];
59 int percent;
60 char parent_devnm[MD_NAME_MAX];
61 struct supertype *metadata;
62 struct state *subarray;
63 struct state *parent;
64 struct state *next;
65};
66
67struct alert_info {
68 char hostname[HOST_NAME_MAX];
69 char *mailaddr;
70 char *mailfrom;
71 char *alert_cmd;
72 int dosyslog;
73 int test;
74} info;
75
/*
 * Monitor events.
 *
 * The order of the entries is significant: the two __SYSLOG_PRIORITY_* entries
 * are not events, they only split the list into syslog priority ranges.
 * get_syslog_event_priority() maps everything above a marker to that priority
 * and everything below the first marker to LOG_INFO.
 */
enum event {
	/* LOG_INFO range */
	EVENT_SPARE_ACTIVE = 0,
	EVENT_NEW_ARRAY,
	EVENT_MOVE_SPARE,
	EVENT_TEST_MESSAGE,

	/* LOG_WARNING range */
	__SYSLOG_PRIORITY_WARNING,
	EVENT_REBUILD_STARTED,
	EVENT_REBUILD,
	EVENT_REBUILD_FINISHED,
	EVENT_SPARES_MISSING,

	/* LOG_CRIT range */
	__SYSLOG_PRIORITY_CRITICAL,
	EVENT_DEVICE_DISAPPEARED,
	EVENT_FAIL,
	EVENT_FAIL_SPARE,
	EVENT_DEGRADED_ARRAY,
	EVENT_UNKNOWN
};
93
94mapping_t events_map[] = {
95 {"SpareActive", EVENT_SPARE_ACTIVE},
96 {"NewArray", EVENT_NEW_ARRAY},
97 {"MoveSpare", EVENT_MOVE_SPARE},
98 {"TestMessage", EVENT_TEST_MESSAGE},
99 {"RebuildStarted", EVENT_REBUILD_STARTED},
100 {"Rebuild", EVENT_REBUILD},
101 {"RebuildFinished", EVENT_REBUILD_FINISHED},
102 {"SparesMissing", EVENT_SPARES_MISSING},
103 {"DeviceDisappeared", EVENT_DEVICE_DISAPPEARED},
104 {"Fail", EVENT_FAIL},
105 {"FailSpare", EVENT_FAIL_SPARE},
106 {"DegradedArray", EVENT_DEGRADED_ARRAY},
107 {NULL, EVENT_UNKNOWN}
108};
109
110struct event_data {
111 enum event event_enum;
112 /*
113 * @event_name: Rebuild event name must be in form "RebuildXX", where XX is rebuild progress.
114 */
115 char event_name[EVENT_NAME_MAX];
116 char message[BUFSIZ];
117 const char *description;
118 const char *dev;
119 const char *disc;
120};
121
/* Forward declarations of the static helpers defined later in this file. */

/* statelist maintenance */
static int add_new_arrays(struct mdstat_ent *mdstat, struct state **statelist);
static void link_containers_with_subarrays(struct state *list);
static void free_statelist(struct state *statelist);

/* array checking and spare handling */
static int check_array(struct state *st, struct mdstat_ent *mdstat, int increments, char *prefer);
static void try_spare_migration(struct state *statelist);

/* process management */
static int check_one_sharer(int scan);
static int make_daemon(char *pidfile);
static int write_autorebuild_pid(void);

/* event waiting */
static void wait_for_events(int *delay_for_event, int c_delay);
static void wait_for_events_mdstat(int *delay_for_event, int c_delay);
134
135int Monitor(struct mddev_dev *devlist,
136 char *mailaddr, char *alert_cmd,
137 struct context *c,
138 int daemonise, int oneshot,
139 int dosyslog, char *pidfile, int increments,
140 int share)
141{
142 /*
143 * Every few seconds, scan every md device looking for changes
144 * When a change is found, log it, possibly run the alert command,
145 * and possibly send Email
146 *
147 * For each array, we record:
148 * Update time
149 * active/working/failed/spare drives
150 * State of each device.
151 * %rebuilt if rebuilding
152 *
153 * If the update time changes, check out all the data again
154 * It is possible that we cannot get the state of each device
155 * due to bugs in the md kernel module.
156 * We also read /proc/mdstat to get rebuild percent,
157 * and to get state on all active devices incase of kernel bug.
158 *
159 * Events are:
160 * Fail
161 * An active device had Faulty set or Active/Sync removed
162 * FailSpare
163 * A spare device had Faulty set
164 * SpareActive
165 * An active device had a reverse transition
166 * RebuildStarted
167 * percent went from -1 to +ve
168 * RebuildNN
169 * percent went from below to not-below NN%
170 * DeviceDisappeared
171 * Couldn't access a device which was previously visible
172 *
173 * if we detect an array with active<raid and spare==0
174 * we look at other arrays that have same spare-group
175 * If we find one with active==raid and spare>0,
176 * and if we can get_disk_info and find a name
177 * Then we hot-remove and hot-add to the other array
178 *
179 * If devlist is NULL, then we can monitor everything if --scan
180 * was given. We get an initial list from config file and add anything
181 * that appears in /proc/mdstat
182 */
183
184 struct state *statelist = NULL;
185 int finished = 0;
186 struct mdstat_ent *mdstat = NULL;
187 char *mailfrom;
188 struct mddev_ident *mdlist;
189 int delay_for_event = c->delay;
190
191 if (devlist && c->scan) {
192 pr_err("Devices list and --scan option cannot be combined - not monitoring.\n");
193 return 1;
194 }
195
196 if (!mailaddr)
197 mailaddr = conf_get_mailaddr();
198
199 if (!alert_cmd)
200 alert_cmd = conf_get_program();
201
202 mailfrom = conf_get_mailfrom();
203
204 if (c->scan && !mailaddr && !alert_cmd && !dosyslog) {
205 pr_err("No mail address or alert command - not monitoring.\n");
206 return 1;
207 }
208
209 if (c->verbose) {
210 pr_err("Monitor is started with delay %ds\n", c->delay);
211 if (mailaddr)
212 pr_err("Monitor using email address %s\n", mailaddr);
213 if (alert_cmd)
214 pr_err("Monitor using program %s\n", alert_cmd);
215 }
216
217 info.alert_cmd = alert_cmd;
218 info.mailaddr = mailaddr;
219 info.mailfrom = mailfrom;
220 info.dosyslog = dosyslog;
221 info.test = c->test;
222
223 if (s_gethostname(info.hostname, sizeof(info.hostname)) != 0) {
224 pr_err("Cannot get hostname.\n");
225 return 1;
226 }
227
228 if (share){
229 if (check_one_sharer(c->scan) == 2)
230 return 1;
231 }
232
233 if (daemonise) {
234 int rv = make_daemon(pidfile);
235 if (rv >= 0)
236 return rv;
237 }
238
239 if (share)
240 if (write_autorebuild_pid() != 0)
241 return 1;
242
243 if (devlist == NULL) {
244 mdlist = conf_get_ident(NULL);
245 for (; mdlist; mdlist = mdlist->next) {
246 struct state *st;
247
248 if (mdlist->devname == NULL)
249 continue;
250 if (is_devname_ignore(mdlist->devname) == true)
251 continue;
252 if (!is_mddev(mdlist->devname))
253 continue;
254
255 st = xcalloc(1, sizeof *st);
256 snprintf(st->devname, MD_NAME_MAX + sizeof(DEV_MD_DIR), DEV_MD_DIR "%s",
257 basename(mdlist->devname));
258 st->next = statelist;
259 st->devnm[0] = 0;
260 st->percent = RESYNC_UNKNOWN;
261 st->from_config = 1;
262 st->expected_spares = mdlist->spare_disks;
263 if (mdlist->spare_group)
264 st->spare_group = xstrdup(mdlist->spare_group);
265 statelist = st;
266 }
267 } else {
268 struct mddev_dev *dv;
269
270 for (dv = devlist; dv; dv = dv->next) {
271 struct state *st;
272
273 if (!is_mddev(dv->devname))
274 continue;
275
276 st = xcalloc(1, sizeof *st);
277 mdlist = conf_get_ident(dv->devname);
278 snprintf(st->devname, MD_NAME_MAX + sizeof(DEV_MD_DIR), "%s", dv->devname);
279 st->next = statelist;
280 st->devnm[0] = 0;
281 st->percent = RESYNC_UNKNOWN;
282 st->expected_spares = -1;
283 if (mdlist) {
284 st->expected_spares = mdlist->spare_disks;
285 if (mdlist->spare_group)
286 st->spare_group = xstrdup(mdlist->spare_group);
287 }
288 statelist = st;
289 }
290 }
291
292 while (!finished) {
293 int new_found = 0;
294 struct state *st, **stp;
295 int anydegraded = 0;
296 int anyredundant = 0;
297
298 if (mdstat)
299 free_mdstat(mdstat);
300 mdstat = mdstat_read(oneshot ? 0 : 1, 0);
301
302 for (st = statelist; st; st = st->next) {
303 if (check_array(st, mdstat, increments, c->prefer))
304 anydegraded = 1;
305 /* for external arrays, metadata is filled for
306 * containers only
307 */
308 if (st->metadata && st->metadata->ss->external)
309 continue;
310 if (st->err == 0 && !anyredundant)
311 anyredundant = 1;
312 }
313
314 /* now check if there are any new devices found in mdstat */
315 if (c->scan)
316 new_found = add_new_arrays(mdstat, &statelist);
317
318 /* If an array has active < raid && spare == 0 && spare_group != NULL
319 * Look for another array with spare > 0 and active == raid and same spare_group
320 * if found, choose a device and hotremove/hotadd
321 */
322 if (share && anydegraded)
323 try_spare_migration(statelist);
324 if (!new_found) {
325 if (oneshot)
326 break;
327 if (!anyredundant) {
328 pr_err("No array with redundancy detected, stopping\n");
329 break;
330 }
331
332 wait_for_events(&delay_for_event, c->delay);
333 }
334 info.test = 0;
335
336 for (stp = &statelist; (st = *stp) != NULL; ) {
337 if (st->from_auto && st->err > 5) {
338 *stp = st->next;
339 if (st->spare_group)
340 free(st->spare_group);
341
342 free(st);
343 } else
344 stp = &st->next;
345 }
346 }
347
348 free_statelist(statelist);
349
350 if (pidfile)
351 unlink(pidfile);
352 return 0;
353}
354
355/*
356 * wait_for_events() - Waits for events on md devices.
357 * @delay_for_event: pointer to current event delay
358 * @c_delay: delay from config
359 */
360static void wait_for_events(int *delay_for_event, int c_delay)
361{
362#ifndef NO_LIBUDEV
363 if (udev_is_available()) {
364 if (udev_wait_for_events(*delay_for_event) == UDEV_STATUS_ERROR)
365 pr_err("Error while waiting for udev events.\n");
366 return;
367 }
368#endif
369 wait_for_events_mdstat(delay_for_event, c_delay);
370}
371
372/*
373 * wait_for_events_mdstat() - Waits for events on mdstat.
374 * @delay_for_event: pointer to current event delay
375 * @c_delay: delay from config
376 */
377static void wait_for_events_mdstat(int *delay_for_event, int c_delay)
378{
379 int wait_result = mdstat_wait(*delay_for_event);
380
381 if (wait_result < 0) {
382 pr_err("Error while waiting for events on mdstat.\n");
383 return;
384 }
385
386 /*
387 * Give chance to process new device
388 */
389 if (wait_result != 0) {
390 if (c_delay > FALLBACK_DELAY)
391 *delay_for_event = FALLBACK_DELAY;
392 } else {
393 *delay_for_event = c_delay;
394 }
395 mdstat_close();
396}
397
/*
 * make_daemon() - Forks the monitor into the background.
 * @pidfile: file to store the daemon pid in, NULL to print the pid to stdout
 *
 * A failure to create the pid file is reported but it is not an error.
 *
 * Return:
 *   -1 in the forked daemon
 *    0 in the parent
 *    1 on error
 * so a non-negative value becomes the exit code.
 */
static int make_daemon(char *pidfile)
{
	FILE *stream = NULL;
	int pid_fd;
	int pid;

	pid = fork();
	if (pid < 0) {
		perror("daemonise");
		return 1;
	}

	if (pid == 0) {
		/* The daemon itself. */
		manage_fork_fds(0);
		setsid();
		return -1;
	}

	/* The parent: publish pid of the daemon. */
	if (!pidfile) {
		printf("%d\n", pid);
		return 0;
	}

	pid_fd = open(pidfile, O_WRONLY | O_CREAT | O_TRUNC, 0644);
	if (pid_fd >= 0)
		stream = fdopen(pid_fd, "w");

	if (stream) {
		fprintf(stream,"%d\n", pid);
		fclose(stream);
	} else {
		perror("cannot create pid file");
	}

	return 0;
}
433
434/*
435 * check_one_sharer() - Checks for other mdmon processes running.
436 *
437 * Return:
438 * 0 - no other processes running,
439 * 1 - warning,
440 * 2 - error, or when scan mode is enabled, and one mdmon process already exists
441 */
442static int check_one_sharer(int scan)
443{
444 int pid;
445 FILE *fp, *comm_fp;
446 char comm_path[PATH_MAX];
447 char comm[TASK_COMM_LEN];
448
449 if (!is_directory(MDMON_DIR)) {
450 pr_err("%s is not a regular directory.\n", MDMON_DIR);
451 return 2;
452 }
453
454 fp = fopen(AUTOREBUILD_PID_PATH, "r");
455 if (!fp) {
456 /* PID file does not exist */
457 if (errno == ENOENT)
458 return 0;
459
460 pr_err("Cannot open %s file.\n", AUTOREBUILD_PID_PATH);
461 return 2;
462 }
463
464 if (!is_file(AUTOREBUILD_PID_PATH)) {
465 pr_err("%s is not a regular file.\n", AUTOREBUILD_PID_PATH);
466 fclose(fp);
467 return 2;
468 }
469
470 if (fscanf(fp, "%d", &pid) != 1) {
471 pr_err("Cannot read pid from %s file.\n", AUTOREBUILD_PID_PATH);
472 fclose(fp);
473 return 2;
474 }
475
476 snprintf(comm_path, sizeof(comm_path), "/proc/%d/comm", pid);
477
478 comm_fp = fopen(comm_path, "r");
479 if (!comm_fp) {
480 dprintf("Warning: Cannot open %s, continuing\n", comm_path);
481 fclose(fp);
482 return 1;
483 }
484
485 if (fscanf(comm_fp, "%15s", comm) == 0) {
486 dprintf("Warning: Cannot read comm from %s, continuing\n", comm_path);
487 fclose(comm_fp);
488 fclose(fp);
489 return 1;
490 }
491
492 if (strncmp(basename(comm), Name, strlen(Name)) == 0) {
493 if (scan) {
494 pr_err("Only one autorebuild process allowed in scan mode, aborting\n");
495 fclose(comm_fp);
496 fclose(fp);
497 return 2;
498 }
499 pr_err("Warning: One autorebuild process already running.\n");
500 }
501 fclose(comm_fp);
502 fclose(fp);
503 return 0;
504}
505
506/*
507 * write_autorebuild_pid() - Writes pid to autorebuild.pid file.
508 *
509 * Return: 0 on success, 1 on error
510 */
511static int write_autorebuild_pid(void)
512{
513 FILE *fp;
514 int fd;
515
516 if (mkdir(MDMON_DIR, 0700) < 0 && errno != EEXIST) {
517 pr_err("%s: %s\n", strerror(errno), MDMON_DIR);
518 return 1;
519 }
520
521 if (!is_directory(MDMON_DIR)) {
522 pr_err("%s is not a regular directory.\n", MDMON_DIR);
523 return 1;
524 }
525
526 fd = open(AUTOREBUILD_PID_PATH, O_WRONLY | O_CREAT | O_TRUNC, 0700);
527
528 if (fd < 0) {
529 pr_err("Error opening %s file.\n", AUTOREBUILD_PID_PATH);
530 return 1;
531 }
532
533 fp = fdopen(fd, "w");
534
535 if (!fp) {
536 pr_err("Error opening fd for %s file.\n", AUTOREBUILD_PID_PATH);
537 return 1;
538 }
539
540 fprintf(fp, "%d\n", getpid());
541
542 fclose(fp);
543 return 0;
544}
545
546#define BASE_MESSAGE "%s event detected on md device %s"
547#define COMPONENT_DEVICE_MESSAGE ", component device %s"
548#define DESCRIPTION_MESSAGE ": %s"
549/*
550 * sprint_event_message() - Writes basic message about detected event to destination ptr.
551 * @dest: message destination, should be at least the size of BUFSIZ
552 * @data: event data
553 *
554 * Return: 0 on success, 1 on error
555 */
556static int sprint_event_message(char *dest, const struct event_data *data)
557{
558 if (!dest || !data)
559 return 1;
560
561 if (data->disc && data->description)
562 snprintf(dest, BUFSIZ, BASE_MESSAGE COMPONENT_DEVICE_MESSAGE DESCRIPTION_MESSAGE,
563 data->event_name, data->dev, data->disc, data->description);
564 else if (data->disc)
565 snprintf(dest, BUFSIZ, BASE_MESSAGE COMPONENT_DEVICE_MESSAGE,
566 data->event_name, data->dev, data->disc);
567 else if (data->description)
568 snprintf(dest, BUFSIZ, BASE_MESSAGE DESCRIPTION_MESSAGE,
569 data->event_name, data->dev, data->description);
570 else
571 snprintf(dest, BUFSIZ, BASE_MESSAGE, data->event_name, data->dev);
572
573 return 0;
574}
575
576/*
577 * get_syslog_event_priority() - Determines event priority.
578 * @event_enum: event to be checked
579 *
580 * Return: LOG_CRIT, LOG_WARNING or LOG_INFO
581 */
582static int get_syslog_event_priority(const enum event event_enum)
583{
584 if (event_enum > __SYSLOG_PRIORITY_CRITICAL)
585 return LOG_CRIT;
586 if (event_enum > __SYSLOG_PRIORITY_WARNING)
587 return LOG_WARNING;
588 return LOG_INFO;
589}
590
591/*
592 * is_email_event() - Determines whether email for event should be sent or not.
593 * @event_enum: event to be checked
594 *
595 * Return: true if email should be sent, false otherwise
596 */
597static bool is_email_event(const enum event event_enum)
598{
599 static const enum event email_events[] = {
600 EVENT_FAIL,
601 EVENT_FAIL_SPARE,
602 EVENT_DEGRADED_ARRAY,
603 EVENT_SPARES_MISSING,
604 EVENT_TEST_MESSAGE
605 };
606 unsigned int i;
607
608 for (i = 0; i < ARRAY_SIZE(email_events); ++i) {
609 if (event_enum == email_events[i])
610 return true;
611 }
612 return false;
613}
614
615/*
616 * execute_alert_cmd() - Forks and executes command provided as alert_cmd.
617 * @data: event data
618 */
619static void execute_alert_cmd(const struct event_data *data)
620{
621 int pid = fork();
622
623 switch (pid) {
624 default:
625 waitpid(pid, NULL, 0);
626 break;
627 case -1:
628 pr_err("Cannot fork to execute alert command");
629 break;
630 case 0:
631 execl(info.alert_cmd, info.alert_cmd, data->event_name, data->dev, data->disc, NULL);
632 exit(2);
633 }
634}
635
636/*
637 * send_event_email() - Sends an email about event detected by monitor.
638 * @data: event data
639 */
640static void send_event_email(const struct event_data *data)
641{
642 FILE *mp, *mdstat;
643 char buf[BUFSIZ];
644 int n;
645
646 mp = popen(Sendmail, "w");
647 if (!mp) {
648 pr_err("Cannot open pipe stream for sendmail.\n");
649 return;
650 }
651
652 signal(SIGPIPE, SIG_IGN);
653 if (info.mailfrom)
654 fprintf(mp, "From: %s\n", info.mailfrom);
655 else
656 fprintf(mp, "From: %s monitoring <root>\n", Name);
657 fprintf(mp, "To: %s\n", info.mailaddr);
658 fprintf(mp, "Subject: %s event on %s:%s\n\n", data->event_name, data->dev, info.hostname);
659 fprintf(mp, "This is an automatically generated mail message.\n");
660 fprintf(mp, "%s\n", data->message);
661
662 mdstat = fopen("/proc/mdstat", "r");
663 if (!mdstat) {
664 pr_err("Cannot open /proc/mdstat\n");
665 pclose(mp);
666 return;
667 }
668
669 fprintf(mp, "The /proc/mdstat file currently contains the following:\n\n");
670 while ((n = fread(buf, 1, sizeof(buf), mdstat)) > 0)
671 n = fwrite(buf, 1, n, mp);
672 fclose(mdstat);
673 pclose(mp);
674}
675
676/*
677 * log_event_to_syslog() - Logs an event into syslog.
678 * @data: event data
679 */
680static void log_event_to_syslog(const struct event_data *data)
681{
682 int priority;
683
684 priority = get_syslog_event_priority(data->event_enum);
685
686 syslog(priority, "%s\n", data->message);
687}
688
689/*
690 * alert() - Alerts about the monitor event.
691 * @event_enum: event to be sent
692 * @description: event description
693 * @progress: rebuild progress
694 * @dev: md device name
695 * @disc: component device
696 *
697 * If needed function executes alert command, sends an email or logs event to syslog.
698 */
699static void alert(const enum event event_enum, const char *description, const uint8_t progress,
700 const char *dev, const char *disc)
701{
702 struct event_data data = {.dev = dev, .disc = disc, .description = description};
703
704 if (!dev)
705 return;
706
707 if (event_enum == EVENT_REBUILD) {
708 snprintf(data.event_name, sizeof(data.event_name), "%s%02d",
709 map_num_s(events_map, EVENT_REBUILD), progress);
710 } else {
711 snprintf(data.event_name, sizeof(data.event_name), "%s", map_num_s(events_map, event_enum));
712 }
713
714 data.event_enum = event_enum;
715
716 if (sprint_event_message(data.message, &data) != 0) {
717 pr_err("Cannot create event message.\n");
718 return;
719 }
720 pr_err("%s\n", data.message);
721
722 if (info.alert_cmd)
723 execute_alert_cmd(&data);
724
725 if (info.mailaddr && is_email_event(event_enum))
726 send_event_email(&data);
727
728 if (info.dosyslog)
729 log_event_to_syslog(&data);
730}
731
732static int check_array(struct state *st, struct mdstat_ent *mdstat,
733 int increments, char *prefer)
734{
735 /* Update the state 'st' to reflect any changes shown in mdstat,
736 * or found by directly examining the array, and return
737 * '1' if the array is degraded, or '0' if it is optimal (or dead).
738 */
739 struct { int state, major, minor; } disks_info[MAX_DISKS];
740 struct mdinfo *sra = NULL;
741 mdu_array_info_t array;
742 struct mdstat_ent *mse = NULL, *mse2;
743 char *dev = st->devname;
744 int fd;
745 int i;
746 int remaining_disks;
747 int last_disk;
748 int new_array = 0;
749 int retval;
750 int is_container = 0;
751 unsigned long redundancy_only_flags = 0;
752
753 if (info.test)
754 alert(EVENT_TEST_MESSAGE, NULL, 0, dev, NULL);
755
756 retval = 0;
757
758 fd = open(dev, O_RDONLY);
759 if (fd < 0)
760 goto disappeared;
761
762 if (st->devnm[0] == 0)
763 snprintf(st->devnm, MD_NAME_MAX, "%s", fd2devnm(fd));
764
765 for (mse2 = mdstat; mse2; mse2 = mse2->next)
766 if (strcmp(mse2->devnm, st->devnm) == 0) {
767 mse2->devnm[0] = 0; /* flag it as "used" */
768 mse = mse2;
769 }
770
771 if (!mse) {
772 /* duplicated array in statelist
773 * or re-created after reading mdstat
774 */
775 st->err++;
776 goto out;
777 }
778
779 if (mse->level == NULL)
780 is_container = 1;
781
782 if (!is_container && !md_array_active(fd))
783 goto disappeared;
784
785 fcntl(fd, F_SETFD, FD_CLOEXEC);
786 if (md_get_array_info(fd, &array) < 0)
787 goto disappeared;
788
789 if (!is_container && map_name(pers, mse->level) > 0)
790 redundancy_only_flags |= GET_MISMATCH;
791
792 sra = sysfs_read(-1, st->devnm, GET_LEVEL | GET_DISKS | GET_DEVS |
793 GET_STATE | redundancy_only_flags);
794
795 if (!sra)
796 goto disappeared;
797
798 /* It's much easier to list what array levels can't
799 * have a device disappear than all of them that can
800 */
801 if (sra->array.level == 0 || sra->array.level == -1) {
802 if (!st->err && !st->from_config)
803 alert(EVENT_DEVICE_DISAPPEARED, "Wrong-Level", 0, dev, NULL);
804 st->err++;
805 goto out;
806 }
807
808 /* this array is in /proc/mdstat */
809 if (array.utime == 0)
810 /* external arrays don't update utime, so
811 * just make sure it is always different. */
812 array.utime = st->utime + 1;;
813
814 if (st->err) {
815 /* New array appeared where previously had an error */
816 st->err = 0;
817 st->percent = RESYNC_NONE;
818 new_array = 1;
819 if (!is_container)
820 alert(EVENT_NEW_ARRAY, NULL, 0, st->devname, NULL);
821 }
822
823 if (st->utime == array.utime && st->failed == sra->array.failed_disks &&
824 st->working == sra->array.working_disks &&
825 st->spare == sra->array.spare_disks &&
826 (mse == NULL || (mse->percent == st->percent))) {
827 if ((st->active < st->raid) && st->spare == 0)
828 retval = 1;
829 goto out;
830 }
831 if (st->utime == 0 && /* new array */
832 mse->pattern && strchr(mse->pattern, '_') /* degraded */)
833 alert(EVENT_DEGRADED_ARRAY, NULL, 0, dev, NULL);
834
835 if (st->utime == 0 && /* new array */ st->expected_spares > 0 &&
836 sra->array.spare_disks < st->expected_spares)
837 alert(EVENT_SPARES_MISSING, NULL, 0, dev, NULL);
838 if (st->percent < 0 && st->percent != RESYNC_UNKNOWN &&
839 mse->percent >= 0)
840 alert(EVENT_REBUILD_STARTED, NULL, 0, dev, NULL);
841 if (st->percent >= 0 && mse->percent >= 0 &&
842 (mse->percent / increments) > (st->percent / increments)) {
843 if((mse->percent / increments) == 0)
844 alert(EVENT_REBUILD_STARTED, NULL, 0, dev, NULL);
845 else
846 alert(EVENT_REBUILD, NULL, mse->percent, dev, NULL);
847 }
848
849 if (mse->percent == RESYNC_NONE && st->percent >= 0) {
850 /* Rebuild/sync/whatever just finished.
851 * If there is a number in /mismatch_cnt,
852 * we should report that.
853 */
854 if (sra && sra->mismatch_cnt > 0) {
855 char cnt[80];
856 snprintf(cnt, sizeof(cnt),
857 " mismatches found: %d (on raid level %d)",
858 sra->mismatch_cnt, sra->array.level);
859 alert(EVENT_REBUILD_FINISHED, NULL, 0, dev, cnt);
860 } else
861 alert(EVENT_REBUILD_FINISHED, NULL, 0, dev, NULL);
862 }
863 st->percent = mse->percent;
864
865 remaining_disks = sra->array.nr_disks;
866 for (i = 0; i < MAX_DISKS && remaining_disks > 0; i++) {
867 mdu_disk_info_t disc;
868 disc.number = i;
869 if (md_get_disk_info(fd, &disc) >= 0) {
870 disks_info[i].state = disc.state;
871 disks_info[i].major = disc.major;
872 disks_info[i].minor = disc.minor;
873 if (disc.major || disc.minor)
874 remaining_disks --;
875 } else
876 disks_info[i].major = disks_info[i].minor = 0;
877 }
878 last_disk = i;
879
880 if (mse->metadata_version &&
881 strncmp(mse->metadata_version, "external:", 9) == 0 &&
882 is_subarray(mse->metadata_version+9)) {
883 char *sl;
884 snprintf(st->parent_devnm, MD_NAME_MAX, "%s", mse->metadata_version + 10);
885 sl = strchr(st->parent_devnm, '/');
886 if (sl)
887 *sl = 0;
888 } else
889 st->parent_devnm[0] = 0;
890 if (st->metadata == NULL && st->parent_devnm[0] == 0)
891 st->metadata = super_by_fd(fd, NULL);
892
893 for (i = 0; i < MAX_DISKS; i++) {
894 mdu_disk_info_t disc = {0, 0, 0, 0, 0};
895 int newstate = 0;
896 int change;
897 char *dv = NULL;
898 disc.number = i;
899 if (i < last_disk && (disks_info[i].major || disks_info[i].minor)) {
900 newstate = disks_info[i].state;
901 dv = map_dev_preferred(disks_info[i].major, disks_info[i].minor, 1,
902 prefer);
903 disc.state = newstate;
904 disc.major = disks_info[i].major;
905 disc.minor = disks_info[i].minor;
906 } else
907 newstate = (1 << MD_DISK_REMOVED);
908
909 if (dv == NULL && st->devid[i])
910 dv = map_dev_preferred(major(st->devid[i]),
911 minor(st->devid[i]), 1, prefer);
912 change = newstate ^ st->devstate[i];
913 if (st->utime && change && !st->err && !new_array) {
914 if ((st->devstate[i]&change) & (1 << MD_DISK_SYNC))
915 alert(EVENT_FAIL, NULL, 0, dev, dv);
916 else if ((newstate & (1 << MD_DISK_FAULTY)) &&
917 (disc.major || disc.minor) &&
918 st->devid[i] == makedev(disc.major,
919 disc.minor))
920 alert(EVENT_FAIL_SPARE, NULL, 0, dev, dv);
921 else if ((newstate&change) & (1 << MD_DISK_SYNC))
922 alert(EVENT_SPARE_ACTIVE, NULL, 0, dev, dv);
923 }
924 st->devstate[i] = newstate;
925 st->devid[i] = makedev(disc.major, disc.minor);
926 }
927 st->active = sra->array.active_disks;
928 st->working = sra->array.working_disks;
929 st->spare = sra->array.spare_disks;
930 st->failed = sra->array.failed_disks;
931 st->utime = array.utime;
932 st->raid = sra->array.raid_disks;
933 st->err = 0;
934 if ((st->active < st->raid) && st->spare == 0)
935 retval = 1;
936
937 out:
938 if (sra)
939 sysfs_free(sra);
940 if (fd >= 0)
941 close(fd);
942 return retval;
943
944 disappeared:
945 if (!st->err && !is_container)
946 alert(EVENT_DEVICE_DISAPPEARED, NULL, 0, dev, NULL);
947 st->err++;
948 goto out;
949}
950
951static int add_new_arrays(struct mdstat_ent *mdstat, struct state **statelist)
952{
953 struct mdstat_ent *mse;
954 int new_found = 0;
955 char *name;
956
957 for (mse = mdstat; mse; mse = mse->next)
958 if (mse->devnm[0] && (!mse->level || /* retrieve containers */
959 (strcmp(mse->level, "raid0") != 0 &&
960 strcmp(mse->level, "linear") != 0))) {
961 struct state *st = xcalloc(1, sizeof *st);
962 mdu_array_info_t array;
963 int fd;
964
965 name = get_md_name(mse->devnm);
966 if (!name) {
967 free(st);
968 continue;
969 }
970
971 snprintf(st->devname, MD_NAME_MAX + sizeof(DEV_MD_DIR), "%s", name);
972 if ((fd = open(st->devname, O_RDONLY)) < 0 ||
973 md_get_array_info(fd, &array) < 0) {
974 /* no such array */
975 if (fd >= 0)
976 close(fd);
977 put_md_name(st->devname);
978 if (st->metadata) {
979 st->metadata->ss->free_super(st->metadata);
980 free(st->metadata);
981 }
982 free(st);
983 continue;
984 }
985 close(fd);
986 st->next = *statelist;
987 st->err = 1;
988 st->from_auto = 1;
989 snprintf(st->devnm, MD_NAME_MAX, "%s", mse->devnm);
990 st->percent = RESYNC_UNKNOWN;
991 st->expected_spares = -1;
992 if (mse->metadata_version &&
993 strncmp(mse->metadata_version,
994 "external:", 9) == 0 &&
995 is_subarray(mse->metadata_version+9)) {
996 char *sl;
997 snprintf(st->parent_devnm, MD_NAME_MAX,
998 "%s", mse->metadata_version + 10);
999 sl = strchr(st->parent_devnm, '/');
1000 *sl = 0;
1001 } else
1002 st->parent_devnm[0] = 0;
1003 *statelist = st;
1004 if (info.test)
1005 alert(EVENT_TEST_MESSAGE, NULL, 0, st->devname, NULL);
1006 new_found = 1;
1007 }
1008 return new_found;
1009}
1010
1011static int check_donor(struct state *from, struct state *to)
1012{
1013 struct state *sub;
1014
1015 if (from == to)
1016 return 0;
1017 if (from->parent)
1018 /* Cannot move from a member */
1019 return 0;
1020 if (from->err)
1021 return 0;
1022 for (sub = from->subarray; sub; sub = sub->subarray)
1023 /* If source array has degraded subarrays, don't
1024 * remove anything
1025 */
1026 if (sub->active < sub->raid)
1027 return 0;
1028 if (from->metadata->ss->external == 0)
1029 if (from->active < from->raid)
1030 return 0;
1031 if (from->spare <= 0)
1032 return 0;
1033 return 1;
1034}
1035
1036static dev_t choose_spare(struct state *from, struct state *to,
1037 struct domainlist *domlist, struct spare_criteria *sc)
1038{
1039 int d;
1040 dev_t dev = 0;
1041
1042 for (d = from->raid; !dev && d < MAX_DISKS; d++) {
1043 if (from->devid[d] > 0 && from->devstate[d] == 0) {
1044 struct dev_policy *pol;
1045
1046 if (to->metadata->ss->external &&
1047 test_partition_from_id(from->devid[d]))
1048 continue;
1049
1050 if (devid_matches_criteria(to->metadata, from->devid[d], sc) == false)
1051 continue;
1052
1053 pol = devid_policy(from->devid[d]);
1054 if (from->spare_group)
1055 pol_add(&pol, pol_domain,
1056 from->spare_group, NULL);
1057 if (domain_test(domlist, pol,
1058 to->metadata->ss->name) == 1)
1059 dev = from->devid[d];
1060 dev_policy_free(pol);
1061 }
1062 }
1063 return dev;
1064}
1065
1066static dev_t container_choose_spare(struct state *from, struct state *to,
1067 struct domainlist *domlist,
1068 struct spare_criteria *sc, int active)
1069{
1070 /* This is similar to choose_spare, but we cannot trust devstate,
1071 * so we need to read the metadata instead
1072 */
1073 struct mdinfo *list;
1074 struct supertype *st = from->metadata;
1075 int fd = open(from->devname, O_RDONLY);
1076 int err;
1077 dev_t dev = 0;
1078
1079 if (fd < 0)
1080 return 0;
1081 if (!st->ss->getinfo_super_disks) {
1082 close(fd);
1083 return 0;
1084 }
1085
1086 err = st->ss->load_container(st, fd, NULL);
1087 close(fd);
1088 if (err)
1089 return 0;
1090
1091 if (from == to) {
1092 /* We must check if number of active disks has not increased
1093 * since ioctl in main loop. mdmon may have added spare
1094 * to subarray. If so we do not need to look for more spares
1095 * so return non zero value */
1096 int active_cnt = 0;
1097 struct mdinfo *dp;
1098 list = st->ss->getinfo_super_disks(st);
1099 if (!list) {
1100 st->ss->free_super(st);
1101 return 1;
1102 }
1103 dp = list->devs;
1104 while (dp) {
1105 if (dp->disk.state & (1 << MD_DISK_SYNC) &&
1106 !(dp->disk.state & (1 << MD_DISK_FAULTY)))
1107 active_cnt++;
1108 dp = dp->next;
1109 }
1110 sysfs_free(list);
1111 if (active < active_cnt) {
1112 /* Spare just activated.*/
1113 st->ss->free_super(st);
1114 return 1;
1115 }
1116 }
1117
1118 /* We only need one spare so full list not needed */
1119 list = container_choose_spares(st, sc, domlist, from->spare_group,
1120 to->metadata->ss->name, 1);
1121 if (list) {
1122 struct mdinfo *disks = list->devs;
1123 if (disks)
1124 dev = makedev(disks->disk.major, disks->disk.minor);
1125 sysfs_free(list);
1126 }
1127 st->ss->free_super(st);
1128 return dev;
1129}
1130
1131static void try_spare_migration(struct state *statelist)
1132{
1133 struct state *from;
1134 struct state *st;
1135
1136 link_containers_with_subarrays(statelist);
1137 for (st = statelist; st; st = st->next)
1138 if (st->active < st->raid && st->spare == 0 && !st->err) {
1139 struct domainlist *domlist = NULL;
1140 struct spare_criteria sc = {0};
1141 int d;
1142 struct state *to = st;
1143
1144 if (to->parent_devnm[0] && !to->parent)
1145 /* subarray monitored without parent container
1146 * we can't move spares here */
1147 continue;
1148
1149 if (to->parent)
1150 /* member of a container */
1151 to = to->parent;
1152
1153 if (to->metadata->ss->get_spare_criteria)
1154 if (to->metadata->ss->get_spare_criteria(to->metadata, to->devname,
1155 &sc))
1156 continue;
1157
1158 if (to->metadata->ss->external) {
1159 /* We must make sure there is
1160 * no suitable spare in container already.
1161 * If there is we don't add more */
1162 dev_t devid = container_choose_spare(
1163 to, to, NULL, &sc, st->active);
1164 if (devid > 0)
1165 continue;
1166 }
1167 for (d = 0; d < MAX_DISKS; d++)
1168 if (to->devid[d])
1169 domainlist_add_dev(&domlist,
1170 to->devid[d],
1171 to->metadata->ss->name);
1172 if (to->spare_group)
1173 domain_add(&domlist, to->spare_group);
1174 /*
1175 * No spare migration if the destination
1176 * has no domain. Skip this array.
1177 */
1178 if (!domlist)
1179 continue;
1180 for (from=statelist ; from ; from=from->next) {
1181 dev_t devid;
1182 if (!check_donor(from, to))
1183 continue;
1184 if (from->metadata->ss->external)
1185 devid = container_choose_spare(
1186 from, to, domlist, &sc, 0);
1187 else
1188 devid = choose_spare(from, to, domlist,
1189 &sc);
1190 if (devid > 0 &&
1191 move_spare(from->devname, to->devname,
1192 devid)) {
1193 alert(EVENT_MOVE_SPARE, NULL, 0, to->devname, from->devname);
1194 break;
1195 }
1196 }
1197 domain_free(domlist);
1198 dev_policy_free(sc.pols);
1199 }
1200}
1201
1202/* search the statelist to connect external
1203 * metadata subarrays with their containers
1204 * We always completely rebuild the tree from scratch as
1205 * that is safest considering the possibility of entries
1206 * disappearing or changing.
1207 */
1208static void link_containers_with_subarrays(struct state *list)
1209{
1210 struct state *st;
1211 struct state *cont;
1212 for (st = list; st; st = st->next) {
1213 st->parent = NULL;
1214 st->subarray = NULL;
1215 }
1216 for (st = list; st; st = st->next)
1217 if (st->parent_devnm[0])
1218 for (cont = list; cont; cont = cont->next)
1219 if (!cont->err && cont->parent_devnm[0] == 0 &&
1220 strcmp(cont->devnm, st->parent_devnm) == 0) {
1221 st->parent = cont;
1222 st->subarray = cont->subarray;
1223 cont->subarray = st;
1224 break;
1225 }
1226}
1227
1228/**
1229 * free_statelist() - Frees statelist.
1230 * @statelist: statelist to free
1231 */
1232static void free_statelist(struct state *statelist)
1233{
1234 struct state *tmp = NULL;
1235
1236 while (statelist) {
1237 if (statelist->spare_group)
1238 free(statelist->spare_group);
1239
1240 tmp = statelist;
1241 statelist = statelist->next;
1242 free(tmp);
1243 }
1244}
1245
1246/* Not really Monitor but ... */
1247int Wait(char *dev)
1248{
1249 char devnm[32];
1250 dev_t rdev;
1251 char *tmp;
1252 int rv = 1;
1253 int frozen_remaining = 3;
1254
1255 if (!stat_is_blkdev(dev, &rdev))
1256 return 2;
1257
1258 tmp = devid2devnm(rdev);
1259 if (!tmp) {
1260 pr_err("Cannot get md device name.\n");
1261 return 2;
1262 }
1263
1264 strcpy(devnm, tmp);
1265
1266 while(1) {
1267 struct mdstat_ent *ms = mdstat_read(1, 0);
1268 struct mdstat_ent *e;
1269
1270 for (e = ms; e; e = e->next)
1271 if (strcmp(e->devnm, devnm) == 0)
1272 break;
1273
1274 if (e && e->percent == RESYNC_NONE) {
1275 /* We could be in the brief pause before something
1276 * starts. /proc/mdstat doesn't show that, but
1277 * sync_action does.
1278 */
1279 struct mdinfo mdi;
1280 char buf[SYSFS_MAX_BUF_SIZE];
1281
1282 if (sysfs_init(&mdi, -1, devnm))
1283 return 2;
1284 if (sysfs_get_str(&mdi, NULL, "sync_action",
1285 buf, sizeof(buf)) > 0 &&
1286 strcmp(buf,"idle\n") != 0) {
1287 e->percent = RESYNC_UNKNOWN;
1288 if (strcmp(buf, "frozen\n") == 0) {
1289 if (frozen_remaining == 0)
1290 e->percent = RESYNC_NONE;
1291 else
1292 frozen_remaining -= 1;
1293 }
1294 }
1295 }
1296 if (!e || e->percent == RESYNC_NONE) {
1297 if (e && e->metadata_version &&
1298 strncmp(e->metadata_version, "external:", 9) == 0) {
1299 if (is_subarray(&e->metadata_version[9]))
1300 ping_monitor(&e->metadata_version[9]);
1301 else
1302 ping_monitor(devnm);
1303 }
1304 free_mdstat(ms);
1305 return rv;
1306 }
1307 free_mdstat(ms);
1308 rv = 0;
1309 mdstat_wait(5);
1310 }
1311}
1312
/* NULL-terminated list of array_state values that are treated as clean.
 * The state "broken" is used only for RAID0/LINEAR - it's the same as
 * "clean", but used in case the array has one or more members missing.
 */
static char *clean_states[] = {
	"clear",
	"inactive",
	"readonly",
	"read-auto",
	"clean",
	"broken",
	NULL,
};
1318
1319int WaitClean(char *dev, int verbose)
1320{
1321 int fd;
1322 struct mdinfo *mdi;
1323 int rv = 1;
1324 char devnm[32];
1325
1326 if (!stat_is_blkdev(dev, NULL))
1327 return 2;
1328 fd = open(dev, O_RDONLY);
1329 if (fd < 0) {
1330 if (verbose)
1331 pr_err("Couldn't open %s: %s\n", dev, strerror(errno));
1332 return 1;
1333 }
1334
1335 strcpy(devnm, fd2devnm(fd));
1336 mdi = sysfs_read(fd, devnm, GET_VERSION|GET_LEVEL|GET_SAFEMODE);
1337 if (!mdi) {
1338 if (verbose)
1339 pr_err("Failed to read sysfs attributes for %s\n", dev);
1340 close(fd);
1341 return 0;
1342 }
1343
1344 switch(mdi->array.level) {
1345 case LEVEL_LINEAR:
1346 case LEVEL_MULTIPATH:
1347 case 0:
1348 /* safemode delay is irrelevant for these levels */
1349 rv = 0;
1350 }
1351
1352 /* for internal metadata the kernel handles the final clean
1353 * transition, containers can never be dirty
1354 */
1355 if (!is_subarray(mdi->text_version))
1356 rv = 0;
1357
1358 /* safemode disabled ? */
1359 if (mdi->safe_mode_delay == 0)
1360 rv = 0;
1361
1362 if (rv) {
1363 int state_fd = sysfs_open(fd2devnm(fd), NULL, "array_state");
1364 char buf[SYSFS_MAX_BUF_SIZE];
1365 int delay = 5000;
1366
1367 /* minimize the safe_mode_delay and prepare to wait up to 5s
1368 * for writes to quiesce
1369 */
1370 sysfs_set_safemode(mdi, 1);
1371
1372 /* wait for array_state to be clean */
1373 while (1) {
1374 rv = read(state_fd, buf, sizeof(buf));
1375 if (rv < 0)
1376 break;
1377 if (sysfs_match_word(buf, clean_states) <
1378 (int)ARRAY_SIZE(clean_states) - 1)
1379 break;
1380 rv = sysfs_wait(state_fd, &delay);
1381 if (rv < 0 && errno != EINTR)
1382 break;
1383 lseek(state_fd, 0, SEEK_SET);
1384 }
1385 if (rv < 0)
1386 rv = 1;
1387 else if (ping_monitor(mdi->text_version) == 0) {
1388 /* we need to ping to close the window between array
1389 * state transitioning to clean and the metadata being
1390 * marked clean
1391 */
1392 rv = 0;
1393 } else {
1394 rv = 1;
1395 pr_err("Error connecting monitor with %s\n", dev);
1396 }
1397 if (rv && verbose)
1398 pr_err("Error waiting for %s to be clean\n", dev);
1399
1400 /* restore the original safe_mode_delay */
1401 sysfs_set_safemode(mdi, mdi->safe_mode_delay);
1402 close(state_fd);
1403 }
1404
1405 sysfs_free(mdi);
1406 close(fd);
1407
1408 return rv;
1409}