2 * mdadm - manage Linux "md" devices aka RAID arrays.
4 * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 * Email: <neilb@suse.de>
33 /* The largest number of disks current arrays can manage is 384
34 * This really should be dynamic, but that will have to wait
35 * At least it isn't MD_SB_DISKS.
40 int devnum
; /* to sync with mdstat info */
44 int active
, working
, failed
, spare
, raid
;
46 int devstate
[MaxDisks
];
47 unsigned devid
[MaxDisks
];
49 int parent_dev
; /* For subarray, devnum of parent.
52 struct supertype
*metadata
;
53 struct state
*subarray
;/* for a container it is a link to first subarray
54 * for a subarray it is a link to next subarray
55 * in the same container */
56 struct state
*parent
; /* for a subarray it is a link to its container
67 static int make_daemon(char *pidfile
);
68 static int check_one_sharer(int scan
);
69 static void alert(char *event
, char *dev
, char *disc
, struct alert_info
*info
);
70 static int check_array(struct state
*st
, struct mdstat_ent
*mdstat
,
71 int test
, struct alert_info
*info
,
73 static int add_new_arrays(struct mdstat_ent
*mdstat
, struct state
*statelist
,
74 int test
, struct alert_info
*info
);
75 static void try_spare_migration(struct state
*statelist
, struct alert_info
*info
);
76 static void link_containers_with_subarrays(struct state
*list
);
78 int Monitor(struct mddev_dev
*devlist
,
79 char *mailaddr
, char *alert_cmd
,
80 int period
, int daemonise
, int scan
, int oneshot
,
81 int dosyslog
, int test
, char *pidfile
, int increments
,
85 * Every few seconds, scan every md device looking for changes
86 * When a change is found, log it, possibly run the alert command,
87 * and possibly send Email
89 * For each array, we record:
91 * active/working/failed/spare drives
92 * State of each device.
93 * %rebuilt if rebuilding
95 * If the update time changes, check out all the data again
96 * It is possible that we cannot get the state of each device
97 * due to bugs in the md kernel module.
98 * We also read /proc/mdstat to get rebuild percent,
99 * and to get state on all active devices in case of a kernel bug.
103 * An active device had Faulty set or Active/Sync removed
105 * A spare device had Faulty set
107 * An active device had a reverse transition
109 * percent went from -1 to +ve
111 * percent went from below to not-below NN%
113 * Couldn't access a device which was previously visible
115 * if we detect an array with active<raid and spare==0
116 * we look at other arrays that have same spare-group
117 * If we find one with active==raid and spare>0,
118 * and if we can get_disk_info and find a name
119 * Then we hot-remove and hot-add to the other array
121 * If devlist is NULL, then we can monitor everything because --scan
122 * was given. We get an initial list from config file and add anything
123 * that appears in /proc/mdstat
126 struct state
*statelist
= NULL
;
128 struct mdstat_ent
*mdstat
= NULL
;
129 char *mailfrom
= NULL
;
130 struct alert_info info
;
133 mailaddr
= conf_get_mailaddr();
134 if (mailaddr
&& ! scan
)
135 fprintf(stderr
, Name
": Monitor using email address \"%s\" from config file\n",
138 mailfrom
= conf_get_mailfrom();
141 alert_cmd
= conf_get_program();
142 if (alert_cmd
&& ! scan
)
143 fprintf(stderr
, Name
": Monitor using program \"%s\" from config file\n",
146 if (scan
&& !mailaddr
&& !alert_cmd
) {
147 fprintf(stderr
, Name
": No mail address or alert command - not monitoring.\n");
150 info
.alert_cmd
= alert_cmd
;
151 info
.mailaddr
= mailaddr
;
152 info
.mailfrom
= mailfrom
;
153 info
.dosyslog
= dosyslog
;
156 if (make_daemon(pidfile
))
160 if (check_one_sharer(scan
))
163 if (devlist
== NULL
) {
164 struct mddev_ident
*mdlist
= conf_get_ident(NULL
);
165 for (; mdlist
; mdlist
=mdlist
->next
) {
167 if (mdlist
->devname
== NULL
)
169 if (strcasecmp(mdlist
->devname
, "<ignore>") == 0)
171 st
= calloc(1, sizeof *st
);
174 if (mdlist
->devname
[0] == '/')
175 st
->devname
= strdup(mdlist
->devname
);
177 st
->devname
= malloc(8+strlen(mdlist
->devname
)+1);
178 strcpy(strcpy(st
->devname
, "/dev/md/"),
181 st
->next
= statelist
;
182 st
->devnum
= INT_MAX
;
184 st
->expected_spares
= mdlist
->spare_disks
;
185 if (mdlist
->spare_group
)
186 st
->spare_group
= strdup(mdlist
->spare_group
);
190 struct mddev_dev
*dv
;
191 for (dv
=devlist
; dv
; dv
=dv
->next
) {
192 struct mddev_ident
*mdlist
= conf_get_ident(dv
->devname
);
193 struct state
*st
= calloc(1, sizeof *st
);
196 st
->devname
= strdup(dv
->devname
);
197 st
->next
= statelist
;
198 st
->devnum
= INT_MAX
;
200 st
->expected_spares
= -1;
202 st
->expected_spares
= mdlist
->spare_disks
;
203 if (mdlist
->spare_group
)
204 st
->spare_group
= strdup(mdlist
->spare_group
);
218 mdstat
= mdstat_read(oneshot
?0:1, 0);
220 for (st
=statelist
; st
; st
=st
->next
)
221 if (check_array(st
, mdstat
, test
, &info
, increments
))
224 /* now check if there are any new devices found in mdstat */
226 new_found
= add_new_arrays(mdstat
, statelist
, test
,
229 /* If an array has active < raid && spare == 0 && spare_group != NULL
230 * Look for another array with spare > 0 and active == raid and same spare_group
231 * if found, choose a device and hotremove/hotadd
233 if (share
&& anydegraded
)
234 try_spare_migration(statelist
, &info
);
248 static int make_daemon(char *pidfile
)
256 pid_file
=fopen(pidfile
, "w");
258 perror("cannot create pid file");
260 fprintf(pid_file
,"%d\n", pid
);
271 open("/dev/null", O_RDWR
);
278 static int check_one_sharer(int scan
)
284 fp
= fopen("/var/run/mdadm/autorebuild.pid", "r");
286 fscanf(fp
, "%d", &pid
);
287 sprintf(dir
, "/proc/%d", pid
);
288 rv
= stat(dir
, &buf
);
291 fprintf(stderr
, Name
": Only one "
292 "autorebuild process allowed"
293 " in scan mode, aborting\n");
297 fprintf(stderr
, Name
": Warning: One"
298 " autorebuild process already"
305 fp
= fopen("/var/run/mdadm/autorebuild.pid", "w");
307 fprintf(stderr
, Name
": Cannot create"
312 fprintf(fp
, "%d\n", pid
);
319 static void alert(char *event
, char *dev
, char *disc
, struct alert_info
*info
)
323 if (!info
->alert_cmd
&& !info
->mailaddr
) {
324 time_t now
= time(0);
326 printf("%1.15s: %s on %s %s\n", ctime(&now
)+4, event
, dev
, disc
?disc
:"unknown device");
328 if (info
->alert_cmd
) {
332 waitpid(pid
, NULL
, 0);
337 execl(info
->alert_cmd
, info
->alert_cmd
,
338 event
, dev
, disc
, NULL
);
342 if (info
->mailaddr
&&
343 (strncmp(event
, "Fail", 4)==0 ||
344 strncmp(event
, "Test", 4)==0 ||
345 strncmp(event
, "Spares", 6)==0 ||
346 strncmp(event
, "Degrade", 7)==0)) {
347 FILE *mp
= popen(Sendmail
, "w");
351 gethostname(hname
, sizeof(hname
));
352 signal(SIGPIPE
, SIG_IGN
);
354 fprintf(mp
, "From: %s\n", info
->mailfrom
);
356 fprintf(mp
, "From: " Name
" monitoring <root>\n");
357 fprintf(mp
, "To: %s\n", info
->mailaddr
);
358 fprintf(mp
, "Subject: %s event on %s:%s\n\n",
362 "This is an automatically generated"
363 " mail message from " Name
"\n");
364 fprintf(mp
, "running on %s\n\n", hname
);
367 "A %s event had been detected on"
368 " md device %s.\n\n", event
, dev
);
370 if (disc
&& disc
[0] != ' ')
372 "It could be related to"
373 " component device %s.\n\n", disc
);
374 if (disc
&& disc
[0] == ' ')
375 fprintf(mp
, "Extra information:%s.\n\n", disc
);
377 fprintf(mp
, "Faithfully yours, etc.\n");
379 mdstat
= fopen("/proc/mdstat", "r");
384 "\nP.S. The /proc/mdstat file"
385 " currently contains the following:\n\n");
386 while ( (n
=fread(buf
, 1, sizeof(buf
), mdstat
)) > 0)
387 n
=fwrite(buf
, 1, n
, mp
);
394 /* log the event to syslog maybe */
395 if (info
->dosyslog
) {
396 /* Log at a different severity depending on the event.
398 * These are the critical events: */
399 if (strncmp(event
, "Fail", 4)==0 ||
400 strncmp(event
, "Degrade", 7)==0 ||
401 strncmp(event
, "DeviceDisappeared", 17)==0)
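403 /* Good to know about, but are not failures.  NOTE(review): the "Spares" comparison in this condition is written as != 0, so it is true for nearly every remaining event and only Spares* events reach the final case; == 0 looks intended - confirm. */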
404 else if (strncmp(event
, "Rebuild", 7)==0 ||
405 strncmp(event
, "MoveSpare", 9)==0 ||
406 strncmp(event
, "Spares", 6) != 0)
407 priority
= LOG_WARNING
;
408 /* Everything else: */
414 "%s event detected on md device %s,"
415 " component device %s", event
, dev
, disc
);
418 "%s event detected on md device %s",
423 static int check_array(struct state
*st
, struct mdstat_ent
*mdstat
,
424 int test
, struct alert_info
*ainfo
,
427 struct { int state
, major
, minor
; } info
[MaxDisks
];
428 mdu_array_info_t array
;
429 struct mdstat_ent
*mse
= NULL
, *mse2
;
430 char *dev
= st
->devname
;
435 alert("TestMessage", dev
, NULL
, ainfo
);
436 fd
= open(dev
, O_RDONLY
);
439 alert("DeviceDisappeared", dev
, NULL
, ainfo
);
443 fcntl(fd
, F_SETFD
, FD_CLOEXEC
);
444 if (ioctl(fd
, GET_ARRAY_INFO
, &array
)<0) {
446 alert("DeviceDisappeared", dev
, NULL
, ainfo
);
451 /* It's much easier to list what array levels can't
452 * have a device disappear than all of them that can
454 if (array
.level
== 0 || array
.level
== -1) {
456 alert("DeviceDisappeared", dev
, "Wrong-Level", ainfo
);
461 if (st
->devnum
== INT_MAX
) {
463 if (fstat(fd
, &stb
) == 0 &&
464 (S_IFMT
&stb
.st_mode
)==S_IFBLK
) {
465 if (major(stb
.st_rdev
) == MD_MAJOR
)
466 st
->devnum
= minor(stb
.st_rdev
);
468 st
->devnum
= -1- (minor(stb
.st_rdev
)>>6);
472 for (mse2
= mdstat
; mse2
; mse2
=mse2
->next
)
473 if (mse2
->devnum
== st
->devnum
) {
474 mse2
->devnum
= INT_MAX
; /* flag it as "used" */
479 /* duplicated array in statelist
480 * or re-created after reading mdstat*/
485 /* this array is in /proc/mdstat */
486 if (array
.utime
== 0)
487 /* external arrays don't update utime, so
488 * just make sure it is always different. */
489 array
.utime
= st
->utime
+ 1;;
491 if (st
->utime
== array
.utime
&&
492 st
->failed
== array
.failed_disks
&&
493 st
->working
== array
.working_disks
&&
494 st
->spare
== array
.spare_disks
&&
496 mse
->percent
== st
->percent
502 if (st
->utime
== 0 && /* new array */
503 mse
->pattern
&& strchr(mse
->pattern
, '_') /* degraded */
505 alert("DegradedArray", dev
, NULL
, ainfo
);
507 if (st
->utime
== 0 && /* new array */
508 st
->expected_spares
> 0 &&
509 array
.spare_disks
< st
->expected_spares
)
510 alert("SparesMissing", dev
, NULL
, ainfo
);
511 if (st
->percent
== -1 &&
513 alert("RebuildStarted", dev
, NULL
, ainfo
);
514 if (st
->percent
>= 0 &&
516 (mse
->percent
/ increments
) > (st
->percent
/ increments
)) {
517 char percentalert
[15]; // "RebuildNN" (10 chars) or "RebuildStarted" (15 chars)
519 if((mse
->percent
/ increments
) == 0)
520 snprintf(percentalert
, sizeof(percentalert
), "RebuildStarted");
522 snprintf(percentalert
, sizeof(percentalert
), "Rebuild%02d", mse
->percent
);
524 alert(percentalert
, dev
, NULL
, ainfo
);
527 if (mse
->percent
== -1 &&
529 /* Rebuild/sync/whatever just finished.
530 * If there is a number in /mismatch_cnt,
531 * we should report that.
534 sysfs_read(-1, st
->devnum
, GET_MISMATCH
);
535 if (sra
&& sra
->mismatch_cnt
> 0) {
537 sprintf(cnt
, " mismatches found: %d", sra
->mismatch_cnt
);
538 alert("RebuildFinished", dev
, cnt
, ainfo
);
540 alert("RebuildFinished", dev
, NULL
, ainfo
);
544 st
->percent
= mse
->percent
;
546 for (i
=0; i
<MaxDisks
&& i
<= array
.raid_disks
+ array
.nr_disks
;
548 mdu_disk_info_t disc
;
550 if (ioctl(fd
, GET_DISK_INFO
, &disc
) >= 0) {
551 info
[i
].state
= disc
.state
;
552 info
[i
].major
= disc
.major
;
553 info
[i
].minor
= disc
.minor
;
555 info
[i
].major
= info
[i
].minor
= 0;
558 if (strncmp(mse
->metadata_version
, "external:", 9) == 0 &&
559 is_subarray(mse
->metadata_version
+9))
561 devname2devnum(mse
->metadata_version
+10);
563 st
->parent_dev
= NoMdDev
;
564 if (st
->metadata
== NULL
&&
565 st
->parent_dev
== NoMdDev
)
566 st
->metadata
= super_by_fd(fd
, NULL
);
570 for (i
=0; i
<MaxDisks
; i
++) {
571 mdu_disk_info_t disc
= {0,0,0,0,0};
576 if (i
> array
.raid_disks
+ array
.nr_disks
) {
578 disc
.major
= disc
.minor
= 0;
579 } else if (info
[i
].major
|| info
[i
].minor
) {
580 newstate
= info
[i
].state
;
581 dv
= map_dev(info
[i
].major
, info
[i
].minor
, 1);
582 disc
.state
= newstate
;
583 disc
.major
= info
[i
].major
;
584 disc
.minor
= info
[i
].minor
;
585 } else if (mse
&& mse
->pattern
&& i
< (int)strlen(mse
->pattern
)) {
586 switch(mse
->pattern
[i
]) {
587 case 'U': newstate
= 6 /* ACTIVE/SYNC */; break;
588 case '_': newstate
= 0; break;
590 disc
.major
= disc
.minor
= 0;
592 if (dv
== NULL
&& st
->devid
[i
])
593 dv
= map_dev(major(st
->devid
[i
]),
594 minor(st
->devid
[i
]), 1);
595 change
= newstate
^ st
->devstate
[i
];
596 if (st
->utime
&& change
&& !st
->err
) {
597 if (i
< array
.raid_disks
&&
598 (((newstate
&change
)&(1<<MD_DISK_FAULTY
)) ||
599 ((st
->devstate
[i
]&change
)&(1<<MD_DISK_ACTIVE
)) ||
600 ((st
->devstate
[i
]&change
)&(1<<MD_DISK_SYNC
)))
602 alert("Fail", dev
, dv
, ainfo
);
603 else if (i
>= array
.raid_disks
&&
604 (disc
.major
|| disc
.minor
) &&
605 st
->devid
[i
] == makedev(disc
.major
, disc
.minor
) &&
606 ((newstate
&change
)&(1<<MD_DISK_FAULTY
))
608 alert("FailSpare", dev
, dv
, ainfo
);
609 else if (i
< array
.raid_disks
&&
610 ! (newstate
& (1<<MD_DISK_REMOVED
)) &&
611 (((st
->devstate
[i
]&change
)&(1<<MD_DISK_FAULTY
)) ||
612 ((newstate
&change
)&(1<<MD_DISK_ACTIVE
)) ||
613 ((newstate
&change
)&(1<<MD_DISK_SYNC
)))
615 alert("SpareActive", dev
, dv
, ainfo
);
617 st
->devstate
[i
] = newstate
;
618 st
->devid
[i
] = makedev(disc
.major
, disc
.minor
);
620 st
->active
= array
.active_disks
;
621 st
->working
= array
.working_disks
;
622 st
->spare
= array
.spare_disks
;
623 st
->failed
= array
.failed_disks
;
624 st
->utime
= array
.utime
;
625 st
->raid
= array
.raid_disks
;
627 if ((st
->active
< st
->raid
) && st
->spare
== 0)
632 static int add_new_arrays(struct mdstat_ent
*mdstat
, struct state
*statelist
,
633 int test
, struct alert_info
*info
)
635 struct mdstat_ent
*mse
;
638 for (mse
=mdstat
; mse
; mse
=mse
->next
)
639 if (mse
->devnum
!= INT_MAX
&&
640 (!mse
->level
|| /* retrieve containers */
641 (strcmp(mse
->level
, "raid0") != 0 &&
642 strcmp(mse
->level
, "linear") != 0))
644 struct state
*st
= calloc(1, sizeof *st
);
645 mdu_array_info_t array
;
649 st
->devname
= strdup(get_md_name(mse
->devnum
));
650 if ((fd
= open(st
->devname
, O_RDONLY
)) < 0 ||
651 ioctl(fd
, GET_ARRAY_INFO
, &array
)< 0) {
653 if (fd
>=0) close(fd
);
654 put_md_name(st
->devname
);
657 st
->metadata
->ss
->free_super(st
->metadata
);
664 st
->next
= statelist
;
666 st
->devnum
= mse
->devnum
;
668 st
->expected_spares
= -1;
669 if (strncmp(mse
->metadata_version
, "external:", 9) == 0 &&
670 is_subarray(mse
->metadata_version
+9))
672 devname2devnum(mse
->metadata_version
+10);
674 st
->parent_dev
= NoMdDev
;
677 alert("TestMessage", st
->devname
, NULL
, info
);
678 alert("NewArray", st
->devname
, NULL
, info
);
684 unsigned long long min_spare_size_required(struct state
*st
)
687 unsigned long long rv
= 0;
690 !st
->metadata
->ss
->min_acceptable_spare_size
)
693 fd
= open(st
->devname
, O_RDONLY
);
696 st
->metadata
->ss
->load_super(st
->metadata
, fd
, st
->devname
);
698 rv
= st
->metadata
->ss
->min_acceptable_spare_size(st
->metadata
);
699 st
->metadata
->ss
->free_super(st
->metadata
);
704 static int move_spare(struct state
*from
, struct state
*to
,
705 struct domainlist
*domlist
,
706 struct alert_info
*info
)
708 struct mddev_dev devlist
;
710 unsigned long long min_size
;
712 /* try to remove and add */
713 int fd1
= open(to
->devname
, O_RDONLY
);
714 int fd2
= open(from
->devname
, O_RDONLY
);
717 if (fd1
< 0 || fd2
< 0) {
718 if (fd1
>=0) close(fd1
);
719 if (fd2
>=0) close(fd2
);
722 min_size
= min_spare_size_required(to
);
723 for (d
= from
->raid
; dev
< 0 && d
< MaxDisks
; d
++) {
724 if (from
->devid
[d
] > 0 &&
725 from
->devstate
[d
] == 0) {
726 struct dev_policy
*pol
;
727 unsigned long long dev_size
;
730 dev_size_from_id(from
->devid
[d
], &dev_size
) &&
734 pol
= devnum_policy(from
->devid
[d
]);
735 if (from
->spare_group
)
736 pol_add(&pol
, pol_domain
,
737 from
->spare_group
, NULL
);
738 if (domain_test(domlist
, pol
, to
->metadata
->ss
->name
))
739 dev
= from
->devid
[d
];
740 dev_policy_free(pol
);
752 devlist
.writemostly
= 0;
753 devlist
.devname
= devname
;
754 sprintf(devname
, "%d:%d", major(dev
), minor(dev
));
756 devlist
.disposition
= 'r';
757 if (Manage_subdevs(from
->devname
, fd2
, &devlist
, -1, 0) == 0) {
758 devlist
.disposition
= 'a';
759 if (Manage_subdevs(to
->devname
, fd1
, &devlist
, -1, 0) == 0) {
760 alert("MoveSpare", to
->devname
, from
->devname
, info
);
765 else Manage_subdevs(from
->devname
, fd2
, &devlist
, -1, 0);
772 static int check_donor(struct state
*from
, struct state
*to
,
773 struct domainlist
*domlist
)
780 /* Cannot move from a member */
782 for (sub
= from
->subarray
; sub
; sub
= sub
->subarray
)
783 /* If source array has degraded subarrays, don't
786 if (sub
->active
< sub
->raid
)
788 if (from
->metadata
->ss
->external
== 0)
789 if (from
->active
< from
->raid
)
791 if (from
->spare
<= 0)
798 static void try_spare_migration(struct state
*statelist
, struct alert_info
*info
)
803 link_containers_with_subarrays(statelist
);
804 for (st
= statelist
; st
; st
= st
->next
)
805 if (st
->active
< st
->raid
&&
807 struct domainlist
*domlist
= NULL
;
809 struct state
*to
= st
;
812 /* member of a container */
815 for (d
= 0; d
< MaxDisks
; d
++)
817 domainlist_add_dev(&domlist
,
819 to
->metadata
->ss
->name
);
821 domain_add(&domlist
, to
->spare_group
);
823 for (from
=statelist
; from
; from
=from
->next
)
824 if (check_donor(from
, to
, domlist
)
825 && move_spare(from
, to
, domlist
, info
))
827 domain_free(domlist
);
831 /* search the statelist to connect external
832 * metadata subarrays with their containers
833 * We always completely rebuild the tree from scratch as
834 * that is safest considering the possibility of entries
835 * disappearing or changing.
837 static void link_containers_with_subarrays(struct state
*list
)
841 for (st
= list
; st
; st
= st
->next
) {
845 for (st
= list
; st
; st
= st
->next
)
846 if (st
->parent_dev
!= NoMdDev
)
847 for (cont
= list
; cont
; cont
= cont
->next
)
849 cont
->parent_dev
== NoMdDev
&&
850 cont
->devnum
== st
->parent_dev
) {
852 st
->subarray
= cont
->subarray
;
858 /* Not really Monitor but ... */
865 if (stat(dev
, &stb
) != 0) {
866 fprintf(stderr
, Name
": Cannot find %s: %s\n", dev
,
870 devnum
= stat2devnum(&stb
);
873 struct mdstat_ent
*ms
= mdstat_read(1, 0);
874 struct mdstat_ent
*e
;
876 for (e
=ms
; e
; e
=e
->next
)
877 if (e
->devnum
== devnum
)
880 if (!e
|| e
->percent
< 0) {
881 if (e
&& e
->metadata_version
&&
882 strncmp(e
->metadata_version
, "external:", 9) == 0) {
883 if (is_subarray(&e
->metadata_version
[9]))
884 ping_monitor(&e
->metadata_version
[9]);
886 ping_monitor(devnum2devname(devnum
));