2 * mdadm - manage Linux "md" devices aka RAID arrays.
4 * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 * Email: <neilb@suse.de>
35 #define EVENT_NAME_MAX 32
38 char devname
[MD_NAME_MAX
+ sizeof("/dev/md/")]; /* length of "/dev/md/" + device name + terminating byte*/
39 char devnm
[MD_NAME_MAX
]; /* to sync with mdstat info */
43 int active
, working
, failed
, spare
, raid
;
47 int devstate
[MAX_DISKS
];
48 dev_t devid
[MAX_DISKS
];
50 char parent_devnm
[MD_NAME_MAX
]; /* For subarray, devnm of parent.
53 struct supertype
*metadata
;
54 struct state
*subarray
;/* for a container it is a link to first subarray
55 * for a subarray it is a link to next subarray
56 * in the same container */
57 struct state
*parent
; /* for a subarray it is a link to its container
63 char hostname
[HOST_NAME_MAX
];
72 EVENT_SPARE_ACTIVE
= 0,
76 __SYSLOG_PRIORITY_WARNING
,
77 EVENT_REBUILD_STARTED
,
79 EVENT_REBUILD_FINISHED
,
81 __SYSLOG_PRIORITY_CRITICAL
,
82 EVENT_DEVICE_DISAPPEARED
,
89 mapping_t events_map
[] = {
90 {"SpareActive", EVENT_SPARE_ACTIVE
},
91 {"NewArray", EVENT_NEW_ARRAY
},
92 {"MoveSpare", EVENT_MOVE_SPARE
},
93 {"TestMessage", EVENT_TEST_MESSAGE
},
94 {"RebuildStarted", EVENT_REBUILD_STARTED
},
95 {"Rebuild", EVENT_REBUILD
},
96 {"RebuildFinished", EVENT_REBUILD_FINISHED
},
97 {"SparesMissing", EVENT_SPARES_MISSING
},
98 {"DeviceDisappeared", EVENT_DEVICE_DISAPPEARED
},
100 {"FailSpare", EVENT_FAIL_SPARE
},
101 {"DegradedArray", EVENT_DEGRADED_ARRAY
},
102 {NULL
, EVENT_UNKNOWN
}
106 enum event event_enum
;
108 * @event_name: Rebuild event name must be in form "RebuildXX", where XX is rebuild progress.
110 char event_name
[EVENT_NAME_MAX
];
111 char message
[BUFSIZ
];
112 const char *description
;
117 static int add_new_arrays(struct mdstat_ent
*mdstat
, struct state
**statelist
);
118 static void try_spare_migration(struct state
*statelist
);
119 static void link_containers_with_subarrays(struct state
*list
);
120 static void free_statelist(struct state
*statelist
);
121 static int check_array(struct state
*st
, struct mdstat_ent
*mdstat
, int increments
, char *prefer
);
122 static int check_one_sharer(int scan
);
124 static int check_udev_activity(void);
126 static void link_containers_with_subarrays(struct state
*list
);
127 static int make_daemon(char *pidfile
);
128 static void try_spare_migration(struct state
*statelist
);
129 static void write_autorebuild_pid(void);
131 int Monitor(struct mddev_dev
*devlist
,
132 char *mailaddr
, char *alert_cmd
,
134 int daemonise
, int oneshot
,
135 int dosyslog
, char *pidfile
, int increments
,
139 * Every few seconds, scan every md device looking for changes
140 * When a change is found, log it, possibly run the alert command,
141 * and possibly send Email
143 * For each array, we record:
145 * active/working/failed/spare drives
146 * State of each device.
147 * %rebuilt if rebuilding
149 * If the update time changes, check out all the data again
150 * It is possible that we cannot get the state of each device
151 * due to bugs in the md kernel module.
152 * We also read /proc/mdstat to get rebuild percent,
153 * and to get state on all active devices in case of a kernel bug.
157 * An active device had Faulty set or Active/Sync removed
159 * A spare device had Faulty set
161 * An active device had a reverse transition
163 * percent went from -1 to +ve
165 * percent went from below to not-below NN%
167 * Couldn't access a device which was previously visible
169 * if we detect an array with active<raid and spare==0
170 * we look at other arrays that have same spare-group
171 * If we find one with active==raid and spare>0,
172 * and if we can get_disk_info and find a name
173 * Then we hot-remove and hot-add to the other array
175 * If devlist is NULL, then we can monitor everything if --scan
176 * was given. We get an initial list from config file and add anything
177 * that appears in /proc/mdstat
180 struct state
*statelist
= NULL
;
182 struct mdstat_ent
*mdstat
= NULL
;
184 struct mddev_ident
*mdlist
;
185 int delay_for_event
= c
->delay
;
187 if (devlist
&& c
->scan
) {
188 pr_err("Devices list and --scan option cannot be combined - not monitoring.\n");
193 mailaddr
= conf_get_mailaddr();
196 alert_cmd
= conf_get_program();
198 mailfrom
= conf_get_mailfrom();
200 if (c
->scan
&& !mailaddr
&& !alert_cmd
&& !dosyslog
) {
201 pr_err("No mail address or alert command - not monitoring.\n");
206 pr_err("Monitor is started with delay %ds\n", c
->delay
);
208 pr_err("Monitor using email address %s\n", mailaddr
);
210 pr_err("Monitor using program %s\n", alert_cmd
);
213 info
.alert_cmd
= alert_cmd
;
214 info
.mailaddr
= mailaddr
;
215 info
.mailfrom
= mailfrom
;
216 info
.dosyslog
= dosyslog
;
219 if (gethostname(info
.hostname
, sizeof(info
.hostname
)) != 0) {
220 pr_err("Cannot get hostname.\n");
223 info
.hostname
[sizeof(info
.hostname
) - 1] = '\0';
226 if (check_one_sharer(c
->scan
))
231 int rv
= make_daemon(pidfile
);
237 write_autorebuild_pid();
239 if (devlist
== NULL
) {
240 mdlist
= conf_get_ident(NULL
);
241 for (; mdlist
; mdlist
= mdlist
->next
) {
244 if (mdlist
->devname
== NULL
)
246 if (strcasecmp(mdlist
->devname
, "<ignore>") == 0)
248 if (!is_mddev(mdlist
->devname
))
251 st
= xcalloc(1, sizeof *st
);
252 snprintf(st
->devname
, MD_NAME_MAX
+ sizeof("/dev/md/"),
253 "/dev/md/%s", basename(mdlist
->devname
));
254 st
->next
= statelist
;
256 st
->percent
= RESYNC_UNKNOWN
;
258 st
->expected_spares
= mdlist
->spare_disks
;
259 if (mdlist
->spare_group
)
260 st
->spare_group
= xstrdup(mdlist
->spare_group
);
264 struct mddev_dev
*dv
;
266 for (dv
= devlist
; dv
; dv
= dv
->next
) {
269 if (!is_mddev(dv
->devname
))
272 st
= xcalloc(1, sizeof *st
);
273 mdlist
= conf_get_ident(dv
->devname
);
274 snprintf(st
->devname
, MD_NAME_MAX
+ sizeof("/dev/md/"), "%s", dv
->devname
);
275 st
->next
= statelist
;
277 st
->percent
= RESYNC_UNKNOWN
;
278 st
->expected_spares
= -1;
280 st
->expected_spares
= mdlist
->spare_disks
;
281 if (mdlist
->spare_group
)
282 st
->spare_group
= xstrdup(mdlist
->spare_group
);
290 struct state
*st
, **stp
;
292 int anyredundant
= 0;
296 mdstat
= mdstat_read(oneshot
? 0 : 1, 0);
298 for (st
= statelist
; st
; st
= st
->next
) {
299 if (check_array(st
, mdstat
, increments
, c
->prefer
))
301 /* for external arrays, metadata is filled for
304 if (st
->metadata
&& st
->metadata
->ss
->external
)
306 if (st
->err
== 0 && !anyredundant
)
310 /* now check if there are any new devices found in mdstat */
312 new_found
= add_new_arrays(mdstat
, &statelist
);
314 /* If an array has active < raid && spare == 0 && spare_group != NULL
315 * Look for another array with spare > 0 and active == raid and same spare_group
316 * if found, choose a device and hotremove/hotadd
318 if (share
&& anydegraded
)
319 try_spare_migration(statelist
);
323 else if (!anyredundant
) {
324 pr_err("No array with redundancy detected, stopping\n");
330 * Wait for udevd to finish new devices
333 if (mdstat_wait(delay_for_event
) &&
334 check_udev_activity())
335 pr_err("Error while waiting for UDEV to complete new devices processing\n");
337 int wait_result
= mdstat_wait(delay_for_event
);
339 * Give chance to process new device
341 if (wait_result
!= 0) {
345 delay_for_event
= c
->delay
;
352 for (stp
= &statelist
; (st
= *stp
) != NULL
; ) {
353 if (st
->from_auto
&& st
->err
> 5) {
356 free(st
->spare_group
);
364 free_statelist(statelist
);
371 static int make_daemon(char *pidfile
)
374 * -1 in the forked daemon
377 * so a non-negative value becomes the exit code.
384 FILE *pid_file
= NULL
;
385 int fd
= open(pidfile
, O_WRONLY
| O_CREAT
| O_TRUNC
,
388 pid_file
= fdopen(fd
, "w");
390 perror("cannot create pid file");
392 fprintf(pid_file
,"%d\n", pid
);
407 static int check_one_sharer(int scan
)
412 char comm_path
[PATH_MAX
];
416 sprintf(path
, "%s/autorebuild.pid", MDMON_DIR
);
417 fp
= fopen(path
, "r");
419 if (fscanf(fp
, "%d", &pid
) != 1)
421 snprintf(comm_path
, sizeof(comm_path
),
422 "/proc/%d/comm", pid
);
423 comm_fp
= fopen(comm_path
, "r");
425 if (fscanf(comm_fp
, "%19s", comm
) &&
426 strncmp(basename(comm
), Name
, strlen(Name
)) == 0) {
428 pr_err("Only one autorebuild process allowed in scan mode, aborting\n");
433 pr_err("Warning: One autorebuild process already running.\n");
443 static void write_autorebuild_pid()
448 sprintf(path
, "%s/autorebuild.pid", MDMON_DIR
);
450 if (mkdir(MDMON_DIR
, 0700) < 0 && errno
!= EEXIST
) {
451 pr_err("Can't create autorebuild.pid file\n");
453 int fd
= open(path
, O_WRONLY
| O_CREAT
| O_TRUNC
, 0700);
456 fp
= fdopen(fd
, "w");
459 pr_err("Can't create autorebuild.pid file\n");
462 fprintf(fp
, "%d\n", pid
);
468 #define BASE_MESSAGE "%s event detected on md device %s"
469 #define COMPONENT_DEVICE_MESSAGE ", component device %s"
470 #define DESCRIPTION_MESSAGE ": %s"
472 * sprint_event_message() - Writes basic message about detected event to destination ptr.
473 * @dest: message destination, should be at least the size of BUFSIZ
476 * Return: 0 on success, 1 on error
478 static int sprint_event_message(char *dest
, const struct event_data
*data
)
483 if (data
->disc
&& data
->description
)
484 snprintf(dest
, BUFSIZ
, BASE_MESSAGE COMPONENT_DEVICE_MESSAGE DESCRIPTION_MESSAGE
,
485 data
->event_name
, data
->dev
, data
->disc
, data
->description
);
487 snprintf(dest
, BUFSIZ
, BASE_MESSAGE COMPONENT_DEVICE_MESSAGE
,
488 data
->event_name
, data
->dev
, data
->disc
);
489 else if (data
->description
)
490 snprintf(dest
, BUFSIZ
, BASE_MESSAGE DESCRIPTION_MESSAGE
,
491 data
->event_name
, data
->dev
, data
->description
);
493 snprintf(dest
, BUFSIZ
, BASE_MESSAGE
, data
->event_name
, data
->dev
);
499 * get_syslog_event_priority() - Determines event priority.
500 * @event_enum: event to be checked
502 * Return: LOG_CRIT, LOG_WARNING or LOG_INFO
504 static int get_syslog_event_priority(const enum event event_enum
)
506 if (event_enum
> __SYSLOG_PRIORITY_CRITICAL
)
508 if (event_enum
> __SYSLOG_PRIORITY_WARNING
)
514 * is_email_event() - Determines whether email for event should be sent or not.
515 * @event_enum: event to be checked
517 * Return: true if email should be sent, false otherwise
519 static bool is_email_event(const enum event event_enum
)
521 static const enum event email_events
[] = {
524 EVENT_DEGRADED_ARRAY
,
525 EVENT_SPARES_MISSING
,
530 for (i
= 0; i
< ARRAY_SIZE(email_events
); ++i
) {
531 if (event_enum
== email_events
[i
])
538 * execute_alert_cmd() - Forks and executes command provided as alert_cmd.
541 static void execute_alert_cmd(const struct event_data
*data
)
547 waitpid(pid
, NULL
, 0);
550 pr_err("Cannot fork to execute alert command");
553 execl(info
.alert_cmd
, info
.alert_cmd
, data
->event_name
, data
->dev
, data
->disc
, NULL
);
559 * send_event_email() - Sends an email about event detected by monitor.
562 static void send_event_email(const struct event_data
*data
)
568 mp
= popen(Sendmail
, "w");
570 pr_err("Cannot open pipe stream for sendmail.\n");
574 signal(SIGPIPE
, SIG_IGN
);
576 fprintf(mp
, "From: %s\n", info
.mailfrom
);
578 fprintf(mp
, "From: %s monitoring <root>\n", Name
);
579 fprintf(mp
, "To: %s\n", info
.mailaddr
);
580 fprintf(mp
, "Subject: %s event on %s:%s\n\n", data
->event_name
, data
->dev
, info
.hostname
);
581 fprintf(mp
, "This is an automatically generated mail message.\n");
582 fprintf(mp
, "%s\n", data
->message
);
584 mdstat
= fopen("/proc/mdstat", "r");
586 pr_err("Cannot open /proc/mdstat\n");
591 fprintf(mp
, "The /proc/mdstat file currently contains the following:\n\n");
592 while ((n
= fread(buf
, 1, sizeof(buf
), mdstat
)) > 0)
593 n
= fwrite(buf
, 1, n
, mp
);
599 * log_event_to_syslog() - Logs an event into syslog.
602 static void log_event_to_syslog(const struct event_data
*data
)
606 priority
= get_syslog_event_priority(data
->event_enum
);
608 syslog(priority
, "%s\n", data
->message
);
612 * alert() - Alerts about the monitor event.
613 * @event_enum: event to be sent
614 * @description: event description
615 * @progress: rebuild progress
616 * @dev: md device name
617 * @disc: component device
619 * If needed function executes alert command, sends an email or logs event to syslog.
621 static void alert(const enum event event_enum
, const char *description
, const uint8_t progress
,
622 const char *dev
, const char *disc
)
624 struct event_data data
= {.dev
= dev
, .disc
= disc
, .description
= description
};
629 if (event_enum
== EVENT_REBUILD
) {
630 snprintf(data
.event_name
, sizeof(data
.event_name
), "%s%02d",
631 map_num_s(events_map
, EVENT_REBUILD
), progress
);
633 snprintf(data
.event_name
, sizeof(data
.event_name
), "%s", map_num_s(events_map
, event_enum
));
636 data
.event_enum
= event_enum
;
638 if (sprint_event_message(data
.message
, &data
) != 0) {
639 pr_err("Cannot create event message.\n");
642 pr_err("%s\n", data
.message
);
645 execute_alert_cmd(&data
);
647 if (info
.mailaddr
&& is_email_event(event_enum
))
648 send_event_email(&data
);
651 log_event_to_syslog(&data
);
654 static int check_array(struct state
*st
, struct mdstat_ent
*mdstat
,
655 int increments
, char *prefer
)
657 /* Update the state 'st' to reflect any changes shown in mdstat,
658 * or found by directly examining the array, and return
659 * '1' if the array is degraded, or '0' if it is optimal (or dead).
661 struct { int state
, major
, minor
; } disks_info
[MAX_DISKS
];
662 struct mdinfo
*sra
= NULL
;
663 mdu_array_info_t array
;
664 struct mdstat_ent
*mse
= NULL
, *mse2
;
665 char *dev
= st
->devname
;
672 int is_container
= 0;
673 unsigned long redundancy_only_flags
= 0;
676 alert(EVENT_TEST_MESSAGE
, NULL
, 0, dev
, NULL
);
680 fd
= open(dev
, O_RDONLY
);
684 if (st
->devnm
[0] == 0)
685 snprintf(st
->devnm
, MD_NAME_MAX
, "%s", fd2devnm(fd
));
687 for (mse2
= mdstat
; mse2
; mse2
= mse2
->next
)
688 if (strcmp(mse2
->devnm
, st
->devnm
) == 0) {
689 mse2
->devnm
[0] = 0; /* flag it as "used" */
694 /* duplicated array in statelist
695 * or re-created after reading mdstat
701 if (mse
->level
== NULL
)
704 if (!is_container
&& !md_array_active(fd
))
707 fcntl(fd
, F_SETFD
, FD_CLOEXEC
);
708 if (md_get_array_info(fd
, &array
) < 0)
711 if (!is_container
&& map_name(pers
, mse
->level
) > 0)
712 redundancy_only_flags
|= GET_MISMATCH
;
714 sra
= sysfs_read(-1, st
->devnm
, GET_LEVEL
| GET_DISKS
| GET_DEVS
|
715 GET_STATE
| redundancy_only_flags
);
720 /* It's much easier to list what array levels can't
721 * have a device disappear than all of them that can
723 if (sra
->array
.level
== 0 || sra
->array
.level
== -1) {
724 if (!st
->err
&& !st
->from_config
)
725 alert(EVENT_DEVICE_DISAPPEARED
, "Wrong-Level", 0, dev
, NULL
);
730 /* this array is in /proc/mdstat */
731 if (array
.utime
== 0)
732 /* external arrays don't update utime, so
733 * just make sure it is always different. */
734 array
.utime
= st
->utime
+ 1;;
737 /* New array appeared where previously had an error */
739 st
->percent
= RESYNC_NONE
;
742 alert(EVENT_NEW_ARRAY
, NULL
, 0, st
->devname
, NULL
);
745 if (st
->utime
== array
.utime
&& st
->failed
== sra
->array
.failed_disks
&&
746 st
->working
== sra
->array
.working_disks
&&
747 st
->spare
== sra
->array
.spare_disks
&&
748 (mse
== NULL
|| (mse
->percent
== st
->percent
))) {
749 if ((st
->active
< st
->raid
) && st
->spare
== 0)
753 if (st
->utime
== 0 && /* new array */
754 mse
->pattern
&& strchr(mse
->pattern
, '_') /* degraded */)
755 alert(EVENT_DEGRADED_ARRAY
, NULL
, 0, dev
, NULL
);
757 if (st
->utime
== 0 && /* new array */ st
->expected_spares
> 0 &&
758 sra
->array
.spare_disks
< st
->expected_spares
)
759 alert(EVENT_SPARES_MISSING
, NULL
, 0, dev
, NULL
);
760 if (st
->percent
< 0 && st
->percent
!= RESYNC_UNKNOWN
&&
762 alert(EVENT_REBUILD_STARTED
, NULL
, 0, dev
, NULL
);
763 if (st
->percent
>= 0 && mse
->percent
>= 0 &&
764 (mse
->percent
/ increments
) > (st
->percent
/ increments
)) {
765 if((mse
->percent
/ increments
) == 0)
766 alert(EVENT_REBUILD_STARTED
, NULL
, 0, dev
, NULL
);
768 alert(EVENT_REBUILD
, NULL
, mse
->percent
, dev
, NULL
);
771 if (mse
->percent
== RESYNC_NONE
&& st
->percent
>= 0) {
772 /* Rebuild/sync/whatever just finished.
773 * If there is a number in /mismatch_cnt,
774 * we should report that.
776 if (sra
&& sra
->mismatch_cnt
> 0) {
778 snprintf(cnt
, sizeof(cnt
),
779 " mismatches found: %d (on raid level %d)",
780 sra
->mismatch_cnt
, sra
->array
.level
);
781 alert(EVENT_REBUILD_FINISHED
, NULL
, 0, dev
, cnt
);
783 alert(EVENT_REBUILD_FINISHED
, NULL
, 0, dev
, NULL
);
785 st
->percent
= mse
->percent
;
787 remaining_disks
= sra
->array
.nr_disks
;
788 for (i
= 0; i
< MAX_DISKS
&& remaining_disks
> 0; i
++) {
789 mdu_disk_info_t disc
;
791 if (md_get_disk_info(fd
, &disc
) >= 0) {
792 disks_info
[i
].state
= disc
.state
;
793 disks_info
[i
].major
= disc
.major
;
794 disks_info
[i
].minor
= disc
.minor
;
795 if (disc
.major
|| disc
.minor
)
798 disks_info
[i
].major
= disks_info
[i
].minor
= 0;
802 if (mse
->metadata_version
&&
803 strncmp(mse
->metadata_version
, "external:", 9) == 0 &&
804 is_subarray(mse
->metadata_version
+9)) {
806 snprintf(st
->parent_devnm
, MD_NAME_MAX
, "%s", mse
->metadata_version
+ 10);
807 sl
= strchr(st
->parent_devnm
, '/');
811 st
->parent_devnm
[0] = 0;
812 if (st
->metadata
== NULL
&& st
->parent_devnm
[0] == 0)
813 st
->metadata
= super_by_fd(fd
, NULL
);
815 for (i
= 0; i
< MAX_DISKS
; i
++) {
816 mdu_disk_info_t disc
= {0, 0, 0, 0, 0};
821 if (i
< last_disk
&& (disks_info
[i
].major
|| disks_info
[i
].minor
)) {
822 newstate
= disks_info
[i
].state
;
823 dv
= map_dev_preferred(disks_info
[i
].major
, disks_info
[i
].minor
, 1,
825 disc
.state
= newstate
;
826 disc
.major
= disks_info
[i
].major
;
827 disc
.minor
= disks_info
[i
].minor
;
829 newstate
= (1 << MD_DISK_REMOVED
);
831 if (dv
== NULL
&& st
->devid
[i
])
832 dv
= map_dev_preferred(major(st
->devid
[i
]),
833 minor(st
->devid
[i
]), 1, prefer
);
834 change
= newstate
^ st
->devstate
[i
];
835 if (st
->utime
&& change
&& !st
->err
&& !new_array
) {
836 if ((st
->devstate
[i
]&change
) & (1 << MD_DISK_SYNC
))
837 alert(EVENT_FAIL
, NULL
, 0, dev
, dv
);
838 else if ((newstate
& (1 << MD_DISK_FAULTY
)) &&
839 (disc
.major
|| disc
.minor
) &&
840 st
->devid
[i
] == makedev(disc
.major
,
842 alert(EVENT_FAIL_SPARE
, NULL
, 0, dev
, dv
);
843 else if ((newstate
&change
) & (1 << MD_DISK_SYNC
))
844 alert(EVENT_SPARE_ACTIVE
, NULL
, 0, dev
, dv
);
846 st
->devstate
[i
] = newstate
;
847 st
->devid
[i
] = makedev(disc
.major
, disc
.minor
);
849 st
->active
= sra
->array
.active_disks
;
850 st
->working
= sra
->array
.working_disks
;
851 st
->spare
= sra
->array
.spare_disks
;
852 st
->failed
= sra
->array
.failed_disks
;
853 st
->utime
= array
.utime
;
854 st
->raid
= sra
->array
.raid_disks
;
856 if ((st
->active
< st
->raid
) && st
->spare
== 0)
867 if (!st
->err
&& !is_container
)
868 alert(EVENT_DEVICE_DISAPPEARED
, NULL
, 0, dev
, NULL
);
873 static int add_new_arrays(struct mdstat_ent
*mdstat
, struct state
**statelist
)
875 struct mdstat_ent
*mse
;
879 for (mse
= mdstat
; mse
; mse
= mse
->next
)
880 if (mse
->devnm
[0] && (!mse
->level
|| /* retrieve containers */
881 (strcmp(mse
->level
, "raid0") != 0 &&
882 strcmp(mse
->level
, "linear") != 0))) {
883 struct state
*st
= xcalloc(1, sizeof *st
);
884 mdu_array_info_t array
;
887 name
= get_md_name(mse
->devnm
);
893 snprintf(st
->devname
, MD_NAME_MAX
+ sizeof("/dev/md/"), "%s", name
);
894 if ((fd
= open(st
->devname
, O_RDONLY
)) < 0 ||
895 md_get_array_info(fd
, &array
) < 0) {
899 put_md_name(st
->devname
);
901 st
->metadata
->ss
->free_super(st
->metadata
);
908 st
->next
= *statelist
;
911 snprintf(st
->devnm
, MD_NAME_MAX
, "%s", mse
->devnm
);
912 st
->percent
= RESYNC_UNKNOWN
;
913 st
->expected_spares
= -1;
914 if (mse
->metadata_version
&&
915 strncmp(mse
->metadata_version
,
916 "external:", 9) == 0 &&
917 is_subarray(mse
->metadata_version
+9)) {
919 snprintf(st
->parent_devnm
, MD_NAME_MAX
,
920 "%s", mse
->metadata_version
+ 10);
921 sl
= strchr(st
->parent_devnm
, '/');
924 st
->parent_devnm
[0] = 0;
927 alert(EVENT_TEST_MESSAGE
, NULL
, 0, st
->devname
, NULL
);
933 static int get_required_spare_criteria(struct state
*st
,
934 struct spare_criteria
*sc
)
938 if (!st
->metadata
|| !st
->metadata
->ss
->get_spare_criteria
) {
944 fd
= open(st
->devname
, O_RDONLY
);
947 if (st
->metadata
->ss
->external
)
948 st
->metadata
->ss
->load_container(st
->metadata
, fd
, st
->devname
);
950 st
->metadata
->ss
->load_super(st
->metadata
, fd
, st
->devname
);
952 if (!st
->metadata
->sb
)
955 st
->metadata
->ss
->get_spare_criteria(st
->metadata
, sc
);
956 st
->metadata
->ss
->free_super(st
->metadata
);
961 static int check_donor(struct state
*from
, struct state
*to
)
968 /* Cannot move from a member */
972 for (sub
= from
->subarray
; sub
; sub
= sub
->subarray
)
973 /* If source array has degraded subarrays, don't
976 if (sub
->active
< sub
->raid
)
978 if (from
->metadata
->ss
->external
== 0)
979 if (from
->active
< from
->raid
)
981 if (from
->spare
<= 0)
986 static dev_t
choose_spare(struct state
*from
, struct state
*to
,
987 struct domainlist
*domlist
, struct spare_criteria
*sc
)
992 for (d
= from
->raid
; !dev
&& d
< MAX_DISKS
; d
++) {
993 if (from
->devid
[d
] > 0 && from
->devstate
[d
] == 0) {
994 struct dev_policy
*pol
;
995 unsigned long long dev_size
;
996 unsigned int dev_sector_size
;
998 if (to
->metadata
->ss
->external
&&
999 test_partition_from_id(from
->devid
[d
]))
1003 dev_size_from_id(from
->devid
[d
], &dev_size
) &&
1004 dev_size
< sc
->min_size
)
1007 if (sc
->sector_size
&&
1008 dev_sector_size_from_id(from
->devid
[d
],
1009 &dev_sector_size
) &&
1010 sc
->sector_size
!= dev_sector_size
)
1013 pol
= devid_policy(from
->devid
[d
]);
1014 if (from
->spare_group
)
1015 pol_add(&pol
, pol_domain
,
1016 from
->spare_group
, NULL
);
1017 if (domain_test(domlist
, pol
,
1018 to
->metadata
->ss
->name
) == 1)
1019 dev
= from
->devid
[d
];
1020 dev_policy_free(pol
);
1026 static dev_t
container_choose_spare(struct state
*from
, struct state
*to
,
1027 struct domainlist
*domlist
,
1028 struct spare_criteria
*sc
, int active
)
1030 /* This is similar to choose_spare, but we cannot trust devstate,
1031 * so we need to read the metadata instead
1033 struct mdinfo
*list
;
1034 struct supertype
*st
= from
->metadata
;
1035 int fd
= open(from
->devname
, O_RDONLY
);
1041 if (!st
->ss
->getinfo_super_disks
) {
1046 err
= st
->ss
->load_container(st
, fd
, NULL
);
1052 /* We must check if number of active disks has not increased
1053 * since ioctl in main loop. mdmon may have added spare
1054 * to subarray. If so we do not need to look for more spares
1055 * so return non zero value */
1058 list
= st
->ss
->getinfo_super_disks(st
);
1060 st
->ss
->free_super(st
);
1065 if (dp
->disk
.state
& (1 << MD_DISK_SYNC
) &&
1066 !(dp
->disk
.state
& (1 << MD_DISK_FAULTY
)))
1071 if (active
< active_cnt
) {
1072 /* Spare just activated.*/
1073 st
->ss
->free_super(st
);
1078 /* We only need one spare so full list not needed */
1079 list
= container_choose_spares(st
, sc
, domlist
, from
->spare_group
,
1080 to
->metadata
->ss
->name
, 1);
1082 struct mdinfo
*disks
= list
->devs
;
1084 dev
= makedev(disks
->disk
.major
, disks
->disk
.minor
);
1087 st
->ss
->free_super(st
);
1091 static void try_spare_migration(struct state
*statelist
)
1095 struct spare_criteria sc
;
1097 link_containers_with_subarrays(statelist
);
1098 for (st
= statelist
; st
; st
= st
->next
)
1099 if (st
->active
< st
->raid
&& st
->spare
== 0 && !st
->err
) {
1100 struct domainlist
*domlist
= NULL
;
1102 struct state
*to
= st
;
1104 if (to
->parent_devnm
[0] && !to
->parent
)
1105 /* subarray monitored without parent container
1106 * we can't move spares here */
1110 /* member of a container */
1113 if (get_required_spare_criteria(to
, &sc
))
1115 if (to
->metadata
->ss
->external
) {
1116 /* We must make sure there is
1117 * no suitable spare in container already.
1118 * If there is we don't add more */
1119 dev_t devid
= container_choose_spare(
1120 to
, to
, NULL
, &sc
, st
->active
);
1124 for (d
= 0; d
< MAX_DISKS
; d
++)
1126 domainlist_add_dev(&domlist
,
1128 to
->metadata
->ss
->name
);
1129 if (to
->spare_group
)
1130 domain_add(&domlist
, to
->spare_group
);
1132 * No spare migration if the destination
1133 * has no domain. Skip this array.
1137 for (from
=statelist
; from
; from
=from
->next
) {
1139 if (!check_donor(from
, to
))
1141 if (from
->metadata
->ss
->external
)
1142 devid
= container_choose_spare(
1143 from
, to
, domlist
, &sc
, 0);
1145 devid
= choose_spare(from
, to
, domlist
,
1148 move_spare(from
->devname
, to
->devname
,
1150 alert(EVENT_MOVE_SPARE
, NULL
, 0, to
->devname
, from
->devname
);
1154 domain_free(domlist
);
1158 /* search the statelist to connect external
1159 * metadata subarrays with their containers
1160 * We always completely rebuild the tree from scratch as
1161 * that is safest considering the possibility of entries
1162 * disappearing or changing.
1164 static void link_containers_with_subarrays(struct state
*list
)
1168 for (st
= list
; st
; st
= st
->next
) {
1170 st
->subarray
= NULL
;
1172 for (st
= list
; st
; st
= st
->next
)
1173 if (st
->parent_devnm
[0])
1174 for (cont
= list
; cont
; cont
= cont
->next
)
1175 if (!cont
->err
&& cont
->parent_devnm
[0] == 0 &&
1176 strcmp(cont
->devnm
, st
->parent_devnm
) == 0) {
1178 st
->subarray
= cont
->subarray
;
1179 cont
->subarray
= st
;
1185 * free_statelist() - Frees statelist.
1186 * @statelist: statelist to free
1188 static void free_statelist(struct state
*statelist
)
1190 struct state
*tmp
= NULL
;
1193 if (statelist
->spare_group
)
1194 free(statelist
->spare_group
);
1197 statelist
= statelist
->next
;
1203 /* function: check_udev_activity
1204 * Description: Function waits for udev to finish
1205 * events processing.
1207 * 1 - detected error while opening udev
1209 * 0 - successful completion
1211 static int check_udev_activity(void)
1213 struct udev
*udev
= NULL
;
1214 struct udev_queue
*udev_queue
= NULL
;
1215 int timeout_cnt
= 30;
1219 * In rare cases systemd may not have udev,
1220 * in such cases just exit with rc 0
1231 udev_queue
= udev_queue_new(udev
);
1237 if (udev_queue_get_queue_is_empty(udev_queue
))
1240 while (!udev_queue_get_queue_is_empty(udev_queue
)) {
1253 udev_queue_unref(udev_queue
);
1260 /* Not really Monitor but ... */
1267 int frozen_remaining
= 3;
1269 if (!stat_is_blkdev(dev
, &rdev
))
1272 tmp
= devid2devnm(rdev
);
1274 pr_err("Cannot get md device name.\n");
1281 struct mdstat_ent
*ms
= mdstat_read(1, 0);
1282 struct mdstat_ent
*e
;
1284 for (e
= ms
; e
; e
= e
->next
)
1285 if (strcmp(e
->devnm
, devnm
) == 0)
1288 if (e
&& e
->percent
== RESYNC_NONE
) {
1289 /* We could be in the brief pause before something
1290 * starts. /proc/mdstat doesn't show that, but
1296 if (sysfs_init(&mdi
, -1, devnm
))
1298 if (sysfs_get_str(&mdi
, NULL
, "sync_action",
1300 strcmp(buf
,"idle\n") != 0) {
1301 e
->percent
= RESYNC_UNKNOWN
;
1302 if (strcmp(buf
, "frozen\n") == 0) {
1303 if (frozen_remaining
== 0)
1304 e
->percent
= RESYNC_NONE
;
1306 frozen_remaining
-= 1;
1310 if (!e
|| e
->percent
== RESYNC_NONE
) {
1311 if (e
&& e
->metadata_version
&&
1312 strncmp(e
->metadata_version
, "external:", 9) == 0) {
1313 if (is_subarray(&e
->metadata_version
[9]))
1314 ping_monitor(&e
->metadata_version
[9]);
1316 ping_monitor(devnm
);
1327 /* The state "broken" is used only for RAID0/LINEAR - it's the same as
1328 * "clean", but used in case the array has one or more members missing.
1330 static char *clean_states
[] = {
1331 "clear", "inactive", "readonly", "read-auto", "clean", "broken", NULL
};
1333 int WaitClean(char *dev
, int verbose
)
1340 if (!stat_is_blkdev(dev
, NULL
))
1342 fd
= open(dev
, O_RDONLY
);
1345 pr_err("Couldn't open %s: %s\n", dev
, strerror(errno
));
1349 strcpy(devnm
, fd2devnm(fd
));
1350 mdi
= sysfs_read(fd
, devnm
, GET_VERSION
|GET_LEVEL
|GET_SAFEMODE
);
1353 pr_err("Failed to read sysfs attributes for %s\n", dev
);
1358 switch(mdi
->array
.level
) {
1360 case LEVEL_MULTIPATH
:
1362 /* safemode delay is irrelevant for these levels */
1366 /* for internal metadata the kernel handles the final clean
1367 * transition, containers can never be dirty
1369 if (!is_subarray(mdi
->text_version
))
1372 /* safemode disabled ? */
1373 if (mdi
->safe_mode_delay
== 0)
1377 int state_fd
= sysfs_open(fd2devnm(fd
), NULL
, "array_state");
1381 /* minimize the safe_mode_delay and prepare to wait up to 5s
1382 * for writes to quiesce
1384 sysfs_set_safemode(mdi
, 1);
1386 /* wait for array_state to be clean */
1388 rv
= read(state_fd
, buf
, sizeof(buf
));
1391 if (sysfs_match_word(buf
, clean_states
) <
1392 (int)ARRAY_SIZE(clean_states
) - 1)
1394 rv
= sysfs_wait(state_fd
, &delay
);
1395 if (rv
< 0 && errno
!= EINTR
)
1397 lseek(state_fd
, 0, SEEK_SET
);
1401 else if (ping_monitor(mdi
->text_version
) == 0) {
1402 /* we need to ping to close the window between array
1403 * state transitioning to clean and the metadata being
1409 pr_err("Error connecting monitor with %s\n", dev
);
1412 pr_err("Error waiting for %s to be clean\n", dev
);
1414 /* restore the original safe_mode_delay */
1415 sysfs_set_safemode(mdi
, mdi
->safe_mode_delay
);