git.ipfire.org Git - thirdparty/mdadm.git/blobdiff - monitor.c
monitor: treat unreadable array_state as clean
[thirdparty/mdadm.git] / monitor.c
index cfe41787f0f2e596502c43b181e9924c9c511eb6..47432b25f8a2774f5a80760e0a9c6e46fae5c5ee 100644
--- a/monitor.c
+++ b/monitor.c
@@ -38,8 +38,17 @@ static int write_attr(char *attr, int fd)
 
 static void add_fd(fd_set *fds, int *maxfd, int fd)
 {
+       struct stat st;
        if (fd < 0)
                return;
+       if (fstat(fd, &st) == -1) {
+               dprintf("%s: Invalid fd %d\n", __func__, fd);
+               return;
+       }
+       if (st.st_nlink == 0) {
+               dprintf("%s: fd %d was deleted\n", __func__, fd);
+               return;
+       }
        if (fd > *maxfd)
                *maxfd = fd;
        FD_SET(fd, fds);
@@ -66,18 +75,21 @@ static int read_attr(char *buf, int len, int fd)
        return n;
 }
 
-static unsigned long long read_resync_start(int fd)
+static void read_resync_start(int fd, unsigned long long *v)
 {
        char buf[30];
        int n;
 
        n = read_attr(buf, 30, fd);
-       if (n <= 0)
-               return 0;
+       if (n <= 0) {
+               dprintf("%s: Failed to read resync_start (%d)\n",
+                       __func__, fd);
+               return;
+       }
        if (strncmp(buf, "none", 4) == 0)
-               return MaxSector;
+               *v = MaxSector;
        else
-               return strtoull(buf, NULL, 10);
+               *v = strtoull(buf, NULL, 10);
 }
 
 static unsigned long long read_sync_completed(int fd)
@@ -212,6 +224,7 @@ static void signal_manager(void)
  */
 
 #define ARRAY_DIRTY 1
+#define ARRAY_BUSY 2
 static int read_and_act(struct active_array *a)
 {
        unsigned long long sync_completed;
@@ -227,13 +240,20 @@ static int read_and_act(struct active_array *a)
 
        a->curr_state = read_state(a->info.state_fd);
        a->curr_action = read_action(a->action_fd);
-       a->info.resync_start = read_resync_start(a->resync_start_fd);
+       if (a->curr_state != clear)
+               /*
+                * In "clear" state, resync_start may wrongly be set to "0"
+                * when the kernel called md_clean but didn't remove the
+                * sysfs attributes yet
+                */
+               read_resync_start(a->resync_start_fd, &a->info.resync_start);
        sync_completed = read_sync_completed(a->sync_completed_fd);
        for (mdi = a->info.devs; mdi ; mdi = mdi->next) {
                mdi->next_state = 0;
                mdi->curr_state = 0;
                if (mdi->state_fd >= 0) {
-                       mdi->recovery_start = read_resync_start(mdi->recovery_fd);
+                       read_resync_start(mdi->recovery_fd,
+                                         &mdi->recovery_start);
                        mdi->curr_state = read_dev_state(mdi->state_fd);
                }
        }
@@ -245,7 +265,7 @@ static int read_and_act(struct active_array *a)
                 */
                a->container->ss->set_array_state(a, 0);
        }
-       if (a->curr_state <= inactive &&
+       if ((a->curr_state == bad_word || a->curr_state <= inactive) &&
            a->prev_state > inactive) {
                /* array has been stopped */
                a->container->ss->set_array_state(a, 1);
@@ -268,8 +288,7 @@ static int read_and_act(struct active_array *a)
                a->container->ss->set_array_state(a, 1);
        }
        if (a->curr_state == active ||
-           a->curr_state == suspended ||
-           a->curr_state == bad_word)
+           a->curr_state == suspended)
                ret |= ARRAY_DIRTY;
        if (a->curr_state == readonly) {
                /* Well, I'm ready to handle things.  If readonly
@@ -419,9 +438,9 @@ static int read_and_act(struct active_array *a)
                if ((mdi->next_state & DS_REMOVE) && mdi->state_fd >= 0) {
                        int remove_result;
 
-                       /* the kernel may not be able to immediately remove the
-                        * disk, we can simply wait until the next event to try
-                        * again.
+                       /* The kernel may not be able to immediately remove the
+                        * disk.  In that case we wait a little while and
+                        * try again.
                         */
                        remove_result = write_attr("remove", mdi->state_fd);
                        if (remove_result > 0) {
@@ -429,7 +448,8 @@ static int read_and_act(struct active_array *a)
                                close(mdi->state_fd);
                                close(mdi->recovery_fd);
                                mdi->state_fd = -1;
-                       }
+                       } else
+                               ret |= ARRAY_BUSY;
                }
                if (mdi->next_state & DS_INSYNC) {
                        write_attr("+in_sync", mdi->state_fd);
@@ -571,9 +591,9 @@ static int wait_and_act(struct supertype *container, int nowait)
                 */
                int fd;
                if (sigterm)
-                       fd = open_dev_excl(container->devnum);
+                       fd = open_dev_excl(container->devnm);
                else
-                       fd = open_dev_flags(container->devnum, O_RDONLY|O_EXCL);
+                       fd = open_dev_flags(container->devnm, O_RDONLY|O_EXCL);
                if (fd >= 0 || errno != EBUSY) {
                        /* OK, we are safe to leave */
                        if (sigterm && !dirty_arrays)
@@ -584,7 +604,7 @@ static int wait_and_act(struct supertype *container, int nowait)
                                /* On SIGTERM, someone (the take-over mdmon) will
                                 * clean up
                                 */
-                               remove_pidfile(container->devname);
+                               remove_pidfile(container->devnm);
                        exit_now = 1;
                        signal_manager();
                        close(fd);
@@ -597,7 +617,7 @@ static int wait_and_act(struct supertype *container, int nowait)
                struct timespec ts;
                ts.tv_sec = 24*3600;
                ts.tv_nsec = 0;
-               if (*aap == NULL) {
+               if (*aap == NULL || container->retry_soon) {
                        /* just waiting to get O_EXCL access */
                        ts.tv_sec = 0;
                        ts.tv_nsec = 20000000ULL;
@@ -612,7 +632,7 @@ static int wait_and_act(struct supertype *container, int nowait)
                #ifdef DEBUG
                dprint_wake_reasons(&rfds);
                #endif
-
+               container->retry_soon = 0;
        }
 
        if (update_queue) {
@@ -653,6 +673,8 @@ static int wait_and_act(struct supertype *container, int nowait)
                         */
                        if (sigterm && !(ret & ARRAY_DIRTY))
                                a->container = NULL; /* stop touching this array */
+                       if (ret & ARRAY_BUSY)
+                               container->retry_soon = 1;
                }
        }