]> git.ipfire.org Git - thirdparty/mdadm.git/commitdiff
Grow: handle abort/restart of grow while being monitored.
authorNeilBrown <neilb@suse.de>
Tue, 10 May 2011 02:53:51 +0000 (12:53 +1000)
committerNeilBrown <neilb@suse.de>
Tue, 10 May 2011 02:53:51 +0000 (12:53 +1000)
If a device fails while the grow is being monitored but the array is
still functional, the Grow will appear to abort and then almost
instantly restart from where it was up to.

So if it appears to abort, wait up to 10 seconds for a restart (it
should be much, much less than this).

Signed-off-by: NeilBrown <neilb@suse.de>
Grow.c

diff --git a/Grow.c b/Grow.c
index 0a084482dcb53ac0eeb87545895567d5c7ed2b85..5840a2c160b0e7a52a7e4edc1e33835e732d8781 100644 (file)
--- a/Grow.c
+++ b/Grow.c
@@ -2625,10 +2625,9 @@ int progress_reshape(struct mdinfo *info, struct reshape *reshape,
        if (fd < 0)
                goto check_progress;
 
-       if (sysfs_fd_get_ll(fd, &completed) < 0) {
-               close(fd);
+       if (sysfs_fd_get_ll(fd, &completed) < 0)
                goto check_progress;
-       }
+
        while (completed < max_progress && completed < wait_point) {
                /* Check that sync_action is still 'reshape' to avoid
                 * waiting forever on a dead array
@@ -2653,10 +2652,8 @@ int progress_reshape(struct mdinfo *info, struct reshape *reshape,
                FD_ZERO(&rfds);
                FD_SET(fd, &rfds);
                select(fd+1, NULL, NULL, &rfds, NULL);
-               if (sysfs_fd_get_ll(fd, &completed) < 0) {
-                       close(fd);
+               if (sysfs_fd_get_ll(fd, &completed) < 0)
                        goto check_progress;
-               }
        }
        /* Some kernels reset 'sync_completed' to zero,
         * we need to have real point we are in md
@@ -2689,13 +2686,37 @@ check_progress:
        /* if we couldn't read a number from sync_completed, then
         * either the reshape did complete, or it aborted.
         * We can tell which by checking for 'none' in reshape_position.
+        * If it did abort, then it might immediately restart if
+        * it was just a device failure that leaves us degraded but
+        * functioning.
         */
        strcpy(buf, "hi");
        if (sysfs_get_str(info, NULL, "reshape_position", buf, sizeof(buf)) < 0
-           || strncmp(buf, "none", 4) != 0)
-               return -2; /* abort */
-       else {
+           || strncmp(buf, "none", 4) != 0) {
+               /* The abort might only be temporary.  Wait up to 10
+                * seconds for fd to contain a valid number again.
+                */
+               struct timeval tv;
+               int rv = -2;
+               tv.tv_sec = 10;
+               tv.tv_usec = 0;
+               while (fd >= 0 && rv < 0) {
+                       fd_set rfds;
+                       FD_ZERO(&rfds);
+                       FD_SET(fd, &rfds);
+                       if (select(fd+1, NULL, NULL, &rfds, &tv) != 1)
+                               break;
+                       if (sysfs_fd_get_ll(fd, &completed) >= 0)
+                               /* all good again */
+                               rv = 1;
+               }
+               if (fd >= 0)
+                       close(fd);
+               return rv; /* abort */
+       } else {
                /* Maybe racing with array shutdown - check state */
+               if (fd >= 0)
+                       close(fd);
                if (sysfs_get_str(info, NULL, "array_state", buf, sizeof(buf)) < 0
                    || strncmp(buf, "inactive", 8) == 0
                    || strncmp(buf, "clear",5) == 0)