git.ipfire.org Git - thirdparty/mdadm.git/blobdiff - Manage.c
Grow: fix resize of array component size to > 32bits
[thirdparty/mdadm.git] / Manage.c
index be3c652f85bcf23fd10414f73895fbb5c15ed4ce..206f34efcc07a548e0d95c00341ca48c7ae6dcc3 100644 (file)
--- a/Manage.c
+++ b/Manage.c
@@ -1,7 +1,7 @@
 /*
  * mdadm - manage Linux "md" devices aka RAID arrays.
  *
- * Copyright (C) 2001-2012 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2001-2013 Neil Brown <neilb@suse.de>
  *
  *
  *    This program is free software; you can redistribute it and/or modify
@@ -27,9 +27,9 @@
 #include "md_p.h"
 #include <ctype.h>
 
-#define REGISTER_DEV           _IO (MD_MAJOR, 1)
-#define START_MD               _IO (MD_MAJOR, 2)
-#define STOP_MD                _IO (MD_MAJOR, 3)
+#define REGISTER_DEV           _IO (MD_MAJOR, 1)
+#define START_MD               _IO (MD_MAJOR, 2)
+#define STOP_MD                        _IO (MD_MAJOR, 3)
 
 int Manage_ro(char *devname, int fd, int readonly)
 {
@@ -170,28 +170,24 @@ static void remove_devices(char *devnm, char *path)
        free(path2);
 }
 
-int Manage_run(char *devname, int fd, int verbose)
+int Manage_run(char *devname, int fd, struct context *c) /* returns 1 on an early failure, otherwise IncrementalScan()'s result */
 {
        /* Run the array.  Array must already be configured
         *  Requires >= 0.90.0
         */
-       mdu_param_t param; /* unused */
-       int rv = 0;
+       char nm[32], *nmp; /* nm: local copy of the string fd2devnm() returns; nmp: that returned pointer */
 
        if (md_get_version(fd) < 9000) {
                pr_err("need md driver version 0.90.0 or later\n");
                return 1;
        }
-
-       if (ioctl(fd, RUN_ARRAY, &param)) {
-               if (verbose >= 0)
-                       pr_err("failed to run array %s: %s\n",
-                              devname, strerror(errno));
+       nmp = fd2devnm(fd); /* presumably the md device name behind fd -- confirm against fd2devnm() */
+       if (!nmp) { /* no name could be resolved for this fd: report and give up */
+               pr_err("Cannot find %s in sysfs!!\n", devname);
                return 1;
        }
-       if (verbose >= 0)
-               pr_err("started %s\n", devname);
-       return rv;
+       strcpy(nm, nmp); /* NOTE(review): unbounded copy into nm[32]; confirm fd2devnm() output always fits */
+       return IncrementalScan(c, nm); /* hand the copied name to IncrementalScan(); its result is returned as-is */
 }
 
 int Manage_stop(char *devname, int fd, int verbose, int will_retry)
@@ -206,6 +202,8 @@ int Manage_stop(char *devname, int fd, int verbose, int will_retry)
        char container[32];
        int err;
        int count;
+       char buf[32];
+       unsigned long long rd1, rd2;
 
        if (will_retry && verbose == 0)
                verbose = -1;
@@ -226,7 +224,7 @@ int Manage_stop(char *devname, int fd, int verbose, int will_retry)
        /* Get EXCL access first.  If this fails, then attempting
         * to stop is probably a bad idea.
         */
-       mdi = sysfs_read(fd, NULL, GET_LEVEL|GET_VERSION);
+       mdi = sysfs_read(fd, NULL, GET_LEVEL|GET_COMPONENT|GET_VERSION);
        if (mdi && is_subarray(mdi->text_version)) {
                char *sl;
                strncpy(container, mdi->text_version+1, sizeof(container));
@@ -331,6 +329,121 @@ int Manage_stop(char *devname, int fd, int verbose, int will_retry)
                        }
        }
 
+       /* If the array is undergoing a reshape which changes the number
+        * of devices, then it would be nice to stop it at a point where
+        * it has completed a full number of stripes in both old and
+        * new layouts as this will allow the reshape to be reverted.
+        * So if 'sync_action' is "reshape" and 'raid_disks' shows two
+        * different numbers, then
+        *  - freeze reshape
+        *  - set sync_max to next multiple of both data_disks and
+        *    chunk sizes (or next but one)
+        *  - unfreeze reshape
+        *  - wait on 'sync_completed' for that point to be reached.
+        */
+       if (mdi && (mdi->array.level >= 4 && mdi->array.level <= 6) &&
+           sysfs_attribute_available(mdi, NULL, "sync_action") &&
+           sysfs_attribute_available(mdi, NULL, "reshape_direction") &&
+           sysfs_get_str(mdi, NULL, "sync_action", buf, 20) > 0 &&
+           strcmp(buf, "reshape\n") == 0 &&
+           sysfs_get_two(mdi, NULL, "raid_disks", &rd1, &rd2) == 2 &&
+           sysfs_set_str(mdi, NULL, "sync_action", "frozen") == 0) {
+               /* Array is frozen */
+               unsigned long long position, curr;
+               unsigned long long chunk1, chunk2;
+               unsigned long long rddiv, chunkdiv;
+               unsigned long long sectors;
+               unsigned long long sync_max, old_sync_max;
+               unsigned long long completed;
+               int backwards = 0;
+               int delay;
+               int scfd;
+
+               rd1 -= mdi->array.level == 6 ? 2 : 1;
+               rd2 -= mdi->array.level == 6 ? 2 : 1;
+               sysfs_get_str(mdi, NULL, "reshape_direction", buf, sizeof(buf));
+               if (strncmp(buf, "back", 4) == 0)
+                       backwards = 1;
+               sysfs_get_ll(mdi, NULL, "reshape_position", &position);
+               sysfs_get_two(mdi, NULL, "chunk_size", &chunk1, &chunk2);
+               chunk1 /= 512;
+               chunk2 /= 512;
+               rddiv = GCD(rd1, rd2);
+               chunkdiv = GCD(chunk1, chunk2);
+               sectors = (chunk1/chunkdiv) * chunk2 * (rd1/rddiv) * rd2;
+
+               if (backwards) {
+                       /* Need to subtract 'reshape_position' from
+                        * array size to get equivalent of sync_max.
+                        * Size calculation based on raid5_size in kernel.
+                        */
+                       unsigned long long size = mdi->component_size;
+                       size &= ~(chunk1-1);
+                       size &= ~(chunk2-1);
+                       /* rd1 must be smaller */
+                       position = (position / sectors - 1) * sectors;
+                       sync_max = size - position/rd1;
+               } else {
+                       position = (position / sectors + 2) * sectors;
+                       sync_max = position/rd1;
+               }
+               if (sysfs_get_ll(mdi, NULL, "sync_max", &old_sync_max) < 0)
+                       old_sync_max = mdi->component_size;
+               /* Must not advance sync_max as that could confuse
+                * the reshape monitor */
+               if (sync_max < old_sync_max)
+                       sysfs_set_num(mdi, NULL, "sync_max", sync_max);
+               sysfs_set_str(mdi, NULL, "sync_action", "idle");
+
+               /* That should have set things going again.  Now we
+                * wait a little while (3 second max) for sync_completed
+                * to reach the target.
+                * The reshape process can block for 500msec if
+                * the sync speed limit is hit, so we need to wait
+                * a lot longer than that. 1 second is usually
+                * enough.  3 is safe.
+                */
+               delay = 3000;
+               scfd = sysfs_open(mdi->sys_name, NULL, "sync_completed");
+               while (scfd >= 0 && delay > 0 && old_sync_max > 0) {
+                       sysfs_get_ll(mdi, NULL, "reshape_position", &curr);
+                       sysfs_fd_get_str(scfd, buf, sizeof(buf));
+                       if (strncmp(buf, "none", 4) == 0) {
+                               /* Either reshape has aborted, or hasn't
+                                * quite started yet.  Wait a bit and
+                                * check  'sync_action' to see.
+                                */
+                               usleep(10000);
+                               sysfs_get_str(mdi, NULL, "sync_action", buf, sizeof(buf));
+                               if (strncmp(buf, "reshape", 7) != 0)
+                                       break;
+                       }
+
+                       if (sysfs_fd_get_ll(scfd, &completed) == 0 &&
+                           (completed > sync_max ||
+                            (completed == sync_max && curr != position))) {
+                               while (completed > sync_max) {
+                                       sync_max += sectors / rd1;
+                                       if (backwards)
+                                               position -= sectors;
+                                       else
+                                               position += sectors;
+                               }
+                               if (sync_max < old_sync_max)
+                                       sysfs_set_num(mdi, NULL, "sync_max", sync_max);
+                       }
+
+                       if (!backwards && curr >= position)
+                               break;
+                       if (backwards && curr <= position)
+                               break;
+                       sysfs_wait(scfd, &delay);
+               }
+               if (scfd >= 0)
+                       close(scfd);
+
+       }
+
        /* As we have an O_EXCL open, any use of the device
         * which blocks STOP_ARRAY is probably a transient use,
         * so it is reasonable to retry for a while - 5 seconds.
@@ -601,10 +714,8 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv,
                        return -1;
        }
 
-       if (tst->ss->validate_geometry(
-                   tst, array->level, array->layout,
-                   array->raid_disks, NULL,
-                   ldsize >> 9, INVALID_SECTORS, NULL, NULL, 0) == 0) {
+       if (tst->ss == &super0 && ldsize > 4ULL*1024*1024*1024*1024) {
+               /* More than 4TB is wasted on v0.90 */
                if (!force) {
                        pr_err("%s is larger than %s can "
                               "effectively use.\n"
@@ -672,7 +783,8 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv,
                                break;
                        }
                /* FIXME this is a bad test to be using */
-               if (!tst->sb && dv->disposition != 'a') {
+               if (!tst->sb && (dv->disposition != 'a'
+                                && dv->disposition != 'S')) {
                        /* we are re-adding a device to a
                         * completely dead array - have to depend
                         * on kernel to check
@@ -702,7 +814,7 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv,
                        dev_st = dup_super(tst);
                        dev_st->ss->load_super(dev_st, tfd, NULL);
                }
-               if (dev_st && dev_st->sb) {
+               if (dev_st && dev_st->sb && dv->disposition != 'S') {
                        int rv = attempt_re_add(fd, tfd, dv,
                                                dev_st, tst,
                                                rdev,
@@ -735,13 +847,14 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv,
                                        continue;
                                if (disc.major == 0 && disc.minor == 0)
                                        continue;
+                               found++;
                                if (!(disc.state & (1<<MD_DISK_SYNC)))
                                        continue;
                                avail[disc.raid_disk] = 1;
-                               found++;
                        }
                        array_failed = !enough(array->level, array->raid_disks,
                                               array->layout, 1, avail);
+                       free(avail);
                } else
                        array_failed = 0;
                if (array_failed) {
@@ -1101,7 +1214,7 @@ int Manage_with(struct supertype *tst, int fd, struct mddev_dev *dv,
                                   "slot", dv->used);
                if (rv) {
                        sysfs_free(mdi);
-                       pr_err("Failed to %s as preferred replacement.\n",
+                       pr_err("Failed to set %s as preferred replacement.\n",
                               dv->devname);
                        return -1;
                }
@@ -1125,6 +1238,7 @@ int Manage_subdevs(char *devname, int fd,
         *  'a' - add the device
         *         try HOT_ADD_DISK
         *         If that fails EINVAL, try ADD_NEW_DISK
+        *  'S' - add the device as a spare - don't try re-add
         *  'A' - re-add the device
         *  'r' - remove the device: HOT_REMOVE_DISK
         *        device can be 'faulty' or 'detached' in which case all
@@ -1150,7 +1264,6 @@ int Manage_subdevs(char *devname, int fd,
        mdu_array_info_t array;
        unsigned long long array_size;
        struct mddev_dev *dv;
-       struct stat stb;
        int tfd = -1;
        struct supertype *tst;
        char *subarray = NULL;
@@ -1182,9 +1295,10 @@ int Manage_subdevs(char *devname, int fd,
                goto abort;
        }
 
-       stb.st_rdev = 0;
        for (dv = devlist; dv; dv = dv->next) {
+               unsigned long rdev = 0; /* device to add/remove etc */
                int rv;
+               int mj,mn;
 
                if (strcmp(dv->devname, "failed") == 0 ||
                    strcmp(dv->devname, "faulty") == 0) {
@@ -1277,10 +1391,9 @@ int Manage_subdevs(char *devname, int fd,
                        sysfd = sysfs_open(fd2devnm(fd), dname, "block/dev");
                        if (sysfd >= 0) {
                                char dn[20];
-                               int mj,mn;
                                if (sysfs_fd_get_str(sysfd, dn, 20) > 0 &&
                                    sscanf(dn, "%d:%d", &mj,&mn) == 2) {
-                                       stb.st_rdev = makedev(mj,mn);
+                                       rdev = makedev(mj,mn);
                                        found = 1;
                                }
                                close(sysfd);
@@ -1295,7 +1408,14 @@ int Manage_subdevs(char *devname, int fd,
                                        goto abort;
                                }
                        }
+               } else if ((dv->disposition == 'r' || dv->disposition == 'f')
+                          && get_maj_min(dv->devname, &mj, &mn)) {
+                       /* for 'fail' and 'remove', the device might
+                        * not exist.
+                        */
+                       rdev = makedev(mj, mn);
                } else {
+                       struct stat stb;
                        tfd = dev_open(dv->devname, O_RDONLY);
                        if (tfd >= 0)
                                fstat(tfd, &stb);
@@ -1328,6 +1448,7 @@ int Manage_subdevs(char *devname, int fd,
                                        goto abort;
                                }
                        }
+                       rdev = stb.st_rdev;
                }
                switch(dv->disposition){
                default:
@@ -1335,6 +1456,7 @@ int Manage_subdevs(char *devname, int fd,
                                dv->devname, dv->disposition);
                        goto abort;
                case 'a':
+               case 'S': /* --add-spare */
                case 'A':
                case 'M': /* --re-add missing */
                case 'F': /* --re-add faulty  */
@@ -1347,8 +1469,7 @@ int Manage_subdevs(char *devname, int fd,
                        }
                        if (dv->disposition == 'F')
                                /* Need to remove first */
-                               ioctl(fd, HOT_REMOVE_DISK,
-                                     (unsigned long)stb.st_rdev);
+                               ioctl(fd, HOT_REMOVE_DISK, rdev);
                        /* Make sure it isn't in use (in 2.6 or later) */
                        tfd = dev_open(dv->devname, O_RDONLY|O_EXCL);
                        if (tfd >= 0) {
@@ -1358,7 +1479,7 @@ int Manage_subdevs(char *devname, int fd,
                                 */
                                close(tfd);
                                tfd = dev_open(dv->devname, O_RDONLY);
-                       }                               
+                       }
                        if (tfd < 0) {
                                if (dv->disposition == 'M')
                                        continue;
@@ -1374,7 +1495,7 @@ int Manage_subdevs(char *devname, int fd,
                        }
                        rv = Manage_add(fd, tfd, dv, tst, &array,
                                        force, verbose, devname, update,
-                                       stb.st_rdev, array_size);
+                                       rdev, array_size);
                        close(tfd);
                        tfd = -1;
                        if (rv < 0)
@@ -1392,7 +1513,7 @@ int Manage_subdevs(char *devname, int fd,
                                rv = -1;
                        } else
                                rv = Manage_remove(tst, fd, dv, sysfd,
-                                                  stb.st_rdev, verbose,
+                                                  rdev, verbose,
                                                   devname);
                        if (sysfd >= 0)
                                close(sysfd);
@@ -1407,7 +1528,7 @@ int Manage_subdevs(char *devname, int fd,
                        /* FIXME check current member */
                        if ((sysfd >= 0 && write(sysfd, "faulty", 6) != 6) ||
                            (sysfd < 0 && ioctl(fd, SET_DISK_FAULTY,
-                                               (unsigned long) stb.st_rdev))) {
+                                               rdev))) {
                                if (errno == EBUSY)
                                        busy = 1;
                                pr_err("set device faulty failed for %s:  %s\n",
@@ -1438,7 +1559,7 @@ int Manage_subdevs(char *devname, int fd,
                                                frozen = -1;
                                }
                                rv = Manage_replace(tst, fd, dv,
-                                                   stb.st_rdev, verbose,
+                                                   rdev, verbose,
                                                    devname);
                        }
                        if (rv < 0)
@@ -1452,7 +1573,7 @@ int Manage_subdevs(char *devname, int fd,
                        goto abort;
                case 'w': /* --with device which was matched */
                        rv = Manage_with(tst, fd, dv,
-                                        stb.st_rdev, verbose, devname);
+                                        rdev, verbose, devname);
                        if (rv < 0)
                                goto abort;
                        break;