]> git.ipfire.org Git - thirdparty/mdadm.git/commitdiff
mdadm: Add --write-zeros option for Create
authorLogan Gunthorpe <logang@deltatee.com>
Wed, 1 Mar 2023 20:41:33 +0000 (13:41 -0700)
committerJes Sorensen <jes@trained-monkey.org>
Mon, 13 Mar 2023 14:08:10 +0000 (10:08 -0400)
Add the --write-zeros option for Create which will send a write zeros
request to all the disks before assembling the array. After zeroing
the array, the disks will be in a known clean state and the initial
sync may be skipped.

Writing zeroes is best used when there is a hardware offload method
to zero the data. But even still, zeroing can take several minutes on
a large device. Because of this, all disks are zeroed in parallel using
their own forked process and a message is printed to the user. The main
process will proceed only after all the zeroing processes have completed
successfully.

Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
Acked-by: Kinga Tanska <kinga.tanska@linux.intel.com>
Reviewed-by: Xiao Ni <xni@redhat.com>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Acked-by: Coly Li <colyli@suse.de>
Signed-off-by: Jes Sorensen <jes@trained-monkey.org>
Create.c
ReadMe.c
mdadm.c
mdadm.h

index 4acda30c5256f7e0df4fea0ffd1392596cc87168..bbe9e13dc76dc0ab2255823816525be0ae7e64ef 100644 (file)
--- a/Create.c
+++ b/Create.c
 #include       "md_u.h"
 #include       "md_p.h"
 #include       <ctype.h>
+#include       <fcntl.h>
+#include       <signal.h>
+#include       <sys/signalfd.h>
+#include       <sys/wait.h>
 
 static int round_size_and_verify(unsigned long long *size, int chunk)
 {
@@ -91,9 +95,149 @@ int default_layout(struct supertype *st, int level, int verbose)
        return layout;
 }
 
+static pid_t write_zeroes_fork(int fd, struct shape *s, struct supertype *st,
+                              struct mddev_dev *dv)
+
+{
+       const unsigned long long req_size = 1 << 30;
+       unsigned long long offset_bytes, size_bytes, sz;
+       sigset_t sigset;
+       int ret = 0;
+       pid_t pid;
+
+       size_bytes = KIB_TO_BYTES(s->size);
+
+       /*
+        * If size_bytes is zero, this is a zoned raid array where
+        * each disk is of a different size and uses its full
+        * disk. Thus zero the entire disk.
+        */
+       if (!size_bytes && !get_dev_size(fd, dv->devname, &size_bytes))
+               return -1;
+
+       if (dv->data_offset != INVALID_SECTORS)
+               offset_bytes = SEC_TO_BYTES(dv->data_offset);
+       else
+               offset_bytes = SEC_TO_BYTES(st->data_offset);
+
+       pr_info("zeroing data from %lld to %lld on: %s\n",
+               offset_bytes, size_bytes, dv->devname);
+
+       pid = fork();
+       if (pid < 0) {
+               pr_err("Could not fork to zero disks: %s\n", strerror(errno));
+               return pid;
+       } else if (pid != 0) {
+               return pid;
+       }
+
+       sigemptyset(&sigset);
+       sigaddset(&sigset, SIGINT);
+       sigprocmask(SIG_UNBLOCK, &sigset, NULL);
+
+       while (size_bytes) {
+               /*
+                * Split requests to the kernel into 1GB chunks seeing the
+                * fallocate() call is not interruptible and blocking a
+                * ctrl-c for several minutes is not desirable.
+                *
+                * 1GB is chosen as a compromise: the user may still have
+                * to wait several seconds if they ctrl-c on devices that
+                * zero slowly, but will reduce the number of requests
+                * required and thus the overhead on devices that perform
+                * better.
+                */
+               sz = size_bytes;
+               if (sz >= req_size)
+                       sz = req_size;
+
+               if (fallocate(fd, FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE,
+                             offset_bytes, sz)) {
+                       pr_err("zeroing %s failed: %s\n", dv->devname,
+                              strerror(errno));
+                       ret = 1;
+                       break;
+               }
+
+               offset_bytes += sz;
+               size_bytes -= sz;
+       }
+
+       exit(ret);
+}
+
+static int wait_for_zero_forks(int *zero_pids, int count)
+{
+       int wstatus, ret = 0, i, sfd, wait_count = 0;
+       struct signalfd_siginfo fdsi;
+       bool interrupted = false;
+       sigset_t sigset;
+       ssize_t s;
+
+       for (i = 0; i < count; i++)
+               if (zero_pids[i])
+                       wait_count++;
+       if (!wait_count)
+               return 0;
+
+       sigemptyset(&sigset);
+       sigaddset(&sigset, SIGINT);
+       sigaddset(&sigset, SIGCHLD);
+       sigprocmask(SIG_BLOCK, &sigset, NULL);
+
+       sfd = signalfd(-1, &sigset, 0);
+       if (sfd < 0) {
+               pr_err("Unable to create signalfd: %s\n", strerror(errno));
+               return 1;
+       }
+
+       while (1) {
+               s = read(sfd, &fdsi, sizeof(fdsi));
+               if (s != sizeof(fdsi)) {
+                       pr_err("Invalid signalfd read: %s\n", strerror(errno));
+                       close(sfd);
+                       return 1;
+               }
+
+               if (fdsi.ssi_signo == SIGINT) {
+                       printf("\n");
+                       pr_info("Interrupting zeroing processes, please wait...\n");
+                       interrupted = true;
+               } else if (fdsi.ssi_signo == SIGCHLD) {
+                       if (!--wait_count)
+                               break;
+               }
+       }
+
+       close(sfd);
+
+       for (i = 0; i < count; i++) {
+               if (!zero_pids[i])
+                       continue;
+
+               waitpid(zero_pids[i], &wstatus, 0);
+               zero_pids[i] = 0;
+               if (!WIFEXITED(wstatus) || WEXITSTATUS(wstatus))
+                       ret = 1;
+       }
+
+       if (interrupted) {
+               pr_err("zeroing interrupted!\n");
+               return 1;
+       }
+
+       if (ret)
+               pr_err("zeroing failed!\n");
+       else
+               pr_info("zeroing finished\n");
+
+       return ret;
+}
+
 static int add_disk_to_super(int mdfd, struct shape *s, struct context *c,
                struct supertype *st, struct mddev_dev *dv,
-               struct mdinfo *info, int have_container, int major_num)
+               struct mdinfo *info, int have_container, int major_num,
+               int *zero_pid)
 {
        dev_t rdev;
        int fd;
@@ -148,6 +292,14 @@ static int add_disk_to_super(int mdfd, struct shape *s, struct context *c,
        }
        st->ss->getinfo_super(st, info, NULL);
 
+       if (fd >= 0 && s->write_zeroes) {
+               *zero_pid = write_zeroes_fork(fd, s, st, dv);
+               if (*zero_pid <= 0) {
+                       ioctl(mdfd, STOP_ARRAY, NULL);
+                       return 1;
+               }
+       }
+
        if (have_container && c->verbose > 0)
                pr_err("Using %s for device %d\n",
                       map_dev(info->disk.major, info->disk.minor, 0),
@@ -224,10 +376,23 @@ static int add_disks(int mdfd, struct mdinfo *info, struct shape *s,
 {
        struct mddev_dev *moved_disk = NULL;
        int pass, raid_disk_num, dnum;
+       int zero_pids[total_slots];
        struct mddev_dev *dv;
        struct mdinfo *infos;
+       sigset_t sigset, orig_sigset;
        int ret = 0;
 
+       /*
+        * Block SIGINT so the main thread will always wait for the
+        * zeroing processes when being interrupted. Otherwise the
+        * zeroing processes will finish their work in the background
+        * keeping the disk busy.
+        */
+       sigemptyset(&sigset);
+       sigaddset(&sigset, SIGINT);
+       sigprocmask(SIG_BLOCK, &sigset, &orig_sigset);
+       memset(zero_pids, 0, sizeof(zero_pids));
+
        infos = xmalloc(sizeof(*infos) * total_slots);
        enable_fds(total_slots);
        for (pass = 1; pass <= 2; pass++) {
@@ -261,7 +426,7 @@ static int add_disks(int mdfd, struct mdinfo *info, struct shape *s,
 
                                ret = add_disk_to_super(mdfd, s, c, st, dv,
                                                &infos[dnum], have_container,
-                                               major_num);
+                                               major_num, &zero_pids[dnum]);
                                if (ret)
                                        goto out;
 
@@ -287,6 +452,10 @@ static int add_disks(int mdfd, struct mdinfo *info, struct shape *s,
                }
 
                if (pass == 1) {
+                       ret = wait_for_zero_forks(zero_pids, total_slots);
+                       if (ret)
+                               goto out;
+
                        ret = update_metadata(mdfd, s, st, map, info,
                                              chosen_name);
                        if (ret)
@@ -295,7 +464,10 @@ static int add_disks(int mdfd, struct mdinfo *info, struct shape *s,
        }
 
 out:
+       if (ret)
+               wait_for_zero_forks(zero_pids, total_slots);
        free(infos);
+       sigprocmask(SIG_SETMASK, &orig_sigset, NULL);
        return ret;
 }
 
index bd8d50d286618de0001700c88e5fc24cdcbf580e..db251ed2f3d4f656ea24cd4e51295153c8c8a620 100644 (file)
--- a/ReadMe.c
+++ b/ReadMe.c
@@ -138,6 +138,7 @@ struct option long_options[] = {
     {"size",     1, 0, 'z'},
     {"auto",     1, 0, Auto}, /* also for --assemble */
     {"assume-clean",0,0, AssumeClean },
+    {"write-zeroes",0,0, WriteZeroes },
     {"metadata",  1, 0, 'e'}, /* superblock format */
     {"bitmap",   1, 0, Bitmap},
     {"bitmap-chunk", 1, 0, BitmapChunk},
@@ -390,6 +391,7 @@ char Help_create[] =
 "  --write-journal=      : Specify journal device for RAID-4/5/6 array\n"
 "  --consistency-policy= : Specify the policy that determines how the array\n"
 "                     -k : maintains consistency in case of unexpected shutdown.\n"
+"  --write-zeroes        : Write zeroes to the disks before creating. This will bypass initial sync.\n"
 "\n"
 ;
 
diff --git a/mdadm.c b/mdadm.c
index 57e8e6fa64b9881f7e5b298a8a8ecb18455633cb..4685ad6b06c2becd8d7110247abe34af7bf08411 100644 (file)
--- a/mdadm.c
+++ b/mdadm.c
@@ -590,6 +590,10 @@ int main(int argc, char *argv[])
                        s.assume_clean = 1;
                        continue;
 
+               case O(CREATE, WriteZeroes):
+                       s.write_zeroes = 1;
+                       continue;
+
                case O(GROW,'n'):
                case O(CREATE,'n'):
                case O(BUILD,'n'): /* number of raid disks */
@@ -1251,6 +1255,11 @@ int main(int argc, char *argv[])
                }
        }
 
+       if (s.write_zeroes && !s.assume_clean) {
+               pr_info("Disk zeroing requested, setting --assume-clean to skip resync\n");
+               s.assume_clean = 1;
+       }
+
        if (!mode && devs_found) {
                mode = MISC;
                devmode = 'Q';
diff --git a/mdadm.h b/mdadm.h
index 4336be4dad82a8a09eea755664e8108c552a172e..b9127f9a8ef64cd5fd6630c73bf731a2868b5167 100644 (file)
--- a/mdadm.h
+++ b/mdadm.h
@@ -275,6 +275,9 @@ static inline void __put_unaligned32(__u32 val, void *p)
 
 #define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0]))
 
+#define KIB_TO_BYTES(x)        ((x) << 10)
+#define SEC_TO_BYTES(x)        ((x) << 9)
+
 extern const char Name[];
 
 struct md_bb_entry {
@@ -435,6 +438,7 @@ extern char Version[], Usage[], Help[], OptionHelp[],
  */
 enum special_options {
        AssumeClean = 300,
+       WriteZeroes,
        BitmapChunk,
        WriteBehind,
        ReAdd,
@@ -640,6 +644,7 @@ struct shape {
        int     bitmap_chunk;
        char    *bitmap_file;
        int     assume_clean;
+       bool    write_zeroes;
        int     write_behind;
        unsigned long long size;
        unsigned long long data_offset;