X-Git-Url: http://git.ipfire.org/?a=blobdiff_plain;f=Grow.c;h=a5a9421b348a57151a349a370cfd3fef94a8f1b3;hb=a34c8836f02c18814f146d3d8b7e415b152ee1ca;hp=20fe9078bf55cd78ce082bd48adcf697f5324ba6;hpb=887a7a9e982139bb6ab3c6b5e6fec1fff9bb7d94;p=thirdparty%2Fmdadm.git diff --git a/Grow.c b/Grow.c index 20fe9078..a5a9421b 100644 --- a/Grow.c +++ b/Grow.c @@ -1,7 +1,7 @@ /* * mdadm - manage Linux "md" devices aka RAID arrays. * - * Copyright (C) 2001-2012 Neil Brown + * Copyright (C) 2001-2013 Neil Brown * * * This program is free software; you can redistribute it and/or modify @@ -24,6 +24,9 @@ #include "mdadm.h" #include "dlink.h" #include +#include +#include +#include #if ! defined(__BIG_ENDIAN) && ! defined(__LITTLE_ENDIAN) #error no endian defined @@ -39,7 +42,7 @@ int restore_backup(struct supertype *st, struct mdinfo *content, int working_disks, int next_spare, - char *backup_file, + char **backup_filep, int verbose) { int i; @@ -47,10 +50,12 @@ int restore_backup(struct supertype *st, struct mdinfo *dev; int err; int disk_count = next_spare + working_disks; + char *backup_file = *backup_filep; dprintf("Called restore_backup()\n"); fdlist = xmalloc(sizeof(int) * disk_count); + enable_fds(next_spare); for (i = 0; i < next_spare; i++) fdlist[i] = -1; for (dev = content->devs; dev; dev = dev->next) { @@ -67,6 +72,11 @@ int restore_backup(struct supertype *st, fdlist[next_spare++] = fd; } + if (!backup_file) { + backup_file = locate_backup(content->sys_name); + *backup_filep = backup_file; + } + if (st->ss->external && st->ss->recover_backup) err = st->ss->recover_backup(st, content); else @@ -382,7 +392,7 @@ int Grow_addbitmap(char *devname, int fd, struct context *c, struct shape *s) "with %s metadata\n", st->ss->name); return 1; } - mdi = sysfs_read(fd, -1, GET_BITMAP_LOCATION); + mdi = sysfs_read(fd, NULL, GET_BITMAP_LOCATION); if (mdi) offset_setable = 1; for (d=0; d< st->max_devs; d++) { @@ -421,7 +431,7 @@ int Grow_addbitmap(char *devname, int fd, struct context *c, struct shape *s) } if (offset_setable) { st->ss->getinfo_super(st, mdi, NULL); - sysfs_init(mdi, fd, -1); + sysfs_init(mdi, fd, NULL); rv = sysfs_set_num_signed(mdi, NULL, "bitmap/location", mdi->bitmap_offset); } else { @@ -533,13 +543,11 @@ static int check_idle(struct supertype *st) /* Check that all member arrays for this container, or the * container of this array, are idle */ - int container_dev = (st->container_dev != NoMdDev - ? st->container_dev : st->devnum); - char container[40]; + char *container = (st->container_devnm[0] + ? st->container_devnm : st->devnm); struct mdstat_ent *ent, *e; int is_idle = 1; - fmt_devname(container, container_dev); ent = mdstat_read(0, 0); for (e = ent ; e; e = e->next) { if (!is_container_member(e, container)) @@ -555,15 +563,12 @@ static int check_idle(struct supertype *st) static int freeze_container(struct supertype *st) { - int container_dev = (st->container_dev != NoMdDev - ? st->container_dev : st->devnum); - char container[40]; + char *container = (st->container_devnm[0] + ? st->container_devnm : st->devnm); if (!check_idle(st)) return -1; - fmt_devname(container, container_dev); - if (block_monitor(container, 1)) { pr_err("failed to freeze container\n"); return -2; @@ -574,11 +579,8 @@ static int freeze_container(struct supertype *st) static void unfreeze_container(struct supertype *st) { - int container_dev = (st->container_dev != NoMdDev - ? st->container_dev : st->devnum); - char container[40]; - - fmt_devname(container, container_dev); + char *container = (st->container_devnm[0] + ? st->container_devnm : st->devnm); unblock_monitor(container, 1); } @@ -594,7 +596,7 @@ static int freeze(struct supertype *st) if (st->ss->external) return freeze_container(st); else { - struct mdinfo *sra = sysfs_read(-1, st->devnum, GET_VERSION); + struct mdinfo *sra = sysfs_read(-1, st->devnm, GET_VERSION); int err; char buf[20]; @@ -616,10 +618,15 @@ static void unfreeze(struct supertype *st) if (st->ss->external) return unfreeze_container(st); else { - struct mdinfo *sra = sysfs_read(-1, st->devnum, GET_VERSION); + struct mdinfo *sra = sysfs_read(-1, st->devnm, GET_VERSION); + char buf[20]; - if (sra) + if (sra && + sysfs_get_str(sra, NULL, "sync_action", buf, 20) > 0 + && strcmp(buf, "frozen\n") == 0) { + printf("unfreeze\n"); sysfs_set_str(sra, NULL, "sync_action", "idle"); + } sysfs_free(sra); } } @@ -632,13 +639,9 @@ static void wait_reshape(struct mdinfo *sra) if (fd < 0) return; - while (sysfs_fd_get_str(fd, action, 20) > 0 && - strncmp(action, "reshape", 7) == 0) { - fd_set rfds; - FD_ZERO(&rfds); - FD_SET(fd, &rfds); - select(fd+1, NULL, NULL, &rfds, NULL); - } + while (sysfs_fd_get_str(fd, action, 20) > 0 && + strncmp(action, "reshape", 7) == 0) + sysfs_wait(fd, NULL); close(fd); } @@ -739,7 +742,8 @@ void abort_reshape(struct mdinfo *sra) sysfs_set_num(sra, NULL, "suspend_hi", 0); sysfs_set_num(sra, NULL, "suspend_lo", 0); sysfs_set_num(sra, NULL, "sync_min", 0); - sysfs_set_str(sra, NULL, "sync_max", "max"); + // It isn't safe to reset sync_max as we aren't monitoring. + // Array really should be stopped at this point. } int remove_disks_for_takeover(struct supertype *st, @@ -845,6 +849,7 @@ int reshape_prepare_fdlist(char *devname, int d = 0; struct mdinfo *sd; + enable_fds(nrdisks); for (d = 0; d <= nrdisks; d++) fdlist[d] = -1; d = raid_disks; @@ -888,6 +893,7 @@ int reshape_open_backup_file(char *backup_file, long blocks, int *fdlist, unsigned long long *offsets, + char *sys_name, int restart) { /* Return 1 on success, 0 on any form of failure */ @@ -935,6 +941,12 @@ int reshape_open_backup_file(char *backup_file, return 0; } + if (!restart && strncmp(backup_file, MAP_DIR, strlen(MAP_DIR)) != 0) { + char *bu = make_backup(sys_name); + symlink(backup_file, bu); + free(bu); + } + return 1; } @@ -950,19 +962,14 @@ unsigned long compute_backup_blocks(int nchunk, int ochunk, a = (ochunk/512) * odata; b = (nchunk/512) * ndata; /* Find GCD */ - while (a != b) { - if (a < b) - b -= a; - if (b < a) - a -= b; - } + a = GCD(a, b); /* LCM == product / GCD */ blocks = (ochunk/512) * (nchunk/512) * odata * ndata / a; return blocks; } -char *analyse_change(struct mdinfo *info, struct reshape *re) +char *analyse_change(char *devname, struct mdinfo *info, struct reshape *re) { /* Based on the current array state in info->array and * the changes in info->new_* etc, determine: @@ -979,12 +986,16 @@ char *analyse_change(struct mdinfo *info, struct reshape *re) * This can be called as part of starting a reshape, or * when assembling an array that is undergoing reshape. */ + int near, far, offset, copies; int new_disks; + int old_chunk, new_chunk; /* delta_parity records change in number of devices * caused by level change */ int delta_parity = 0; + memset(re, 0, sizeof(*re)); + /* If a new level not explicitly given, we assume no-change */ if (info->new_level == UnSet) info->new_level = info->array.level; @@ -999,9 +1010,16 @@ char *analyse_change(struct mdinfo *info, struct reshape *re) /* chunk size is meaningful, must divide component_size * evenly */ - if (info->component_size % (info->new_chunk/512)) - return "New chunk size does not" - " divide component size"; + if (info->component_size % (info->new_chunk/512)) { + unsigned long long shrink = info->component_size; + shrink &= ~(unsigned long long)(info->new_chunk/512-1); + pr_err("New chunk size (%dK) does not evenly divide device size (%lluk)\n", + info->new_chunk/1024, info->component_size/2); + pr_err("After shrinking any filesystem, \"mdadm --grow %s --size %llu\"\n", + devname, shrink/2); + pr_err("will shrink the array so the given chunk size would work.\n"); + return ""; + } break; default: return "chunk size not meaningful for this level"; @@ -1029,9 +1047,6 @@ char *analyse_change(struct mdinfo *info, struct reshape *re) re->level = 0; re->before.data_disks = 1; re->after.data_disks = 1; - re->before.layout = 0; - re->backup_blocks = 0; - re->parity = 0; return NULL; } if (info->new_level == 1) { @@ -1039,8 +1054,6 @@ char *analyse_change(struct mdinfo *info, struct reshape *re) /* Don't know what to do */ return "no change requested for Growing RAID1"; re->level = 1; - re->backup_blocks = 0; - re->parity = 0; return NULL; } if (info->array.raid_disks == 2 && @@ -1066,38 +1079,94 @@ char *analyse_change(struct mdinfo *info, struct reshape *re) return "Impossibly level change request for RAID1"; case 10: - /* RAID10 can only be converted from near mode to - * RAID0 by removing some devices + /* RAID10 can be converted from near mode to + * RAID0 by removing some devices. + * It can also be reshaped if the kernel supports + * new_data_offset. */ - if ((info->array.layout & ~0xff) != 0x100) - return "Cannot Grow RAID10 with far/offset layout"; - /* number of devices must be multiple of number of copies */ - if (info->array.raid_disks % (info->array.layout & 0xff)) - return "RAID10 layout too complex for Grow operation"; + switch (info->new_level) { + case 0: + if ((info->array.layout & ~0xff) != 0x100) + return "Cannot Grow RAID10 with far/offset layout"; + /* number of devices must be multiple of number of copies */ + if (info->array.raid_disks % (info->array.layout & 0xff)) + return "RAID10 layout too complex for Grow operation"; + + new_disks = (info->array.raid_disks + / (info->array.layout & 0xff)); + if (info->delta_disks == UnSet) + info->delta_disks = (new_disks + - info->array.raid_disks); - if (info->new_level != 0) - return "RAID10 can only be changed to RAID0"; - new_disks = (info->array.raid_disks - / (info->array.layout & 0xff)); - if (info->delta_disks == UnSet) - info->delta_disks = (new_disks - - info->array.raid_disks); - - if (info->delta_disks != new_disks - info->array.raid_disks) - return "New number of raid-devices impossible for RAID10"; - if (info->new_chunk && - info->new_chunk != info->array.chunk_size) - return "Cannot change chunk-size with RAID10 Grow"; - - /* looks good */ - re->level = 0; - re->parity = 0; - re->before.data_disks = new_disks; - re->after.data_disks = re->before.data_disks; - re->before.layout = 0; - re->backup_blocks = 0; - return NULL; + if (info->delta_disks != new_disks - info->array.raid_disks) + return "New number of raid-devices impossible for RAID10"; + if (info->new_chunk && + info->new_chunk != info->array.chunk_size) + return "Cannot change chunk-size with RAID10 Grow"; + + /* looks good */ + re->level = 0; + re->before.data_disks = new_disks; + re->after.data_disks = re->before.data_disks; + return NULL; + + case 10: + near = info->array.layout & 0xff; + far = (info->array.layout >> 8) & 0xff; + offset = info->array.layout & 0x10000; + if (far > 1 && !offset) + return "Cannot reshape RAID10 in far-mode"; + copies = near * far; + + old_chunk = info->array.chunk_size * far; + + if (info->new_layout == UnSet) + info->new_layout = info->array.layout; + else { + near = info->new_layout & 0xff; + far = (info->new_layout >> 8) & 0xff; + offset = info->new_layout & 0x10000; + if (far > 1 && !offset) + return "Cannot reshape RAID10 to far-mode"; + if (near * far != copies) + return "Cannot change number of copies" + " when reshaping RAID10"; + } + if (info->delta_disks == UnSet) + info->delta_disks = 0; + new_disks = (info->array.raid_disks + + info->delta_disks); + + new_chunk = info->new_chunk * far; + + re->level = 10; + re->before.layout = info->array.layout; + re->before.data_disks = info->array.raid_disks; + re->after.layout = info->new_layout; + re->after.data_disks = new_disks; + /* For RAID10 we don't do backup but do allow reshape, + * so set backup_blocks to INVALID_SECTORS rather than + * zero. + * And there is no need to synchronise stripes on both + * 'old' and 'new'. So the important + * number is the minimum data_offset difference + * which is the larger of (offset copies * chunk). + */ + re->backup_blocks = INVALID_SECTORS; + re->min_offset_change = max(old_chunk, new_chunk) / 512; + if (new_disks < re->before.data_disks && + info->space_after < re->min_offset_change) + /* Reduce component size by one chunk */ + re->new_size = (info->component_size - + re->min_offset_change); + else + re->new_size = info->component_size; + re->new_size = re->new_size * new_disks / copies; + return NULL; + default: + return "RAID10 can only be changed to RAID0"; + } case 0: /* RAID0 can be converted to RAID10, or to RAID456 */ if (info->new_level == 10) { @@ -1128,12 +1197,10 @@ char *analyse_change(struct mdinfo *info, struct reshape *re) return "Cannot change chunk-size with RAID0->RAID10"; /* looks good */ re->level = 10; - re->parity = 0; re->before.data_disks = (info->array.raid_disks + info->delta_disks); re->after.data_disks = re->before.data_disks; re->before.layout = info->new_layout; - re->backup_blocks = 0; return NULL; } @@ -1151,11 +1218,15 @@ char *analyse_change(struct mdinfo *info, struct reshape *re) delta_parity = 1; re->level = 5; re->before.layout = ALGORITHM_PARITY_N; + if (info->new_layout == UnSet) + info->new_layout = map_name(r5layout, "default"); break; case 6: delta_parity = 2; re->level = 6; re->before.layout = ALGORITHM_PARITY_N; + if (info->new_layout == UnSet) + info->new_layout = map_name(r6layout, "default"); break; default: return "Impossible level change requested"; @@ -1215,7 +1286,8 @@ char *analyse_change(struct mdinfo *info, struct reshape *re) return "Cannot set raid_disk when " "converting RAID5->RAID1"; re->level = 1; - break; + info->new_chunk = 0; + return NULL; default: return "Impossible level change requested"; } @@ -1253,6 +1325,7 @@ char *analyse_change(struct mdinfo *info, struct reshape *re) switch (re->level) { case 4: + re->before.layout = 0; re->after.layout = 0; break; case 5: @@ -1268,6 +1341,7 @@ char *analyse_change(struct mdinfo *info, struct reshape *re) switch (re->level) { case 4: + re->before.layout = 0; re->after.layout = 0; break; case 5: @@ -1355,17 +1429,20 @@ char *analyse_change(struct mdinfo *info, struct reshape *re) /* So we have a restripe operation, we need to calculate the number * of blocks per reshape operation. */ + re->new_size = info->component_size * re->before.data_disks; if (info->new_chunk == 0) info->new_chunk = info->array.chunk_size; if (re->after.data_disks == re->before.data_disks && re->after.layout == re->before.layout && info->new_chunk == info->array.chunk_size) { - /* Nothing to change */ + /* Nothing to change, can change level immediately. */ + re->level = info->new_level; re->backup_blocks = 0; return NULL; } if (re->after.data_disks == 1 && re->before.data_disks == 1) { /* chunk and layout changes make no difference */ + re->level = info->new_level; re->backup_blocks = 0; return NULL; } @@ -1382,6 +1459,7 @@ char *analyse_change(struct mdinfo *info, struct reshape *re) info->new_chunk, info->array.chunk_size, re->after.data_disks, re->before.data_disks); + re->min_offset_change = re->backup_blocks / re->before.data_disks; re->new_size = info->component_size * re->after.data_disks; return NULL; @@ -1428,6 +1506,7 @@ static int set_array_size(struct supertype *st, struct mdinfo *sra, static int reshape_array(char *container, int fd, char *devname, struct supertype *st, struct mdinfo *info, int force, struct mddev_dev *devlist, + unsigned long long data_offset, char *backup_file, int verbose, int forked, int restart, int freeze_reshape); static int reshape_container(char *container, char *devname, @@ -1435,11 +1514,12 @@ static int reshape_container(char *container, char *devname, struct supertype *st, struct mdinfo *info, int force, - char *backup_file, - int verbose, int restart, int freeze_reshape); + char *backup_file, int verbose, + int forked, int restart, int freeze_reshape); int Grow_reshape(char *devname, int fd, struct mddev_dev *devlist, + unsigned long long data_offset, struct context *c, struct shape *s) { /* Make some changes in the shape of an array. @@ -1467,7 +1547,6 @@ int Grow_reshape(char *devname, int fd, int frozen; int changed = 0; char *container = NULL; - char container_buf[20]; int cfd = -1; struct mddev_dev *dv; @@ -1481,6 +1560,11 @@ int Grow_reshape(char *devname, int fd, devname); return 1; } + if (data_offset != INVALID_SECTORS && array.level != 10 + && (array.level < 4 || array.level > 6)) { + pr_err("--grow --data-offset not yet supported\n"); + return 1; + } if (s->size > 0 && (s->chunk || s->level!= UnSet || s->layout_str || s->raiddisks)) { @@ -1515,16 +1599,15 @@ int Grow_reshape(char *devname, int fd, * pre-requisite spare devices (mdmon owns final validation) */ if (st->ss->external) { - int container_dev; int rv; if (subarray) { - container_dev = st->container_dev; - cfd = open_dev_excl(st->container_dev); + container = st->container_devnm; + cfd = open_dev_excl(st->container_devnm); } else { - container_dev = st->devnum; + container = st->devnm; close(fd); - cfd = open_dev_excl(st->devnum); + cfd = open_dev_excl(st->devnm); fd = cfd; } if (cfd < 0) { @@ -1534,9 +1617,6 @@ int Grow_reshape(char *devname, int fd, return 1; } - fmt_devname(container_buf, container_dev); - container = container_buf; - rv = st->ss->load_container(st, cfd, NULL); if (rv) { @@ -1567,7 +1647,7 @@ int Grow_reshape(char *devname, int fd, pr_err("cannot reshape arrays in" " container with unsupported" " metadata: %s(%s)\n", - devname, container_buf); + devname, container); sysfs_free(cc); free(subarray); return 1; @@ -1575,7 +1655,7 @@ int Grow_reshape(char *devname, int fd, } sysfs_free(cc); } - if (mdmon_running(container_dev)) + if (mdmon_running(container)) st->update_tail = &st->updates; } @@ -1594,7 +1674,7 @@ int Grow_reshape(char *devname, int fd, return 1; } - sra = sysfs_read(fd, 0, GET_LEVEL | GET_DISKS | GET_DEVS + sra = sysfs_read(fd, NULL, GET_LEVEL | GET_DISKS | GET_DEVS | GET_STATE | GET_VERSION); if (sra) { if (st->ss->external && subarray == NULL) { @@ -1628,6 +1708,12 @@ int Grow_reshape(char *devname, int fd, if (orig_size == 0) orig_size = (unsigned) array.size; + if (orig_size == 0) { + pr_err("Cannot set device size in this type of array.\n"); + rv = 1; + goto release; + } + if (reshape_super(st, s->size, UnSet, UnSet, 0, 0, UnSet, NULL, devname, APPLY_METADATA_CHANGES, c->verbose > 0)) { rv = 1; @@ -1718,10 +1804,10 @@ int Grow_reshape(char *devname, int fd, } /* make sure mdmon is * aware of the new level */ - if (!mdmon_running(st->container_dev)) - start_mdmon(st->container_dev); + if (!mdmon_running(st->container_devnm)) + start_mdmon(st->container_devnm); ping_monitor(container); - if (mdmon_running(st->container_dev) && + if (mdmon_running(st->container_devnm) && st->update_tail == NULL) st->update_tail = &st->updates; } @@ -1777,12 +1863,12 @@ size_change_error: goto release; } if (s->assume_clean) { - /* This will fail on kernels newer than 3.0 unless + /* This will fail on kernels older than 3.0 unless * a backport has been arranged. */ if (sra == NULL || sysfs_set_str(sra, NULL, "resync_start", "none") < 0) - pr_err("--assume-clean not support with --grow on this kernel\n"); + pr_err("--assume-clean not supported with --grow on this kernel\n"); } ioctl(fd, GET_ARRAY_INFO, &array); s->size = get_component_size(fd)/2; @@ -1809,6 +1895,7 @@ size_change_error: if ((s->level == UnSet || s->level == array.level) && (s->layout_str == NULL) && (s->chunk == 0 || s->chunk == array.chunk_size) && + data_offset == INVALID_SECTORS && (s->raiddisks == 0 || s->raiddisks == array.raid_disks)) { /* Nothing more to do */ if (!changed && c->verbose >= 0) @@ -1845,7 +1932,7 @@ size_change_error: memset(&info, 0, sizeof(info)); info.array = array; - sysfs_init(&info, fd, NoMdDev); + sysfs_init(&info, fd, NULL); strcpy(info.text_version, sra->text_version); info.component_size = s->size*2; info.new_level = s->level; @@ -1980,7 +2067,7 @@ size_change_error: * performed at the level of the container */ rv = reshape_container(container, devname, -1, st, &info, - c->force, c->backup_file, c->verbose, 0, 0); + c->force, c->backup_file, c->verbose, 0, 0, 0); frozen = 0; } else { /* get spare devices from external metadata @@ -2009,7 +2096,8 @@ size_change_error: } sync_metadata(st); rv = reshape_array(container, fd, devname, st, &info, c->force, - devlist, c->backup_file, c->verbose, 0, 0, 0); + devlist, data_offset, c->backup_file, c->verbose, + 0, 0, 0); frozen = 0; } release: @@ -2076,9 +2164,641 @@ static int verify_reshape_position(struct mdinfo *info, int level) return ret_val; } +static unsigned long long choose_offset(unsigned long long lo, + unsigned long long hi, + unsigned long long min, + unsigned long long max) +{ + /* Choose a new offset between hi and lo. + * It must be between min and max, but + * we would prefer something near the middle of hi/lo, and also + * prefer to be aligned to a big power of 2. + * + * So we start with the middle, then for each bit, + * starting at '1' and increasing, if it is set, we either + * add it or subtract it if possible, preferring the option + * which is furthest from the boundary. + * + * We stop once we get a 1MB alignment. As units are in sectors, + * 1MB = 2*1024 sectors. + */ + unsigned long long choice = (lo + hi) / 2; + unsigned long long bit = 1; + + for (bit = 1; bit < 2*1024; bit = bit << 1) { + unsigned long long bigger, smaller; + if (! (bit & choice)) + continue; + bigger = choice + bit; + smaller = choice - bit; + if (bigger > max && smaller < min) + break; + if (bigger > max) + choice = smaller; + else if (smaller < min) + choice = bigger; + else if (hi - bigger > smaller - lo) + choice = bigger; + else + choice = smaller; + } + return choice; +} + +static int set_new_data_offset(struct mdinfo *sra, struct supertype *st, + char *devname, int delta_disks, + unsigned long long data_offset, + unsigned long long min, + int can_fallback) +{ + struct mdinfo *sd; + int dir = 0; + int err = 0; + unsigned long long before, after; + + /* Need to find min space before and after so same is used + * on all devices + */ + before = UINT64_MAX; + after = UINT64_MAX; + for (sd = sra->devs; sd; sd = sd->next) { + char *dn; + int dfd; + int rv; + struct supertype *st2; + struct mdinfo info2; + + if (sd->disk.state & (1<disk.major, sd->disk.minor, 0); + dfd = dev_open(dn, O_RDONLY); + if (dfd < 0) { + pr_err("%s: cannot open component %s\n", + devname, dn ? dn : "-unknown-"); + goto release; + } + st2 = dup_super(st); + rv = st2->ss->load_super(st2,dfd, NULL); + close(dfd); + if (rv) { + free(st2); + pr_err("%s: cannot get superblock from %s\n", + devname, dn); + goto release; + } + st2->ss->getinfo_super(st2, &info2, NULL); + st2->ss->free_super(st2); + free(st2); + if (info2.space_before == 0 && + info2.space_after == 0) { + /* Metadata doesn't support data_offset changes */ + return 1; + } + if (before > info2.space_before) + before = info2.space_before; + if (after > info2.space_after) + after = info2.space_after; + + if (data_offset != INVALID_SECTORS) { + if (dir == 0) { + if (info2.data_offset == data_offset) { + pr_err("%s: already has that data_offset\n", + dn); + goto release; + } + if (data_offset < info2.data_offset) + dir = -1; + else + dir = 1; + } else if ((data_offset <= info2.data_offset && dir == 1) || + (data_offset >= info2.data_offset && dir == -1)) { + pr_err("%s: differing data offsets on devices make this --data-offset setting impossible\n", + dn); + goto release; + } + } + } + if (before == UINT64_MAX) + /* impossible really, there must be no devices */ + return 1; + + for (sd = sra->devs; sd; sd = sd->next) { + char *dn = map_dev(sd->disk.major, sd->disk.minor, 0); + unsigned long long new_data_offset; + + if (sd->disk.state & (1<data_offset + min; + else { + if (data_offset < sd->data_offset + min) { + pr_err("--data-offset too small for %s\n", + dn); + goto release; + } + new_data_offset = data_offset; + } + } else if (delta_disks > 0) { + /* need space before */ + if (before < min) { + if (can_fallback) + goto fallback; + pr_err("Insufficient head-space for reshape on %s\n", + dn); + goto release; + } + if (data_offset == INVALID_SECTORS) + new_data_offset = sd->data_offset - min; + else { + if (data_offset > sd->data_offset - min) { + pr_err("--data-offset too large for %s\n", + dn); + goto release; + } + new_data_offset = data_offset; + } + } else { + if (dir == 0) { + /* can move up or down. If 'data_offset' + * was set we would have already decided, + * so just choose direction with most space. + */ + if (before > after) + dir = -1; + else + dir = 1; + } + sysfs_set_str(sra, NULL, "reshape_direction", + dir == 1 ? "backwards" : "forwards"); + if (dir > 0) { + /* Increase data offset */ + if (after < min) { + if (can_fallback) + goto fallback; + pr_err("Insufficient tail-space for reshape on %s\n", + dn); + goto release; + } + if (data_offset != INVALID_SECTORS && + data_offset < sd->data_offset + min) { + pr_err("--data-offset too small on %s\n", + dn); + goto release; + } + if (data_offset != INVALID_SECTORS) + new_data_offset = data_offset; + else + new_data_offset = choose_offset(sd->data_offset, + sd->data_offset + after, + sd->data_offset + min, + sd->data_offset + after); + } else { + /* Decrease data offset */ + if (before < min) { + if (can_fallback) + goto fallback; + pr_err("insufficient head-room on %s\n", + dn); + goto release; + } + if (data_offset != INVALID_SECTORS && + data_offset < sd->data_offset - min) { + pr_err("--data-offset too small on %s\n", + dn); + goto release; + } + if (data_offset != INVALID_SECTORS) + new_data_offset = data_offset; + else + new_data_offset = choose_offset(sd->data_offset - before, + sd->data_offset, + sd->data_offset - before, + sd->data_offset - min); + } + } + err = sysfs_set_num(sra, sd, "new_offset", new_data_offset); + if (err < 0 && errno == E2BIG) { + /* try again after increasing data size to max */ + err = sysfs_set_num(sra, sd, "size", 0); + if (err < 0 && errno == EINVAL && + !(sd->disk.state & (1<component_size + after)/2); + } + err = sysfs_set_num(sra, sd, "new_offset", + new_data_offset); + } + if (err < 0) { + if (errno == E2BIG && data_offset != INVALID_SECTORS) { + pr_err("data-offset is too big for %s\n", + dn); + goto release; + } + if (sd == sra->devs && + (errno == ENOENT || errno == E2BIG)) + /* Early kernel, no 'new_offset' file, + * or kernel doesn't like us. + * For RAID5/6 this is not fatal + */ + return 1; + pr_err("Cannot set new_offset for %s\n", + dn); + break; + } + } + return err; +release: + return -1; +fallback: + /* Just use a backup file */ + return 1; +} + +static int raid10_reshape(char *container, int fd, char *devname, + struct supertype *st, struct mdinfo *info, + struct reshape *reshape, + unsigned long long data_offset, + int force, int verbose) +{ + /* Changing raid_disks, layout, chunksize or possibly + * just data_offset for a RAID10. + * We must always change data_offset. We change by at least + * ->min_offset_change which is the largest of the old and new + * chunk sizes. + * If raid_disks is increasing, then data_offset must decrease + * by at least this copy size. + * If raid_disks is unchanged, data_offset must increase or + * decrease by at least min_offset_change but preferably by much more. + * We choose half of the available space. + * If raid_disks is decreasing, data_offset must increase by + * at least min_offset_change. To allow of this, component_size + * must be decreased by the same amount. + * + * So we calculate the required minimum and direction, possibly + * reduce the component_size, then iterate through the devices + * and set the new_data_offset. + * If that all works, we set chunk_size, layout, raid_disks, and start + * 'reshape' + */ + struct mdinfo *sra; + unsigned long long min; + int err = 0; + + sra = sysfs_read(fd, NULL, + GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE|GET_CHUNK + ); + if (!sra) { + pr_err("%s: Cannot get array details from sysfs\n", + devname); + goto release; + } + min = reshape->min_offset_change; + + if (info->delta_disks) + sysfs_set_str(sra, NULL, "reshape_direction", + info->delta_disks < 0 ? "backwards" : "forwards"); + if (info->delta_disks < 0 && + info->space_after < min) { + int rv = sysfs_set_num(sra, NULL, "component_size", + (sra->component_size - + min)/2); + if (rv) { + pr_err("cannot reduce component size\n"); + goto release; + } + } + err = set_new_data_offset(sra, st, devname, info->delta_disks, data_offset, + min, 0); + if (err == 1) { + pr_err("Cannot set new_data_offset: RAID10 reshape not\n"); + cont_err("supported on this kernel\n"); + err = -1; + } + if (err < 0) + goto release; + + if (!err && sysfs_set_num(sra, NULL, "chunk_size", info->new_chunk) < 0) + err = errno; + if (!err && sysfs_set_num(sra, NULL, "layout", reshape->after.layout) < 0) + err = errno; + if (!err && sysfs_set_num(sra, NULL, "raid_disks", + info->array.raid_disks + info->delta_disks) < 0) + err = errno; + if (!err && sysfs_set_str(sra, NULL, "sync_action", "reshape") < 0) + err = errno; + if (err) { + pr_err("Cannot set array shape for %s\n", + devname); + if (err == EBUSY && + (info->array.state & (1<devs; sd; sd = sd->next) { + char *dn; + int dfd; + struct supertype *st2; + struct mdinfo info2; + + if (sd->disk.state & (1<disk.major, sd->disk.minor, 0); + dfd = dev_open(dn, O_RDONLY); + if (dfd < 0) + break; + st2 = dup_super(st); + if (st2->ss->load_super(st2,dfd, NULL)) { + close(dfd); + free(st2); + break; + } + close(dfd); + st2->ss->getinfo_super(st2, &info2, NULL); + st2->ss->free_super(st2); + free(st2); + if (first || + min_space_before > info2.space_before) + min_space_before = info2.space_before; + if (first || + min_space_after > info2.space_after) + min_space_after = info2.space_after; + first = 0; + } + if (sd == NULL && !first) { + info->space_after = min_space_after; + info->space_before = min_space_before; + } + sysfs_free(sra); +} + +static void update_cache_size(char *container, struct mdinfo *sra, + struct mdinfo *info, + int disks, unsigned long long blocks) +{ + /* Check that the internal stripe cache is + * large enough, or it won't work. + * It must hold at least 4 stripes of the larger + * chunk size + */ + unsigned long cache; + cache = max(info->array.chunk_size, info->new_chunk); + cache *= 4; /* 4 stripes minimum */ + cache /= 512; /* convert to sectors */ + /* make sure there is room for 'blocks' with a bit to spare */ + if (cache < 16 + blocks / disks) + cache = 16 + blocks / disks; + cache /= (4096/512); /* Covert from sectors to pages */ + + if (sra->cache_size < cache) + subarray_set_num(container, sra, "stripe_cache_size", + cache+1); +} + +static int impose_reshape(struct mdinfo *sra, + struct mdinfo *info, + struct supertype *st, + int fd, + int restart, + char *devname, char *container, + struct reshape *reshape) +{ + struct mdu_array_info_s array; + + sra->new_chunk = info->new_chunk; + + if (restart) { + /* for external metadata checkpoint saved by mdmon can be lost + * or missed /due to e.g. crash/. Check if md is not during + * restart farther than metadata points to. + * If so, this means metadata information is obsolete. + */ + if (st->ss->external) + verify_reshape_position(info, reshape->level); + sra->reshape_progress = info->reshape_progress; + } else { + sra->reshape_progress = 0; + if (reshape->after.data_disks < reshape->before.data_disks) + /* start from the end of the new array */ + sra->reshape_progress = (sra->component_size + * reshape->after.data_disks); + } + + ioctl(fd, GET_ARRAY_INFO, &array); + if (info->array.chunk_size == info->new_chunk && + reshape->before.layout == reshape->after.layout && + st->ss->external == 0) { + /* use SET_ARRAY_INFO but only if reshape hasn't started */ + array.raid_disks = reshape->after.data_disks + reshape->parity; + if (!restart && + ioctl(fd, SET_ARRAY_INFO, &array) != 0) { + int err = errno; + + pr_err("Cannot set device shape for %s: %s\n", + devname, strerror(errno)); + + if (err == EBUSY && + (array.state & (1<new_chunk) < 0) + err = errno; + if (!err && sysfs_set_num(sra, NULL, "layout", + reshape->after.layout) < 0) + err = errno; + if (!err && subarray_set_num(container, sra, "raid_disks", + reshape->after.data_disks + + reshape->parity) < 0) + err = errno; + if (err) { + pr_err("Cannot set device shape for %s\n", + devname); + + if (err == EBUSY && + (array.state & (1<= 4 && array.level <= 6)) { + /* To convert to RAID0 we need to fail and + * remove any non-data devices. */ + int found = 0; + int d; + int data_disks = array.raid_disks - 1; + if (array.level == 6) + data_disks -= 1; + if (array.level == 5 && + array.layout != ALGORITHM_PARITY_N) + return -1; + if (array.level == 6 && + array.layout != ALGORITHM_PARITY_N_6) + return -1; + sysfs_set_str(&info, NULL,"sync_action", "idle"); + /* First remove any spares so no recovery starts */ + for (d = 0, found = 0; + d < MAX_DISKS && found < array.nr_disks; + d++) { + mdu_disk_info_t disk; + disk.number = d; + if (ioctl(fd, GET_DISK_INFO, &disk) < 0) + continue; + if (disk.major == 0 && disk.minor == 0) + continue; + found++; + if ((disk.state & (1 << MD_DISK_ACTIVE)) + && disk.raid_disk < data_disks) + /* keep this */ + continue; + ioctl(fd, HOT_REMOVE_DISK, + makedev(disk.major, disk.minor)); + } + /* Now fail anything left */ + ioctl(fd, GET_ARRAY_INFO, &array); + for (d = 0, found = 0; + d < MAX_DISKS && found < array.nr_disks; + d++) { + int cnt; + mdu_disk_info_t disk; + disk.number = d; + if (ioctl(fd, GET_DISK_INFO, &disk) < 0) + continue; + if (disk.major == 0 && disk.minor == 0) + continue; + found++; + if ((disk.state & (1 << MD_DISK_ACTIVE)) + && disk.raid_disk < data_disks) + /* keep this */ + continue; + ioctl(fd, SET_DISK_FAULTY, + makedev(disk.major, disk.minor)); + cnt = 5; + while (ioctl(fd, HOT_REMOVE_DISK, + makedev(disk.major, disk.minor)) < 0 + && errno == EBUSY + && cnt--) { + usleep(10000); + } + } + } + c = map_num(pers, level); + if (c) { + int err = sysfs_set_str(&info, NULL, "level", c); + if (err) { + err = errno; + pr_err("%s: could not set level to %s\n", + devname, c); + if (err == EBUSY && + (array.state & (1<= 0) + pr_err("level of %s changed to %s\n", + devname, c); + } + return 0; +} + +int sigterm = 0; +static void catch_term(int sig) +{ + sigterm = 1; +} + +static int continue_via_systemd(char *devnm) +{ + int skipped, i, pid, status; + char pathbuf[1024]; + /* In a systemd/udev world, it is best to get systemd to + * run "mdadm --grow --continue" rather than running in the + * background. + */ + switch(fork()) { + case 0: + /* FIXME yuk. CLOSE_EXEC?? */ + skipped = 0; + for (i = 3; skipped < 20; i++) + if (close(i) < 0) + skipped++; + else + skipped = 0; + + /* Don't want to see error messages from + * systemctl. If the service doesn't exist, + * we fork ourselves. + */ + close(2); + open("/dev/null", O_WRONLY); + snprintf(pathbuf, sizeof(pathbuf), "mdadm-grow-continue@%s.service", + devnm); + status = execl("/usr/bin/systemctl", "systemctl", + "start", + pathbuf, NULL); + status = execl("/bin/systemctl", "systemctl", "start", + pathbuf, NULL); + exit(1); + case -1: /* Just do it ourselves. */ + break; + default: /* parent - good */ + pid = wait(&status); + if (pid >= 0 && status == 0) + return 1; + } + return 0; +} + static int reshape_array(char *container, int fd, char *devname, struct supertype *st, struct mdinfo *info, int force, struct mddev_dev *devlist, + unsigned long long data_offset, char *backup_file, int verbose, int forked, int restart, int freeze_reshape) { @@ -2086,7 +2806,8 @@ static int reshape_array(char *container, int fd, char *devname, int spares_needed; char *msg; int orig_level = UnSet; - int disks, odisks; + int odisks; + int delayed; struct mdu_array_info_s array; char *c; @@ -2100,7 +2821,6 @@ static int reshape_array(char *container, int fd, char *devname, int nrdisks; int err; unsigned long blocks; - unsigned long cache; unsigned long long array_size; int done; struct mdinfo *sra = NULL; @@ -2117,12 +2837,16 @@ static int reshape_array(char *container, int fd, char *devname, info->component_size = array_size / array.raid_disks; } + if (array.level == 10) + /* Need space_after info */ + get_space_after(fd, st, info); + if (info->reshape_active) { int new_level = info->new_level; info->new_level = UnSet; if (info->delta_disks > 0) info->array.raid_disks -= info->delta_disks; - msg = analyse_change(info, &reshape); + msg = analyse_change(devname, info, &reshape); info->new_level = new_level; if (info->delta_disks > 0) info->array.raid_disks += info->delta_disks; @@ -2130,9 +2854,11 @@ static int reshape_array(char *container, int fd, char *devname, /* Make sure the array isn't read-only */ ioctl(fd, RESTART_ARRAY_RW, 0); } else - msg = analyse_change(info, &reshape); + msg = analyse_change(devname, info, &reshape); if (msg) { - pr_err("%s\n", msg); + /* if msg == "", error has already been printed */ + if (msg[0]) + pr_err("%s\n", msg); goto release; } if (restart && @@ -2161,6 +2887,22 @@ static int reshape_array(char *container, int fd, char *devname, /* reshape already started. just skip to monitoring the reshape */ if (reshape.backup_blocks == 0) return 0; + if (restart & RESHAPE_NO_BACKUP) + return 0; + + /* Need 'sra' down at 'started:' */ + sra = sysfs_read(fd, NULL, + GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE|GET_CHUNK| + GET_CACHE); + if (!sra) { + pr_err("%s: Cannot get array details from sysfs\n", + devname); + goto release; + } + + if (!backup_file) + backup_file = locate_backup(sra->sys_name); + goto started; } /* The container is frozen but the array may not be. @@ -2203,37 +2945,23 @@ static int reshape_array(char *container, int fd, char *devname, } if (reshape.level != array.level) { - char *c = map_num(pers, reshape.level); - int err; - if (c == NULL) - goto release; - - err = sysfs_set_str(info, NULL, "level", c); - if (err) { - err = errno; - pr_err("%s: could not set level to %s\n", - devname, c); - if (err == EBUSY && - (info->array.state & (1<= 0) - pr_err("level of %s changed to %s\n", - devname, c); + info->new_layout = UnSet; /* after level change, + * layout is meaningless */ orig_level = array.level; sysfs_freeze_array(info); if (reshape.level > 0 && st->ss->external) { /* make sure mdmon is aware of the new level */ - if (mdmon_running(st->container_dev)) + if (mdmon_running(container)) flush_mdmon(container); - if (!mdmon_running(st->container_dev)) - start_mdmon(st->container_dev); + if (!mdmon_running(container)) + start_mdmon(container); ping_monitor(container); - if (mdmon_running(st->container_dev) && + if (mdmon_running(container) && st->update_tail == NULL) st->update_tail = &st->updates; } @@ -2250,7 +2978,7 @@ static int reshape_array(char *container, int fd, char *devname, struct mdinfo *d; if (info2) { - sysfs_init(info2, fd, st->devnum); + sysfs_init(info2, fd, st->devnm); /* When increasing number of devices, we need to set * new raid_disks before adding these, or they might * be rejected. @@ -2280,6 +3008,8 @@ static int reshape_array(char *container, int fd, char *devname, Manage_subdevs(devname, fd, devlist, verbose, 0,NULL, 0); + if (reshape.backup_blocks == 0 && data_offset != INVALID_SECTORS) + reshape.backup_blocks = reshape.before.data_disks * info->array.chunk_size/512; if (reshape.backup_blocks == 0) { /* No restriping needed, but we might need to impose * some more changes: layout, raid_disks, chunk_size @@ -2360,7 +3090,6 @@ static int reshape_array(char *container, int fd, char *devname, * - request the shape change. * - fork to handle backup etc. */ -started: /* Check that we can hold all the data */ get_dev_size(fd, NULL, &array_size); if (reshape.new_size < (array_size/512)) { @@ -2371,7 +3100,20 @@ started: goto release; } - sra = sysfs_read(fd, 0, + if (array.level == 10) { + /* Reshaping RAID10 does not require any data backup by + * user-space. Instead it requires that the data_offset + * is changed to avoid the need for backup. + * So this is handled very separately + */ + if (restart) + /* Nothing to do. */ + return 0; + return raid10_reshape(container, fd, devname, st, info, + &reshape, data_offset, + force, verbose); + } + sra = sysfs_read(fd, NULL, GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE|GET_CHUNK| GET_CACHE); if (!sra) { @@ -2380,6 +3122,58 @@ started: goto release; } + if (!backup_file) + switch(set_new_data_offset(sra, st, devname, + reshape.after.data_disks - reshape.before.data_disks, + data_offset, + reshape.min_offset_change, 1)) { + case -1: + goto release; + case 0: + /* Updated data_offset, so it's easy now */ + update_cache_size(container, sra, info, + min(reshape.before.data_disks, + reshape.after.data_disks), + reshape.backup_blocks); + + /* Right, everything seems fine. Let's kick things off. + */ + sync_metadata(st); + + if (impose_reshape(sra, info, st, fd, restart, + devname, container, &reshape) < 0) + goto release; + if (sysfs_set_str(sra, NULL, "sync_action", "reshape") < 0) { + pr_err("Failed to initiate reshape!\n"); + goto release; + } + if (info->new_level == reshape.level) + return 0; + /* need to adjust level when reshape completes */ + switch(fork()) { + case -1: /* ignore error, but don't wait */ + return 0; + default: /* parent */ + return 0; + case 0: + map_fork(); + break; + } + close(fd); + wait_reshape(sra); + fd = open_dev(sra->sys_name); + if (fd >= 0) + impose_level(fd, info->new_level, devname, verbose); + return 0; + case 1: /* Couldn't set data_offset, try the old way */ + if (data_offset != INVALID_SECTORS) { + pr_err("Cannot update data_offset on this array\n"); + goto release; + } + break; + } + +started: /* Decide how many blocks (sectors) for a reshape * unit. The number we have so far is just a minimum */ @@ -2424,8 +3218,9 @@ started: if (backup_file == NULL) { if (reshape.after.data_disks <= reshape.before.data_disks) { - pr_err("%s: Cannot grow - " - "need backup-file\n", devname); + pr_err("%s: Cannot grow - need backup-file\n", + devname); + pr_err(" Please provide one with \"--backup=...\"\n"); goto release; } else if (sra->array.spare_disks == 0) { pr_err("%s: Cannot grow - " @@ -2437,6 +3232,7 @@ started: if (!reshape_open_backup_file(backup_file, fd, devname, (signed)blocks, fdlist+d, offsets+d, + sra->sys_name, restart)) { goto release; } @@ -2444,23 +3240,9 @@ started: } } - /* lastly, check that the internal stripe cache is - * large enough, or it won't work. - * It must hold at least 4 stripes of the larger - * chunk size - */ - cache = max(info->array.chunk_size, info->new_chunk); - cache *= 4; /* 4 stripes minimum */ - cache /= 512; /* convert to sectors */ - disks = min(reshape.before.data_disks, reshape.after.data_disks); - /* make sure there is room for 'blocks' with a bit to spare */ - if (cache < 16 + blocks / disks) - cache = 16 + blocks / disks; - cache /= (4096/512); /* Covert from sectors to pages */ - - if (sra->cache_size < cache) - subarray_set_num(container, sra, "stripe_cache_size", - cache+1); + update_cache_size(container, sra, info, + min(reshape.before.data_disks, reshape.after.data_disks), + blocks); /* Right, everything seems fine. Let's kick things off. * If only changing raid_disks, use ioctl, else use @@ -2468,70 +3250,9 @@ started: */ sync_metadata(st); - sra->new_chunk = info->new_chunk; - - if (restart) { - /* for external metadata checkpoint saved by mdmon can be lost - * or missed /due to e.g. crash/. Check if md is not during - * restart farther than metadata points to. - * If so, this means metadata information is obsolete. - */ - if (st->ss->external) - verify_reshape_position(info, reshape.level); - sra->reshape_progress = info->reshape_progress; - } else { - sra->reshape_progress = 0; - if (reshape.after.data_disks < reshape.before.data_disks) - /* start from the end of the new array */ - sra->reshape_progress = (sra->component_size - * reshape.after.data_disks); - } - - if (info->array.chunk_size == info->new_chunk && - reshape.before.layout == reshape.after.layout && - st->ss->external == 0) { - /* use SET_ARRAY_INFO but only if reshape hasn't started */ - ioctl(fd, GET_ARRAY_INFO, &array); - array.raid_disks = reshape.after.data_disks + reshape.parity; - if (!restart && - ioctl(fd, SET_ARRAY_INFO, &array) != 0) { - int err = errno; - - pr_err("Cannot set device shape for %s: %s\n", - devname, strerror(errno)); - - if (err == EBUSY && - (array.state & (1<new_chunk) < 0) - err = errno; - if (!err && sysfs_set_num(sra, NULL, "layout", - reshape.after.layout) < 0) - err = errno; - if (!err && subarray_set_num(container, sra, "raid_disks", - reshape.after.data_disks + - reshape.parity) < 0) - err = errno; - if (err) { - pr_err("Cannot set device shape for %s\n", - devname); - - if (err == EBUSY && - (array.state & (1<sys_name)) { + free(fdlist); + free(offsets); + sysfs_free(sra); + return 0; + } + /* Now we just need to kick off the reshape and watch, while * handling backups of the data... * This is all done by a forked background process. @@ -2573,6 +3302,39 @@ started: break; } + /* If another array on the same devices is busy, the + * reshape will wait for them. This would mean that + * the first section that we suspend will stay suspended + * for a long time. So check on that possibility + * by looking for "DELAYED" in /proc/mdstat, and if found, + * wait a while + */ + do { + struct mdstat_ent *mds, *m; + delayed = 0; + mds = mdstat_read(1, 0); + for (m = mds; m; m = m->next) + if (strcmp(m->devnm, sra->sys_name) == 0) { + if (m->resync && + m->percent == RESYNC_DELAYED) + delayed = 1; + if (m->resync == 0) + /* Haven't started the reshape thread + * yet, wait a bit + */ + delayed = 2; + break; + } + free_mdstat(mds); + if (delayed == 1 && get_linux_version() < 3007000) { + pr_err("Reshape is delayed, but cannot wait carefully with this kernel.\n" + " You might experience problems until other reshapes complete.\n"); + delayed = 0; + } + if (delayed) + mdstat_wait(30 - (delayed-1) * 25); + } while (delayed); + mdstat_close(); close(fd); if (check_env("MDADM_GROW_VERIFY")) fd = open(devname, O_RDONLY | O_DIRECT); @@ -2580,6 +3342,8 @@ started: fd = -1; mlockall(MCL_FUTURE); + signal(SIGTERM, catch_term); + if (st->ss->external) { /* metadata handler takes it from here */ done = st->ss->manage_reshape( @@ -2597,8 +3361,21 @@ started: free(fdlist); free(offsets); - if (backup_file && done) + if (backup_file && done) { + char *bul; + bul = make_backup(sra->sys_name); + if (bul) { + char buf[1024]; + int l = readlink(bul, buf, sizeof(buf)); + if (l > 0) { + buf[l]=0; + unlink(buf); + } + unlink(bul); + free(bul); + } unlink(backup_file); + } if (!done) { abort_reshape(sra); goto out; @@ -2619,7 +3396,7 @@ started: if (st->ss->external) { /* Re-load the metadata as much could have changed */ - int cfd = open_dev(st->container_dev); + int cfd = open_dev(st->container_devnm); if (cfd >= 0) { flush_mdmon(container); st->ss->free_super(st); @@ -2637,14 +3414,10 @@ started: set_array_size(st, info, info->text_version); if (info->new_level != reshape.level) { - - c = map_num(pers, info->new_level); - if (c) { - err = sysfs_set_str(sra, NULL, "level", c); - if (err) - pr_err("%s: could not set level " - "to %s\n", devname, c); - } + if (fd < 0) + fd = open(devname, O_RDONLY); + impose_level(fd, info->new_level, devname, verbose); + close(fd); if (info->new_level == 0) st->update_tail = NULL; } @@ -2676,12 +3449,12 @@ int reshape_container(char *container, char *devname, struct supertype *st, struct mdinfo *info, int force, - char *backup_file, - int verbose, int restart, int freeze_reshape) + char *backup_file, int verbose, + int forked, int restart, int freeze_reshape) { struct mdinfo *cc = NULL; int rv = restart; - int last_devnum = -1; + char last_devnm[32] = ""; /* component_size is not meaningful for a container, * so pass '0' meaning 'no change' @@ -2702,7 +3475,11 @@ int reshape_container(char *container, char *devname, */ ping_monitor(container); - switch (fork()) { + if (!forked && !check_env("MDADM_NO_SYSTEMCTL")) + if (continue_via_systemd(container)) + return 0; + + switch (forked ? 0 : fork()) { case -1: /* error */ perror("Cannot fork to complete reshape\n"); unfreeze(st); @@ -2738,6 +3515,7 @@ int reshape_container(char *container, char *devname, int fd; struct mdstat_ent *mdstat; char *adev; + int devid; sysfs_free(cc); @@ -2749,13 +3527,12 @@ int reshape_container(char *container, char *devname, continue; subarray = strchr(content->text_version+1, '/')+1; - mdstat = mdstat_by_subdev(subarray, - devname2devnum(container)); + mdstat = mdstat_by_subdev(subarray, container); if (!mdstat) continue; if (mdstat->active == 0) { - pr_err("Skipping inactive " - "array md%i.\n", mdstat->devnum); + pr_err("Skipping inactive array %s.\n", + mdstat->devnm); free_mdstat(mdstat); mdstat = NULL; continue; @@ -2765,20 +3542,19 @@ int reshape_container(char *container, char *devname, if (!content) break; - adev = map_dev(dev2major(mdstat->devnum), - dev2minor(mdstat->devnum), - 0); + devid = devnm2devid(mdstat->devnm); + adev = map_dev(major(devid), minor(devid), 0); if (!adev) adev = content->text_version; - fd = open_dev(mdstat->devnum); + fd = open_dev(mdstat->devnm); if (fd < 0) { printf(Name ": Device %s cannot be opened for reshape.", adev); break; } - if (last_devnum == mdstat->devnum) { + if (strcmp(last_devnm, mdstat->devnm) == 0) { /* Do not allow for multiple reshape_array() calls for * the same array. * It can happen when reshape_array() returns without @@ -2794,15 +3570,15 @@ int reshape_container(char *container, char *devname, close(fd); break; } - last_devnum = mdstat->devnum; + strcpy(last_devnm, mdstat->devnm); - sysfs_init(content, fd, mdstat->devnum); + sysfs_init(content, fd, mdstat->devnm); - if (mdmon_running(devname2devnum(container))) + if (mdmon_running(container)) flush_mdmon(container); rv = reshape_array(container, fd, adev, st, - content, force, NULL, + content, force, NULL, INVALID_SECTORS, backup_file, verbose, 1, restart, freeze_reshape); close(fd); @@ -2816,7 +3592,7 @@ int reshape_container(char *container, char *devname, if (rv) break; - if (mdmon_running(devname2devnum(container))) + if (mdmon_running(container)) flush_mdmon(container); } if (!rv) @@ -2853,7 +3629,7 @@ int progress_reshape(struct mdinfo *info, struct reshape *reshape, unsigned long long backup_point, unsigned long long wait_point, unsigned long long *suspend_point, - unsigned long long *reshape_completed) + unsigned long long *reshape_completed, int *frozen) { /* This function is called repeatedly by the reshape manager. * It determines how much progress can safely be made and allows @@ -3070,7 +3846,8 @@ int progress_reshape(struct mdinfo *info, struct reshape *reshape, wait_point = info->component_size - wait_point; } - sysfs_set_num(info, NULL, "sync_max", max_progress); + if (!*frozen) + sysfs_set_num(info, NULL, "sync_max", max_progress); /* Now wait. If we have already reached the point that we were * asked to wait to, don't wait at all, else wait for any change. @@ -3090,7 +3867,6 @@ int progress_reshape(struct mdinfo *info, struct reshape *reshape, * waiting forever on a dead array */ char action[20]; - fd_set rfds; if (sysfs_get_str(info, NULL, "sync_action", action, 20) <= 0 || strncmp(action, "reshape", 7) != 0) @@ -3106,9 +3882,7 @@ int progress_reshape(struct mdinfo *info, struct reshape *reshape, && info->reshape_progress < (info->component_size * reshape->after.data_disks)) break; - FD_ZERO(&rfds); - FD_SET(fd, &rfds); - select(fd+1, NULL, NULL, &rfds, NULL); + sysfs_wait(fd, NULL); if (sysfs_fd_get_ll(fd, &completed) < 0) goto check_progress; } @@ -3153,23 +3927,24 @@ check_progress: /* The abort might only be temporary. Wait up to 10 * seconds for fd to contain a valid number again. */ - struct timeval tv; + int wait = 10000; int rv = -2; - tv.tv_sec = 10; - tv.tv_usec = 0; - while (fd >= 0 && rv < 0 && tv.tv_sec > 0) { - fd_set rfds; - FD_ZERO(&rfds); - FD_SET(fd, &rfds); - if (select(fd+1, NULL, NULL, &rfds, &tv) != 1) + unsigned long long new_sync_max; + while (fd >= 0 && rv < 0 && wait > 0) { + if (sysfs_wait(fd, &wait) != 1) break; switch (sysfs_fd_get_ll(fd, &completed)) { case 0: /* all good again */ rv = 1; + /* If "sync_max" is no longer max_progress + * we need to freeze things + */ + sysfs_get_ll(info, NULL, "sync_max", &new_sync_max); + *frozen = (new_sync_max != max_progress); break; case -2: /* read error - abort */ - tv.tv_sec = 0; + wait = 0; break; } } @@ -3461,6 +4236,7 @@ int child_monitor(int afd, struct mdinfo *sra, struct reshape *reshape, struct mdinfo *sd; unsigned long stripes; int uuid[4]; + int frozen = 0; /* set up the backup-super-block. This requires the * uuid from the array. @@ -3538,9 +4314,11 @@ int child_monitor(int afd, struct mdinfo *sra, struct reshape *reshape, wait_point = __le64_to_cpu(bsb.arraystart2); } + reshape_completed = sra->reshape_progress; rv = progress_reshape(sra, reshape, backup_point, wait_point, - &suspend_point, &reshape_completed); + &suspend_point, &reshape_completed, + &frozen); /* external metadata would need to ping_monitor here */ sra->reshape_progress = reshape_completed; @@ -3566,7 +4344,8 @@ int child_monitor(int afd, struct mdinfo *sra, struct reshape *reshape, forget_backup(dests, destfd, destoffsets, 1); } - + if (sigterm) + rv = -2; if (rv < 0) { if (rv == -1) done = 1; @@ -3574,6 +4353,7 @@ int child_monitor(int afd, struct mdinfo *sra, struct reshape *reshape, } if (rv == 0 && increasing && !st->ss->external) { /* No longer need to monitor this reshape */ + sysfs_set_str(sra, NULL, "sync_max", "max"); done = 1; break; } @@ -3627,7 +4407,12 @@ int child_monitor(int afd, struct mdinfo *sra, struct reshape *reshape, } /* FIXME maybe call progress_reshape one more time instead */ - abort_reshape(sra); /* remove any remaining suspension */ + /* remove any remaining suspension */ + sysfs_set_num(sra, NULL, "suspend_lo", 0x7FFFFFFFFFFFFFFFULL); + sysfs_set_num(sra, NULL, "suspend_hi", 0); + sysfs_set_num(sra, NULL, "suspend_lo", 0); + sysfs_set_num(sra, NULL, "sync_min", 0); + if (reshape->before.data_disks == reshape->after.data_disks) sysfs_set_num(sra, NULL, "sync_speed_min", speed); free(buf); @@ -3741,9 +4526,8 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt (unsigned long)__le64_to_cpu(bsb.mtime), (unsigned long)info->array.utime); } else { - if (verbose) - pr_err("too-old timestamp on " - "backup-metadata on %s\n", devname); + pr_err("too-old timestamp on backup-metadata on %s\n", devname); + pr_err("If you think it is should be safe, try 'export MDADM_GROW_ALLOW_OLD=1'\n"); continue; /* time stamp is too bad */ } } @@ -3963,7 +4747,6 @@ int Grow_continue_command(char *devname, int fd, char *subarray = NULL; struct mdinfo *cc = NULL; struct mdstat_ent *mdstat = NULL; - char buf[40]; int cfd = -1; int fd2 = -1; @@ -3978,27 +4761,63 @@ int Grow_continue_command(char *devname, int fd, } dprintf("Grow continue is run for "); if (st->ss->external == 0) { + int d; dprintf("native array (%s)\n", devname); - if (ioctl(fd, GET_ARRAY_INFO, &array) < 0) { + if (ioctl(fd, GET_ARRAY_INFO, &array.array) < 0) { pr_err("%s is not an active md array -" " aborting\n", devname); ret_val = 1; goto Grow_continue_command_exit; } content = &array; - sysfs_init(content, fd, st->devnum); + /* Need to load a superblock. + * FIXME we should really get what we need from + * sysfs + */ + for (d = 0; d < MAX_DISKS; d++) { + mdu_disk_info_t disk; + char *dv; + int err; + disk.number = d; + if (ioctl(fd, GET_DISK_INFO, &disk) < 0) + continue; + if (disk.major == 0 && disk.minor == 0) + continue; + if ((disk.state & (1 << MD_DISK_ACTIVE)) == 0) + continue; + dv = map_dev(disk.major, disk.minor, 1); + if (!dv) + continue; + fd2 = dev_open(dv, O_RDONLY); + if (fd2 < 0) + continue; + err = st->ss->load_super(st, fd2, NULL); + close(fd2); + /* invalidate fd2 to avoid possible double close() */ + fd2 = -1; + if (err) + continue; + break; + } + if (d == MAX_DISKS) { + pr_err("Unable to load metadata for %s\n", + devname); + ret_val = 1; + goto Grow_continue_command_exit; + } + st->ss->getinfo_super(st, content, NULL); } else { - int container_dev; + char *container; if (subarray) { dprintf("subarray (%s)\n", subarray); - container_dev = st->container_dev; - cfd = open_dev_excl(st->container_dev); + container = st->container_devnm; + cfd = open_dev_excl(st->container_devnm); } else { - container_dev = st->devnum; + container = st->devnm; close(fd); - cfd = open_dev_excl(st->devnum); - dprintf("container (%i)\n", container_dev); + cfd = open_dev_excl(st->devnm); + dprintf("container (%s)\n", container); fd = cfd; } if (cfd < 0) { @@ -4007,7 +4826,6 @@ int Grow_continue_command(char *devname, int fd, ret_val = 1; goto Grow_continue_command_exit; } - fmt_devname(buf, container_dev); /* find in container array under reshape */ @@ -4043,18 +4861,18 @@ int Grow_continue_command(char *devname, int fd, pr_err("cannot continue reshape of an array" " in container with unsupported" " metadata: %s(%s)\n", - devname, buf); + devname, container); ret_val = 1; goto Grow_continue_command_exit; } array = strchr(content->text_version+1, '/')+1; - mdstat = mdstat_by_subdev(array, container_dev); + mdstat = mdstat_by_subdev(array, container); if (!mdstat) continue; if (mdstat->active == 0) { - pr_err("Skipping inactive " - "array md%i.\n", mdstat->devnum); + pr_err("Skipping inactive array %s.\n", + mdstat->devnm); free_mdstat(mdstat); mdstat = NULL; continue; @@ -4067,23 +4885,22 @@ int Grow_continue_command(char *devname, int fd, ret_val = 1; goto Grow_continue_command_exit; } - fd2 = open_dev(mdstat->devnum); + fd2 = open_dev(mdstat->devnm); if (fd2 < 0) { - pr_err("cannot open (md%i)\n", - mdstat->devnum); + pr_err("cannot open (%s)\n", mdstat->devnm); ret_val = 1; goto Grow_continue_command_exit; } - sysfs_init(content, fd2, mdstat->devnum); + sysfs_init(content, fd2, mdstat->devnm); /* start mdmon in case it is not running */ - if (!mdmon_running(container_dev)) - start_mdmon(container_dev); - ping_monitor(buf); + if (!mdmon_running(container)) + start_mdmon(container); + ping_monitor(container); - if (mdmon_running(container_dev)) + if (mdmon_running(container)) st->update_tail = &st->updates; else { pr_err("No mdmon found. " @@ -4096,8 +4913,7 @@ int Grow_continue_command(char *devname, int fd, /* verify that array under reshape is started from * correct position */ - if (verify_reshape_position(content, - map_name(pers, mdstat->level)) < 0) { + if (verify_reshape_position(content, content->array.level) < 0) { ret_val = 1; goto Grow_continue_command_exit; } @@ -4128,22 +4944,49 @@ int Grow_continue(int mdfd, struct supertype *st, struct mdinfo *info, return ret_val; if (st->ss->external) { - char container[40]; - int cfd = open_dev(st->container_dev); + int cfd = open_dev(st->container_devnm); if (cfd < 0) return 1; - fmt_devname(container, st->container_dev); - st->ss->load_container(st, cfd, container); + st->ss->load_container(st, cfd, st->container_devnm); close(cfd); - ret_val = reshape_container(container, NULL, mdfd, + ret_val = reshape_container(st->container_devnm, NULL, mdfd, st, info, 0, backup_file, - 0, 1, freeze_reshape); + 0, 1, + 1 | info->reshape_active, + freeze_reshape); } else ret_val = reshape_array(NULL, mdfd, "array", st, info, 1, - NULL, backup_file, 0, 0, 1, + NULL, INVALID_SECTORS, + backup_file, 0, 1, + 1 | info->reshape_active, freeze_reshape); return ret_val; } + +char *make_backup(char *name) +{ + char *base = "backup_file-"; + int len; + char *fname; + + len = strlen(MAP_DIR) + 1 + strlen(base) + strlen(name)+1; + fname = xmalloc(len); + sprintf(fname, "%s/%s%s", MAP_DIR, base, name); + return fname; +} + +char *locate_backup(char *name) +{ + char *fl = make_backup(name); + struct stat stb; + + if (stat(fl, &stb) == 0 && + S_ISREG(stb.st_mode)) + return fl; + + free(fl); + return NULL; +}