]> git.ipfire.org Git - thirdparty/mdadm.git/blame_incremental - Manage.c
Create.c: fix uclibc build
[thirdparty/mdadm.git] / Manage.c
... / ...
CommitLineData
1/*
2 * mdadm - manage Linux "md" devices aka RAID arrays.
3 *
4 * Copyright (C) 2001-2013 Neil Brown <neilb@suse.de>
5 *
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 *
21 * Author: Neil Brown
22 * Email: <neilb@suse.de>
23 */
24
25#include "mdadm.h"
26#include "md_u.h"
27#include "md_p.h"
28#include "udev.h"
29#include <ctype.h>
30
31int Manage_ro(char *devname, int fd, int readonly)
32{
33 /* switch to readonly or rw
34 *
35 * requires >= 0.90.0
36 * first check that array is runing
37 * use RESTART_ARRAY_RW or STOP_ARRAY_RO
38 *
39 */
40 struct mdinfo *mdi;
41 int rv = 0;
42
43 /* If this is an externally-managed array, we need to modify the
44 * metadata_version so that mdmon doesn't undo our change.
45 */
46 mdi = sysfs_read(fd, NULL, GET_LEVEL|GET_VERSION);
47 if (mdi &&
48 mdi->array.major_version == -1 &&
49 is_subarray(mdi->text_version)) {
50 char vers[64];
51 strcpy(vers, "external:");
52 strcat(vers, mdi->text_version);
53 if (readonly > 0) {
54 int rv;
55 /* We set readonly ourselves. */
56 vers[9] = '-';
57 sysfs_set_str(mdi, NULL, "metadata_version", vers);
58
59 close(fd);
60 rv = sysfs_set_str(mdi, NULL, "array_state", "readonly");
61
62 if (rv < 0) {
63 pr_err("failed to set readonly for %s: %s\n",
64 devname, strerror(errno));
65
66 vers[9] = mdi->text_version[0];
67 sysfs_set_str(mdi, NULL, "metadata_version", vers);
68 rv = 1;
69 goto out;
70 }
71 } else {
72 char *cp;
73 /* We cannot set read/write - must signal mdmon */
74 vers[9] = '/';
75 sysfs_set_str(mdi, NULL, "metadata_version", vers);
76
77 cp = strchr(vers+10, '/');
78 if (cp)
79 *cp = 0;
80 ping_monitor(vers+10);
81 if (mdi->array.level <= 0)
82 sysfs_set_str(mdi, NULL, "array_state", "active");
83 }
84 goto out;
85 }
86
87 if (!md_array_active(fd)) {
88 pr_err("%s does not appear to be active.\n", devname);
89 rv = 1;
90 goto out;
91 }
92
93 if (readonly > 0) {
94 if (ioctl(fd, STOP_ARRAY_RO, NULL)) {
95 pr_err("failed to set readonly for %s: %s\n",
96 devname, strerror(errno));
97 rv = 1;
98 goto out;
99 }
100 } else if (readonly < 0) {
101 if (ioctl(fd, RESTART_ARRAY_RW, NULL)) {
102 pr_err("failed to set writable for %s: %s\n",
103 devname, strerror(errno));
104 rv = 1;
105 goto out;
106 }
107 }
108out:
109 sysfs_free(mdi);
110 return rv;
111}
112
/*
 * remove_devices() - unlink names at 'path' that point at /dev/<devnm>.
 * @devnm: kernel name of the array.
 * @path: name to examine; may be NULL, in which case nothing is done.
 *
 * The name itself and fifteen partition-suffixed variants are examined;
 * each is unlinked only when it is a symlink whose target is exactly the
 * matching /dev/<devnm>[pN] name.
 */
static void remove_devices(char *devnm, char *path)
{
	/*
	 * Remove names at 'path' - possibly with
	 * partition suffixes - which link to the 'standard'
	 * name for devnm.  These were probably created
	 * by mdadm when the array was assembled.
	 */
	char base[40];
	char *path2;
	char link[1024];
	int n;
	int part;
	char *be;
	char *pe;

	/* An empty path has no last character to inspect below and would
	 * turn the partition names into bare relative names like "1".
	 */
	if (!path || !*path)
		return;

	sprintf(base, "/dev/%s", devnm);
	be = base + strlen(base);

	/* Room for the partition suffix appended at 'pe' */
	path2 = xmalloc(strlen(path)+20);
	strcpy(path2, path);
	pe = path2 + strlen(path2);

	for (part = 0; part < 16; part++) {
		if (part) {
			sprintf(be, "p%d", part);

			/* A name ending in a digit takes a 'p' separator.
			 * The cast keeps isdigit() defined for bytes that
			 * are negative as plain char.
			 */
			if (isdigit((unsigned char)pe[-1]))
				sprintf(pe, "p%d", part);
			else
				sprintf(pe, "%d", part);
		}
		/* readlink() does not NUL-terminate, so compare by length */
		n = readlink(path2, link, sizeof(link));
		if (n > 0 && (int)strlen(base) == n &&
		    strncmp(link, base, n) == 0)
			unlink(path2);
	}
	free(path2);
}
155
/*
 * Manage_run() - start an already configured array.
 * @devname: name of the array, used only in the error message.
 * @fd: open descriptor on the array.
 * @c: context handed through to IncrementalScan().
 *
 * Return: 1 when the array cannot be found via fd2devnm(), otherwise
 * whatever IncrementalScan() returns.
 */
int Manage_run(char *devname, int fd, struct context *c)
{
	char array_nm[32];
	char *looked_up = fd2devnm(fd);

	if (looked_up == NULL) {
		pr_err("Cannot find %s in sysfs!!\n", devname);
		return 1;
	}

	/* IncrementalScan() is given a local copy of the name */
	strcpy(array_nm, looked_up);

	return IncrementalScan(c, array_nm);
}
171
172int Manage_stop(char *devname, int fd, int verbose, int will_retry)
173{
174 /* Stop the array. Array must already be configured
175 * 'will_retry' means that error messages are not wanted.
176 */
177 int rv = 0;
178 struct map_ent *map = NULL;
179 struct mdinfo *mdi;
180 char devnm[32];
181 char container[MD_NAME_MAX] = {0};
182 int err;
183 int count;
184 char buf[SYSFS_MAX_BUF_SIZE];
185 unsigned long long rd1, rd2;
186
187 if (will_retry && verbose == 0)
188 verbose = -1;
189
190 strcpy(devnm, fd2devnm(fd));
191 /* Get EXCL access first. If this fails, then attempting
192 * to stop is probably a bad idea.
193 */
194 mdi = sysfs_read(fd, NULL, GET_LEVEL|GET_COMPONENT|GET_VERSION);
195 if (mdi && is_subarray(mdi->text_version))
196 sysfs_get_container_devnm(mdi, container);
197
198 close(fd);
199 count = 5;
200 while (((fd = ((devname[0] == '/')
201 ?open(devname, O_RDONLY|O_EXCL)
202 :open_dev_flags(devnm, O_RDONLY|O_EXCL))) < 0 ||
203 strcmp(fd2devnm(fd), devnm) != 0) && container[0] &&
204 mdmon_running(container) && count) {
205 /* Can't open, so something might be wrong. However it
206 * is a container, so we might be racing with mdmon, so
207 * retry for a bit.
208 */
209 if (fd >= 0)
210 close(fd);
211 flush_mdmon(container);
212 count--;
213 }
214 if (fd < 0 || strcmp(fd2devnm(fd), devnm) != 0) {
215 if (fd >= 0)
216 close(fd);
217 if (verbose >= 0)
218 pr_err("Cannot get exclusive access to %s:Perhaps a running process, mounted filesystem or active volume group?\n",
219 devname);
220 sysfs_free(mdi);
221 return 1;
222 }
223 /* If this is an mdmon managed array, just write 'inactive'
224 * to the array state and let mdmon clear up.
225 */
226 if (mdi &&
227 mdi->array.level > 0 &&
228 is_subarray(mdi->text_version)) {
229 int err;
230 /* This is mdmon managed. */
231 close(fd);
232
233 /* As we had an O_EXCL open, any use of the device
234 * which blocks STOP_ARRAY is probably a transient use,
235 * so it is reasonable to retry for a while - 5 seconds.
236 */
237 count = 25;
238 while (count &&
239 (err = sysfs_set_str(mdi, NULL,
240 "array_state",
241 "inactive")) < 0 &&
242 errno == EBUSY) {
243 sleep_for(0, MSEC_TO_NSEC(200), true);
244 count--;
245 }
246 if (err) {
247 if (verbose >= 0)
248 pr_err("failed to stop array %s: %s\n",
249 devname, strerror(errno));
250 rv = 1;
251 goto out;
252 }
253
254 /* Give monitor a chance to act */
255 ping_monitor(mdi->text_version);
256
257 fd = open_dev_excl(devnm);
258 if (fd < 0) {
259 if (verbose >= 0)
260 pr_err("failed to completely stop %s: Device is busy\n",
261 devname);
262 rv = 1;
263 goto out;
264 }
265 } else if (mdi &&
266 mdi->array.major_version == -1 &&
267 mdi->array.minor_version == -2 &&
268 !is_subarray(mdi->text_version)) {
269 struct mdstat_ent *mds, *m;
270 /* container, possibly mdmon-managed.
271 * Make sure mdmon isn't opening it, which
272 * would interfere with the 'stop'
273 */
274 ping_monitor(mdi->sys_name);
275
276 /* now check that there are no existing arrays
277 * which are members of this array
278 */
279 mds = mdstat_read(0, 0);
280 for (m = mds; m; m = m->next)
281 if (m->metadata_version &&
282 strncmp(m->metadata_version, "external:", 9)==0 &&
283 metadata_container_matches(m->metadata_version+9,
284 devnm)) {
285 if (verbose >= 0)
286 pr_err("Cannot stop container %s: member %s still active\n",
287 devname, m->devnm);
288 free_mdstat(mds);
289 rv = 1;
290 goto out;
291 }
292 }
293
294 /* If the array is undergoing a reshape which changes the number
295 * of devices, then it would be nice to stop it at a point where
296 * it has completed a full number of stripes in both old and
297 * new layouts as this will allow the reshape to be reverted.
298 * So if 'sync_action' is "reshape" and 'raid_disks' shows two
299 * different numbers, then
300 * - freeze reshape
301 * - set sync_max to next multiple of both data_disks and
302 * chunk sizes (or next but one)
303 * - unfreeze reshape
304 * - wait on 'sync_completed' for that point to be reached.
305 */
306 if (mdi && is_level456(mdi->array.level) &&
307 sysfs_attribute_available(mdi, NULL, "sync_action") &&
308 sysfs_attribute_available(mdi, NULL, "reshape_direction") &&
309 sysfs_get_str(mdi, NULL, "sync_action", buf, sizeof(buf)) > 0 &&
310 strcmp(buf, "reshape\n") == 0 &&
311 sysfs_get_two(mdi, NULL, "raid_disks", &rd1, &rd2) == 2) {
312 unsigned long long position, curr;
313 unsigned long long chunk1, chunk2;
314 unsigned long long rddiv, chunkdiv;
315 unsigned long long sectors;
316 unsigned long long sync_max, old_sync_max;
317 unsigned long long completed;
318 int backwards = 0;
319 int delay;
320 int scfd;
321
322 delay = 40;
323 while (rd1 > rd2 && delay > 0 &&
324 sysfs_get_ll(mdi, NULL, "sync_max", &old_sync_max) == 0) {
325 /* must be in the critical section - wait a bit */
326 delay -= 1;
327 sleep_for(0, MSEC_TO_NSEC(100), true);
328 }
329
330 if (sysfs_set_str(mdi, NULL, "sync_action", "frozen") != 0)
331 goto done;
332 /* Array is frozen */
333
334 rd1 -= mdi->array.level == 6 ? 2 : 1;
335 rd2 -= mdi->array.level == 6 ? 2 : 1;
336 sysfs_get_str(mdi, NULL, "reshape_direction", buf, sizeof(buf));
337 if (strncmp(buf, "back", 4) == 0)
338 backwards = 1;
339 if (sysfs_get_ll(mdi, NULL, "reshape_position", &position) != 0) {
340 /* reshape must have finished now */
341 sysfs_set_str(mdi, NULL, "sync_action", "idle");
342 goto done;
343 }
344 sysfs_get_two(mdi, NULL, "chunk_size", &chunk1, &chunk2);
345 chunk1 /= 512;
346 chunk2 /= 512;
347 rddiv = GCD(rd1, rd2);
348 chunkdiv = GCD(chunk1, chunk2);
349 sectors = (chunk1/chunkdiv) * chunk2 * (rd1/rddiv) * rd2;
350
351 if (backwards) {
352 /* Need to subtract 'reshape_position' from
353 * array size to get equivalent of sync_max.
354 * Size calculation based on raid5_size in kernel.
355 */
356 unsigned long long size = mdi->component_size;
357 size &= ~(chunk1-1);
358 size &= ~(chunk2-1);
359 /* rd1 must be smaller */
360 /* Reshape may have progressed further backwards than
361 * recorded, so target even further back (hence "-1")
362 */
363 position = (position / sectors - 1) * sectors;
364 /* rd1 is always the conversion factor between 'sync'
365 * position and 'reshape' position.
366 * We read 1 "new" stripe worth of data from where-ever,
367 * and when write out that full stripe.
368 */
369 sync_max = size - position/rd1;
370 } else {
371 /* Reshape will very likely be beyond position, and it may
372 * be too late to stop at '+1', so aim for '+2'
373 */
374 position = (position / sectors + 2) * sectors;
375 sync_max = position/rd1;
376 }
377 if (sysfs_get_ll(mdi, NULL, "sync_max", &old_sync_max) < 0)
378 old_sync_max = mdi->component_size;
379 /* Must not advance sync_max as that could confuse
380 * the reshape monitor */
381 if (sync_max < old_sync_max)
382 sysfs_set_num(mdi, NULL, "sync_max", sync_max);
383 sysfs_set_str(mdi, NULL, "sync_action", "idle");
384
385 /* That should have set things going again. Now we
386 * wait a little while (3 second max) for sync_completed
387 * to reach the target.
388 * The reshape process can block for 500msec if
389 * the sync speed limit is hit, so we need to wait
390 * a lot longer than that. 1 second is usually
391 * enough. 3 is safe.
392 */
393 delay = 3000;
394 scfd = sysfs_open(mdi->sys_name, NULL, "sync_completed");
395 while (scfd >= 0 && delay > 0 && old_sync_max > 0) {
396 unsigned long long max_completed;
397 sysfs_get_ll(mdi, NULL, "reshape_position", &curr);
398 sysfs_fd_get_str(scfd, buf, sizeof(buf));
399 if (str_is_none(buf) == true) {
400 /* Either reshape has aborted, or hasn't
401 * quite started yet. Wait a bit and
402 * check 'sync_action' to see.
403 */
404 sleep_for(0, MSEC_TO_NSEC(10), true);
405 sysfs_get_str(mdi, NULL, "sync_action", buf, sizeof(buf));
406 if (strncmp(buf, "reshape", 7) != 0)
407 break;
408 }
409
410 if (sysfs_fd_get_two(scfd, &completed,
411 &max_completed) == 2 &&
412 /* 'completed' sometimes reads as max-uulong */
413 completed < max_completed &&
414 (completed > sync_max ||
415 (completed == sync_max && curr != position))) {
416 while (completed > sync_max) {
417 sync_max += sectors / rd1;
418 if (backwards)
419 position -= sectors;
420 else
421 position += sectors;
422 }
423 if (sync_max < old_sync_max)
424 sysfs_set_num(mdi, NULL, "sync_max", sync_max);
425 }
426
427 if (!backwards && curr >= position)
428 break;
429 if (backwards && curr <= position)
430 break;
431 sysfs_wait(scfd, &delay);
432 }
433 if (scfd >= 0)
434 close(scfd);
435
436 }
437done:
438
439 /* As we have an O_EXCL open, any use of the device
440 * which blocks STOP_ARRAY is probably a transient use,
441 * so it is reasonable to retry for a while - 5 seconds.
442 */
443 count = 25; err = 0;
444 while (count && fd >= 0 &&
445 (err = ioctl(fd, STOP_ARRAY, NULL)) < 0 && errno == EBUSY) {
446 sleep_for(0, MSEC_TO_NSEC(200), true);
447 count --;
448 }
449 if (fd >= 0 && err) {
450 if (verbose >= 0) {
451 pr_err("failed to stop array %s: %s\n",
452 devname, strerror(errno));
453 if (errno == EBUSY)
454 cont_err("Perhaps a running process, mounted filesystem or active volume group?\n");
455 }
456 rv = 1;
457 goto out;
458 }
459
460 if (devnm[0] && udev_is_available()) {
461 struct map_ent *mp = map_by_devnm(&map, devnm);
462 remove_devices(devnm, mp ? mp->path : NULL);
463 }
464
465 if (verbose >= 0)
466 pr_err("stopped %s\n", devname);
467 map_lock(&map);
468 map_remove(&map, devnm);
469 map_unlock(&map);
470out:
471 sysfs_free(mdi);
472
473 return rv;
474}
475
476static struct mddev_dev *add_one(struct mddev_dev *dv, char *name, char disp)
477{
478 struct mddev_dev *new;
479 new = xmalloc(sizeof(*new));
480 memset(new, 0, sizeof(*new));
481 new->devname = xstrdup(name);
482 new->disposition = disp;
483 new->next = dv->next;
484 dv->next = new;
485 return new;
486}
487
488static void add_faulty(struct mddev_dev *dv, int fd, char disp)
489{
490 mdu_array_info_t array;
491 mdu_disk_info_t disk;
492 int remaining_disks;
493 int i;
494
495 if (md_get_array_info(fd, &array) != 0)
496 return;
497
498 remaining_disks = array.nr_disks;
499 for (i = 0; i < MAX_DISKS && remaining_disks > 0; i++) {
500 char buf[40];
501 disk.number = i;
502 if (md_get_disk_info(fd, &disk) != 0)
503 continue;
504 if (disk.major == 0 && disk.minor == 0)
505 continue;
506 remaining_disks--;
507 if ((disk.state & 1) == 0) /* not faulty */
508 continue;
509 sprintf(buf, "%d:%d", disk.major, disk.minor);
510 dv = add_one(dv, buf, disp);
511 }
512}
513
514static void add_detached(struct mddev_dev *dv, int fd, char disp)
515{
516 mdu_array_info_t array;
517 mdu_disk_info_t disk;
518 int remaining_disks;
519 int i;
520
521 if (md_get_array_info(fd, &array) != 0)
522 return;
523
524 remaining_disks = array.nr_disks;
525 for (i = 0; i < MAX_DISKS && remaining_disks > 0; i++) {
526 char buf[40];
527 int sfd;
528 disk.number = i;
529 if (md_get_disk_info(fd, &disk) != 0)
530 continue;
531 if (disk.major == 0 && disk.minor == 0)
532 continue;
533 remaining_disks--;
534 if (disp == 'f' && (disk.state & 1) != 0) /* already faulty */
535 continue;
536 sprintf(buf, "%d:%d", disk.major, disk.minor);
537 sfd = dev_open(buf, O_RDONLY);
538 if (sfd >= 0) {
539 /* Not detached */
540 close(sfd);
541 continue;
542 }
543 if (errno != ENXIO)
544 /* Probably not detached */
545 continue;
546 dv = add_one(dv, buf, disp);
547 }
548}
549
550static void add_set(struct mddev_dev *dv, int fd, char set_char)
551{
552 mdu_array_info_t array;
553 mdu_disk_info_t disk;
554 int remaining_disks;
555 int copies, set;
556 int i;
557
558 if (md_get_array_info(fd, &array) != 0)
559 return;
560 if (array.level != 10)
561 return;
562 copies = ((array.layout & 0xff) *
563 ((array.layout >> 8) & 0xff));
564 if (array.raid_disks % copies)
565 return;
566
567 remaining_disks = array.nr_disks;
568 for (i = 0; i < MAX_DISKS && remaining_disks > 0; i++) {
569 char buf[40];
570 disk.number = i;
571 if (md_get_disk_info(fd, &disk) != 0)
572 continue;
573 if (disk.major == 0 && disk.minor == 0)
574 continue;
575 remaining_disks--;
576 set = disk.raid_disk % copies;
577 if (set_char != set + 'A')
578 continue;
579 sprintf(buf, "%d:%d", disk.major, disk.minor);
580 dv = add_one(dv, buf, dv->disposition);
581 }
582}
583
584int attempt_re_add(int fd, int tfd, struct mddev_dev *dv,
585 struct supertype *dev_st, struct supertype *tst,
586 unsigned long rdev, enum update_opt update,
587 char *devname, int verbose, mdu_array_info_t *array)
588{
589 struct mdinfo mdi;
590 int duuid[4];
591 int ouuid[4];
592
593 dev_st->ss->getinfo_super(dev_st, &mdi, NULL);
594 dev_st->ss->uuid_from_super(dev_st, ouuid);
595 if (tst->sb)
596 tst->ss->uuid_from_super(tst, duuid);
597 else
598 /* Assume uuid matches: kernel will check */
599 memcpy(duuid, ouuid, sizeof(ouuid));
600 if ((mdi.disk.state & (1<<MD_DISK_ACTIVE)) &&
601 !(mdi.disk.state & (1<<MD_DISK_FAULTY)) &&
602 memcmp(duuid, ouuid, sizeof(ouuid))==0) {
603 /* Looks like it is worth a
604 * try. Need to make sure
605 * kernel will accept it
606 * though.
607 */
608 mdu_disk_info_t disc;
609 disc.number = mdi.disk.number;
610 if (md_get_disk_info(fd, &disc) != 0 ||
611 disc.major != 0 || disc.minor != 0)
612 goto skip_re_add;
613 disc.major = major(rdev);
614 disc.minor = minor(rdev);
615 disc.number = mdi.disk.number;
616 disc.raid_disk = mdi.disk.raid_disk;
617 disc.state = mdi.disk.state;
618 if (array->state & (1 << MD_SB_CLUSTERED)) {
619 /* extra flags are needed when adding to a cluster as
620 * there are two cases to distinguish
621 */
622 if (dv->disposition == 'c')
623 disc.state |= (1 << MD_DISK_CANDIDATE);
624 else
625 disc.state |= (1 << MD_DISK_CLUSTER_ADD);
626 }
627 if (dv->writemostly == FlagSet)
628 disc.state |= 1 << MD_DISK_WRITEMOSTLY;
629 if (dv->writemostly == FlagClear)
630 disc.state &= ~(1 << MD_DISK_WRITEMOSTLY);
631 if (dv->failfast == FlagSet)
632 disc.state |= 1 << MD_DISK_FAILFAST;
633 if (dv->failfast == FlagClear)
634 disc.state &= ~(1 << MD_DISK_FAILFAST);
635 remove_partitions(tfd);
636 if (update || dv->writemostly != FlagDefault ||
637 dv->failfast != FlagDefault) {
638 int rv = -1;
639 tfd = dev_open(dv->devname, O_RDWR);
640 if (tfd < 0) {
641 pr_err("failed to open %s for superblock update during re-add\n", dv->devname);
642 return -1;
643 }
644
645 if (dv->writemostly == FlagSet)
646 rv = dev_st->ss->update_super(
647 dev_st, NULL, UOPT_SPEC_WRITEMOSTLY,
648 devname, verbose, 0, NULL);
649 if (dv->writemostly == FlagClear)
650 rv = dev_st->ss->update_super(
651 dev_st, NULL, UOPT_SPEC_READWRITE,
652 devname, verbose, 0, NULL);
653 if (dv->failfast == FlagSet)
654 rv = dev_st->ss->update_super(
655 dev_st, NULL, UOPT_SPEC_FAILFAST,
656 devname, verbose, 0, NULL);
657 if (dv->failfast == FlagClear)
658 rv = dev_st->ss->update_super(
659 dev_st, NULL, UOPT_SPEC_NOFAILFAST,
660 devname, verbose, 0, NULL);
661 if (update)
662 rv = dev_st->ss->update_super(
663 dev_st, NULL, update,
664 devname, verbose, 0, NULL);
665 if (rv == 0)
666 rv = dev_st->ss->store_super(dev_st, tfd);
667 close(tfd);
668 if (rv != 0) {
669 pr_err("failed to update superblock during re-add\n");
670 return -1;
671 }
672 }
673 /* don't even try if disk is marked as faulty */
674 errno = 0;
675 if (ioctl(fd, ADD_NEW_DISK, &disc) == 0) {
676 if (verbose >= 0)
677 pr_err("re-added %s\n", dv->devname);
678 return 1;
679 }
680 if (errno == ENOMEM || errno == EROFS) {
681 pr_err("add new device failed for %s: %s\n",
682 dv->devname, strerror(errno));
683 if (dv->disposition == 'M')
684 return 0;
685 return -1;
686 }
687 }
688skip_re_add:
689 return 0;
690}
691
692/**
693 * manage_add_external() - Add disk to external container.
694 * @st: external supertype pointer, must not be NULL, superblock is released here.
695 * @fd: container file descriptor, must not have O_EXCL mode.
696 * @disk_fd: device to add file descriptor.
697 * @disk_name: name of the device to add.
698 * @disc: disk info.
699 *
700 * Superblock is released here because any open fd with O_EXCL will block sysfs_add_disk().
701 */
702mdadm_status_t manage_add_external(struct supertype *st, int fd, char *disk_name,
703 mdu_disk_info_t *disc)
704{
705 mdadm_status_t rv = MDADM_STATUS_ERROR;
706 char container_devpath[MD_NAME_MAX];
707 struct dev_policy *pols = NULL;
708 struct mdinfo new_mdi;
709 struct mdinfo *sra = NULL;
710 int container_fd;
711 int disk_fd = -1;
712
713 snprintf(container_devpath, MD_NAME_MAX, "%s", fd2devnm(fd));
714
715 container_fd = open_dev_excl(container_devpath);
716 if (!is_fd_valid(container_fd)) {
717 pr_err("Failed to get exclusive access to container %s\n", container_devpath);
718 return MDADM_STATUS_ERROR;
719 }
720
721 /* Check if metadata handler is able to accept the drive */
722 if (!st->ss->validate_geometry(st, LEVEL_CONTAINER, 0, 1, NULL, 0, 0, disk_name, NULL,
723 0, 1))
724 goto out;
725
726 if (mddev_test_and_add_drive_policies(st, &pols, container_fd, 1))
727 goto out;
728
729 Kill(disk_name, NULL, 0, -1, 0);
730
731 disk_fd = dev_open(disk_name, O_RDWR | O_EXCL | O_DIRECT);
732 if (!is_fd_valid(disk_fd)) {
733 pr_err("Failed to exclusively open %s\n", disk_name);
734 goto out;
735 }
736
737 if (drive_test_and_add_policies(st, &pols, disk_fd, 1))
738 goto out;
739
740 if (st->ss->add_to_super(st, disc, disk_fd, disk_name, INVALID_SECTORS))
741 goto out;
742
743 if (!mdmon_running(st->container_devnm))
744 st->ss->sync_metadata(st);
745
746 sra = sysfs_read(container_fd, NULL, 0);
747 if (!sra) {
748 pr_err("Failed to read sysfs for %s\n", disk_name);
749 goto out;
750 }
751
752 sra->array.level = LEVEL_CONTAINER;
753 /* Need to set data_offset and component_size */
754 st->ss->getinfo_super(st, &new_mdi, NULL);
755 new_mdi.disk.major = disc->major;
756 new_mdi.disk.minor = disc->minor;
757 new_mdi.recovery_start = 0;
758
759 st->ss->free_super(st);
760
761 if (sysfs_add_disk(sra, &new_mdi, 0) != 0) {
762 pr_err("Failed to add %s to container %s\n", disk_name, container_devpath);
763 goto out;
764 }
765 ping_monitor(container_devpath);
766 rv = MDADM_STATUS_SUCCESS;
767
768out:
769 close(container_fd);
770 dev_policy_free(pols);
771
772 if (sra)
773 sysfs_free(sra);
774
775 if (rv != MDADM_STATUS_SUCCESS && is_fd_valid(disk_fd))
776 /* Metadata handler records this descriptor, so release it only on failure. */
777 close(disk_fd);
778
779 if (st->sb)
780 st->ss->free_super(st);
781
782 return rv;
783}
784
785int Manage_add(int fd, int tfd, struct mddev_dev *dv,
786 struct supertype *tst, mdu_array_info_t *array,
787 int force, int verbose, char *devname,
788 enum update_opt update, unsigned long rdev,
789 unsigned long long array_size, int raid_slot)
790{
791 unsigned long long ldsize;
792 struct supertype *dev_st;
793 int j;
794 mdu_disk_info_t disc;
795 struct map_ent *map = NULL;
796
797 if (!get_dev_size(tfd, dv->devname, &ldsize)) {
798 if (dv->disposition == 'M')
799 return 0;
800 else
801 return -1;
802 }
803
804 if (tst->ss == &super0 && ldsize > 4ULL*1024*1024*1024*1024) {
805 /* More than 4TB is wasted on v0.90 */
806 if (!force) {
807 pr_err("%s is larger than %s can effectively use.\n"
808 " Add --force is you really want to add this device.\n",
809 dv->devname, devname);
810 return -1;
811 }
812 pr_err("%s is larger than %s can effectively use.\n"
813 " Adding anyway as --force was given.\n",
814 dv->devname, devname);
815 }
816
817 if (array->not_persistent == 0 || tst->ss->external) {
818
819 /* need to find a sample superblock to copy, and
820 * a spare slot to use.
821 * For 'external' array (well, container based),
822 * We can just load the metadata for the array->
823 */
824 int array_failed;
825 if (tst->sb)
826 /* already loaded */;
827 else if (tst->ss->external) {
828 tst->ss->load_container(tst, fd, NULL);
829 } else for (j = 0; j < tst->max_devs; j++) {
830 char *dev;
831 int dfd;
832 disc.number = j;
833 if (md_get_disk_info(fd, &disc))
834 continue;
835 if (disc.major==0 && disc.minor==0)
836 continue;
837 if ((disc.state & 4)==0) /* sync */
838 continue;
839 /* Looks like a good device to try */
840 dev = map_dev(disc.major, disc.minor, 1);
841 if (!dev)
842 continue;
843 dfd = dev_open(dev, O_RDONLY);
844 if (dfd < 0)
845 continue;
846 if (tst->ss->load_super(tst, dfd,
847 NULL)) {
848 close(dfd);
849 continue;
850 }
851 close(dfd);
852 break;
853 }
854 /* FIXME this is a bad test to be using */
855 if (!tst->sb && (dv->disposition != 'a' &&
856 dv->disposition != 'S')) {
857 /* we are re-adding a device to a
858 * completely dead array - have to depend
859 * on kernel to check
860 */
861 } else if (!tst->sb) {
862 pr_err("cannot load array metadata from %s\n", devname);
863 return -1;
864 }
865
866 /* Make sure device is large enough */
867 if (dv->disposition != 'j' && /* skip size check for Journal */
868 tst->sb &&
869 tst->ss->avail_size(tst, ldsize/512, INVALID_SECTORS) <
870 array_size) {
871 if (dv->disposition == 'M')
872 return 0;
873 pr_err("%s not large enough to join array\n",
874 dv->devname);
875 return -1;
876 }
877
878 /* Possibly this device was recently part of
879 * the array and was temporarily removed, and
880 * is now being re-added. If so, we can
881 * simply re-add it.
882 */
883
884 if (array->not_persistent == 0 && dv->disposition != 'S') {
885 int rv = 0;
886
887 dev_st = dup_super(tst);
888 dev_st->ss->load_super(dev_st, tfd, NULL);
889
890 if (dev_st->sb) {
891 rv = attempt_re_add(fd, tfd, dv, dev_st, tst, rdev, update,
892 devname, verbose, array);
893
894 dev_st->ss->free_super(dev_st);
895 }
896
897 free(dev_st);
898
899 if (rv)
900 return rv;
901 }
902 if (dv->disposition == 'M') {
903 if (verbose > 0)
904 pr_err("--re-add for %s to %s is not possible\n",
905 dv->devname, devname);
906 return 0;
907 }
908 if (dv->disposition == 'A') {
909 pr_err("--re-add for %s to %s is not possible\n",
910 dv->devname, devname);
911 return -1;
912 }
913 if (array->active_disks < array->raid_disks) {
914 char *avail = xcalloc(array->raid_disks, 1);
915 int d;
916 int found = 0;
917
918 for (d = 0; d < MAX_DISKS && found < array->nr_disks; d++) {
919 disc.number = d;
920 if (md_get_disk_info(fd, &disc))
921 continue;
922 if (disc.major == 0 && disc.minor == 0)
923 continue;
924 if (!(disc.state & (1<<MD_DISK_SYNC)))
925 continue;
926 avail[disc.raid_disk] = 1;
927 found++;
928 }
929 array_failed = !enough(array->level, array->raid_disks,
930 array->layout, 1, avail);
931 free(avail);
932 } else
933 array_failed = 0;
934 if (array_failed) {
935 pr_err("%s has failed so using --add cannot work and might destroy\n",
936 devname);
937 pr_err("data on %s. You should stop the array and re-assemble it.\n",
938 dv->devname);
939 return -1;
940 }
941 } else {
942 /* non-persistent. Must ensure that new drive
943 * is at least array->size big.
944 */
945 if (ldsize/512 < array_size) {
946 pr_err("%s not large enough to join array\n",
947 dv->devname);
948 return -1;
949 }
950 }
951 /* committed to really trying this device now*/
952 remove_partitions(tfd);
953
954 /* in 2.6.17 and earlier, version-1 superblocks won't
955 * use the number we write, but will choose a free number.
956 * we must choose the same free number, which requires
957 * starting at 'raid_disks' and counting up
958 */
959 for (j = array->raid_disks; j < tst->max_devs; j++) {
960 disc.number = j;
961 if (md_get_disk_info(fd, &disc))
962 break;
963 if (disc.major==0 && disc.minor==0)
964 break;
965 if (disc.state & 8) /* removed */
966 break;
967 }
968 disc.major = major(rdev);
969 disc.minor = minor(rdev);
970 if (raid_slot < 0)
971 disc.number = j;
972 else
973 disc.number = raid_slot;
974 disc.state = 0;
975
976 /* only add journal to array that supports journaling */
977 if (dv->disposition == 'j') {
978 struct mdinfo *mdp;
979
980 mdp = sysfs_read(fd, NULL, GET_ARRAY_STATE);
981 if (!mdp) {
982 pr_err("%s unable to read array state.\n", devname);
983 return -1;
984 }
985
986 if (mdp->array_state != ARRAY_READONLY) {
987 sysfs_free(mdp);
988 pr_err("%s is not readonly, cannot add journal.\n", devname);
989 return -1;
990 }
991
992 sysfs_free(mdp);
993
994 disc.raid_disk = 0;
995 }
996
997 if (map_lock(&map))
998 pr_err("failed to get exclusive lock on mapfile when add disk\n");
999
1000 if (array->not_persistent==0) {
1001 int dfd;
1002 if (dv->disposition == 'j')
1003 disc.state |= (1 << MD_DISK_JOURNAL) | (1 << MD_DISK_SYNC);
1004 if (dv->writemostly == FlagSet)
1005 disc.state |= 1 << MD_DISK_WRITEMOSTLY;
1006 if (dv->failfast == FlagSet)
1007 disc.state |= 1 << MD_DISK_FAILFAST;
1008 dfd = dev_open(dv->devname, O_RDWR | O_EXCL|O_DIRECT);
1009 if (tst->ss->add_to_super(tst, &disc, dfd,
1010 dv->devname, INVALID_SECTORS))
1011 goto unlock;
1012 if (tst->ss->write_init_super(tst))
1013 goto unlock;
1014 } else if (dv->disposition == 'A') {
1015 /* this had better be raid1.
1016 * As we are "--re-add"ing we must find a spare slot
1017 * to fill.
1018 */
1019 char *used = xcalloc(array->raid_disks, 1);
1020 for (j = 0; j < tst->max_devs; j++) {
1021 mdu_disk_info_t disc2;
1022 disc2.number = j;
1023 if (md_get_disk_info(fd, &disc2))
1024 continue;
1025 if (disc2.major==0 && disc2.minor==0)
1026 continue;
1027 if (disc2.state & 8) /* removed */
1028 continue;
1029 if (disc2.raid_disk < 0)
1030 continue;
1031 if (disc2.raid_disk > array->raid_disks)
1032 continue;
1033 used[disc2.raid_disk] = 1;
1034 }
1035 for (j = 0 ; j < array->raid_disks; j++)
1036 if (!used[j]) {
1037 disc.raid_disk = j;
1038 disc.state |= (1<<MD_DISK_SYNC);
1039 break;
1040 }
1041 free(used);
1042 }
1043
1044 if (array->state & (1 << MD_SB_CLUSTERED)) {
1045 if (dv->disposition == 'c')
1046 disc.state |= (1 << MD_DISK_CANDIDATE);
1047 else
1048 disc.state |= (1 << MD_DISK_CLUSTER_ADD);
1049 }
1050
1051 if (dv->writemostly == FlagSet)
1052 disc.state |= (1 << MD_DISK_WRITEMOSTLY);
1053 if (dv->failfast == FlagSet)
1054 disc.state |= (1 << MD_DISK_FAILFAST);
1055 if (tst->ss->external) {
1056 if (manage_add_external(tst, fd, dv->devname, &disc) != MDADM_STATUS_SUCCESS)
1057 goto unlock;
1058 } else {
1059 tst->ss->free_super(tst);
1060 if (ioctl(fd, ADD_NEW_DISK, &disc)) {
1061 if (dv->disposition == 'j')
1062 pr_err("Failed to hot add %s as journal, "
1063 "please try restart %s.\n", dv->devname, devname);
1064 else
1065 pr_err("add new device failed for %s as %d: %s\n",
1066 dv->devname, j, strerror(errno));
1067 goto unlock;
1068 }
1069 if (dv->disposition == 'j') {
1070 pr_err("Journal added successfully, making %s read-write\n", devname);
1071 if (Manage_ro(devname, fd, -1))
1072 pr_err("Failed to make %s read-write\n", devname);
1073 }
1074
1075 }
1076 if (verbose >= 0)
1077 pr_err("added %s\n", dv->devname);
1078 map_unlock(&map);
1079 return 1;
1080unlock:
1081 map_unlock(&map);
1082 return -1;
1083}
1084
1085int Manage_remove(struct supertype *tst, int fd, struct mddev_dev *dv,
1086 int sysfd, unsigned long rdev, int force, int verbose, char *devname)
1087{
1088 int lfd = -1;
1089 int err;
1090
1091 if (tst->ss->external) {
1092 /* To remove a device from a container, we must
1093 * check that it isn't in use in an array.
1094 * This involves looking in the 'holders'
1095 * directory - there must be just one entry,
1096 * the container.
1097 * To ensure that it doesn't get used as a
1098 * hot spare while we are checking, we
1099 * get an O_EXCL open on the container
1100 */
1101 int ret;
1102 char devnm[32];
1103 strcpy(devnm, fd2devnm(fd));
1104 lfd = open_dev_excl(devnm);
1105 if (lfd < 0) {
1106 pr_err("Cannot get exclusive access to container - odd\n");
1107 return -1;
1108 }
1109 /* We may not be able to check on holders in
1110 * sysfs, either because we don't have the dev num
1111 * (rdev == 0) or because the device has been detached
1112 * and the 'holders' directory no longer exists
1113 * (ret == -1). In that case, assume it is OK to
1114 * remove.
1115 */
1116 if (rdev == 0)
1117 ret = -1;
1118 else {
1119 /*
1120 * The drive has already been set to 'faulty', however
1121 * monitor might not have had time to process it and the
1122 * drive might still have an entry in the 'holders'
1123 * directory. Try a few times to avoid a false error
1124 */
1125 int count = 20;
1126
1127 do {
1128 ret = sysfs_unique_holder(devnm, rdev);
1129 if (ret < 2)
1130 break;
1131 sleep_for(0, MSEC_TO_NSEC(100), true);
1132 } while (--count > 0);
1133
1134 if (ret == 0) {
1135 pr_err("%s is not a member, cannot remove.\n",
1136 dv->devname);
1137 close(lfd);
1138 return -1;
1139 }
1140 if (ret >= 2) {
1141 pr_err("%s is still in use, cannot remove.\n",
1142 dv->devname);
1143 close(lfd);
1144 return -1;
1145 }
1146 }
1147 }
1148 /* FIXME check that it is a current member */
1149 if (sysfd >= 0) {
1150 /* device has been removed and we don't know
1151 * the major:minor number
1152 */
1153 err = sys_hot_remove_disk(sysfd, force);
1154 } else {
1155 err = hot_remove_disk(fd, rdev, force);
1156 if (err && errno == ENODEV) {
1157 /* Old kernels rejected this if no personality
1158 * is registered */
1159 struct mdinfo *sra = sysfs_read(fd, NULL, GET_DEVS);
1160 struct mdinfo *dv = NULL;
1161 if (sra)
1162 dv = sra->devs;
1163 for ( ; dv ; dv=dv->next)
1164 if (dv->disk.major == (int)major(rdev) &&
1165 dv->disk.minor == (int)minor(rdev))
1166 break;
1167 if (dv)
1168 err = sysfs_set_str(sra, dv,
1169 "state", "remove");
1170 else
1171 err = -1;
1172 sysfs_free(sra);
1173 }
1174 }
1175 if (err) {
1176 pr_err("hot remove failed for %s: %s\n", dv->devname,
1177 strerror(errno));
1178 if (lfd >= 0)
1179 close(lfd);
1180 return -1;
1181 }
1182 if (tst->ss->external) {
1183 /*
1184 * Before dropping our exclusive open we make an
1185 * attempt at preventing mdmon from seeing an
1186 * 'add' event before reconciling this 'remove'
1187 * event.
1188 */
1189 char *devnm = fd2devnm(fd);
1190
1191 if (!devnm) {
1192 pr_err("unable to get container name\n");
1193 return -1;
1194 }
1195
1196 ping_manager(devnm);
1197 }
1198 if (lfd >= 0)
1199 close(lfd);
1200 if (verbose >= 0)
1201 pr_err("hot removed %s from %s\n",
1202 dv->devname, devname);
1203 return 1;
1204}
1205
1206int Manage_replace(struct supertype *tst, int fd, struct mddev_dev *dv,
1207 unsigned long rdev, int verbose, char *devname)
1208{
1209 struct mdinfo *mdi, *di;
1210 if (tst->ss->external) {
1211 pr_err("--replace only supported for native metadata (0.90 or 1.x)\n");
1212 return -1;
1213 }
1214 /* Need to find the device in sysfs and add 'want_replacement' to the
1215 * status.
1216 */
1217 mdi = sysfs_read(fd, NULL, GET_DEVS);
1218 if (!mdi || !mdi->devs) {
1219 pr_err("Cannot find status of %s to enable replacement - strange\n",
1220 devname);
1221 return -1;
1222 }
1223 for (di = mdi->devs; di; di = di->next)
1224 if (di->disk.major == (int)major(rdev) &&
1225 di->disk.minor == (int)minor(rdev))
1226 break;
1227 if (di) {
1228 int rv;
1229 if (di->disk.raid_disk < 0) {
1230 pr_err("%s is not active and so cannot be replaced.\n",
1231 dv->devname);
1232 sysfs_free(mdi);
1233 return -1;
1234 }
1235 rv = sysfs_set_str(mdi, di,
1236 "state", "want_replacement");
1237 if (rv) {
1238 sysfs_free(mdi);
1239 pr_err("Failed to request replacement for %s\n",
1240 dv->devname);
1241 return -1;
1242 }
1243 if (verbose >= 0)
1244 pr_err("Marked %s (device %d in %s) for replacement\n",
1245 dv->devname, di->disk.raid_disk, devname);
1246 /* If there is a matching 'with', we need to tell it which
1247 * raid disk
1248 */
1249 while (dv && dv->disposition != 'W')
1250 dv = dv->next;
1251 if (dv) {
1252 dv->disposition = 'w';
1253 dv->used = di->disk.raid_disk;
1254 }
1255 return 1;
1256 }
1257 sysfs_free(mdi);
1258 pr_err("%s not found in %s so cannot --replace it\n",
1259 dv->devname, devname);
1260 return -1;
1261}
1262
1263int Manage_with(struct supertype *tst, int fd, struct mddev_dev *dv,
1264 unsigned long rdev, int verbose, char *devname)
1265{
1266 struct mdinfo *mdi, *di;
1267 /* try to set 'slot' for 'rdev' in 'fd' to 'dv->used' */
1268 mdi = sysfs_read(fd, NULL, GET_DEVS|GET_STATE);
1269 if (!mdi || !mdi->devs) {
1270 pr_err("Cannot find status of %s to enable replacement - strange\n",
1271 devname);
1272 return -1;
1273 }
1274 for (di = mdi->devs; di; di = di->next)
1275 if (di->disk.major == (int)major(rdev) &&
1276 di->disk.minor == (int)minor(rdev))
1277 break;
1278 if (di) {
1279 int rv;
1280 if (di->disk.state & (1<<MD_DISK_FAULTY)) {
1281 pr_err("%s is faulty and cannot be a replacement\n",
1282 dv->devname);
1283 sysfs_free(mdi);
1284 return -1;
1285 }
1286 if (di->disk.raid_disk >= 0) {
1287 pr_err("%s is active and cannot be a replacement\n",
1288 dv->devname);
1289 sysfs_free(mdi);
1290 return -1;
1291 }
1292 rv = sysfs_set_num(mdi, di,
1293 "slot", dv->used);
1294 if (rv) {
1295 sysfs_free(mdi);
1296 pr_err("Failed to set %s as preferred replacement.\n",
1297 dv->devname);
1298 return -1;
1299 }
1300 if (verbose >= 0)
1301 pr_err("Marked %s in %s as replacement for device %d\n",
1302 dv->devname, devname, dv->used);
1303 return 1;
1304 }
1305 sysfs_free(mdi);
1306 pr_err("%s not found in %s so cannot make it preferred replacement\n",
1307 dv->devname, devname);
1308 return -1;
1309}
1310
1311/**
1312 * is_remove_safe() - Check if remove is safe.
1313 * @array: Array info.
1314 * @fd: Array file descriptor.
1315 * @devname: Name of device to remove.
1316 * @verbose: Verbose.
1317 *
1318 * The function determines if array will be operational
1319 * after removing &devname.
1320 *
1321 * Return: True if array will be operational, false otherwise.
1322 */
1323bool is_remove_safe(mdu_array_info_t *array, const int fd, char *devname, const int verbose)
1324{
1325 dev_t devid = devnm2devid(devname + 5);
1326 struct mdinfo *mdi = sysfs_read(fd, NULL, GET_DEVS | GET_DISKS | GET_STATE);
1327
1328 if (!mdi) {
1329 if (verbose)
1330 pr_err("Failed to read sysfs attributes for %s\n", devname);
1331 return false;
1332 }
1333
1334 char *avail = xcalloc(array->raid_disks, sizeof(char));
1335
1336 for (mdi = mdi->devs; mdi; mdi = mdi->next) {
1337 if (mdi->disk.raid_disk < 0)
1338 continue;
1339 if (!(mdi->disk.state & (1 << MD_DISK_SYNC)))
1340 continue;
1341 if (makedev(mdi->disk.major, mdi->disk.minor) == devid)
1342 continue;
1343 avail[mdi->disk.raid_disk] = 1;
1344 }
1345 sysfs_free(mdi);
1346
1347 bool is_enough = enough(array->level, array->raid_disks,
1348 array->layout, 1, avail);
1349
1350 free(avail);
1351 return is_enough;
1352}
1353
1354/**
1355 * Manage_subdevs() - Execute operation depending on devmode.
1356 *
1357 * @devname: name of the device.
1358 * @fd: file descriptor.
1359 * @devlist: list of sub-devices to manage.
1360 * @verbose: verbose level.
1361 * @test: test flag.
1362 * @update: type of update.
1363 * @force: force flag.
1364 *
1365 * This function executes operation defined by devmode
1366 * for each dev from devlist.
1367 * Devmode can be:
1368 * 'a' - add the device
1369 * 'S' - add the device as a spare - don't try re-add
1370 * 'j' - add the device as a journal device
1371 * 'A' - re-add the device
1372 * 'r' - remove the device: HOT_REMOVE_DISK
1373 * device can be 'faulty' or 'detached' in which case all
1374 * matching devices are removed.
1375 * 'f' - set the device faulty SET_DISK_FAULTY
1376 * device can be 'detached' in which case any device that
1377 * is inaccessible will be marked faulty.
1378 * 'I' - remove device by using incremental fail
1379 * which is executed when device is removed surprisingly.
1380 * 'R' - mark this device as wanting replacement.
1381 * 'W' - this device is added if necessary and activated as
1382 * a replacement for a previous 'R' device.
1383 * -----
1384 * 'w' - 'W' will be changed to 'w' when it is paired with
1385 * a 'R' device. If a 'W' is found while walking the list
1386 * it must be unpaired, and is an error.
1387 * 'M' - this is created by a 'missing' target. It is a slight
1388 * variant on 'A'
1389 * 'F' - Another variant of 'A', where the device was faulty
1390 * so must be removed from the array first.
1391 * 'c' - confirm the device as found (for clustered environments)
1392 *
1393 * For 'f' and 'r', the device can also be a kernel-internal
1394 * name such as 'sdb'.
1395 *
1396 * Return: 0 on success, otherwise 1 or 2.
1397 */
1398int Manage_subdevs(char *devname, int fd,
1399 struct mddev_dev *devlist, int verbose, int test,
1400 enum update_opt update, int force)
1401{
1402 mdu_array_info_t array;
1403 unsigned long long array_size;
1404 struct mddev_dev *dv;
1405 int tfd = -1;
1406 struct supertype *tst = NULL;
1407 char *subarray = NULL;
1408 int sysfd = -1;
1409 int count = 0; /* number of actions taken */
1410 struct mdinfo info;
1411 struct mdinfo devinfo;
1412 int frozen = 0;
1413 int busy = 0;
1414 int raid_slot = -1;
1415
1416 if (sysfs_init(&info, fd, NULL)) {
1417 pr_err("sysfs not availabile for %s\n", devname);
1418 goto abort;
1419 }
1420
1421 if (md_get_array_info(fd, &array)) {
1422 pr_err("Cannot get array info for %s\n", devname);
1423 goto abort;
1424 }
1425 /* array.size is only 32 bits and may be truncated.
1426 * So read from sysfs if possible, and record number of sectors
1427 */
1428
1429 array_size = get_component_size(fd);
1430 if (array_size <= 0)
1431 array_size = array.size * 2;
1432
1433 tst = super_by_fd(fd, &subarray);
1434 if (!tst) {
1435 pr_err("unsupport array - version %d.%d\n",
1436 array.major_version, array.minor_version);
1437 goto abort;
1438 }
1439
1440 for (dv = devlist; dv; dv = dv->next) {
1441 dev_t rdev = 0; /* device to add/remove etc */
1442 int rv;
1443 int mj,mn;
1444
1445 raid_slot = -1;
1446 if (dv->disposition == 'c') {
1447 rv = parse_cluster_confirm_arg(dv->devname,
1448 &dv->devname,
1449 &raid_slot);
1450 if (rv) {
1451 pr_err("Could not get the devname of cluster\n");
1452 goto abort;
1453 }
1454 }
1455
1456 if (strcmp(dv->devname, "failed") == 0 ||
1457 strcmp(dv->devname, "faulty") == 0) {
1458 if (dv->disposition != 'A' && dv->disposition != 'r') {
1459 pr_err("%s only meaningful with -r or --re-add, not -%c\n",
1460 dv->devname, dv->disposition);
1461 goto abort;
1462 }
1463 add_faulty(dv, fd, (dv->disposition == 'A'
1464 ? 'F' : 'r'));
1465 continue;
1466 }
1467 if (strcmp(dv->devname, "detached") == 0) {
1468 if (dv->disposition != 'r' && dv->disposition != 'f') {
1469 pr_err("%s only meaningful with -r of -f, not -%c\n",
1470 dv->devname, dv->disposition);
1471 goto abort;
1472 }
1473 add_detached(dv, fd, dv->disposition);
1474 continue;
1475 }
1476
1477 if (strcmp(dv->devname, "missing") == 0) {
1478 struct mddev_dev *add_devlist;
1479 struct mddev_dev **dp;
1480 if (dv->disposition == 'c') {
1481 rv = ioctl(fd, CLUSTERED_DISK_NACK, NULL);
1482 break;
1483 }
1484
1485 if (dv->disposition != 'A') {
1486 pr_err("'missing' only meaningful with --re-add\n");
1487 goto abort;
1488 }
1489 add_devlist = conf_get_devs();
1490 if (add_devlist == NULL) {
1491 pr_err("no devices to scan for missing members.\n");
1492 continue;
1493 }
1494 for (dp = &add_devlist; *dp; dp = & (*dp)->next)
1495 /* 'M' (for 'missing') is like 'A' without errors */
1496 (*dp)->disposition = 'M';
1497 *dp = dv->next;
1498 dv->next = add_devlist;
1499 continue;
1500 }
1501
1502 if (strncmp(dv->devname, "set-", 4) == 0 &&
1503 strlen(dv->devname) == 5) {
1504 int copies;
1505
1506 if (dv->disposition != 'r' &&
1507 dv->disposition != 'f') {
1508 pr_err("'%s' only meaningful with -r or -f\n",
1509 dv->devname);
1510 goto abort;
1511 }
1512 if (array.level != 10) {
1513 pr_err("'%s' only meaningful with RAID10 arrays\n",
1514 dv->devname);
1515 goto abort;
1516 }
1517 copies = ((array.layout & 0xff) *
1518 ((array.layout >> 8) & 0xff));
1519 if (array.raid_disks % copies != 0 ||
1520 dv->devname[4] < 'A' ||
1521 dv->devname[4] >= 'A' + copies ||
1522 copies > 26) {
1523 pr_err("'%s' not meaningful with this array\n",
1524 dv->devname);
1525 goto abort;
1526 }
1527 add_set(dv, fd, dv->devname[4]);
1528 continue;
1529 }
1530
1531 if (strchr(dv->devname, '/') == NULL &&
1532 strchr(dv->devname, ':') == NULL &&
1533 strlen(dv->devname) < 50) {
1534 /* Assume this is a kernel-internal name like 'sda1' */
1535 int found = 0;
1536 char dname[55];
1537 if (dv->disposition != 'r' && dv->disposition != 'f' &&
1538 dv->disposition != 'I') {
1539 pr_err("%s only meaningful with -r, -f or -I, not -%c\n",
1540 dv->devname, dv->disposition);
1541 goto abort;
1542 }
1543
1544 sprintf(dname, "dev-%s", dv->devname);
1545 sysfd = sysfs_open(fd2devnm(fd), dname, "block/dev");
1546 if (sysfd >= 0) {
1547 char dn[SYSFS_MAX_BUF_SIZE];
1548 if (sysfs_fd_get_str(sysfd, dn, sizeof(dn)) > 0 &&
1549 sscanf(dn, "%d:%d", &mj,&mn) == 2) {
1550 rdev = makedev(mj,mn);
1551 found = 1;
1552 }
1553 close(sysfd);
1554 sysfd = -1;
1555 }
1556 if (!found) {
1557 sysfd = sysfs_open(fd2devnm(fd), dname, "state");
1558 if (sysfd < 0) {
1559 pr_err("%s does not appear to be a component of %s\n",
1560 dv->devname, devname);
1561 goto abort;
1562 }
1563 }
1564 } else if ((dv->disposition == 'r' ||
1565 dv->disposition == 'f') &&
1566 get_maj_min(dv->devname, &mj, &mn)) {
1567 /* for 'fail' and 'remove', the device might
1568 * not exist.
1569 */
1570 rdev = makedev(mj, mn);
1571 } else {
1572 tfd = dev_open(dv->devname, O_RDONLY);
1573 if (tfd >= 0) {
1574 fstat_is_blkdev(tfd, dv->devname, &rdev);
1575 close(tfd);
1576 } else {
1577 int open_err = errno;
1578 if (!stat_is_blkdev(dv->devname, &rdev)) {
1579 if (dv->disposition == 'M')
1580 /* non-fatal. Also improbable */
1581 continue;
1582 goto abort;
1583 }
1584 if (dv->disposition == 'r')
1585 /* Be happy, the stat worked, that is
1586 * enough for --remove
1587 */
1588 ;
1589 else {
1590 if (dv->disposition == 'M')
1591 /* non-fatal */
1592 continue;
1593 pr_err("Cannot open %s: %s\n",
1594 dv->devname, strerror(open_err));
1595 goto abort;
1596 }
1597 }
1598 }
1599 switch(dv->disposition){
1600 default:
1601 pr_err("internal error - devmode[%s]=%d\n",
1602 dv->devname, dv->disposition);
1603 goto abort;
1604 case 'a':
1605 case 'S': /* --add-spare */
1606 case 'j': /* --add-journal */
1607 case 'A':
1608 case 'M': /* --re-add missing */
1609 case 'F': /* --re-add faulty */
1610 case 'c': /* --cluster-confirm */
1611 /* add the device */
1612 if (subarray) {
1613 pr_err("Cannot add disks to a \'member\' array, perform this operation on the parent container\n");
1614 goto abort;
1615 }
1616
1617 /* Let's first try to write re-add to sysfs */
1618 if (rdev != 0 &&
1619 (dv->disposition == 'A' || dv->disposition == 'F')) {
1620 sysfs_init_dev(&devinfo, rdev);
1621 if (sysfs_set_str(&info, &devinfo, "state", "re-add") == 0) {
1622 pr_err("re-add %s to %s succeed\n",
1623 dv->devname, info.sys_name);
1624 break;
1625 }
1626 }
1627
1628 if (dv->disposition == 'F')
1629 /* Need to remove first */
1630 hot_remove_disk(fd, rdev, force);
1631 /* Make sure it isn't in use (in 2.6 or later) */
1632 tfd = dev_open(dv->devname, O_RDONLY|O_EXCL);
1633 if (tfd >= 0) {
1634 /* We know no-one else is using it. We'll
1635 * need non-exclusive access to add it, so
1636 * do that now.
1637 */
1638 close(tfd);
1639 tfd = dev_open(dv->devname, O_RDONLY);
1640 }
1641 if (tfd < 0) {
1642 if (dv->disposition == 'M')
1643 continue;
1644 pr_err("Cannot open %s: %s\n",
1645 dv->devname, strerror(errno));
1646 goto abort;
1647 }
1648 if (!frozen) {
1649 if (sysfs_freeze_array(&info) == 1)
1650 frozen = 1;
1651 else
1652 frozen = -1;
1653 }
1654 rv = Manage_add(fd, tfd, dv, tst, &array,
1655 force, verbose, devname, update,
1656 rdev, array_size, raid_slot);
1657 close(tfd);
1658 tfd = -1;
1659 if (rv < 0)
1660 goto abort;
1661 if (rv > 0)
1662 count++;
1663 break;
1664
1665 case 'r':
1666 /* hot remove */
1667 if (subarray) {
1668 pr_err("Cannot remove disks from a \'member\' array, perform this operation on the parent container\n");
1669 rv = -1;
1670 } else
1671 rv = Manage_remove(tst, fd, dv, sysfd,
1672 rdev, verbose, force,
1673 devname);
1674 if (sysfd >= 0)
1675 close(sysfd);
1676 sysfd = -1;
1677 if (rv < 0)
1678 goto abort;
1679 if (rv > 0)
1680 count++;
1681 break;
1682
1683 case 'f': /* set faulty */
1684 if (!is_remove_safe(&array, fd, dv->devname, verbose)) {
1685 pr_err("Cannot remove %s from %s, array will be failed.\n",
1686 dv->devname, devname);
1687 if (sysfd >= 0)
1688 close(sysfd);
1689 goto abort;
1690 }
1691 case 'I': /* incremental fail */
1692 if ((sysfd >= 0 && write(sysfd, "faulty", 6) != 6) ||
1693 (sysfd < 0 && ioctl(fd, SET_DISK_FAULTY,
1694 rdev))) {
1695 if (errno == EBUSY)
1696 busy = 1;
1697 pr_err("set device faulty failed for %s: %s\n",
1698 dv->devname, strerror(errno));
1699 if (sysfd >= 0)
1700 close(sysfd);
1701 goto abort;
1702 }
1703 if (sysfd >= 0)
1704 close(sysfd);
1705 sysfd = -1;
1706 count++;
1707 if (verbose >= 0)
1708 pr_err("set %s faulty in %s\n",
1709 dv->devname, devname);
1710 break;
1711 case 'R': /* Mark as replaceable */
1712 if (subarray) {
1713 pr_err("Cannot replace disks in a \'member\' array, perform this operation on the parent container\n");
1714 rv = -1;
1715 } else {
1716 if (!frozen) {
1717 if (sysfs_freeze_array(&info) == 1)
1718 frozen = 1;
1719 else
1720 frozen = -1;
1721 }
1722 rv = Manage_replace(tst, fd, dv,
1723 rdev, verbose,
1724 devname);
1725 }
1726 if (rv < 0)
1727 goto abort;
1728 if (rv > 0)
1729 count++;
1730 break;
1731 case 'W': /* --with device that doesn't match */
1732 pr_err("No matching --replace device for --with %s\n",
1733 dv->devname);
1734 goto abort;
1735 case 'w': /* --with device which was matched */
1736 rv = Manage_with(tst, fd, dv,
1737 rdev, verbose, devname);
1738 if (rv < 0)
1739 goto abort;
1740 break;
1741 }
1742 }
1743 free(tst);
1744 if (frozen > 0)
1745 sysfs_set_str(&info, NULL, "sync_action","idle");
1746 if (test && count == 0)
1747 return 2;
1748 return 0;
1749
1750abort:
1751 free(tst);
1752 if (frozen > 0)
1753 sysfs_set_str(&info, NULL, "sync_action","idle");
1754 return !test && busy ? 2 : 1;
1755}
1756
1757int autodetect(void)
1758{
1759 /* Open any md device, and issue the RAID_AUTORUN ioctl */
1760 int rv = 1;
1761 int fd = dev_open("9:0", O_RDONLY);
1762 if (fd >= 0) {
1763 if (ioctl(fd, RAID_AUTORUN, 0) == 0)
1764 rv = 0;
1765 close(fd);
1766 }
1767 return rv;
1768}
1769
1770int Update_subarray(char *dev, char *subarray, enum update_opt update,
1771 struct mddev_ident *ident, int verbose)
1772{
1773 struct supertype supertype, *st = &supertype;
1774 int fd, rv = 2;
1775 struct mdinfo *info = NULL;
1776 char *update_verb = map_num(update_options, update);
1777 bool allow_active = update == UOPT_PPL || update == UOPT_NO_PPL;
1778
1779 memset(st, 0, sizeof(*st));
1780
1781 fd = open_subarray(dev, subarray, st, verbose < 0);
1782 if (fd < 0)
1783 return 2;
1784
1785 if (!st->ss->update_subarray) {
1786 if (verbose >= 0)
1787 pr_err("Operation not supported for %s metadata\n",
1788 st->ss->name);
1789 goto free_super;
1790 }
1791
1792 if (!allow_active && is_subarray_active(subarray, st->devnm)) {
1793 if (verbose >= 0)
1794 pr_err("Subarray %s in %s is active, cannot update %s\n",
1795 subarray, dev, update_verb);
1796 goto free_super;
1797 }
1798
1799 if (mdmon_running(st->devnm))
1800 st->update_tail = &st->updates;
1801
1802 info = st->ss->container_content(st, subarray);
1803
1804 if (update == UOPT_PPL && !is_level456(info->array.level)) {
1805 pr_err("RWH policy ppl is supported only for raid4, raid5 and raid6.\n");
1806 goto free_super;
1807 }
1808
1809 rv = st->ss->update_subarray(st, subarray, update, ident);
1810
1811 if (rv) {
1812 if (verbose >= 0)
1813 pr_err("Failed to update %s of subarray-%s in %s\n",
1814 update_verb, subarray, dev);
1815 } else if (st->update_tail)
1816 flush_metadata_updates(st);
1817 else
1818 st->ss->sync_metadata(st);
1819
1820 if (rv == 0 && update == UOPT_NAME && verbose >= 0)
1821 pr_err("Updated subarray-%s name from %s, UUIDs may have changed\n",
1822 subarray, dev);
1823
1824free_super:
1825 if (info)
1826 free(info);
1827 st->ss->free_super(st);
1828 close(fd);
1829
1830 return rv;
1831}
1832
1833/* Move spare from one array to another If adding to destination array fails
1834 * add back to original array.
1835 * Returns 1 on success, 0 on failure */
1836int move_spare(char *from_devname, char *to_devname, dev_t devid)
1837{
1838 struct mddev_dev devlist;
1839 char devname[20];
1840
1841 /* try to remove and add */
1842 int fd1 = open(to_devname, O_RDONLY);
1843 int fd2 = open(from_devname, O_RDONLY);
1844
1845 if (fd1 < 0 || fd2 < 0) {
1846 if (fd1 >= 0)
1847 close(fd1);
1848 if (fd2 >= 0)
1849 close(fd2);
1850 return 0;
1851 }
1852
1853 devlist.next = NULL;
1854 devlist.used = 0;
1855 devlist.writemostly = FlagDefault;
1856 devlist.failfast = FlagDefault;
1857 devlist.devname = devname;
1858 sprintf(devname, "%d:%d", major(devid), minor(devid));
1859
1860 devlist.disposition = 'r';
1861 if (Manage_subdevs(from_devname, fd2, &devlist, -1, 0, UOPT_UNDEFINED, 0) == 0) {
1862 devlist.disposition = 'a';
1863 if (Manage_subdevs(to_devname, fd1, &devlist, -1, 0,
1864 UOPT_UNDEFINED, 0) == 0) {
1865 /* make sure manager is aware of changes */
1866 ping_manager(to_devname);
1867 ping_manager(from_devname);
1868 close(fd1);
1869 close(fd2);
1870 return 1;
1871 }
1872 else
1873 Manage_subdevs(from_devname, fd2, &devlist,
1874 -1, 0, UOPT_UNDEFINED, 0);
1875 }
1876 close(fd1);
1877 close(fd2);
1878 return 0;
1879}