monitor.c

   1 /*
   2  * mdmon - monitor external metadata arrays
   3  *
   4  * Copyright (C) 2007-2009 Neil Brown <neilb@suse.de>
   5  * Copyright (C) 2007-2009 Intel Corporation
   6  *
   7  * This program is free software; you can redistribute it and/or modify it
   8  * under the terms and conditions of the GNU General Public License,
   9  * version 2, as published by the Free Software Foundation.
  10  *
  11  * This program is distributed in the hope it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  14  * more details.
  15  *
  16  * You should have received a copy of the GNU General Public License along with
  17  * this program; if not, write to the Free Software Foundation, Inc.,
  18  * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
  19  */
  20
  21 #include "mdadm.h"
  22 #include "mdmon.h"
  23 #include <sys/syscall.h>
  24 #include <sys/select.h>
  25
  /*
   * Keyword tables for the sysfs attributes parsed below.
   *
   * read_state() and read_action() cast the index returned by
   * sysfs_match_word() to enum array_state / enum sync_action, and
   * read_and_act() indexes the tables with those values, so the entries
   * must stay in this order.  Each table ends with a NULL sentinel.
   */
  static char *array_states[] = {
          "clear",
          "inactive",
          "suspended",
          "readonly",
          "read-auto",
          "clean",
          "active",
          "write-pending",
          "active-idle",
          "broken",
          NULL
  };

  static char *sync_actions[] = {
          "idle",
          "reshape",
          "resync",
          "recover",
          "check",
          "repair",
          NULL
  };
  32
  /* What read_bb_file() does with each entry parsed from a bad block list. */
  enum bb_action {
          RECORD_BB = 1,  /* entry is handed to process_ubb() */
          COMPARE_BB = 2, /* entry is handed to compare_bb() */
  };
  37
  /*
   * Add @fd to the descriptor set @fds and raise *@maxfd when @fd is larger.
   *
   * Descriptors that cannot be watched are skipped and leave both @fds and
   * *@maxfd untouched: negative values, values that do not fit into an
   * fd_set, descriptors rejected by fstat() and files whose link count
   * dropped to zero.  All but the negative case emit a debug message.
   */
  static void add_fd(fd_set *fds, int *maxfd, int fd)
  {
          struct stat st;

          if (fd < 0)
                  return;
          if (fd >= FD_SETSIZE) {
                  /* FD_SET() is undefined for descriptors outside the set */
                  dprintf("fd %d does not fit in fd_set\n", fd);
                  return;
          }
          if (fstat(fd, &st) == -1) {
                  dprintf("Invalid fd %d\n", fd);
                  return;
          }
          if (st.st_nlink == 0) {
                  dprintf("fd %d was deleted\n", fd);
                  return;
          }
          if (fd > *maxfd)
                  *maxfd = fd;
          FD_SET(fd, fds);
  }
  55
  /*
   * Read the attribute behind @fd, starting at offset 0, into @buf as a string.
   *
   * At most @len - 1 bytes are read.  The result is NUL terminated and one
   * trailing newline, if present, is stripped.
   *
   * Returns the number of bytes read (a stripped newline is still counted).
   * Returns 0 and stores an empty string when @fd is negative, when read()
   * fails or when there is nothing to read.
   */
  static int read_attr(char *buf, int len, int fd)
  {
          int nread = 0;

          if (fd >= 0) {
                  /* rewind first; the result of lseek() is deliberately not checked */
                  lseek(fd, 0, SEEK_SET);
                  nread = read(fd, buf, len - 1);
          }

          if (nread <= 0) {
                  buf[0] = '\0';
                  return 0;
          }

          buf[nread] = '\0';
          if (buf[nread - 1] == '\n')
                  buf[nread - 1] = '\0';

          return nread;
  }
  76
  77 static void read_resync_start(int fd, unsigned long long *v)
  78 {
  79         char buf[SYSFS_MAX_BUF_SIZE];
  80         int n;
  81
  82         n = read_attr(buf, sizeof(buf), fd);
  83         if (n <= 0) {
  84                 dprintf("Failed to read resync_start (%d)\n", fd);
  85                 return;
  86         }
  87         if (str_is_none(buf) == true)
  88                 *v = MaxSector;
  89         else
  90                 *v = strtoull(buf, NULL, 10);
  91 }
  92
  93 static unsigned long long read_sync_completed(int fd)
  94 {
  95         unsigned long long val;
  96         char buf[SYSFS_MAX_BUF_SIZE];
  97         int n;
  98         char *ep;
  99
 100         n = read_attr(buf, sizeof(buf), fd);
 101
 102         if (n <= 0)
 103                 return 0;
 104         buf[n] = 0;
 105         val = strtoull(buf, &ep, 0);
 106         if (ep == buf || (*ep != 0 && *ep != '\n' && *ep != ' '))
 107                 return 0;
 108         return val;
 109 }
 110
 111 static enum array_state read_state(int fd)
 112 {
 113         char buf[SYSFS_MAX_BUF_SIZE];
 114         int n = read_attr(buf, sizeof(buf), fd);
 115
 116         if (n <= 0)
 117                 return bad_word;
 118         return (enum array_state) sysfs_match_word(buf, array_states);
 119 }
 120
 121 static enum sync_action read_action( int fd)
 122 {
 123         char buf[SYSFS_MAX_BUF_SIZE];
 124         int n = read_attr(buf, sizeof(buf), fd);
 125
 126         if (n <= 0)
 127                 return bad_action;
 128         return (enum sync_action) sysfs_match_word(buf, sync_actions);
 129 }
 130
 131 int read_dev_state(int fd)
 132 {
 133         char buf[SYSFS_MAX_BUF_SIZE];
 134         int n = read_attr(buf, sizeof(buf), fd);
 135         char *cp;
 136         int rv = 0;
 137
 138         if (n <= 0)
 139                 return 0;
 140
 141         cp = buf;
 142         while (cp) {
 143                 if (sysfs_attr_match(cp, map_memb_state(MEMB_STATE_FAULTY)))
 144                         rv |= DS_FAULTY;
 145                 if (sysfs_attr_match(cp, map_memb_state(MEMB_STATE_IN_SYNC)))
 146                         rv |= DS_INSYNC;
 147                 if (sysfs_attr_match(cp, map_memb_state(MEMB_STATE_WRITE_MOSTLY)))
 148                         rv |= DS_WRITE_MOSTLY;
 149                 if (sysfs_attr_match(cp, map_memb_state(MEMB_STATE_SPARE)))
 150                         rv |= DS_SPARE;
 151                 if (sysfs_attr_match(cp, map_memb_state(MEMB_STATE_BLOCKED)))
 152                         rv |= DS_BLOCKED;
 153                 if (sysfs_attr_match(cp, map_memb_state(MEMB_STATE_EXTERNAL_BBL)))
 154                         rv |= DS_EXTERNAL_BB;
 155                 cp = strchr(cp, ',');
 156                 if (cp)
 157                         cp++;
 158         }
 159         return rv;
 160 }
 161
 162 int process_ubb(struct active_array *a, struct mdinfo *mdi, const unsigned long
 163                 long sector, const int length, const char *buf,
 164                 const int buf_len)
 165 {
 166         struct superswitch *ss = a->container->ss;
 167
 168         /*
 169          * record bad block in metadata first, then acknowledge it to the driver
 170          * via sysfs file
 171          */
 172         if ((ss->record_bad_block(a, mdi->disk.raid_disk, sector, length)) &&
 173             (sysfs_write_descriptor(mdi->bb_fd, buf, buf_len, NULL) == MDADM_STATUS_SUCCESS))
 174                 return 1;
 175
 176         /*
 177          * failed to store or acknowledge bad block, switch of bad block support
 178          * to get it out of blocked state
 179          */
 180         sysfs_set_str(&a->info, mdi, "state", "-external_bbl");
 181         return -1;
 182 }
 183
 184 int compare_bb(struct active_array *a, struct mdinfo *mdi, const unsigned long
 185                long sector, const unsigned int length, void *arg)
 186 {
 187         struct superswitch *ss = a->container->ss;
 188         struct md_bb *bb = (struct md_bb *) arg;
 189         int record = 1;
 190         int i;
 191
 192         for (i = 0; i < bb->count; i++) {
 193                 unsigned long long start = bb->entries[i].sector;
 194                 unsigned long long len = bb->entries[i].length;
 195
 196                 /*
 197                  * bad block in metadata exactly matches bad block in kernel
 198                  * list, just remove it from a list
 199                  */
 200                 if ((start == sector) && (len == length)) {
 201                         if (i < bb->count - 1)
 202                                 bb->entries[i] = bb->entries[bb->count - 1];
 203                         bb->count -= 1;
 204                         record = 0;
 205                         break;
 206                 }
 207                 /*
 208                  * bad block in metadata spans bad block in kernel list,
 209                  * clear it and record new bad block
 210                  */
 211                 if ((sector >= start) && (sector + length <= start + len)) {
 212                         ss->clear_bad_block(a, mdi->disk.raid_disk, start, len);
 213                         break;
 214                 }
 215         }
 216
 217         /* record all bad blocks not in metadata list */
 218         if (record && (ss->record_bad_block(a, mdi->disk.raid_disk, sector,
 219                                              length) <= 0)) {
 220                 sysfs_set_str(&a->info, mdi, "state", "-external_bbl");
 221                 return -1;
 222         }
 223
 224         return 1;
 225 }
 226
 227 static int read_bb_file(int fd, struct active_array *a, struct mdinfo *mdi,
 228                         enum bb_action action, void *arg)
 229 {
 230         char buf[30];
 231         int n = 0;
 232         int ret = 0;
 233         int read_again = 0;
 234         int off = 0;
 235         int pos = 0;
 236         int preserve_pos = (action == RECORD_BB ? 0 : 1);
 237
 238         if (lseek(fd, 0, SEEK_SET) == (off_t) -1)
 239                 return -1;
 240
 241         do {
 242                 read_again = 0;
 243                 n = read(fd, buf + pos, sizeof(buf) - 1 - pos);
 244                 if (n < 0)
 245                         return -1;
 246                 n += pos;
 247
 248                 buf[n] = '\0';
 249                 off = 0;
 250
 251                 while (off < n) {
 252                         unsigned long long sector;
 253                         int length;
 254                         char newline;
 255                         int consumed;
 256                         int matched;
 257                         int rc;
 258
 259                         /* kernel sysfs file format: "sector length\n" */
 260                         matched = sscanf(buf + off, "%llu %d%c%n", &sector,
 261                                          &length, &newline, &consumed);
 262                         if ((matched != 3) && (off > 0)) {
 263                                 /* truncated entry, read again */
 264                                 if (preserve_pos) {
 265                                         pos = sizeof(buf) - off - 1;
 266                                         memmove(buf, buf + off, pos);
 267                                 } else {
 268                                         if (lseek(fd, 0, SEEK_SET) ==
 269                                             (off_t) -1)
 270                                                 return -1;
 271                                 }
 272                                 read_again = 1;
 273                                 break;
 274                         }
 275                         if (matched != 3)
 276                                 return -1;
 277                         if (newline != '\n')
 278                                 return -1;
 279                         if (length <= 0)
 280                                 return -1;
 281
 282                         if (action == RECORD_BB)
 283                                 rc = process_ubb(a, mdi, sector, length,
 284                                                   buf + off, consumed);
 285                         else if (action == COMPARE_BB)
 286                                 rc = compare_bb(a, mdi, sector, length, arg);
 287                         else
 288                                 rc = -1;
 289
 290                         if (rc < 0)
 291                                 return rc;
 292                         ret += rc;
 293                         off += consumed;
 294                 }
 295         } while (read_again);
 296
 297         return ret;
 298 }
 299
 300 static int process_dev_ubb(struct active_array *a, struct mdinfo *mdi)
 301 {
 302         return read_bb_file(mdi->ubb_fd, a, mdi, RECORD_BB, NULL);
 303 }
 304
 305 static int check_for_cleared_bb(struct active_array *a, struct mdinfo *mdi)
 306 {
 307         struct superswitch *ss = a->container->ss;
 308         struct md_bb *bb;
 309         int i;
 310
 311         /*
 312          * Get a list of bad blocks for an array, then read list of
 313          * acknowledged bad blocks from kernel and compare it against metadata
 314          * list, clear all bad blocks remaining in metadata list
 315          */
 316         bb = ss->get_bad_blocks(a, mdi->disk.raid_disk);
 317         if (!bb)
 318                 return -1;
 319
 320         if (read_bb_file(mdi->bb_fd, a, mdi, COMPARE_BB, bb) < 0)
 321                 return -1;
 322
 323         for (i = 0; i < bb->count; i++) {
 324                 unsigned long long sector = bb->entries[i].sector;
 325                 int length = bb->entries[i].length;
 326
 327                 ss->clear_bad_block(a, mdi->disk.raid_disk, sector, length);
 328         }
 329
 330         return 0;
 331 }
 332
 333 static void signal_manager(void)
 334 {
 335         /* tgkill(getpid(), mon_tid, SIGUSR1); */
 336         int pid = getpid();
 337         syscall(SYS_tgkill, pid, mgr_tid, SIGUSR1);
 338 }
 339
 340 /* Monitor a set of active md arrays - all of which share the
 341  * same metadata - and respond to events that require
 342  * metadata update.
 343  *
 344  * New arrays are detected by another thread which allocates
 345  * required memory and attaches the data structure to our list.
 346  *
 347  * Events:
 348  *  Array stops.
 349  *    This is detected by array_state going to 'clear' or 'inactive'
 350  *    while we thought it was active.
 351  *    Response is to mark metadata as clean and 'clear' the array(??)
 352  *  write-pending
 353  *    array_state is 'write-pending'
 354  *    We mark metadata as 'dirty' then set array to 'active'.
 355  *  active_idle
 356  *    Either ignore, or mark clean, then mark metadata as clean.
 357  *
 358  *  device fails
 359  *    detected by rd-N/state reporting "faulty"
 360  *    mark device as 'failed' in metadata, let the kernel release the
 361  *    device by writing '-blocked' to rd/state, and finally write 'remove' to
 362  *    rd/state.  Before a disk can be replaced it must be failed and removed
 363  *    from all container members, this will be preemptive for the other
 364  *    arrays... safe?
 365  *
 366  *  sync completes
 367  *    sync_action was 'resync' and becomes 'idle' and resync_start becomes
 368  *    MaxSector
 369  *    Notify metadata that sync is complete.
 370  *
 371  *  recovery completes
 372  *    sync_action changes from 'recover' to 'idle'
 373  *    Check each device state and mark metadata if 'faulty' or 'in_sync'.
 374  *
 375  *  deal with resync
 376  *    This only happens on finding a new array... mdadm will have set
 377  *    'resync_start' to the correct value.  If 'resync_start' indicates that a
 378  *    resync needs to occur, set the array to the 'active' state rather than the
 379  *    initial read-auto state.
 380  *
 381  *
 382  *
 383  * We wait for a change (poll/select) on array_state, sync_action, and
 384  * each rd-X/state file.
 385  * When we get any change, we check everything.  So read each state file,
 386  * then decide what to do.
 387  *
 388  * The core action is to write new metadata to all devices in the array.
 389  * This is done at most once on any wakeup.
 390  * After that we might:
 391  *   - update the array_state
 392  *   - set the role of some devices.
 393  *   - request a sync_action
 394  *
 395  */
 396
 /*
  * Flag bits for the value returned by read_and_act().  The function below
  * only ever sets ARRAY_DIRTY.
  */
 397 #define ARRAY_DIRTY 1
 398 #define ARRAY_BUSY 2
 /*
  * Run one monitoring pass over array @a.
  *
  * The pass reads the array state, sync action, resync position and every
  * member device state from sysfs, services the external bad block lists,
  * reports what it found to the metadata handler (a->container->ss), calls
  * sync_metadata() once, writes the chosen next array/device states back to
  * sysfs and finally signals the manager when follow-up work was flagged.
  *
  * Returns 0 or ARRAY_DIRTY.  ARRAY_DIRTY is set for a write-pending,
  * active-idle, active or suspended array, and for a readonly array for
  * which set_array_state(a, 2) returned 0.
  *
  * When the array is found stopped, a->container is set to NULL before
  * returning.
  */
 399 static int read_and_act(struct active_array *a)
 400 {
 401         unsigned long long sync_completed;
 402         bool disks_to_remove = false;
 403         bool check_degraded = false;
 404         bool check_reshape = false;
 405         int deactivate = 0;
 406         struct mdinfo *mdi;
 407         int ret = 0;
 408         int count = 0;
 409         bool write_checkpoint = false;
 410
             /* bad_word/bad_action mean "nothing to write back" further down */
 411         a->next_state = bad_word;
 412         a->next_action = bad_action;
 413
             /* snapshot the array level attributes */
 414         a->curr_state = read_state(a->info.state_fd);
 415         a->curr_action = read_action(a->action_fd);
 416         if (a->curr_state != clear)
 417                 /*
 418                  * In "clear" state, resync_start may wrongly be set to "0"
 419                  * when the kernel called md_clean but didn't remove the
 420                  * sysfs attributes yet
 421                  */
 422                 read_resync_start(a->resync_start_fd, &a->info.resync_start);
 423         sync_completed = read_sync_completed(a->sync_completed_fd);
             /* snapshot every member device and service its bad block lists */
 424         for (mdi = a->info.devs; mdi ; mdi = mdi->next) {
 425                 mdi->next_state = 0;
 426                 mdi->curr_state = 0;
 427
 428                 if (mdi->man_disk_to_remove)
 429                         /* We are removing this device, skip it then */
 430                         continue;
 431
 432                 read_resync_start(mdi->recovery_fd, &mdi->recovery_start);
 433                 mdi->curr_state = read_dev_state(mdi->state_fd);
 434
 435                 if (!(mdi->curr_state & DS_EXTERNAL_BB))
 436                         /*
 437                          * It assumes that superswitch badblock functions are set if disk
 438                          * has external badblocks support configured.
 439                          */
 440                         continue;
 441
 442                 if ((mdi->curr_state & DS_BLOCKED) && process_dev_ubb(a, mdi) > 0)
 443                         /*
 444                          * Blocked has two meanings: we need to acknowledge failure or badblocks
 445                          * (if supported). Here, badblocks are handled.
 446                          *
 447                          * If successful, unblock the array. This is not perfect but
 448                          * process_dev_ubb() may disable badblock support in case of failure.
 449                          */
 450                         mdi->next_state |= DS_UNBLOCK;
 451
                     /* the result is not checked */
 452                 check_for_cleared_bb(a, mdi);
 453         }
 454
 455         dprintf("(%d): state:%s prev:%s action:%s prev: %s start:%llu\n",
 456                 a->info.container_member,
 457                 array_states[a->curr_state],
 458                 array_states[a->prev_state],
 459                 sync_actions[a->curr_action],
 460                 sync_actions[a->prev_action],
 461                 a->info.resync_start
 462                 );
 463
             /* translate the observed array_state into metadata updates and next_state */
 464         if ((a->curr_state == bad_word || a->curr_state <= inactive) &&
 465             a->prev_state > inactive) {
 466                 /* array has been stopped */
 467                 a->container->ss->set_array_state(a, 1);
 468                 a->next_state = clear;
 469                 deactivate = 1;
 470         }
 471         if (a->curr_state == write_pending) {
 472                 a->container->ss->set_array_state(a, 0);
 473                 a->next_state = active;
 474                 ret |= ARRAY_DIRTY;
 475         }
 476         if (a->curr_state == active_idle) {
 477                 /* Set array to 'clean' FIRST, then mark clean
 478                  * in the metadata
 479                  */
 480                 a->next_state = clean;
 481                 ret |= ARRAY_DIRTY;
 482         }
 483         if ((a->curr_state == clean) || (a->curr_state == broken)) {
 484                 a->container->ss->set_array_state(a, 1);
 485         }
 486         if (a->curr_state == active ||
 487             a->curr_state == suspended)
 488                 ret |= ARRAY_DIRTY;
 489         if (a->curr_state == readonly) {
 490                 /* Well, I'm ready to handle things.  If readonly
 491                  * wasn't requested, transition to read-auto.
 492                  */
 493                 char buf[64];
 494                 read_attr(buf, sizeof(buf), a->metadata_fd);
 495                 if (strncmp(buf, "external:-", 10) == 0) {
 496                         /* explicit request for readonly array.  Leave it alone */
 497                         ;
 498                 } else {
 499                         if (a->container->ss->set_array_state(a, 2))
 500                                 a->next_state = read_auto; /* array is clean */
 501                         else {
 502                                 a->next_state = active; /* Now active for recovery etc */
 503                                 ret |= ARRAY_DIRTY;
 504                         }
 505                 }
 506         }
 507
 508         if (!deactivate &&
 509             a->curr_action == idle &&
 510             a->prev_action == resync) {
 511                 /* A resync has finished.  The endpoint is recorded in
 512                  * 'sync_start'.  We don't update the metadata
 513                  * until the array goes inactive or readonly though.
 514                  * Just check if we need to fiddle spares.
 515                  */
 516                 a->container->ss->set_array_state(a, a->curr_state <= clean);
 517                 check_degraded = 1;
 518         }
 519
 520         if (!deactivate &&
 521             a->curr_action == idle &&
 522             a->prev_action == recover) {
 523                 /* A recovery has finished.  Some disks may be in sync now,
 524                  * and the array may no longer be degraded
 525                  */
 526                 for (mdi = a->info.devs ; mdi ; mdi = mdi->next) {
 527                         a->container->ss->set_disk(a, mdi->disk.raid_disk,
 528                                                    mdi->curr_state);
 529                         if (! (mdi->curr_state & DS_INSYNC))
 530                                 check_degraded = 1;
 531                         count++;
 532                 }
                     /* fewer (or more) member devices than raid_disks */
 533                 if (count != a->info.array.raid_disks)
 534                         check_degraded = 1;
 535         }
 536
 537         if (!deactivate &&
 538             a->curr_action == reshape &&
 539             a->prev_action != reshape)
 540                 /* reshape was requested by mdadm.  Need to see if
 541                  * new devices have been added.  Manager does that
 542                  * when it sees check_reshape
 543                  */
 544                 check_reshape = 1;
 545
 546         /* Check for failures and if found:
 547          * 1/ Record the failure in the metadata and unblock the device.
 548          *    FIXME update the kernel to stop notifying on failed drives when
 549          *    the array is readonly and we have cleared 'blocked'
 550          * 2/ Try to remove the device if the array is writable, or can be
 551          *    made writable.
 552          */
 553         for (mdi = a->info.devs ; mdi ; mdi = mdi->next) {
 554                 if (mdi->curr_state & DS_FAULTY) {
 555                         a->container->ss->set_disk(a, mdi->disk.raid_disk,
 556                                                    mdi->curr_state);
 557                         check_degraded = 1;
 558                         if (mdi->curr_state & DS_BLOCKED)
 559                                 mdi->next_state |= DS_UNBLOCK;
 560                         if (a->curr_state == read_auto) {
 561                                 a->container->ss->set_array_state(a, 0);
 562                                 a->next_state = active;
 563                         }
 564                         if (a->curr_state > readonly)
 565                                 mdi->next_state |= DS_REMOVE;
 566                 }
 567         }
 568
 569         /* Update reshape checkpoint, depending if it finished or progressed */
 570         if (a->curr_action == idle && a->prev_action == reshape) {
 571                 char buf[SYSFS_MAX_BUF_SIZE];
 572
 573                 if (sync_completed != 0)
 574                         a->last_checkpoint = sync_completed;
 575
 576                 /*
 577                  * If reshape really finished, set checkpoint to the end to finalize it.
 578                  * Do not set checkpoint if reshape is broken.
 579                  * Reshape will restart from last checkpoint.
 580                  */
 581                 if (sysfs_get_str(&a->info, NULL, "reshape_position", buf, sizeof(buf)) >= 0)
 582                         if (str_is_none(buf) == true)
 583                                 a->last_checkpoint = a->info.component_size;
 584
 585                 write_checkpoint = true;
 586         }
 587
             /*
              * NOTE(review): the test is ">= reshape", so a running reshape is
              * included here, unlike what the comment inside the block says.
              */
 588         if (a->curr_action >= reshape && sync_completed > a->last_checkpoint) {
 589                 /* Update checkpoint if neither reshape nor idle action */
 590                 a->last_checkpoint = sync_completed;
 591
 592                 write_checkpoint = true;
 593         }
 594
 595         /* Save checkpoint */
 596         if (write_checkpoint) {
 597                 a->container->ss->set_array_state(a, a->curr_state <= clean);
 598
                     /*
                      * done after set_array_state(), which therefore still saw
                      * the last_checkpoint value chosen above
                      */
 599                 if (a->curr_action <= reshape)
 600                         a->last_checkpoint = sync_completed;
 601         }
 602
             /* sync_completed reached the component size: checkpoints start over at 0 */
 603         if (sync_completed >= a->info.component_size)
 604                 a->last_checkpoint = 0;
 605
             /* single sync_metadata() call per pass, after all updates above */
 606         a->container->ss->sync_metadata(a->container);
 607         dprintf("(%d): state:%s action:%s next(", a->info.container_member,
 608                 array_states[a->curr_state], sync_actions[a->curr_action]);
 609
 610         /* Effect state changes in the array */
 611         if (a->next_state != bad_word) {
 612                 dprintf_cont(" state:%s", array_states[a->next_state]);
 613                 write_attr(array_states[a->next_state], a->info.state_fd);
 614         }
 615         if (a->next_action != bad_action) {
 616                 write_attr(sync_actions[a->next_action], a->action_fd);
 617                 dprintf_cont(" action:%s", sync_actions[a->next_action]);
 618         }
 619         for (mdi = a->info.devs; mdi ; mdi = mdi->next) {
 620                 if (mdi->next_state & DS_UNBLOCK) {
 621                         dprintf_cont(" %d:-blocked", mdi->disk.raid_disk);
 622                         write_attr("-blocked", mdi->state_fd);
 623                 }
 624
                     /* removal itself is not done here, the device is only flagged */
 625                 if ((mdi->next_state & DS_REMOVE) && !mdi->man_disk_to_remove) {
 626                         dprintf_cont(" %d:disk_to_remove", mdi->disk.raid_disk);
 627                         mdi->man_disk_to_remove = true;
 628                         disks_to_remove = true;
 629                 }
 630
 631                 if (mdi->next_state & DS_INSYNC) {
 632                         write_attr("+in_sync", mdi->state_fd);
 633                         dprintf_cont(" %d:+in_sync", mdi->disk.raid_disk);
 634                 }
 635         }
 636         dprintf_cont(" )\n");
 637
 638         /* move curr_ to prev_ */
 639         a->prev_state = a->curr_state;
 640
 641         a->prev_action = a->curr_action;
 642
 643         for (mdi = a->info.devs; mdi ; mdi = mdi->next)
 644                 mdi->prev_state = mdi->curr_state;
 645
             /* publish the follow-up requests on the array, then signal the manager */
 646         if (check_degraded || check_reshape || disks_to_remove) {
 647
 648                 a->check_member_remove |= disks_to_remove;
 649                 a->check_degraded |= check_degraded;
 650                 a->check_reshape |= check_reshape;
 651                 signal_manager();
 652         }
 653
             /* wait_and_act() has an array with a NULL container discarded */
 654         if (deactivate)
 655                 a->container = NULL;
 656
 657         return ret;
 658 }
 659
 660 static struct mdinfo *
 661 find_device(struct active_array *a, int major, int minor)
 662 {
 663         struct mdinfo *mdi;
 664
 665         for (mdi = a->info.devs ; mdi ; mdi = mdi->next)
 666                 if (mdi->disk.major == major && mdi->disk.minor == minor)
 667                         return mdi;
 668
 669         return NULL;
 670 }
 671
 672 static void reconcile_failed(struct active_array *aa, struct mdinfo *failed)
 673 {
 674         struct active_array *a;
 675         struct mdinfo *victim;
 676
 677         for (a = aa; a; a = a->next) {
 678                 if (!a->container || a->to_remove)
 679                         continue;
 680                 victim = find_device(a, failed->disk.major, failed->disk.minor);
 681                 if (!victim)
 682                         continue;
 683
 684                 if (!(victim->curr_state & DS_FAULTY))
 685                         write_attr("faulty", victim->state_fd);
 686         }
 687 }
 688
 689 #ifdef DEBUG
 /*
  * Debug helper: print, on one stderr line, every descriptor set in @fds
  * together with the last path component of its /proc/<pid>/fd/<n> link
  * target ("unknown" when the link cannot be read).
  */
 static void dprint_wake_reasons(fd_set *fds)
 {
         char proc_path[256];
         char target[256];
         int fd;

         fprintf(stderr, "monitor: wake ( ");
         for (fd = 0; fd < FD_SETSIZE; fd++) {
                 char *slash;
                 int len;

                 if (!FD_ISSET(fd, fds))
                         continue;

                 sprintf(proc_path, "/proc/%d/fd/%d", (int) getpid(), fd);

                 /* readlink() does not terminate the string, keep one byte spare */
                 len = readlink(proc_path, target, sizeof(target) - 1);
                 if (len < 0) {
                         fprintf(stderr, "%d:unknown ", fd);
                         continue;
                 }
                 target[len] = '\0';

                 slash = strrchr(target, '/');
                 fprintf(stderr, "%d:%s ", fd, slash ? slash + 1 : target);
         }
         fprintf(stderr, ")\n");
 }
 717 #endif
 718
 /*
  * Progress counter of the monitor loop: wait_and_act() makes it odd
  * (|= 1) right before calling pselect() and adds 1 once pselect() returns.
  * Not static, so presumably read from another file - TODO confirm.
  */
 719 int monitor_loop_cnt;
 720
 721 static int wait_and_act(struct supertype *container, int nowait)
 722 {
 723         struct active_array *a, **ap, **aap = &container->arrays;
 724         static unsigned int dirty_arrays = ~0; /* start at some non-zero value */
 725         struct mdinfo *mdi;
 726         int rv, maxfd = 0;
 727         fd_set rfds;
 728
 729         FD_ZERO(&rfds);
 730
 731         for (ap = aap ; *ap ;) {
 732                 a = *ap;
 733                 /* once an array has been deactivated we want to
 734                  * ask the manager to discard it.
 735                  */
 736                 if (!a->container || a->to_remove) {
 737                         if (discard_this) {
 738                                 ap = &(*ap)->next;
 739                                 continue;
 740                         }
 741                         *ap = a->next;
 742                         a->next = NULL;
 743                         discard_this = a;
 744                         signal_manager();
 745                         continue;
 746                 }
 747
 748                 add_fd(&rfds, &maxfd, a->info.state_fd);
 749                 add_fd(&rfds, &maxfd, a->action_fd);
 750                 add_fd(&rfds, &maxfd, a->sync_completed_fd);
 751
 752                 for (mdi = a->info.devs ; mdi ; mdi = mdi->next) {
 753                         if (mdi->man_disk_to_remove) {
 754                                 mdi->mon_descriptors_not_used = true;
 755
 756                                 /* Managemon could be blocked on suspend in kernel.
 757                                  * Monitor must respond if any badblock is recorded in this time.
 758                                  */
 759                                 container->retry_soon = 1;
 760                                 continue;
 761                         }
 762
 763                         add_fd(&rfds, &maxfd, mdi->state_fd);
 764                         add_fd(&rfds, &maxfd, mdi->bb_fd);
 765                         add_fd(&rfds, &maxfd, mdi->ubb_fd);
 766                 }
 767
 768                 ap = &(*ap)->next;
 769         }
 770
 771         if (manager_ready && (*aap == NULL || (sigterm && !dirty_arrays))) {
 772                 /* No interesting arrays, or we have been told to
 773                  * terminate and everything is clean.  Lets see about
 774                  * exiting.  Note that blocking at this point is not a
 775                  * problem as there are no active arrays, there is
 776                  * nothing that we need to be ready to do.
 777                  */
 778                 int fd;
 779                 if (sigterm)
 780                         fd = open_dev_excl(container->devnm);
 781                 else
 782                         fd = open_dev_flags(container->devnm, O_RDONLY|O_EXCL);
 783                 if (fd >= 0 || errno != EBUSY) {
 784                         /* OK, we are safe to leave */
 785                         if (sigterm && !dirty_arrays)
 786                                 dprintf("caught sigterm, all clean... exiting\n");
 787                         else
 788                                 dprintf("no arrays to monitor... exiting\n");
 789                         if (!sigterm)
 790                                 /* On SIGTERM, someone (the take-over mdmon) will
 791                                  * clean up
 792                                  */
 793                                 remove_pidfile(container->devnm);
 794                         exit_now = 1;
 795                         signal_manager();
 796                         close(fd);
 797                         exit(0);
 798                 }
 799         }
 800
 801         if (!nowait) {
 802                 sigset_t set;
 803                 struct timespec ts;
 804                 ts.tv_sec = 24*3600;
 805                 ts.tv_nsec = 0;
 806                 if (*aap == NULL || container->retry_soon) {
 807                         /* just waiting to get O_EXCL access */
 808                         ts.tv_sec = 0;
 809                         ts.tv_nsec = 20000000ULL;
 810                 }
 811                 sigprocmask(SIG_UNBLOCK, NULL, &set);
 812                 sigdelset(&set, SIGUSR1);
 813                 monitor_loop_cnt |= 1;
 814                 rv = pselect(maxfd+1, NULL, NULL, &rfds, &ts, &set);
 815                 monitor_loop_cnt += 1;
 816                 if (rv == -1) {
 817                         if (errno == EINTR) {
 818                                 rv = 0;
 819                                 FD_ZERO(&rfds);
 820                                 dprintf("monitor: caught signal\n");
 821                         } else
 822                                 dprintf("monitor: error %d in pselect\n",
 823                                         errno);
 824                 }
 825                 #ifdef DEBUG
 826                 else
 827                         dprint_wake_reasons(&rfds);
 828                 #endif
 829                 container->retry_soon = 0;
 830         }
 831
 832         if (update_queue) {
 833                 struct metadata_update *this;
 834
 835                 for (this = update_queue; this ; this = this->next)
 836                         container->ss->process_update(container, this);
 837
 838                 update_queue_handled = update_queue;
 839                 update_queue = NULL;
 840                 signal_manager();
 841                 container->ss->sync_metadata(container);
 842         }
 843
 844         rv = 0;
 845         dirty_arrays = 0;
 846         for (a = *aap; a ; a = a->next) {
 847
 848                 if (a->replaces && !discard_this) {
 849                         struct active_array **ap;
 850                         for (ap = &a->next; *ap && *ap != a->replaces;
 851                              ap = & (*ap)->next)
 852                                 ;
 853                         if (*ap)
 854                                 *ap = (*ap)->next;
 855                         discard_this = a->replaces;
 856                         a->replaces = NULL;
 857                         /* FIXME check if device->state_fd need to be cleared?*/
 858                         signal_manager();
 859                 }
 860                 if (a->container && !a->to_remove) {
 861                         int ret = read_and_act(a);
 862
 863                         rv |= 1;
 864                         dirty_arrays += !!(ret & ARRAY_DIRTY);
 865                         /* when terminating stop manipulating the array after it
 866                          * is clean, but make sure read_and_act() is given a
 867                          * chance to handle 'active_idle'
 868                          */
 869                         if (sigterm && !(ret & ARRAY_DIRTY))
 870                                 a->container = NULL; /* stop touching this array */
 871                         if (ret & ARRAY_BUSY)
 872                                 container->retry_soon = 1;
 873                 }
 874         }
 875
 876         /* propagate failures across container members */
 877         for (a = *aap; a ; a = a->next) {
 878                 if (!a->container || a->to_remove)
 879                         continue;
 880                 for (mdi = a->info.devs ; mdi ; mdi = mdi->next)
 881                         if (mdi->curr_state & DS_FAULTY)
 882                                 reconcile_failed(*aap, mdi);
 883         }
 884
 885         return rv;
 886 }
 887
 888 void do_monitor(struct supertype *container)
 889 {
 890         int rv;
 891         int first = 1;
 892         do {
 893                 rv = wait_and_act(container, first);
 894                 first = 0;
 895         } while (rv >= 0);
 896 }