Monitor.c

   1 /*
   2  * mdadm - manage Linux "md" devices aka RAID arrays.
   3  *
   4  * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
   5  *
   6  *
   7  *    This program is free software; you can redistribute it and/or modify
   8  *    it under the terms of the GNU General Public License as published by
   9  *    the Free Software Foundation; either version 2 of the License, or
  10  *    (at your option) any later version.
  11  *
  12  *    This program is distributed in the hope that it will be useful,
  13  *    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15  *    GNU General Public License for more details.
  16  *
  17  *    You should have received a copy of the GNU General Public License
  18  *    along with this program; if not, write to the Free Software
  19  *    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  20  *
  21  *    Author: Neil Brown
  22  *    Email: <neilb@suse.de>
  23  */
  24
  25 #include        "mdadm.h"
  26 #include        "md_p.h"
  27 #include        "md_u.h"
  28 #include        <sys/wait.h>
  29 #include        <limits.h>
  30 #include        <syslog.h>
  31 #ifndef NO_LIBUDEV
  32 #include        <libudev.h>
  33 #endif
  34
  /* Size in bytes of the event_name buffer in struct event_data. */
  35 #define EVENT_NAME_MAX 32
  36
  37 struct state {
  38         char devname[MD_NAME_MAX + sizeof("/dev/md/")]; /* length of "/dev/md/" + device name + terminating byte*/
  39         char devnm[MD_NAME_MAX];        /* to sync with mdstat info */
  40         unsigned int utime;
  41         int err;
  42         char *spare_group;
  43         int active, working, failed, spare, raid;
  44         int from_config;
  45         int from_auto;
  46         int expected_spares;
  47         int devstate[MAX_DISKS];
  48         dev_t devid[MAX_DISKS];
  49         int percent;
  50         char parent_devnm[MD_NAME_MAX]; /* For subarray, devnm of parent.
  51                                         * For others, ""
  52                                         */
  53         struct supertype *metadata;
  54         struct state *subarray;/* for a container it is a link to first subarray
  55                                 * for a subarray it is a link to next subarray
  56                                 * in the same container */
  57         struct state *parent;  /* for a subarray it is a link to its container
  58                                 */
  59         struct state *next;
  60 };
  61
  /*
   * Alert settings shared by all notification paths. The single instance,
   * info, is filled in by Monitor() before the monitoring loop starts.
   */
  struct alert_info {
          /*
           * Local host name as returned by gethostname().
           * HOST_NAME_MAX does not count the terminating NUL byte, hence +1;
           * without it a maximum-length name would lose its last character.
           */
          char hostname[HOST_NAME_MAX + 1];
          char *mailaddr;         /* alert recipient, NULL when mail is disabled */
          char *mailfrom;         /* optional "From:" value, may be NULL */
          char *alert_cmd;        /* program run for every event, may be NULL */
          int dosyslog;           /* non-zero: events are also sent to syslog */
          int test;               /* non-zero: emit a TestMessage for each array */
  } info;
  70
  /*
   * Events the monitor can report.
   *
   * Declaration order is significant: the two __SYSLOG_PRIORITY_* entries are
   * not events but boundary markers. get_syslog_event_priority() returns
   * LOG_CRIT for anything greater than __SYSLOG_PRIORITY_CRITICAL,
   * LOG_WARNING for anything greater than __SYSLOG_PRIORITY_WARNING and
   * LOG_INFO for the rest.
   */
  enum event {
          /* --- logged with LOG_INFO --- */
          EVENT_SPARE_ACTIVE = 0,
          EVENT_NEW_ARRAY,
          EVENT_MOVE_SPARE,
          EVENT_TEST_MESSAGE,

          __SYSLOG_PRIORITY_WARNING,      /* marker, not an event */

          /* --- logged with LOG_WARNING --- */
          EVENT_REBUILD_STARTED,
          EVENT_REBUILD,
          EVENT_REBUILD_FINISHED,
          EVENT_SPARES_MISSING,

          __SYSLOG_PRIORITY_CRITICAL,     /* marker, not an event */

          /* --- logged with LOG_CRIT --- */
          EVENT_DEVICE_DISAPPEARED,
          EVENT_FAIL,
          EVENT_FAIL_SPARE,
          EVENT_DEGRADED_ARRAY,
          EVENT_UNKNOWN
  };
  88
  /*
   * Event name <-> enum event table, terminated by the entry with a NULL
   * name. alert() resolves names from it through map_num_s().
   * The __SYSLOG_PRIORITY_* markers of enum event have no entry here.
   */
  89 mapping_t events_map[] = {
  90         {"SpareActive", EVENT_SPARE_ACTIVE},
  91         {"NewArray", EVENT_NEW_ARRAY},
  92         {"MoveSpare", EVENT_MOVE_SPARE},
  93         {"TestMessage", EVENT_TEST_MESSAGE},
  94         {"RebuildStarted", EVENT_REBUILD_STARTED},
  95         {"Rebuild", EVENT_REBUILD}, /* alert() appends the two-digit progress */
  96         {"RebuildFinished", EVENT_REBUILD_FINISHED},
  97         {"SparesMissing", EVENT_SPARES_MISSING},
  98         {"DeviceDisappeared", EVENT_DEVICE_DISAPPEARED},
  99         {"Fail", EVENT_FAIL},
 100         {"FailSpare", EVENT_FAIL_SPARE},
 101         {"DegradedArray", EVENT_DEGRADED_ARRAY},
 102         {NULL, EVENT_UNKNOWN}
 103 };
 104
 /*
  * Everything alert() collects about one event before handing it to the
  * notification paths (alert command, email, syslog).
  */
 105 struct event_data {
 106         enum event event_enum; /* event being reported */
 107         /*
 108          * @event_name: Rebuild event name must be in form "RebuildXX", where XX is rebuild progress.
 109          */
 110         char event_name[EVENT_NAME_MAX];
 111         char message[BUFSIZ]; /* text produced by sprint_event_message() */
 112         const char *description; /* optional extra detail, may be NULL */
 113         const char *dev; /* md device name */
 114         const char *disc; /* component device, may be NULL */
 115 };
 116
 /*
  * Forward declarations of the file-local helpers used by Monitor().
  * Each helper is declared exactly once.
  */
 static int add_new_arrays(struct mdstat_ent *mdstat, struct state **statelist);
 static void try_spare_migration(struct state *statelist);
 static void link_containers_with_subarrays(struct state *list);
 static void free_statelist(struct state *statelist);
 static int check_array(struct state *st, struct mdstat_ent *mdstat, int increments, char *prefer);
 static int check_one_sharer(int scan);
 #ifndef NO_LIBUDEV
 static int check_udev_activity(void);
 #endif
 static int make_daemon(char *pidfile);
 static void write_autorebuild_pid(void);
 130
 131 int Monitor(struct mddev_dev *devlist,
 132             char *mailaddr, char *alert_cmd,
 133             struct context *c,
 134             int daemonise, int oneshot,
 135             int dosyslog, char *pidfile, int increments,
 136             int share)
 137 {
 138         /*
 139          * Every few seconds, scan every md device looking for changes
 140          * When a change is found, log it, possibly run the alert command,
 141          * and possibly send Email
 142          *
 143          * For each array, we record:
 144          *   Update time
 145          *   active/working/failed/spare drives
 146          *   State of each device.
 147          *   %rebuilt if rebuilding
 148          *
 149          * If the update time changes, check out all the data again
 150          * It is possible that we cannot get the state of each device
 151          * due to bugs in the md kernel module.
 152          * We also read /proc/mdstat to get rebuild percent,
 153          * and to get state on all active devices incase of kernel bug.
 154          *
 155          * Events are:
 156          *    Fail
 157          *      An active device had Faulty set or Active/Sync removed
 158          *    FailSpare
 159          *      A spare device had Faulty set
 160          *    SpareActive
 161          *      An active device had a reverse transition
 162          *    RebuildStarted
 163          *      percent went from -1 to +ve
 164          *    RebuildNN
 165          *      percent went from below to not-below NN%
 166          *    DeviceDisappeared
 167          *      Couldn't access a device which was previously visible
 168          *
 169          * if we detect an array with active<raid and spare==0
 170          * we look at other arrays that have same spare-group
 171          * If we find one with active==raid and spare>0,
 172          *  and if we can get_disk_info and find a name
 173          *  Then we hot-remove and hot-add to the other array
 174          *
 175          * If devlist is NULL, then we can monitor everything if --scan
 176          * was given.  We get an initial list from config file and add anything
 177          * that appears in /proc/mdstat
 178          */
 179
 180         struct state *statelist = NULL;
 181         int finished = 0;
 182         struct mdstat_ent *mdstat = NULL;
 183         char *mailfrom;
 184         struct mddev_ident *mdlist;
 185         int delay_for_event = c->delay;
 186
 187         if (devlist && c->scan) {
 188                 pr_err("Devices list and --scan option cannot be combined - not monitoring.\n");
 189                 return 1;
 190         }
 191
 192         if (!mailaddr)
 193                 mailaddr = conf_get_mailaddr();
 194
 195         if (!alert_cmd)
 196                 alert_cmd = conf_get_program();
 197
 198         mailfrom = conf_get_mailfrom();
 199
 200         if (c->scan && !mailaddr && !alert_cmd && !dosyslog) {
 201                 pr_err("No mail address or alert command - not monitoring.\n");
 202                 return 1;
 203         }
 204
 205         if (c->verbose) {
 206                 pr_err("Monitor is started with delay %ds\n", c->delay);
 207                 if (mailaddr)
 208                         pr_err("Monitor using email address %s\n", mailaddr);
 209                 if (alert_cmd)
 210                         pr_err("Monitor using program %s\n", alert_cmd);
 211         }
 212
 213         info.alert_cmd = alert_cmd;
 214         info.mailaddr = mailaddr;
 215         info.mailfrom = mailfrom;
 216         info.dosyslog = dosyslog;
 217         info.test = c->test;
 218
 219         if (gethostname(info.hostname, sizeof(info.hostname)) != 0) {
 220                 pr_err("Cannot get hostname.\n");
 221                 return 1;
 222         }
 223         info.hostname[sizeof(info.hostname) - 1] = '\0';
 224
 225         if (share){
 226                 if (check_one_sharer(c->scan))
 227                         return 1;
 228         }
 229
 230         if (daemonise) {
 231                 int rv = make_daemon(pidfile);
 232                 if (rv >= 0)
 233                         return rv;
 234         }
 235
 236         if (share)
 237                 write_autorebuild_pid();
 238
 239         if (devlist == NULL) {
 240                 mdlist = conf_get_ident(NULL);
 241                 for (; mdlist; mdlist = mdlist->next) {
 242                         struct state *st;
 243
 244                         if (mdlist->devname == NULL)
 245                                 continue;
 246                         if (strcasecmp(mdlist->devname, "<ignore>") == 0)
 247                                 continue;
 248                         if (!is_mddev(mdlist->devname))
 249                                 continue;
 250
 251                         st = xcalloc(1, sizeof *st);
 252                         snprintf(st->devname, MD_NAME_MAX + sizeof("/dev/md/"),
 253                                  "/dev/md/%s", basename(mdlist->devname));
 254                         st->next = statelist;
 255                         st->devnm[0] = 0;
 256                         st->percent = RESYNC_UNKNOWN;
 257                         st->from_config = 1;
 258                         st->expected_spares = mdlist->spare_disks;
 259                         if (mdlist->spare_group)
 260                                 st->spare_group = xstrdup(mdlist->spare_group);
 261                         statelist = st;
 262                 }
 263         } else {
 264                 struct mddev_dev *dv;
 265
 266                 for (dv = devlist; dv; dv = dv->next) {
 267                         struct state *st;
 268
 269                         if (!is_mddev(dv->devname))
 270                                 continue;
 271
 272                         st = xcalloc(1, sizeof *st);
 273                         mdlist = conf_get_ident(dv->devname);
 274                         snprintf(st->devname, MD_NAME_MAX + sizeof("/dev/md/"), "%s", dv->devname);
 275                         st->next = statelist;
 276                         st->devnm[0] = 0;
 277                         st->percent = RESYNC_UNKNOWN;
 278                         st->expected_spares = -1;
 279                         if (mdlist) {
 280                                 st->expected_spares = mdlist->spare_disks;
 281                                 if (mdlist->spare_group)
 282                                         st->spare_group = xstrdup(mdlist->spare_group);
 283                         }
 284                         statelist = st;
 285                 }
 286         }
 287
 288         while (!finished) {
 289                 int new_found = 0;
 290                 struct state *st, **stp;
 291                 int anydegraded = 0;
 292                 int anyredundant = 0;
 293
 294                 if (mdstat)
 295                         free_mdstat(mdstat);
 296                 mdstat = mdstat_read(oneshot ? 0 : 1, 0);
 297
 298                 for (st = statelist; st; st = st->next) {
 299                         if (check_array(st, mdstat, increments, c->prefer))
 300                                 anydegraded = 1;
 301                         /* for external arrays, metadata is filled for
 302                          * containers only
 303                          */
 304                         if (st->metadata && st->metadata->ss->external)
 305                                 continue;
 306                         if (st->err == 0 && !anyredundant)
 307                                 anyredundant = 1;
 308                 }
 309
 310                 /* now check if there are any new devices found in mdstat */
 311                 if (c->scan)
 312                         new_found = add_new_arrays(mdstat, &statelist);
 313
 314                 /* If an array has active < raid && spare == 0 && spare_group != NULL
 315                  * Look for another array with spare > 0 and active == raid and same spare_group
 316                  * if found, choose a device and hotremove/hotadd
 317                  */
 318                 if (share && anydegraded)
 319                         try_spare_migration(statelist);
 320                 if (!new_found) {
 321                         if (oneshot)
 322                                 break;
 323                         else if (!anyredundant) {
 324                                 pr_err("No array with redundancy detected, stopping\n");
 325                                 break;
 326                         }
 327                         else {
 328 #ifndef NO_LIBUDEV
 329                                 /*
 330                                  * Wait for udevd to finish new devices
 331                                  * processing.
 332                                  */
 333                                 if (mdstat_wait(delay_for_event) &&
 334                                     check_udev_activity())
 335                                         pr_err("Error while waiting for UDEV to complete new devices processing\n");
 336 #else
 337                                 int wait_result = mdstat_wait(delay_for_event);
 338                                 /*
 339                                  * Give chance to process new device
 340                                  */
 341                                 if (wait_result != 0) {
 342                                         if (c->delay > 5)
 343                                                 delay_for_event = 5;
 344                                 } else
 345                                         delay_for_event = c->delay;
 346 #endif
 347                                 mdstat_close();
 348                         }
 349                 }
 350                 info.test = 0;
 351
 352                 for (stp = &statelist; (st = *stp) != NULL; ) {
 353                         if (st->from_auto && st->err > 5) {
 354                                 *stp = st->next;
 355                                 if (st->spare_group)
 356                                         free(st->spare_group);
 357
 358                                 free(st);
 359                         } else
 360                                 stp = &st->next;
 361                 }
 362         }
 363
 364         free_statelist(statelist);
 365
 366         if (pidfile)
 367                 unlink(pidfile);
 368         return 0;
 369 }
 370
 /*
  * make_daemon() - fork the monitor into the background.
  * @pidfile: file that receives the child's PID; when NULL the PID is
  *           printed on stdout instead
  *
  * A failure to write the pid file is reported but does not change the
  * return value.
  *
  * Return:
  * -1 in the forked daemon
  *  0 in the parent
  *  1 on error
  * so a non-negative value becomes the exit code.
  */
 static int make_daemon(char *pidfile)
 {
         int pid = fork();

         if (pid < 0) {
                 perror("daemonise");
                 return 1;
         }

         if (pid > 0) {
                 /* parent: publish the daemon's PID, then report success */
                 if (!pidfile) {
                         printf("%d\n", pid);
                 } else {
                         FILE *pid_file = NULL;
                         int fd = open(pidfile, O_WRONLY | O_CREAT | O_TRUNC,
                                       0644);

                         if (fd >= 0)
                                 pid_file = fdopen(fd, "w");
                         if (!pid_file) {
                                 /* report first: close() may overwrite errno */
                                 perror("cannot create pid file");
                                 /* fdopen() failed: the descriptor is still ours */
                                 if (fd >= 0)
                                         close(fd);
                         } else {
                                 fprintf(pid_file, "%d\n", pid);
                                 fclose(pid_file);
                         }
                 }
                 return 0;
         }

         /* child: becomes the daemon */
         manage_fork_fds(0);
         setsid();
         return -1;
 }
 406
 407 static int check_one_sharer(int scan)
 408 {
 409         int pid;
 410         FILE *comm_fp;
 411         FILE *fp;
 412         char comm_path[PATH_MAX];
 413         char path[PATH_MAX];
 414         char comm[20];
 415
 416         sprintf(path, "%s/autorebuild.pid", MDMON_DIR);
 417         fp = fopen(path, "r");
 418         if (fp) {
 419                 if (fscanf(fp, "%d", &pid) != 1)
 420                         pid = -1;
 421                 snprintf(comm_path, sizeof(comm_path),
 422                          "/proc/%d/comm", pid);
 423                 comm_fp = fopen(comm_path, "r");
 424                 if (comm_fp) {
 425                         if (fscanf(comm_fp, "%19s", comm) &&
 426                             strncmp(basename(comm), Name, strlen(Name)) == 0) {
 427                                 if (scan) {
 428                                         pr_err("Only one autorebuild process allowed in scan mode, aborting\n");
 429                                         fclose(comm_fp);
 430                                         fclose(fp);
 431                                         return 1;
 432                                 } else {
 433                                         pr_err("Warning: One autorebuild process already running.\n");
 434                                 }
 435                         }
 436                         fclose(comm_fp);
 437                 }
 438                 fclose(fp);
 439         }
 440         return 0;
 441 }
 442
 443 static void write_autorebuild_pid()
 444 {
 445         char path[PATH_MAX];
 446         int pid;
 447         FILE *fp = NULL;
 448         sprintf(path, "%s/autorebuild.pid", MDMON_DIR);
 449
 450         if (mkdir(MDMON_DIR, 0700) < 0 && errno != EEXIST) {
 451                 pr_err("Can't create autorebuild.pid file\n");
 452         } else {
 453                 int fd = open(path, O_WRONLY | O_CREAT | O_TRUNC, 0700);
 454
 455                 if (fd >= 0)
 456                         fp = fdopen(fd, "w");
 457
 458                 if (!fp)
 459                         pr_err("Can't create autorebuild.pid file\n");
 460                 else {
 461                         pid = getpid();
 462                         fprintf(fp, "%d\n", pid);
 463                         fclose(fp);
 464                 }
 465         }
 466 }
 467
 468 #define BASE_MESSAGE "%s event detected on md device %s"
 469 #define COMPONENT_DEVICE_MESSAGE ", component device %s"
 470 #define DESCRIPTION_MESSAGE ": %s"
 471 /*
 472  * sprint_event_message() - Writes basic message about detected event to destination ptr.
 473  * @dest: message destination, should be at least the size of BUFSIZ
 474  * @data: event data
 475  *
 476  * Return: 0 on success, 1 on error
 477  */
 478 static int sprint_event_message(char *dest, const struct event_data *data)
 479 {
 480         if (!dest || !data)
 481                 return 1;
 482
 483         if (data->disc && data->description)
 484                 snprintf(dest, BUFSIZ, BASE_MESSAGE COMPONENT_DEVICE_MESSAGE DESCRIPTION_MESSAGE,
 485                          data->event_name, data->dev, data->disc, data->description);
 486         else if (data->disc)
 487                 snprintf(dest, BUFSIZ, BASE_MESSAGE COMPONENT_DEVICE_MESSAGE,
 488                          data->event_name, data->dev, data->disc);
 489         else if (data->description)
 490                 snprintf(dest, BUFSIZ, BASE_MESSAGE DESCRIPTION_MESSAGE,
 491                          data->event_name, data->dev, data->description);
 492         else
 493                 snprintf(dest, BUFSIZ, BASE_MESSAGE, data->event_name, data->dev);
 494
 495         return 0;
 496 }
 497
 498 /*
 499  * get_syslog_event_priority() - Determines event priority.
 500  * @event_enum: event to be checked
 501  *
 502  * Return: LOG_CRIT, LOG_WARNING or LOG_INFO
 503  */
 504 static int get_syslog_event_priority(const enum event event_enum)
 505 {
 506         if (event_enum > __SYSLOG_PRIORITY_CRITICAL)
 507                 return LOG_CRIT;
 508         if (event_enum > __SYSLOG_PRIORITY_WARNING)
 509                 return LOG_WARNING;
 510         return LOG_INFO;
 511 }
 512
 513 /*
 514  * is_email_event() - Determines whether email for event should be sent or not.
 515  * @event_enum: event to be checked
 516  *
 517  * Return: true if email should be sent, false otherwise
 518  */
 519 static bool is_email_event(const enum event event_enum)
 520 {
 521         static const enum event email_events[] = {
 522         EVENT_FAIL,
 523         EVENT_FAIL_SPARE,
 524         EVENT_DEGRADED_ARRAY,
 525         EVENT_SPARES_MISSING,
 526         EVENT_TEST_MESSAGE
 527         };
 528         unsigned int i;
 529
 530         for (i = 0; i < ARRAY_SIZE(email_events); ++i) {
 531                 if (event_enum == email_events[i])
 532                         return true;
 533         }
 534         return false;
 535 }
 536
 537 /*
 538  * execute_alert_cmd() - Forks and executes command provided as alert_cmd.
 539  * @data: event data
 540  */
 541 static void execute_alert_cmd(const struct event_data *data)
 542 {
 543         int pid = fork();
 544
 545         switch (pid) {
 546         default:
 547                 waitpid(pid, NULL, 0);
 548                 break;
 549         case -1:
 550                 pr_err("Cannot fork to execute alert command");
 551                 break;
 552         case 0:
 553                 execl(info.alert_cmd, info.alert_cmd, data->event_name, data->dev, data->disc, NULL);
 554                 exit(2);
 555         }
 556 }
 557
 558 /*
 559  * send_event_email() - Sends an email about event detected by monitor.
 560  * @data: event data
 561  */
 562 static void send_event_email(const struct event_data *data)
 563 {
 564         FILE *mp, *mdstat;
 565         char buf[BUFSIZ];
 566         int n;
 567
 568         mp = popen(Sendmail, "w");
 569         if (!mp) {
 570                 pr_err("Cannot open pipe stream for sendmail.\n");
 571                 return;
 572         }
 573
 574         signal(SIGPIPE, SIG_IGN);
 575         if (info.mailfrom)
 576                 fprintf(mp, "From: %s\n", info.mailfrom);
 577         else
 578                 fprintf(mp, "From: %s monitoring <root>\n", Name);
 579         fprintf(mp, "To: %s\n", info.mailaddr);
 580         fprintf(mp, "Subject: %s event on %s:%s\n\n", data->event_name, data->dev, info.hostname);
 581         fprintf(mp, "This is an automatically generated mail message.\n");
 582         fprintf(mp, "%s\n", data->message);
 583
 584         mdstat = fopen("/proc/mdstat", "r");
 585         if (!mdstat) {
 586                 pr_err("Cannot open /proc/mdstat\n");
 587                 pclose(mp);
 588                 return;
 589         }
 590
 591         fprintf(mp, "The /proc/mdstat file currently contains the following:\n\n");
 592         while ((n = fread(buf, 1, sizeof(buf), mdstat)) > 0)
 593                 n = fwrite(buf, 1, n, mp);
 594         fclose(mdstat);
 595         pclose(mp);
 596 }
 597
 598 /*
 599  * log_event_to_syslog() - Logs an event into syslog.
 600  * @data: event data
 601  */
 602 static void log_event_to_syslog(const struct event_data *data)
 603 {
 604         int priority;
 605
 606         priority = get_syslog_event_priority(data->event_enum);
 607
 608         syslog(priority, "%s\n", data->message);
 609 }
 610
 611 /*
 612  * alert() - Alerts about the monitor event.
 613  * @event_enum: event to be sent
 614  * @description: event description
 615  * @progress: rebuild progress
 616  * @dev: md device name
 617  * @disc: component device
 618  *
 619  * If needed function executes alert command, sends an email or logs event to syslog.
 620  */
 621 static void alert(const enum event event_enum, const char *description, const uint8_t progress,
 622                   const char *dev, const char *disc)
 623 {
 624         struct event_data data = {.dev = dev, .disc = disc, .description = description};
 625
 626         if (!dev)
 627                 return;
 628
 629         if (event_enum == EVENT_REBUILD) {
 630                 snprintf(data.event_name, sizeof(data.event_name), "%s%02d",
 631                          map_num_s(events_map, EVENT_REBUILD), progress);
 632         } else {
 633                 snprintf(data.event_name, sizeof(data.event_name), "%s", map_num_s(events_map, event_enum));
 634         }
 635
 636         data.event_enum = event_enum;
 637
 638         if (sprint_event_message(data.message, &data) != 0) {
 639                 pr_err("Cannot create event message.\n");
 640                 return;
 641         }
 642         pr_err("%s\n", data.message);
 643
 644         if (info.alert_cmd)
 645                 execute_alert_cmd(&data);
 646
 647         if (info.mailaddr && is_email_event(event_enum))
 648                 send_event_email(&data);
 649
 650         if (info.dosyslog)
 651                 log_event_to_syslog(&data);
 652 }
 653
 654 static int check_array(struct state *st, struct mdstat_ent *mdstat,
 655                        int increments, char *prefer)
 656 {
 657         /* Update the state 'st' to reflect any changes shown in mdstat,
 658          * or found by directly examining the array, and return
 659          * '1' if the array is degraded, or '0' if it is optimal (or dead).
 660          */
 661         struct { int state, major, minor; } disks_info[MAX_DISKS];
 662         struct mdinfo *sra = NULL;
 663         mdu_array_info_t array;
 664         struct mdstat_ent *mse = NULL, *mse2;
 665         char *dev = st->devname;
 666         int fd;
 667         int i;
 668         int remaining_disks;
 669         int last_disk;
 670         int new_array = 0;
 671         int retval;
 672         int is_container = 0;
 673         unsigned long redundancy_only_flags = 0;
 674
 675         if (info.test)
 676                 alert(EVENT_TEST_MESSAGE, NULL, 0, dev, NULL);
 677
 678         retval = 0;
 679
 680         fd = open(dev, O_RDONLY);
 681         if (fd < 0)
 682                 goto disappeared;
 683
 684         if (st->devnm[0] == 0)
 685                 snprintf(st->devnm, MD_NAME_MAX, "%s", fd2devnm(fd));
 686
 687         for (mse2 = mdstat; mse2; mse2 = mse2->next)
 688                 if (strcmp(mse2->devnm, st->devnm) == 0) {
 689                         mse2->devnm[0] = 0; /* flag it as "used" */
 690                         mse = mse2;
 691                 }
 692
 693         if (!mse) {
 694                 /* duplicated array in statelist
 695                  * or re-created after reading mdstat
 696                  */
 697                 st->err++;
 698                 goto out;
 699         }
 700
 701         if (mse->level == NULL)
 702                 is_container = 1;
 703
 704         if (!is_container && !md_array_active(fd))
 705                 goto disappeared;
 706
 707         fcntl(fd, F_SETFD, FD_CLOEXEC);
 708         if (md_get_array_info(fd, &array) < 0)
 709                 goto disappeared;
 710
 711         if (!is_container && map_name(pers, mse->level) > 0)
 712                 redundancy_only_flags |= GET_MISMATCH;
 713
 714         sra = sysfs_read(-1, st->devnm, GET_LEVEL | GET_DISKS | GET_DEVS |
 715                         GET_STATE | redundancy_only_flags);
 716
 717         if (!sra)
 718                 goto disappeared;
 719
 720         /* It's much easier to list what array levels can't
 721          * have a device disappear than all of them that can
 722          */
 723         if (sra->array.level == 0 || sra->array.level == -1) {
 724                 if (!st->err && !st->from_config)
 725                         alert(EVENT_DEVICE_DISAPPEARED, "Wrong-Level", 0, dev, NULL);
 726                 st->err++;
 727                 goto out;
 728         }
 729
 730         /* this array is in /proc/mdstat */
 731         if (array.utime == 0)
 732                 /* external arrays don't update utime, so
 733                  * just make sure it is always different. */
 734                 array.utime = st->utime + 1;;
 735
 736         if (st->err) {
 737                 /* New array appeared where previously had an error */
 738                 st->err = 0;
 739                 st->percent = RESYNC_NONE;
 740                 new_array = 1;
 741                 if (!is_container)
 742                         alert(EVENT_NEW_ARRAY, NULL, 0, st->devname, NULL);
 743         }
 744
 745         if (st->utime == array.utime && st->failed == sra->array.failed_disks &&
 746             st->working == sra->array.working_disks &&
 747             st->spare == sra->array.spare_disks &&
 748             (mse == NULL || (mse->percent == st->percent))) {
 749                 if ((st->active < st->raid) && st->spare == 0)
 750                         retval = 1;
 751                 goto out;
 752         }
 753         if (st->utime == 0 && /* new array */
 754             mse->pattern && strchr(mse->pattern, '_') /* degraded */)
 755                 alert(EVENT_DEGRADED_ARRAY, NULL, 0, dev, NULL);
 756
 757         if (st->utime == 0 && /* new array */ st->expected_spares > 0 &&
 758             sra->array.spare_disks < st->expected_spares)
 759                 alert(EVENT_SPARES_MISSING, NULL, 0, dev, NULL);
 760         if (st->percent < 0 && st->percent != RESYNC_UNKNOWN &&
 761             mse->percent >= 0)
 762                 alert(EVENT_REBUILD_STARTED, NULL, 0, dev, NULL);
 763         if (st->percent >= 0 && mse->percent >= 0 &&
 764             (mse->percent / increments) > (st->percent / increments)) {
 765                 if((mse->percent / increments) == 0)
 766                         alert(EVENT_REBUILD_STARTED, NULL, 0, dev, NULL);
 767                 else
 768                         alert(EVENT_REBUILD, NULL, mse->percent, dev, NULL);
 769         }
 770
 771         if (mse->percent == RESYNC_NONE && st->percent >= 0) {
 772                 /* Rebuild/sync/whatever just finished.
 773                  * If there is a number in /mismatch_cnt,
 774                  * we should report that.
 775                  */
 776                 if (sra && sra->mismatch_cnt > 0) {
 777                         char cnt[80];
 778                         snprintf(cnt, sizeof(cnt),
 779                                  " mismatches found: %d (on raid level %d)",
 780                                  sra->mismatch_cnt, sra->array.level);
 781                         alert(EVENT_REBUILD_FINISHED, NULL, 0, dev, cnt);
 782                 } else
 783                         alert(EVENT_REBUILD_FINISHED, NULL, 0, dev, NULL);
 784         }
 785         st->percent = mse->percent;
 786
 787         remaining_disks = sra->array.nr_disks;
 788         for (i = 0; i < MAX_DISKS && remaining_disks > 0; i++) {
 789                 mdu_disk_info_t disc;
 790                 disc.number = i;
 791                 if (md_get_disk_info(fd, &disc) >= 0) {
 792                         disks_info[i].state = disc.state;
 793                         disks_info[i].major = disc.major;
 794                         disks_info[i].minor = disc.minor;
 795                         if (disc.major || disc.minor)
 796                                 remaining_disks --;
 797                 } else
 798                         disks_info[i].major = disks_info[i].minor = 0;
 799         }
 800         last_disk = i;
 801
 802         if (mse->metadata_version &&
 803             strncmp(mse->metadata_version, "external:", 9) == 0 &&
 804             is_subarray(mse->metadata_version+9)) {
 805                 char *sl;
 806                 snprintf(st->parent_devnm, MD_NAME_MAX, "%s", mse->metadata_version + 10);
 807                 sl = strchr(st->parent_devnm, '/');
 808                 if (sl)
 809                         *sl = 0;
 810         } else
 811                 st->parent_devnm[0] = 0;
 812         if (st->metadata == NULL && st->parent_devnm[0] == 0)
 813                 st->metadata = super_by_fd(fd, NULL);
 814
 815         for (i = 0; i < MAX_DISKS; i++) {
 816                 mdu_disk_info_t disc = {0, 0, 0, 0, 0};
 817                 int newstate = 0;
 818                 int change;
 819                 char *dv = NULL;
 820                 disc.number = i;
 821                 if (i < last_disk && (disks_info[i].major || disks_info[i].minor)) {
 822                         newstate = disks_info[i].state;
 823                         dv = map_dev_preferred(disks_info[i].major, disks_info[i].minor, 1,
 824                                                prefer);
 825                         disc.state = newstate;
 826                         disc.major = disks_info[i].major;
 827                         disc.minor = disks_info[i].minor;
 828                 } else
 829                         newstate = (1 << MD_DISK_REMOVED);
 830
 831                 if (dv == NULL && st->devid[i])
 832                         dv = map_dev_preferred(major(st->devid[i]),
 833                                                minor(st->devid[i]), 1, prefer);
 834                 change = newstate ^ st->devstate[i];
 835                 if (st->utime && change && !st->err && !new_array) {
 836                         if ((st->devstate[i]&change) & (1 << MD_DISK_SYNC))
 837                                 alert(EVENT_FAIL, NULL, 0, dev, dv);
 838                         else if ((newstate & (1 << MD_DISK_FAULTY)) &&
 839                                  (disc.major || disc.minor) &&
 840                                  st->devid[i] == makedev(disc.major,
 841                                                          disc.minor))
 842                                 alert(EVENT_FAIL_SPARE, NULL, 0, dev, dv);
 843                         else if ((newstate&change) & (1 << MD_DISK_SYNC))
 844                                 alert(EVENT_SPARE_ACTIVE, NULL, 0, dev, dv);
 845                 }
 846                 st->devstate[i] = newstate;
 847                 st->devid[i] = makedev(disc.major, disc.minor);
 848         }
 849         st->active = sra->array.active_disks;
 850         st->working = sra->array.working_disks;
 851         st->spare = sra->array.spare_disks;
 852         st->failed = sra->array.failed_disks;
 853         st->utime = array.utime;
 854         st->raid = sra->array.raid_disks;
 855         st->err = 0;
 856         if ((st->active < st->raid) && st->spare == 0)
 857                 retval = 1;
 858
 859  out:
 860         if (sra)
 861                 sysfs_free(sra);
 862         if (fd >= 0)
 863                 close(fd);
 864         return retval;
 865
 866  disappeared:
 867         if (!st->err && !is_container)
 868                 alert(EVENT_DEVICE_DISAPPEARED, NULL, 0, dev, NULL);
 869         st->err++;
 870         goto out;
 871 }
 872
 873 static int add_new_arrays(struct mdstat_ent *mdstat, struct state **statelist)
 874 {
 875         struct mdstat_ent *mse;
 876         int new_found = 0;
 877         char *name;
 878
 879         for (mse = mdstat; mse; mse = mse->next)
 880                 if (mse->devnm[0] && (!mse->level || /* retrieve containers */
 881                                       (strcmp(mse->level, "raid0") != 0 &&
 882                                        strcmp(mse->level, "linear") != 0))) {
 883                         struct state *st = xcalloc(1, sizeof *st);
 884                         mdu_array_info_t array;
 885                         int fd;
 886
 887                         name = get_md_name(mse->devnm);
 888                         if (!name) {
 889                                 free(st);
 890                                 continue;
 891                         }
 892
 893                         snprintf(st->devname, MD_NAME_MAX + sizeof("/dev/md/"), "%s", name);
 894                         if ((fd = open(st->devname, O_RDONLY)) < 0 ||
 895                             md_get_array_info(fd, &array) < 0) {
 896                                 /* no such array */
 897                                 if (fd >= 0)
 898                                         close(fd);
 899                                 put_md_name(st->devname);
 900                                 if (st->metadata) {
 901                                         st->metadata->ss->free_super(st->metadata);
 902                                         free(st->metadata);
 903                                 }
 904                                 free(st);
 905                                 continue;
 906                         }
 907                         close(fd);
 908                         st->next = *statelist;
 909                         st->err = 1;
 910                         st->from_auto = 1;
 911                         snprintf(st->devnm, MD_NAME_MAX, "%s", mse->devnm);
 912                         st->percent = RESYNC_UNKNOWN;
 913                         st->expected_spares = -1;
 914                         if (mse->metadata_version &&
 915                             strncmp(mse->metadata_version,
 916                                     "external:", 9) == 0 &&
 917                             is_subarray(mse->metadata_version+9)) {
 918                                 char *sl;
 919                                 snprintf(st->parent_devnm, MD_NAME_MAX,
 920                                          "%s", mse->metadata_version + 10);
 921                                 sl = strchr(st->parent_devnm, '/');
 922                                 *sl = 0;
 923                         } else
 924                                 st->parent_devnm[0] = 0;
 925                         *statelist = st;
 926                         if (info.test)
 927                                 alert(EVENT_TEST_MESSAGE, NULL, 0, st->devname, NULL);
 928                         new_found = 1;
 929                 }
 930         return new_found;
 931 }
 932
 933 static int get_required_spare_criteria(struct state *st,
 934                                        struct spare_criteria *sc)
 935 {
 936         int fd;
 937
 938         if (!st->metadata || !st->metadata->ss->get_spare_criteria) {
 939                 sc->min_size = 0;
 940                 sc->sector_size = 0;
 941                 return 0;
 942         }
 943
 944         fd = open(st->devname, O_RDONLY);
 945         if (fd < 0)
 946                 return 1;
 947         if (st->metadata->ss->external)
 948                 st->metadata->ss->load_container(st->metadata, fd, st->devname);
 949         else
 950                 st->metadata->ss->load_super(st->metadata, fd, st->devname);
 951         close(fd);
 952         if (!st->metadata->sb)
 953                 return 1;
 954
 955         st->metadata->ss->get_spare_criteria(st->metadata, sc);
 956         st->metadata->ss->free_super(st->metadata);
 957
 958         return 0;
 959 }
 960
 961 static int check_donor(struct state *from, struct state *to)
 962 {
 963         struct state *sub;
 964
 965         if (from == to)
 966                 return 0;
 967         if (from->parent)
 968                 /* Cannot move from a member */
 969                 return 0;
 970         if (from->err)
 971                 return 0;
 972         for (sub = from->subarray; sub; sub = sub->subarray)
 973                 /* If source array has degraded subarrays, don't
 974                  * remove anything
 975                  */
 976                 if (sub->active < sub->raid)
 977                         return 0;
 978         if (from->metadata->ss->external == 0)
 979                 if (from->active < from->raid)
 980                         return 0;
 981         if (from->spare <= 0)
 982                 return 0;
 983         return 1;
 984 }
 985
 986 static dev_t choose_spare(struct state *from, struct state *to,
 987                           struct domainlist *domlist, struct spare_criteria *sc)
 988 {
 989         int d;
 990         dev_t dev = 0;
 991
 992         for (d = from->raid; !dev && d < MAX_DISKS; d++) {
 993                 if (from->devid[d] > 0 && from->devstate[d] == 0) {
 994                         struct dev_policy *pol;
 995                         unsigned long long dev_size;
 996                         unsigned int dev_sector_size;
 997
 998                         if (to->metadata->ss->external &&
 999                             test_partition_from_id(from->devid[d]))
1000                                 continue;
1001
1002                         if (sc->min_size &&
1003                             dev_size_from_id(from->devid[d], &dev_size) &&
1004                             dev_size < sc->min_size)
1005                                 continue;
1006
1007                         if (sc->sector_size &&
1008                             dev_sector_size_from_id(from->devid[d],
1009                                                     &dev_sector_size) &&
1010                             sc->sector_size != dev_sector_size)
1011                                 continue;
1012
1013                         pol = devid_policy(from->devid[d]);
1014                         if (from->spare_group)
1015                                 pol_add(&pol, pol_domain,
1016                                         from->spare_group, NULL);
1017                         if (domain_test(domlist, pol,
1018                                         to->metadata->ss->name) == 1)
1019                             dev = from->devid[d];
1020                         dev_policy_free(pol);
1021                 }
1022         }
1023         return dev;
1024 }
1025
1026 static dev_t container_choose_spare(struct state *from, struct state *to,
1027                                     struct domainlist *domlist,
1028                                     struct spare_criteria *sc, int active)
1029 {
1030         /* This is similar to choose_spare, but we cannot trust devstate,
1031          * so we need to read the metadata instead
1032          */
1033         struct mdinfo *list;
1034         struct supertype *st = from->metadata;
1035         int fd = open(from->devname, O_RDONLY);
1036         int err;
1037         dev_t dev = 0;
1038
1039         if (fd < 0)
1040                 return 0;
1041         if (!st->ss->getinfo_super_disks) {
1042                 close(fd);
1043                 return 0;
1044         }
1045
1046         err = st->ss->load_container(st, fd, NULL);
1047         close(fd);
1048         if (err)
1049                 return 0;
1050
1051         if (from == to) {
1052                 /* We must check if number of active disks has not increased
1053                  * since ioctl in main loop. mdmon may have added spare
1054                  * to subarray. If so we do not need to look for more spares
1055                  * so return non zero value */
1056                 int active_cnt = 0;
1057                 struct mdinfo *dp;
1058                 list = st->ss->getinfo_super_disks(st);
1059                 if (!list) {
1060                         st->ss->free_super(st);
1061                         return 1;
1062                 }
1063                 dp = list->devs;
1064                 while (dp) {
1065                         if (dp->disk.state & (1 << MD_DISK_SYNC) &&
1066                             !(dp->disk.state & (1 << MD_DISK_FAULTY)))
1067                                 active_cnt++;
1068                         dp = dp->next;
1069                 }
1070                 sysfs_free(list);
1071                 if (active < active_cnt) {
1072                         /* Spare just activated.*/
1073                         st->ss->free_super(st);
1074                         return 1;
1075                 }
1076         }
1077
1078         /* We only need one spare so full list not needed */
1079         list = container_choose_spares(st, sc, domlist, from->spare_group,
1080                                        to->metadata->ss->name, 1);
1081         if (list) {
1082                 struct mdinfo *disks = list->devs;
1083                 if (disks)
1084                         dev = makedev(disks->disk.major, disks->disk.minor);
1085                 sysfs_free(list);
1086         }
1087         st->ss->free_super(st);
1088         return dev;
1089 }
1090
1091 static void try_spare_migration(struct state *statelist)
1092 {
1093         struct state *from;
1094         struct state *st;
1095         struct spare_criteria sc;
1096
1097         link_containers_with_subarrays(statelist);
1098         for (st = statelist; st; st = st->next)
1099                 if (st->active < st->raid && st->spare == 0 && !st->err) {
1100                         struct domainlist *domlist = NULL;
1101                         int d;
1102                         struct state *to = st;
1103
1104                         if (to->parent_devnm[0] && !to->parent)
1105                                 /* subarray monitored without parent container
1106                                  * we can't move spares here */
1107                                 continue;
1108
1109                         if (to->parent)
1110                                 /* member of a container */
1111                                 to = to->parent;
1112
1113                         if (get_required_spare_criteria(to, &sc))
1114                                 continue;
1115                         if (to->metadata->ss->external) {
1116                                 /* We must make sure there is
1117                                  * no suitable spare in container already.
1118                                  * If there is we don't add more */
1119                                 dev_t devid = container_choose_spare(
1120                                         to, to, NULL, &sc, st->active);
1121                                 if (devid > 0)
1122                                         continue;
1123                         }
1124                         for (d = 0; d < MAX_DISKS; d++)
1125                                 if (to->devid[d])
1126                                         domainlist_add_dev(&domlist,
1127                                                            to->devid[d],
1128                                                            to->metadata->ss->name);
1129                         if (to->spare_group)
1130                                 domain_add(&domlist, to->spare_group);
1131                         /*
1132                          * No spare migration if the destination
1133                          * has no domain. Skip this array.
1134                          */
1135                         if (!domlist)
1136                                 continue;
1137                         for (from=statelist ; from ; from=from->next) {
1138                                 dev_t devid;
1139                                 if (!check_donor(from, to))
1140                                         continue;
1141                                 if (from->metadata->ss->external)
1142                                         devid = container_choose_spare(
1143                                                 from, to, domlist, &sc, 0);
1144                                 else
1145                                         devid = choose_spare(from, to, domlist,
1146                                                              &sc);
1147                                 if (devid > 0 &&
1148                                     move_spare(from->devname, to->devname,
1149                                                devid)) {
1150                                         alert(EVENT_MOVE_SPARE, NULL, 0, to->devname, from->devname);
1151                                         break;
1152                                 }
1153                         }
1154                         domain_free(domlist);
1155                 }
1156 }
1157
1158 /* search the statelist to connect external
1159  * metadata subarrays with their containers
1160  * We always completely rebuild the tree from scratch as
1161  * that is safest considering the possibility of entries
1162  * disappearing or changing.
1163  */
1164 static void link_containers_with_subarrays(struct state *list)
1165 {
1166         struct state *st;
1167         struct state *cont;
1168         for (st = list; st; st = st->next) {
1169                 st->parent = NULL;
1170                 st->subarray = NULL;
1171         }
1172         for (st = list; st; st = st->next)
1173                 if (st->parent_devnm[0])
1174                         for (cont = list; cont; cont = cont->next)
1175                                 if (!cont->err && cont->parent_devnm[0] == 0 &&
1176                                     strcmp(cont->devnm, st->parent_devnm) == 0) {
1177                                         st->parent = cont;
1178                                         st->subarray = cont->subarray;
1179                                         cont->subarray = st;
1180                                         break;
1181                                 }
1182 }
1183
1184 /**
1185  * free_statelist() - Frees statelist.
1186  * @statelist: statelist to free
1187  */
1188 static void free_statelist(struct state *statelist)
1189 {
1190         struct state *tmp = NULL;
1191
1192         while (statelist) {
1193                 if (statelist->spare_group)
1194                         free(statelist->spare_group);
1195
1196                 tmp = statelist;
1197                 statelist = statelist->next;
1198                 free(tmp);
1199         }
1200 }
1201
1202 #ifndef NO_LIBUDEV
/* function: check_udev_activity
 * Description: Waits for udev to finish processing its events. The udev
 * queue is polled once per second and the wait is abandoned once the retry
 * budget is used up.
 * Returns:
 *              1 - error while creating the udev context or queue
 *              2 - timeout
 *              0 - successful completion (also when udev is not in use)
 */
static int check_udev_activity(void)
{
        struct udev *udev = NULL;
        struct udev_queue *udev_queue = NULL;
        int retries_left;
        int rc = 0;

        /*
         * In rare cases udev may not be in use;
         * there is nothing to wait for then, so report success.
         */
        if (!use_udev())
                goto out;

        udev = udev_new();
        if (udev)
                udev_queue = udev_queue_new(udev);
        if (!udev_queue) {
                /* Either the context or the queue could not be created. */
                rc = 1;
                goto out;
        }

        if (udev_queue_get_queue_is_empty(udev_queue))
                goto out;

        for (retries_left = 30;
             !udev_queue_get_queue_is_empty(udev_queue);
             retries_left--) {
                sleep(1);

                if (retries_left == 0) {
                        rc = 2;
                        break;
                }
        }

out:
        if (udev_queue)
                udev_queue_unref(udev_queue);
        if (udev)
                udev_unref(udev);
        return rc;
}
1258 #endif
1259
1260 /* Not really Monitor but ... */
1261 int Wait(char *dev)
1262 {
1263         char devnm[32];
1264         dev_t rdev;
1265         char *tmp;
1266         int rv = 1;
1267         int frozen_remaining = 3;
1268
1269         if (!stat_is_blkdev(dev, &rdev))
1270                 return 2;
1271
1272         tmp = devid2devnm(rdev);
1273         if (!tmp) {
1274                 pr_err("Cannot get md device name.\n");
1275                 return 2;
1276         }
1277
1278         strcpy(devnm, tmp);
1279
1280         while(1) {
1281                 struct mdstat_ent *ms = mdstat_read(1, 0);
1282                 struct mdstat_ent *e;
1283
1284                 for (e = ms; e; e = e->next)
1285                         if (strcmp(e->devnm, devnm) == 0)
1286                                 break;
1287
1288                 if (e && e->percent == RESYNC_NONE) {
1289                         /* We could be in the brief pause before something
1290                          * starts. /proc/mdstat doesn't show that, but
1291                          * sync_action does.
1292                          */
1293                         struct mdinfo mdi;
1294                         char buf[21];
1295
1296                         if (sysfs_init(&mdi, -1, devnm))
1297                                 return 2;
1298                         if (sysfs_get_str(&mdi, NULL, "sync_action",
1299                                           buf, 20) > 0 &&
1300                             strcmp(buf,"idle\n") != 0) {
1301                                 e->percent = RESYNC_UNKNOWN;
1302                                 if (strcmp(buf, "frozen\n") == 0) {
1303                                         if (frozen_remaining == 0)
1304                                                 e->percent = RESYNC_NONE;
1305                                         else
1306                                                 frozen_remaining -= 1;
1307                                 }
1308                         }
1309                 }
1310                 if (!e || e->percent == RESYNC_NONE) {
1311                         if (e && e->metadata_version &&
1312                             strncmp(e->metadata_version, "external:", 9) == 0) {
1313                                 if (is_subarray(&e->metadata_version[9]))
1314                                         ping_monitor(&e->metadata_version[9]);
1315                                 else
1316                                         ping_monitor(devnm);
1317                         }
1318                         free_mdstat(ms);
1319                         return rv;
1320                 }
1321                 free_mdstat(ms);
1322                 rv = 0;
1323                 mdstat_wait(5);
1324         }
1325 }
1326
/* Array states that are accepted as "clean"; the list is NULL-terminated.
 * "broken" is used only for RAID0/LINEAR: it is the same as "clean", but
 * indicates that the array has one or more members missing.
 */
static char *clean_states[] = {
        "clear",
        "inactive",
        "readonly",
        "read-auto",
        "clean",
        "broken",
        NULL
};
1332
1333 int WaitClean(char *dev, int verbose)
1334 {
1335         int fd;
1336         struct mdinfo *mdi;
1337         int rv = 1;
1338         char devnm[32];
1339
1340         if (!stat_is_blkdev(dev, NULL))
1341                 return 2;
1342         fd = open(dev, O_RDONLY);
1343         if (fd < 0) {
1344                 if (verbose)
1345                         pr_err("Couldn't open %s: %s\n", dev, strerror(errno));
1346                 return 1;
1347         }
1348
1349         strcpy(devnm, fd2devnm(fd));
1350         mdi = sysfs_read(fd, devnm, GET_VERSION|GET_LEVEL|GET_SAFEMODE);
1351         if (!mdi) {
1352                 if (verbose)
1353                         pr_err("Failed to read sysfs attributes for %s\n", dev);
1354                 close(fd);
1355                 return 0;
1356         }
1357
1358         switch(mdi->array.level) {
1359         case LEVEL_LINEAR:
1360         case LEVEL_MULTIPATH:
1361         case 0:
1362                 /* safemode delay is irrelevant for these levels */
1363                 rv = 0;
1364         }
1365
1366         /* for internal metadata the kernel handles the final clean
1367          * transition, containers can never be dirty
1368          */
1369         if (!is_subarray(mdi->text_version))
1370                 rv = 0;
1371
1372         /* safemode disabled ? */
1373         if (mdi->safe_mode_delay == 0)
1374                 rv = 0;
1375
1376         if (rv) {
1377                 int state_fd = sysfs_open(fd2devnm(fd), NULL, "array_state");
1378                 char buf[20];
1379                 int delay = 5000;
1380
1381                 /* minimize the safe_mode_delay and prepare to wait up to 5s
1382                  * for writes to quiesce
1383                  */
1384                 sysfs_set_safemode(mdi, 1);
1385
1386                 /* wait for array_state to be clean */
1387                 while (1) {
1388                         rv = read(state_fd, buf, sizeof(buf));
1389                         if (rv < 0)
1390                                 break;
1391                         if (sysfs_match_word(buf, clean_states) <
1392                             (int)ARRAY_SIZE(clean_states) - 1)
1393                                 break;
1394                         rv = sysfs_wait(state_fd, &delay);
1395                         if (rv < 0 && errno != EINTR)
1396                                 break;
1397                         lseek(state_fd, 0, SEEK_SET);
1398                 }
1399                 if (rv < 0)
1400                         rv = 1;
1401                 else if (ping_monitor(mdi->text_version) == 0) {
1402                         /* we need to ping to close the window between array
1403                          * state transitioning to clean and the metadata being
1404                          * marked clean
1405                          */
1406                         rv = 0;
1407                 } else {
1408                         rv = 1;
1409                         pr_err("Error connecting monitor with %s\n", dev);
1410                 }
1411                 if (rv && verbose)
1412                         pr_err("Error waiting for %s to be clean\n", dev);
1413
1414                 /* restore the original safe_mode_delay */
1415                 sysfs_set_safemode(mdi, mdi->safe_mode_delay);
1416                 close(state_fd);
1417         }
1418
1419         sysfs_free(mdi);
1420         close(fd);
1421
1422         return rv;
1423 }