mdmon: delegate removal to managemon

author Mariusz Tkaczyk <mariusz.tkaczyk@linux.intel.com>

Tue, 30 Jul 2024 12:12:21 +0000 (14:12 +0200)

committer Mariusz Tkaczyk <mariusz.tkaczyk@linux.intel.com>

Mon, 4 Nov 2024 09:29:52 +0000 (10:29 +0100)
author Mariusz Tkaczyk <mariusz.tkaczyk@linux.intel.com>
Tue, 30 Jul 2024 12:12:21 +0000 (14:12 +0200)
committer Mariusz Tkaczyk <mariusz.tkaczyk@linux.intel.com>
Mon, 4 Nov 2024 09:29:52 +0000 (10:29 +0100)
diff --git a/managemon.c b/managemon.c

index 6ca592b19046ce8347150d106d01f03d7990c431..d798132824576a046e8560be7c5899720289ff8f 100644 (file)
--- a/managemon.c
+++ b/managemon.c
@@ -440,6 +440,39 @@ static int disk_init_and_add(struct mdinfo *disk, struct mdinfo *clone,
         return 0;
  }
  
+/**
+ * managemon_disk_remove()- remove disk from the MD array.
+ * @disk: device to be removed.
+ * @array_devnm: the name of the array to remove disk from.
+ *
+ * It tries to remove the disk from the MD array and if it is successful then it closes all opened
+ * descriptors. Removing action requires suspend, it might take a while.
+ * Invalidating mdi->state_fd will prevent from using this device further (see duplicate_aa()).
+ *
+ * To avoid deadlock, new file descriptor is opened because monitor may already wait on
+ * mdddev_suspend() in kernel and keep saved descriptor locked.
+ *
+ * Returns MDADM_STATUS_SUCCESS if disk has been removed, MDADM_STATUS_ERROR otherwise.
+ */
+static mdadm_status_t managemon_disk_remove(struct mdinfo *disk, char *array_devnm)
+{
+       int new_state_fd = sysfs_open2(array_devnm, disk->sys_name, "state");
+
+       if (!is_fd_valid(new_state_fd))
+               return MDADM_STATUS_ERROR;
+
+       if (write_attr("remove", new_state_fd) != MDADM_STATUS_SUCCESS)
+               return MDADM_STATUS_ERROR;
+
+       close_fd(&new_state_fd);
+       close_fd(&disk->state_fd);
+       close_fd(&disk->recovery_fd);
+       close_fd(&disk->bb_fd);
+       close_fd(&disk->ubb_fd);
+
+       return MDADM_STATUS_SUCCESS;
+}
+
  static void manage_member(struct mdstat_ent *mdstat,
                           struct active_array *a)
  {
@@ -518,6 +551,43 @@ static void manage_member(struct mdstat_ent *mdstat,
                 if (write_attr("0.001", a->safe_mode_delay_fd) == MDADM_STATUS_SUCCESS)
                         a->info.safe_mode_delay = 1;
  
+       if (a->check_member_remove) {
+               bool any_removed = false;
+               bool all_removed = true;
+               struct mdinfo *disk;
+
+               for (disk = a->info.devs; disk; disk = disk->next) {
+                       if (disk->man_disk_to_remove == false)
+                               continue;
+
+                       if (disk->mon_descriptors_not_used == false) {
+                               /* To early, repeat later */
+                               all_removed = false;
+                               continue;
+                       }
+
+                       if (managemon_disk_remove(disk, a->info.sys_name)) {
+                               all_removed = false;
+                               continue;
+                       }
+
+                       any_removed = true;
+               }
+
+               if (any_removed) {
+                       struct active_array *newa = duplicate_aa(a);
+
+                       if (all_removed)
+                               newa->check_member_remove = false;
+
+                       replace_array(container, a, newa);
+                       a = newa;
+               }
+
+               if (!all_removed)
+                       return;
+       }
+
         /* We don't check the array while any update is pending, as it
          * might container a change (such as a spare assignment) which
          * could affect our decisions.
@@ -539,8 +609,6 @@ static void manage_member(struct mdstat_ent *mdstat,
                         return;
  
                 newa = duplicate_aa(a);
-               if (!newa)
-                       goto out;
                 /* prevent the kernel from activating the disk(s) before we
                  * finish adding them
                  */
@@ -570,7 +638,7 @@ static void manage_member(struct mdstat_ent *mdstat,
                                   "sync_action", "recover") == 0)
                         newa->prev_action = recover;
                 dprintf("recovery started on %s\n", a->info.sys_name);
- out:
+
                 while (newdev) {
                         d = newdev->next;
                         free(newdev);
@@ -604,11 +672,9 @@ static void manage_member(struct mdstat_ent *mdstat,
                         if (d2)
                                 /* already have this one */
                                 continue;
-                       if (!newa) {
+                       if (!newa)
                                 newa = duplicate_aa(a);
-                               if (!newa)
-                                       break;
-                       }
+
                         newd = xmalloc(sizeof(*newd));
                         disk_init_and_add(newd, d, newa);
                 }
diff --git a/mdadm.h b/mdadm.h

index 2a7d038df3a00a081217444079a0832aabc6a833..260cdf541510b38fe9c2d0fb4aed1c4d8943c573 100644 (file)
--- a/mdadm.h
+++ b/mdadm.h
@@ -417,6 +417,12 @@ struct mdinfo {
         #define DS_EXTERNAL_BB  4096
         int prev_state, curr_state, next_state;
  
+       /* If set by monitor, managemon needs to remove faulty device */
+       bool man_disk_to_remove : 1;
+
+       /* Managemon cannot close descriptors if monitor is using them for select() */
+       bool mon_descriptors_not_used : 1;
+
         /* info read from sysfs */
         enum {
                 ARRAY_CLEAR,
diff --git a/mdmon.h b/mdmon.h

index 110cbef29205222df0ea7834358d960b9565049f..ce6218a2fafed0f59a7b852ea20891d7e8fe890e 100644 (file)
--- a/mdmon.h
+++ b/mdmon.h
@@ -48,8 +48,14 @@ struct active_array {
         enum array_state prev_state, curr_state, next_state;
         enum sync_action prev_action, curr_action, next_action;
  
-       int check_degraded; /* flag set by mon, read by manage */
-       int check_reshape; /* flag set by mon, read by manage */
+       bool check_degraded : 1; /* flag set by mon, read by manage */
+       bool check_reshape : 1; /* flag set by mon, read by manage */
+
+       /**
+        * Signalize managemon there is a mdi to be removed.
+        * Monitor must acknowledge faulty state first.
+        */
+       bool check_member_remove : 1;
  };
  
  /*
diff --git a/monitor.c b/monitor.c

index 6429afc6631738980c93b5b50add5760993ed7d2..81ae88934f30e9cbe4252d2d4938bece1cba04bd 100644 (file)
--- a/monitor.c
+++ b/monitor.c
@@ -399,8 +399,9 @@ static void signal_manager(void)
  static int read_and_act(struct active_array *a)
  {
         unsigned long long sync_completed;
-       int check_degraded = 0;
-       int check_reshape = 0;
+       bool disks_to_remove = false;
+       bool check_degraded = false;
+       bool check_reshape = false;
         int deactivate = 0;
         struct mdinfo *mdi;
         int ret = 0;
@@ -425,7 +426,7 @@ static int read_and_act(struct active_array *a)
                 mdi->next_state = 0;
                 mdi->curr_state = 0;
  
-               if (!is_fd_valid(mdi->state_fd))
+               if (mdi->man_disk_to_remove)
                         /* We are removing this device, skip it then */
                         continue;
  
@@ -624,21 +625,12 @@ static int read_and_act(struct active_array *a)
                         write_attr("-blocked", mdi->state_fd);
                 }
  
-               if ((mdi->next_state & DS_REMOVE) && mdi->state_fd >= 0) {
-                       /* The kernel may not be able to immediately remove the
-                        * disk.  In that case we wait a little while and
-                        * try again.
-                        */
-                       if (write_attr("remove", mdi->state_fd) == MDADM_STATUS_SUCCESS) {
-                               dprintf_cont(" %d:removed", mdi->disk.raid_disk);
-                               close(mdi->state_fd);
-                               close(mdi->recovery_fd);
-                               close(mdi->bb_fd);
-                               close(mdi->ubb_fd);
-                               mdi->state_fd = -1;
-                       } else
-                               ret |= ARRAY_BUSY;
+               if ((mdi->next_state & DS_REMOVE) && !mdi->man_disk_to_remove) {
+                       dprintf_cont(" %d:disk_to_remove", mdi->disk.raid_disk);
+                       mdi->man_disk_to_remove = true;
+                       disks_to_remove = true;
                 }
+
                 if (mdi->next_state & DS_INSYNC) {
                         write_attr("+in_sync", mdi->state_fd);
                         dprintf_cont(" %d:+in_sync", mdi->disk.raid_disk);
@@ -651,17 +643,14 @@ static int read_and_act(struct active_array *a)
  
         a->prev_action = a->curr_action;
  
-       for (mdi = a->info.devs; mdi ; mdi = mdi->next) {
+       for (mdi = a->info.devs; mdi ; mdi = mdi->next)
                 mdi->prev_state = mdi->curr_state;
-               mdi->next_state = 0;
-       }
  
-       if (check_degraded || check_reshape) {
-               /* manager will do the actual check */
-               if (check_degraded)
-                       a->check_degraded = 1;
-               if (check_reshape)
-                       a->check_reshape = 1;
+       if (check_degraded || check_reshape || disks_to_remove) {
+
+               a->check_member_remove |= disks_to_remove;
+               a->check_degraded |= check_degraded;
+               a->check_reshape |= check_reshape;
                 signal_manager();
         }
  
@@ -734,13 +723,11 @@ int monitor_loop_cnt;
  
  static int wait_and_act(struct supertype *container, int nowait)
  {
-       fd_set rfds;
-       int maxfd = 0;
-       struct active_array **aap = &container->arrays;
-       struct active_array *a, **ap;
-       int rv;
-       struct mdinfo *mdi;
+       struct active_array *a, **ap, **aap = &container->arrays;
         static unsigned int dirty_arrays = ~0; /* start at some non-zero value */
+       struct mdinfo *mdi;
+       int rv, maxfd = 0;
+       fd_set rfds;
  
         FD_ZERO(&rfds);
  
@@ -764,7 +751,18 @@ static int wait_and_act(struct supertype *container, int nowait)
                 add_fd(&rfds, &maxfd, a->info.state_fd);
                 add_fd(&rfds, &maxfd, a->action_fd);
                 add_fd(&rfds, &maxfd, a->sync_completed_fd);
+
                 for (mdi = a->info.devs ; mdi ; mdi = mdi->next) {
+                       if (mdi->man_disk_to_remove) {
+                               mdi->mon_descriptors_not_used = true;
+
+                               /* Managemon could be blocked on suspend in kernel.
+                                * Monitor must respond if any badblock is recorded in this time.
+                                */
+                               container->retry_soon = 1;
+                               continue;
+                       }
+
                         add_fd(&rfds, &maxfd, mdi->state_fd);
                         add_fd(&rfds, &maxfd, mdi->bb_fd);
                         add_fd(&rfds, &maxfd, mdi->ubb_fd);
author	Mariusz Tkaczyk <mariusz.tkaczyk@linux.intel.com>
	Tue, 30 Jul 2024 12:12:21 +0000 (14:12 +0200)
committer	Mariusz Tkaczyk <mariusz.tkaczyk@linux.intel.com>
	Mon, 4 Nov 2024 09:29:52 +0000 (10:29 +0100)
managemon.c		patch \| blob \| blame \| history
mdadm.h		patch \| blob \| blame \| history
mdmon.h		patch \| blob \| blame \| history
monitor.c		patch \| blob \| blame \| history