xfs: track directory entry updates during live nlinks fsck

author Darrick J. Wong <djwong@kernel.org>

Thu, 22 Feb 2024 20:30:59 +0000 (12:30 -0800)

committer Darrick J. Wong <djwong@kernel.org>

Thu, 22 Feb 2024 20:30:59 +0000 (12:30 -0800)
author Darrick J. Wong <djwong@kernel.org>
Thu, 22 Feb 2024 20:30:59 +0000 (12:30 -0800)
committer Darrick J. Wong <djwong@kernel.org>
Thu, 22 Feb 2024 20:30:59 +0000 (12:30 -0800)
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c

index c5a6c47d3df2e65c085ca48fdf09015726467136..699092195f41b6d4007db70f6c787fcf1f127c3e 100644 (file)
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -1302,6 +1302,9 @@ xchk_fsgates_enable(
         if (scrub_fsgates & XCHK_FSGATES_QUOTA)
                 xfs_dqtrx_hook_enable();
  
+       if (scrub_fsgates & XCHK_FSGATES_DIRENTS)
+               xfs_dir_hook_enable();
+
         sc->flags |= scrub_fsgates;
  }
  
diff --git a/fs/xfs/scrub/nlinks.c b/fs/xfs/scrub/nlinks.c

index c899a50a83daf5196ddade8589abdba52e76249d..341ab737f9c5daa08a4b1362b5bfe240cd337e6b 100644 (file)
--- a/fs/xfs/scrub/nlinks.c
+++ b/fs/xfs/scrub/nlinks.c
@@ -43,8 +43,7 @@ int
  xchk_setup_nlinks(
         struct xfs_scrub        *sc)
  {
-       /* Not ready for general consumption yet. */
-       return -EOPNOTSUPP;
+       xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS);
  
         sc->buf = kzalloc(sizeof(struct xchk_nlink_ctrs), XCHK_GFP_FLAGS);
         if (!sc->buf)
@@ -63,6 +62,21 @@ xchk_setup_nlinks(
   * must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED,
   * ECANCELED) that are absorbed into a scrub state flag update by
   * xchk_*_process_error.
+ *
+ * Because we are scanning a live filesystem, it's possible that another thread
+ * will try to update the link counts for an inode that we've already scanned.
+ * This will cause our counts to be incorrect.  Therefore, we hook all
+ * directory entry updates because that is when link count updates occur.  By
+ * shadowing transaction updates in this manner, live nlink check can ensure by
+ * locking the inode and the shadow structure that its own copies are not out
+ * of date.  Because the hook code runs in a different process context from the
+ * scrub code and the scrub state flags are not accessed atomically, failures
+ * in the hook code must abort the iscan and the scrubber must notice the
+ * aborted scan and set the incomplete flag.
+ *
+ * Note that we use jump labels and srcu notifier hooks to minimize the
+ * overhead when live nlinks is /not/ running.  Locking order for nlink
+ * observations is inode ILOCK -> iscan_lock/xchk_nlink_ctrs lock.
   */
  
  /*
@@ -120,6 +134,63 @@ xchk_nlinks_update_incore(
         return error;
  }
  
+/*
+ * Apply a link count change from the regular filesystem into our shadow link
+ * count structure based on a directory update in progress.
+ *
+ * This is the notifier callback that the scan setup code registers through
+ * xfs_dir_hook_setup().  @nb is the notifier block embedded in the scrubber's
+ * struct xchk_nlink_ctrs (recovered with container_of), @data points to the
+ * struct xfs_dir_update_params describing the update, and @action is only
+ * forwarded to the tracepoint.
+ *
+ * Always returns NOTIFY_DONE.  If updating the shadow counters fails, the
+ * error is not returned to the caller; instead the collection iscan is
+ * aborted via xchk_iscan_abort().
+ */
+STATIC int
+xchk_nlinks_live_update(
+       struct notifier_block           *nb,
+       unsigned long                   action,
+       void                            *data)
+{
+       struct xfs_dir_update_params    *p = data;
+       struct xchk_nlink_ctrs          *xnc;
+       int                             error;
+
+       xnc = container_of(nb, struct xchk_nlink_ctrs, dhook.dirent_hook.nb);
+
+       trace_xchk_nlinks_live_update(xnc->sc->mp, p->dp, action, p->ip->i_ino,
+                       p->delta, p->name->name, p->name->len);
+
+       /*
+        * If we've already scanned @dp, update the number of parents that link
+        * to @ip.  If @ip is a subdirectory, update the number of child links
+        * going out of @dp.  Both shadow counter updates are made under
+        * xnc->lock; the second is skipped if the first one fails.
+        */
+       if (xchk_iscan_want_live_update(&xnc->collect_iscan, p->dp->i_ino)) {
+               mutex_lock(&xnc->lock);
+               error = xchk_nlinks_update_incore(xnc, p->ip->i_ino, p->delta,
+                               0, 0);
+               if (!error && S_ISDIR(VFS_IC(p->ip)->i_mode))
+                       error = xchk_nlinks_update_incore(xnc, p->dp->i_ino, 0,
+                                       0, p->delta);
+               mutex_unlock(&xnc->lock);
+               if (error)
+                       goto out_abort;
+       }
+
+       /*
+        * If @ip is a subdirectory and we've already scanned it, update the
+        * number of backrefs pointing to @dp.  This check is made against
+        * @ip's inode number, independently of whether @dp was scanned above.
+        */
+       if (S_ISDIR(VFS_IC(p->ip)->i_mode) &&
+           xchk_iscan_want_live_update(&xnc->collect_iscan, p->ip->i_ino)) {
+               mutex_lock(&xnc->lock);
+               error = xchk_nlinks_update_incore(xnc, p->dp->i_ino, 0,
+                               p->delta, 0);
+               mutex_unlock(&xnc->lock);
+               if (error)
+                       goto out_abort;
+       }
+
+       return NOTIFY_DONE;
+
+out_abort:
+       xchk_iscan_abort(&xnc->collect_iscan);
+       return NOTIFY_DONE;
+}
+
  /* Bump the observed link count for the inode referenced by this entry. */
  STATIC int
  xchk_nlinks_collect_dirent(
@@ -747,6 +818,11 @@ xchk_nlinks_teardown_scan(
  {
         struct xchk_nlink_ctrs  *xnc = priv;
  
+       /* Discourage any hook functions that might be running. */
+       xchk_iscan_abort(&xnc->collect_iscan);
+
+       xfs_dir_hook_del(xnc->sc->mp, &xnc->dhook);
+
         xfarray_destroy(xnc->nlinks);
         xnc->nlinks = NULL;
  
@@ -793,6 +869,19 @@ xchk_nlinks_setup_scan(
         if (error)
                 goto out_teardown;
  
+       /*
+        * Hook into the directory entry code so that we can capture updates to
+        * file link counts.  The hook only triggers for inodes that were
+        * already scanned, and the scanner thread takes each inode's ILOCK,
+        * which means that any in-progress inode updates will finish before we
+        * can scan the inode.
+        */
+       ASSERT(sc->flags & XCHK_FSGATES_DIRENTS);
+       xfs_dir_hook_setup(&xnc->dhook, xchk_nlinks_live_update);
+       error = xfs_dir_hook_add(mp, &xnc->dhook);
+       if (error)
+               goto out_teardown;
+
         /* Use deferred cleanup to pass the inode link count data to repair. */
         sc->buf_cleanup = xchk_nlinks_teardown_scan;
         return 0;
diff --git a/fs/xfs/scrub/nlinks.h b/fs/xfs/scrub/nlinks.h

index 69a3460c5e52fe2da92cfa51f82a4df06aa83ffa..ea6408e1c1832787a7800e68a0ac7e10f0b764a1 100644 (file)
--- a/fs/xfs/scrub/nlinks.h
+++ b/fs/xfs/scrub/nlinks.h
@@ -22,6 +22,12 @@ struct xchk_nlink_ctrs {
          */
         struct xchk_iscan       collect_iscan;
         struct xchk_iscan       compare_iscan;
+
+       /*
+        * Hook into directory updates so that we can receive live updates
+        * from other writer threads.
+        */
+       struct xfs_dir_hook     dhook;
  };
  
  /*
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c

index 8c60774d5f3450cb19de0678caed3e54f4844b54..883c47b6c686004a63b2c9967ec78e3d387b51dd 100644 (file)
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -160,6 +160,9 @@ xchk_fsgates_disable(
         if (sc->flags & XCHK_FSGATES_QUOTA)
                 xfs_dqtrx_hook_disable();
  
+       if (sc->flags & XCHK_FSGATES_DIRENTS)
+               xfs_dir_hook_disable();
+
         sc->flags &= ~XCHK_FSGATES_ALL;
  }
  
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h

index de6b45f99dd5fca8eb790970fec5ec0f9ea65454..f99a3c21d02ea00cb8dbe114afedf99ab0286df4 100644 (file)
--- a/fs/xfs/scrub/scrub.h
+++ b/fs/xfs/scrub/scrub.h
@@ -122,6 +122,7 @@ struct xfs_scrub {
  #define XCHK_FSGATES_DRAIN     (1U << 2)  /* defer ops draining enabled */
  #define XCHK_NEED_DRAIN                (1U << 3)  /* scrub needs to drain defer ops */
  #define XCHK_FSGATES_QUOTA     (1U << 4)  /* quota live update enabled */
+#define XCHK_FSGATES_DIRENTS   (1U << 5)  /* directory live update enabled */
  #define XREP_RESET_PERAG_RESV  (1U << 30) /* must reset AG space reservation */
  #define XREP_ALREADY_FIXED     (1U << 31) /* checking our repair work */
  
@@ -132,7 +133,8 @@ struct xfs_scrub {
   * must be enabled during scrub setup and can only be torn down afterwards.
   */
  #define XCHK_FSGATES_ALL       (XCHK_FSGATES_DRAIN | \
-                                XCHK_FSGATES_QUOTA)
+                                XCHK_FSGATES_QUOTA | \
+                                XCHK_FSGATES_DIRENTS)
  
  /* Metadata scrubbers */
  int xchk_tester(struct xfs_scrub *sc);
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h

index 0deea8f18a30bf07bb51826420b64935e9c3753d..9512170ea9a7b8824f79d50a1c58661283ab6823 100644 (file)
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -116,6 +116,7 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_NLINKS);
         { XCHK_FSGATES_DRAIN,                   "fsgates_drain" }, \
         { XCHK_NEED_DRAIN,                      "need_drain" }, \
         { XCHK_FSGATES_QUOTA,                   "fsgates_quota" }, \
+       { XCHK_FSGATES_DIRENTS,                 "fsgates_dirents" }, \
         { XREP_RESET_PERAG_RESV,                "reset_perag_resv" }, \
         { XREP_ALREADY_FIXED,                   "already_fixed" }
  
@@ -1363,6 +1364,38 @@ TRACE_EVENT(xchk_nlinks_collect_metafile,
                   __entry->ino)
  );
  
+TRACE_EVENT(xchk_nlinks_live_update, /* one dirent update seen by the live nlinks hook */
+       TP_PROTO(struct xfs_mount *mp, const struct xfs_inode *dp,
+                int action, xfs_ino_t ino, int delta,
+                const char *name, unsigned int namelen),
+       TP_ARGS(mp, dp, action, ino, delta, name, namelen),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_ino_t, dir)
+               __field(int, action)
+               __field(xfs_ino_t, ino)
+               __field(int, delta)
+               __field(unsigned int, namelen)
+               __dynamic_array(char, name, namelen) /* exactly namelen bytes; no NUL is appended here */
+       ),
+       TP_fast_assign(
+               __entry->dev = mp->m_super->s_dev;
+               __entry->dir = dp ? dp->i_ino : NULLFSINO; /* a NULL dp is tolerated */
+               __entry->action = action; /* recorded, but not part of the TP_printk output below */
+               __entry->ino = ino;
+               __entry->delta = delta;
+               __entry->namelen = namelen;
+               memcpy(__get_str(name), name, namelen);
+       ),
+       TP_printk("dev %d:%d dir 0x%llx ino 0x%llx nlink_delta %d name '%.*s'",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->dir,
+                 __entry->ino,
+                 __entry->delta,
+                 __entry->namelen, /* precision for %.*s, bounding the unterminated name */
+                 __get_str(name))
+);
+
  TRACE_EVENT(xchk_nlinks_check_zero,
         TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino,
                  const struct xchk_nlink *live),
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c

index d6635d219527b5ae967eb3d7ba5c7cbc1bce7e43..e8845287debd68fc5586369f081ff22ff213a07a 100644 (file)
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -925,6 +925,81 @@ xfs_bumplink(
         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
  }
  
+#ifdef CONFIG_XFS_LIVE_HOOKS
+/*
+ * Use a static key here to reduce the overhead of directory live update hooks.
+ * If the compiler supports jump labels, the static branch will be replaced by
+ * a nop sled when there are no hook users.  Online fsck is currently the only
+ * caller, so this is a reasonable tradeoff.
+ *
+ * Note: Patching the kernel code requires taking the cpu hotplug lock.  Other
+ * parts of the kernel allocate memory with that lock held, which means that
+ * XFS callers cannot hold any locks that might be used by memory reclaim or
+ * writeback when calling the static_branch_{inc,dec} functions.
+ *
+ * xfs_dir_hook_enable() and xfs_dir_hook_disable() below are thin wrappers
+ * that call xfs_hooks_switch_on() and xfs_hooks_switch_off() on this key;
+ * xfs_dir_update_hook() tests the key before calling any hook functions.
+ */
+DEFINE_STATIC_XFS_HOOK_SWITCH(xfs_dir_hooks_switch);
+
+void
+xfs_dir_hook_disable(void)
+{
+       xfs_hooks_switch_off(&xfs_dir_hooks_switch);
+}
+
+void
+xfs_dir_hook_enable(void)
+{
+       xfs_hooks_switch_on(&xfs_dir_hooks_switch);
+}
+
+/*
+ * Call hooks for a directory update relating to a child dirent update.
+ *
+ * @dp is the directory, @ip is the inode that the dirent refers to, @delta is
+ * the change being reported (the callers visible in this patch pass 1 when an
+ * entry is added and -1 when one is removed), and @name is the dirent name.
+ *
+ * Nothing happens unless xfs_dir_hooks_switch is switched on.  When it is,
+ * the arguments are packaged into a struct xfs_dir_update_params on the stack
+ * and handed to the m_dir_update_hooks chain of @ip's mount with an action
+ * code of 0.  The mount is taken from ip->i_mount, so @ip must not be NULL.
+ * The return value of xfs_hooks_call() is not examined.
+ */
+inline void
+xfs_dir_update_hook(
+       struct xfs_inode                *dp,
+       struct xfs_inode                *ip,
+       int                             delta,
+       const struct xfs_name           *name)
+{
+       if (xfs_hooks_switched_on(&xfs_dir_hooks_switch)) {
+               struct xfs_dir_update_params    p = {
+                       .dp             = dp,
+                       .ip             = ip,
+                       .delta          = delta,
+                       .name           = name,
+               };
+               struct xfs_mount        *mp = ip->i_mount;
+
+               xfs_hooks_call(&mp->m_dir_update_hooks, 0, &p);
+       }
+}
+
+/*
+ * Call the specified function during a directory update.
+ *
+ * Adds @hook->dirent_hook to @mp's m_dir_update_hooks chain and returns
+ * whatever xfs_hooks_add() returns.
+ */
+int
+xfs_dir_hook_add(
+       struct xfs_mount        *mp,
+       struct xfs_dir_hook     *hook)
+{
+       return xfs_hooks_add(&mp->m_dir_update_hooks, &hook->dirent_hook);
+}
+
+/*
+ * Stop calling the specified function during a directory update.
+ *
+ * Removes @hook->dirent_hook from @mp's m_dir_update_hooks chain by calling
+ * xfs_hooks_del(); nothing is returned to the caller.
+ */
+void
+xfs_dir_hook_del(
+       struct xfs_mount        *mp,
+       struct xfs_dir_hook     *hook)
+{
+       xfs_hooks_del(&mp->m_dir_update_hooks, &hook->dirent_hook);
+}
+
+/*
+ * Configure directory update hook functions.
+ *
+ * Passes @mod_fn, a notifier callback, to xfs_hook_setup() for
+ * @hook->dirent_hook.  This only prepares @hook; adding it to a mount's hook
+ * chain is done separately with xfs_dir_hook_add().
+ */
+void
+xfs_dir_hook_setup(
+       struct xfs_dir_hook     *hook,
+       notifier_fn_t           mod_fn)
+{
+       xfs_hook_setup(&hook->dirent_hook, mod_fn);
+}
+#endif /* CONFIG_XFS_LIVE_HOOKS */
+
  int
  xfs_create(
         struct mnt_idmap        *idmap,
@@ -1035,6 +1110,12 @@ xfs_create(
                 xfs_bumplink(tp, dp);
         }
  
+       /*
+        * Create ip with a reference from dp, and add '.' and '..' references
+        * if it's a directory.
+        */
+       xfs_dir_update_hook(dp, ip, 1, name);
+
         /*
          * If this is a synchronous mount, make sure that the
          * create transaction goes to disk before returning to
@@ -1249,6 +1330,7 @@ xfs_link(
         xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
  
         xfs_bumplink(tp, sip);
+       xfs_dir_update_hook(tdp, sip, 1, target_name);
  
         /*
          * If this is a synchronous mount, make sure that the
@@ -2562,6 +2644,12 @@ xfs_remove(
                 goto out_trans_cancel;
         }
  
+       /*
+        * Drop the link from dp to ip, and if ip was a directory, remove the
+        * '.' and '..' references since we freed the directory.
+        */
+       xfs_dir_update_hook(dp, ip, -1, name);
+
         /*
          * If this is a synchronous mount, make sure that the
          * remove transaction goes to disk before returning to
@@ -2752,6 +2840,20 @@ xfs_cross_rename(
         }
         xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
         xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE);
+
+       /*
+        * Inform our hook clients that we've finished an exchange operation as
+        * follows: removed the source and target files from their directories;
+        * added the target to the source directory; and added the source to
+        * the target directory.  All inodes are locked, so it's ok to model a
+        * rename this way so long as we say we deleted entries before we add
+        * new ones.
+        */
+       xfs_dir_update_hook(dp1, ip1, -1, name1);
+       xfs_dir_update_hook(dp2, ip2, -1, name2);
+       xfs_dir_update_hook(dp1, ip2, 1, name1);
+       xfs_dir_update_hook(dp2, ip1, 1, name2);
+
         return xfs_finish_rename(tp);
  
  out_trans_abort:
@@ -3135,6 +3237,21 @@ retry:
         if (new_parent)
                 xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
  
+       /*
+        * Inform our hook clients that we've finished a rename operation as
+        * follows: removed the source and target files from their directories;
+        * added the source to the target directory; and finally added the
+        * whiteout, if there was one.  All inodes are locked, so it's ok to
+        * model a rename this way so long as we say we deleted entries before
+        * we add new ones.
+        */
+       if (target_ip)
+               xfs_dir_update_hook(target_dp, target_ip, -1, target_name);
+       xfs_dir_update_hook(src_dp, src_ip, -1, src_name);
+       xfs_dir_update_hook(target_dp, src_ip, 1, target_name);
+       if (wip)
+               xfs_dir_update_hook(src_dp, wip, 1, src_name);
+
         error = xfs_finish_rename(tp);
         if (wip)
                 xfs_irele(wip);
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h

index 7bbdc7009e7d0d869414988c691e569331d7390c..ab46ffb3ac19ee91c8f12ea9980fa7d5c410ad93 100644 (file)
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -171,6 +171,12 @@ static inline struct inode *VFS_I(struct xfs_inode *ip)
         return &ip->i_vnode;
  }
  
+/*
+ * convert from const xfs inode to const vfs inode
+ *
+ * Returns the address of the i_vnode member embedded in @ip, keeping the
+ * const qualifier so that callers holding only a const xfs_inode pointer
+ * (such as the directory update hook parameters) can read VFS inode fields.
+ */
+static inline const struct inode *VFS_IC(const struct xfs_inode *ip)
+{
+       return &ip->i_vnode;
+}
+
  /*
   * For regular files we only update the on-disk filesize when actually
   * writing data back to disk.  Until then only the copy in the VFS inode
@@ -626,4 +632,29 @@ bool xfs_ifork_zapped(const struct xfs_inode *ip, int whichfork);
  void xfs_inode_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip,
                 xfs_filblks_t *dblocks, xfs_filblks_t *rblocks);
  
+struct xfs_dir_update_params { /* argument passed to directory update hook functions */
+       const struct xfs_inode  *dp;    /* directory in which the entry changed */
+       const struct xfs_inode  *ip;    /* inode that the entry refers to */
+       const struct xfs_name   *name;  /* name of the entry */
+       int                     delta;  /* callers in this patch pass 1 (entry added) or -1 (entry removed) */
+};
+
+#ifdef CONFIG_XFS_LIVE_HOOKS
+void xfs_dir_update_hook(struct xfs_inode *dp, struct xfs_inode *ip,
+               int delta, const struct xfs_name *name);
+
+struct xfs_dir_hook { /* handle that a client passes to xfs_dir_hook_setup/add/del */
+       struct xfs_hook         dirent_hook;    /* generic hook; callers reach its notifier block as dirent_hook.nb */
+};
+
+void xfs_dir_hook_disable(void);
+void xfs_dir_hook_enable(void);
+
+int xfs_dir_hook_add(struct xfs_mount *mp, struct xfs_dir_hook *hook);
+void xfs_dir_hook_del(struct xfs_mount *mp, struct xfs_dir_hook *hook);
+void xfs_dir_hook_setup(struct xfs_dir_hook *hook, notifier_fn_t mod_fn);
+#else
+# define xfs_dir_update_hook(dp, ip, delta, name)      ((void)0)
+#endif /* CONFIG_XFS_LIVE_HOOKS */
+
  #endif /* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h

index 503fe3c7edbf82cd1f194b3fd3d33b27bb06167d..e86dfe67894fb2dbaeb662c09151840e54719092 100644 (file)
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -252,6 +252,9 @@ typedef struct xfs_mount {
  
         /* cpus that have inodes queued for inactivation */
         struct cpumask          m_inodegc_cpumask;
+
+       /* Hook to feed dirent updates to an active online repair. */
+       struct xfs_hooks        m_dir_update_hooks;
  } xfs_mount_t;
  
  #define M_IGEO(mp)             (&(mp)->m_ino_geo)
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c

index b31652fa70040a72670fd85429bcbc0ef85b372e..74e87ed5eee18e2c9f81f47c2400760505d59b12 100644 (file)
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -2011,6 +2011,8 @@ static int xfs_init_fs_context(
         mp->m_logbsize = -1;
         mp->m_allocsize_log = 16; /* 64k */
  
+       xfs_hooks_init(&mp->m_dir_update_hooks);
+
         fc->s_fs_info = mp;
         fc->ops = &xfs_context_ops;
  
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c

index c2dc8c501bdc53e86296c6d00a5cbedc10be10f5..e73692fbe1792578bf0fa66d8b9c0ec671e6ee6e 100644 (file)
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -322,6 +322,7 @@ xfs_symlink(
                 goto out_trans_cancel;
         xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
         xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+       xfs_dir_update_hook(dp, ip, 1, link_name);
  
         /*
          * If this is a synchronous mount, make sure that the
author	Darrick J. Wong <djwong@kernel.org>
	Thu, 22 Feb 2024 20:30:59 +0000 (12:30 -0800)
committer	Darrick J. Wong <djwong@kernel.org>
	Thu, 22 Feb 2024 20:30:59 +0000 (12:30 -0800)
fs/xfs/scrub/common.c		patch \| blob \| blame \| history
fs/xfs/scrub/nlinks.c		patch \| blob \| blame \| history
fs/xfs/scrub/nlinks.h		patch \| blob \| blame \| history
fs/xfs/scrub/scrub.c		patch \| blob \| blame \| history
fs/xfs/scrub/scrub.h		patch \| blob \| blame \| history
fs/xfs/scrub/trace.h		patch \| blob \| blame \| history
fs/xfs/xfs_inode.c		patch \| blob \| blame \| history
fs/xfs/xfs_inode.h		patch \| blob \| blame \| history
fs/xfs/xfs_mount.h		patch \| blob \| blame \| history
fs/xfs/xfs_super.c		patch \| blob \| blame \| history
fs/xfs/xfs_symlink.c		patch \| blob \| blame \| history