From a48373e7d35a89f6f9b39f0d0da9bf158af054ee Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 20 Jan 2026 18:06:45 -0800 Subject: [PATCH] xfs: start creating infrastructure for health monitoring Start creating helper functions and infrastructure to pass filesystem health events to a health monitoring file. Since this is an administrative interface, we only support a single health monitor process per filesystem, so we don't need to use anything fancy such as notifier chains (== tons of indirect calls). Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/Makefile | 1 + fs/xfs/libxfs/xfs_fs.h | 7 ++ fs/xfs/xfs_health.c | 1 + fs/xfs/xfs_healthmon.c | 262 +++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_healthmon.h | 36 ++++++ fs/xfs/xfs_ioctl.c | 4 + fs/xfs/xfs_mount.c | 2 + fs/xfs/xfs_mount.h | 4 + 8 files changed, 317 insertions(+) create mode 100644 fs/xfs/xfs_healthmon.c create mode 100644 fs/xfs/xfs_healthmon.h diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 5bf501cf82717..1b7385e23b346 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -88,6 +88,7 @@ xfs-y += xfs_aops.o \ xfs_globals.o \ xfs_handle.o \ xfs_health.o \ + xfs_healthmon.o \ xfs_icache.o \ xfs_ioctl.o \ xfs_iomap.o \ diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 12463ba766da0..c58e55b3df409 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -1003,6 +1003,12 @@ struct xfs_rtgroup_geometry { #define XFS_RTGROUP_GEOM_SICK_RMAPBT (1U << 3) /* reverse mappings */ #define XFS_RTGROUP_GEOM_SICK_REFCNTBT (1U << 4) /* reference counts */ +struct xfs_health_monitor { + __u64 flags; /* flags */ + __u8 format; /* output format */ + __u8 pad[23]; /* zeroes */ +}; + /* * ioctl commands that are used by Linux filesystems */ @@ -1042,6 +1048,7 @@ struct xfs_rtgroup_geometry { #define XFS_IOC_GETPARENTS_BY_HANDLE _IOWR('X', 63, struct xfs_getparents_by_handle) #define XFS_IOC_SCRUBV_METADATA _IOWR('X', 64, struct xfs_scrub_vec_head) 
 #define XFS_IOC_RTGROUP_GEOMETRY _IOWR('X', 65, struct xfs_rtgroup_geometry) +#define XFS_IOC_HEALTH_MONITOR _IOW ('X', 68, struct xfs_health_monitor) /* * ioctl commands that replace IRIX syssgi()'s diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c index fbb8886c72fe5..3d50397f8f7c0 100644 --- a/fs/xfs/xfs_health.c +++ b/fs/xfs/xfs_health.c @@ -19,6 +19,7 @@ #include "xfs_da_btree.h" #include "xfs_quota_defs.h" #include "xfs_rtgroup.h" +#include "xfs_healthmon.h" #include diff --git a/fs/xfs/xfs_healthmon.c b/fs/xfs/xfs_healthmon.c new file mode 100644 index 0000000000000..b7095ea55897c --- /dev/null +++ b/fs/xfs/xfs_healthmon.c @@ -0,0 +1,262 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2024-2026 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_trace.h" +#include "xfs_ag.h" +#include "xfs_btree.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_quota_defs.h" +#include "xfs_rtgroup.h" +#include "xfs_healthmon.h" + +#include +#include +#include + +/* + * Live Health Monitoring + * ====================== + * + * Autonomous self-healing of XFS filesystems requires a means for the kernel + * to send filesystem health events to a monitoring daemon in userspace. To + * accomplish this, we establish an anonymous file (anon_inode_getfd) to handle + * translating internal events about filesystem health into a format that can + * be parsed easily by userspace. When those internal events occur, the core + * filesystem code calls this health monitor to convey the events to userspace. + * Userspace reads events from the file descriptor returned by the ioctl. 
+ * + * The healthmon abstraction has a weak reference to the host filesystem mount + * so that the queueing and processing of the events do not pin the mount and + * cannot slow down the main filesystem. The healthmon object can exist past + * the end of the filesystem mount. + */ + +/* sign of a detached health monitor */ +#define DETACHED_MOUNT_COOKIE ((uintptr_t)0) + +/* spinlock for atomically updating xfs_mount <-> xfs_healthmon pointers */ +static DEFINE_SPINLOCK(xfs_healthmon_lock); + +/* Get a ref to the healthmon attached to a mount; NULL if none or it is dying. */ +static struct xfs_healthmon * +xfs_healthmon_get( + struct xfs_mount *mp) +{ + struct xfs_healthmon *hm; + + rcu_read_lock(); + hm = mp->m_healthmon; + if (hm && !refcount_inc_not_zero(&hm->ref)) + hm = NULL; + rcu_read_unlock(); + + return hm; +} + +/* + * Release the reference to a healthmon object. If there are no more holders, + * free the health monitor after an RCU grace period to eliminate possibility + * of races with xfs_healthmon_get. + */ +static void +xfs_healthmon_put( + struct xfs_healthmon *hm) +{ + if (refcount_dec_and_test(&hm->ref)) + kfree_rcu_mightsleep(hm); +} + +/* Attach a health monitor to an xfs_mount; -EEXIST if one is already set. */ +STATIC int +xfs_healthmon_attach( + struct xfs_mount *mp, + struct xfs_healthmon *hm) +{ + spin_lock(&xfs_healthmon_lock); + if (mp->m_healthmon != NULL) { + spin_unlock(&xfs_healthmon_lock); + return -EEXIST; + } + + refcount_inc(&hm->ref); + mp->m_healthmon = hm; + hm->mount_cookie = (uintptr_t)mp->m_super; + spin_unlock(&xfs_healthmon_lock); + + return 0; +} + +/* Detach this healthmon from its xfs mount, if any, and drop the ref held by the mount. 
*/ +STATIC void +xfs_healthmon_detach( + struct xfs_healthmon *hm) +{ + spin_lock(&xfs_healthmon_lock); + if (hm->mount_cookie == DETACHED_MOUNT_COOKIE) { + spin_unlock(&xfs_healthmon_lock); + return; + } + + XFS_M((struct super_block *)hm->mount_cookie)->m_healthmon = NULL; + hm->mount_cookie = DETACHED_MOUNT_COOKIE; + spin_unlock(&xfs_healthmon_lock); + + xfs_healthmon_put(hm); +} + +/* Detach the xfs mount from this healthmon instance. */ +void +xfs_healthmon_unmount( + struct xfs_mount *mp) +{ + struct xfs_healthmon *hm = xfs_healthmon_get(mp); + + if (!hm) + return; + + xfs_healthmon_detach(hm); + xfs_healthmon_put(hm); +} + +STATIC ssize_t +xfs_healthmon_read_iter( + struct kiocb *iocb, + struct iov_iter *to) +{ + return -EIO; /* every read fails with -EIO here */ +} + +/* Detach from the mount (if still attached) and drop the ref held by the fd. */ +STATIC int +xfs_healthmon_release( + struct inode *inode, + struct file *file) +{ + struct xfs_healthmon *hm = file->private_data; + + /* + * We might be closing the healthmon file before the filesystem + * unmounts, because userspace processes can terminate at any time and + * for any reason. Null out xfs_mount::m_healthmon so that another + * process can create another health monitor file. + */ + xfs_healthmon_detach(hm); + + xfs_healthmon_put(hm); + return 0; +} + +/* Validate ioctl parameters. */ +static inline bool +xfs_healthmon_validate( + const struct xfs_health_monitor *hmo) +{ + if (hmo->flags) + return false; + if (hmo->format) + return false; + if (memchr_inv(&hmo->pad, 0, sizeof(hmo->pad))) + return false; + return true; +} + +/* Emit some data about the health monitoring fd. */ +static void +xfs_healthmon_show_fdinfo( + struct seq_file *m, + struct file *file) +{ + struct xfs_healthmon *hm = file->private_data; + + seq_printf(m, "state:\t%s\ndev:\t%d:%d\n", + hm->mount_cookie == DETACHED_MOUNT_COOKIE ? 
+ "dead" : "alive", + MAJOR(hm->dev), MINOR(hm->dev)); +} + +static const struct file_operations xfs_healthmon_fops = { + .owner = THIS_MODULE, + .show_fdinfo = xfs_healthmon_show_fdinfo, + .read_iter = xfs_healthmon_read_iter, + .release = xfs_healthmon_release, +}; + +/* + * Create a health monitoring file. Returns an index to the fd table or a + * negative errno. + */ +long +xfs_ioc_health_monitor( + struct file *file, + struct xfs_health_monitor __user *arg) +{ + struct xfs_health_monitor hmo; + struct xfs_healthmon *hm; + struct xfs_inode *ip = XFS_I(file_inode(file)); + struct xfs_mount *mp = ip->i_mount; + int ret; + + /* + * The only intended user of the health monitoring system should be the + * xfs_healer daemon running on behalf of the whole filesystem in the + * initial user namespace. IOWs, we don't allow unprivileged userspace + * (they can use fsnotify) nor do we allow containers. + */ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (ip->i_ino != mp->m_sb.sb_rootino) + return -EPERM; + if (current_user_ns() != &init_user_ns) + return -EPERM; + + if (copy_from_user(&hmo, arg, sizeof(hmo))) + return -EFAULT; + + if (!xfs_healthmon_validate(&hmo)) + return -EINVAL; + + hm = kzalloc(sizeof(*hm), GFP_KERNEL); + if (!hm) + return -ENOMEM; + hm->dev = mp->m_super->s_dev; + refcount_set(&hm->ref, 1); + + /* + * Try to attach this health monitor to the xfs_mount. The monitor is + * considered live and will receive events if this succeeds. + */ + ret = xfs_healthmon_attach(mp, hm); + if (ret) + goto out_hm; + + /* + * Create the anonymous file and install a fd for it. If it succeeds, + * the file owns hm and can go away at any time, so we must not access + * it again. This must go last because we can't undo a fd table + * installation. 
+ */ + ret = anon_inode_getfd("xfs_healthmon", &xfs_healthmon_fops, hm, + O_CLOEXEC | O_RDONLY); + if (ret < 0) + goto out_mp; + + return ret; + +out_mp: + xfs_healthmon_detach(hm); +out_hm: + ASSERT(refcount_read(&hm->ref) == 1); + xfs_healthmon_put(hm); + return ret; +} diff --git a/fs/xfs/xfs_healthmon.h b/fs/xfs/xfs_healthmon.h new file mode 100644 index 0000000000000..218d5aac87b01 --- /dev/null +++ b/fs/xfs/xfs_healthmon.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2024-2026 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#ifndef __XFS_HEALTHMON_H__ +#define __XFS_HEALTHMON_H__ + +struct xfs_healthmon { + /* + * Weak reference to the xfs filesystem that is being monitored. It + * will be set to zero when the filesystem detaches from the monitor. + * Do not dereference this super_block address without xfs_healthmon_lock. + */ + uintptr_t mount_cookie; + + /* + * Device number of the filesystem being monitored. This is for + * consistent tracing even after unmount. + */ + dev_t dev; + + /* + * Reference count of this structure. The open healthmon fd holds one + * ref, the xfs_mount holds another ref if it points to this object, + * and running event handlers hold their own refs. 
+ */ + refcount_t ref; +}; + +void xfs_healthmon_unmount(struct xfs_mount *mp); + +long xfs_ioc_health_monitor(struct file *file, + struct xfs_health_monitor __user *arg); + +#endif /* __XFS_HEALTHMON_H__ */ diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 59eaad7743718..c04c41ca924e3 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -41,6 +41,7 @@ #include "xfs_exchrange.h" #include "xfs_handle.h" #include "xfs_rtgroup.h" +#include "xfs_healthmon.h" #include #include @@ -1419,6 +1420,9 @@ xfs_file_ioctl( case XFS_IOC_COMMIT_RANGE: return xfs_ioc_commit_range(filp, arg); + case XFS_IOC_HEALTH_MONITOR: + return xfs_ioc_health_monitor(filp, arg); + default: return -ENOTTY; } diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 0953f6ae94abc..ab67c91915384 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -41,6 +41,7 @@ #include "xfs_rtrefcount_btree.h" #include "scrub/stats.h" #include "xfs_zone_alloc.h" +#include "xfs_healthmon.h" static DEFINE_MUTEX(xfs_uuid_table_mutex); static int xfs_uuid_table_size; @@ -625,6 +626,7 @@ xfs_unmount_flush_inodes( cancel_delayed_work_sync(&mp->m_reclaim_work); xfs_reclaim_inodes(mp); xfs_health_unmount(mp); + xfs_healthmon_unmount(mp); } static void diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index b871dfde372b5..61c71128d171c 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -13,6 +13,7 @@ struct xfs_ail; struct xfs_quotainfo; struct xfs_da_geometry; struct xfs_perag; +struct xfs_healthmon; /* dynamic preallocation free space thresholds, 5% down to 1% */ enum { @@ -342,6 +343,9 @@ typedef struct xfs_mount { /* Hook to feed dirent updates to an active online repair. */ struct xfs_hooks m_dir_update_hooks; + + /* Private data referring to a health monitor object. */ + struct xfs_healthmon *m_healthmon; } xfs_mount_t; #define M_IGEO(mp) (&(mp)->m_ino_geo) -- 2.47.3