]> git.ipfire.org Git - thirdparty/linux.git/commitdiff
Merge tag 'health-monitoring-7.0_2026-01-20' of https://git.kernel.org/pub/scm/linux...
authorCarlos Maiolino <cem@kernel.org>
Wed, 28 Jan 2026 08:47:42 +0000 (09:47 +0100)
committerCarlos Maiolino <cem@kernel.org>
Wed, 28 Jan 2026 09:02:20 +0000 (10:02 +0100)
xfs: autonomous self healing of filesystems [v7]

This patchset builds new functionality to deliver live information about
filesystem health events to userspace.  This is done by creating an
anonymous file that can be read() for events by userspace programs.
Events are captured by hooking various parts of XFS and iomap so that
metadata health failures, file I/O errors, and major changes in
filesystem state (unmounts, shutdowns, etc.) can be observed by
programs.

When an event occurs, the hook functions queue an event object to each
event anonfd for later processing.  Programs must have CAP_SYS_ADMIN
to open the anonfd and there's a maximum event lag to prevent resource
overconsumption.  The events themselves can be read() from the anonfd
as C structs for the xfs_healer daemon.

In userspace, we create a new daemon program that will read the event
objects and initiate repairs automatically.  This daemon is managed
entirely by systemd and will not block unmounting of the filesystem
unless repairs are ongoing.  Daemon instances are auto-started by a
starter service that uses fanotify.

This patchset depends on the new fserror code that Christian Brauner
has tentatively accepted for Linux 7.0:
https://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs.git/log/?h=vfs-7.0.fserror

v7: more cleanups of the media verification ioctl, improve comments, and
    reuse the bio
v6: fix pi-breaking bugs, make verify failures trigger health reports
    and filter bio status flags better
v5: add verify-media ioctl, collapse small helper funcs with only
    one caller
v4: drop multiple client support so we can make direct calls into
    healthmon instead of chasing pointers and doing indirect calls
v3: drag out of rfc status

With a bit of luck, this should all go splendidly.

Conflicts:
This merge required an update on files:
- fs/xfs/xfs_healthmon.c
- fs/xfs/xfs_verify_media.c
This change was required because a parallel development renamed the
XFS header file xfs.h to xfs_platform.h, so the merge had to update
the corresponding includes in both of the files above.

Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
Signed-off-by: Carlos Maiolino <cem@kernel.org>
1  2 
fs/iomap/buffered-io.c
fs/xfs/xfs_fsops.c
fs/xfs/xfs_health.c
fs/xfs/xfs_healthmon.c
fs/xfs/xfs_ioctl.c
fs/xfs/xfs_mount.c
fs/xfs/xfs_notify_failure.c
fs/xfs/xfs_platform.h
fs/xfs/xfs_super.c
fs/xfs/xfs_trace.c
fs/xfs/xfs_verify_media.c

Simple merge
Simple merge
Simple merge
index 0000000000000000000000000000000000000000,3030fa93c1e5753adb9761cc50a07c54f815dee3..ca7352dcd182fb602ee3940f0a5ac12a3a4c9670
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,1255 +1,1255 @@@
 -#include "xfs.h"
+ // SPDX-License-Identifier: GPL-2.0-or-later
+ /*
+  * Copyright (c) 2024-2026 Oracle.  All Rights Reserved.
+  * Author: Darrick J. Wong <djwong@kernel.org>
+  */
++#include "xfs_platform.h"
+ #include "xfs_fs.h"
+ #include "xfs_shared.h"
+ #include "xfs_format.h"
+ #include "xfs_log_format.h"
+ #include "xfs_trans_resv.h"
+ #include "xfs_mount.h"
+ #include "xfs_inode.h"
+ #include "xfs_trace.h"
+ #include "xfs_ag.h"
+ #include "xfs_btree.h"
+ #include "xfs_da_format.h"
+ #include "xfs_da_btree.h"
+ #include "xfs_quota_defs.h"
+ #include "xfs_rtgroup.h"
+ #include "xfs_health.h"
+ #include "xfs_healthmon.h"
+ #include "xfs_fsops.h"
+ #include "xfs_notify_failure.h"
+ #include "xfs_file.h"
+ #include "xfs_ioctl.h"
+ #include <linux/anon_inodes.h>
+ #include <linux/eventpoll.h>
+ #include <linux/poll.h>
+ #include <linux/fserror.h>
+ /*
+  * Live Health Monitoring
+  * ======================
+  *
+  * Autonomous self-healing of XFS filesystems requires a means for the kernel
+  * to send filesystem health events to a monitoring daemon in userspace.  To
+  * accomplish this, we establish a thread_with_file kthread object to handle
+  * translating internal events about filesystem health into a format that can
+  * be parsed easily by userspace.  When those internal events occur, the core
+  * filesystem code calls this health monitor to convey the events to userspace.
+  * Userspace reads events from the file descriptor returned by the ioctl.
+  *
+  * The healthmon abstraction has a weak reference to the host filesystem mount
+  * so that the queueing and processing of the events do not pin the mount and
+  * cannot slow down the main filesystem.  The healthmon object can exist past
+  * the end of the filesystem mount.
+  */
+ /*
+  * Sentinel mount_cookie value for a detached health monitor.
+  * xfs_healthmon_detach stores this once the monitor no longer points at a
+  * mount.
+  */
+ #define DETACHED_MOUNT_COOKIE         ((uintptr_t)0)
+ /*
+  * Constrain the number of event objects that can build up in memory: as many
+  * event objects as fit in 32K.  Checked before a new event is allocated.
+  */
+ #define XFS_HEALTHMON_MAX_EVENTS      (SZ_32K / \
+                                        sizeof(struct xfs_healthmon_event))
+ /*
+  * Constrain the size of the output buffer for read_iter.  Applied as the
+  * upper bound in xfs_healthmon_alloc_outbuf.
+  */
+ #define XFS_HEALTHMON_MAX_OUTBUF      SZ_64K
+ /*
+  * Spinlock for atomically updating xfs_mount <-> xfs_healthmon pointers.
+  * Held by xfs_healthmon_attach and xfs_healthmon_detach while they change
+  * mp->m_healthmon and hm->mount_cookie.
+  */
+ static DEFINE_SPINLOCK(xfs_healthmon_lock);
+ /*
+  * Look up the health monitor attached to @mp and take a reference to it.
+  * Returns NULL if no monitor is attached or if its refcount has already
+  * dropped to zero.  A non-NULL return must be released with
+  * xfs_healthmon_put.
+  */
+ static struct xfs_healthmon *
+ xfs_healthmon_get(
+       struct xfs_mount                *mp)
+ {
+       struct xfs_healthmon            *candidate;
+       struct xfs_healthmon            *ret = NULL;
+       rcu_read_lock();
+       candidate = mp->m_healthmon;
+       if (candidate && refcount_inc_not_zero(&candidate->ref))
+               ret = candidate;
+       rcu_read_unlock();
+       return ret;
+ }
+ /*
+  * Drop one reference to a healthmon object.  When the last reference goes
+  * away, free every event still on the queue, the unsent unmount event, and
+  * the output buffer, then free the monitor itself after an RCU grace period
+  * so that it cannot race with xfs_healthmon_get.
+  */
+ static void
+ xfs_healthmon_put(
+       struct xfs_healthmon            *hm)
+ {
+       struct xfs_healthmon_event      *cur;
+       struct xfs_healthmon_event      *following;
+       if (!refcount_dec_and_test(&hm->ref))
+               return;
+       for (cur = hm->first_event; cur != NULL; cur = following) {
+               trace_xfs_healthmon_drop(hm, cur);
+               following = cur->next;
+               kfree(cur);
+       }
+       kfree(hm->unmount_event);
+       kfree(hm->buffer);
+       mutex_destroy(&hm->lock);
+       kfree_rcu_mightsleep(hm);
+ }
+ /*
+  * Attach a health monitor to an xfs_mount.  Only one monitor may be attached
+  * at a time; returns -EEXIST if the mount already has one.  On success the
+  * mount holds its own reference to @hm and the monitor records the mount's
+  * superblock as its mount cookie.
+  */
+ STATIC int
+ xfs_healthmon_attach(
+       struct xfs_mount        *mp,
+       struct xfs_healthmon    *hm)
+ {
+       int                     error = -EEXIST;
+       spin_lock(&xfs_healthmon_lock);
+       if (mp->m_healthmon == NULL) {
+               refcount_inc(&hm->ref);
+               mp->m_healthmon = hm;
+               hm->mount_cookie = (uintptr_t)mp->m_super;
+               error = 0;
+       }
+       spin_unlock(&xfs_healthmon_lock);
+       return error;
+ }
+ /*
+  * Detach a xfs mount from a specific healthmon instance.  Does nothing if
+  * the monitor is already detached; otherwise clears the mount's pointer,
+  * marks the monitor detached, and drops the reference taken at attach time.
+  */
+ STATIC void
+ xfs_healthmon_detach(
+       struct xfs_healthmon    *hm)
+ {
+       struct super_block      *sb;
+       bool                    was_attached;
+       spin_lock(&xfs_healthmon_lock);
+       was_attached = hm->mount_cookie != DETACHED_MOUNT_COOKIE;
+       if (was_attached) {
+               sb = (struct super_block *)hm->mount_cookie;
+               XFS_M(sb)->m_healthmon = NULL;
+               hm->mount_cookie = DETACHED_MOUNT_COOKIE;
+       }
+       spin_unlock(&xfs_healthmon_lock);
+       if (!was_attached)
+               return;
+       trace_xfs_healthmon_detach(hm);
+       xfs_healthmon_put(hm);
+ }
+ /* Count one more queued event in both the current and the running totals. */
+ static inline void
+ xfs_healthmon_bump_events(
+       struct xfs_healthmon    *hm)
+ {
+       hm->total_events++;
+       hm->events++;
+ }
+ /*
+  * Count one more lost event, both in the pending counter that the next push
+  * reports and resets, and in the running total.
+  */
+ static inline void
+ xfs_healthmon_bump_lost(
+       struct xfs_healthmon    *hm)
+ {
+       hm->total_lost++;
+       hm->lost_prev_event++;
+ }
+ /*
+  * If possible, merge a new event into an existing event.  Returns whether or
+  * not it merged anything.
+  *
+  * @existing is the queued event to merge into (callers pass hm->last_event,
+  * which may be NULL); @new is the candidate and is never modified.  Events
+  * only merge when both type and domain match:
+  *  - LOST events sum their lost counts;
+  *  - SICK/CORRUPT/HEALTHY events OR their masks together when they refer to
+  *    the same fs, the same group number, or the same inode number and
+  *    generation;
+  *  - SHUTDOWN events OR their flags together;
+  *  - MEDIA_ERROR events and file I/O events coalesce only when the two
+  *    ranges are exactly adjacent (and, for files, share ino and gen);
+  *  - RUNNING and UNMOUNT events never merge.
+  *
+  * NOTE(review): the file range merge does not compare the error field, so a
+  * merged range reports only the existing event's error -- confirm that this
+  * is intended.
+  */
+ static bool
+ xfs_healthmon_merge_events(
+       struct xfs_healthmon_event              *existing,
+       const struct xfs_healthmon_event        *new)
+ {
+       if (!existing)
+               return false;
+       /* type and domain must match to merge events */
+       if (existing->type != new->type ||
+           existing->domain != new->domain)
+               return false;
+       switch (existing->type) {
+       case XFS_HEALTHMON_RUNNING:
+       case XFS_HEALTHMON_UNMOUNT:
+               /* should only ever be one of these events anyway */
+               return false;
+       case XFS_HEALTHMON_LOST:
+               existing->lostcount += new->lostcount;
+               return true;
+       case XFS_HEALTHMON_SICK:
+       case XFS_HEALTHMON_CORRUPT:
+       case XFS_HEALTHMON_HEALTHY:
+               switch (existing->domain) {
+               case XFS_HEALTHMON_FS:
+                       existing->fsmask |= new->fsmask;
+                       return true;
+               case XFS_HEALTHMON_AG:
+               case XFS_HEALTHMON_RTGROUP:
+                       if (existing->group == new->group){
+                               existing->grpmask |= new->grpmask;
+                               return true;
+                       }
+                       return false;
+               case XFS_HEALTHMON_INODE:
+                       if (existing->ino == new->ino &&
+                           existing->gen == new->gen) {
+                               existing->imask |= new->imask;
+                               return true;
+                       }
+                       return false;
+               default:
+                       ASSERT(0);
+                       return false;
+               }
+               return false;
+       case XFS_HEALTHMON_SHUTDOWN:
+               /* yes, we can race to shutdown */
+               existing->flags |= new->flags;
+               return true;
+       case XFS_HEALTHMON_MEDIA_ERROR:
+               /* physically adjacent errors can merge; new may follow or precede existing */
+               if (existing->daddr + existing->bbcount == new->daddr) {
+                       existing->bbcount += new->bbcount;
+                       return true;
+               }
+               if (new->daddr + new->bbcount == existing->daddr) {
+                       existing->daddr = new->daddr;
+                       existing->bbcount += new->bbcount;
+                       return true;
+               }
+               return false;
+       case XFS_HEALTHMON_BUFREAD:
+       case XFS_HEALTHMON_BUFWRITE:
+       case XFS_HEALTHMON_DIOREAD:
+       case XFS_HEALTHMON_DIOWRITE:
+       case XFS_HEALTHMON_DATALOST:
+               /* logically adjacent file ranges of the same file can merge */
+               if (existing->fino != new->fino || existing->fgen != new->fgen)
+                       return false;
+               if (existing->fpos + existing->flen == new->fpos) {
+                       existing->flen += new->flen;
+                       return true;
+               }
+               if (new->fpos + new->flen == existing->fpos) {
+                       existing->fpos = new->fpos;
+                       existing->flen += new->flen;
+                       return true;
+               }
+               return false;
+       }
+       return false;
+ }
+ /*
+  * Insert an event onto the start of the queue so that it is the next event
+  * popped by the reader.  Stamps the event with the current coarse wall-clock
+  * time, bumps the event counters, and wakes anyone waiting on hm->wait.
+  * The event becomes the new first_event; if the queue was empty it also
+  * becomes the last_event.
+  */
+ static inline void
+ __xfs_healthmon_insert(
+       struct xfs_healthmon            *hm,
+       struct xfs_healthmon_event      *event)
+ {
+       struct timespec64               now;
+       ktime_get_coarse_real_ts64(&now);
+       event->time_ns = (now.tv_sec * NSEC_PER_SEC) + now.tv_nsec;
+       /*
+        * Always make the new event the head.  Updating first_event only when
+        * the queue is empty would leave the event unreachable (never
+        * delivered and never freed) whenever other events are already queued.
+        */
+       event->next = hm->first_event;
+       hm->first_event = event;
+       if (!hm->last_event)
+               hm->last_event = event;
+       xfs_healthmon_bump_events(hm);
+       wake_up(&hm->wait);
+       trace_xfs_healthmon_insert(hm, event);
+ }
+ /*
+  * Append an event to the tail of the queue.  Stamps the event with the
+  * current coarse wall-clock time, bumps the event counters, and wakes anyone
+  * waiting on hm->wait.
+  */
+ static inline void
+ __xfs_healthmon_push(
+       struct xfs_healthmon            *hm,
+       struct xfs_healthmon_event      *event)
+ {
+       struct timespec64               ts;
+       ktime_get_coarse_real_ts64(&ts);
+       event->time_ns = (ts.tv_sec * NSEC_PER_SEC) + ts.tv_nsec;
+       /* Link behind the current tail, if there is one. */
+       if (hm->last_event)
+               hm->last_event->next = event;
+       hm->last_event = event;
+       event->next = NULL;
+       /* An empty queue means the new event is also the head. */
+       if (!hm->first_event)
+               hm->first_event = event;
+       xfs_healthmon_bump_events(hm);
+       wake_up(&hm->wait);
+       trace_xfs_healthmon_push(hm, event);
+ }
+ /*
+  * Deal with any previously lost events by queueing a LOST event that carries
+  * the pending lost count.  The count is folded into the newest queued event
+  * if that is already a LOST event; otherwise a new event is allocated.
+  * Returns -ENOMEM, leaving the pending count untouched, if the queue is at
+  * capacity or the allocation fails.
+  */
+ static int
+ xfs_healthmon_clear_lost_prev(
+       struct xfs_healthmon            *hm)
+ {
+       struct xfs_healthmon_event      lost = {
+               .type                   = XFS_HEALTHMON_LOST,
+               .domain                 = XFS_HEALTHMON_MOUNT,
+               .lostcount              = hm->lost_prev_event,
+       };
+       struct xfs_healthmon_event      *copy;
+       if (xfs_healthmon_merge_events(hm->last_event, &lost)) {
+               trace_xfs_healthmon_merge(hm, hm->last_event);
+               wake_up(&hm->wait);
+       } else {
+               if (hm->events >= XFS_HEALTHMON_MAX_EVENTS)
+                       return -ENOMEM;
+               copy = kmemdup(&lost, sizeof(lost), GFP_NOFS);
+               if (!copy)
+                       return -ENOMEM;
+               __xfs_healthmon_push(hm, copy);
+       }
+       hm->lost_prev_event = 0;
+       return 0;
+ }
+ /*
+  * Queue a copy of @template at the end of the event list.  Pending lost
+  * events are reported first, and the new event is merged into the newest
+  * queued event when possible.  Returns -ESHUTDOWN if the monitor is already
+  * detached, or -ENOMEM (after counting the event as lost) if the queue is
+  * full or the allocation fails.
+  */
+ STATIC int
+ xfs_healthmon_push(
+       struct xfs_healthmon                    *hm,
+       const struct xfs_healthmon_event        *template)
+ {
+       struct xfs_healthmon_event              *copy = NULL;
+       int                                     error = 0;
+       /*
+        * Lockless check for a monitor that has already detached from the
+        * mount; such events are ignored.  Losing a race with deactivation
+        * only means that an event gets queued and never sent.
+        */
+       if (hm->mount_cookie == DETACHED_MOUNT_COOKIE)
+               return -ESHUTDOWN;
+       mutex_lock(&hm->lock);
+       /* Lost events must be reported ahead of anything newer. */
+       if (hm->lost_prev_event) {
+               error = xfs_healthmon_clear_lost_prev(hm);
+               if (error)
+                       goto out_unlock;
+       }
+       /* Fold into the newest queued event if the two are compatible. */
+       if (xfs_healthmon_merge_events(hm->last_event, template)) {
+               trace_xfs_healthmon_merge(hm, hm->last_event);
+               wake_up(&hm->wait);
+               goto out_unlock;
+       }
+       /* Allocate a heap copy only while the queue is below capacity. */
+       if (hm->events < XFS_HEALTHMON_MAX_EVENTS)
+               copy = kmemdup(template, sizeof(*copy), GFP_NOFS);
+       if (copy) {
+               __xfs_healthmon_push(hm, copy);
+       } else {
+               /* No event object means the event is lost. */
+               trace_xfs_healthmon_lost_event(hm);
+               xfs_healthmon_bump_lost(hm);
+               error = -ENOMEM;
+       }
+ out_unlock:
+       mutex_unlock(&hm->lock);
+       return error;
+ }
+ /*
+  * Report that the filesystem is being unmounted, then detach the xfs mount
+  * from this healthmon instance.  Does nothing if no monitor is attached.
+  */
+ void
+ xfs_healthmon_unmount(
+       struct xfs_mount                *mp)
+ {
+       struct xfs_healthmon            *hm = xfs_healthmon_get(mp);
+       if (!hm)
+               return;
+       trace_xfs_healthmon_report_unmount(hm);
+       /*
+        * Insert the unmount notification at the start of the event queue so
+        * that userspace knows the filesystem went away as soon as possible.
+        * There's nothing actionable for userspace after an unmount.  Once
+        * we've inserted the unmount event, hm no longer owns that event.
+        *
+        * The queue pointers and counters are protected by hm->lock
+        * everywhere else (xfs_healthmon_push, xfs_healthmon_format_pop), so
+        * hold it here too; otherwise the insertion can race with a reader
+        * popping events or with another thread pushing one.
+        */
+       mutex_lock(&hm->lock);
+       __xfs_healthmon_insert(hm, hm->unmount_event);
+       hm->unmount_event = NULL;
+       mutex_unlock(&hm->lock);
+       xfs_healthmon_detach(hm);
+       xfs_healthmon_put(hm);
+ }
+ /*
+  * Compute the reporting mask for non-unmount metadata health events.  A
+  * verbose monitor gets every bit of @new_mask; otherwise the bits reported
+  * depend on the event type.
+  */
+ static inline unsigned int
+ metadata_event_mask(
+       struct xfs_healthmon            *hm,
+       enum xfs_healthmon_type         type,
+       unsigned int                    old_mask,
+       unsigned int                    new_mask)
+ {
+       unsigned int                    mask = 0;
+       /* Verbose monitors want everything. */
+       if (hm->verbose)
+               return new_mask;
+       switch (type) {
+       case XFS_HEALTHMON_SICK:
+               /* Runtime corruptions are always reported. */
+               mask = new_mask;
+               break;
+       case XFS_HEALTHMON_CORRUPT:
+               /* Only fsck errors that were not already known. */
+               mask = new_mask & ~old_mask;
+               break;
+       case XFS_HEALTHMON_HEALTHY:
+               /* Only metadata that was previously marked and got fixed. */
+               mask = new_mask & old_mask;
+               break;
+       default:
+               ASSERT(0);
+               break;
+       }
+       return mask;
+ }
+ /*
+  * Report XFS_FS_SICK_* events to healthmon.  Secondary health flags are
+  * filtered out, and nothing is queued if the resulting mask is empty or if
+  * no health monitor is attached to @mp.
+  */
+ void
+ xfs_healthmon_report_fs(
+       struct xfs_mount                *mp,
+       enum xfs_healthmon_type         type,
+       unsigned int                    old_mask,
+       unsigned int                    new_mask)
+ {
+       struct xfs_healthmon_event      ev = {
+               .domain                 = XFS_HEALTHMON_FS,
+               .type                   = type,
+       };
+       struct xfs_healthmon            *hm;
+       unsigned int                    mask;
+       hm = xfs_healthmon_get(mp);
+       if (!hm)
+               return;
+       mask = metadata_event_mask(hm, type, old_mask, new_mask);
+       ev.fsmask = mask & ~XFS_SICK_FS_SECONDARY;
+       trace_xfs_healthmon_report_fs(hm, old_mask, new_mask, &ev);
+       if (ev.fsmask != 0)
+               xfs_healthmon_push(hm, &ev);
+       xfs_healthmon_put(hm);
+ }
+ /*
+  * Report XFS_SICK_(AG|RG)* flags to healthmon.  The event domain and the
+  * set of secondary flags to filter out are chosen from the group type.
+  * Nothing is queued if the resulting mask is empty or if no health monitor
+  * is attached to the group's mount.
+  */
+ void
+ xfs_healthmon_report_group(
+       struct xfs_group                *xg,
+       enum xfs_healthmon_type         type,
+       unsigned int                    old_mask,
+       unsigned int                    new_mask)
+ {
+       struct xfs_healthmon_event      ev = {
+               .type                   = type,
+               .group                  = xg->xg_gno,
+       };
+       struct xfs_healthmon            *hm;
+       unsigned int                    mask;
+       hm = xfs_healthmon_get(xg->xg_mount);
+       if (!hm)
+               return;
+       switch (xg->xg_type) {
+       case XG_TYPE_AG:
+               ev.domain = XFS_HEALTHMON_AG;
+               mask = metadata_event_mask(hm, type, old_mask, new_mask);
+               ev.grpmask = mask & ~XFS_SICK_AG_SECONDARY;
+               break;
+       case XG_TYPE_RTG:
+               ev.domain = XFS_HEALTHMON_RTGROUP;
+               mask = metadata_event_mask(hm, type, old_mask, new_mask);
+               ev.grpmask = mask & ~XFS_SICK_RG_SECONDARY;
+               break;
+       default:
+               ASSERT(0);
+               break;
+       }
+       trace_xfs_healthmon_report_group(hm, old_mask, new_mask, &ev);
+       if (ev.grpmask != 0)
+               xfs_healthmon_push(hm, &ev);
+       xfs_healthmon_put(hm);
+ }
+ /*
+  * Report XFS_SICK_INO_* flags to healthmon.  The event identifies the inode
+  * by number and generation.  Secondary health flags are filtered out, and
+  * nothing is queued if the resulting mask is empty or if no health monitor
+  * is attached to the inode's mount.
+  */
+ void
+ xfs_healthmon_report_inode(
+       struct xfs_inode                *ip,
+       enum xfs_healthmon_type         type,
+       unsigned int                    old_mask,
+       unsigned int                    new_mask)
+ {
+       struct xfs_healthmon_event      ev = {
+               .type                   = type,
+               .domain                 = XFS_HEALTHMON_INODE,
+               .ino                    = ip->i_ino,
+               .gen                    = VFS_I(ip)->i_generation,
+       };
+       struct xfs_healthmon            *hm;
+       unsigned int                    mask;
+       hm = xfs_healthmon_get(ip->i_mount);
+       if (!hm)
+               return;
+       mask = metadata_event_mask(hm, type, old_mask, new_mask);
+       ev.imask = mask & ~XFS_SICK_INO_SECONDARY;
+       trace_xfs_healthmon_report_inode(hm, old_mask, ev.imask, &ev);
+       if (ev.imask != 0)
+               xfs_healthmon_push(hm, &ev);
+       xfs_healthmon_put(hm);
+ }
+ /*
+  * Add a shutdown event carrying the given shutdown flags to the reporting
+  * queue.  Does nothing if no health monitor is attached to @mp.
+  */
+ void
+ xfs_healthmon_report_shutdown(
+       struct xfs_mount                *mp,
+       uint32_t                        flags)
+ {
+       struct xfs_healthmon_event      ev = {
+               .domain                 = XFS_HEALTHMON_MOUNT,
+               .type                   = XFS_HEALTHMON_SHUTDOWN,
+               .flags                  = flags,
+       };
+       struct xfs_healthmon            *hm;
+       hm = xfs_healthmon_get(mp);
+       if (hm) {
+               trace_xfs_healthmon_report_shutdown(hm, flags);
+               xfs_healthmon_push(hm, &ev);
+               xfs_healthmon_put(hm);
+       }
+ }
+ static inline enum xfs_healthmon_domain
+ media_error_domain(
+       enum xfs_device                 fdev)
+ {
+       switch (fdev) {
+       case XFS_DEV_DATA:
+               return XFS_HEALTHMON_DATADEV;
+       case XFS_DEV_LOG:
+               return XFS_HEALTHMON_LOGDEV;
+       case XFS_DEV_RT:
+               return XFS_HEALTHMON_RTDEV;
+       }
+       ASSERT(0);
+       return 0;
+ }
+ /*
+  * Add a media error event to the reporting queue.  The event covers
+  * @bbcount blocks starting at @daddr on the device @fdev.  Does nothing if no
+  * health monitor is attached to @mp.
+  */
+ void
+ xfs_healthmon_report_media(
+       struct xfs_mount                *mp,
+       enum xfs_device                 fdev,
+       xfs_daddr_t                     daddr,
+       uint64_t                        bbcount)
+ {
+       struct xfs_healthmon_event      ev = {
+               .type                   = XFS_HEALTHMON_MEDIA_ERROR,
+               .domain                 = media_error_domain(fdev),
+               .daddr                  = daddr,
+               .bbcount                = bbcount,
+       };
+       struct xfs_healthmon            *hm;
+       hm = xfs_healthmon_get(mp);
+       if (hm) {
+               trace_xfs_healthmon_report_media(hm, fdev, &ev);
+               xfs_healthmon_push(hm, &ev);
+               xfs_healthmon_put(hm);
+       }
+ }
+ /*
+  * Map an fserror action to the matching healthmon file I/O event type.
+  * FSERR_METADATA has no mapping here and trips the assertion.
+  */
+ static inline enum xfs_healthmon_type
+ file_ioerr_type(
+       enum fserror_type               action)
+ {
+       switch (action) {
+       case FSERR_METADATA:
+               /* filtered out by xfs_fs_report_error */
+               break;
+       case FSERR_DATA_LOST:
+               return XFS_HEALTHMON_DATALOST;
+       case FSERR_DIRECTIO_WRITE:
+               return XFS_HEALTHMON_DIOWRITE;
+       case FSERR_DIRECTIO_READ:
+               return XFS_HEALTHMON_DIOREAD;
+       case FSERR_BUFFERED_WRITE:
+               return XFS_HEALTHMON_BUFWRITE;
+       case FSERR_BUFFERED_READ:
+               return XFS_HEALTHMON_BUFREAD;
+       }
+       ASSERT(0);
+       return -1;
+ }
+ /*
+  * Add a file io error event to the reporting queue.  The event identifies
+  * the file by inode number and generation and covers the byte range given in
+  * @p.  Does nothing if no health monitor is attached to the inode's mount.
+  */
+ void
+ xfs_healthmon_report_file_ioerror(
+       struct xfs_inode                *ip,
+       const struct fserror_event      *p)
+ {
+       struct xfs_healthmon_event      ev = {
+               .type                   = file_ioerr_type(p->type),
+               .domain                 = XFS_HEALTHMON_FILERANGE,
+               .fino                   = ip->i_ino,
+               .fgen                   = VFS_I(ip)->i_generation,
+               .fpos                   = p->pos,
+               .flen                   = p->len,
+               /* userspace gets the error number with its sign flipped */
+               .error                  = -p->error,
+       };
+       struct xfs_healthmon            *hm;
+       hm = xfs_healthmon_get(ip->i_mount);
+       if (hm) {
+               trace_xfs_healthmon_report_file_ioerror(hm, p);
+               xfs_healthmon_push(hm, &ev);
+               xfs_healthmon_put(hm);
+       }
+ }
+ /* Rewind both output buffer cursors, marking the outbuf as empty. */
+ static inline void
+ xfs_healthmon_reset_outbuf(
+       struct xfs_healthmon            *hm)
+ {
+       hm->bufhead = 0;
+       hm->buftail = 0;
+ }
+ struct flags_map {
+       unsigned int            in_mask;        /* bit(s) tested against the input flags */
+       unsigned int            out_mask;       /* bit(s) set in the result when in_mask matches */
+ };
+ /* Translate SHUTDOWN_* flags into XFS_HEALTH_SHUTDOWN_* flags. */
+ static const struct flags_map shutdown_map[] = {
+       {
+               .in_mask        = SHUTDOWN_META_IO_ERROR,
+               .out_mask       = XFS_HEALTH_SHUTDOWN_META_IO_ERROR,
+       }, {
+               .in_mask        = SHUTDOWN_LOG_IO_ERROR,
+               .out_mask       = XFS_HEALTH_SHUTDOWN_LOG_IO_ERROR,
+       }, {
+               .in_mask        = SHUTDOWN_FORCE_UMOUNT,
+               .out_mask       = XFS_HEALTH_SHUTDOWN_FORCE_UMOUNT,
+       }, {
+               .in_mask        = SHUTDOWN_CORRUPT_INCORE,
+               .out_mask       = XFS_HEALTH_SHUTDOWN_CORRUPT_INCORE,
+       }, {
+               .in_mask        = SHUTDOWN_CORRUPT_ONDISK,
+               .out_mask       = XFS_HEALTH_SHUTDOWN_CORRUPT_ONDISK,
+       }, {
+               .in_mask        = SHUTDOWN_DEVICE_REMOVED,
+               .out_mask       = XFS_HEALTH_SHUTDOWN_DEVICE_REMOVED,
+       },
+ };
+ static inline unsigned int
+ __map_flags(
+       const struct flags_map  *map,
+       size_t                  array_len,
+       unsigned int            flags)
+ {
+       const struct flags_map  *m;
+       unsigned int            ret = 0;
+       for (m = map; m < map + array_len; m++) {
+               if (flags & m->in_mask)
+                       ret |= m->out_mask;
+       }
+       return ret;
+ }
+ #define map_flags(map, flags) __map_flags((map), ARRAY_SIZE(map), (flags))
+ /* Translate shutdown flags through shutdown_map. */
+ static inline unsigned int
+ shutdown_mask(
+       unsigned int            in)
+ {
+       return __map_flags(shutdown_map, ARRAY_SIZE(shutdown_map), in);
+ }
+ /*
+  * Translate in-kernel XFS_HEALTHMON_* domain codes (the array index) into
+  * the XFS_HEALTH_MONITOR_DOMAIN_* values placed in formatted event records.
+  */
+ static const unsigned int domain_map[] = {
+       /* whole-filesystem scopes */
+       [XFS_HEALTHMON_MOUNT]           = XFS_HEALTH_MONITOR_DOMAIN_MOUNT,
+       [XFS_HEALTHMON_FS]              = XFS_HEALTH_MONITOR_DOMAIN_FS,
+       /* allocation and realtime groups */
+       [XFS_HEALTHMON_AG]              = XFS_HEALTH_MONITOR_DOMAIN_AG,
+       [XFS_HEALTHMON_RTGROUP]         = XFS_HEALTH_MONITOR_DOMAIN_RTGROUP,
+       /* files */
+       [XFS_HEALTHMON_INODE]           = XFS_HEALTH_MONITOR_DOMAIN_INODE,
+       [XFS_HEALTHMON_FILERANGE]       = XFS_HEALTH_MONITOR_DOMAIN_FILERANGE,
+       /* storage devices */
+       [XFS_HEALTHMON_DATADEV]         = XFS_HEALTH_MONITOR_DOMAIN_DATADEV,
+       [XFS_HEALTHMON_LOGDEV]          = XFS_HEALTH_MONITOR_DOMAIN_LOGDEV,
+       [XFS_HEALTHMON_RTDEV]           = XFS_HEALTH_MONITOR_DOMAIN_RTDEV,
+ };
+ /*
+  * Translate in-kernel XFS_HEALTHMON_* event types (the array index) into the
+  * XFS_HEALTH_MONITOR_TYPE_* values placed in formatted event records.
+  */
+ static const unsigned int type_map[] = {
+       /* monitor and mount lifecycle */
+       [XFS_HEALTHMON_RUNNING]         = XFS_HEALTH_MONITOR_TYPE_RUNNING,
+       [XFS_HEALTHMON_LOST]            = XFS_HEALTH_MONITOR_TYPE_LOST,
+       [XFS_HEALTHMON_UNMOUNT]         = XFS_HEALTH_MONITOR_TYPE_UNMOUNT,
+       [XFS_HEALTHMON_SHUTDOWN]        = XFS_HEALTH_MONITOR_TYPE_SHUTDOWN,
+       /* metadata health */
+       [XFS_HEALTHMON_SICK]            = XFS_HEALTH_MONITOR_TYPE_SICK,
+       [XFS_HEALTHMON_CORRUPT]         = XFS_HEALTH_MONITOR_TYPE_CORRUPT,
+       [XFS_HEALTHMON_HEALTHY]         = XFS_HEALTH_MONITOR_TYPE_HEALTHY,
+       /* media and file I/O errors */
+       [XFS_HEALTHMON_MEDIA_ERROR]     = XFS_HEALTH_MONITOR_TYPE_MEDIA_ERROR,
+       [XFS_HEALTHMON_BUFREAD]         = XFS_HEALTH_MONITOR_TYPE_BUFREAD,
+       [XFS_HEALTHMON_BUFWRITE]        = XFS_HEALTH_MONITOR_TYPE_BUFWRITE,
+       [XFS_HEALTHMON_DIOREAD]         = XFS_HEALTH_MONITOR_TYPE_DIOREAD,
+       [XFS_HEALTHMON_DIOWRITE]        = XFS_HEALTH_MONITOR_TYPE_DIOWRITE,
+       [XFS_HEALTHMON_DATALOST]        = XFS_HEALTH_MONITOR_TYPE_DATALOST,
+ };
+ /*
+  * Render event as a V0 structure and append it to the output buffer.
+  *
+  * Translates the in-kernel domain and type through domain_map and type_map,
+  * copies the domain-specific fields into a struct xfs_health_monitor_event,
+  * and advances hm->bufhead by the size of that structure.  Returns
+  * -EFSCORRUPTED if the domain or type is outside the translation tables,
+  * otherwise 0.
+  *
+  * NOTE(review): if the record does not fit in the outbuf, the copy is
+  * skipped but 0 is still returned, so the event is silently dropped.
+  * Callers are expected to have checked for space beforehand (see
+  * xfs_healthmon_format_pop).
+  */
+ STATIC int
+ xfs_healthmon_format_v0(
+       struct xfs_healthmon            *hm,
+       const struct xfs_healthmon_event *event)
+ {
+       struct xfs_health_monitor_event hme = {
+               .time_ns                = event->time_ns,
+       };
+       trace_xfs_healthmon_format(hm, event);
+       if (event->domain < 0 || event->domain >= ARRAY_SIZE(domain_map) ||
+           event->type < 0   || event->type >= ARRAY_SIZE(type_map))
+               return -EFSCORRUPTED;
+       hme.domain = domain_map[event->domain];
+       hme.type = type_map[event->type];
+       /* fill in the event-specific details */
+       switch (event->domain) {
+       case XFS_HEALTHMON_MOUNT:
+               switch (event->type) {
+               case XFS_HEALTHMON_LOST:
+                       hme.e.lost.count = event->lostcount;
+                       break;
+               case XFS_HEALTHMON_SHUTDOWN:
+                       hme.e.shutdown.reasons = shutdown_mask(event->flags);
+                       break;
+               default:
+                       break;
+               }
+               break;
+       case XFS_HEALTHMON_FS:
+               hme.e.fs.mask = xfs_healthmon_fs_mask(event->fsmask);
+               break;
+       case XFS_HEALTHMON_RTGROUP:
+               hme.e.group.mask = xfs_healthmon_rtgroup_mask(event->grpmask);
+               hme.e.group.gno = event->group;
+               break;
+       case XFS_HEALTHMON_AG:
+               hme.e.group.mask = xfs_healthmon_perag_mask(event->grpmask);
+               hme.e.group.gno = event->group;
+               break;
+       case XFS_HEALTHMON_INODE:
+               hme.e.inode.mask = xfs_healthmon_inode_mask(event->imask);
+               hme.e.inode.ino = event->ino;
+               hme.e.inode.gen = event->gen;
+               break;
+       case XFS_HEALTHMON_DATADEV:
+       case XFS_HEALTHMON_LOGDEV:
+       case XFS_HEALTHMON_RTDEV:
+               hme.e.media.daddr = event->daddr;
+               hme.e.media.bbcount = event->bbcount;
+               break;
+       case XFS_HEALTHMON_FILERANGE:
+               hme.e.filerange.ino = event->fino;
+               hme.e.filerange.gen = event->fgen;
+               hme.e.filerange.pos = event->fpos;
+               hme.e.filerange.len = event->flen;
+               hme.e.filerange.error = abs(event->error);
+               break;
+       default:
+               break;
+       }
+       ASSERT(hm->bufhead + sizeof(hme) <= hm->bufsize);
+       /* copy formatted object to the outbuf, but only if it fits */
+       if (hm->bufhead + sizeof(hme) <= hm->bufsize) {
+               memcpy(hm->buffer + hm->bufhead, &hme, sizeof(hme));
+               hm->bufhead += sizeof(hme);
+       }
+       return 0;
+ }
+ /*
+  * Number of formatted bytes in the outbuf that have not been copied out yet,
+  * i.e. the distance from buftail to bufhead, or zero if bufhead is not ahead
+  * of buftail.
+  */
+ static inline size_t
+ xfs_healthmon_outbuf_bytes(
+       struct xfs_healthmon    *hm)
+ {
+       return hm->bufhead > hm->buftail ? hm->bufhead - hm->buftail : 0;
+ }
+ /*
+  * Do we have something for userspace to read?  This is true when the monitor
+  * is detached, when events are pending in the queue, or when unread bytes
+  * remain in the outbuf.
+  */
+ static inline bool
+ xfs_healthmon_has_eventdata(
+       struct xfs_healthmon    *hm)
+ {
+       /*
+        * A monitor that has detached from the xfs_mount must let reads
+        * return 0 bytes even when no events exist, because userspace treats
+        * that as EOF.  If this check races with deactivation, read_iter
+        * takes the locks needed to discover that nothing is left to send.
+        */
+       if (hm->mount_cookie == DETACHED_MOUNT_COOKIE)
+               return true;
+       /* Events are still waiting to be formatted into the buffer. */
+       if (hm->events > 0)
+               return true;
+       /* Otherwise only unread bytes in the buffer count. */
+       return xfs_healthmon_outbuf_bytes(hm) > 0;
+ }
+ /*
+  * Try to copy the rest of the outbuf to the iov iter.  Returns the number of
+  * bytes copied (zero if the outbuf was empty), or -EFAULT if bytes were
+  * pending but none could be copied.
+  */
+ STATIC ssize_t
+ xfs_healthmon_copybuf(
+       struct xfs_healthmon    *hm,
+       struct iov_iter         *to)
+ {
+       size_t                  pending;
+       size_t                  copied = 0;
+       trace_xfs_healthmon_copybuf(hm, to);
+       pending = xfs_healthmon_outbuf_bytes(hm);
+       if (pending > 0) {
+               copied = copy_to_iter(hm->buffer + hm->buftail, pending, to);
+               if (copied == 0)
+                       return -EFAULT;
+               hm->buftail += copied;
+       }
+       /*
+        * Once everything has been consumed there is no live data in the
+        * buffer, so rewind the cursors to the start.
+        */
+       if (!xfs_healthmon_outbuf_bytes(hm))
+               xfs_healthmon_reset_outbuf(hm);
+       return copied;
+ }
+ /*
+  * Detach the oldest queued event so that the caller can format it into the
+  * outbuf.  Returns NULL if the outbuf has no room for sizeof(*event) more
+  * bytes or if no event is queued.  Caller must hold i_rwsem on the healthmon
+  * file.
+  */
+ static inline struct xfs_healthmon_event *
+ xfs_healthmon_format_pop(
+       struct xfs_healthmon    *hm)
+ {
+       struct xfs_healthmon_event *head;
+       /* Not enough space left in the outbuf?  Leave the queue alone. */
+       if (hm->bufsize < hm->bufhead + sizeof(*head))
+               return NULL;
+       mutex_lock(&hm->lock);
+       head = hm->first_event;
+       if (!head)
+               goto out_unlock;
+       /* Removing the only element empties the list at both ends. */
+       if (head == hm->last_event)
+               hm->last_event = NULL;
+       hm->first_event = head->next;
+       hm->events--;
+       trace_xfs_healthmon_pop(hm, head);
+ out_unlock:
+       mutex_unlock(&hm->lock);
+       return head;
+ }
+ /*
+  * Allocate the zeroed buffer that events get formatted into.  The size tracks
+  * the size of the caller's read, bounded below by PAGE_SIZE and above by
+  * XFS_HEALTHMON_MAX_OUTBUF.  If that allocation fails, retry once with
+  * PAGE_SIZE.  Returns 0 with the cursors zeroed, or -ENOMEM.
+  */
+ STATIC int
+ xfs_healthmon_alloc_outbuf(
+       struct xfs_healthmon    *hm,
+       size_t                  user_bufsize)
+ {
+       size_t                  len;
+       void                    *buf;
+       len = min(XFS_HEALTHMON_MAX_OUTBUF, max(PAGE_SIZE, user_bufsize));
+       buf = kzalloc(len, GFP_KERNEL);
+       if (!buf && len != PAGE_SIZE) {
+               /* Fall back to the smallest buffer we are willing to use. */
+               len = PAGE_SIZE;
+               buf = kzalloc(len, GFP_KERNEL);
+       }
+       if (!buf)
+               return -ENOMEM;
+       hm->buftail = 0;
+       hm->bufhead = 0;
+       hm->bufsize = len;
+       hm->buffer = buf;
+       return 0;
+ }
+ /*
+  * Convey queued event data to userspace.  First copy any remaining bytes in
+  * the outbuf, then format the oldest event into the outbuf and copy that too.
+  *
+  * Nonblocking readers get -EAGAIN when nothing is readable or when the inode
+  * lock cannot be taken immediately; blocking readers sleep interruptibly
+  * until there is event data.  The outbuf is allocated by the first read that
+  * finds none.  Returns the number of bytes copied if any were copied,
+  * otherwise the last status (zero or a negative errno).
+  */
+ STATIC ssize_t
+ xfs_healthmon_read_iter(
+       struct kiocb            *iocb,
+       struct iov_iter         *to)
+ {
+       struct file             *file = iocb->ki_filp;
+       struct inode            *inode = file_inode(file);
+       struct xfs_healthmon    *hm = file->private_data;
+       struct xfs_healthmon_event *event;
+       size_t                  copied = 0;
+       ssize_t                 ret = 0;
+       if (file->f_flags & O_NONBLOCK) {
+               if (!xfs_healthmon_has_eventdata(hm) || !inode_trylock(inode))
+                       return -EAGAIN;
+       } else {
+               ret = wait_event_interruptible(hm->wait,
+                               xfs_healthmon_has_eventdata(hm));
+               if (ret)
+                       return ret;
+               inode_lock(inode);
+       }
+       if (hm->bufsize == 0) { /* no outbuf yet: size it from this request */
+               ret = xfs_healthmon_alloc_outbuf(hm, iov_iter_count(to));
+               if (ret)
+                       goto out_unlock;
+       }
+       trace_xfs_healthmon_read_start(hm);
+       /*
+        * If there's anything left in the output buffer, copy that before
+        * formatting more events.
+        */
+       ret = xfs_healthmon_copybuf(hm, to);
+       if (ret < 0)
+               goto out_unlock;
+       copied += ret;
+       while (iov_iter_count(to) > 0) {
+               /* Format the next events into the outbuf until it's full. */
+               while ((event = xfs_healthmon_format_pop(hm)) != NULL) {
+                       ret = xfs_healthmon_format_v0(hm, event);
+                       kfree(event); /* freed whether or not formatting worked */
+                       if (ret)
+                               goto out_unlock;
+               }
+               /* Copy anything formatted into outbuf to userspace */
+               ret = xfs_healthmon_copybuf(hm, to);
+               if (ret <= 0) /* error, or nothing more could be copied */
+                       break;
+               copied += ret;
+       }
+ out_unlock:
+       trace_xfs_healthmon_read_finish(hm);
+       inode_unlock(inode);
+       return copied ?: ret; /* bytes already copied win over a later status */
+ }
+ /*
+  * Poll handler: register on the healthmon waitqueue and report EPOLLIN
+  * whenever xfs_healthmon_has_eventdata says there is something to read.
+  */
+ STATIC __poll_t
+ xfs_healthmon_poll(
+       struct file                     *file,
+       struct poll_table_struct        *wait)
+ {
+       struct xfs_healthmon            *hm = file->private_data;
+       poll_wait(file, &hm->wait, wait);
+       if (!xfs_healthmon_has_eventdata(hm))
+               return 0;
+       return EPOLLIN;
+ }
+ /*
+  * Release handler for the healthmon file: detach the monitor from the mount,
+  * wake any waiters, and drop a reference to the monitor.  Always returns 0.
+  */
+ STATIC int
+ xfs_healthmon_release(
+       struct inode            *inode,
+       struct file             *file)
+ {
+       struct xfs_healthmon    *healthmon = file->private_data;
+       trace_xfs_healthmon_release(healthmon);
+       /*
+        * Userspace processes can die at any time and for any reason, so this
+        * file may well be closed before the filesystem is unmounted.  Detach
+        * now to clear xfs_mount::m_healthmon, which lets another process
+        * create a new health monitor file.
+        */
+       xfs_healthmon_detach(healthmon);
+       /*
+        * Only read and poll wait on this queue, so nobody should still be
+        * asleep on it.  Wake everyone anyway in case a reader was left.
+        */
+       wake_up_all(&healthmon->wait);
+       xfs_healthmon_put(healthmon);
+       return 0;
+ }
+ /*
+  * Validate ioctl parameters: only known flags may be set, the format must be
+  * v0, and the padding must be all zeroes.
+  */
+ static inline bool
+ xfs_healthmon_validate(
+       const struct xfs_health_monitor *hmo)
+ {
+       const bool      flags_ok = !(hmo->flags & ~XFS_HEALTH_MONITOR_ALL);
+       const bool      format_ok = hmo->format == XFS_HEALTH_MONITOR_FMT_V0;
+       return flags_ok && format_ok &&
+              !memchr_inv(&hmo->pad, 0, sizeof(hmo->pad));
+ }
+ /*
+  * Emit fdinfo for the health monitoring fd: whether the monitor is still
+  * attached, the device number, the event format, and the event counters.
+  * Everything is sampled under the healthmon lock.
+  */
+ static void
+ xfs_healthmon_show_fdinfo(
+       struct seq_file         *m,
+       struct file             *file)
+ {
+       struct xfs_healthmon    *hm = file->private_data;
+       const char              *state;
+       mutex_lock(&hm->lock);
+       if (hm->mount_cookie == DETACHED_MOUNT_COOKIE)
+               state = "dead";
+       else
+               state = "alive";
+       seq_printf(m, "state:\t%s\ndev:\t%d:%d\nformat:\tv0\nevents:\t%llu\nlost:\t%llu\n",
+                       state, MAJOR(hm->dev), MINOR(hm->dev),
+                       hm->total_events, hm->total_lost);
+       mutex_unlock(&hm->lock);
+ }
+ /*
+  * Reconfigure the health monitor from a user-supplied xfs_health_monitor.
+  * The only setting applied here is the verbose flag.  Returns 0, -EFAULT if
+  * the structure cannot be copied in, or -EINVAL if it fails validation.
+  */
+ STATIC long
+ xfs_healthmon_reconfigure(
+       struct file                     *file,
+       unsigned int                    cmd,
+       void __user                     *arg)
+ {
+       struct xfs_healthmon            *hm = file->private_data;
+       struct xfs_health_monitor       hmo;
+       bool                            verbose;
+       if (copy_from_user(&hmo, arg, sizeof(hmo)))
+               return -EFAULT;
+       if (!xfs_healthmon_validate(&hmo))
+               return -EINVAL;
+       verbose = (hmo.flags & XFS_HEALTH_MONITOR_VERBOSE) != 0;
+       mutex_lock(&hm->lock);
+       hm->verbose = verbose;
+       mutex_unlock(&hm->lock);
+       return 0;
+ }
+ /*
+  * Does the fd point to the same filesystem as the one we're monitoring?
+  * Returns 0 if the superblock of the file behind the fd matches the monitor's
+  * mount cookie and -ESTALE if it does not.  Also returns -EFAULT for an
+  * uncopyable argument, -EINVAL if any flags are set, and -EBADF for a bad fd.
+  */
+ STATIC long
+ xfs_healthmon_file_on_monitored_fs(
+       struct file                     *file,
+       unsigned int                    cmd,
+       void __user                     *arg)
+ {
+       struct xfs_health_file_on_monitored_fs hms;
+       struct xfs_healthmon            *hm = file->private_data;
+       struct inode                    *hms_inode;
+       long                            ret = 0;
+       if (copy_from_user(&hms, arg, sizeof(hms)))
+               return -EFAULT;
+       if (hms.flags)
+               return -EINVAL;
+       CLASS(fd, hms_fd)(hms.fd);
+       if (fd_empty(hms_fd))
+               return -EBADF;
+       hms_inode = file_inode(fd_file(hms_fd));
+       /* The cookie is compared under the healthmon lock. */
+       mutex_lock(&hm->lock);
+       if ((uintptr_t)hms_inode->i_sb != hm->mount_cookie)
+               ret = -ESTALE;
+       mutex_unlock(&hm->lock);
+       return ret;
+ }
+ /*
+  * Dispatch ioctls issued against the health monitoring file.  Commands that
+  * are not recognized fail with -ENOTTY.
+  */
+ STATIC long
+ xfs_healthmon_ioctl(
+       struct file                     *file,
+       unsigned int                    cmd,
+       unsigned long                   p)
+ {
+       void __user                     *arg = (void __user *)p;
+       if (cmd == XFS_IOC_HEALTH_MONITOR)
+               return xfs_healthmon_reconfigure(file, cmd, arg);
+       if (cmd == XFS_IOC_HEALTH_FD_ON_MONITORED_FS)
+               return xfs_healthmon_file_on_monitored_fs(file, cmd, arg);
+       return -ENOTTY;
+ }
+ /*
+  * File operations for the health monitor file: events are consumed through
+  * read and poll, the monitor is controlled through ioctl, and its state is
+  * shown in fdinfo.
+  */
+ static const struct file_operations xfs_healthmon_fops = {
+       .owner          = THIS_MODULE,
+       .read_iter      = xfs_healthmon_read_iter,
+       .poll           = xfs_healthmon_poll,
+       .unlocked_ioctl = xfs_healthmon_ioctl,
+       .show_fdinfo    = xfs_healthmon_show_fdinfo,
+       .release        = xfs_healthmon_release,
+ };
+ /*
+  * Create a health monitoring file.  Returns an index to the fd table or a
+  * negative errno.
+  *
+  * The caller must have CAP_SYS_ADMIN, be in the initial user namespace, and
+  * issue the ioctl on the filesystem's root inode, else -EPERM.  The argument
+  * must copy in cleanly (-EFAULT) and pass xfs_healthmon_validate (-EINVAL).
+  * The new monitor starts with one reference, a queued "running" event, and a
+  * preallocated unmount event.  It is attached to the mount before the fd is
+  * installed.
+  */
+ long
+ xfs_ioc_health_monitor(
+       struct file                     *file,
+       struct xfs_health_monitor __user *arg)
+ {
+       struct xfs_health_monitor       hmo;
+       struct xfs_healthmon_event      *running_event;
+       struct xfs_healthmon            *hm;
+       struct xfs_inode                *ip = XFS_I(file_inode(file));
+       struct xfs_mount                *mp = ip->i_mount;
+       int                             ret;
+       /*
+        * The only intended user of the health monitoring system should be the
+        * xfs_healer daemon running on behalf of the whole filesystem in the
+        * initial user namespace.  IOWs, we don't allow unprivileged userspace
+        * (they can use fsnotify) nor do we allow containers.
+        */
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+       if (ip->i_ino != mp->m_sb.sb_rootino) /* only via the root inode */
+               return -EPERM;
+       if (current_user_ns() != &init_user_ns)
+               return -EPERM;
+       if (copy_from_user(&hmo, arg, sizeof(hmo)))
+               return -EFAULT;
+       if (!xfs_healthmon_validate(&hmo))
+               return -EINVAL;
+       hm = kzalloc(sizeof(*hm), GFP_KERNEL);
+       if (!hm)
+               return -ENOMEM;
+       hm->dev = mp->m_super->s_dev;
+       refcount_set(&hm->ref, 1);
+       mutex_init(&hm->lock);
+       init_waitqueue_head(&hm->wait);
+       if (hmo.flags & XFS_HEALTH_MONITOR_VERBOSE)
+               hm->verbose = true;
+       /* Queue up the first event that lets the client know we're running. */
+       running_event = kzalloc(sizeof(struct xfs_healthmon_event), GFP_NOFS);
+       if (!running_event) {
+               ret = -ENOMEM;
+               goto out_hm;
+       }
+       running_event->type = XFS_HEALTHMON_RUNNING;
+       running_event->domain = XFS_HEALTHMON_MOUNT;
+       __xfs_healthmon_insert(hm, running_event);
+       /*
+        * Preallocate the unmount event so that we can't fail to notify the
+        * filesystem later.  This is key for triggering fast exit of the
+        * xfs_healer daemon.
+        */
+       hm->unmount_event = kzalloc(sizeof(struct xfs_healthmon_event),
+                       GFP_NOFS);
+       if (!hm->unmount_event) {
+               ret = -ENOMEM;
+               goto out_hm;
+       }
+       hm->unmount_event->type = XFS_HEALTHMON_UNMOUNT;
+       hm->unmount_event->domain = XFS_HEALTHMON_MOUNT;
+       /*
+        * Try to attach this health monitor to the xfs_mount.  The monitor is
+        * considered live and will receive events if this succeeds.
+        */
+       ret = xfs_healthmon_attach(mp, hm);
+       if (ret)
+               goto out_hm;
+       /*
+        * Create the anonymous file and install a fd for it.  If it succeeds,
+        * the file owns hm and can go away at any time, so we must not access
+        * it again.  This must go last because we can't undo a fd table
+        * installation.
+        */
+       ret = anon_inode_getfd("xfs_healthmon", &xfs_healthmon_fops, hm,
+                       O_CLOEXEC | O_RDONLY);
+       if (ret < 0)
+               goto out_mp;
+       trace_xfs_healthmon_create(mp->m_super->s_dev, hmo.flags, hmo.format);
+       return ret;
+ out_mp: /* attached but no fd was installed: undo the attach first */
+       xfs_healthmon_detach(hm);
+ out_hm: /* drop our reference, which the ASSERT expects to be the only one */
+       ASSERT(refcount_read(&hm->ref) == 1);
+       xfs_healthmon_put(hm);
+       return ret;
+ }
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
index 0000000000000000000000000000000000000000,f4f620c98d92ca965b5aceb1d26132f1634daa45..069cd371619dc2ad8bf7048a52368f7af2f70a95
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,445 +1,445 @@@
 -#include "xfs.h"
+ // SPDX-License-Identifier: GPL-2.0-or-later
+ /*
+  * Copyright (c) 2026 Oracle.  All Rights Reserved.
+  * Author: Darrick J. Wong <djwong@kernel.org>
+  */
++#include "xfs_platform.h"
+ #include "xfs_shared.h"
+ #include "xfs_format.h"
+ #include "xfs_log_format.h"
+ #include "xfs_trans_resv.h"
+ #include "xfs_mount.h"
+ #include "xfs_bit.h"
+ #include "xfs_btree.h"
+ #include "xfs_inode.h"
+ #include "xfs_icache.h"
+ #include "xfs_trans.h"
+ #include "xfs_alloc.h"
+ #include "xfs_ag.h"
+ #include "xfs_rmap.h"
+ #include "xfs_rmap_btree.h"
+ #include "xfs_rtgroup.h"
+ #include "xfs_rtrmap_btree.h"
+ #include "xfs_health.h"
+ #include "xfs_healthmon.h"
+ #include "xfs_trace.h"
+ #include "xfs_verify_media.h"
+ #include <linux/fserror.h>
+ struct xfs_group_data_lost { /* lost range handed to the rmap query callback */
+       xfs_agblock_t           startblock; /* first lost block; set from the low rmap key */
+       xfs_extlen_t            blockcount; /* length of the lost range, in blocks */
+ };
+ /*
+  * Rmap query callback that reports lost file data for one rmap record.
+  *
+  * Records with a non-inode owner are skipped, as are owners that cannot be
+  * loaded with xfs_iget.  A lost bmbt block marks the matching fork sick and
+  * lost attr fork data marks the inode's xattrs sick.  Otherwise the record is
+  * trimmed to the lost range passed in @data and the resulting byte range is
+  * reported through fserror_report_data_lost.  Always returns 0.
+  */
+ static int
+ xfs_verify_report_data_lost(
+       struct xfs_btree_cur            *cur,
+       const struct xfs_rmap_irec      *rec,
+       void                            *data)
+ {
+       struct xfs_group_data_lost      *lost = data;
+       struct xfs_mount                *mp = cur->bc_mp;
+       struct xfs_inode                *ip;
+       xfs_fileoff_t                   fileoff = rec->rm_offset;
+       xfs_extlen_t                    blocks = rec->rm_blockcount;
+       const xfs_agblock_t             lost_end =
+                       lost->startblock + lost->blockcount;
+       const xfs_agblock_t             rmap_end =
+                       rec->rm_startblock + rec->rm_blockcount;
+       const bool                      is_attr =
+                       (rec->rm_flags & XFS_RMAP_ATTR_FORK);
+       if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner))
+               return 0;
+       /* Failing to load the owner is not an error; just skip the record. */
+       if (xfs_iget(mp, cur->bc_tp, rec->rm_owner, 0, 0, &ip))
+               return 0;
+       if (rec->rm_flags & XFS_RMAP_BMBT_BLOCK) {
+               xfs_bmap_mark_sick(ip, is_attr ? XFS_ATTR_FORK : XFS_DATA_FORK);
+       } else if (is_attr) {
+               xfs_inode_mark_sick(ip, XFS_SICK_INO_XATTR);
+       } else {
+               /* Drop the part of the mapping before the lost range... */
+               if (lost->startblock > rec->rm_startblock) {
+                       const xfs_agblock_t     skip =
+                                       lost->startblock - rec->rm_startblock;
+                       fileoff += skip;
+                       blocks -= skip;
+               }
+               /* ...and the part that extends past its end. */
+               if (rmap_end > lost_end)
+                       blocks -= rmap_end - lost_end;
+               fserror_report_data_lost(VFS_I(ip), XFS_FSB_TO_B(mp, fileoff),
+                               XFS_FSB_TO_B(mp, blocks), GFP_NOFS);
+       }
+       xfs_irele(ip);
+       return 0;
+ }
+ /*
+  * Walk reverse mappings to look for all file data loss.
+  *
+  * The failed disk range starting at @daddr and spanning @bblen (presumably
+  * counted in basic blocks, going by the conversion helpers used; confirm) is
+  * converted to filesystem blocks.  For every group of @type that the range
+  * touches, the group's rmap btree is queried and
+  * xfs_verify_report_data_lost is called on each record found.  Always
+  * returns 0: a failure to read the AGF or to query the rmap btree only ends
+  * the walk early.
+  */
+ static int
+ xfs_verify_report_losses(
+       struct xfs_mount        *mp,
+       enum xfs_group_type     type,
+       xfs_daddr_t             daddr,
+       u64                     bblen)
+ {
+       struct xfs_group        *xg = NULL;
+       struct xfs_trans        *tp;
+       xfs_fsblock_t           start_bno, end_bno;
+       uint32_t                start_gno, end_gno;
+       int                     error;
+       if (type == XG_TYPE_RTG) {
+               start_bno = xfs_daddr_to_rtb(mp, daddr);
+               end_bno = xfs_daddr_to_rtb(mp, daddr + bblen - 1);
+       } else {
+               start_bno = XFS_DADDR_TO_FSB(mp, daddr);
+               end_bno = XFS_DADDR_TO_FSB(mp, daddr + bblen - 1);
+       }
+       tp = xfs_trans_alloc_empty(mp);
+       start_gno = xfs_fsb_to_gno(mp, start_bno, type);
+       end_gno = xfs_fsb_to_gno(mp, end_bno, type);
+       while ((xg = xfs_group_next_range(mp, xg, start_gno, end_gno, type))) {
+               struct xfs_buf          *agf_bp = NULL;
+               struct xfs_rtgroup      *rtg = NULL;
+               struct xfs_btree_cur    *cur;
+               struct xfs_rmap_irec    ri_low = { };
+               struct xfs_rmap_irec    ri_high;
+               struct xfs_group_data_lost lost;
+               if (type == XG_TYPE_AG) {
+                       struct xfs_perag        *pag = to_perag(xg);
+                       error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp);
+                       if (error) {
+                               xfs_perag_put(pag); /* put the group before leaving the loop */
+                               break;
+                       }
+                       cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, pag);
+               } else {
+                       rtg = to_rtg(xg);
+                       xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
+                       cur = xfs_rtrmapbt_init_cursor(tp, rtg);
+               }
+               /*
+                * Set the rmap range from ri_low to ri_high, which represents
+                * a [start, end] where we are looking for the files or
+                * metadata.  The keys default to the whole group (all zeroes
+                * and all ones) and are narrowed only in the first and last
+                * group of the range.
+                */
+               memset(&ri_high, 0xFF, sizeof(ri_high));
+               if (xg->xg_gno == start_gno)
+                       ri_low.rm_startblock =
+                               xfs_fsb_to_gbno(mp, start_bno, type);
+               if (xg->xg_gno == end_gno)
+                       ri_high.rm_startblock =
+                               xfs_fsb_to_gbno(mp, end_bno, type);
+               lost.startblock = ri_low.rm_startblock;
+               lost.blockcount = min(xg->xg_block_count,
+                                     ri_high.rm_startblock + 1) -
+                                                       ri_low.rm_startblock;
+               error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
+                               xfs_verify_report_data_lost, &lost);
+               xfs_btree_del_cursor(cur, error);
+               if (agf_bp)
+                       xfs_trans_brelse(tp, agf_bp);
+               if (rtg)
+                       xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
+               if (error) {
+                       xfs_group_put(xg); /* put the group before leaving the loop */
+                       break;
+               }
+       }
+       xfs_trans_cancel(tp);
+       return 0; /* errors are deliberately not passed up to the caller */
+ }
+ /*
+  * Compute the desired verify IO size.
+  *
+  * We aim for 1MB bios to keep command overhead down, or less if the user set
+  * a nonzero me_max_io_size below that.  The result is then clamped to be at
+  * least one logical block of the device and at most the length of the range
+  * to verify.
+  */
+ static unsigned int
+ xfs_verify_iosize(
+       const struct xfs_verify_media   *me,
+       struct xfs_buftarg              *btp,
+       uint64_t                        bbcount)
+ {
+       unsigned int                    lbasize =
+                       bdev_logical_block_size(btp->bt_bdev);
+       unsigned int                    iosize;
+       BUILD_BUG_ON(BBSHIFT != SECTOR_SHIFT);
+       ASSERT(BBTOB(bbcount) >= lbasize);
+       iosize = min_not_zero(SZ_1M, me->me_max_io_size);
+       return clamp(iosize, lbasize, BBTOB(bbcount));
+ }
+ /*
+  * Allocate as much memory as we can get for the verification buffer.  Start
+  * at the order that covers @iosize and step down one order at a time without
+  * retrying; once order zero is reached, make a regular single-page attempt.
+  * Returns NULL only if that last attempt fails as well.
+  */
+ static struct folio *
+ xfs_verify_alloc_folio(
+       const unsigned int      iosize)
+ {
+       unsigned int            order;
+       for (order = get_order(iosize); order > 0; order--) {
+               struct folio    *folio;
+               folio = folio_alloc(GFP_KERNEL | __GFP_NORETRY, order);
+               if (folio)
+                       return folio;
+       }
+       return folio_alloc(GFP_KERNEL, 0);
+ }
+ /*
+  * Report any kind of problem verifying media.
+  *
+  * The failure is traced.  If not a single read succeeded, it is also
+  * recorded in me_ioerror.  Failures that indicate data loss are forwarded to
+  * healthmon when the caller asked for reporting.  On filesystems with an
+  * rmap btree, lost file data on the data or realtime device is then looked up
+  * and reported as well.
+  */
+ static void
+ xfs_verify_media_error(
+       struct xfs_mount        *mp,
+       struct xfs_verify_media *me,
+       struct xfs_buftarg      *btp,
+       xfs_daddr_t             daddr,
+       unsigned int            bio_bbcount,
+       blk_status_t            bio_status)
+ {
+       /*
+        * PI validation failures, medium errors, or general IO errors are
+        * treated as indicators of data loss.  Anything else is (hopefully)
+        * transient and is not reported to healthmon or fsnotify.
+        */
+       const bool              data_loss =
+                       bio_status == BLK_STS_PROTECTION ||
+                       bio_status == BLK_STS_IOERR ||
+                       bio_status == BLK_STS_MEDIUM;
+       trace_xfs_verify_media_error(mp, me, btp->bt_bdev->bd_dev, daddr,
+                       bio_bbcount, bio_status);
+       /*
+        * If we did not manage to verify any bytes at all, hand the error to
+        * the caller, whether it is an I/O error or something else.
+        */
+       if (me->me_start_daddr == daddr)
+               me->me_ioerror = -blk_status_to_errno(bio_status);
+       if (!data_loss)
+               return;
+       if (!(me->me_flags & XFS_VERIFY_MEDIA_REPORT))
+               return;
+       xfs_healthmon_report_media(mp, me->me_dev, daddr, bio_bbcount);
+       if (!xfs_has_rmapbt(mp))
+               return;
+       /* Only the data and realtime devices are walked for lost file data. */
+       if (me->me_dev == XFS_DEV_DATA)
+               xfs_verify_report_losses(mp, XG_TYPE_AG, daddr, bio_bbcount);
+       else if (me->me_dev == XFS_DEV_RT)
+               xfs_verify_report_losses(mp, XG_TYPE_RTG, daddr, bio_bbcount);
+ }
+ /*
+  * Verify the media of an xfs device by submitting read requests to the disk.
+  *
+  * Reads [me_start_daddr, me_end_daddr) of the device selected by me_dev into
+  * a scratch folio, one bio at a time, optionally resting me_rest_us
+  * microseconds between reads.  me_end_daddr is clamped to the size of the
+  * device.  On a successful return me_start_daddr has been advanced past
+  * everything that was read without error.  A failed read stops the scan and
+  * is handed to xfs_verify_media_error; it does not change the return value.
+  *
+  * Returns 0, -ENODEV if there is no such device (a log that shares the data
+  * device's block device counts as absent), -EINVAL if the range is not
+  * aligned to the logical block size, -ENOMEM, or -EINTR if a fatal signal is
+  * pending.
+  */
+ static int
+ xfs_verify_media(
+       struct xfs_mount        *mp,
+       struct xfs_verify_media *me)
+ {
+       struct xfs_buftarg      *btp = NULL;
+       struct bio              *bio;
+       struct folio            *folio;
+       xfs_daddr_t             daddr;
+       uint64_t                bbcount;
+       int                     error = 0;
+       me->me_ioerror = 0;
+       switch (me->me_dev) {
+       case XFS_DEV_DATA:
+               btp = mp->m_ddev_targp;
+               break;
+       case XFS_DEV_LOG:
+               if (mp->m_logdev_targp->bt_bdev != mp->m_ddev_targp->bt_bdev)
+                       btp = mp->m_logdev_targp;
+               break;
+       case XFS_DEV_RT:
+               btp = mp->m_rtdev_targp;
+               break;
+       }
+       if (!btp)
+               return -ENODEV;
+       /*
+        * If the caller told us to verify beyond the end of the disk, tell the
+        * user exactly where that was.
+        */
+       if (me->me_end_daddr > btp->bt_nr_sectors)
+               me->me_end_daddr = btp->bt_nr_sectors;
+       /* start and end have to be aligned to the lba size */
+       if (!IS_ALIGNED(BBTOB(me->me_start_daddr | me->me_end_daddr),
+                       bdev_logical_block_size(btp->bt_bdev)))
+               return -EINVAL;
+       /*
+        * end_daddr is the exclusive end of the range, so if start_daddr
+        * reaches there (or beyond), there's no work to be done.
+        */
+       if (me->me_start_daddr >= me->me_end_daddr)
+               return 0;
+       /*
+        * There are three ranges involved here:
+        *
+        *  - [me->me_start_daddr, me->me_end_daddr) is the range that the
+        *    user wants to verify.  end_daddr can be beyond the end of the
+        *    disk; we'll constrain it to the end if necessary.
+        *
+        *  - [daddr, me->me_end_daddr) is the range that we have not yet
+        *    verified.  We update daddr after each successful read.
+        *    me->me_start_daddr is set to daddr before returning.
+        *
+        *  - [daddr, daddr + bio_bbcount) is the range that we're currently
+        *    verifying.
+        */
+       daddr = me->me_start_daddr;
+       bbcount = min_t(sector_t, me->me_end_daddr, btp->bt_nr_sectors) -
+                         me->me_start_daddr;
+       folio = xfs_verify_alloc_folio(xfs_verify_iosize(me, btp, bbcount));
+       if (!folio)
+               return -ENOMEM;
+       trace_xfs_verify_media(mp, me, btp->bt_bdev->bd_dev, daddr, bbcount,
+                       folio);
+       bio = bio_alloc(btp->bt_bdev, 1, REQ_OP_READ, GFP_KERNEL);
+       if (!bio) {
+               error = -ENOMEM;
+               goto out_folio;
+       }
+       while (bbcount > 0) {
+               unsigned int    bio_bbcount;
+               blk_status_t    bio_status;
+               /* The same bio and folio are reused for every read. */
+               bio_reset(bio, btp->bt_bdev, REQ_OP_READ);
+               bio->bi_iter.bi_sector = daddr;
+               bio_add_folio_nofail(bio, folio,
+                               min(bbcount << SECTOR_SHIFT, folio_size(folio)),
+                               0);
+               /*
+                * Save the length of the bio before we submit it, because we
+                * need the original daddr and length for reporting IO errors
+                * if the bio fails.
+                */
+               bio_bbcount = bio->bi_iter.bi_size >> SECTOR_SHIFT;
+               submit_bio_wait(bio);
+               bio_status = bio->bi_status;
+               if (bio_status != BLK_STS_OK) {
+                       xfs_verify_media_error(mp, me, btp, daddr, bio_bbcount,
+                                       bio_status);
+                       error = 0;
+                       break;
+               }
+               daddr += bio_bbcount;
+               bbcount -= bio_bbcount;
+               if (bbcount == 0)
+                       break;
+               if (me->me_rest_us) {
+                       ktime_t expires;
+                       /*
+                        * ktime_add_us() widens the rest time to 64 bits
+                        * before scaling it to nanoseconds.  Multiplying
+                        * me_rest_us by 1000 directly would be done in the
+                        * width of that field and could wrap for long rests
+                        * if the field is only 32 bits wide.
+                        */
+                       expires = ktime_add_us(ktime_get(), me->me_rest_us);
+                       set_current_state(TASK_KILLABLE);
+                       schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
+               }
+               if (fatal_signal_pending(current)) {
+                       error = -EINTR;
+                       break;
+               }
+               cond_resched();
+       }
+       bio_put(bio);
+ out_folio:
+       folio_put(folio);
+       if (error)
+               return error;
+       /*
+        * Advance start_daddr to the end of what we verified if there wasn't
+        * an operational error.
+        */
+       me->me_start_daddr = daddr;
+       trace_xfs_verify_media_end(mp, me, btp->bt_bdev->bd_dev);
+       return 0;
+ }
+ /*
+  * Handle the media verification ioctl.  Requires CAP_SYS_ADMIN.  The request
+  * is copied in, checked for zero padding, known flags and a known device,
+  * run through xfs_verify_media, and copied back to userspace on success.
+  * Returns 0 or a negative errno.
+  */
+ int
+ xfs_ioc_verify_media(
+       struct file                     *file,
+       struct xfs_verify_media __user  *arg)
+ {
+       struct xfs_inode                *ip = XFS_I(file_inode(file));
+       struct xfs_mount                *mp = ip->i_mount;
+       struct xfs_verify_media         me;
+       int                             error;
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+       if (copy_from_user(&me, arg, sizeof(me)))
+               return -EFAULT;
+       if (me.me_pad || (me.me_flags & ~XFS_VERIFY_MEDIA_FLAGS))
+               return -EINVAL;
+       if (me.me_dev != XFS_DEV_DATA && me.me_dev != XFS_DEV_LOG &&
+           me.me_dev != XFS_DEV_RT)
+               return -EINVAL;
+       error = xfs_verify_media(mp, &me);
+       if (error)
+               return error;
+       /* Only a successful run hands the updated request back. */
+       return copy_to_user(arg, &me, sizeof(me)) ? -EFAULT : 0;
+ }