--- /dev/null
-#include "xfs.h"
+ // SPDX-License-Identifier: GPL-2.0-or-later
+ /*
+ * Copyright (c) 2024-2026 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+ #include "xfs_platform.h"
+ #include "xfs_fs.h"
+ #include "xfs_shared.h"
+ #include "xfs_format.h"
+ #include "xfs_log_format.h"
+ #include "xfs_trans_resv.h"
+ #include "xfs_mount.h"
+ #include "xfs_inode.h"
+ #include "xfs_trace.h"
+ #include "xfs_ag.h"
+ #include "xfs_btree.h"
+ #include "xfs_da_format.h"
+ #include "xfs_da_btree.h"
+ #include "xfs_quota_defs.h"
+ #include "xfs_rtgroup.h"
+ #include "xfs_health.h"
+ #include "xfs_healthmon.h"
+ #include "xfs_fsops.h"
+ #include "xfs_notify_failure.h"
+ #include "xfs_file.h"
+ #include "xfs_ioctl.h"
+
+ #include <linux/anon_inodes.h>
+ #include <linux/eventpoll.h>
+ #include <linux/poll.h>
+ #include <linux/fserror.h>
+
+ /*
+ * Live Health Monitoring
+ * ======================
+ *
+ * Autonomous self-healing of XFS filesystems requires a means for the kernel
+ * to send filesystem health events to a monitoring daemon in userspace. To
+ * accomplish this, we establish an anonymous file whose read methods handle
+ * translating internal events about filesystem health into a format that can
+ * be parsed easily by userspace. When those internal events occur, the core
+ * filesystem code calls this health monitor to queue the events for userspace.
+ * Userspace reads events from the file descriptor returned by the ioctl.
+ *
+ * The healthmon abstraction has a weak reference to the host filesystem mount
+ * so that the queueing and processing of the events do not pin the mount and
+ * cannot slow down the main filesystem. The healthmon object can exist past
+ * the end of the filesystem mount.
+ */
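+
+ /*
+ * Illustrative sketch of the intended userspace flow (not code from this
+ * file): the monitoring daemon opens the filesystem root, asks for a
+ * monitor fd, and then reads fixed-size event records.  The ioctl,
+ * structure, and flag names below come from this patchset's UAPI; the
+ * exact record layout lives in the uapi header, and handle_event() is a
+ * stand-in for the daemon's own logic.
+ *
+ *	struct xfs_health_monitor hmo = {
+ *		.format	= XFS_HEALTH_MONITOR_FMT_V0,
+ *		.flags	= XFS_HEALTH_MONITOR_VERBOSE,
+ *	};
+ *	int rootfd = open("/mnt", O_RDONLY | O_DIRECTORY);
+ *	int monfd = ioctl(rootfd, XFS_IOC_HEALTH_MONITOR, &hmo);
+ *
+ *	struct xfs_health_monitor_event ev;
+ *	while (read(monfd, &ev, sizeof(ev)) == sizeof(ev))
+ *		handle_event(&ev);
+ *
+ * A zero-byte read means the monitor has detached from the filesystem,
+ * e.g. because it was unmounted.
+ */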
+
+ /* mount cookie value that marks a detached health monitor */
+ #define DETACHED_MOUNT_COOKIE ((uintptr_t)0)
+
+ /* Constrain the number of event objects that can build up in memory. */
+ #define XFS_HEALTHMON_MAX_EVENTS (SZ_32K / \
+ sizeof(struct xfs_healthmon_event))
+
+ /* Constrain the size of the output buffer for read_iter. */
+ #define XFS_HEALTHMON_MAX_OUTBUF SZ_64K
+
+ /* spinlock for atomically updating xfs_mount <-> xfs_healthmon pointers */
+ static DEFINE_SPINLOCK(xfs_healthmon_lock);
+
+ /* Grab a reference to the healthmon object for a given mount, if any. */
+ static struct xfs_healthmon *
+ xfs_healthmon_get(
+ struct xfs_mount *mp)
+ {
+ struct xfs_healthmon *hm;
+
+ rcu_read_lock();
+ hm = mp->m_healthmon;
+ if (hm && !refcount_inc_not_zero(&hm->ref))
+ hm = NULL;
+ rcu_read_unlock();
+
+ return hm;
+ }
+
+ /*
+ * Release the reference to a healthmon object. If there are no more holders,
+ * free the health monitor after an RCU grace period to eliminate possibility
+ * of races with xfs_healthmon_get.
+ */
+ static void
+ xfs_healthmon_put(
+ struct xfs_healthmon *hm)
+ {
+ if (refcount_dec_and_test(&hm->ref)) {
+ struct xfs_healthmon_event *event;
+ struct xfs_healthmon_event *next = hm->first_event;
+
+ while ((event = next) != NULL) {
+ trace_xfs_healthmon_drop(hm, event);
+ next = event->next;
+ kfree(event);
+ }
+
+ kfree(hm->unmount_event);
+ kfree(hm->buffer);
+ mutex_destroy(&hm->lock);
+ kfree_rcu_mightsleep(hm);
+ }
+ }
+
+ /* Attach a health monitor to an xfs_mount. Only one allowed at a time. */
+ STATIC int
+ xfs_healthmon_attach(
+ struct xfs_mount *mp,
+ struct xfs_healthmon *hm)
+ {
+ spin_lock(&xfs_healthmon_lock);
+ if (mp->m_healthmon != NULL) {
+ spin_unlock(&xfs_healthmon_lock);
+ return -EEXIST;
+ }
+
+ refcount_inc(&hm->ref);
+ mp->m_healthmon = hm;
+ hm->mount_cookie = (uintptr_t)mp->m_super;
+ spin_unlock(&xfs_healthmon_lock);
+
+ return 0;
+ }
+
+ /* Detach a xfs mount from a specific healthmon instance. */
+ STATIC void
+ xfs_healthmon_detach(
+ struct xfs_healthmon *hm)
+ {
+ spin_lock(&xfs_healthmon_lock);
+ if (hm->mount_cookie == DETACHED_MOUNT_COOKIE) {
+ spin_unlock(&xfs_healthmon_lock);
+ return;
+ }
+
+ XFS_M((struct super_block *)hm->mount_cookie)->m_healthmon = NULL;
+ hm->mount_cookie = DETACHED_MOUNT_COOKIE;
+ spin_unlock(&xfs_healthmon_lock);
+
+ trace_xfs_healthmon_detach(hm);
+ xfs_healthmon_put(hm);
+ }
+
+ static inline void xfs_healthmon_bump_events(struct xfs_healthmon *hm)
+ {
+ hm->events++;
+ hm->total_events++;
+ }
+
+ static inline void xfs_healthmon_bump_lost(struct xfs_healthmon *hm)
+ {
+ hm->lost_prev_event++;
+ hm->total_lost++;
+ }
+
+ /*
+ * If possible, merge a new event into an existing event. Returns whether or
+ * not it merged anything.
+ */
+ static bool
+ xfs_healthmon_merge_events(
+ struct xfs_healthmon_event *existing,
+ const struct xfs_healthmon_event *new)
+ {
+ if (!existing)
+ return false;
+
+ /* type and domain must match to merge events */
+ if (existing->type != new->type ||
+ existing->domain != new->domain)
+ return false;
+
+ switch (existing->type) {
+ case XFS_HEALTHMON_RUNNING:
+ case XFS_HEALTHMON_UNMOUNT:
+ /* should only ever be one of these events anyway */
+ return false;
+
+ case XFS_HEALTHMON_LOST:
+ existing->lostcount += new->lostcount;
+ return true;
+
+ case XFS_HEALTHMON_SICK:
+ case XFS_HEALTHMON_CORRUPT:
+ case XFS_HEALTHMON_HEALTHY:
+ switch (existing->domain) {
+ case XFS_HEALTHMON_FS:
+ existing->fsmask |= new->fsmask;
+ return true;
+ case XFS_HEALTHMON_AG:
+ case XFS_HEALTHMON_RTGROUP:
+ if (existing->group == new->group) {
+ existing->grpmask |= new->grpmask;
+ return true;
+ }
+ return false;
+ case XFS_HEALTHMON_INODE:
+ if (existing->ino == new->ino &&
+ existing->gen == new->gen) {
+ existing->imask |= new->imask;
+ return true;
+ }
+ return false;
+ default:
+ ASSERT(0);
+ return false;
+ }
+ return false;
+
+ case XFS_HEALTHMON_SHUTDOWN:
+ /* yes, we can race to shutdown */
+ existing->flags |= new->flags;
+ return true;
+
+ case XFS_HEALTHMON_MEDIA_ERROR:
+ /* physically adjacent errors can merge */
+ if (existing->daddr + existing->bbcount == new->daddr) {
+ existing->bbcount += new->bbcount;
+ return true;
+ }
+ if (new->daddr + new->bbcount == existing->daddr) {
+ existing->daddr = new->daddr;
+ existing->bbcount += new->bbcount;
+ return true;
+ }
+ return false;
+
+ case XFS_HEALTHMON_BUFREAD:
+ case XFS_HEALTHMON_BUFWRITE:
+ case XFS_HEALTHMON_DIOREAD:
+ case XFS_HEALTHMON_DIOWRITE:
+ case XFS_HEALTHMON_DATALOST:
+ /* logically adjacent file ranges can merge */
+ if (existing->fino != new->fino || existing->fgen != new->fgen)
+ return false;
+
+ if (existing->fpos + existing->flen == new->fpos) {
+ existing->flen += new->flen;
+ return true;
+ }
+
+ if (new->fpos + new->flen == existing->fpos) {
+ existing->fpos = new->fpos;
+ existing->flen += new->flen;
+ return true;
+ }
+ return false;
+ }
+
+ return false;
+ }
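+
+ /*
+ * Worked example of the adjacency rules above (illustrative numbers): a
+ * queued media error covering daddr 1000 for 16 basic blocks followed by
+ * a new one for daddr 1016 and 8 blocks on the same device merges into a
+ * single event with daddr 1000 and bbcount 24, and no second event object
+ * is allocated.  File range events merge the same way, except that the
+ * inode number and generation must also match.
+ */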
+
+ /* Insert an event onto the start of the queue. */
+ static inline void
+ __xfs_healthmon_insert(
+ struct xfs_healthmon *hm,
+ struct xfs_healthmon_event *event)
+ {
+ struct timespec64 now;
+
+ ktime_get_coarse_real_ts64(&now);
+ event->time_ns = (now.tv_sec * NSEC_PER_SEC) + now.tv_nsec;
+
+ event->next = hm->first_event;
+ hm->first_event = event;
+ if (!hm->last_event)
+ hm->last_event = event;
+ xfs_healthmon_bump_events(hm);
+ wake_up(&hm->wait);
+
+ trace_xfs_healthmon_insert(hm, event);
+ }
+
+ /* Push an event onto the end of the queue. */
+ static inline void
+ __xfs_healthmon_push(
+ struct xfs_healthmon *hm,
+ struct xfs_healthmon_event *event)
+ {
+ struct timespec64 now;
+
+ ktime_get_coarse_real_ts64(&now);
+ event->time_ns = (now.tv_sec * NSEC_PER_SEC) + now.tv_nsec;
+
+ if (!hm->first_event)
+ hm->first_event = event;
+ if (hm->last_event)
+ hm->last_event->next = event;
+ hm->last_event = event;
+ event->next = NULL;
+ xfs_healthmon_bump_events(hm);
+ wake_up(&hm->wait);
+
+ trace_xfs_healthmon_push(hm, event);
+ }
+
+ /* Deal with any previously lost events */
+ static int
+ xfs_healthmon_clear_lost_prev(
+ struct xfs_healthmon *hm)
+ {
+ struct xfs_healthmon_event lost_event = {
+ .type = XFS_HEALTHMON_LOST,
+ .domain = XFS_HEALTHMON_MOUNT,
+ .lostcount = hm->lost_prev_event,
+ };
+ struct xfs_healthmon_event *event = NULL;
+
+ if (xfs_healthmon_merge_events(hm->last_event, &lost_event)) {
+ trace_xfs_healthmon_merge(hm, hm->last_event);
+ wake_up(&hm->wait);
+ goto cleared;
+ }
+
+ if (hm->events < XFS_HEALTHMON_MAX_EVENTS)
+ event = kmemdup(&lost_event, sizeof(struct xfs_healthmon_event),
+ GFP_NOFS);
+ if (!event)
+ return -ENOMEM;
+
+ __xfs_healthmon_push(hm, event);
+ cleared:
+ hm->lost_prev_event = 0;
+ return 0;
+ }
+
+ /*
+ * Push an event onto the end of the list after dealing with lost events and
+ * possibly full queues.
+ */
+ STATIC int
+ xfs_healthmon_push(
+ struct xfs_healthmon *hm,
+ const struct xfs_healthmon_event *template)
+ {
+ struct xfs_healthmon_event *event = NULL;
+ int error = 0;
+
+ /*
+ * Locklessly check if the health monitor has already detached from the
+ * mount. If so, ignore the event. If we race with deactivation,
+ * we'll queue the event but never send it.
+ */
+ if (hm->mount_cookie == DETACHED_MOUNT_COOKIE)
+ return -ESHUTDOWN;
+
+ mutex_lock(&hm->lock);
+
+ /* Report previously lost events before we do anything else */
+ if (hm->lost_prev_event) {
+ error = xfs_healthmon_clear_lost_prev(hm);
+ if (error)
+ goto out_unlock;
+ }
+
+ /* Try to merge with the newest event */
+ if (xfs_healthmon_merge_events(hm->last_event, template)) {
+ trace_xfs_healthmon_merge(hm, hm->last_event);
+ wake_up(&hm->wait);
+ goto out_unlock;
+ }
+
+ /* Only create a heap event object if we're not already at capacity. */
+ if (hm->events < XFS_HEALTHMON_MAX_EVENTS)
+ event = kmemdup(template, sizeof(struct xfs_healthmon_event),
+ GFP_NOFS);
+ if (!event) {
+ /* No memory means we lose the event */
+ trace_xfs_healthmon_lost_event(hm);
+ xfs_healthmon_bump_lost(hm);
+ error = -ENOMEM;
+ goto out_unlock;
+ }
+
+ __xfs_healthmon_push(hm, event);
+
+ out_unlock:
+ mutex_unlock(&hm->lock);
+ return error;
+ }
+
+ /*
+ * Report that the filesystem is being unmounted, then detach the xfs mount
+ * from this healthmon instance.
+ */
+ void
+ xfs_healthmon_unmount(
+ struct xfs_mount *mp)
+ {
+ struct xfs_healthmon *hm = xfs_healthmon_get(mp);
+
+ if (!hm)
+ return;
+
+ trace_xfs_healthmon_report_unmount(hm);
+
+ /*
+ * Insert the unmount notification at the start of the event queue so
+ * that userspace knows the filesystem went away as soon as possible.
+ * There's nothing actionable for userspace after an unmount. Once
+ * we've inserted the unmount event, hm no longer owns that event.
+ */
+ mutex_lock(&hm->lock);
+ __xfs_healthmon_insert(hm, hm->unmount_event);
+ hm->unmount_event = NULL;
+ mutex_unlock(&hm->lock);
+
+ xfs_healthmon_detach(hm);
+ xfs_healthmon_put(hm);
+ }
+
+ /* Compute the reporting mask for non-unmount metadata health events. */
+ static inline unsigned int
+ metadata_event_mask(
+ struct xfs_healthmon *hm,
+ enum xfs_healthmon_type type,
+ unsigned int old_mask,
+ unsigned int new_mask)
+ {
+ /* If we want all events, return all events. */
+ if (hm->verbose)
+ return new_mask;
+
+ switch (type) {
+ case XFS_HEALTHMON_SICK:
+ /* Always report runtime corruptions */
+ return new_mask;
+ case XFS_HEALTHMON_CORRUPT:
+ /* Only report new fsck errors */
+ return new_mask & ~old_mask;
+ case XFS_HEALTHMON_HEALTHY:
+ /* Only report healthy metadata that got fixed */
+ return new_mask & old_mask;
+ default:
+ ASSERT(0);
+ break;
+ }
+
+ return 0;
+ }
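+
+ /*
+ * Filtering example (illustrative masks), with verbose mode off: a CORRUPT
+ * event where the old sick mask was 0x3 and the new mask is 0x7 reports
+ * only the newly observed bit 0x4; a HEALTHY event with old mask 0x6 and
+ * new mask 0x4 reports 0x4, i.e. only metadata that was previously marked
+ * sick and has now been fixed.  SICK (runtime corruption) events are
+ * always reported in full.
+ */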
+
+ /* Report XFS_FS_SICK_* events to healthmon */
+ void
+ xfs_healthmon_report_fs(
+ struct xfs_mount *mp,
+ enum xfs_healthmon_type type,
+ unsigned int old_mask,
+ unsigned int new_mask)
+ {
+ struct xfs_healthmon_event event = {
+ .type = type,
+ .domain = XFS_HEALTHMON_FS,
+ };
+ struct xfs_healthmon *hm = xfs_healthmon_get(mp);
+
+ if (!hm)
+ return;
+
+ event.fsmask = metadata_event_mask(hm, type, old_mask, new_mask) &
+ ~XFS_SICK_FS_SECONDARY;
+ trace_xfs_healthmon_report_fs(hm, old_mask, new_mask, &event);
+
+ if (event.fsmask)
+ xfs_healthmon_push(hm, &event);
+
+ xfs_healthmon_put(hm);
+ }
+
+ /* Report XFS_SICK_(AG|RG)* flags to healthmon */
+ void
+ xfs_healthmon_report_group(
+ struct xfs_group *xg,
+ enum xfs_healthmon_type type,
+ unsigned int old_mask,
+ unsigned int new_mask)
+ {
+ struct xfs_healthmon_event event = {
+ .type = type,
+ .group = xg->xg_gno,
+ };
+ struct xfs_healthmon *hm = xfs_healthmon_get(xg->xg_mount);
+
+ if (!hm)
+ return;
+
+ switch (xg->xg_type) {
+ case XG_TYPE_RTG:
+ event.domain = XFS_HEALTHMON_RTGROUP;
+ event.grpmask = metadata_event_mask(hm, type, old_mask,
+ new_mask) &
+ ~XFS_SICK_RG_SECONDARY;
+ break;
+ case XG_TYPE_AG:
+ event.domain = XFS_HEALTHMON_AG;
+ event.grpmask = metadata_event_mask(hm, type, old_mask,
+ new_mask) &
+ ~XFS_SICK_AG_SECONDARY;
+ break;
+ default:
+ ASSERT(0);
+ break;
+ }
+
+ trace_xfs_healthmon_report_group(hm, old_mask, new_mask, &event);
+
+ if (event.grpmask)
+ xfs_healthmon_push(hm, &event);
+
+ xfs_healthmon_put(hm);
+ }
+
+ /* Report XFS_SICK_INO_* flags to healthmon */
+ void
+ xfs_healthmon_report_inode(
+ struct xfs_inode *ip,
+ enum xfs_healthmon_type type,
+ unsigned int old_mask,
+ unsigned int new_mask)
+ {
+ struct xfs_healthmon_event event = {
+ .type = type,
+ .domain = XFS_HEALTHMON_INODE,
+ .ino = ip->i_ino,
+ .gen = VFS_I(ip)->i_generation,
+ };
+ struct xfs_healthmon *hm = xfs_healthmon_get(ip->i_mount);
+
+ if (!hm)
+ return;
+
+ event.imask = metadata_event_mask(hm, type, old_mask, new_mask) &
+ ~XFS_SICK_INO_SECONDARY;
+ trace_xfs_healthmon_report_inode(hm, old_mask, new_mask, &event);
+
+ if (event.imask)
+ xfs_healthmon_push(hm, &event);
+
+ xfs_healthmon_put(hm);
+ }
+
+ /* Add a shutdown event to the reporting queue. */
+ void
+ xfs_healthmon_report_shutdown(
+ struct xfs_mount *mp,
+ uint32_t flags)
+ {
+ struct xfs_healthmon_event event = {
+ .type = XFS_HEALTHMON_SHUTDOWN,
+ .domain = XFS_HEALTHMON_MOUNT,
+ .flags = flags,
+ };
+ struct xfs_healthmon *hm = xfs_healthmon_get(mp);
+
+ if (!hm)
+ return;
+
+ trace_xfs_healthmon_report_shutdown(hm, flags);
+
+ xfs_healthmon_push(hm, &event);
+ xfs_healthmon_put(hm);
+ }
+
+ static inline enum xfs_healthmon_domain
+ media_error_domain(
+ enum xfs_device fdev)
+ {
+ switch (fdev) {
+ case XFS_DEV_DATA:
+ return XFS_HEALTHMON_DATADEV;
+ case XFS_DEV_LOG:
+ return XFS_HEALTHMON_LOGDEV;
+ case XFS_DEV_RT:
+ return XFS_HEALTHMON_RTDEV;
+ }
+
+ ASSERT(0);
+ return 0;
+ }
+
+ /* Add a media error event to the reporting queue. */
+ void
+ xfs_healthmon_report_media(
+ struct xfs_mount *mp,
+ enum xfs_device fdev,
+ xfs_daddr_t daddr,
+ uint64_t bbcount)
+ {
+ struct xfs_healthmon_event event = {
+ .type = XFS_HEALTHMON_MEDIA_ERROR,
+ .domain = media_error_domain(fdev),
+ .daddr = daddr,
+ .bbcount = bbcount,
+ };
+ struct xfs_healthmon *hm = xfs_healthmon_get(mp);
+
+ if (!hm)
+ return;
+
+ trace_xfs_healthmon_report_media(hm, fdev, &event);
+
+ xfs_healthmon_push(hm, &event);
+ xfs_healthmon_put(hm);
+ }
+
+ static inline enum xfs_healthmon_type file_ioerr_type(enum fserror_type action)
+ {
+ switch (action) {
+ case FSERR_BUFFERED_READ:
+ return XFS_HEALTHMON_BUFREAD;
+ case FSERR_BUFFERED_WRITE:
+ return XFS_HEALTHMON_BUFWRITE;
+ case FSERR_DIRECTIO_READ:
+ return XFS_HEALTHMON_DIOREAD;
+ case FSERR_DIRECTIO_WRITE:
+ return XFS_HEALTHMON_DIOWRITE;
+ case FSERR_DATA_LOST:
+ return XFS_HEALTHMON_DATALOST;
+ case FSERR_METADATA:
+ /* filtered out by xfs_fs_report_error */
+ break;
+ }
+
+ ASSERT(0);
+ return -1;
+ }
+
+ /* Add a file io error event to the reporting queue. */
+ void
+ xfs_healthmon_report_file_ioerror(
+ struct xfs_inode *ip,
+ const struct fserror_event *p)
+ {
+ struct xfs_healthmon_event event = {
+ .type = file_ioerr_type(p->type),
+ .domain = XFS_HEALTHMON_FILERANGE,
+ .fino = ip->i_ino,
+ .fgen = VFS_I(ip)->i_generation,
+ .fpos = p->pos,
+ .flen = p->len,
+ /* send positive error number to userspace */
+ .error = -p->error,
+ };
+ struct xfs_healthmon *hm = xfs_healthmon_get(ip->i_mount);
+
+ if (!hm)
+ return;
+
+ trace_xfs_healthmon_report_file_ioerror(hm, p);
+
+ xfs_healthmon_push(hm, &event);
+ xfs_healthmon_put(hm);
+ }
+
+ static inline void
+ xfs_healthmon_reset_outbuf(
+ struct xfs_healthmon *hm)
+ {
+ hm->buftail = 0;
+ hm->bufhead = 0;
+ }
+
+ struct flags_map {
+ unsigned int in_mask;
+ unsigned int out_mask;
+ };
+
+ static const struct flags_map shutdown_map[] = {
+ { SHUTDOWN_META_IO_ERROR, XFS_HEALTH_SHUTDOWN_META_IO_ERROR },
+ { SHUTDOWN_LOG_IO_ERROR, XFS_HEALTH_SHUTDOWN_LOG_IO_ERROR },
+ { SHUTDOWN_FORCE_UMOUNT, XFS_HEALTH_SHUTDOWN_FORCE_UMOUNT },
+ { SHUTDOWN_CORRUPT_INCORE, XFS_HEALTH_SHUTDOWN_CORRUPT_INCORE },
+ { SHUTDOWN_CORRUPT_ONDISK, XFS_HEALTH_SHUTDOWN_CORRUPT_ONDISK },
+ { SHUTDOWN_DEVICE_REMOVED, XFS_HEALTH_SHUTDOWN_DEVICE_REMOVED },
+ };
+
+ static inline unsigned int
+ __map_flags(
+ const struct flags_map *map,
+ size_t array_len,
+ unsigned int flags)
+ {
+ const struct flags_map *m;
+ unsigned int ret = 0;
+
+ for (m = map; m < map + array_len; m++) {
+ if (flags & m->in_mask)
+ ret |= m->out_mask;
+ }
+
+ return ret;
+ }
+
+ #define map_flags(map, flags) __map_flags((map), ARRAY_SIZE(map), (flags))
+
+ static inline unsigned int shutdown_mask(unsigned int in)
+ {
+ return map_flags(shutdown_map, in);
+ }
+
+ static const unsigned int domain_map[] = {
+ [XFS_HEALTHMON_MOUNT] = XFS_HEALTH_MONITOR_DOMAIN_MOUNT,
+ [XFS_HEALTHMON_FS] = XFS_HEALTH_MONITOR_DOMAIN_FS,
+ [XFS_HEALTHMON_AG] = XFS_HEALTH_MONITOR_DOMAIN_AG,
+ [XFS_HEALTHMON_INODE] = XFS_HEALTH_MONITOR_DOMAIN_INODE,
+ [XFS_HEALTHMON_RTGROUP] = XFS_HEALTH_MONITOR_DOMAIN_RTGROUP,
+ [XFS_HEALTHMON_DATADEV] = XFS_HEALTH_MONITOR_DOMAIN_DATADEV,
+ [XFS_HEALTHMON_RTDEV] = XFS_HEALTH_MONITOR_DOMAIN_RTDEV,
+ [XFS_HEALTHMON_LOGDEV] = XFS_HEALTH_MONITOR_DOMAIN_LOGDEV,
+ [XFS_HEALTHMON_FILERANGE] = XFS_HEALTH_MONITOR_DOMAIN_FILERANGE,
+ };
+
+ static const unsigned int type_map[] = {
+ [XFS_HEALTHMON_RUNNING] = XFS_HEALTH_MONITOR_TYPE_RUNNING,
+ [XFS_HEALTHMON_LOST] = XFS_HEALTH_MONITOR_TYPE_LOST,
+ [XFS_HEALTHMON_SICK] = XFS_HEALTH_MONITOR_TYPE_SICK,
+ [XFS_HEALTHMON_CORRUPT] = XFS_HEALTH_MONITOR_TYPE_CORRUPT,
+ [XFS_HEALTHMON_HEALTHY] = XFS_HEALTH_MONITOR_TYPE_HEALTHY,
+ [XFS_HEALTHMON_UNMOUNT] = XFS_HEALTH_MONITOR_TYPE_UNMOUNT,
+ [XFS_HEALTHMON_SHUTDOWN] = XFS_HEALTH_MONITOR_TYPE_SHUTDOWN,
+ [XFS_HEALTHMON_MEDIA_ERROR] = XFS_HEALTH_MONITOR_TYPE_MEDIA_ERROR,
+ [XFS_HEALTHMON_BUFREAD] = XFS_HEALTH_MONITOR_TYPE_BUFREAD,
+ [XFS_HEALTHMON_BUFWRITE] = XFS_HEALTH_MONITOR_TYPE_BUFWRITE,
+ [XFS_HEALTHMON_DIOREAD] = XFS_HEALTH_MONITOR_TYPE_DIOREAD,
+ [XFS_HEALTHMON_DIOWRITE] = XFS_HEALTH_MONITOR_TYPE_DIOWRITE,
+ [XFS_HEALTHMON_DATALOST] = XFS_HEALTH_MONITOR_TYPE_DATALOST,
+ };
+
+ /* Render event as a V0 structure */
+ STATIC int
+ xfs_healthmon_format_v0(
+ struct xfs_healthmon *hm,
+ const struct xfs_healthmon_event *event)
+ {
+ struct xfs_health_monitor_event hme = {
+ .time_ns = event->time_ns,
+ };
+
+ trace_xfs_healthmon_format(hm, event);
+
+ if (event->domain < 0 || event->domain >= ARRAY_SIZE(domain_map) ||
+ event->type < 0 || event->type >= ARRAY_SIZE(type_map))
+ return -EFSCORRUPTED;
+
+ hme.domain = domain_map[event->domain];
+ hme.type = type_map[event->type];
+
+ /* fill in the event-specific details */
+ switch (event->domain) {
+ case XFS_HEALTHMON_MOUNT:
+ switch (event->type) {
+ case XFS_HEALTHMON_LOST:
+ hme.e.lost.count = event->lostcount;
+ break;
+ case XFS_HEALTHMON_SHUTDOWN:
+ hme.e.shutdown.reasons = shutdown_mask(event->flags);
+ break;
+ default:
+ break;
+ }
+ break;
+ case XFS_HEALTHMON_FS:
+ hme.e.fs.mask = xfs_healthmon_fs_mask(event->fsmask);
+ break;
+ case XFS_HEALTHMON_RTGROUP:
+ hme.e.group.mask = xfs_healthmon_rtgroup_mask(event->grpmask);
+ hme.e.group.gno = event->group;
+ break;
+ case XFS_HEALTHMON_AG:
+ hme.e.group.mask = xfs_healthmon_perag_mask(event->grpmask);
+ hme.e.group.gno = event->group;
+ break;
+ case XFS_HEALTHMON_INODE:
+ hme.e.inode.mask = xfs_healthmon_inode_mask(event->imask);
+ hme.e.inode.ino = event->ino;
+ hme.e.inode.gen = event->gen;
+ break;
+ case XFS_HEALTHMON_DATADEV:
+ case XFS_HEALTHMON_LOGDEV:
+ case XFS_HEALTHMON_RTDEV:
+ hme.e.media.daddr = event->daddr;
+ hme.e.media.bbcount = event->bbcount;
+ break;
+ case XFS_HEALTHMON_FILERANGE:
+ hme.e.filerange.ino = event->fino;
+ hme.e.filerange.gen = event->fgen;
+ hme.e.filerange.pos = event->fpos;
+ hme.e.filerange.len = event->flen;
+ hme.e.filerange.error = abs(event->error);
+ break;
+ default:
+ break;
+ }
+
+ ASSERT(hm->bufhead + sizeof(hme) <= hm->bufsize);
+
+ /* copy formatted object to the outbuf */
+ if (hm->bufhead + sizeof(hme) <= hm->bufsize) {
+ memcpy(hm->buffer + hm->bufhead, &hme, sizeof(hme));
+ hm->bufhead += sizeof(hme);
+ }
+
+ return 0;
+ }
+
+ /* How many bytes are waiting in the outbuf to be copied? */
+ static inline size_t
+ xfs_healthmon_outbuf_bytes(
+ struct xfs_healthmon *hm)
+ {
+ if (hm->bufhead > hm->buftail)
+ return hm->bufhead - hm->buftail;
+ return 0;
+ }
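+
+ /*
+ * The outbuf is a linear staging buffer, not a ring: bufhead marks how far
+ * event formatting has written, buftail how far reads have consumed.  For
+ * example (illustrative numbers), after formatting 96 bytes of records
+ * bufhead is 96; a read that copies 32 of them advances buftail to 32 and
+ * leaves 64 bytes pending.  Once buftail catches up to bufhead, both
+ * cursors are reset to zero by xfs_healthmon_reset_outbuf().
+ */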
+
+ /*
+ * Do we have something for userspace to read? This can mean unmount events,
+ * events pending in the queue, or pending bytes in the outbuf.
+ */
+ static inline bool
+ xfs_healthmon_has_eventdata(
+ struct xfs_healthmon *hm)
+ {
+ /*
+ * If the health monitor has already been detached from the xfs_mount, we
+ * want reads to stop blocking; once no events remain, a read returns 0
+ * bytes, which userspace interprets as EOF. If we race with deactivation,
+ * read_iter will take the necessary locks to discover that there are
+ * no events to send.
+ */
+ if (hm->mount_cookie == DETACHED_MOUNT_COOKIE)
+ return true;
+
+ /*
+ * Either there are events waiting to be formatted into the buffer, or
+ * there's unread bytes in the buffer.
+ */
+ return hm->events > 0 || xfs_healthmon_outbuf_bytes(hm) > 0;
+ }
+
+ /* Try to copy the rest of the outbuf to the iov iter. */
+ STATIC ssize_t
+ xfs_healthmon_copybuf(
+ struct xfs_healthmon *hm,
+ struct iov_iter *to)
+ {
+ size_t to_copy;
+ size_t w = 0;
+
+ trace_xfs_healthmon_copybuf(hm, to);
+
+ to_copy = xfs_healthmon_outbuf_bytes(hm);
+ if (to_copy) {
+ w = copy_to_iter(hm->buffer + hm->buftail, to_copy, to);
+ if (!w)
+ return -EFAULT;
+
+ hm->buftail += w;
+ }
+
+ /*
+ * Nothing left to copy? Reset the output buffer cursors to the start
+ * since there's no live data in the buffer.
+ */
+ if (xfs_healthmon_outbuf_bytes(hm) == 0)
+ xfs_healthmon_reset_outbuf(hm);
+ return w;
+ }
+
+ /*
+ * Return a health monitoring event for formatting into the output buffer if
+ * there's enough space in the outbuf and an event waiting for us. Caller
+ * must hold i_rwsem on the healthmon file.
+ */
+ static inline struct xfs_healthmon_event *
+ xfs_healthmon_format_pop(
+ struct xfs_healthmon *hm)
+ {
+ struct xfs_healthmon_event *event;
+
+ if (hm->bufhead + sizeof(struct xfs_health_monitor_event) > hm->bufsize)
+ return NULL;
+
+ mutex_lock(&hm->lock);
+ event = hm->first_event;
+ if (event) {
+ if (hm->last_event == event)
+ hm->last_event = NULL;
+ hm->first_event = event->next;
+ hm->events--;
+
+ trace_xfs_healthmon_pop(hm, event);
+ }
+ mutex_unlock(&hm->lock);
+ return event;
+ }
+
+ /* Allocate formatting buffer */
+ STATIC int
+ xfs_healthmon_alloc_outbuf(
+ struct xfs_healthmon *hm,
+ size_t user_bufsize)
+ {
+ void *outbuf;
+ size_t bufsize =
+ min(XFS_HEALTHMON_MAX_OUTBUF, max(PAGE_SIZE, user_bufsize));
+
+ outbuf = kzalloc(bufsize, GFP_KERNEL);
+ if (!outbuf) {
+ if (bufsize == PAGE_SIZE)
+ return -ENOMEM;
+
+ bufsize = PAGE_SIZE;
+ outbuf = kzalloc(bufsize, GFP_KERNEL);
+ if (!outbuf)
+ return -ENOMEM;
+ }
+
+ hm->buffer = outbuf;
+ hm->bufsize = bufsize;
+ hm->bufhead = 0;
+ hm->buftail = 0;
+
+ return 0;
+ }
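+
+ /*
+ * Sizing example (illustrative): a read with a 256-byte iov still gets a
+ * PAGE_SIZE outbuf, a 1 MiB iov is capped at XFS_HEALTHMON_MAX_OUTBUF
+ * (64KiB), and if the large allocation fails we fall back to a single page
+ * before giving up with -ENOMEM.
+ */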
+
+ /*
+ * Convey queued event data to userspace. First copy any remaining bytes in
+ * the outbuf, then format the oldest event into the outbuf and copy that too.
+ */
+ STATIC ssize_t
+ xfs_healthmon_read_iter(
+ struct kiocb *iocb,
+ struct iov_iter *to)
+ {
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
+ struct xfs_healthmon *hm = file->private_data;
+ struct xfs_healthmon_event *event;
+ size_t copied = 0;
+ ssize_t ret = 0;
+
+ if (file->f_flags & O_NONBLOCK) {
+ if (!xfs_healthmon_has_eventdata(hm) || !inode_trylock(inode))
+ return -EAGAIN;
+ } else {
+ ret = wait_event_interruptible(hm->wait,
+ xfs_healthmon_has_eventdata(hm));
+ if (ret)
+ return ret;
+
+ inode_lock(inode);
+ }
+
+ if (hm->bufsize == 0) {
+ ret = xfs_healthmon_alloc_outbuf(hm, iov_iter_count(to));
+ if (ret)
+ goto out_unlock;
+ }
+
+ trace_xfs_healthmon_read_start(hm);
+
+ /*
+ * If there's anything left in the output buffer, copy that before
+ * formatting more events.
+ */
+ ret = xfs_healthmon_copybuf(hm, to);
+ if (ret < 0)
+ goto out_unlock;
+ copied += ret;
+
+ while (iov_iter_count(to) > 0) {
+ /* Format the next events into the outbuf until it's full. */
+ while ((event = xfs_healthmon_format_pop(hm)) != NULL) {
+ ret = xfs_healthmon_format_v0(hm, event);
+ kfree(event);
+ if (ret)
+ goto out_unlock;
+ }
+
+ /* Copy anything formatted into outbuf to userspace */
+ ret = xfs_healthmon_copybuf(hm, to);
+ if (ret <= 0)
+ break;
+
+ copied += ret;
+ }
+
+ out_unlock:
+ trace_xfs_healthmon_read_finish(hm);
+ inode_unlock(inode);
+ return copied ?: ret;
+ }
+
+ /* Poll for available events. */
+ STATIC __poll_t
+ xfs_healthmon_poll(
+ struct file *file,
+ struct poll_table_struct *wait)
+ {
+ struct xfs_healthmon *hm = file->private_data;
+ __poll_t mask = 0;
+
+ poll_wait(file, &hm->wait, wait);
+
+ if (xfs_healthmon_has_eventdata(hm))
+ mask |= EPOLLIN;
+ return mask;
+ }
+
+ /* Free the health monitoring information. */
+ STATIC int
+ xfs_healthmon_release(
+ struct inode *inode,
+ struct file *file)
+ {
+ struct xfs_healthmon *hm = file->private_data;
+
+ trace_xfs_healthmon_release(hm);
+
+ /*
+ * We might be closing the healthmon file before the filesystem
+ * unmounts, because userspace processes can terminate at any time and
+ * for any reason. Null out xfs_mount::m_healthmon so that another
+ * process can create another health monitor file.
+ */
+ xfs_healthmon_detach(hm);
+
+ /*
+ * Wake up any readers that might be left. There shouldn't be any
+ * because the only users of the waiter are read and poll.
+ */
+ wake_up_all(&hm->wait);
+
+ xfs_healthmon_put(hm);
+ return 0;
+ }
+
+ /* Validate ioctl parameters. */
+ static inline bool
+ xfs_healthmon_validate(
+ const struct xfs_health_monitor *hmo)
+ {
+ if (hmo->flags & ~XFS_HEALTH_MONITOR_ALL)
+ return false;
+ if (hmo->format != XFS_HEALTH_MONITOR_FMT_V0)
+ return false;
+ if (memchr_inv(&hmo->pad, 0, sizeof(hmo->pad)))
+ return false;
+ return true;
+ }
+
+ /* Emit some data about the health monitoring fd. */
+ static void
+ xfs_healthmon_show_fdinfo(
+ struct seq_file *m,
+ struct file *file)
+ {
+ struct xfs_healthmon *hm = file->private_data;
+
+ mutex_lock(&hm->lock);
+ seq_printf(m, "state:\t%s\ndev:\t%d:%d\nformat:\tv0\nevents:\t%llu\nlost:\t%llu\n",
+ hm->mount_cookie == DETACHED_MOUNT_COOKIE ?
+ "dead" : "alive",
+ MAJOR(hm->dev), MINOR(hm->dev),
+ hm->total_events,
+ hm->total_lost);
+ mutex_unlock(&hm->lock);
+ }
+
+ /* Reconfigure the health monitor. */
+ STATIC long
+ xfs_healthmon_reconfigure(
+ struct file *file,
+ unsigned int cmd,
+ void __user *arg)
+ {
+ struct xfs_health_monitor hmo;
+ struct xfs_healthmon *hm = file->private_data;
+
+ if (copy_from_user(&hmo, arg, sizeof(hmo)))
+ return -EFAULT;
+
+ if (!xfs_healthmon_validate(&hmo))
+ return -EINVAL;
+
+ mutex_lock(&hm->lock);
+ hm->verbose = !!(hmo.flags & XFS_HEALTH_MONITOR_VERBOSE);
+ mutex_unlock(&hm->lock);
+
+ return 0;
+ }
+
+ /* Does the fd point to the same filesystem as the one we're monitoring? */
+ STATIC long
+ xfs_healthmon_file_on_monitored_fs(
+ struct file *file,
+ unsigned int cmd,
+ void __user *arg)
+ {
+ struct xfs_health_file_on_monitored_fs hms;
+ struct xfs_healthmon *hm = file->private_data;
+ struct inode *hms_inode;
+
+ if (copy_from_user(&hms, arg, sizeof(hms)))
+ return -EFAULT;
+
+ if (hms.flags)
+ return -EINVAL;
+
+ CLASS(fd, hms_fd)(hms.fd);
+ if (fd_empty(hms_fd))
+ return -EBADF;
+
+ hms_inode = file_inode(fd_file(hms_fd));
+ mutex_lock(&hm->lock);
+ if (hm->mount_cookie != (uintptr_t)hms_inode->i_sb) {
+ mutex_unlock(&hm->lock);
+ return -ESTALE;
+ }
+
+ mutex_unlock(&hm->lock);
+ return 0;
+ }
+
+ /* Handle ioctls for the health monitoring thread. */
+ STATIC long
+ xfs_healthmon_ioctl(
+ struct file *file,
+ unsigned int cmd,
+ unsigned long p)
+ {
+ void __user *arg = (void __user *)p;
+
+ switch (cmd) {
+ case XFS_IOC_HEALTH_MONITOR:
+ return xfs_healthmon_reconfigure(file, cmd, arg);
+ case XFS_IOC_HEALTH_FD_ON_MONITORED_FS:
+ return xfs_healthmon_file_on_monitored_fs(file, cmd, arg);
+ default:
+ break;
+ }
+
+ return -ENOTTY;
+ }
+
+ static const struct file_operations xfs_healthmon_fops = {
+ .owner = THIS_MODULE,
+ .show_fdinfo = xfs_healthmon_show_fdinfo,
+ .read_iter = xfs_healthmon_read_iter,
+ .poll = xfs_healthmon_poll,
+ .release = xfs_healthmon_release,
+ .unlocked_ioctl = xfs_healthmon_ioctl,
+ };
+
+ /*
+ * Create a health monitoring file. Returns an index to the fd table or a
+ * negative errno.
+ */
+ long
+ xfs_ioc_health_monitor(
+ struct file *file,
+ struct xfs_health_monitor __user *arg)
+ {
+ struct xfs_health_monitor hmo;
+ struct xfs_healthmon_event *running_event;
+ struct xfs_healthmon *hm;
+ struct xfs_inode *ip = XFS_I(file_inode(file));
+ struct xfs_mount *mp = ip->i_mount;
+ int ret;
+
+ /*
+ * The only intended user of the health monitoring system should be the
+ * xfs_healer daemon running on behalf of the whole filesystem in the
+ * initial user namespace. IOWs, we don't allow unprivileged userspace
+ * (they can use fsnotify) nor do we allow containers.
+ */
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ if (ip->i_ino != mp->m_sb.sb_rootino)
+ return -EPERM;
+ if (current_user_ns() != &init_user_ns)
+ return -EPERM;
+
+ if (copy_from_user(&hmo, arg, sizeof(hmo)))
+ return -EFAULT;
+
+ if (!xfs_healthmon_validate(&hmo))
+ return -EINVAL;
+
+ hm = kzalloc(sizeof(*hm), GFP_KERNEL);
+ if (!hm)
+ return -ENOMEM;
+ hm->dev = mp->m_super->s_dev;
+ refcount_set(&hm->ref, 1);
+
+ mutex_init(&hm->lock);
+ init_waitqueue_head(&hm->wait);
+
+ if (hmo.flags & XFS_HEALTH_MONITOR_VERBOSE)
+ hm->verbose = true;
+
+ /* Queue up the first event that lets the client know we're running. */
+ running_event = kzalloc(sizeof(struct xfs_healthmon_event), GFP_NOFS);
+ if (!running_event) {
+ ret = -ENOMEM;
+ goto out_hm;
+ }
+ running_event->type = XFS_HEALTHMON_RUNNING;
+ running_event->domain = XFS_HEALTHMON_MOUNT;
+ __xfs_healthmon_insert(hm, running_event);
+
+ /*
+ * Preallocate the unmount event so that we can't fail to notify
+ * userspace of the unmount later. This is key for triggering fast exit
+ * of the xfs_healer daemon.
+ */
+ hm->unmount_event = kzalloc(sizeof(struct xfs_healthmon_event),
+ GFP_NOFS);
+ if (!hm->unmount_event) {
+ ret = -ENOMEM;
+ goto out_hm;
+ }
+ hm->unmount_event->type = XFS_HEALTHMON_UNMOUNT;
+ hm->unmount_event->domain = XFS_HEALTHMON_MOUNT;
+
+ /*
+ * Try to attach this health monitor to the xfs_mount. The monitor is
+ * considered live and will receive events if this succeeds.
+ */
+ ret = xfs_healthmon_attach(mp, hm);
+ if (ret)
+ goto out_hm;
+
+ /*
+ * Create the anonymous file and install a fd for it. If it succeeds,
+ * the file owns hm and can go away at any time, so we must not access
+ * it again. This must go last because we can't undo a fd table
+ * installation.
+ */
+ ret = anon_inode_getfd("xfs_healthmon", &xfs_healthmon_fops, hm,
+ O_CLOEXEC | O_RDONLY);
+ if (ret < 0)
+ goto out_mp;
+
+ trace_xfs_healthmon_create(mp->m_super->s_dev, hmo.flags, hmo.format);
+
+ return ret;
+
+ out_mp:
+ xfs_healthmon_detach(hm);
+ out_hm:
+ ASSERT(refcount_read(&hm->ref) == 1);
+ xfs_healthmon_put(hm);
+ return ret;
+ }
--- /dev/null
-#include "xfs.h"
+ // SPDX-License-Identifier: GPL-2.0-or-later
+ /*
+ * Copyright (c) 2026 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+ #include "xfs_platform.h"
+ #include "xfs_shared.h"
+ #include "xfs_format.h"
+ #include "xfs_log_format.h"
+ #include "xfs_trans_resv.h"
+ #include "xfs_mount.h"
+ #include "xfs_bit.h"
+ #include "xfs_btree.h"
+ #include "xfs_inode.h"
+ #include "xfs_icache.h"
+ #include "xfs_trans.h"
+ #include "xfs_alloc.h"
+ #include "xfs_ag.h"
+ #include "xfs_rmap.h"
+ #include "xfs_rmap_btree.h"
+ #include "xfs_rtgroup.h"
+ #include "xfs_rtrmap_btree.h"
+ #include "xfs_health.h"
+ #include "xfs_healthmon.h"
+ #include "xfs_trace.h"
+ #include "xfs_verify_media.h"
+
+ #include <linux/fserror.h>
+
+ struct xfs_group_data_lost {
+ xfs_agblock_t startblock;
+ xfs_extlen_t blockcount;
+ };
+
+ /* Report lost file data from rmap records */
+ static int
+ xfs_verify_report_data_lost(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *data)
+ {
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_inode *ip;
+ struct xfs_group_data_lost *lost = data;
+ xfs_fileoff_t fileoff = rec->rm_offset;
+ xfs_extlen_t blocks = rec->rm_blockcount;
+ const bool is_attr =
+ (rec->rm_flags & XFS_RMAP_ATTR_FORK);
+ const xfs_agblock_t lost_end =
+ lost->startblock + lost->blockcount;
+ const xfs_agblock_t rmap_end =
+ rec->rm_startblock + rec->rm_blockcount;
+ int error = 0;
+
+ if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner))
+ return 0;
+
+ error = xfs_iget(mp, cur->bc_tp, rec->rm_owner, 0, 0, &ip);
+ if (error)
+ return 0;
+
+ if (rec->rm_flags & XFS_RMAP_BMBT_BLOCK) {
+ xfs_bmap_mark_sick(ip, is_attr ? XFS_ATTR_FORK : XFS_DATA_FORK);
+ goto out_rele;
+ }
+
+ if (is_attr) {
+ xfs_inode_mark_sick(ip, XFS_SICK_INO_XATTR);
+ goto out_rele;
+ }
+
+ if (lost->startblock > rec->rm_startblock) {
+ fileoff += lost->startblock - rec->rm_startblock;
+ blocks -= lost->startblock - rec->rm_startblock;
+ }
+ if (rmap_end > lost_end)
+ blocks -= rmap_end - lost_end;
+
+ fserror_report_data_lost(VFS_I(ip), XFS_FSB_TO_B(mp, fileoff),
+ XFS_FSB_TO_B(mp, blocks), GFP_NOFS);
+
+ out_rele:
+ xfs_irele(ip);
+ return 0;
+ }
+
+ /* Walk reverse mappings to look for all file data loss */
+ static int
+ xfs_verify_report_losses(
+ struct xfs_mount *mp,
+ enum xfs_group_type type,
+ xfs_daddr_t daddr,
+ u64 bblen)
+ {
+ struct xfs_group *xg = NULL;
+ struct xfs_trans *tp;
+ xfs_fsblock_t start_bno, end_bno;
+ uint32_t start_gno, end_gno;
+ int error;
+
+ if (type == XG_TYPE_RTG) {
+ start_bno = xfs_daddr_to_rtb(mp, daddr);
+ end_bno = xfs_daddr_to_rtb(mp, daddr + bblen - 1);
+ } else {
+ start_bno = XFS_DADDR_TO_FSB(mp, daddr);
+ end_bno = XFS_DADDR_TO_FSB(mp, daddr + bblen - 1);
+ }
+
+ tp = xfs_trans_alloc_empty(mp);
+ start_gno = xfs_fsb_to_gno(mp, start_bno, type);
+ end_gno = xfs_fsb_to_gno(mp, end_bno, type);
+ while ((xg = xfs_group_next_range(mp, xg, start_gno, end_gno, type))) {
+ struct xfs_buf *agf_bp = NULL;
+ struct xfs_rtgroup *rtg = NULL;
+ struct xfs_btree_cur *cur;
+ struct xfs_rmap_irec ri_low = { };
+ struct xfs_rmap_irec ri_high;
+ struct xfs_group_data_lost lost;
+
+ if (type == XG_TYPE_AG) {
+ struct xfs_perag *pag = to_perag(xg);
+
+ error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp);
+ if (error) {
+ xfs_perag_put(pag);
+ break;
+ }
+
+ cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, pag);
+ } else {
+ rtg = to_rtg(xg);
+ xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
+ cur = xfs_rtrmapbt_init_cursor(tp, rtg);
+ }
+
+ /*
+ * Set the rmap query range from ri_low to ri_high, which represents
+ * the [start, end] range in which we look for affected files or metadata.
+ */
+ memset(&ri_high, 0xFF, sizeof(ri_high));
+ if (xg->xg_gno == start_gno)
+ ri_low.rm_startblock =
+ xfs_fsb_to_gbno(mp, start_bno, type);
+ if (xg->xg_gno == end_gno)
+ ri_high.rm_startblock =
+ xfs_fsb_to_gbno(mp, end_bno, type);
+
+ lost.startblock = ri_low.rm_startblock;
+ /* use 64-bit math so the all-ones sentinel in ri_high can't wrap */
+ lost.blockcount = min_t(uint64_t, xg->xg_block_count,
+ (uint64_t)ri_high.rm_startblock + 1) -
+ ri_low.rm_startblock;
+
+ error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
+ xfs_verify_report_data_lost, &lost);
+ xfs_btree_del_cursor(cur, error);
+ if (agf_bp)
+ xfs_trans_brelse(tp, agf_bp);
+ if (rtg)
+ xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
+ if (error) {
+ xfs_group_put(xg);
+ break;
+ }
+ }
+
+ xfs_trans_cancel(tp);
+ return 0;
+ }
+
+ /*
+ * Compute the desired verify IO size.
+ *
+ * To minimize command overhead, we'd like to create bios that are 1MB, though
+ * we allow the user to ask for a smaller size.
+ */
+ static unsigned int
+ xfs_verify_iosize(
+ const struct xfs_verify_media *me,
+ struct xfs_buftarg *btp,
+ uint64_t bbcount)
+ {
+ unsigned int iosize =
+ min_not_zero(SZ_1M, me->me_max_io_size);
+
+ BUILD_BUG_ON(BBSHIFT != SECTOR_SHIFT);
+ ASSERT(BBTOB(bbcount) >= bdev_logical_block_size(btp->bt_bdev));
+
+ return clamp(iosize, bdev_logical_block_size(btp->bt_bdev),
+ BBTOB(bbcount));
+ }
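+
+ /*
+ * Sizing example (illustrative): with me_max_io_size of zero we aim for
+ * 1 MiB reads; a caller that sets me_max_io_size to 128KiB caps the
+ * verification buffer, and hence each bio, at 128KiB.  Either way the
+ * result is clamped to at least the device's logical block size and to no
+ * more than the number of bytes left to verify.
+ */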
+
+ /* Allocate as much memory as we can get for verification buffer. */
+ static struct folio *
+ xfs_verify_alloc_folio(
+ const unsigned int iosize)
+ {
+ unsigned int order = get_order(iosize);
+
+ while (order > 0) {
+ struct folio *folio =
+ folio_alloc(GFP_KERNEL | __GFP_NORETRY, order);
+
+ if (folio)
+ return folio;
+ order--;
+ }
+
+ return folio_alloc(GFP_KERNEL, 0);
+ }
+
+ /* Report any kind of problem verifying media */
+ static void
+ xfs_verify_media_error(
+ struct xfs_mount *mp,
+ struct xfs_verify_media *me,
+ struct xfs_buftarg *btp,
+ xfs_daddr_t daddr,
+ unsigned int bio_bbcount,
+ blk_status_t bio_status)
+ {
+ trace_xfs_verify_media_error(mp, me, btp->bt_bdev->bd_dev, daddr,
+ bio_bbcount, bio_status);
+
+ /*
+ * Pass any error, I/O or otherwise, up to the caller if we didn't
+ * successfully verify any bytes at all.
+ */
+ if (me->me_start_daddr == daddr)
+ me->me_ioerror = -blk_status_to_errno(bio_status);
+
+ /*
+ * PI validation failures, medium errors, or general IO errors are
+ * treated as indicators of data loss. Everything else is (hopefully) a
+ * transient error and is not reported to healthmon or fsnotify.
+ */
+ switch (bio_status) {
+ case BLK_STS_PROTECTION:
+ case BLK_STS_IOERR:
+ case BLK_STS_MEDIUM:
+ break;
+ default:
+ return;
+ }
+
+ if (!(me->me_flags & XFS_VERIFY_MEDIA_REPORT))
+ return;
+
+ xfs_healthmon_report_media(mp, me->me_dev, daddr, bio_bbcount);
+
+ if (!xfs_has_rmapbt(mp))
+ return;
+
+ switch (me->me_dev) {
+ case XFS_DEV_DATA:
+ xfs_verify_report_losses(mp, XG_TYPE_AG, daddr, bio_bbcount);
+ break;
+ case XFS_DEV_RT:
+ xfs_verify_report_losses(mp, XG_TYPE_RTG, daddr, bio_bbcount);
+ break;
+ }
+ }
+
+ /* Verify the media of an xfs device by submitting read requests to the disk. */
+ static int
+ xfs_verify_media(
+ struct xfs_mount *mp,
+ struct xfs_verify_media *me)
+ {
+ struct xfs_buftarg *btp = NULL;
+ struct bio *bio;
+ struct folio *folio;
+ xfs_daddr_t daddr;
+ uint64_t bbcount;
+ int error = 0;
+
+ me->me_ioerror = 0;
+
+ switch (me->me_dev) {
+ case XFS_DEV_DATA:
+ btp = mp->m_ddev_targp;
+ break;
+ case XFS_DEV_LOG:
+ if (mp->m_logdev_targp->bt_bdev != mp->m_ddev_targp->bt_bdev)
+ btp = mp->m_logdev_targp;
+ break;
+ case XFS_DEV_RT:
+ btp = mp->m_rtdev_targp;
+ break;
+ }
+ if (!btp)
+ return -ENODEV;
+
+ /*
+ * If the caller told us to verify beyond the end of the device, clamp
+ * the range; the updated end_daddr is copied back to userspace.
+ */
+ if (me->me_end_daddr > btp->bt_nr_sectors)
+ me->me_end_daddr = btp->bt_nr_sectors;
+
+ /* start and end have to be aligned to the lba size */
+ if (!IS_ALIGNED(BBTOB(me->me_start_daddr | me->me_end_daddr),
+ bdev_logical_block_size(btp->bt_bdev)))
+ return -EINVAL;
+
+ /*
+ * end_daddr is the exclusive end of the range, so if start_daddr
+ * reaches there (or beyond), there's no work to be done.
+ */
+ if (me->me_start_daddr >= me->me_end_daddr)
+ return 0;
+
+ /*
+ * There are three ranges involved here:
+ *
+ * - [me->me_start_daddr, me->me_end_daddr) is the range that the
+ * user wants to verify. end_daddr can be beyond the end of the
+ * disk; we'll constrain it to the end if necessary.
+ *
+ * - [daddr, me->me_end_daddr) is the range that we have not yet
+ * verified. We update daddr after each successful read.
+ * me->me_start_daddr is set to daddr before returning.
+ *
+ * - [daddr, daddr + bio_bbcount) is the range that we're currently
+ * verifying.
+ */
+ daddr = me->me_start_daddr;
+ bbcount = min_t(sector_t, me->me_end_daddr, btp->bt_nr_sectors) -
+ me->me_start_daddr;
+
+ folio = xfs_verify_alloc_folio(xfs_verify_iosize(me, btp, bbcount));
+ if (!folio)
+ return -ENOMEM;
+
+ trace_xfs_verify_media(mp, me, btp->bt_bdev->bd_dev, daddr, bbcount,
+ folio);
+
+ bio = bio_alloc(btp->bt_bdev, 1, REQ_OP_READ, GFP_KERNEL);
+ if (!bio) {
+ error = -ENOMEM;
+ goto out_folio;
+ }
+
+ while (bbcount > 0) {
+ unsigned int bio_bbcount;
+ blk_status_t bio_status;
+
+ bio_reset(bio, btp->bt_bdev, REQ_OP_READ);
+ bio->bi_iter.bi_sector = daddr;
+ bio_add_folio_nofail(bio, folio,
+ min(bbcount << SECTOR_SHIFT, folio_size(folio)),
+ 0);
+
+ /*
+ * Save the length of the bio before we submit it, because we
+ * need the original daddr and length for reporting IO errors
+ * if the bio fails.
+ */
+ bio_bbcount = bio->bi_iter.bi_size >> SECTOR_SHIFT;
+ submit_bio_wait(bio);
+ bio_status = bio->bi_status;
+ if (bio_status != BLK_STS_OK) {
+ xfs_verify_media_error(mp, me, btp, daddr, bio_bbcount,
+ bio_status);
+ error = 0;
+ break;
+ }
+
+ daddr += bio_bbcount;
+ bbcount -= bio_bbcount;
+
+ if (bbcount == 0)
+ break;
+
+ if (me->me_rest_us) {
+ ktime_t expires;
+
+ expires = ktime_add_ns(ktime_get(),
+ me->me_rest_us * 1000);
+ set_current_state(TASK_KILLABLE);
+ schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
+ }
+
+ if (fatal_signal_pending(current)) {
+ error = -EINTR;
+ break;
+ }
+
+ cond_resched();
+ }
+
+ bio_put(bio);
+ out_folio:
+ folio_put(folio);
+
+ if (error)
+ return error;
+
+ /*
+ * Advance start_daddr to the end of what we verified if there wasn't
+ * an operational error.
+ */
+ me->me_start_daddr = daddr;
+ trace_xfs_verify_media_end(mp, me, btp->bt_bdev->bd_dev);
+ return 0;
+ }
+
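+ /*
+ * Illustrative sketch of how a scrubbing daemon might drive media
+ * verification.  This is not code from this file; XFS_IOC_VERIFY_MEDIA is
+ * assumed to be the ioctl command bound to this handler, and the struct
+ * fields are the ones defined by this patchset's UAPI:
+ *
+ *	struct xfs_verify_media me = {
+ *		.me_dev		= XFS_DEV_DATA,
+ *		.me_start_daddr	= 0,
+ *		.me_end_daddr	= UINT64_MAX,
+ *		.me_flags	= XFS_VERIFY_MEDIA_REPORT,
+ *	};
+ *	int ret = ioctl(fsfd, XFS_IOC_VERIFY_MEDIA, &me);
+ *
+ * The oversized end_daddr is clamped to the device end.  On return,
+ * me_start_daddr has advanced past everything that verified cleanly, and
+ * me_ioerror is nonzero only if the very first sectors of the requested
+ * range could not be read, so the daemon can log the failed range and
+ * issue another call starting beyond it.
+ */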
+ int
+ xfs_ioc_verify_media(
+ struct file *file,
+ struct xfs_verify_media __user *arg)
+ {
+ struct xfs_verify_media me;
+ struct xfs_inode *ip = XFS_I(file_inode(file));
+ struct xfs_mount *mp = ip->i_mount;
+ int error;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(&me, arg, sizeof(me)))
+ return -EFAULT;
+
+ if (me.me_pad)
+ return -EINVAL;
+ if (me.me_flags & ~XFS_VERIFY_MEDIA_FLAGS)
+ return -EINVAL;
+
+ switch (me.me_dev) {
+ case XFS_DEV_DATA:
+ case XFS_DEV_LOG:
+ case XFS_DEV_RT:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ error = xfs_verify_media(mp, &me);
+ if (error)
+ return error;
+
+ if (copy_to_user(arg, &me, sizeof(me)))
+ return -EFAULT;
+
+ return 0;
+ }