__entry->error)
);
+TRACE_EVENT(xfs_verify_media,
+ TP_PROTO(const struct xfs_mount *mp, const struct xfs_verify_media *me,
+ dev_t fdev, xfs_daddr_t daddr, uint64_t bbcount,
+ const struct folio *folio),
+ TP_ARGS(mp, me, fdev, daddr, bbcount, folio),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(dev_t, fdev)
+ __field(xfs_daddr_t, start_daddr)
+ __field(xfs_daddr_t, end_daddr)
+ __field(unsigned int, flags)
+ __field(xfs_daddr_t, daddr)
+ __field(uint64_t, bbcount)
+ __field(unsigned int, bufsize)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_ddev_targp->bt_dev;
+ __entry->fdev = fdev;
+ __entry->start_daddr = me->me_start_daddr;
+ __entry->end_daddr = me->me_end_daddr;
+ __entry->flags = me->me_flags;
+ __entry->daddr = daddr;
+ __entry->bbcount = bbcount;
+ __entry->bufsize = folio_size(folio);
+ ),
+ TP_printk("dev %d:%d fdev %d:%d start_daddr 0x%llx end_daddr 0x%llx flags 0x%x daddr 0x%llx bbcount 0x%llx bufsize 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ MAJOR(__entry->fdev), MINOR(__entry->fdev),
+ __entry->start_daddr,
+ __entry->end_daddr,
+ __entry->flags,
+ __entry->daddr,
+ __entry->bbcount,
+ __entry->bufsize)
+);
+
+TRACE_EVENT(xfs_verify_media_end,
+ TP_PROTO(const struct xfs_mount *mp, const struct xfs_verify_media *me,
+ dev_t fdev),
+ TP_ARGS(mp, me, fdev),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(dev_t, fdev)
+ __field(xfs_daddr_t, start_daddr)
+ __field(xfs_daddr_t, end_daddr)
+ __field(int, ioerror)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_ddev_targp->bt_dev;
+ __entry->fdev = fdev;
+ __entry->start_daddr = me->me_start_daddr;
+ __entry->end_daddr = me->me_end_daddr;
+ __entry->ioerror = me->me_ioerror;
+ ),
+ TP_printk("dev %d:%d fdev %d:%d start_daddr 0x%llx end_daddr 0x%llx ioerror %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ MAJOR(__entry->fdev), MINOR(__entry->fdev),
+ __entry->start_daddr,
+ __entry->end_daddr,
+ __entry->ioerror)
+);
+
+TRACE_EVENT(xfs_verify_media_error,
+ TP_PROTO(const struct xfs_mount *mp, const struct xfs_verify_media *me,
+ dev_t fdev, xfs_daddr_t daddr, uint64_t bbcount,
+ blk_status_t status),
+ TP_ARGS(mp, me, fdev, daddr, bbcount, status),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(dev_t, fdev)
+ __field(xfs_daddr_t, start_daddr)
+ __field(xfs_daddr_t, end_daddr)
+ __field(unsigned int, flags)
+ __field(xfs_daddr_t, daddr)
+ __field(uint64_t, bbcount)
+ __field(int, error)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_ddev_targp->bt_dev;
+ __entry->fdev = fdev;
+ __entry->start_daddr = me->me_start_daddr;
+ __entry->end_daddr = me->me_end_daddr;
+ __entry->flags = me->me_flags;
+ __entry->daddr = daddr;
+ __entry->bbcount = bbcount;
+ __entry->error = blk_status_to_errno(status);
+ ),
+ TP_printk("dev %d:%d fdev %d:%d start_daddr 0x%llx end_daddr 0x%llx flags 0x%x daddr 0x%llx bbcount 0x%llx error %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ MAJOR(__entry->fdev), MINOR(__entry->fdev),
+ __entry->start_daddr,
+ __entry->end_daddr,
+ __entry->flags,
+ __entry->daddr,
+ __entry->bbcount,
+ __entry->error)
+);
+
#endif /* _TRACE_XFS_H */
#undef TRACE_INCLUDE_PATH
--- /dev/null
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2026 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_bit.h"
+#include "xfs_btree.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_trans.h"
+#include "xfs_alloc.h"
+#include "xfs_ag.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_rtgroup.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_health.h"
+#include "xfs_healthmon.h"
+#include "xfs_trace.h"
+#include "xfs_verify_media.h"
+
+#include <linux/fserror.h>
+
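+/* Region of a group lost to a media error, in group-relative blocks. */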
+struct xfs_group_data_lost {
+ xfs_agblock_t startblock;
+ xfs_extlen_t blockcount;
+};
+
+/* Report lost file data from rmap records */
+static int
+xfs_verify_report_data_lost(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *data)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_inode *ip;
+ struct xfs_group_data_lost *lost = data;
+ xfs_fileoff_t fileoff = rec->rm_offset;
+ xfs_extlen_t blocks = rec->rm_blockcount;
+ const bool is_attr =
+ (rec->rm_flags & XFS_RMAP_ATTR_FORK);
+ const xfs_agblock_t lost_end =
+ lost->startblock + lost->blockcount;
+ const xfs_agblock_t rmap_end =
+ rec->rm_startblock + rec->rm_blockcount;
+ int error = 0;
+
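+	/* Only file data is reported here, so skip rmaps not owned by an inode. */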
+ if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner))
+ return 0;
+
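+	/* If we can't grab the inode, skip it rather than abort the whole scan. */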
+ error = xfs_iget(mp, cur->bc_tp, rec->rm_owner, 0, 0, &ip);
+ if (error)
+ return 0;
+
+ if (rec->rm_flags & XFS_RMAP_BMBT_BLOCK) {
+ xfs_bmap_mark_sick(ip, is_attr ? XFS_ATTR_FORK : XFS_DATA_FORK);
+ goto out_rele;
+ }
+
+ if (is_attr) {
+ xfs_inode_mark_sick(ip, XFS_SICK_INO_XATTR);
+ goto out_rele;
+ }
+
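+	/* Trim the mapping to the intersection with the lost range. */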
+ if (lost->startblock > rec->rm_startblock) {
+ fileoff += lost->startblock - rec->rm_startblock;
+ blocks -= lost->startblock - rec->rm_startblock;
+ }
+ if (rmap_end > lost_end)
+ blocks -= rmap_end - lost_end;
+
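+	/* Notify userspace of the lost file range, converted to bytes. */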
+ fserror_report_data_lost(VFS_I(ip), XFS_FSB_TO_B(mp, fileoff),
+ XFS_FSB_TO_B(mp, blocks), GFP_NOFS);
+
+out_rele:
+ xfs_irele(ip);
+ return 0;
+}
+
+/* Walk the reverse mappings to find and report all lost file data */
+static int
+xfs_verify_report_losses(
+ struct xfs_mount *mp,
+ enum xfs_group_type type,
+ xfs_daddr_t daddr,
+ u64 bblen)
+{
+ struct xfs_group *xg = NULL;
+ struct xfs_trans *tp;
+ xfs_fsblock_t start_bno, end_bno;
+ uint32_t start_gno, end_gno;
+ int error;
+
+ if (type == XG_TYPE_RTG) {
+ start_bno = xfs_daddr_to_rtb(mp, daddr);
+ end_bno = xfs_daddr_to_rtb(mp, daddr + bblen - 1);
+ } else {
+ start_bno = XFS_DADDR_TO_FSB(mp, daddr);
+ end_bno = XFS_DADDR_TO_FSB(mp, daddr + bblen - 1);
+ }
+
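+	/* An empty transaction suffices for the read-only rmap queries below. */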
+ tp = xfs_trans_alloc_empty(mp);
+ start_gno = xfs_fsb_to_gno(mp, start_bno, type);
+ end_gno = xfs_fsb_to_gno(mp, end_bno, type);
+ while ((xg = xfs_group_next_range(mp, xg, start_gno, end_gno, type))) {
+ struct xfs_buf *agf_bp = NULL;
+ struct xfs_rtgroup *rtg = NULL;
+ struct xfs_btree_cur *cur;
+ struct xfs_rmap_irec ri_low = { };
+ struct xfs_rmap_irec ri_high;
+ struct xfs_group_data_lost lost;
+
+ if (type == XG_TYPE_AG) {
+ struct xfs_perag *pag = to_perag(xg);
+
+ error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp);
+ if (error) {
+ xfs_perag_put(pag);
+ break;
+ }
+
+ cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, pag);
+ } else {
+ rtg = to_rtg(xg);
+ xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
+ cur = xfs_rtrmapbt_init_cursor(tp, rtg);
+ }
+
+ /*
+		 * Set the rmap query range from ri_low to ri_high, which
+		 * represents the inclusive [start, end] range in which we look
+		 * for file data or metadata.
+ */
+ memset(&ri_high, 0xFF, sizeof(ri_high));
+ if (xg->xg_gno == start_gno)
+ ri_low.rm_startblock =
+ xfs_fsb_to_gbno(mp, start_bno, type);
+ if (xg->xg_gno == end_gno)
+ ri_high.rm_startblock =
+ xfs_fsb_to_gbno(mp, end_bno, type);
+
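+		/* Compute the group-relative extent covered by the lost range. */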
+ lost.startblock = ri_low.rm_startblock;
+		lost.blockcount = min(xg->xg_block_count - 1,
+					  ri_high.rm_startblock) + 1 -
+				  ri_low.rm_startblock;
+
+ error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
+ xfs_verify_report_data_lost, &lost);
+ xfs_btree_del_cursor(cur, error);
+ if (agf_bp)
+ xfs_trans_brelse(tp, agf_bp);
+ if (rtg)
+ xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
+ if (error) {
+ xfs_group_put(xg);
+ break;
+ }
+ }
+
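+	/* Reporting is best-effort, so errors from the walk are not returned. */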
+ xfs_trans_cancel(tp);
+ return 0;
+}
+
+/*
+ * Compute the desired verify IO size.
+ *
+ * To minimize per-command overhead, we'd like to issue 1MB bios, though we
+ * allow the user to ask for a smaller IO size.
+ */
+static unsigned int
+xfs_verify_iosize(
+ const struct xfs_verify_media *me,
+ struct xfs_buftarg *btp,
+ uint64_t bbcount)
+{
+ unsigned int iosize =
+ min_not_zero(SZ_1M, me->me_max_io_size);
+
+ BUILD_BUG_ON(BBSHIFT != SECTOR_SHIFT);
+ ASSERT(BBTOB(bbcount) >= bdev_logical_block_size(btp->bt_bdev));
+
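+	/* Each IO must cover at least one logical block and at most the range. */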
+ return clamp(iosize, bdev_logical_block_size(btp->bt_bdev),
+ BBTOB(bbcount));
+}
+
+/* Allocate as much memory as we can get for the verification buffer. */
+static struct folio *
+xfs_verify_alloc_folio(
+ const unsigned int iosize)
+{
+ unsigned int order = get_order(iosize);
+
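+	/* Try progressively smaller high-order folios without heavy reclaim. */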
+ while (order > 0) {
+ struct folio *folio =
+ folio_alloc(GFP_KERNEL | __GFP_NORETRY, order);
+
+ if (folio)
+ return folio;
+ order--;
+ }
+
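+	/* Fall back to a single page, this time allowing reclaim to block. */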
+ return folio_alloc(GFP_KERNEL, 0);
+}
+
+/* Report any kind of problem verifying media */
+static void
+xfs_verify_media_error(
+ struct xfs_mount *mp,
+ struct xfs_verify_media *me,
+ struct xfs_buftarg *btp,
+ xfs_daddr_t daddr,
+ unsigned int bio_bbcount,
+ blk_status_t bio_status)
+{
+ trace_xfs_verify_media_error(mp, me, btp->bt_bdev->bd_dev, daddr,
+ bio_bbcount, bio_status);
+
+ /*
+ * Pass any error, I/O or otherwise, up to the caller if we didn't
+ * successfully verify any bytes at all.
+ */
+ if (me->me_start_daddr == daddr)
+ me->me_ioerror = -blk_status_to_errno(bio_status);
+
+ /*
+ * PI validation failures, medium errors, or general IO errors are
+	 * treated as indicators of data loss. Everything else is (hopefully) a
+	 * transient error and is not reported to healthmon or fsnotify.
+ */
+ switch (bio_status) {
+ case BLK_STS_PROTECTION:
+ case BLK_STS_IOERR:
+ case BLK_STS_MEDIUM:
+ break;
+ default:
+ return;
+ }
+
+ if (!(me->me_flags & XFS_VERIFY_MEDIA_REPORT))
+ return;
+
+ xfs_healthmon_report_media(mp, me->me_dev, daddr, bio_bbcount);
+
+ if (!xfs_has_rmapbt(mp))
+ return;
+
+ switch (me->me_dev) {
+ case XFS_DEV_DATA:
+ xfs_verify_report_losses(mp, XG_TYPE_AG, daddr, bio_bbcount);
+ break;
+ case XFS_DEV_RT:
+ xfs_verify_report_losses(mp, XG_TYPE_RTG, daddr, bio_bbcount);
+ break;
+ }
+}
+
+/* Verify the media of an xfs device by submitting read requests to the disk. */
+static int
+xfs_verify_media(
+ struct xfs_mount *mp,
+ struct xfs_verify_media *me)
+{
+ struct xfs_buftarg *btp = NULL;
+ struct bio *bio;
+ struct folio *folio;
+ xfs_daddr_t daddr;
+ uint64_t bbcount;
+ int error = 0;
+
+ me->me_ioerror = 0;
+
+ switch (me->me_dev) {
+ case XFS_DEV_DATA:
+ btp = mp->m_ddev_targp;
+ break;
+ case XFS_DEV_LOG:
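+		/* An internal log shares the data device; there is no separate media to scan. */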
+ if (mp->m_logdev_targp->bt_bdev != mp->m_ddev_targp->bt_bdev)
+ btp = mp->m_logdev_targp;
+ break;
+ case XFS_DEV_RT:
+ btp = mp->m_rtdev_targp;
+ break;
+ }
+ if (!btp)
+ return -ENODEV;
+
+ /*
+	 * If the caller asked us to verify beyond the end of the disk, clamp
+	 * the range so that the struct returned to userspace shows where
+	 * verification actually stopped.
+ */
+ if (me->me_end_daddr > btp->bt_nr_sectors)
+ me->me_end_daddr = btp->bt_nr_sectors;
+
+	/* The start and end of the range must be aligned to the logical block size. */
+ if (!IS_ALIGNED(BBTOB(me->me_start_daddr | me->me_end_daddr),
+ bdev_logical_block_size(btp->bt_bdev)))
+ return -EINVAL;
+
+ /*
+ * end_daddr is the exclusive end of the range, so if start_daddr
+ * reaches there (or beyond), there's no work to be done.
+ */
+ if (me->me_start_daddr >= me->me_end_daddr)
+ return 0;
+
+ /*
+ * There are three ranges involved here:
+ *
+ * - [me->me_start_daddr, me->me_end_daddr) is the range that the
+ * user wants to verify. end_daddr can be beyond the end of the
+ * disk; we'll constrain it to the end if necessary.
+ *
+ * - [daddr, me->me_end_daddr) is the range that we have not yet
+ * verified. We update daddr after each successful read.
+ * me->me_start_daddr is set to daddr before returning.
+ *
+ * - [daddr, daddr + bio_bbcount) is the range that we're currently
+ * verifying.
+ */
+ daddr = me->me_start_daddr;
+ bbcount = min_t(sector_t, me->me_end_daddr, btp->bt_nr_sectors) -
+ me->me_start_daddr;
+
+ folio = xfs_verify_alloc_folio(xfs_verify_iosize(me, btp, bbcount));
+ if (!folio)
+ return -ENOMEM;
+
+ trace_xfs_verify_media(mp, me, btp->bt_bdev->bd_dev, daddr, bbcount,
+ folio);
+
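+	/* Allocate one bio up front and reset it for each read in the loop. */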
+ bio = bio_alloc(btp->bt_bdev, 1, REQ_OP_READ, GFP_KERNEL);
+ if (!bio) {
+ error = -ENOMEM;
+ goto out_folio;
+ }
+
+ while (bbcount > 0) {
+ unsigned int bio_bbcount;
+ blk_status_t bio_status;
+
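+		/* Point the bio at the next unverified chunk of the device. */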
+ bio_reset(bio, btp->bt_bdev, REQ_OP_READ);
+ bio->bi_iter.bi_sector = daddr;
+ bio_add_folio_nofail(bio, folio,
+ min(bbcount << SECTOR_SHIFT, folio_size(folio)),
+ 0);
+
+ /*
+ * Save the length of the bio before we submit it, because we
+ * need the original daddr and length for reporting IO errors
+ * if the bio fails.
+ */
+ bio_bbcount = bio->bi_iter.bi_size >> SECTOR_SHIFT;
+ submit_bio_wait(bio);
+ bio_status = bio->bi_status;
+ if (bio_status != BLK_STS_OK) {
+ xfs_verify_media_error(mp, me, btp, daddr, bio_bbcount,
+ bio_status);
+ error = 0;
+ break;
+ }
+
+ daddr += bio_bbcount;
+ bbcount -= bio_bbcount;
+
+ if (bbcount == 0)
+ break;
+
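+		/* Rest between IOs, if requested, to limit the load on the device. */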
+ if (me->me_rest_us) {
+ ktime_t expires;
+
+			expires = ktime_add_ns(ktime_get(),
+					(u64)me->me_rest_us * NSEC_PER_USEC);
+ set_current_state(TASK_KILLABLE);
+ schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
+ }
+
+ if (fatal_signal_pending(current)) {
+ error = -EINTR;
+ break;
+ }
+
+ cond_resched();
+ }
+
+ bio_put(bio);
+out_folio:
+ folio_put(folio);
+
+ if (error)
+ return error;
+
+ /*
+ * Advance start_daddr to the end of what we verified if there wasn't
+ * an operational error.
+ */
+ me->me_start_daddr = daddr;
+ trace_xfs_verify_media_end(mp, me, btp->bt_bdev->bd_dev);
+ return 0;
+}
+
+int
+xfs_ioc_verify_media(
+ struct file *file,
+ struct xfs_verify_media __user *arg)
+{
+ struct xfs_verify_media me;
+ struct xfs_inode *ip = XFS_I(file_inode(file));
+ struct xfs_mount *mp = ip->i_mount;
+ int error;
+
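+	/* Scanning the whole device is a privileged operation. */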
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(&me, arg, sizeof(me)))
+ return -EFAULT;
+
+ if (me.me_pad)
+ return -EINVAL;
+ if (me.me_flags & ~XFS_VERIFY_MEDIA_FLAGS)
+ return -EINVAL;
+
+ switch (me.me_dev) {
+ case XFS_DEV_DATA:
+ case XFS_DEV_LOG:
+ case XFS_DEV_RT:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ error = xfs_verify_media(mp, &me);
+ if (error)
+ return error;
+
+ if (copy_to_user(arg, &me, sizeof(me)))
+ return -EFAULT;
+
+ return 0;
+}