xfs: implement zoned garbage collection
author Christoph Hellwig <hch@lst.de>
Sat, 15 Feb 2025 18:37:19 +0000 (19:37 +0100)
committer Christoph Hellwig <hch@lst.de>
Mon, 3 Mar 2025 15:17:07 +0000 (08:17 -0700)
RT groups on a zoned file system need to be completely empty before their
space can be reused.  This means that partially empty groups need to be
emptied entirely to free up space if no entirely free groups are
available.

Add a garbage collection thread that moves all data out of the least used
zone when not enough free zones are available, and that resets all zones
that have been emptied.  To pick victim zones, a simple set of 10 buckets
keyed on the amount of space used in each zone is maintained.  To empty a
zone, the rmap is walked to find the owners and the still-live data is
read and then written to its new location.
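
For illustration only, not part of the patch: a zone's bucket is simply its
used-block count scaled to tenths of the zone size, so the emptiest zones
always land in bucket 0, which GC scans first.  A minimal standalone C
sketch of that mapping, using a made-up zone size and sample counts:

#include <stdio.h>

#define USED_BUCKETS    10u

static unsigned int zone_bucket(unsigned int used_blocks,
                                unsigned int zone_blocks)
{
        /*
         * Bucket 0 holds the least used, i.e. cheapest to evacuate, zones.
         * Fully used zones are never bucketed, so the result stays below
         * USED_BUCKETS.
         */
        return USED_BUCKETS * used_blocks / zone_blocks;
}

int main(void)
{
        unsigned int zone_blocks = 65536;  /* blocks per zone, example value */
        unsigned int samples[] = { 700, 6000, 30000, 65000 };
        unsigned int i;

        for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
                printf("used %5u -> bucket %u\n", samples[i],
                       zone_bucket(samples[i], zone_blocks));
        return 0;
}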

To automatically defragment files, the rmap records are sorted by inode
and logical offset.  This means defragmentation of parallel writes into
a single zone happens automatically when performing garbage collection.
Because holding the iolock over the entire GC cycle would inject very
noticeable latency for other accesses to the inodes, the iolock is not
taken while performing I/O.  Instead the I/O completion handler checks
that the mapping hasn't changed from the one recorded at the start of
the GC cycle and doesn't update the mapping if it has changed.
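
As an aside, and again not part of the patch, the effect of that sort order
can be seen with a few lines of userspace C: ordering the records by owner
(inode number) and then by file offset groups each inode's extents back
together before they are rewritten, which is where the automatic
defragmentation comes from.  The struct and sample values below are
simplified stand-ins for the real rmap records:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct rec {
        uint64_t owner;         /* inode number owning the extent */
        uint64_t offset;        /* logical file offset of the extent */
};

static int cmp_u64(uint64_t l, uint64_t r)
{
        return (l > r) - (l < r);
}

static int rec_cmp(const void *a, const void *b)
{
        const struct rec *ra = a, *rb = b;
        int diff = cmp_u64(ra->owner, rb->owner);

        return diff ? diff : cmp_u64(ra->offset, rb->offset);
}

int main(void)
{
        /* extents of two files, interleaved as they were written to the zone */
        struct rec recs[] = {
                { 133, 8 }, { 250, 0 }, { 133, 0 }, { 250, 4 }, { 133, 4 },
        };
        size_t i, n = sizeof(recs) / sizeof(recs[0]);

        qsort(recs, n, sizeof(recs[0]), rec_cmp);
        for (i = 0; i < n; i++)
                printf("inode %llu offset %llu\n",
                       (unsigned long long)recs[i].owner,
                       (unsigned long long)recs[i].offset);
        return 0;
}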

Co-developed-by: Hans Holmberg <hans.holmberg@wdc.com>
Signed-off-by: Hans Holmberg <hans.holmberg@wdc.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
13 files changed:
fs/xfs/Makefile
fs/xfs/libxfs/xfs_group.h
fs/xfs/libxfs/xfs_rtgroup.h
fs/xfs/xfs_extent_busy.c
fs/xfs/xfs_mount.c
fs/xfs/xfs_mount.h
fs/xfs/xfs_super.c
fs/xfs/xfs_trace.h
fs/xfs/xfs_zone_alloc.c
fs/xfs/xfs_zone_alloc.h
fs/xfs/xfs_zone_gc.c [new file with mode: 0644]
fs/xfs/xfs_zone_priv.h
fs/xfs/xfs_zone_space_resv.c

index bdedf4bdb1db6c1ee2e1415e21f94218943e54f9..e38838409271691e34e80a45d6dc202e869739fe 100644 (file)
@@ -139,6 +139,7 @@ xfs-$(CONFIG_XFS_QUOTA)             += xfs_dquot.o \
 # xfs_rtbitmap is shared with libxfs
 xfs-$(CONFIG_XFS_RT)           += xfs_rtalloc.o \
                                   xfs_zone_alloc.o \
+                                  xfs_zone_gc.o \
                                   xfs_zone_space_resv.o
 
 xfs-$(CONFIG_XFS_POSIX_ACL)    += xfs_acl.o
index a70096113384e019ae0b095d2458d3aba379c496..cff3f815947bfca1a49cb1116621f1770393018f 100644 (file)
@@ -19,10 +19,23 @@ struct xfs_group {
 #ifdef __KERNEL__
        /* -- kernel only structures below this line -- */
 
-       /*
-        * Track freed but not yet committed extents.
-        */
-       struct xfs_extent_busy_tree *xg_busy_extents;
+       union {
+               /*
+                * For perags and non-zoned RT groups:
+                * Track freed but not yet committed extents.
+                */
+               struct xfs_extent_busy_tree     *xg_busy_extents;
+
+               /*
+                * For zoned RT groups:
+                * List of groups that need a zone reset.
+                *
+                * The zonegc code forces a log flush of the rtrmap inode before
+                * resetting the write pointer, so there is no need for
+                * individual busy extent tracking.
+                */
+               struct xfs_group                *xg_next_reset;
+       };
 
        /*
         * Bitsets of per-ag metadata that have been checked and/or are sick.
index 5d8777f819f4359d9f0ddfeac161351a00f1fa0b..b325aff28264d5066517a322ec9e20f33656e3df 100644 (file)
@@ -58,6 +58,12 @@ struct xfs_rtgroup {
  */
 #define XFS_RTG_FREE                   XA_MARK_0
 
+/*
+ * For zoned RT devices this is set on groups that are fully written and that
+ * have unused blocks.  Used by the garbage collection to pick targets.
+ */
+#define XFS_RTG_RECLAIMABLE            XA_MARK_1
+
 static inline struct xfs_rtgroup *to_rtg(struct xfs_group *xg)
 {
        return container_of(xg, struct xfs_rtgroup, rtg_group);
index ea43c9a6e54c12bdb36a3614d8f2054aeceb7151..da31615727357010e4e68e0a735060f010626df5 100644 (file)
@@ -671,7 +671,7 @@ xfs_extent_busy_wait_all(
        while ((pag = xfs_perag_next(mp, pag)))
                xfs_extent_busy_wait_group(pag_group(pag));
 
-       if (xfs_has_rtgroups(mp))
+       if (xfs_has_rtgroups(mp) && !xfs_has_zoned(mp))
                while ((rtg = xfs_rtgroup_next(mp, rtg)))
                        xfs_extent_busy_wait_group(rtg_group(rtg));
 }
index 066805e7205498cdeb55c5c7016ca9c3154e5599..d8aba6a28ba7a954a815049c5ef1208a9e1b6b56 100644 (file)
@@ -1090,6 +1090,8 @@ xfs_mountfs(
                error = xfs_fs_reserve_ag_blocks(mp);
                if (error && error != -ENOSPC)
                        goto out_agresv;
+
+               xfs_zone_gc_start(mp);
        }
 
        return 0;
@@ -1178,6 +1180,8 @@ xfs_unmountfs(
        xfs_inodegc_flush(mp);
 
        xfs_blockgc_stop(mp);
+       if (!test_bit(XFS_OPSTATE_READONLY, &mp->m_opstate))
+               xfs_zone_gc_stop(mp);
        xfs_fs_unreserve_ag_blocks(mp);
        xfs_qm_unmount_quotas(mp);
        if (xfs_has_zoned(mp))
index 0772b74fc8fdbf173cf7cf8810751319acd4c17f..4b406f57548a2a4b28ab16f4092a363378160b67 100644 (file)
@@ -556,6 +556,8 @@ __XFS_HAS_FEAT(nouuid, NOUUID)
 #define XFS_OPSTATE_RESUMING_QUOTAON   18
 /* Kernel has logged a warning about zoned RT device being used on this fs. */
 #define XFS_OPSTATE_WARNED_ZONED       19
+/* (Zoned) GC is in progress */
+#define XFS_OPSTATE_ZONEGC_RUNNING     20
 
 #define __XFS_IS_OPSTATE(name, NAME) \
 static inline bool xfs_is_ ## name (struct xfs_mount *mp) \
@@ -600,6 +602,7 @@ static inline bool xfs_clear_resuming_quotaon(struct xfs_mount *mp)
 #endif /* CONFIG_XFS_QUOTA */
 __XFS_IS_OPSTATE(done_with_log_incompat, UNSET_LOG_INCOMPAT)
 __XFS_IS_OPSTATE(using_logged_xattrs, USE_LARP)
+__XFS_IS_OPSTATE(zonegc_running, ZONEGC_RUNNING)
 
 static inline bool
 xfs_should_warn(struct xfs_mount *mp, long nr)
index a840e1c68ff2bab9ff28bafa1501ab1f8ad8dac6..39b2bad67fcde9337719bc87ce84b0c84ad4748f 100644 (file)
@@ -46,6 +46,7 @@
 #include "xfs_exchmaps_item.h"
 #include "xfs_parent.h"
 #include "xfs_rtalloc.h"
+#include "xfs_zone_alloc.h"
 #include "scrub/stats.h"
 #include "scrub/rcbag_btree.h"
 
@@ -822,6 +823,7 @@ xfs_fs_sync_fs(
        if (sb->s_writers.frozen == SB_FREEZE_PAGEFAULT) {
                xfs_inodegc_stop(mp);
                xfs_blockgc_stop(mp);
+               xfs_zone_gc_stop(mp);
        }
 
        return 0;
@@ -994,6 +996,7 @@ xfs_fs_freeze(
        if (ret && !xfs_is_readonly(mp)) {
                xfs_blockgc_start(mp);
                xfs_inodegc_start(mp);
+               xfs_zone_gc_start(mp);
        }
 
        return ret;
@@ -1015,6 +1018,7 @@ xfs_fs_unfreeze(
         * filesystem.
         */
        if (!xfs_is_readonly(mp)) {
+               xfs_zone_gc_start(mp);
                xfs_blockgc_start(mp);
                xfs_inodegc_start(mp);
        }
@@ -1948,6 +1952,9 @@ xfs_remount_rw(
        /* Re-enable the background inode inactivation worker. */
        xfs_inodegc_start(mp);
 
+       /* Restart zone reclaim */
+       xfs_zone_gc_start(mp);
+
        return 0;
 }
 
@@ -1992,6 +1999,9 @@ xfs_remount_ro(
         */
        xfs_inodegc_stop(mp);
 
+       /* Stop zone reclaim */
+       xfs_zone_gc_stop(mp);
+
        /* Free the per-AG metadata reservation pool. */
        xfs_fs_unreserve_ag_blocks(mp);
 
index 6d6099ef50af12a20465f008654bbff527c65c60..40f5aa1edf6b35e419c7c0a6b5f65e66befc7cc7 100644 (file)
@@ -295,8 +295,11 @@ DECLARE_EVENT_CLASS(xfs_zone_class,
 DEFINE_EVENT(xfs_zone_class, name,                     \
        TP_PROTO(struct xfs_rtgroup *rtg),              \
        TP_ARGS(rtg))
+DEFINE_ZONE_EVENT(xfs_zone_emptied);
 DEFINE_ZONE_EVENT(xfs_zone_full);
 DEFINE_ZONE_EVENT(xfs_zone_opened);
+DEFINE_ZONE_EVENT(xfs_zone_reset);
+DEFINE_ZONE_EVENT(xfs_zone_gc_target_opened);
 
 TRACE_EVENT(xfs_zone_free_blocks,
        TP_PROTO(struct xfs_rtgroup *rtg, xfs_rgblock_t rgbno,
@@ -364,6 +367,28 @@ DEFINE_EVENT(xfs_zone_alloc_class, name,                   \
 DEFINE_ZONE_ALLOC_EVENT(xfs_zone_record_blocks);
 DEFINE_ZONE_ALLOC_EVENT(xfs_zone_alloc_blocks);
 
+TRACE_EVENT(xfs_zone_gc_select_victim,
+       TP_PROTO(struct xfs_rtgroup *rtg, unsigned int bucket),
+       TP_ARGS(rtg, bucket),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_rgnumber_t, rgno)
+               __field(xfs_rgblock_t, used)
+               __field(unsigned int, bucket)
+       ),
+       TP_fast_assign(
+               __entry->dev = rtg_mount(rtg)->m_super->s_dev;
+               __entry->rgno = rtg_rgno(rtg);
+               __entry->used = rtg_rmap(rtg)->i_used_blocks;
+               __entry->bucket = bucket;
+       ),
+       TP_printk("dev %d:%d rgno 0x%x used 0x%x bucket %u",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->rgno,
+                 __entry->used,
+                 __entry->bucket)
+);
+
 TRACE_EVENT(xfs_zones_mount,
        TP_PROTO(struct xfs_mount *mp),
        TP_ARGS(mp),
index 3d3f7589bf637d510fb47547ca20d90281e7b7dc..b7b2820ec0efa4878e06e99dec5db234adf99158 100644 (file)
@@ -35,6 +35,104 @@ xfs_open_zone_put(
        }
 }
 
+static inline uint32_t
+xfs_zone_bucket(
+       struct xfs_mount        *mp,
+       uint32_t                used_blocks)
+{
+       return XFS_ZONE_USED_BUCKETS * used_blocks /
+                       mp->m_groups[XG_TYPE_RTG].blocks;
+}
+
+static inline void
+xfs_zone_add_to_bucket(
+       struct xfs_zone_info    *zi,
+       xfs_rgnumber_t          rgno,
+       uint32_t                to_bucket)
+{
+       __set_bit(rgno, zi->zi_used_bucket_bitmap[to_bucket]);
+       zi->zi_used_bucket_entries[to_bucket]++;
+}
+
+static inline void
+xfs_zone_remove_from_bucket(
+       struct xfs_zone_info    *zi,
+       xfs_rgnumber_t          rgno,
+       uint32_t                from_bucket)
+{
+       __clear_bit(rgno, zi->zi_used_bucket_bitmap[from_bucket]);
+       zi->zi_used_bucket_entries[from_bucket]--;
+}
+
+static void
+xfs_zone_account_reclaimable(
+       struct xfs_rtgroup      *rtg,
+       uint32_t                freed)
+{
+       struct xfs_group        *xg = &rtg->rtg_group;
+       struct xfs_mount        *mp = rtg_mount(rtg);
+       struct xfs_zone_info    *zi = mp->m_zone_info;
+       uint32_t                used = rtg_rmap(rtg)->i_used_blocks;
+       xfs_rgnumber_t          rgno = rtg_rgno(rtg);
+       uint32_t                from_bucket = xfs_zone_bucket(mp, used + freed);
+       uint32_t                to_bucket = xfs_zone_bucket(mp, used);
+       bool                    was_full = (used + freed == rtg_blocks(rtg));
+
+       /*
+        * This can be called from log recovery, where the zone_info structure
+        * hasn't been allocated yet.  Skip all work as xfs_mount_zones will
+        * add the zones to the right buckets before the file system becomes
+        * active.
+        */
+       if (!zi)
+               return;
+
+       if (!used) {
+               /*
+                * The zone is now empty, remove it from the bottom bucket and
+                * trigger a reset.
+                */
+               trace_xfs_zone_emptied(rtg);
+
+               if (!was_full)
+                       xfs_group_clear_mark(xg, XFS_RTG_RECLAIMABLE);
+
+               spin_lock(&zi->zi_used_buckets_lock);
+               if (!was_full)
+                       xfs_zone_remove_from_bucket(zi, rgno, from_bucket);
+               spin_unlock(&zi->zi_used_buckets_lock);
+
+               spin_lock(&zi->zi_reset_list_lock);
+               xg->xg_next_reset = zi->zi_reset_list;
+               zi->zi_reset_list = xg;
+               spin_unlock(&zi->zi_reset_list_lock);
+
+               if (zi->zi_gc_thread)
+                       wake_up_process(zi->zi_gc_thread);
+       } else if (was_full) {
+               /*
+                * The zone transitioned from full, mark it as reclaimable
+                * and wake up GC which might be waiting for zones to reclaim.
+                */
+               spin_lock(&zi->zi_used_buckets_lock);
+               xfs_zone_add_to_bucket(zi, rgno, to_bucket);
+               spin_unlock(&zi->zi_used_buckets_lock);
+
+               xfs_group_set_mark(xg, XFS_RTG_RECLAIMABLE);
+               if (zi->zi_gc_thread && xfs_zoned_need_gc(mp))
+                       wake_up_process(zi->zi_gc_thread);
+       } else if (to_bucket != from_bucket) {
+               /*
+                * Move the zone to a new bucket if it dropped below the
+                * threshold.
+                */
+               spin_lock(&zi->zi_used_buckets_lock);
+               xfs_zone_add_to_bucket(zi, rgno, to_bucket);
+               xfs_zone_remove_from_bucket(zi, rgno, from_bucket);
+               spin_unlock(&zi->zi_used_buckets_lock);
+       }
+}
+
 static void
 xfs_open_zone_mark_full(
        struct xfs_open_zone    *oz)
@@ -42,6 +140,7 @@ xfs_open_zone_mark_full(
        struct xfs_rtgroup      *rtg = oz->oz_rtg;
        struct xfs_mount        *mp = rtg_mount(rtg);
        struct xfs_zone_info    *zi = mp->m_zone_info;
+       uint32_t                used = rtg_rmap(rtg)->i_used_blocks;
 
        trace_xfs_zone_full(rtg);
 
@@ -59,6 +158,8 @@ xfs_open_zone_mark_full(
        xfs_open_zone_put(oz);
 
        wake_up_all(&zi->zi_zone_wait);
+       if (used < rtg_blocks(rtg))
+               xfs_zone_account_reclaimable(rtg, rtg_blocks(rtg) - used);
 }
 
 static void
@@ -244,6 +345,13 @@ xfs_zone_free_blocks(
        trace_xfs_zone_free_blocks(rtg, xfs_rtb_to_rgbno(mp, fsbno), len);
 
        rmapip->i_used_blocks -= len;
+       /*
+        * Don't add open zones to the reclaimable buckets.  The I/O completion
+        * for writing the last block will take care of accounting for already
+        * unused blocks instead.
+        */
+       if (!READ_ONCE(rtg->rtg_open_zone))
+               xfs_zone_account_reclaimable(rtg, len);
        xfs_add_frextents(mp, len);
        xfs_trans_log_inode(tp, rmapip, XFS_ILOG_CORE);
        return 0;
@@ -395,6 +503,9 @@ xfs_try_open_zone(
         */
        wake_up_all(&zi->zi_zone_wait);
 
+       if (xfs_zoned_need_gc(mp))
+               wake_up_process(zi->zi_gc_thread);
+
        trace_xfs_zone_opened(oz->oz_rtg);
        return oz;
 }
@@ -702,6 +813,7 @@ xfs_init_zone(
        struct xfs_zone_info    *zi = mp->m_zone_info;
        uint64_t                used = rtg_rmap(rtg)->i_used_blocks;
        xfs_rgblock_t           write_pointer, highest_rgbno;
+       int                     error;
 
        if (zone && !xfs_zone_validate(zone, rtg, &write_pointer))
                return -EFSCORRUPTED;
@@ -728,6 +840,18 @@ xfs_init_zone(
                xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
        }
 
+       /*
+        * If there are no used blocks, but the zone is not in the empty state
+        * yet, we lost power before the zone reset.  In that case finish the
+        * work here.
+        */
+       if (write_pointer == rtg_blocks(rtg) && used == 0) {
+               error = xfs_zone_gc_reset_sync(rtg);
+               if (error)
+                       return error;
+               write_pointer = 0;
+       }
+
        if (write_pointer == 0) {
                /* zone is empty */
                atomic_inc(&zi->zi_nr_free_zones);
@@ -746,6 +870,7 @@ xfs_init_zone(
                iz->reclaimable += write_pointer - used;
        } else if (used < rtg_blocks(rtg)) {
                /* zone fully written, but has freed blocks */
+               xfs_zone_account_reclaimable(rtg, rtg_blocks(rtg) - used);
                iz->reclaimable += (rtg_blocks(rtg) - used);
        }
 
@@ -856,11 +981,20 @@ xfs_calc_open_zones(
        return 0;
 }
 
+static unsigned long *
+xfs_alloc_bucket_bitmap(
+       struct xfs_mount        *mp)
+{
+       return kvmalloc_array(BITS_TO_LONGS(mp->m_sb.sb_rgcount),
+                       sizeof(unsigned long), GFP_KERNEL | __GFP_ZERO);
+}
+
 static struct xfs_zone_info *
 xfs_alloc_zone_info(
        struct xfs_mount        *mp)
 {
        struct xfs_zone_info    *zi;
+       int                     i;
 
        zi = kzalloc(sizeof(*zi), GFP_KERNEL);
        if (!zi)
@@ -871,14 +1005,30 @@ xfs_alloc_zone_info(
        spin_lock_init(&zi->zi_open_zones_lock);
        spin_lock_init(&zi->zi_reservation_lock);
        init_waitqueue_head(&zi->zi_zone_wait);
+       spin_lock_init(&zi->zi_used_buckets_lock);
+       for (i = 0; i < XFS_ZONE_USED_BUCKETS; i++) {
+               zi->zi_used_bucket_bitmap[i] = xfs_alloc_bucket_bitmap(mp);
+               if (!zi->zi_used_bucket_bitmap[i])
+                       goto out_free_bitmaps;
+       }
        return zi;
+
+out_free_bitmaps:
+       while (--i >= 0)
+               kvfree(zi->zi_used_bucket_bitmap[i]);
+       kfree(zi);
+       return NULL;
 }
 
 static void
 xfs_free_zone_info(
        struct xfs_zone_info    *zi)
 {
+       int                     i;
+
        xfs_free_open_zones(zi);
+       for (i = 0; i < XFS_ZONE_USED_BUCKETS; i++)
+               kvfree(zi->zi_used_bucket_bitmap[i]);
        kfree(zi);
 }
 
@@ -943,6 +1093,10 @@ xfs_mount_zones(
        xfs_set_freecounter(mp, XC_FREE_RTAVAILABLE, iz.available);
        xfs_set_freecounter(mp, XC_FREE_RTEXTENTS,
                        iz.available + iz.reclaimable);
+
+       error = xfs_zone_gc_mount(mp);
+       if (error)
+               goto out_free_zone_info;
        return 0;
 
 out_free_zone_info:
@@ -954,5 +1108,6 @@ void
 xfs_unmount_zones(
        struct xfs_mount        *mp)
 {
+       xfs_zone_gc_unmount(mp);
        xfs_free_zone_info(mp->m_zone_info);
 }
index 28c9cffb72d50245a0d61074548d30d2abd1ce4a..1269390bfcdaa16745afb14624eac5c14ebbd288 100644 (file)
@@ -48,6 +48,8 @@ uint64_t xfs_zoned_default_resblks(struct xfs_mount *mp,
 #ifdef CONFIG_XFS_RT
 int xfs_mount_zones(struct xfs_mount *mp);
 void xfs_unmount_zones(struct xfs_mount *mp);
+void xfs_zone_gc_start(struct xfs_mount *mp);
+void xfs_zone_gc_stop(struct xfs_mount *mp);
 #else
 static inline int xfs_mount_zones(struct xfs_mount *mp)
 {
@@ -56,6 +58,12 @@ static inline int xfs_mount_zones(struct xfs_mount *mp)
 static inline void xfs_unmount_zones(struct xfs_mount *mp)
 {
 }
+static inline void xfs_zone_gc_start(struct xfs_mount *mp)
+{
+}
+static inline void xfs_zone_gc_stop(struct xfs_mount *mp)
+{
+}
 #endif /* CONFIG_XFS_RT */
 
 #endif /* _XFS_ZONE_ALLOC_H */
diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c
new file mode 100644 (file)
index 0000000..0e1c39f
--- /dev/null
@@ -0,0 +1,1165 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2023-2025 Christoph Hellwig.
+ * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
+ */
+#include "xfs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_trans.h"
+#include "xfs_icache.h"
+#include "xfs_rmap.h"
+#include "xfs_rtbitmap.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_zone_alloc.h"
+#include "xfs_zone_priv.h"
+#include "xfs_zones.h"
+#include "xfs_trace.h"
+
+/*
+ * Implement Garbage Collection (GC) of partially used zones.
+ *
+ * To support the purely sequential writes in each zone, zoned XFS needs to be
+ * able to move data remaining in a zone out of it to reset the zone to prepare
+ * for writing to it again.
+ *
+ * This is done by the GC thread implemented in this file.  To support that a
+ * number of zones (XFS_GC_ZONES) is reserved from the user visible capacity to
+ * write the garbage collected data into.
+ *
+ * Whenever the available space is below the chosen threshold, the GC thread
+ * looks for potential non-empty but not fully used zones that are worth
+ * reclaiming.  Once found the rmap for the victim zone is queried, and after
+ * a bit of sorting to reduce fragmentation, the still live extents are read
+ * into memory and written to the GC target zone, and the bmap btree of the
+ * files is updated to point to the new location.  To avoid taking the IOLOCK
+ * and MMAPLOCK for the entire GC process and thus affecting the latency of
+ * user reads and writes to the files, the GC writes are speculative and the
+ * I/O completion checks that no other writes happened for the affected regions
+ * before remapping.
+ *
+ * Once a zone does not contain any valid data, be that through GC or user
+ * block removal, it is queued for a zone reset.  The reset operation
+ * carefully ensures that the RT device cache is flushed and all transactions
+ * referencing the rmap have been committed to disk.
+ */
+
+/*
+ * Size of each GC scratch pad.  This is also the upper bound for each
+ * GC I/O, which helps to keep latency down.
+ */
+#define XFS_GC_CHUNK_SIZE      SZ_1M
+
+/*
+ * Scratchpad data to read GCed data into.
+ *
+ * The offset member tracks where the next allocation starts, and freed tracks
+ * the amount of space that is not used anymore.
+ */
+#define XFS_ZONE_GC_NR_SCRATCH 2
+struct xfs_zone_scratch {
+       struct folio                    *folio;
+       unsigned int                    offset;
+       unsigned int                    freed;
+};
+
+/*
+ * Chunk that is read and written for each GC operation.
+ *
+ * Note that for writes to actual zoned devices, the chunk can be split when
+ * reaching the hardware limit.
+ */
+struct xfs_gc_bio {
+       struct xfs_zone_gc_data         *data;
+
+       /*
+        * Entry into the reading/writing/resetting list.  Only accessed from
+        * the GC thread, so no locking needed.
+        */
+       struct list_head                entry;
+
+       /*
+        * State of this gc_bio.  Done means the current I/O completed.
+        * Set from the bio end I/O handler, read from the GC thread.
+        */
+       enum {
+               XFS_GC_BIO_NEW,
+               XFS_GC_BIO_DONE,
+       } state;
+
+       /*
+        * Pointer to the inode and byte range in the inode that this
+        * GC chunk is operating on.
+        */
+       struct xfs_inode                *ip;
+       loff_t                          offset;
+       unsigned int                    len;
+
+       /*
+        * Existing startblock (in the zone to be freed) and newly assigned
+        * daddr in the zone GCed into.
+        */
+       xfs_fsblock_t                   old_startblock;
+       xfs_daddr_t                     new_daddr;
+       struct xfs_zone_scratch         *scratch;
+
+       /* Are we writing to a sequential write required zone? */
+       bool                            is_seq;
+
+       /* Open Zone being written to */
+       struct xfs_open_zone            *oz;
+
+       /* Bio used for reads and writes, including the bvec used by it */
+       struct bio_vec                  bv;
+       struct bio                      bio;    /* must be last */
+};
+
+#define XFS_ZONE_GC_RECS               1024
+
+/* iterator, needs to be reinitialized for each victim zone */
+struct xfs_zone_gc_iter {
+       struct xfs_rtgroup              *victim_rtg;
+       unsigned int                    rec_count;
+       unsigned int                    rec_idx;
+       xfs_agblock_t                   next_startblock;
+       struct xfs_rmap_irec            *recs;
+};
+
+/*
+ * Per-mount GC state.
+ */
+struct xfs_zone_gc_data {
+       struct xfs_mount                *mp;
+
+       /* bioset used to allocate the gc_bios */
+       struct bio_set                  bio_set;
+
+       /*
+        * Scratchpads used, and index to indicate which one is in use.
+        */
+       struct xfs_zone_scratch         scratch[XFS_ZONE_GC_NR_SCRATCH];
+       unsigned int                    scratch_idx;
+
+       /*
+        * List of bios currently being read, written and reset.
+        * These lists are only accessed by the GC thread itself, and must only
+        * be processed in order.
+        */
+       struct list_head                reading;
+       struct list_head                writing;
+       struct list_head                resetting;
+
+       /*
+        * Iterator for the victim zone.
+        */
+       struct xfs_zone_gc_iter         iter;
+};
+
+/*
+ * We aim to keep enough zones free in stock to fully use the open zone limit
+ * for data placement purposes.
+ */
+bool
+xfs_zoned_need_gc(
+       struct xfs_mount        *mp)
+{
+       if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE))
+               return false;
+       if (xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE) <
+           mp->m_groups[XG_TYPE_RTG].blocks *
+           (mp->m_max_open_zones - XFS_OPEN_GC_ZONES))
+               return true;
+       return false;
+}
+
+static struct xfs_zone_gc_data *
+xfs_zone_gc_data_alloc(
+       struct xfs_mount        *mp)
+{
+       struct xfs_zone_gc_data *data;
+       int                     i;
+
+       data = kzalloc(sizeof(*data), GFP_KERNEL);
+       if (!data)
+               return NULL;
+       data->iter.recs = kcalloc(XFS_ZONE_GC_RECS, sizeof(*data->iter.recs),
+                       GFP_KERNEL);
+       if (!data->iter.recs)
+               goto out_free_data;
+
+       /*
+        * We actually only need a single bio_vec.  It would be nice to have
+        * a flag that only allocates the inline bvecs and not the separate
+        * bvec pool.
+        */
+       if (bioset_init(&data->bio_set, 16, offsetof(struct xfs_gc_bio, bio),
+                       BIOSET_NEED_BVECS))
+               goto out_free_recs;
+       for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++) {
+               data->scratch[i].folio =
+                       folio_alloc(GFP_KERNEL, get_order(XFS_GC_CHUNK_SIZE));
+               if (!data->scratch[i].folio)
+                       goto out_free_scratch;
+       }
+       INIT_LIST_HEAD(&data->reading);
+       INIT_LIST_HEAD(&data->writing);
+       INIT_LIST_HEAD(&data->resetting);
+       data->mp = mp;
+       return data;
+
+out_free_scratch:
+       while (--i >= 0)
+               folio_put(data->scratch[i].folio);
+       bioset_exit(&data->bio_set);
+out_free_recs:
+       kfree(data->iter.recs);
+out_free_data:
+       kfree(data);
+       return NULL;
+}
+
+static void
+xfs_zone_gc_data_free(
+       struct xfs_zone_gc_data *data)
+{
+       int                     i;
+
+       for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++)
+               folio_put(data->scratch[i].folio);
+       bioset_exit(&data->bio_set);
+       kfree(data->iter.recs);
+       kfree(data);
+}
+
+static void
+xfs_zone_gc_iter_init(
+       struct xfs_zone_gc_iter *iter,
+       struct xfs_rtgroup      *victim_rtg)
+
+{
+       iter->next_startblock = 0;
+       iter->rec_count = 0;
+       iter->rec_idx = 0;
+       iter->victim_rtg = victim_rtg;
+}
+
+/*
+ * Query the rmap of the victim zone to gather the records to evacuate.
+ */
+static int
+xfs_zone_gc_query_cb(
+       struct xfs_btree_cur    *cur,
+       const struct xfs_rmap_irec *irec,
+       void                    *private)
+{
+       struct xfs_zone_gc_iter *iter = private;
+
+       ASSERT(!XFS_RMAP_NON_INODE_OWNER(irec->rm_owner));
+       ASSERT(!xfs_is_sb_inum(cur->bc_mp, irec->rm_owner));
+       ASSERT(!(irec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)));
+
+       iter->recs[iter->rec_count] = *irec;
+       if (++iter->rec_count == XFS_ZONE_GC_RECS) {
+               iter->next_startblock =
+                       irec->rm_startblock + irec->rm_blockcount;
+               return 1;
+       }
+       return 0;
+}
+
+#define cmp_int(l, r)          ((l > r) - (l < r))
+
+static int
+xfs_zone_gc_rmap_rec_cmp(
+       const void                      *a,
+       const void                      *b)
+{
+       const struct xfs_rmap_irec      *reca = a;
+       const struct xfs_rmap_irec      *recb = b;
+       int                             diff;
+
+       diff = cmp_int(reca->rm_owner, recb->rm_owner);
+       if (diff)
+               return diff;
+       return cmp_int(reca->rm_offset, recb->rm_offset);
+}
+
+static int
+xfs_zone_gc_query(
+       struct xfs_mount        *mp,
+       struct xfs_zone_gc_iter *iter)
+{
+       struct xfs_rtgroup      *rtg = iter->victim_rtg;
+       struct xfs_rmap_irec    ri_low = { };
+       struct xfs_rmap_irec    ri_high;
+       struct xfs_btree_cur    *cur;
+       struct xfs_trans        *tp;
+       int                     error;
+
+       ASSERT(iter->next_startblock <= rtg_blocks(rtg));
+       if (iter->next_startblock == rtg_blocks(rtg))
+               goto done;
+
+       ASSERT(iter->next_startblock < rtg_blocks(rtg));
+       ri_low.rm_startblock = iter->next_startblock;
+       memset(&ri_high, 0xFF, sizeof(ri_high));
+
+       iter->rec_idx = 0;
+       iter->rec_count = 0;
+
+       error = xfs_trans_alloc_empty(mp, &tp);
+       if (error)
+               return error;
+
+       xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
+       cur = xfs_rtrmapbt_init_cursor(tp, rtg);
+       error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
+                       xfs_zone_gc_query_cb, iter);
+       xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
+       xfs_btree_del_cursor(cur, error < 0 ? error : 0);
+       xfs_trans_cancel(tp);
+
+       if (error < 0)
+               return error;
+
+       /*
+        * Sort the rmap records by inode number and increasing offset to
+        * defragment the mappings.
+        *
+        * This could be further enhanced by an even bigger look ahead window,
+        * but that's better left until we have better detection of changes to
+        * inode mapping to avoid the potential of GCing already dead data.
+        */
+       sort(iter->recs, iter->rec_count, sizeof(iter->recs[0]),
+                       xfs_zone_gc_rmap_rec_cmp, NULL);
+
+       if (error == 0) {
+               /*
+                * We finished iterating through the zone.
+                */
+               iter->next_startblock = rtg_blocks(rtg);
+               if (iter->rec_count == 0)
+                       goto done;
+       }
+
+       return 0;
+done:
+       xfs_rtgroup_rele(iter->victim_rtg);
+       iter->victim_rtg = NULL;
+       return 0;
+}
+
+static bool
+xfs_zone_gc_iter_next(
+       struct xfs_mount        *mp,
+       struct xfs_zone_gc_iter *iter,
+       struct xfs_rmap_irec    *chunk_rec,
+       struct xfs_inode        **ipp)
+{
+       struct xfs_rmap_irec    *irec;
+       int                     error;
+
+       if (!iter->victim_rtg)
+               return false;
+
+retry:
+       if (iter->rec_idx == iter->rec_count) {
+               error = xfs_zone_gc_query(mp, iter);
+               if (error)
+                       goto fail;
+               if (!iter->victim_rtg)
+                       return false;
+       }
+
+       irec = &iter->recs[iter->rec_idx];
+       error = xfs_iget(mp, NULL, irec->rm_owner,
+                       XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, ipp);
+       if (error) {
+               /*
+                * If the inode was already deleted, skip over it.
+                */
+               if (error == -ENOENT) {
+                       iter->rec_idx++;
+                       goto retry;
+               }
+               goto fail;
+       }
+
+       if (!S_ISREG(VFS_I(*ipp)->i_mode) || !XFS_IS_REALTIME_INODE(*ipp)) {
+               iter->rec_idx++;
+               xfs_irele(*ipp);
+               goto retry;
+       }
+
+       *chunk_rec = *irec;
+       return true;
+
+fail:
+       xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+       return false;
+}
+
+static void
+xfs_zone_gc_iter_advance(
+       struct xfs_zone_gc_iter *iter,
+       xfs_extlen_t            count_fsb)
+{
+       struct xfs_rmap_irec    *irec = &iter->recs[iter->rec_idx];
+
+       irec->rm_offset += count_fsb;
+       irec->rm_startblock += count_fsb;
+       irec->rm_blockcount -= count_fsb;
+       if (!irec->rm_blockcount)
+               iter->rec_idx++;
+}
+
+static struct xfs_rtgroup *
+xfs_zone_gc_pick_victim_from(
+       struct xfs_mount        *mp,
+       uint32_t                bucket)
+{
+       struct xfs_zone_info    *zi = mp->m_zone_info;
+       uint32_t                victim_used = U32_MAX;
+       struct xfs_rtgroup      *victim_rtg = NULL;
+       uint32_t                bit;
+
+       if (!zi->zi_used_bucket_entries[bucket])
+               return NULL;
+
+       for_each_set_bit(bit, zi->zi_used_bucket_bitmap[bucket],
+                       mp->m_sb.sb_rgcount) {
+               struct xfs_rtgroup *rtg = xfs_rtgroup_grab(mp, bit);
+
+               if (!rtg)
+                       continue;
+
+               /* skip zones that are just waiting for a reset */
+               if (rtg_rmap(rtg)->i_used_blocks == 0 ||
+                   rtg_rmap(rtg)->i_used_blocks >= victim_used) {
+                       xfs_rtgroup_rele(rtg);
+                       continue;
+               }
+
+               if (victim_rtg)
+                       xfs_rtgroup_rele(victim_rtg);
+               victim_rtg = rtg;
+               victim_used = rtg_rmap(rtg)->i_used_blocks;
+
+               /*
+                * Any zone that is less than 1 percent used is fair game for
+                * instant reclaim. All of these zones are in the lowest
+                * bucket, so avoid the expensive division for the zones
+                * in the other buckets.
+                */
+               if (bucket == 0 &&
+                   rtg_rmap(rtg)->i_used_blocks < rtg_blocks(rtg) / 100)
+                       break;
+       }
+
+       return victim_rtg;
+}
+
+/*
+ * Iterate through all zones marked as reclaimable and find a candidate to
+ * reclaim.
+ */
+static bool
+xfs_zone_gc_select_victim(
+       struct xfs_zone_gc_data *data)
+{
+       struct xfs_zone_gc_iter *iter = &data->iter;
+       struct xfs_mount        *mp = data->mp;
+       struct xfs_zone_info    *zi = mp->m_zone_info;
+       struct xfs_rtgroup      *victim_rtg = NULL;
+       unsigned int            bucket;
+
+       if (xfs_is_shutdown(mp))
+               return false;
+
+       if (iter->victim_rtg)
+               return true;
+
+       /*
+        * Don't start new work if we are asked to stop or park.
+        */
+       if (kthread_should_stop() || kthread_should_park())
+               return false;
+
+       if (!xfs_zoned_need_gc(mp))
+               return false;
+
+       spin_lock(&zi->zi_used_buckets_lock);
+       for (bucket = 0; bucket < XFS_ZONE_USED_BUCKETS; bucket++) {
+               victim_rtg = xfs_zone_gc_pick_victim_from(mp, bucket);
+               if (victim_rtg)
+                       break;
+       }
+       spin_unlock(&zi->zi_used_buckets_lock);
+
+       if (!victim_rtg)
+               return false;
+
+       trace_xfs_zone_gc_select_victim(victim_rtg, bucket);
+       xfs_zone_gc_iter_init(iter, victim_rtg);
+       return true;
+}
+
+static struct xfs_open_zone *
+xfs_zone_gc_steal_open(
+       struct xfs_zone_info    *zi)
+{
+       struct xfs_open_zone    *oz, *found = NULL;
+
+       spin_lock(&zi->zi_open_zones_lock);
+       list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) {
+               if (!found ||
+                   oz->oz_write_pointer < found->oz_write_pointer)
+                       found = oz;
+       }
+
+       if (found) {
+               found->oz_is_gc = true;
+               list_del_init(&found->oz_entry);
+               zi->zi_nr_open_zones--;
+       }
+
+       spin_unlock(&zi->zi_open_zones_lock);
+       return found;
+}
+
+static struct xfs_open_zone *
+xfs_zone_gc_select_target(
+       struct xfs_mount        *mp)
+{
+       struct xfs_zone_info    *zi = mp->m_zone_info;
+       struct xfs_open_zone    *oz = zi->zi_open_gc_zone;
+
+       /*
+        * We need to wait for pending writes to finish.
+        */
+       if (oz && oz->oz_written < rtg_blocks(oz->oz_rtg))
+               return NULL;
+
+       ASSERT(zi->zi_nr_open_zones <=
+               mp->m_max_open_zones - XFS_OPEN_GC_ZONES);
+       oz = xfs_open_zone(mp, true);
+       if (oz)
+               trace_xfs_zone_gc_target_opened(oz->oz_rtg);
+       spin_lock(&zi->zi_open_zones_lock);
+       zi->zi_open_gc_zone = oz;
+       spin_unlock(&zi->zi_open_zones_lock);
+       return oz;
+}
+
+/*
+ * Ensure we have a valid open zone to write the GC data to.
+ *
+ * If the current target zone has space keep writing to it, else first wait for
+ * all pending writes and then pick a new one.
+ */
+static struct xfs_open_zone *
+xfs_zone_gc_ensure_target(
+       struct xfs_mount        *mp)
+{
+       struct xfs_open_zone    *oz = mp->m_zone_info->zi_open_gc_zone;
+
+       if (!oz || oz->oz_write_pointer == rtg_blocks(oz->oz_rtg))
+               return xfs_zone_gc_select_target(mp);
+       return oz;
+}
+
+static unsigned int
+xfs_zone_gc_scratch_available(
+       struct xfs_zone_gc_data *data)
+{
+       return XFS_GC_CHUNK_SIZE - data->scratch[data->scratch_idx].offset;
+}
+
+static bool
+xfs_zone_gc_space_available(
+       struct xfs_zone_gc_data *data)
+{
+       struct xfs_open_zone    *oz;
+
+       oz = xfs_zone_gc_ensure_target(data->mp);
+       if (!oz)
+               return false;
+       return oz->oz_write_pointer < rtg_blocks(oz->oz_rtg) &&
+               xfs_zone_gc_scratch_available(data);
+}
+
+static void
+xfs_zone_gc_end_io(
+       struct bio              *bio)
+{
+       struct xfs_gc_bio       *chunk =
+               container_of(bio, struct xfs_gc_bio, bio);
+       struct xfs_zone_gc_data *data = chunk->data;
+
+       WRITE_ONCE(chunk->state, XFS_GC_BIO_DONE);
+       wake_up_process(data->mp->m_zone_info->zi_gc_thread);
+}
+
+static struct xfs_open_zone *
+xfs_zone_gc_alloc_blocks(
+       struct xfs_zone_gc_data *data,
+       xfs_extlen_t            *count_fsb,
+       xfs_daddr_t             *daddr,
+       bool                    *is_seq)
+{
+       struct xfs_mount        *mp = data->mp;
+       struct xfs_open_zone    *oz;
+
+       oz = xfs_zone_gc_ensure_target(mp);
+       if (!oz)
+               return NULL;
+
+       *count_fsb = min(*count_fsb,
+               XFS_B_TO_FSB(mp, xfs_zone_gc_scratch_available(data)));
+
+       /*
+        * Directly allocate GC blocks from the reserved pool.
+        *
+        * If we'd take them from the normal pool we could be stealing blocks
+        * from a regular writer, which would then have to wait for GC and
+        * deadlock.
+        */
+       spin_lock(&mp->m_sb_lock);
+       *count_fsb = min(*count_fsb,
+                       rtg_blocks(oz->oz_rtg) - oz->oz_write_pointer);
+       *count_fsb = min3(*count_fsb,
+                       mp->m_free[XC_FREE_RTEXTENTS].res_avail,
+                       mp->m_free[XC_FREE_RTAVAILABLE].res_avail);
+       mp->m_free[XC_FREE_RTEXTENTS].res_avail -= *count_fsb;
+       mp->m_free[XC_FREE_RTAVAILABLE].res_avail -= *count_fsb;
+       spin_unlock(&mp->m_sb_lock);
+
+       if (!*count_fsb)
+               return NULL;
+
+       *daddr = xfs_gbno_to_daddr(&oz->oz_rtg->rtg_group, 0);
+       *is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *daddr);
+       if (!*is_seq)
+               *daddr += XFS_FSB_TO_BB(mp, oz->oz_write_pointer);
+       oz->oz_write_pointer += *count_fsb;
+       atomic_inc(&oz->oz_ref);
+       return oz;
+}
+
+static bool
+xfs_zone_gc_start_chunk(
+       struct xfs_zone_gc_data *data)
+{
+       struct xfs_zone_gc_iter *iter = &data->iter;
+       struct xfs_mount        *mp = data->mp;
+       struct block_device     *bdev = mp->m_rtdev_targp->bt_bdev;
+       struct xfs_open_zone    *oz;
+       struct xfs_rmap_irec    irec;
+       struct xfs_gc_bio       *chunk;
+       struct xfs_inode        *ip;
+       struct bio              *bio;
+       xfs_daddr_t             daddr;
+       bool                    is_seq;
+
+       if (xfs_is_shutdown(mp))
+               return false;
+
+       if (!xfs_zone_gc_iter_next(mp, iter, &irec, &ip))
+               return false;
+       oz = xfs_zone_gc_alloc_blocks(data, &irec.rm_blockcount, &daddr,
+                       &is_seq);
+       if (!oz) {
+               xfs_irele(ip);
+               return false;
+       }
+
+       bio = bio_alloc_bioset(bdev, 1, REQ_OP_READ, GFP_NOFS, &data->bio_set);
+
+       chunk = container_of(bio, struct xfs_gc_bio, bio);
+       chunk->ip = ip;
+       chunk->offset = XFS_FSB_TO_B(mp, irec.rm_offset);
+       chunk->len = XFS_FSB_TO_B(mp, irec.rm_blockcount);
+       chunk->old_startblock =
+               xfs_rgbno_to_rtb(iter->victim_rtg, irec.rm_startblock);
+       chunk->new_daddr = daddr;
+       chunk->is_seq = is_seq;
+       chunk->scratch = &data->scratch[data->scratch_idx];
+       chunk->data = data;
+       chunk->oz = oz;
+
+       bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock);
+       bio->bi_end_io = xfs_zone_gc_end_io;
+       bio_add_folio_nofail(bio, chunk->scratch->folio, chunk->len,
+                       chunk->scratch->offset);
+       chunk->scratch->offset += chunk->len;
+       if (chunk->scratch->offset == XFS_GC_CHUNK_SIZE) {
+               data->scratch_idx =
+                       (data->scratch_idx + 1) % XFS_ZONE_GC_NR_SCRATCH;
+       }
+       WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
+       list_add_tail(&chunk->entry, &data->reading);
+       xfs_zone_gc_iter_advance(iter, irec.rm_blockcount);
+
+       submit_bio(bio);
+       return true;
+}
+
+static void
+xfs_zone_gc_free_chunk(
+       struct xfs_gc_bio       *chunk)
+{
+       list_del(&chunk->entry);
+       xfs_open_zone_put(chunk->oz);
+       xfs_irele(chunk->ip);
+       bio_put(&chunk->bio);
+}
+
+static void
+xfs_zone_gc_submit_write(
+       struct xfs_zone_gc_data *data,
+       struct xfs_gc_bio       *chunk)
+{
+       if (chunk->is_seq) {
+               chunk->bio.bi_opf &= ~REQ_OP_WRITE;
+               chunk->bio.bi_opf |= REQ_OP_ZONE_APPEND;
+       }
+       chunk->bio.bi_iter.bi_sector = chunk->new_daddr;
+       chunk->bio.bi_end_io = xfs_zone_gc_end_io;
+       submit_bio(&chunk->bio);
+}
+
+static struct xfs_gc_bio *
+xfs_zone_gc_split_write(
+       struct xfs_zone_gc_data *data,
+       struct xfs_gc_bio       *chunk)
+{
+       struct queue_limits     *lim =
+               &bdev_get_queue(chunk->bio.bi_bdev)->limits;
+       struct xfs_gc_bio       *split_chunk;
+       int                     split_sectors;
+       unsigned int            split_len;
+       struct bio              *split;
+       unsigned int            nsegs;
+
+       if (!chunk->is_seq)
+               return NULL;
+
+       split_sectors = bio_split_rw_at(&chunk->bio, lim, &nsegs,
+                       lim->max_zone_append_sectors << SECTOR_SHIFT);
+       if (!split_sectors)
+               return NULL;
+
+       /* ensure the split chunk is still block size aligned */
+       split_sectors = ALIGN_DOWN(split_sectors << SECTOR_SHIFT,
+                       data->mp->m_sb.sb_blocksize) >> SECTOR_SHIFT;
+       split_len = split_sectors << SECTOR_SHIFT;
+
+       split = bio_split(&chunk->bio, split_sectors, GFP_NOFS, &data->bio_set);
+       split_chunk = container_of(split, struct xfs_gc_bio, bio);
+       split_chunk->data = data;
+       ihold(VFS_I(chunk->ip));
+       split_chunk->ip = chunk->ip;
+       split_chunk->is_seq = chunk->is_seq;
+       split_chunk->scratch = chunk->scratch;
+       split_chunk->offset = chunk->offset;
+       split_chunk->len = split_len;
+       split_chunk->old_startblock = chunk->old_startblock;
+       split_chunk->new_daddr = chunk->new_daddr;
+       split_chunk->oz = chunk->oz;
+       atomic_inc(&chunk->oz->oz_ref);
+
+       chunk->offset += split_len;
+       chunk->len -= split_len;
+       chunk->old_startblock += XFS_B_TO_FSB(data->mp, split_len);
+
+       /* add right before the original chunk */
+       WRITE_ONCE(split_chunk->state, XFS_GC_BIO_NEW);
+       list_add_tail(&split_chunk->entry, &chunk->entry);
+       return split_chunk;
+}
+
+static void
+xfs_zone_gc_write_chunk(
+       struct xfs_gc_bio       *chunk)
+{
+       struct xfs_zone_gc_data *data = chunk->data;
+       struct xfs_mount        *mp = chunk->ip->i_mount;
+       unsigned int            folio_offset = chunk->bio.bi_io_vec->bv_offset;
+       struct xfs_gc_bio       *split_chunk;
+
+       if (chunk->bio.bi_status)
+               xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+       if (xfs_is_shutdown(mp)) {
+               xfs_zone_gc_free_chunk(chunk);
+               return;
+       }
+
+       WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
+       list_move_tail(&chunk->entry, &data->writing);
+
+       bio_reset(&chunk->bio, mp->m_rtdev_targp->bt_bdev, REQ_OP_WRITE);
+       bio_add_folio_nofail(&chunk->bio, chunk->scratch->folio, chunk->len,
+                       folio_offset);
+
+       while ((split_chunk = xfs_zone_gc_split_write(data, chunk)))
+               xfs_zone_gc_submit_write(data, split_chunk);
+       xfs_zone_gc_submit_write(data, chunk);
+}
+
+static void
+xfs_zone_gc_finish_chunk(
+       struct xfs_gc_bio       *chunk)
+{
+       uint                    iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
+       struct xfs_inode        *ip = chunk->ip;
+       struct xfs_mount        *mp = ip->i_mount;
+       int                     error;
+
+       if (chunk->bio.bi_status)
+               xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+       if (xfs_is_shutdown(mp)) {
+               xfs_zone_gc_free_chunk(chunk);
+               return;
+       }
+
+       chunk->scratch->freed += chunk->len;
+       if (chunk->scratch->freed == chunk->scratch->offset) {
+               chunk->scratch->offset = 0;
+               chunk->scratch->freed = 0;
+       }
+
+       /*
+        * Cycle through the iolock and wait for direct I/O and layouts to
+        * ensure no one is reading from the old mapping before it goes away.
+        *
+        * Note that xfs_zoned_end_io() below checks that no other writer raced
+        * with us to update the mapping by checking that the old startblock
+        * didn't change.
+        */
+       xfs_ilock(ip, iolock);
+       error = xfs_break_layouts(VFS_I(ip), &iolock, BREAK_UNMAP);
+       if (!error)
+               inode_dio_wait(VFS_I(ip));
+       xfs_iunlock(ip, iolock);
+       if (error)
+               goto free;
+
+       if (chunk->is_seq)
+               chunk->new_daddr = chunk->bio.bi_iter.bi_sector;
+       error = xfs_zoned_end_io(ip, chunk->offset, chunk->len,
+                       chunk->new_daddr, chunk->oz, chunk->old_startblock);
+free:
+       if (error)
+               xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+       xfs_zone_gc_free_chunk(chunk);
+}
+
+static void
+xfs_zone_gc_finish_reset(
+       struct xfs_gc_bio       *chunk)
+{
+       struct xfs_rtgroup      *rtg = chunk->bio.bi_private;
+       struct xfs_mount        *mp = rtg_mount(rtg);
+       struct xfs_zone_info    *zi = mp->m_zone_info;
+
+       if (chunk->bio.bi_status) {
+               xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+               goto out;
+       }
+
+       xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_FREE);
+       atomic_inc(&zi->zi_nr_free_zones);
+
+       xfs_zoned_add_available(mp, rtg_blocks(rtg));
+
+       wake_up_all(&zi->zi_zone_wait);
+out:
+       list_del(&chunk->entry);
+       bio_put(&chunk->bio);
+}
+
+static bool
+xfs_zone_gc_prepare_reset(
+       struct bio              *bio,
+       struct xfs_rtgroup      *rtg)
+{
+       trace_xfs_zone_reset(rtg);
+
+       ASSERT(rtg_rmap(rtg)->i_used_blocks == 0);
+       bio->bi_iter.bi_sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0);
+       if (!bdev_zone_is_seq(bio->bi_bdev, bio->bi_iter.bi_sector)) {
+               if (!bdev_max_discard_sectors(bio->bi_bdev))
+                       return false;
+               bio->bi_opf = REQ_OP_DISCARD | REQ_SYNC;
+               bio->bi_iter.bi_size =
+                       XFS_FSB_TO_B(rtg_mount(rtg), rtg_blocks(rtg));
+       }
+
+       return true;
+}
+
+int
+xfs_zone_gc_reset_sync(
+       struct xfs_rtgroup      *rtg)
+{
+       int                     error = 0;
+       struct bio              bio;
+
+       bio_init(&bio, rtg_mount(rtg)->m_rtdev_targp->bt_bdev, NULL, 0,
+                       REQ_OP_ZONE_RESET);
+       if (xfs_zone_gc_prepare_reset(&bio, rtg))
+               error = submit_bio_wait(&bio);
+       bio_uninit(&bio);
+
+       return error;
+}
+
+static void
+xfs_zone_gc_reset_zones(
+       struct xfs_zone_gc_data *data,
+       struct xfs_group        *reset_list)
+{
+       struct xfs_group        *next = reset_list;
+
+       if (blkdev_issue_flush(data->mp->m_rtdev_targp->bt_bdev) < 0) {
+               xfs_force_shutdown(data->mp, SHUTDOWN_META_IO_ERROR);
+               return;
+       }
+
+       do {
+               struct xfs_rtgroup      *rtg = to_rtg(next);
+               struct xfs_gc_bio       *chunk;
+               struct bio              *bio;
+
+               xfs_log_force_inode(rtg_rmap(rtg));
+
+               next = rtg_group(rtg)->xg_next_reset;
+               rtg_group(rtg)->xg_next_reset = NULL;
+
+               bio = bio_alloc_bioset(rtg_mount(rtg)->m_rtdev_targp->bt_bdev,
+                               0, REQ_OP_ZONE_RESET, GFP_NOFS, &data->bio_set);
+               bio->bi_private = rtg;
+               bio->bi_end_io = xfs_zone_gc_end_io;
+
+               chunk = container_of(bio, struct xfs_gc_bio, bio);
+               chunk->data = data;
+               WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
+               list_add_tail(&chunk->entry, &data->resetting);
+
+               /*
+                * Also use the bio to drive the state machine when neither
+                * zone reset nor discard is supported to keep things simple.
+                */
+               if (xfs_zone_gc_prepare_reset(bio, rtg))
+                       submit_bio(bio);
+               else
+                       bio_endio(bio);
+       } while (next);
+}
+
+/*
+ * Handle the work to read and write data for GC and to reset the zones,
+ * including handling all completions.
+ *
+ * Note that the order of the chunks is preserved so that we don't undo the
+ * optimal order established by xfs_zone_gc_query().
+ */
+static bool
+xfs_zone_gc_handle_work(
+       struct xfs_zone_gc_data *data)
+{
+       struct xfs_zone_info    *zi = data->mp->m_zone_info;
+       struct xfs_gc_bio       *chunk, *next;
+       struct xfs_group        *reset_list;
+       struct blk_plug         plug;
+
+       spin_lock(&zi->zi_reset_list_lock);
+       reset_list = zi->zi_reset_list;
+       zi->zi_reset_list = NULL;
+       spin_unlock(&zi->zi_reset_list_lock);
+
+       if (!xfs_zone_gc_select_victim(data) ||
+           !xfs_zone_gc_space_available(data)) {
+               if (list_empty(&data->reading) &&
+                   list_empty(&data->writing) &&
+                   list_empty(&data->resetting) &&
+                   !reset_list)
+                       return false;
+       }
+
+       __set_current_state(TASK_RUNNING);
+       try_to_freeze();
+
+       if (reset_list)
+               xfs_zone_gc_reset_zones(data, reset_list);
+
+       list_for_each_entry_safe(chunk, next, &data->resetting, entry) {
+               if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
+                       break;
+               xfs_zone_gc_finish_reset(chunk);
+       }
+
+       list_for_each_entry_safe(chunk, next, &data->writing, entry) {
+               if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
+                       break;
+               xfs_zone_gc_finish_chunk(chunk);
+       }
+
+       blk_start_plug(&plug);
+       list_for_each_entry_safe(chunk, next, &data->reading, entry) {
+               if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
+                       break;
+               xfs_zone_gc_write_chunk(chunk);
+       }
+       blk_finish_plug(&plug);
+
+       blk_start_plug(&plug);
+       while (xfs_zone_gc_start_chunk(data))
+               ;
+       blk_finish_plug(&plug);
+       return true;
+}
+
+/*
+ * Note that the current GC algorithm would break reflinks and thus duplicate
+ * data that was shared by multiple owners before.  Because of that reflinks
+ * are currently not supported on zoned file systems and can't be created or
+ * mounted.
+ */
+static int
+xfs_zoned_gcd(
+       void                    *private)
+{
+       struct xfs_zone_gc_data *data = private;
+       struct xfs_mount        *mp = data->mp;
+       struct xfs_zone_info    *zi = mp->m_zone_info;
+       unsigned int            nofs_flag;
+
+       nofs_flag = memalloc_nofs_save();
+       set_freezable();
+
+       for (;;) {
+               set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);
+               xfs_set_zonegc_running(mp);
+               if (xfs_zone_gc_handle_work(data))
+                       continue;
+
+               if (list_empty(&data->reading) &&
+                   list_empty(&data->writing) &&
+                   list_empty(&data->resetting) &&
+                   !zi->zi_reset_list) {
+                       xfs_clear_zonegc_running(mp);
+                       xfs_zoned_resv_wake_all(mp);
+
+                       if (kthread_should_stop()) {
+                               __set_current_state(TASK_RUNNING);
+                               break;
+                       }
+
+                       if (kthread_should_park()) {
+                               __set_current_state(TASK_RUNNING);
+                               kthread_parkme();
+                               continue;
+                       }
+               }
+
+               schedule();
+       }
+       xfs_clear_zonegc_running(mp);
+
+       if (data->iter.victim_rtg)
+               xfs_rtgroup_rele(data->iter.victim_rtg);
+
+       memalloc_nofs_restore(nofs_flag);
+       xfs_zone_gc_data_free(data);
+       return 0;
+}
+
+void
+xfs_zone_gc_start(
+       struct xfs_mount        *mp)
+{
+       if (xfs_has_zoned(mp))
+               kthread_unpark(mp->m_zone_info->zi_gc_thread);
+}
+
+void
+xfs_zone_gc_stop(
+       struct xfs_mount        *mp)
+{
+       if (xfs_has_zoned(mp))
+               kthread_park(mp->m_zone_info->zi_gc_thread);
+}
+
+int
+xfs_zone_gc_mount(
+       struct xfs_mount        *mp)
+{
+       struct xfs_zone_info    *zi = mp->m_zone_info;
+       struct xfs_zone_gc_data *data;
+       struct xfs_open_zone    *oz;
+       int                     error;
+
+       /*
+        * If there are no free zones available for GC, pick the open zone with
+        * the least used space to GC into.  This should only happen after an
+        * unclean shutdown near ENOSPC while GC was ongoing.
+        *
+        * We also need to do this for the first gc zone allocation if we
+        * unmounted while at the open limit.
+        */
+       if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_FREE) ||
+           zi->zi_nr_open_zones == mp->m_max_open_zones)
+               oz = xfs_zone_gc_steal_open(zi);
+       else
+               oz = xfs_open_zone(mp, true);
+       if (!oz) {
+               xfs_warn(mp, "unable to allocate a zone for gc");
+               error = -EIO;
+               goto out;
+       }
+
+       trace_xfs_zone_gc_target_opened(oz->oz_rtg);
+       zi->zi_open_gc_zone = oz;
+
+       data = xfs_zone_gc_data_alloc(mp);
+       if (!data) {
+               error = -ENOMEM;
+               goto out_put_gc_zone;
+       }
+
+       mp->m_zone_info->zi_gc_thread = kthread_create(xfs_zoned_gcd, data,
+                       "xfs-zone-gc/%s", mp->m_super->s_id);
+       if (IS_ERR(mp->m_zone_info->zi_gc_thread)) {
+               xfs_warn(mp, "unable to create zone gc thread");
+               error = PTR_ERR(mp->m_zone_info->zi_gc_thread);
+               goto out_free_gc_data;
+       }
+
+       /* xfs_zone_gc_start will unpark for rw mounts */
+       kthread_park(mp->m_zone_info->zi_gc_thread);
+       return 0;
+
+out_free_gc_data:
+       kfree(data);
+out_put_gc_zone:
+       xfs_open_zone_put(zi->zi_open_gc_zone);
+out:
+       return error;
+}
+
+void
+xfs_zone_gc_unmount(
+       struct xfs_mount        *mp)
+{
+       struct xfs_zone_info    *zi = mp->m_zone_info;
+
+       kthread_stop(zi->zi_gc_thread);
+       if (zi->zi_open_gc_zone)
+               xfs_open_zone_put(zi->zi_open_gc_zone);
+}
index 5283d77482d46f1161e55a55870186a0b58fcbb7..f6c76d751a494e69f04c20eee47f6ef7235c9397 100644 (file)
@@ -40,6 +40,13 @@ struct xfs_open_zone {
        struct xfs_rtgroup      *oz_rtg;
 };
 
+/*
+ * Number of bitmap buckets to track reclaimable zones.  There are 10 buckets
+ * so that each 10% of the usable capacity gets its own bucket and GC only
+ * has to walk the bitmaps of the less used zones if there are any.
+ */
+#define XFS_ZONE_USED_BUCKETS          10u
+
 struct xfs_zone_info {
        /*
         * List of pending space reservations:
@@ -82,10 +89,24 @@ struct xfs_zone_info {
         */
        spinlock_t              zi_reset_list_lock;
        struct xfs_group        *zi_reset_list;
+
+       /*
+        * A set of bitmaps to bucket-sort reclaimable zones by used blocks, to
+        * help garbage collection quickly find the best candidate for reclaim.
+        */
+       spinlock_t              zi_used_buckets_lock;
+       unsigned int            zi_used_bucket_entries[XFS_ZONE_USED_BUCKETS];
+       unsigned long           *zi_used_bucket_bitmap[XFS_ZONE_USED_BUCKETS];
+
 };
 
 struct xfs_open_zone *xfs_open_zone(struct xfs_mount *mp, bool is_gc);
 
+int xfs_zone_gc_reset_sync(struct xfs_rtgroup *rtg);
+bool xfs_zoned_need_gc(struct xfs_mount *mp);
+int xfs_zone_gc_mount(struct xfs_mount *mp);
+void xfs_zone_gc_unmount(struct xfs_mount *mp);
+
 void xfs_zoned_resv_wake_all(struct xfs_mount *mp);
 
 #endif /* _XFS_ZONE_PRIV_H */
index eff9be0264256950b575f480df2f2919142fb011..4bf1b18aa7a7924926a5429030ab0a1223681b09 100644 (file)
@@ -159,6 +159,15 @@ xfs_zoned_reserve_available(
                if (error != -ENOSPC)
                        break;
 
+               /*
+                * If there is no reclaimable group left and we aren't still
+                * processing a pending GC request, give up as we're fully out
+                * of space.
+                */
+               if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE) &&
+                   !xfs_is_zonegc_running(mp))
+                       break;
+
                spin_unlock(&zi->zi_reservation_lock);
                schedule();
                spin_lock(&zi->zi_reservation_lock);
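
Editorial note, not part of the patch: the victim selection described in the
commit message and in the xfs_zone_gc.c comments can be modeled in a few
dozen lines of plain C.  GC walks the buckets from least used to most used
and, within the first non-empty bucket, picks the zone with the fewest live
blocks, since that is the cheapest zone to evacuate.  The zone count, zone
size and used-block numbers below are invented for the example:

#include <limits.h>
#include <stdio.h>

#define NR_ZONES        8
#define USED_BUCKETS    10u

static const unsigned int zone_blocks = 65536;
static const unsigned int used[NR_ZONES] = {
        65536, 61000, 3000, 45000, 0, 12000, 65536, 700,
};

static unsigned int bucket_of(unsigned int u)
{
        return USED_BUCKETS * u / zone_blocks;
}

/* Return the victim zone index, or -1 if nothing is reclaimable. */
static int pick_victim(void)
{
        unsigned int b, victim_used;
        int z, victim;

        for (b = 0; b < USED_BUCKETS; b++) {
                victim = -1;
                victim_used = UINT_MAX;

                for (z = 0; z < NR_ZONES; z++) {
                        /*
                         * Empty zones only need a reset and full zones have
                         * nothing worth reclaiming, so skip both.
                         */
                        if (used[z] == 0 || used[z] == zone_blocks)
                                continue;
                        if (bucket_of(used[z]) != b)
                                continue;
                        if (used[z] < victim_used) {
                                victim = z;
                                victim_used = used[z];
                        }
                }
                if (victim >= 0)
                        return victim;
        }
        return -1;
}

int main(void)
{
        int v = pick_victim();

        if (v >= 0)
                printf("victim zone %d with %u live blocks to move\n",
                       v, used[v]);
        return 0;
}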