xfs: rework datasync tracking and execution

author Dave Chinner <dchinner@redhat.com>

Wed, 17 Sep 2025 22:12:54 +0000 (08:12 +1000)

committer Carlos Maiolino <cem@kernel.org>

Tue, 23 Sep 2025 13:12:43 +0000 (15:12 +0200)
author Dave Chinner <dchinner@redhat.com>
Wed, 17 Sep 2025 22:12:54 +0000 (08:12 +1000)
committer Carlos Maiolino <cem@kernel.org>
Tue, 23 Sep 2025 13:12:43 +0000 (15:12 +0200)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c

index f96fbf5c54c99986708b928fec69dc2f37f6d604..2702fef2c90cd28e2af2a69cf5485ca773195196 100644 (file)
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -75,52 +75,47 @@ xfs_dir_fsync(
         return xfs_log_force_inode(ip);
  }
  
-static xfs_csn_t
-xfs_fsync_seq(
-       struct xfs_inode        *ip,
-       bool                    datasync)
-{
-       if (!xfs_ipincount(ip))
-               return 0;
-       if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
-               return 0;
-       return ip->i_itemp->ili_commit_seq;
-}
-
  /*
- * All metadata updates are logged, which means that we just have to flush the
- * log up to the latest LSN that touched the inode.
+ * All metadata updates are logged, which means that we just have to push the
+ * journal to the required sequence number that holds the updates. We track
+ * datasync commits separately to full sync commits, and hence only need to
+ * select the correct sequence number for the log force here.
   *
- * If we have concurrent fsync/fdatasync() calls, we need them to all block on
- * the log force before we clear the ili_fsync_fields field. This ensures that
- * we don't get a racing sync operation that does not wait for the metadata to
- * hit the journal before returning.  If we race with clearing ili_fsync_fields,
- * then all that will happen is the log force will do nothing as the lsn will
- * already be on disk.  We can't race with setting ili_fsync_fields because that
- * is done under XFS_ILOCK_EXCL, and that can't happen because we hold the lock
- * shared until after the ili_fsync_fields is cleared.
+ * We don't have to serialise against concurrent modifications, as we do not
+ * have to wait for modifications that have not yet completed. We define a
+ * transaction commit as completing when the commit sequence number is updated,
+ * hence if the sequence number has not updated, the sync operation has been
+ * run before the commit completed and we don't have to wait for it.
+ *
+ * If we have concurrent fsync/fdatasync() calls, the sequence numbers remain
+ * set on the log item until - at least - the journal flush completes. In
+ * reality, they are only cleared when the inode is fully unpinned (i.e.
+ * persistent in the journal and not dirty in the CIL), and so we rely on
+ * xfs_log_force_seq() either skipping sequences that have been persisted or
+ * waiting on sequences that are still in flight to correctly order concurrent
+ * sync operations.
   */
-static  int
+static int
  xfs_fsync_flush_log(
         struct xfs_inode        *ip,
         bool                    datasync,
         int                     *log_flushed)
  {
-       int                     error = 0;
-       xfs_csn_t               seq;
+       struct xfs_inode_log_item *iip = ip->i_itemp;
+       xfs_csn_t               seq = 0;
  
-       xfs_ilock(ip, XFS_ILOCK_SHARED);
-       seq = xfs_fsync_seq(ip, datasync);
-       if (seq) {
-               error = xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC,
-                                         log_flushed);
+       spin_lock(&iip->ili_lock);
+       if (datasync)
+               seq = iip->ili_datasync_seq;
+       else
+               seq = iip->ili_commit_seq;
+       spin_unlock(&iip->ili_lock);
  
-               spin_lock(&ip->i_itemp->ili_lock);
-               ip->i_itemp->ili_fsync_fields = 0;
-               spin_unlock(&ip->i_itemp->ili_lock);
-       }
-       xfs_iunlock(ip, XFS_ILOCK_SHARED);
-       return error;
+       if (!seq)
+               return 0;
+
+       return xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC,
+                                         log_flushed);
  }
  
  STATIC int
@@ -158,12 +153,10 @@ xfs_file_fsync(
                 error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
  
         /*
-        * Any inode that has dirty modifications in the log is pinned.  The
-        * racy check here for a pinned inode will not catch modifications
-        * that happen concurrently to the fsync call, but fsync semantics
-        * only require to sync previously completed I/O.
+        * If the inode has an inode log item attached, it may need the journal
+        * flushed to persist any changes the log item might be tracking.
          */
-       if (xfs_ipincount(ip)) {
+       if (ip->i_itemp) {
                 err2 = xfs_fsync_flush_log(ip, datasync, &log_flushed);
                 if (err2 && !error)
                         error = err2;
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c

index 2bfe87b09c2ada5b25c5642ab2d6f8191db7e1a3..047a5260cf7084bb5d73aa19a8c636ab507c597e 100644 (file)
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1667,7 +1667,6 @@ retry:
         spin_lock(&iip->ili_lock);
         iip->ili_last_fields = iip->ili_fields;
         iip->ili_fields = 0;
-       iip->ili_fsync_fields = 0;
         spin_unlock(&iip->ili_lock);
         ASSERT(iip->ili_last_fields);
  
@@ -1832,12 +1831,20 @@ static void
  xfs_iunpin(
         struct xfs_inode        *ip)
  {
-       xfs_assert_ilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED);
+       struct xfs_inode_log_item *iip = ip->i_itemp;
+       xfs_csn_t               seq = 0;
  
         trace_xfs_inode_unpin_nowait(ip, _RET_IP_);
+       xfs_assert_ilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED);
+
+       spin_lock(&iip->ili_lock);
+       seq = iip->ili_commit_seq;
+       spin_unlock(&iip->ili_lock);
+       if (!seq)
+               return;
  
         /* Give the log a push to start the unpinning I/O */
-       xfs_log_force_seq(ip->i_mount, ip->i_itemp->ili_commit_seq, 0, NULL);
+       xfs_log_force_seq(ip->i_mount, seq, 0, NULL);
  
  }
  
@@ -2504,7 +2511,6 @@ flush_out:
         spin_lock(&iip->ili_lock);
         iip->ili_last_fields = iip->ili_fields;
         iip->ili_fields = 0;
-       iip->ili_fsync_fields = 0;
         set_bit(XFS_LI_FLUSHING, &iip->ili_item.li_flags);
         spin_unlock(&iip->ili_lock);
  
@@ -2663,12 +2669,15 @@ int
  xfs_log_force_inode(
         struct xfs_inode        *ip)
  {
+       struct xfs_inode_log_item *iip = ip->i_itemp;
         xfs_csn_t               seq = 0;
  
-       xfs_ilock(ip, XFS_ILOCK_SHARED);
-       if (xfs_ipincount(ip))
-               seq = ip->i_itemp->ili_commit_seq;
-       xfs_iunlock(ip, XFS_ILOCK_SHARED);
+       if (!iip)
+               return 0;
+
+       spin_lock(&iip->ili_lock);
+       seq = iip->ili_commit_seq;
+       spin_unlock(&iip->ili_lock);
  
         if (!seq)
                 return 0;
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c

index 678ca95793e0af20f103ff650dc21c98a6ed7cce..1bd411a1114c7ec918ec0c3e80b7ae613890bad3 100644 (file)
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -187,13 +187,16 @@ xfs_inode_item_precommit(
         }
  
         /*
-        * Record the specific change for fdatasync optimisation. This allows
-        * fdatasync to skip log forces for inodes that are only timestamp
-        * dirty. Once we've processed the XFS_ILOG_IVERSION flag, convert it
-        * to XFS_ILOG_CORE so that the actual on-disk dirty tracking
-        * (ili_fields) correctly tracks that the version has changed.
+        * Store the dirty flags back into the inode item as this state is used
+        * later on in xfs_inode_item_committing() to determine whether the
+        * transaction is relevant to fsync state or not.
+        */
+       iip->ili_dirty_flags = flags;
+
+       /*
+        * Convert the flags to the on-disk fields that have been modified in the
+        * transaction so that ili_fields tracks the changes correctly.
          */
-       iip->ili_fsync_fields |= (flags & ~XFS_ILOG_IVERSION);
         if (flags & XFS_ILOG_IVERSION)
                 flags = ((flags & ~XFS_ILOG_IVERSION) | XFS_ILOG_CORE);
  
@@ -207,12 +210,6 @@ xfs_inode_item_precommit(
         spin_unlock(&iip->ili_lock);
  
         xfs_inode_item_precommit_check(ip);
-
-       /*
-        * We are done with the log item transaction dirty state, so clear it so
-        * that it doesn't pollute future transactions.
-        */
-       iip->ili_dirty_flags = 0;
         return 0;
  }
  
@@ -722,13 +719,24 @@ xfs_inode_item_unpin(
         struct xfs_log_item     *lip,
         int                     remove)
  {
-       struct xfs_inode        *ip = INODE_ITEM(lip)->ili_inode;
+       struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+       struct xfs_inode        *ip = iip->ili_inode;
  
         trace_xfs_inode_unpin(ip, _RET_IP_);
         ASSERT(lip->li_buf || xfs_iflags_test(ip, XFS_ISTALE));
         ASSERT(atomic_read(&ip->i_pincount) > 0);
-       if (atomic_dec_and_test(&ip->i_pincount))
+
+       /*
+        * If this is the last unpin, then the inode no longer needs a journal
+        * flush to persist it. Hence we can clear the commit sequence numbers
+        * as a fsync/fdatasync operation on the inode at this point is a no-op.
+        */
+       if (atomic_dec_and_lock(&ip->i_pincount, &iip->ili_lock)) {
+               iip->ili_commit_seq = 0;
+               iip->ili_datasync_seq = 0;
+               spin_unlock(&iip->ili_lock);
                 wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT);
+       }
  }
  
  STATIC uint
@@ -851,12 +859,45 @@ xfs_inode_item_committed(
         return lsn;
  }
  
+/*
+ * The modification is now complete, so before we unlock the inode we need to
+ * update the commit sequence numbers for data integrity journal flushes. We
+ * always record the commit sequence number (ili_commit_seq) so that anything
+ * that needs a full journal sync will capture all of this modification.
+ *
+ * We then
+ * check if the changes will impact a datasync (O_DSYNC) journal flush. If the
+ * changes will require a datasync flush, then we also record the sequence in
+ * ili_datasync_seq.
+ *
+ * These commit sequence numbers will get cleared atomically with the inode being
+ * unpinned (i.e. pin count goes to zero), and so they will only be set when the
+ * inode is dirty in the journal. This removes the need for checking if the
+ * inode is pinned to determine if a journal flush is necessary, and hence
+ * removes the need for holding the ILOCK_SHARED in xfs_file_fsync() to
+ * serialise pin counts against commit sequence number updates.
+ *
+ */
  STATIC void
  xfs_inode_item_committing(
         struct xfs_log_item     *lip,
         xfs_csn_t               seq)
  {
-       INODE_ITEM(lip)->ili_commit_seq = seq;
+       struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+
+       spin_lock(&iip->ili_lock);
+       iip->ili_commit_seq = seq;
+       if (iip->ili_dirty_flags & ~(XFS_ILOG_IVERSION | XFS_ILOG_TIMESTAMP))
+               iip->ili_datasync_seq = seq;
+       spin_unlock(&iip->ili_lock);
+
+       /*
+        * Clear the per-transaction dirty flags now that we have finished
+        * recording the transaction's inode modifications in the CIL and are
+        * about to release and (maybe) unlock the inode.
+        */
+       iip->ili_dirty_flags = 0;
+
         return xfs_inode_item_release(lip);
  }
  
@@ -1048,7 +1089,6 @@ xfs_iflush_abort_clean(
  {
         iip->ili_last_fields = 0;
         iip->ili_fields = 0;
-       iip->ili_fsync_fields = 0;
         iip->ili_flush_lsn = 0;
         iip->ili_item.li_buf = NULL;
         list_del_init(&iip->ili_item.li_bio_list);
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h

index ba92ce11a011112171e7b72f29c149ab9191688e..2ddcca41714f7a9ec0da7b1d6a78a105105c38d5 100644 (file)
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -32,9 +32,17 @@ struct xfs_inode_log_item {
         spinlock_t              ili_lock;          /* flush state lock */
         unsigned int            ili_last_fields;   /* fields when flushed */
         unsigned int            ili_fields;        /* fields to be logged */
-       unsigned int            ili_fsync_fields;  /* logged since last fsync */
         xfs_lsn_t               ili_flush_lsn;     /* lsn at last flush */
+
+       /*
+        * We record the sequence number for every inode modification, as
+        * well as those that only require fdatasync operations for data
+        * integrity. This allows optimisation of the O_DSYNC/fdatasync path
+        * without needing to track what modifications the journal is currently
+        * carrying for the inode. These are protected by the above ili_lock.
+        */
         xfs_csn_t               ili_commit_seq;    /* last transaction commit */
+       xfs_csn_t               ili_datasync_seq;  /* for datasync optimisation */
  };
  
  static inline int xfs_inode_clean(struct xfs_inode *ip)
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c

index 2570d0a66047380c292bd7ad245ce387f991dee9..d3f6e3e42a11913f00b79a439d74dcc2bbda6389 100644 (file)
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -149,9 +149,18 @@ xfs_bmbt_to_iomap(
                 iomap->bdev = target->bt_bdev;
         iomap->flags = iomap_flags;
  
-       if (xfs_ipincount(ip) &&
-           (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
-               iomap->flags |= IOMAP_F_DIRTY;
+       /*
+        * If the inode is dirty for datasync purposes, let iomap know so it
+        * doesn't elide the IO completion journal flushes on O_DSYNC IO.
+        */
+       if (ip->i_itemp) {
+               struct xfs_inode_log_item *iip = ip->i_itemp;
+
+               spin_lock(&iip->ili_lock);
+               if (iip->ili_datasync_seq)
+                       iomap->flags |= IOMAP_F_DIRTY;
+               spin_unlock(&iip->ili_lock);
+       }
  
         iomap->validity_cookie = sequence_cookie;
         return 0;
author	Dave Chinner <dchinner@redhat.com>
	Wed, 17 Sep 2025 22:12:54 +0000 (08:12 +1000)
committer	Carlos Maiolino <cem@kernel.org>
	Tue, 23 Sep 2025 13:12:43 +0000 (15:12 +0200)
fs/xfs/xfs_file.c		patch \| blob \| blame \| history
fs/xfs/xfs_inode.c		patch \| blob \| blame \| history
fs/xfs/xfs_inode_item.c		patch \| blob \| blame \| history
fs/xfs/xfs_inode_item.h		patch \| blob \| blame \| history
fs/xfs/xfs_iomap.c		patch \| blob \| blame \| history