return xfs_log_force_inode(ip);
}
-static xfs_csn_t
-xfs_fsync_seq(
- struct xfs_inode *ip,
- bool datasync)
-{
- if (!xfs_ipincount(ip))
- return 0;
- if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
- return 0;
- return ip->i_itemp->ili_commit_seq;
-}
-
/*
- * All metadata updates are logged, which means that we just have to flush the
- * log up to the latest LSN that touched the inode.
+ * All metadata updates are logged, which means that we just have to push the
+ * journal to the required sequence number that holds the updates. We track
+ * datasync commits separately to full sync commits, and hence only need to
+ * select the correct sequence number for the log force here.
*
- * If we have concurrent fsync/fdatasync() calls, we need them to all block on
- * the log force before we clear the ili_fsync_fields field. This ensures that
- * we don't get a racing sync operation that does not wait for the metadata to
- * hit the journal before returning. If we race with clearing ili_fsync_fields,
- * then all that will happen is the log force will do nothing as the lsn will
- * already be on disk. We can't race with setting ili_fsync_fields because that
- * is done under XFS_ILOCK_EXCL, and that can't happen because we hold the lock
- * shared until after the ili_fsync_fields is cleared.
+ * We don't have to serialise against concurrent modifications, as we do not
+ * have to wait for modifications that have not yet completed. We define a
+ * transaction commit as completing when the commit sequence number is updated,
+ * hence if the sequence number has not been updated, the sync operation was
+ * run before the commit completed and we don't have to wait for it.
+ *
+ * If we have concurrent fsync/fdatasync() calls, the sequence numbers remain
+ * set on the log item until - at least - the journal flush completes. In
+ * reality, they are only cleared when the inode is fully unpinned (i.e.
+ * persistent in the journal and not dirty in the CIL), and so we rely on
+ * xfs_log_force_seq() either skipping sequences that have been persisted or
+ * waiting on sequences that are still in flight to correctly order concurrent
+ * sync operations.
*/
-static int
+static int
xfs_fsync_flush_log(
struct xfs_inode *ip,
bool datasync,
int *log_flushed)
{
- int error = 0;
- xfs_csn_t seq;
+ struct xfs_inode_log_item *iip = ip->i_itemp;
+ xfs_csn_t seq = 0;
- xfs_ilock(ip, XFS_ILOCK_SHARED);
- seq = xfs_fsync_seq(ip, datasync);
- if (seq) {
- error = xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC,
- log_flushed);
+ spin_lock(&iip->ili_lock);
+ if (datasync)
+ seq = iip->ili_datasync_seq;
+ else
+ seq = iip->ili_commit_seq;
+ spin_unlock(&iip->ili_lock);
- spin_lock(&ip->i_itemp->ili_lock);
- ip->i_itemp->ili_fsync_fields = 0;
- spin_unlock(&ip->i_itemp->ili_lock);
- }
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
- return error;
+ if (!seq)
+ return 0;
+
+ return xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC,
+ log_flushed);
}
STATIC int
error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
/*
- * Any inode that has dirty modifications in the log is pinned. The
- * racy check here for a pinned inode will not catch modifications
- * that happen concurrently to the fsync call, but fsync semantics
- * only require to sync previously completed I/O.
+ * If the inode has an inode log item attached, it may need the journal
+ * flushed to persist any changes the log item might be tracking.
*/
- if (xfs_ipincount(ip)) {
+ if (ip->i_itemp) {
err2 = xfs_fsync_flush_log(ip, datasync, &log_flushed);
if (err2 && !error)
error = err2;
spin_lock(&iip->ili_lock);
iip->ili_last_fields = iip->ili_fields;
iip->ili_fields = 0;
- iip->ili_fsync_fields = 0;
spin_unlock(&iip->ili_lock);
ASSERT(iip->ili_last_fields);
xfs_iunpin(
struct xfs_inode *ip)
{
- xfs_assert_ilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED);
+ struct xfs_inode_log_item *iip = ip->i_itemp;
+ xfs_csn_t seq = 0;
trace_xfs_inode_unpin_nowait(ip, _RET_IP_);
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED);
+
+ spin_lock(&iip->ili_lock);
+ seq = iip->ili_commit_seq;
+ spin_unlock(&iip->ili_lock);
+ if (!seq)
+ return;
/* Give the log a push to start the unpinning I/O */
- xfs_log_force_seq(ip->i_mount, ip->i_itemp->ili_commit_seq, 0, NULL);
+ xfs_log_force_seq(ip->i_mount, seq, 0, NULL);
}
spin_lock(&iip->ili_lock);
iip->ili_last_fields = iip->ili_fields;
iip->ili_fields = 0;
- iip->ili_fsync_fields = 0;
set_bit(XFS_LI_FLUSHING, &iip->ili_item.li_flags);
spin_unlock(&iip->ili_lock);
xfs_log_force_inode(
struct xfs_inode *ip)
{
+ struct xfs_inode_log_item *iip = ip->i_itemp;
xfs_csn_t seq = 0;
- xfs_ilock(ip, XFS_ILOCK_SHARED);
- if (xfs_ipincount(ip))
- seq = ip->i_itemp->ili_commit_seq;
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ if (!iip)
+ return 0;
+
+ spin_lock(&iip->ili_lock);
+ seq = iip->ili_commit_seq;
+ spin_unlock(&iip->ili_lock);
if (!seq)
return 0;
}
/*
- * Record the specific change for fdatasync optimisation. This allows
- * fdatasync to skip log forces for inodes that are only timestamp
- * dirty. Once we've processed the XFS_ILOG_IVERSION flag, convert it
- * to XFS_ILOG_CORE so that the actual on-disk dirty tracking
- * (ili_fields) correctly tracks that the version has changed.
+ * Store the dirty flags back into the inode item as this state is used
+ * later on in xfs_inode_item_committing() to determine whether the
+ * transaction is relevant to fsync state or not.
+ */
+ iip->ili_dirty_flags = flags;
+
+ /*
+ * Convert the flags into the on-disk fields that have been modified in the
+ * transaction so that ili_fields tracks the changes correctly.
*/
- iip->ili_fsync_fields |= (flags & ~XFS_ILOG_IVERSION);
if (flags & XFS_ILOG_IVERSION)
flags = ((flags & ~XFS_ILOG_IVERSION) | XFS_ILOG_CORE);
spin_unlock(&iip->ili_lock);
xfs_inode_item_precommit_check(ip);
-
- /*
- * We are done with the log item transaction dirty state, so clear it so
- * that it doesn't pollute future transactions.
- */
- iip->ili_dirty_flags = 0;
return 0;
}
struct xfs_log_item *lip,
int remove)
{
- struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode;
+ struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+ struct xfs_inode *ip = iip->ili_inode;
trace_xfs_inode_unpin(ip, _RET_IP_);
ASSERT(lip->li_buf || xfs_iflags_test(ip, XFS_ISTALE));
ASSERT(atomic_read(&ip->i_pincount) > 0);
- if (atomic_dec_and_test(&ip->i_pincount))
+
+ /*
+ * If this is the last unpin, then the inode no longer needs a journal
+ * flush to persist it. Hence we can clear the commit sequence numbers
+ * as an fsync/fdatasync operation on the inode at this point is a no-op.
+ */
+ if (atomic_dec_and_lock(&ip->i_pincount, &iip->ili_lock)) {
+ iip->ili_commit_seq = 0;
+ iip->ili_datasync_seq = 0;
+ spin_unlock(&iip->ili_lock);
wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT);
+ }
}
STATIC uint
return lsn;
}
+/*
+ * The modification is now complete, so before we unlock the inode we need to
+ * update the commit sequence numbers for data integrity journal flushes. We
+ * always record the commit sequence number (ili_commit_seq) so that anything
+ * that needs a full journal sync will capture all of this modification.
+ *
+ * We then check if the changes will impact a datasync (O_DSYNC) journal
+ * flush. If the changes will require a datasync flush, then we also record
+ * the sequence in ili_datasync_seq.
+ *
+ * These commit sequence numbers will get cleared atomically with the inode being
+ * unpinned (i.e. pin count goes to zero), and so they will only be set while the
+ * inode is dirty in the journal. This removes the need for checking if the
+ * inode is pinned to determine if a journal flush is necessary, and hence
+ * removes the need for holding the ILOCK_SHARED in xfs_file_fsync() to
+ * serialise pin counts against commit sequence number updates.
+ *
+ */
STATIC void
xfs_inode_item_committing(
struct xfs_log_item *lip,
xfs_csn_t seq)
{
- INODE_ITEM(lip)->ili_commit_seq = seq;
+ struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+
+ spin_lock(&iip->ili_lock);
+ iip->ili_commit_seq = seq;
+ if (iip->ili_dirty_flags & ~(XFS_ILOG_IVERSION | XFS_ILOG_TIMESTAMP))
+ iip->ili_datasync_seq = seq;
+ spin_unlock(&iip->ili_lock);
+
+ /*
+ * Clear the per-transaction dirty flags now that we have finished
+ * recording the transaction's inode modifications in the CIL and are
+ * about to release and (maybe) unlock the inode.
+ */
+ iip->ili_dirty_flags = 0;
+
return xfs_inode_item_release(lip);
}
{
iip->ili_last_fields = 0;
iip->ili_fields = 0;
- iip->ili_fsync_fields = 0;
iip->ili_flush_lsn = 0;
iip->ili_item.li_buf = NULL;
list_del_init(&iip->ili_item.li_bio_list);
spinlock_t ili_lock; /* flush state lock */
unsigned int ili_last_fields; /* fields when flushed */
unsigned int ili_fields; /* fields to be logged */
- unsigned int ili_fsync_fields; /* logged since last fsync */
xfs_lsn_t ili_flush_lsn; /* lsn at last flush */
+
+ /*
+ * We record the commit sequence number of every inode modification, and
+ * separately that of the last modification fdatasync must persist for data
+ * integrity. This allows optimisation of the O_DSYNC/fdatasync path
+ * without needing to track what modifications the journal is currently
+ * carrying for the inode. These are protected by the above ili_lock.
+ */
xfs_csn_t ili_commit_seq; /* last transaction commit */
+ xfs_csn_t ili_datasync_seq; /* for datasync optimisation */
};
static inline int xfs_inode_clean(struct xfs_inode *ip)
iomap->bdev = target->bt_bdev;
iomap->flags = iomap_flags;
- if (xfs_ipincount(ip) &&
- (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
- iomap->flags |= IOMAP_F_DIRTY;
+ /*
+ * If the inode is dirty for datasync purposes, let iomap know so it
+ * doesn't elide the IO completion journal flushes on O_DSYNC IO.
+ */
+ if (ip->i_itemp) {
+ struct xfs_inode_log_item *iip = ip->i_itemp;
+
+ spin_lock(&iip->ili_lock);
+ if (iip->ili_datasync_seq)
+ iomap->flags |= IOMAP_F_DIRTY;
+ spin_unlock(&iip->ili_lock);
+ }
iomap->validity_cookie = sequence_cookie;
return 0;