It does not lock ``i_rwsem`` or ``invalidate_lock``.
The dirty bit will be cleared for all folios run through the
-``->map_blocks`` machinery described below even if the writeback fails.
+``->writeback_range`` machinery described below even if the writeback fails.
This is to prevent dirty folio clots when storage devices fail; an
``-EIO`` is recorded for userspace to collect via ``fsync``.
.. code-block:: c
struct iomap_writeback_ops {
- int (*map_blocks)(struct iomap_writepage_ctx *wpc, struct inode *inode,
- loff_t offset, unsigned len);
- int (*submit_ioend)(struct iomap_writepage_ctx *wpc, int status);
- void (*discard_folio)(struct folio *folio, loff_t pos);
+ int (*writeback_range)(struct iomap_writepage_ctx *wpc,
+ struct folio *folio, u64 pos, unsigned int len, u64 end_pos);
+ int (*submit_ioend)(struct iomap_writepage_ctx *wpc, int status);
};
The fields are as follows:
- - ``map_blocks``: Sets ``wpc->iomap`` to the space mapping of the file
+ - ``writeback_range``: Sets ``wpc->iomap`` to the space mapping of the file
range (in bytes) given by ``offset`` and ``len``.
iomap calls this function for each dirty fs block in each dirty folio,
though it will `reuse mappings
This revalidation must be open-coded by the filesystem; it is
unclear if ``iomap::validity_cookie`` can be reused for this
purpose.
+
+ If this methods fails to schedule I/O for any part of a dirty folio, it
+ should throw away any reservations that may have been made for the write.
+ The folio will be marked clean and an ``-EIO`` recorded in the
+ pagecache.
+ Filesystems can use this callback to `remove
+ <https://lore.kernel.org/all/20201029163313.1766967-1-bfoster@redhat.com/>`_
+ delalloc reservations to avoid having delalloc reservations for
+ clean pagecache.
This function must be supplied by the filesystem.
- ``submit_ioend``: Allows the file systems to hook into writeback bio
transactions from process context before submitting the bio.
This function is optional.
- - ``discard_folio``: iomap calls this function after ``->map_blocks``
- fails to schedule I/O for any part of a dirty folio.
- The function should throw away any reservations that may have been
- made for the write.
- The folio will be marked clean and an ``-EIO`` recorded in the
- pagecache.
- Filesystems can use this callback to `remove
- <https://lore.kernel.org/all/20201029163313.1766967-1-bfoster@redhat.com/>`_
- delalloc reservations to avoid having delalloc reservations for
- clean pagecache.
- This function is optional.
-
Pagecache Writeback Completion
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
iomap_readahead(rac, &blkdev_iomap_ops);
}
-static int blkdev_map_blocks(struct iomap_writepage_ctx *wpc,
- struct inode *inode, loff_t offset, unsigned int len)
+static ssize_t blkdev_writeback_range(struct iomap_writepage_ctx *wpc,
+ struct folio *folio, u64 offset, unsigned int len, u64 end_pos)
{
- loff_t isize = i_size_read(inode);
+ loff_t isize = i_size_read(wpc->inode);
if (WARN_ON_ONCE(offset >= isize))
return -EIO;
- if (offset >= wpc->iomap.offset &&
- offset < wpc->iomap.offset + wpc->iomap.length)
- return 0;
- return blkdev_iomap_begin(inode, offset, isize - offset,
- IOMAP_WRITE, &wpc->iomap, NULL);
+
+ if (offset < wpc->iomap.offset ||
+ offset >= wpc->iomap.offset + wpc->iomap.length) {
+ int error;
+
+ error = blkdev_iomap_begin(wpc->inode, offset, isize - offset,
+ IOMAP_WRITE, &wpc->iomap, NULL);
+ if (error)
+ return error;
+ }
+
+ return iomap_add_to_ioend(wpc, folio, offset, end_pos, len);
}
static const struct iomap_writeback_ops blkdev_writeback_ops = {
- .map_blocks = blkdev_map_blocks,
+ .writeback_range = blkdev_writeback_range,
};
static int blkdev_writepages(struct address_space *mapping,
return error;
}
-static int gfs2_map_blocks(struct iomap_writepage_ctx *wpc, struct inode *inode,
- loff_t offset, unsigned int len)
+static ssize_t gfs2_writeback_range(struct iomap_writepage_ctx *wpc,
+ struct folio *folio, u64 offset, unsigned int len, u64 end_pos)
{
- int ret;
-
- if (WARN_ON_ONCE(gfs2_is_stuffed(GFS2_I(inode))))
+ if (WARN_ON_ONCE(gfs2_is_stuffed(GFS2_I(wpc->inode))))
return -EIO;
- if (offset >= wpc->iomap.offset &&
- offset < wpc->iomap.offset + wpc->iomap.length)
- return 0;
+ if (offset < wpc->iomap.offset ||
+ offset >= wpc->iomap.offset + wpc->iomap.length) {
+ int ret;
- memset(&wpc->iomap, 0, sizeof(wpc->iomap));
- ret = gfs2_iomap_get(inode, offset, INT_MAX, &wpc->iomap);
- return ret;
+ memset(&wpc->iomap, 0, sizeof(wpc->iomap));
+ ret = gfs2_iomap_get(wpc->inode, offset, INT_MAX, &wpc->iomap);
+ if (ret)
+ return ret;
+ }
+
+ return iomap_add_to_ioend(wpc, folio, offset, end_pos, len);
}
const struct iomap_writeback_ops gfs2_writeback_ops = {
- .map_blocks = gfs2_map_blocks,
+ .writeback_range = gfs2_writeback_range,
};
* At the end of a writeback pass, there will be a cached ioend remaining on the
* writepage context that the caller will need to submit.
*/
-static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc,
- struct folio *folio, loff_t pos, loff_t end_pos, unsigned len)
+ssize_t iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct folio *folio,
+ loff_t pos, loff_t end_pos, unsigned int dirty_len)
{
struct iomap_folio_state *ifs = folio->private;
size_t poff = offset_in_folio(folio, pos);
unsigned int ioend_flags = 0;
+ unsigned int map_len = min_t(u64, dirty_len,
+ wpc->iomap.offset + wpc->iomap.length - pos);
int error;
+ trace_iomap_add_to_ioend(wpc->inode, pos, dirty_len, &wpc->iomap);
+
+ WARN_ON_ONCE(!folio->private && map_len < dirty_len);
+
+ switch (wpc->iomap.type) {
+ case IOMAP_INLINE:
+ WARN_ON_ONCE(1);
+ return -EIO;
+ case IOMAP_HOLE:
+ return map_len;
+ default:
+ break;
+ }
+
if (wpc->iomap.type == IOMAP_UNWRITTEN)
ioend_flags |= IOMAP_IOEND_UNWRITTEN;
if (wpc->iomap.flags & IOMAP_F_SHARED)
wpc->ioend = iomap_alloc_ioend(wpc, pos, ioend_flags);
}
- if (!bio_add_folio(&wpc->ioend->io_bio, folio, len, poff))
+ if (!bio_add_folio(&wpc->ioend->io_bio, folio, map_len, poff))
goto new_ioend;
if (ifs)
- atomic_add(len, &ifs->write_bytes_pending);
+ atomic_add(map_len, &ifs->write_bytes_pending);
/*
* Clamp io_offset and io_size to the incore EOF so that ondisk
* Note that this defeats the ability to chain the ioends of
* appending writes.
*/
- wpc->ioend->io_size += len;
+ wpc->ioend->io_size += map_len;
if (wpc->ioend->io_offset + wpc->ioend->io_size > end_pos)
wpc->ioend->io_size = end_pos - wpc->ioend->io_offset;
- wbc_account_cgroup_owner(wpc->wbc, folio, len);
- return 0;
+ wbc_account_cgroup_owner(wpc->wbc, folio, map_len);
+ return map_len;
}
+EXPORT_SYMBOL_GPL(iomap_add_to_ioend);
-static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc,
- struct folio *folio, u64 pos, u64 end_pos, unsigned dirty_len,
+static int iomap_writeback_range(struct iomap_writepage_ctx *wpc,
+ struct folio *folio, u64 pos, u32 rlen, u64 end_pos,
bool *wb_pending)
{
- int error;
-
do {
- unsigned map_len;
-
- error = wpc->ops->map_blocks(wpc, wpc->inode, pos, dirty_len);
- if (error)
- break;
- trace_iomap_writepage_map(wpc->inode, pos, dirty_len,
- &wpc->iomap);
+ ssize_t ret;
- map_len = min_t(u64, dirty_len,
- wpc->iomap.offset + wpc->iomap.length - pos);
- WARN_ON_ONCE(!folio->private && map_len < dirty_len);
+ ret = wpc->ops->writeback_range(wpc, folio, pos, rlen, end_pos);
+ if (WARN_ON_ONCE(ret == 0 || ret > rlen))
+ return -EIO;
+ if (ret < 0)
+ return ret;
+ rlen -= ret;
+ pos += ret;
- switch (wpc->iomap.type) {
- case IOMAP_INLINE:
- WARN_ON_ONCE(1);
- error = -EIO;
- break;
- case IOMAP_HOLE:
- break;
- default:
- error = iomap_add_to_ioend(wpc, folio, pos, end_pos,
- map_len);
- if (!error)
- *wb_pending = true;
- break;
- }
- dirty_len -= map_len;
- pos += map_len;
- } while (dirty_len && !error);
+ /*
+ * Holes are not be written back by ->writeback_range, so track
+ * if we did handle anything that is not a hole here.
+ */
+ if (wpc->iomap.type != IOMAP_HOLE)
+ *wb_pending = true;
+ } while (rlen);
- /*
- * We cannot cancel the ioend directly here on error. We may have
- * already set other pages under writeback and hence we have to run I/O
- * completion to mark the error state of the pages under writeback
- * appropriately.
- *
- * Just let the file system know what portion of the folio failed to
- * map.
- */
- if (error && wpc->ops->discard_folio)
- wpc->ops->discard_folio(folio, pos);
- return error;
+ return 0;
}
/*
*/
end_aligned = round_up(end_pos, i_blocksize(inode));
while ((rlen = iomap_find_dirty_range(folio, &pos, end_aligned))) {
- error = iomap_writepage_map_blocks(wpc, folio, pos, end_pos,
- rlen, &wb_pending);
+ error = iomap_writeback_range(wpc, folio, pos, rlen, end_pos,
+ &wb_pending);
if (error)
break;
pos += rlen;
DEFINE_IOMAP_EVENT(iomap_iter_dstmap);
DEFINE_IOMAP_EVENT(iomap_iter_srcmap);
-TRACE_EVENT(iomap_writepage_map,
+TRACE_EVENT(iomap_add_to_ioend,
TP_PROTO(struct inode *inode, u64 pos, unsigned int dirty_len,
struct iomap *iomap),
TP_ARGS(inode, pos, dirty_len, iomap),
spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
}
+/*
+ * We cannot cancel the ioend directly on error. We may have already set other
+ * pages under writeback and hence we have to run I/O completion to mark the
+ * error state of the pages under writeback appropriately.
+ *
+ * If the folio has delalloc blocks on it, the caller is asking us to punch them
+ * out. If we don't, we can leave a stale delalloc mapping covered by a clean
+ * page that needs to be dirtied again before the delalloc mapping can be
+ * converted. This stale delalloc mapping can trip up a later direct I/O read
+ * operation on the same region.
+ *
+ * We prevent this by truncating away the delalloc regions on the folio. Because
+ * they are delalloc, we can do this without needing a transaction. Indeed - if
+ * we get ENOSPC errors, we have to be able to do this truncation without a
+ * transaction as there is no space left for block reservation (typically why
+ * we see a ENOSPC in writeback).
+ */
+static void
+xfs_discard_folio(
+ struct folio *folio,
+ loff_t pos)
+{
+ struct xfs_inode *ip = XFS_I(folio->mapping->host);
+ struct xfs_mount *mp = ip->i_mount;
+
+ if (xfs_is_shutdown(mp))
+ return;
+
+ xfs_alert_ratelimited(mp,
+ "page discard on page "PTR_FMT", inode 0x%llx, pos %llu.",
+ folio, ip->i_ino, pos);
+
+ /*
+ * The end of the punch range is always the offset of the first
+ * byte of the next folio. Hence the end offset is only dependent on the
+ * folio itself and not the start offset that is passed in.
+ */
+ xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, pos,
+ folio_pos(folio) + folio_size(folio), NULL);
+}
+
/*
* Fast revalidation of the cached writeback mapping. Return true if the current
* mapping is valid, false otherwise.
static int
xfs_map_blocks(
struct iomap_writepage_ctx *wpc,
- struct inode *inode,
loff_t offset,
unsigned int len)
{
- struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_inode *ip = XFS_I(wpc->inode);
struct xfs_mount *mp = ip->i_mount;
- ssize_t count = i_blocksize(inode);
+ ssize_t count = i_blocksize(wpc->inode);
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count);
xfs_fileoff_t cow_fsb;
return 0;
}
+static ssize_t
+xfs_writeback_range(
+ struct iomap_writepage_ctx *wpc,
+ struct folio *folio,
+ u64 offset,
+ unsigned int len,
+ u64 end_pos)
+{
+ ssize_t ret;
+
+ ret = xfs_map_blocks(wpc, offset, len);
+ if (!ret)
+ ret = iomap_add_to_ioend(wpc, folio, offset, end_pos, len);
+ if (ret < 0)
+ xfs_discard_folio(folio, offset);
+ return ret;
+}
+
static bool
xfs_ioend_needs_wq_completion(
struct iomap_ioend *ioend)
return 0;
}
-/*
- * If the folio has delalloc blocks on it, the caller is asking us to punch them
- * out. If we don't, we can leave a stale delalloc mapping covered by a clean
- * page that needs to be dirtied again before the delalloc mapping can be
- * converted. This stale delalloc mapping can trip up a later direct I/O read
- * operation on the same region.
- *
- * We prevent this by truncating away the delalloc regions on the folio. Because
- * they are delalloc, we can do this without needing a transaction. Indeed - if
- * we get ENOSPC errors, we have to be able to do this truncation without a
- * transaction as there is no space left for block reservation (typically why
- * we see a ENOSPC in writeback).
- */
-static void
-xfs_discard_folio(
- struct folio *folio,
- loff_t pos)
-{
- struct xfs_inode *ip = XFS_I(folio->mapping->host);
- struct xfs_mount *mp = ip->i_mount;
-
- if (xfs_is_shutdown(mp))
- return;
-
- xfs_alert_ratelimited(mp,
- "page discard on page "PTR_FMT", inode 0x%llx, pos %llu.",
- folio, ip->i_ino, pos);
-
- /*
- * The end of the punch range is always the offset of the first
- * byte of the next folio. Hence the end offset is only dependent on the
- * folio itself and not the start offset that is passed in.
- */
- xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, pos,
- folio_pos(folio) + folio_size(folio), NULL);
-}
-
static const struct iomap_writeback_ops xfs_writeback_ops = {
- .map_blocks = xfs_map_blocks,
+ .writeback_range = xfs_writeback_range,
.submit_ioend = xfs_submit_ioend,
- .discard_folio = xfs_discard_folio,
};
struct xfs_zoned_writepage_ctx {
static int
xfs_zoned_map_blocks(
struct iomap_writepage_ctx *wpc,
- struct inode *inode,
loff_t offset,
unsigned int len)
{
- struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_inode *ip = XFS_I(wpc->inode);
struct xfs_mount *mp = ip->i_mount;
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + len);
return 0;
}
+static ssize_t
+xfs_zoned_writeback_range(
+ struct iomap_writepage_ctx *wpc,
+ struct folio *folio,
+ u64 offset,
+ unsigned int len,
+ u64 end_pos)
+{
+ ssize_t ret;
+
+ ret = xfs_zoned_map_blocks(wpc, offset, len);
+ if (!ret)
+ ret = iomap_add_to_ioend(wpc, folio, offset, end_pos, len);
+ if (ret < 0)
+ xfs_discard_folio(folio, offset);
+ return ret;
+}
+
static int
xfs_zoned_submit_ioend(
struct iomap_writepage_ctx *wpc,
}
static const struct iomap_writeback_ops xfs_zoned_writeback_ops = {
- .map_blocks = xfs_zoned_map_blocks,
+ .writeback_range = xfs_zoned_writeback_range,
.submit_ioend = xfs_zoned_submit_ioend,
- .discard_folio = xfs_discard_folio,
};
STATIC int
* Map blocks for page writeback. This is used only on conventional zone files,
* which implies that the page range can only be within the fixed inode size.
*/
-static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc,
- struct inode *inode, loff_t offset,
- unsigned int len)
+static ssize_t zonefs_writeback_range(struct iomap_writepage_ctx *wpc,
+ struct folio *folio, u64 offset, unsigned len, u64 end_pos)
{
- struct zonefs_zone *z = zonefs_inode_zone(inode);
+ struct zonefs_zone *z = zonefs_inode_zone(wpc->inode);
if (WARN_ON_ONCE(zonefs_zone_is_seq(z)))
return -EIO;
- if (WARN_ON_ONCE(offset >= i_size_read(inode)))
+ if (WARN_ON_ONCE(offset >= i_size_read(wpc->inode)))
return -EIO;
/* If the mapping is already OK, nothing needs to be done */
- if (offset >= wpc->iomap.offset &&
- offset < wpc->iomap.offset + wpc->iomap.length)
- return 0;
+ if (offset < wpc->iomap.offset ||
+ offset >= wpc->iomap.offset + wpc->iomap.length) {
+ int error;
+
+ error = zonefs_write_iomap_begin(wpc->inode, offset,
+ z->z_capacity - offset, IOMAP_WRITE,
+ &wpc->iomap, NULL);
+ if (error)
+ return error;
+ }
- return zonefs_write_iomap_begin(inode, offset,
- z->z_capacity - offset,
- IOMAP_WRITE, &wpc->iomap, NULL);
+ return iomap_add_to_ioend(wpc, folio, offset, end_pos, len);
}
static const struct iomap_writeback_ops zonefs_writeback_ops = {
- .map_blocks = zonefs_write_map_blocks,
+ .writeback_range = zonefs_writeback_range,
};
static int zonefs_writepages(struct address_space *mapping,
struct iomap_writeback_ops {
/*
- * Required, maps the blocks so that writeback can be performed on
- * the range starting at offset.
+ * Required, performs writeback on the passed in range
*
- * Can return arbitrarily large regions, but we need to call into it at
+ * Can map arbitrarily large regions, but we need to call into it at
* least once per folio to allow the file systems to synchronize with
* the write path that could be invalidating mappings.
*
* An existing mapping from a previous call to this method can be reused
* by the file system if it is still valid.
+ *
+ * Returns the number of bytes processed or a negative errno.
*/
- int (*map_blocks)(struct iomap_writepage_ctx *wpc, struct inode *inode,
- loff_t offset, unsigned len);
+ ssize_t (*writeback_range)(struct iomap_writepage_ctx *wpc,
+ struct folio *folio, u64 pos, unsigned int len,
+ u64 end_pos);
/*
* Optional, allows the file systems to hook into bio submission,
* the bio could not be submitted.
*/
int (*submit_ioend)(struct iomap_writepage_ctx *wpc, int status);
-
- /*
- * Optional, allows the file system to discard state on a page where
- * we failed to submit any I/O.
- */
- void (*discard_folio)(struct folio *folio, loff_t pos);
};
struct iomap_writepage_ctx {
void iomap_ioend_try_merge(struct iomap_ioend *ioend,
struct list_head *more_ioends);
void iomap_sort_ioends(struct list_head *ioend_list);
+ssize_t iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct folio *folio,
+ loff_t pos, loff_t end_pos, unsigned int dirty_len);
+
int iomap_writepages(struct iomap_writepage_ctx *wpc);
/*