ext4: process folios writeback in bytes
author    Zhang Yi <yi.zhang@huawei.com>
Mon, 7 Jul 2025 14:08:04 +0000 (22:08 +0800)
committer Theodore Ts'o <tytso@mit.edu>
Mon, 14 Jul 2025 03:41:51 +0000 (23:41 -0400)
Since ext4 supports large folios, processing writeback in units of pages is
no longer appropriate; switch to processing writeback in bytes.

Suggested-by: Jan Kara <jack@suse.cz>
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Link: https://patch.msgid.link/20250707140814.542883-2-yi.zhang@huaweicloud.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
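
The core of the change is replacing the pgoff_t page-index cursor in struct
mpage_da_data with a loff_t byte cursor. The following is a minimal userspace
sketch, not kernel code: PAGE_SHIFT, the block size, and the folio sizes are
assumed purely for illustration. It models the arithmetic the patch moves to,
where the cursor advances by folio_size() bytes and the page index or logical
block is derived from the byte position by shifting.

    /* Userspace model of byte-based writeback tracking (illustrative only). */
    #include <stdio.h>

    #define PAGE_SHIFT 12                       /* 4 KiB pages (assumed) */

    int main(void)
    {
            unsigned int blkbits = 10;          /* 1 KiB filesystem blocks (assumed) */
            long long start_pos = 5LL << PAGE_SHIFT;    /* byte cursor, was first_page */
            /* one 4 KiB folio followed by one 16 KiB (large) folio */
            long long folio_sizes[] = { 1LL << PAGE_SHIFT, 4LL << PAGE_SHIFT };

            for (int i = 0; i < 2; i++) {
                    /* page index and logical block fall out of the byte position */
                    long long index = start_pos >> PAGE_SHIFT;
                    long long lblk  = start_pos >> blkbits;

                    printf("start_pos=%lld (page index %lld, lblk %lld)\n",
                           start_pos, index, lblk);

                    /* mpage_folio_done() now advances by folio_size(), not folio_nr_pages() */
                    start_pos += folio_sizes[i];
            }
            printf("final start_pos=%lld\n", start_pos);
            return 0;
    }

Because the byte cursor makes next_pos an exclusive bound, the extent length
in mpage_release_unused_pages() changes from last - start + 1 to last - start,
and the folio scan runs over [start_pos >> PAGE_SHIFT, next_pos >> PAGE_SHIFT).
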
fs/ext4/inode.c
include/trace/events/ext4.h

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 35e7f34ee18890fad6f93c7551a57fa7a611b7b9..4cb499f28c12c427de663a2828a725a25cd6d80b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1665,11 +1665,12 @@ struct mpage_da_data {
        unsigned int can_map:1; /* Can writepages call map blocks? */
 
        /* These are internal state of ext4_do_writepages() */
-       pgoff_t first_page;     /* The first page to write */
-       pgoff_t next_page;      /* Current page to examine */
-       pgoff_t last_page;      /* Last page to examine */
+       loff_t start_pos;       /* The start pos to write */
+       loff_t next_pos;        /* Current pos to examine */
+       loff_t end_pos;         /* Last pos to examine */
+
        /*
-        * Extent to map - this can be after first_page because that can be
+        * Extent to map - this can be after start_pos because that can be
         * fully mapped. We somewhat abuse m_flags to store whether the extent
         * is delalloc or unwritten.
         */
@@ -1689,38 +1690,38 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd,
        struct inode *inode = mpd->inode;
        struct address_space *mapping = inode->i_mapping;
 
-       /* This is necessary when next_page == 0. */
-       if (mpd->first_page >= mpd->next_page)
+       /* This is necessary when next_pos == 0. */
+       if (mpd->start_pos >= mpd->next_pos)
                return;
 
        mpd->scanned_until_end = 0;
-       index = mpd->first_page;
-       end   = mpd->next_page - 1;
        if (invalidate) {
                ext4_lblk_t start, last;
-               start = index << (PAGE_SHIFT - inode->i_blkbits);
-               last = end << (PAGE_SHIFT - inode->i_blkbits);
+               start = EXT4_B_TO_LBLK(inode, mpd->start_pos);
+               last = mpd->next_pos >> inode->i_blkbits;
 
                /*
                 * avoid racing with extent status tree scans made by
                 * ext4_insert_delayed_block()
                 */
                down_write(&EXT4_I(inode)->i_data_sem);
-               ext4_es_remove_extent(inode, start, last - start + 1);
+               ext4_es_remove_extent(inode, start, last - start);
                up_write(&EXT4_I(inode)->i_data_sem);
        }
 
        folio_batch_init(&fbatch);
-       while (index <= end) {
-               nr = filemap_get_folios(mapping, &index, end, &fbatch);
+       index = mpd->start_pos >> PAGE_SHIFT;
+       end = mpd->next_pos >> PAGE_SHIFT;
+       while (index < end) {
+               nr = filemap_get_folios(mapping, &index, end - 1, &fbatch);
                if (nr == 0)
                        break;
                for (i = 0; i < nr; i++) {
                        struct folio *folio = fbatch.folios[i];
 
-                       if (folio->index < mpd->first_page)
+                       if (folio_pos(folio) < mpd->start_pos)
                                continue;
-                       if (folio_next_index(folio) - 1 > end)
+                       if (folio_next_index(folio) > end)
                                continue;
                        BUG_ON(!folio_test_locked(folio));
                        BUG_ON(folio_test_writeback(folio));
@@ -2022,7 +2023,7 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
 
 static void mpage_folio_done(struct mpage_da_data *mpd, struct folio *folio)
 {
-       mpd->first_page += folio_nr_pages(folio);
+       mpd->start_pos += folio_size(folio);
        folio_unlock(folio);
 }
 
@@ -2032,7 +2033,7 @@ static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio)
        loff_t size;
        int err;
 
-       BUG_ON(folio->index != mpd->first_page);
+       WARN_ON_ONCE(folio_pos(folio) != mpd->start_pos);
        folio_clear_dirty_for_io(folio);
        /*
         * We have to be very careful here!  Nothing protects writeback path
@@ -2444,7 +2445,7 @@ update_disksize:
         * Update on-disk size after IO is submitted.  Races with
         * truncate are avoided by checking i_size under i_data_sem.
         */
-       disksize = ((loff_t)mpd->first_page) << PAGE_SHIFT;
+       disksize = mpd->start_pos;
        if (disksize > READ_ONCE(EXT4_I(inode)->i_disksize)) {
                int err2;
                loff_t i_size;
@@ -2547,8 +2548,8 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
        struct address_space *mapping = mpd->inode->i_mapping;
        struct folio_batch fbatch;
        unsigned int nr_folios;
-       pgoff_t index = mpd->first_page;
-       pgoff_t end = mpd->last_page;
+       pgoff_t index = mpd->start_pos >> PAGE_SHIFT;
+       pgoff_t end = mpd->end_pos >> PAGE_SHIFT;
        xa_mark_t tag;
        int i, err = 0;
        int blkbits = mpd->inode->i_blkbits;
@@ -2563,7 +2564,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
                tag = PAGECACHE_TAG_DIRTY;
 
        mpd->map.m_len = 0;
-       mpd->next_page = index;
+       mpd->next_pos = mpd->start_pos;
        if (ext4_should_journal_data(mpd->inode)) {
                handle = ext4_journal_start(mpd->inode, EXT4_HT_WRITE_PAGE,
                                            bpp);
@@ -2594,7 +2595,8 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
                                goto out;
 
                        /* If we can't merge this page, we are done. */
-                       if (mpd->map.m_len > 0 && mpd->next_page != folio->index)
+                       if (mpd->map.m_len > 0 &&
+                           mpd->next_pos != folio_pos(folio))
                                goto out;
 
                        if (handle) {
@@ -2640,8 +2642,8 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
                        }
 
                        if (mpd->map.m_len == 0)
-                               mpd->first_page = folio->index;
-                       mpd->next_page = folio_next_index(folio);
+                               mpd->start_pos = folio_pos(folio);
+                       mpd->next_pos = folio_pos(folio) + folio_size(folio);
                        /*
                         * Writeout when we cannot modify metadata is simple.
                         * Just submit the page. For data=journal mode we
@@ -2784,18 +2786,18 @@ static int ext4_do_writepages(struct mpage_da_data *mpd)
                writeback_index = mapping->writeback_index;
                if (writeback_index)
                        cycled = 0;
-               mpd->first_page = writeback_index;
-               mpd->last_page = -1;
+               mpd->start_pos = writeback_index << PAGE_SHIFT;
+               mpd->end_pos = LLONG_MAX;
        } else {
-               mpd->first_page = wbc->range_start >> PAGE_SHIFT;
-               mpd->last_page = wbc->range_end >> PAGE_SHIFT;
+               mpd->start_pos = wbc->range_start;
+               mpd->end_pos = wbc->range_end;
        }
 
        ext4_io_submit_init(&mpd->io_submit, wbc);
 retry:
        if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
-               tag_pages_for_writeback(mapping, mpd->first_page,
-                                       mpd->last_page);
+               tag_pages_for_writeback(mapping, mpd->start_pos >> PAGE_SHIFT,
+                                       mpd->end_pos >> PAGE_SHIFT);
        blk_start_plug(&plug);
 
        /*
@@ -2855,7 +2857,7 @@ retry:
                }
                mpd->do_map = 1;
 
-               trace_ext4_da_write_pages(inode, mpd->first_page, wbc);
+               trace_ext4_da_write_pages(inode, mpd->start_pos, wbc);
                ret = mpage_prepare_extent_to_map(mpd);
                if (!ret && mpd->map.m_len)
                        ret = mpage_map_and_submit_extent(handle, mpd,
@@ -2912,8 +2914,8 @@ unplug:
        blk_finish_plug(&plug);
        if (!ret && !cycled && wbc->nr_to_write > 0) {
                cycled = 1;
-               mpd->last_page = writeback_index - 1;
-               mpd->first_page = 0;
+               mpd->end_pos = (writeback_index << PAGE_SHIFT) - 1;
+               mpd->start_pos = 0;
                goto retry;
        }
 
@@ -2923,7 +2925,7 @@ unplug:
                 * Set the writeback_index so that range_cyclic
                 * mode will write it back later
                 */
-               mapping->writeback_index = mpd->first_page;
+               mapping->writeback_index = mpd->start_pos >> PAGE_SHIFT;
 
 out_writepages:
        trace_ext4_writepages_result(inode, wbc, ret,
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index 156908641e68f18fbbc745b283f02b8fe5bcc44c..62d52997b5c64c9de365de88de64699893270afc 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -483,15 +483,15 @@ TRACE_EVENT(ext4_writepages,
 );
 
 TRACE_EVENT(ext4_da_write_pages,
-       TP_PROTO(struct inode *inode, pgoff_t first_page,
+       TP_PROTO(struct inode *inode, loff_t start_pos,
                 struct writeback_control *wbc),
 
-       TP_ARGS(inode, first_page, wbc),
+       TP_ARGS(inode, start_pos, wbc),
 
        TP_STRUCT__entry(
                __field(        dev_t,  dev                     )
                __field(        ino_t,  ino                     )
-               __field(      pgoff_t,  first_page              )
+               __field(       loff_t,  start_pos               )
                __field(         long,  nr_to_write             )
                __field(          int,  sync_mode               )
        ),
@@ -499,15 +499,14 @@ TRACE_EVENT(ext4_da_write_pages,
        TP_fast_assign(
                __entry->dev            = inode->i_sb->s_dev;
                __entry->ino            = inode->i_ino;
-               __entry->first_page     = first_page;
+               __entry->start_pos      = start_pos;
                __entry->nr_to_write    = wbc->nr_to_write;
                __entry->sync_mode      = wbc->sync_mode;
        ),
 
-       TP_printk("dev %d,%d ino %lu first_page %lu nr_to_write %ld "
-                 "sync_mode %d",
+       TP_printk("dev %d,%d ino %lu start_pos 0x%llx nr_to_write %ld sync_mode %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
-                 (unsigned long) __entry->ino, __entry->first_page,
+                 (unsigned long) __entry->ino, __entry->start_pos,
                  __entry->nr_to_write, __entry->sync_mode)
 );