git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
mm: kick writeback flusher for IOCB_DONTCACHE with targeted dirty tracking
author Jeff Layton <jlayton@kernel.org>
Mon, 11 May 2026 11:58:29 +0000 (07:58 -0400)
committer Christian Brauner <brauner@kernel.org>
Thu, 4 Jun 2026 08:16:51 +0000 (10:16 +0200)
The IOCB_DONTCACHE writeback path in generic_write_sync() calls
filemap_flush_range() on every write, submitting writeback inline in
the writer's context.  Perf lock contention profiling shows the
performance problem is not lock contention but the writeback submission
work itself — walking the page tree and submitting I/O blocks the writer
for milliseconds, inflating p99.9 latency from 23ms (buffered) to 93ms
(dontcache).

Replace the inline filemap_flush_range() call with a flusher kick that
drains dirty pages in the background.  This moves writeback submission
completely off the writer's hot path.

To avoid flushing unrelated buffered dirty data, add a dedicated
WB_start_dontcache bit and wb_check_start_dontcache() handler that uses
the per-wb WB_DONTCACHE_DIRTY counter to determine how many pages to
write back.  The flusher writes back that many pages from the oldest dirty
inodes (not restricted to dontcache-specific inodes). This helps
preserve I/O batching while limiting the scope of expedited writeback.

Like WB_start_all, the WB_start_dontcache bit coalesces multiple
DONTCACHE writes into a single flusher wakeup without per-write
allocations.  Use test_and_clear_bit to atomically consume the kick
request before reading the dirty counter and starting writeback, so that
concurrent DONTCACHE writes during writeback can re-set the bit and
schedule a follow-up flusher run.

Read the dirty counter with wb_stat_sum() (aggregating per-CPU batches)
rather than wb_stat() (which reads only the global counter) to ensure
small writes below the percpu batch threshold are visible to the flusher.

In filemap_dontcache_kick_writeback(), set the WB_start_dontcache bit
inside the unlocked_inode_to_wb_begin/end section for correct cgroup
writeback domain targeting, but defer the wb_wakeup() call until after
the section ends, since wb_wakeup() uses spin_unlock_irq() which would
unconditionally re-enable interrupts while the i_pages xa_lock may still
be held under irqsave during a cgroup writeback switch. Pin the wb with
wb_get() inside the RCU critical section before calling wb_wakeup()
outside it, since cgroup bdi_writeback structures are RCU-freed and the
wb pointer could become invalid after unlocked_inode_to_wb_end() drops
the RCU read lock.

Also add WB_REASON_DONTCACHE as a new writeback reason for tracing
visibility.

dontcache-bench results (same host, T6F_SKL_1920GBF, 251 GiB RAM,
xfs on NVMe, fio io_uring):

Buffered and direct I/O paths are unaffected by this patchset. All
improvements are confined to the dontcache path:

Single-stream throughput (MB/s):
                        Before    After    Change
  seq-write/dontcache      298      897    +201%
  rand-write/dontcache     131      236     +80%

Tail latency improvements (seq-write/dontcache):
  p99:    135,266 us  ->  23,986 us   (-82%)
  p99.9: 8,925,479 us ->  28,443 us   (-99.7%)

Multi-writer (4 jobs, sequential write):
                                Before    After    Change
  dontcache aggregate (MB/s)     2,529    4,532     +79%
  dontcache p99 (us)             8,553    1,002     -88%
  dontcache p99.9 (us)         109,314    1,057     -99%

  Dontcache multi-writer throughput now matches buffered (4,532 vs
  4,616 MB/s).

32-file write (Axboe test):
                                Before    After    Change
  dontcache aggregate (MB/s)     1,548    3,499    +126%
  dontcache p99 (us)            10,170      602     -94%
  Peak dirty pages (MB)          1,837      213     -88%

  Dontcache now reaches 81% of buffered throughput (was 35%).

Competing writers (dontcache vs buffered, separate files):
                                Before    After
  buffered writer                  868      433 MB/s
  dontcache writer                 415      433 MB/s
  Aggregate                      1,284      866 MB/s

  Previously the buffered writer starved the dontcache writer 2:1.
  With per-bdi_writeback tracking, both writers now receive equal
  bandwidth. The aggregate matches the buffered-vs-buffered baseline
  (863 MB/s), indicating fair sharing regardless of I/O mode.

  The dontcache writer's p99.9 latency collapsed from 119 ms to
  33 ms (-73%), eliminating the severe periodic stalls seen in the
  baseline. Both writers now share identical latency profiles,
  matching the buffered-vs-buffered pattern.

The per-bdi_writeback dirty tracking dramatically reduces peak dirty
pages in dontcache workloads, with the 32-file test dropping from
1.8 GB to 213 MB. Dontcache sequential write throughput triples and
multi-writer throughput reaches parity with buffered I/O, with tail
latencies collapsing by 1-2 orders of magnitude.

Assisted-by: Claude:claude-opus-4-6
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Link: https://patch.msgid.link/20260511-dontcache-v7-3-2848ddce8090@kernel.org
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
fs/fs-writeback.c
include/linux/backing-dev-defs.h
include/linux/fs.h
include/trace/events/writeback.h

index b303516f27535bcda5b12f9253654b5df1633279..fdb8766d275a17009f3c4864fc8de966e7e8e7df 100644 (file)
@@ -2396,6 +2396,27 @@ static long wb_check_start_all(struct bdi_writeback *wb)
        return nr_pages;
 }
 
+static long wb_check_start_dontcache(struct bdi_writeback *wb)
+{
+       long nr_pages;
+
+       if (!test_and_clear_bit(WB_start_dontcache, &wb->state))
+               return 0;
+       /* The kick is consumed, so writers can re-arm it during writeback. */
+       nr_pages = wb_stat_sum(wb, WB_DONTCACHE_DIRTY); /* sums per-CPU batches */
+       if (nr_pages) {
+               struct wb_writeback_work work = {
+                       .nr_pages       = nr_pages,
+                       .sync_mode      = WB_SYNC_NONE,
+                       .range_cyclic   = 1,
+                       .reason         = WB_REASON_DONTCACHE,
+               };
+
+               nr_pages = wb_writeback(wb, &work);
+       }
+       /* Result of wb_writeback(), or 0 when the summed counter was zero. */
+       return nr_pages;
+}
 
 /*
  * Retrieve work items and do the writeback they describe
@@ -2417,6 +2438,11 @@ static long wb_do_writeback(struct bdi_writeback *wb)
         */
        wrote += wb_check_start_all(wb);
 
+       /*
+        * Check for dontcache writeback request
+        */
+       wrote += wb_check_start_dontcache(wb);
+
        /*
         * Check for periodic writeback, kupdated() style
         */
@@ -2491,6 +2517,43 @@ void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi,
        rcu_read_unlock();
 }
 
+/**
+ * filemap_dontcache_kick_writeback - kick flusher for IOCB_DONTCACHE writes
+ * @mapping:   address_space that was just written to
+ *
+ * Kick the flusher thread to expedite writeback of dontcache dirty pages:
+ * request writeback on the inode's wb for as many pages as there are dontcache
+ * pages, without restricting it to dontcache pages.  Nothing is done if the wb
+ * has no dirty I/O or a kick is already pending.
+ *
+ * This significantly improves performance over writing all of the wb's pages
+ * or only dontcache pages.  It doesn't guarantee quick writeback and reclaim
+ * of dontcache pages, but it keeps dirty pages in check; over the longer term
+ * they are written and reclaimed by background writeback regardless.
+ */
+void filemap_dontcache_kick_writeback(struct address_space *mapping)
+{
+       struct inode *inode = mapping->host;
+       struct bdi_writeback *wb;
+       struct wb_lock_cookie cookie = {};
+       bool need_wakeup = false;
+
+       wb = unlocked_inode_to_wb_begin(inode, &cookie);
+       if (wb_has_dirty_io(wb) &&
+           !test_bit(WB_start_dontcache, &wb->state) &&
+           !test_and_set_bit(WB_start_dontcache, &wb->state)) {
+               wb_get(wb);     /* pin wb: it is used after the section ends */
+               need_wakeup = true;
+       }
+       unlocked_inode_to_wb_end(inode, &cookie);
+       /* wb_wakeup() uses spin_unlock_irq(), so it must run outside the section. */
+       if (need_wakeup) {
+               wb_wakeup(wb);
+               wb_put(wb);
+       }
+}
+EXPORT_SYMBOL_GPL(filemap_dontcache_kick_writeback);
+
 /*
  * Wakeup the flusher threads to start writeback of all currently dirty pages
  */
index cb660dd372866b9dffb8cfa66ee03d890076bb6a..4f10849373150dfbc735565ac0f06f4d20194235 100644 (file)
@@ -26,6 +26,7 @@ enum wb_state {
        WB_writeback_running,   /* Writeback is in progress */
        WB_has_dirty_io,        /* Dirty inodes on ->b_{dirty|io|more_io} */
        WB_start_all,           /* nr_pages == 0 (all) work pending */
+       WB_start_dontcache,     /* dontcache writeback pending */
 };
 
 enum wb_stat_item {
@@ -56,6 +57,7 @@ enum wb_reason {
         */
        WB_REASON_FORKER_THREAD,
        WB_REASON_FOREIGN_FLUSH,
+       WB_REASON_DONTCACHE,
 
        WB_REASON_MAX,
 };
index 11559c513dfbb2f9bf15a8c7516f031aab3e094f..df72b42a9e9b6fed10bdfe5ec3b6084179b2036e 100644 (file)
@@ -2624,6 +2624,7 @@ extern int __must_check file_write_and_wait_range(struct file *file,
                                                loff_t start, loff_t end);
 int filemap_flush_range(struct address_space *mapping, loff_t start,
                loff_t end);
+void filemap_dontcache_kick_writeback(struct address_space *mapping);
 
 static inline int file_write_and_wait(struct file *file)
 {
@@ -2657,10 +2658,7 @@ static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count)
                if (ret)
                        return ret;
        } else if (iocb->ki_flags & IOCB_DONTCACHE) {
-               struct address_space *mapping = iocb->ki_filp->f_mapping;
-
-               filemap_flush_range(mapping, iocb->ki_pos - count,
-                               iocb->ki_pos - 1);
+               filemap_dontcache_kick_writeback(iocb->ki_filp->f_mapping);
        }
 
        return count;
index bdac0d685a984400c3d9bea56105d8cede28fc41..13ee076ccd166ee7fbae5486edd4abb9b648ab1f 100644 (file)
@@ -44,7 +44,8 @@
        EM( WB_REASON_PERIODIC,                 "periodic")             \
        EM( WB_REASON_FS_FREE_SPACE,            "fs_free_space")        \
        EM( WB_REASON_FORKER_THREAD,            "forker_thread")        \
-       EMe(WB_REASON_FOREIGN_FLUSH,            "foreign_flush")
+       EM( WB_REASON_FOREIGN_FLUSH,            "foreign_flush")        \
+       EMe(WB_REASON_DONTCACHE,                "dontcache")
 
 WB_WORK_REASON