git.ipfire.org Git - thirdparty/postgresql.git/commitdiff
read_stream: Prevent distance from decaying too quickly
author: Andres Freund <andres@anarazel.de>
Wed, 1 Apr 2026 23:50:03 +0000 (19:50 -0400)
committer: Andres Freund <andres@anarazel.de>
Wed, 1 Apr 2026 23:50:03 +0000 (19:50 -0400)
Until now we reduced the look-ahead distance by 1 on every hit, and doubled it
on every miss. That is problematic because there are very common IO patterns
where this prevents us from ever reaching a sufficiently high distance (e.g. a
miss followed by a hit will never have the distance grow beyond 2). In many
such cases, if we had ever reached a sufficient look-ahead distance, things
would have been fine, because we grow the distance faster than we decrease it.

One might think that the most obvious answer to this problem would be to never
reduce the distance. However, that would not work well, as (particularly with
upcoming users of read streams), it is reasonably common to at first have a
lot of misses and then to transition to a fully cached workload, e.g. because
the same blocks are needed repeatedly within one stream. Doing unnecessarily
deep readahead can be costly, due to having to pin a lot more buffers, which
increases CPU overhead.

Because the cost of a synchronously handled miss can be very high (multiple
milliseconds for every IO with commonly used storage) compared to the CPU
overhead of keeping the distance too high, we want to err on the side of not
reducing the distance too early.

The insight that a decrease of the distance by 1 at every hit may be ok at
large distances, but not at low distances, shows a way out: If we only allow
decreasing the distance once there were no misses for our maximum look-ahead
distance, we will keep the distance high as long as readahead has a chance to
do IO asynchronously, but generally not when it has no such chance.

Several folks have written variants of this patch, including at least Thomas
Munro, Melanie Plageman and me.

Reviewed-by: Melanie Plageman <melanieplageman@gmail.com>
Reviewed-by: Nazir Bilal Yavuz <byavuz81@gmail.com>
Discussion: https://postgr.es/m/f3xxfrkafjxpyqxywcxricxgyizjirfceychyxsgn7bwjp5eda@kwbduhy7tfmu
Discussion: https://postgr.es/m/CA+hUKGL2PhFyDoqrHefqasOnaXhSg48t1phs3VM8BAdrZqKZkw@mail.gmail.com
Discussion: https://postgr.es/m/CAH2-Wz%3DkMg3PNay96cHMT0LFwtxP-cQSRZTZzh1Cixxf8G%3Dzrw%40mail.gmail.com

src/backend/storage/aio/read_stream.c

index c9595ea10c7890b98b26dfb4c441f247a252bbc6..31f9e35dee3106832bc185b027af9e743e0e85eb 100644 (file)
@@ -99,6 +99,7 @@ struct ReadStream
        int16           forwarded_buffers;
        int16           pinned_buffers;
        int16           distance;
+       uint16          distance_decay_holdoff;
        int16           initialized_buffers;
        int16           resume_distance;
        int                     read_buffers_flags;
@@ -364,9 +365,22 @@ read_stream_start_pending_read(ReadStream *stream)
        /* Remember whether we need to wait before returning this buffer. */
        if (!need_wait)
        {
-               /* Look-ahead distance decays, no I/O necessary. */
-               if (stream->distance > 1)
-                       stream->distance--;
+               /*
+                * If there currently is no IO in progress, and we have not needed to
+                * issue IO recently, decay the look-ahead distance.  We detect if we
+                * had to issue IO recently by having a decay holdoff that's set to
+                * the max look-ahead distance whenever we need to do IO.  This is
+                * important to ensure we eventually reach a high enough distance to
+                * perform IO asynchronously when starting out with a small look-ahead
+                * distance.
+                */
+               if (stream->distance > 1 && stream->ios_in_progress == 0)
+               {
+                       if (stream->distance_decay_holdoff == 0)
+                               stream->distance--;
+                       else
+                               stream->distance_decay_holdoff--;
+               }
        }
        else
        {
@@ -702,6 +716,7 @@ read_stream_begin_impl(int flags,
        stream->seq_blocknum = InvalidBlockNumber;
        stream->seq_until_processed = InvalidBlockNumber;
        stream->temporary = SmgrIsTemp(smgr);
+       stream->distance_decay_holdoff = 0;
 
        /*
         * Skip the initial ramp-up phase if the caller says we're going to be
@@ -954,6 +969,20 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
                distance = Min(distance, stream->max_pinned_buffers);
                stream->distance = distance;
 
+               /*
+                * As we needed IO, prevent distance from being reduced within our
+                * maximum look-ahead window. This avoids having distance collapse too
+                * quickly in workloads where most of the required blocks are cached,
+                * but where the remaining IOs are a sufficient enough factor to cause
+                * a substantial slowdown if executed synchronously.
+                *
+                * There are valid arguments for preventing decay for max_ios or for
+                * max_pinned_buffers.  But the argument for max_pinned_buffers seems
+                * clearer - if we can't see any misses within the maximum look-ahead
+                * distance, we can't do any useful read-ahead.
+                */
+               stream->distance_decay_holdoff = stream->max_pinned_buffers;
+
                /*
                 * If we've reached the first block of a sequential region we're
                 * issuing advice for, cancel that until the next jump.  The kernel
@@ -1128,6 +1157,7 @@ read_stream_reset(ReadStream *stream)
        /* Start off assuming data is cached. */
        stream->distance = 1;
        stream->resume_distance = stream->distance;
+       stream->distance_decay_holdoff = 0;
 }
 
 /*