]> git.ipfire.org Git - thirdparty/postgresql.git/commitdiff
aio: Don't wait for already in-progress IO
authorAndres Freund <andres@anarazel.de>
Fri, 27 Mar 2026 23:51:53 +0000 (19:51 -0400)
committerAndres Freund <andres@anarazel.de>
Fri, 27 Mar 2026 23:53:32 +0000 (19:53 -0400)
When a backend attempts to start a read IO and finds the first buffer already
has I/O in progress, previously it waited for that I/O to complete before
initiating reads for any of the subsequent buffers.

Although it must wait for the I/O to finish when acquiring the buffer, there's
no reason for it to wait when setting up the read operation. Waiting at this
point prevents starting I/O on subsequent buffers and can significantly reduce
concurrency.

This matters in two workloads:
1) When multiple backends scan the same relation concurrently.
2) When a single backend requests the same block multiple times within the
   readahead distance.

Waiting each time an in-progress read is encountered effectively degenerates
the access pattern into synchronous I/O.

To fix this, when encountering an already in-progress IO for the head buffer,
the wait reference is now recorded and waiting is deferred until
WaitReadBuffers(), when the buffer actually needs to be acquired.

In rare cases, a backend may still need to wait synchronously at IO
start time: If another backend has set BM_IO_IN_PROGRESS on the buffer
but has not yet set the wait reference. Such windows should be brief and
uncommon.

Author: Melanie Plageman <melanieplageman@gmail.com>
Author: Andres Freund <andres@anarazel.de>
Reviewed-by: Andres Freund <andres@anarazel.de>
Reviewed-by: Melanie Plageman <melanieplageman@gmail.com>
Reviewed-by: Nazir Bilal Yavuz <byavuz81@gmail.com>
Discussion: https://postgr.es/m/flat/zljergweqti7x67lg5ije2rzjusie37nslsnkjkkby4laqqbfw%403p3zu522yykv

src/backend/storage/buffer/bufmgr.c
src/include/storage/bufmgr.h
src/test/modules/test_aio/t/001_aio.pl
src/test/modules/test_aio/test_aio--1.0.sql
src/test/modules/test_aio/test_aio.c

index 4d8c3b5d37b98e24184a9c470f09e339706bb077..cd21ae3fc36d64ce997672e6e00a4db89da05487 100644 (file)
@@ -1800,8 +1800,11 @@ WaitReadBuffers(ReadBuffersOperation *operation)
                         * b) reports some time as waiting, even if we never waited
                         *
                         * we first check if we already know the IO is complete.
+                        *
+                        * Note that operation->io_return is uninitialized for foreign IO,
+                        * so we cannot use the cheaper PGAIO_RS_UNKNOWN pre-check.
                         */
-                       if (aio_ret->result.status == PGAIO_RS_UNKNOWN &&
+                       if ((operation->foreign_io || aio_ret->result.status == PGAIO_RS_UNKNOWN) &&
                                !pgaio_wref_check_done(&operation->io_wref))
                        {
                                instr_time      io_start = pgstat_prepare_io_time(track_io_timing);
@@ -1820,11 +1823,45 @@ WaitReadBuffers(ReadBuffersOperation *operation)
                                Assert(pgaio_wref_check_done(&operation->io_wref));
                        }
 
-                       /*
-                        * We now are sure the IO completed. Check the results. This
-                        * includes reporting on errors if there were any.
-                        */
-                       ProcessReadBuffersResult(operation);
+                       if (unlikely(operation->foreign_io))
+                       {
+                               Buffer          buffer = operation->buffers[operation->nblocks_done];
+                               BufferDesc *desc = BufferIsLocal(buffer) ?
+                                       GetLocalBufferDescriptor(-buffer - 1) :
+                                       GetBufferDescriptor(buffer - 1);
+                               uint64          buf_state = pg_atomic_read_u64(&desc->state);
+
+                               if (buf_state & BM_VALID)
+                               {
+                                       BlockNumber blocknum = operation->blocknum + operation->nblocks_done;
+
+                                       operation->nblocks_done += 1;
+                                       Assert(operation->nblocks_done <= operation->nblocks);
+
+                                       /*
+                                        * Track this as a 'hit' for this backend. The backend
+                                        * performing the IO will track it as a 'read'.
+                                        */
+                                       TrackBufferHit(io_object, io_context,
+                                                                  operation->rel, operation->persistence,
+                                                                  operation->smgr, operation->forknum,
+                                                                  blocknum);
+                               }
+
+                               /*
+                                * If the foreign IO failed and left the buffer invalid,
+                                * nblocks_done is not incremented. The retry loop below will
+                                * call AsyncReadBuffers() which will attempt the IO itself.
+                                */
+                       }
+                       else
+                       {
+                               /*
+                                * We now are sure the IO completed. Check the results. This
+                                * includes reporting on errors if there were any.
+                                */
+                               ProcessReadBuffersResult(operation);
+                       }
                }
 
                /*
@@ -1870,7 +1907,8 @@ WaitReadBuffers(ReadBuffersOperation *operation)
  * affected by the call. If the first buffer is valid, *nblocks_progress is
  * set to 1 and operation->nblocks_done is incremented.
  *
- * Returns true if IO was initiated, false if no IO was necessary.
+ * Returns true if IO was initiated or is already in progress (foreign IO),
+ * false if the buffer was already valid.
  */
 static bool
 AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress)
@@ -1943,8 +1981,9 @@ AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress)
        pgstat_prepare_report_checksum_failure(operation->smgr->smgr_rlocator.locator.dbOid);
 
        /*
-        * Get IO handle before StartBufferIO(), as pgaio_io_acquire() might
-        * block, which we don't want after setting IO_IN_PROGRESS.
+        * We must get an IO handle before StartBufferIO(), as pgaio_io_acquire()
+        * might block, which we don't want after setting IO_IN_PROGRESS. If we
+        * don't need to do the IO, we'll release the handle.
         *
         * If we need to wait for IO before we can get a handle, submit
         * already-staged IO first, so that other backends don't need to wait.
@@ -1966,14 +2005,34 @@ AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress)
                ioh = pgaio_io_acquire(CurrentResourceOwner, &operation->io_return);
        }
 
+       operation->foreign_io = false;
+       pgaio_wref_clear(&operation->io_wref);
+
        /*
-        * Check if we can start IO on the first to-be-read buffer.
+        * Try to start IO on the first buffer in a new run of blocks. If AIO is
+        * in progress, be it in this backend or another backend, we just
+        * associate the wait reference with the operation and wait in
+        * WaitReadBuffers(). This turns out to be important for performance in
+        * two workloads:
+        *
+        * 1) A read stream that has to read the same block multiple times within
+        * the readahead distance. This can happen e.g. for the table accesses of
+        * an index scan.
+        *
+        * 2) Concurrent scans by multiple backends on the same relation.
+        *
+        * If we were to synchronously wait for the in-progress IO, we'd not be
+        * able to keep enough I/O in flight.
+        *
+        * If we do find there is ongoing I/O for the buffer, we set up a 1-block
+        * ReadBuffersOperation that WaitReadBuffers then can wait on.
         *
-        * If an I/O is already in progress in another backend, we want to wait
-        * for the outcome: either done, or something went wrong and we will
-        * retry.
+        * It's possible that another backend has started IO on the buffer but not
+        * yet set its wait reference. In this case, we have no choice but to wait
+        * for either the wait reference to be valid or the IO to be done.
         */
-       status = StartBufferIO(buffers[nblocks_done], true, true, NULL);
+       status = StartBufferIO(buffers[nblocks_done], true, true,
+                                                  &operation->io_wref);
        if (status != BUFFER_IO_READY_FOR_IO)
        {
                pgaio_io_release(ioh);
@@ -2006,6 +2065,8 @@ AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress)
 
                /* The IO is already in-progress */
                Assert(status == BUFFER_IO_IN_PROGRESS);
+               Assert(pgaio_wref_valid(&operation->io_wref));
+               operation->foreign_io = true;
 
                return true;
        }
index 4017896f9518ece00f64db0853131f7da5d0b9b2..dd41b92f9444dca3345047f9669c29f13a06131d 100644 (file)
@@ -144,9 +144,11 @@ struct ReadBuffersOperation
         */
        Buffer     *buffers;
        BlockNumber blocknum;
-       int                     flags;
+       uint16          flags;
        int16           nblocks;
        int16           nblocks_done;
+       /* true if waiting on another backend's IO */
+       bool            foreign_io;
        PgAioWaitRef io_wref;
        PgAioReturn io_return;
 };
index 1f9e83690f43b131d1f00cb555adb87310e5973d..63cadd64c15b75c75be9df5a21636c2df4287dcb 100644 (file)
@@ -79,6 +79,9 @@ sub psql_like
        return $output;
 }
 
+# Issue query, wait for the specified wait event to be reached. If
+# wait_current_session is true, we will wait for the event in the current
+# session, otherwise we'll wait for any session.
 sub query_wait_block
 {
        local $Test::Builder::Level = $Test::Builder::Level + 1;
@@ -88,16 +91,29 @@ sub query_wait_block
        my $name = shift;
        my $sql = shift;
        my $waitfor = shift;
+       my $wait_current_session = shift;
 
        my $pid = $psql->query_safe('SELECT pg_backend_pid()');
 
        $psql->{stdin} .= qq($sql;\n);
        $psql->{run}->pump_nb();
+       note "issued sql: $sql;\n";
        ok(1, "$io_method: $name: issued sql");
 
-       $node->poll_query_until('postgres',
-               qq(SELECT wait_event FROM pg_stat_activity WHERE pid = $pid),
-               $waitfor);
+       my $waitquery;
+       if ($wait_current_session)
+       {
+               $waitquery =
+                 qq(SELECT wait_event FROM pg_stat_activity WHERE pid = $pid);
+       }
+       else
+       {
+               $waitquery =
+                 qq(SELECT wait_event FROM pg_stat_activity WHERE wait_event = '$waitfor');
+       }
+
+       note "polling for completion with $waitquery";
+       $node->poll_query_until('postgres', $waitquery, $waitfor);
        ok(1, "$io_method: $name: observed $waitfor wait event");
 }
 
@@ -410,7 +426,8 @@ sub test_startwait_io
                $psql_b,
                "blocking start buffer io",
                qq(SELECT buffer_call_start_io($buf_id, for_input=>true, wait=>true);),
-               "BufferIo");
+               "BufferIo",
+               1);
 
        # Terminate the IO, without marking it as success, this should trigger the
        # waiting session to be able to start the io
@@ -449,7 +466,8 @@ sub test_startwait_io
                $psql_b,
                "blocking start buffer io",
                qq(SELECT buffer_call_start_io($buf_id, for_input=>true, wait=>true);),
-               "BufferIo");
+               "BufferIo",
+               1);
 
        # Terminate the IO, marking it as success
        psql_like(
@@ -1560,6 +1578,10 @@ INSERT INTO tmp_ok SELECT generate_series(1, 5000);
 
                # Test encountering buffer IO we started in the first block of the
                # range.
+               #
+               # Depending on how quick the IO we start completes, the IO might be
+               # completed or we "join" the foreign IO. To hide that variability, the
+               # query below treats a foreign IO as not having needed to do IO.
                $psql_a->query_safe(qq|SELECT evict_rel('$table')|);
                $psql_a->query_safe(
                        qq|SELECT read_rel_block_ll('$table', 1, wait_complete=>false)|);
@@ -1567,7 +1589,7 @@ INSERT INTO tmp_ok SELECT generate_series(1, 5000);
                        $io_method,
                        $psql_a,
                        "$persistency: read buffers, in-progress 1, read 1-3",
-                       qq|SELECT blockoff, blocknum, io_reqd, nblocks FROM read_buffers('$table', 1, 3)|,
+                       qq|SELECT blockoff, blocknum, io_reqd and not foreign_io, nblocks FROM read_buffers('$table', 1, 3)|,
                        qr/^0\|1\|f\|1\n1\|2\|t\|2$/,
                        qr/^$/);
 
@@ -1579,7 +1601,7 @@ INSERT INTO tmp_ok SELECT generate_series(1, 5000);
                        $io_method,
                        $psql_a,
                        "$persistency: read buffers, in-progress 2, read 1-3",
-                       qq|SELECT blockoff, blocknum, io_reqd, nblocks FROM read_buffers('$table', 1, 3)|,
+                       qq|SELECT blockoff, blocknum, io_reqd and not foreign_io, nblocks FROM read_buffers('$table', 1, 3)|,
                        qr/^0\|1\|t\|1\n1\|2\|f\|1\n2\|3\|t\|1$/,
                        qr/^$/);
 
@@ -1591,8 +1613,7 @@ INSERT INTO tmp_ok SELECT generate_series(1, 5000);
                        $io_method,
                        $psql_a,
                        "$persistency: read buffers, in-progress 3, read 1-3",
-                       qq|SELECT blockoff, blocknum, io_reqd, nblocks FROM
-read_buffers('$table', 1, 3)|,
+                       qq|SELECT blockoff, blocknum, io_reqd and not foreign_io, nblocks FROM read_buffers('$table', 1, 3)|,
                        qr/^0\|1\|t\|2\n2\|3\|f\|1$/,
                        qr/^$/);
        }
@@ -1620,14 +1641,15 @@ read_buffers('$table', 1, 3)|,
                        $node,
                        $psql_a,
                        "$persistency: read buffers blocks waiting for concurrent IO",
-                       qq|SELECT blockoff, blocknum, io_reqd, nblocks FROM read_buffers('$table', 1, 5);\n|,
-                       "BufferIo");
+                       qq|SELECT blockoff, blocknum, io_reqd, foreign_io, nblocks FROM read_buffers('$table', 1, 5);\n|,
+                       "BufferIo",
+                       1);
                $psql_b->query_safe(
                        qq|SELECT buffer_call_terminate_io($buf_id, for_input=>true, succeed=>false, io_error=>false, release_aio=>false)|
                );
-               pump_until(
-                       $psql_a->{run}, $psql_a->{timeout},
-                       \$psql_a->{stdout}, qr/0\|1\|t\|2\n2\|3\|t\|3/);
+               # Because no IO wref was assigned, block 3 should not report foreign IO
+               pump_until($psql_a->{run}, $psql_a->{timeout}, \$psql_a->{stdout},
+                       qr/0\|1\|t\|f\|2\n2\|3\|t\|f\|3/);
                ok(1,
                        "$io_method: $persistency: IO was split due to concurrent failed IO"
                );
@@ -1645,13 +1667,15 @@ read_buffers('$table', 1, 3)|,
                        $node,
                        $psql_a,
                        "$persistency: read buffers blocks waiting for concurrent IO",
-                       qq|SELECT blockoff, blocknum, io_reqd, nblocks FROM read_buffers('$table', 1, 5);\n|,
-                       "BufferIo");
+                       qq|SELECT blockoff, blocknum, io_reqd, foreign_io, nblocks FROM read_buffers('$table', 1, 5);\n|,
+                       "BufferIo",
+                       1);
                $psql_b->query_safe(
                        qq|SELECT buffer_call_terminate_io($buf_id, for_input=>true, succeed=>true, io_error=>false, release_aio=>false)|
                );
+               # Because no IO wref was assigned, block 3 should not report foreign IO
                pump_until($psql_a->{run}, $psql_a->{timeout}, \$psql_a->{stdout},
-                       qr/0\|1\|t\|2\n2\|3\|f\|1\n3\|4\|t\|2/);
+                       qr/0\|1\|t\|f\|2\n2\|3\|f\|f\|1\n3\|4\|t\|f\|2/);
                ok(1,
                        "$io_method: $persistency: IO was split due to concurrent successful IO"
                );
@@ -1662,6 +1686,162 @@ read_buffers('$table', 1, 3)|,
 }
 
 
+# Tests for StartReadBuffers() that depend on injection point support
+sub test_read_buffers_inject
+{
+       my $io_method = shift;
+       my $node = shift;
+
+       my $psql_a = $node->background_psql('postgres', on_error_stop => 0);
+       my $psql_b = $node->background_psql('postgres', on_error_stop => 0);
+       my $psql_c = $node->background_psql('postgres', on_error_stop => 0);
+
+       my $expected;
+
+       # We can't easily test waiting for foreign IOs on temporary tables, as the
+       # waiting in the completion hook will just stall the backend. For worker
+       # that is because temporary table IO is executed synchronously, for
+       # io_uring the completion will be executed in the same process, but due to
+       # temporary tables not being shared, we can't do the wait in another
+       # backend.
+       my $table = 'tbl_ok';
+       my $persistency = 'normal';
+
+
+       ###
+       # Test if a read buffers encounters AIO in progress by another backend, it
+       # recognizes that other IO as a foreign IO.
+       ###
+       $psql_a->query_safe(qq|SELECT evict_rel('$table')|);
+
+       # B: Trigger wait in the next AIO read for block 1.
+       $psql_b->query_safe(
+               qq/SELECT inj_io_completion_wait(pid=>pg_backend_pid(),
+                  relfilenode=>pg_relation_filenode('$table'),
+                  blockno=>1);/);
+       ok(1,
+               "$io_method: $persistency: configure wait in completion of block 1");
+
+       # B: Read block 1 and wait for the completion hook to be reached (which could
+       # be in B itself or in an IO worker)
+       query_wait_block(
+               $io_method,
+               $node,
+               $psql_b,
+               "$persistency: wait in completion of block 1",
+               qq|SELECT read_rel_block_ll('$table', blockno=>1, nblocks=>1)|,
+               'completion_wait',
+               0);
+
+       # A: Start read, wait until we're waiting for IO completion
+       query_wait_block(
+               $io_method,
+               $node,
+               $psql_a,
+               "$persistency: read 1-4, blocked on in-progress 1",
+               qq|SELECT blockoff, blocknum, io_reqd, foreign_io, nblocks FROM read_buffers('$table', 1, 4)|,
+               "AioIoCompletion",
+               1);
+
+       # C: Release B from completion hook
+       $psql_c->query_safe(qq|SELECT inj_io_completion_continue()|);
+       ok(1, "$io_method: $persistency: continued completion of block 1");
+
+       # A: Check that we recognized the foreign IO wait, if possible
+       #
+       # Due to sync mode not actually issuing IO below StartReadBuffers(), we
+       # can't observe encountering foreign IO. It still seems worth exercising these
+       # paths however.
+       if ($io_method ne 'sync')
+       {
+               # A foreign IO covering block 1, and one IO covering blocks 2-4.
+               $expected = qr/0\|1\|t\|t\|1\n1\|2\|t\|f\|3/;
+       }
+       else
+       {
+               # One IO covering everything, as that's what StartReadBuffers() will
+               # return for something with misses in sync mode.
+               $expected = qr/0\|1\|t\|f\|4/;
+       }
+       pump_until($psql_a->{run}, $psql_a->{timeout}, \$psql_a->{stdout},
+               $expected);
+       ok(1,
+               "$io_method: $persistency: read 1-3, blocked on in-progress 1, see expected result"
+       );
+       $psql_a->{stdout} = '';
+
+
+       ###
+       # Test if a read buffers encounters AIO in progress by another backend, it
+       # recognizes that other IO as a foreign IO. This time we encounter the
+       # foreign IO multiple times.
+       ###
+       $psql_a->query_safe(qq|SELECT evict_rel('$table')|);
+
+       # B: Trigger wait in the next AIO read for block 3.
+       $psql_b->query_safe(
+               qq/SELECT inj_io_completion_wait(pid=>pg_backend_pid(),
+                  relfilenode=>pg_relation_filenode('$table'),
+                  blockno=>3);/);
+       ok(1,
+               "$io_method: $persistency: configure wait in completion of block 3");
+
+       # B: Read block 2-3 and wait for the completion hook to be reached (which
+       # could be in B itself or in an IO worker)
+       query_wait_block(
+               $io_method,
+               $node,
+               $psql_b,
+               "$persistency: wait in completion of block 2+3",
+               qq|SELECT read_rel_block_ll('$table', blockno=>2, nblocks=>2)|,
+               'completion_wait',
+               0);
+
+       # A: Start read, wait until we're waiting for IO completion
+       #
+       # Note that we need to defer waiting for IO until the end of
+       # read_buffers(), to be able to see that the IO on 3 is still in progress.
+       query_wait_block(
+               $io_method,
+               $node,
+               $psql_a,
+               "$persistency: read 0-3, blocked on in-progress 2+3",
+               qq|SELECT blockoff, blocknum, io_reqd, foreign_io, nblocks FROM
+read_buffers('$table', 0, 4)|,
+               "AioIoCompletion", 1);
+
+       # C: Release B from completion hook
+       $psql_c->query_safe(qq|SELECT inj_io_completion_continue()|);
+       ok(1, "$io_method: $persistency: continued completion of block 2+3");
+
+       # A: Check that we recognized the foreign IO wait, if possible
+       #
+       # See comment further up about sync mode.
+       if ($io_method ne 'sync')
+       {
+               # One IO covering blocks 0-1, A foreign IO covering block 2, and a
+               # foreign IO covering block 3 (same wref as for block 2).
+               $expected = qr/0\|0\|t\|f\|2\n2\|2\|t\|t\|1\n3\|3\|t\|t\|1/;
+       }
+       else
+       {
+               # One IO covering everything, as that's what StartReadBuffers() will
+               # return for something with misses in sync mode.
+               $expected = qr/0\|0\|t\|f\|4/;
+       }
+       pump_until($psql_a->{run}, $psql_a->{timeout}, \$psql_a->{stdout},
+               $expected);
+       ok(1,
+               "$io_method: $persistency: read 0-3, blocked on in-progress 2+3, see expected result"
+       );
+       $psql_a->{stdout} = '';
+
+
+       $psql_a->quit();
+       $psql_b->quit();
+       $psql_c->quit();
+}
+
 # Run all tests that for the specified node / io_method
 sub test_io_method
 {
@@ -1705,6 +1885,7 @@ CHECKPOINT;
                skip 'Injection points not supported by this build', 1
                  unless $ENV{enable_injection_points} eq 'yes';
                test_inject($io_method, $node);
+               test_read_buffers_inject($io_method, $node);
        }
 
        # worker specific injection tests
index 53c83b74e5355e9a87ace4780c5476840f9fc06c..762ac29512f7fc1dabfdc47dfe6840bf840832f3 100644 (file)
@@ -53,7 +53,7 @@ CREATE FUNCTION buffer_call_terminate_io(buffer int, for_input bool, succeed boo
 RETURNS pg_catalog.void STRICT
 AS 'MODULE_PATHNAME' LANGUAGE C;
 
-CREATE FUNCTION read_buffers(rel regclass, startblock int4, nblocks int4, OUT blockoff int4, OUT blocknum int4, OUT io_reqd bool, OUT nblocks int4, OUT buf int4[])
+CREATE FUNCTION read_buffers(rel regclass, startblock int4, nblocks int4, OUT blockoff int4, OUT blocknum int4, OUT io_reqd bool, OUT foreign_io bool, OUT nblocks int4, OUT buf int4[])
 RETURNS SETOF record STRICT
 AS 'MODULE_PATHNAME' LANGUAGE C;
 
index a0ad2313661223ab3f3c05d0a9ee09c9e3a5373b..a8267192cb77cf53cd491dbc2cbebc0c93403618 100644 (file)
@@ -778,8 +778,8 @@ read_buffers(PG_FUNCTION_ARGS)
        {
                ReadBuffersOperation *operation = &operations[nio];
                int                     nblocks_this_io = operation->nblocks;
-               Datum           values[5] = {0};
-               bool            nulls[5] = {0};
+               Datum           values[6] = {0};
+               bool            nulls[6] = {0};
                ArrayType  *buffers_arr;
 
                /* convert buffer array to datum array */
@@ -809,14 +809,18 @@ read_buffers(PG_FUNCTION_ARGS)
                values[2] = BoolGetDatum(io_reqds[nio]);
                nulls[2] = false;
 
-               /* nblocks */
-               values[3] = Int32GetDatum(nblocks_this_io);
+               /* foreign IO */
+               values[3] = BoolGetDatum(operation->foreign_io);
                nulls[3] = false;
 
-               /* array of buffers */
-               values[4] = PointerGetDatum(buffers_arr);
+               /* nblocks */
+               values[4] = Int32GetDatum(nblocks_this_io);
                nulls[4] = false;
 
+               /* array of buffers */
+               values[5] = PointerGetDatum(buffers_arr);
+               nulls[5] = false;
+
                tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
 
                nblocks_disp += nblocks_this_io;