From: Andres Freund Date: Fri, 27 Mar 2026 23:51:53 +0000 (-0400) Subject: aio: Don't wait for already in-progress IO X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=999dec9ec6a81668057427c2e9312b20635fba02;p=thirdparty%2Fpostgresql.git aio: Don't wait for already in-progress IO When a backend attempts to start a read IO and finds the first buffer already has I/O in progress, previously it waited for that I/O to complete before initiating reads for any of the subsequent buffers. Although it must wait for the I/O to finish when acquiring the buffer, there's no reason for it to wait when setting up the read operation. Waiting at this point prevents starting I/O on subsequent buffers and can significantly reduce concurrency. This matters in two workloads: 1) When multiple backends scan the same relation concurrently. 2) When a single backend requests the same block multiple times within the readahead distance. Waiting each time an in-progress read is encountered effectively degenerates the access pattern into synchronous I/O. To fix this, when encountering an already in-progress IO for the head buffer, the wait reference is now recorded and waiting is deferred until WaitReadBuffers(), when the buffer actually needs to be acquired. In rare cases, a backend may still need to wait synchronously at IO start time: If another backend has set BM_IO_IN_PROGRESS on the buffer but has not yet set the wait reference. Such windows should be brief and uncommon. Author: Melanie Plageman Author: Andres Freund Reviewed-by: Andres Freund Reviewed-by: Melanie Plageman Reviewed-by: Nazir Bilal Yavuz Discussion: https://postgr.es/m/flat/zljergweqti7x67lg5ije2rzjusie37nslsnkjkkby4laqqbfw%403p3zu522yykv --- diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 4d8c3b5d37b..cd21ae3fc36 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -1800,8 +1800,11 @@ WaitReadBuffers(ReadBuffersOperation *operation) * b) reports some time as waiting, even if we never waited * * we first check if we already know the IO is complete. + * + * Note that operation->io_return is uninitialized for foreign IO, + * so we cannot use the cheaper PGAIO_RS_UNKNOWN pre-check. */ - if (aio_ret->result.status == PGAIO_RS_UNKNOWN && + if ((operation->foreign_io || aio_ret->result.status == PGAIO_RS_UNKNOWN) && !pgaio_wref_check_done(&operation->io_wref)) { instr_time io_start = pgstat_prepare_io_time(track_io_timing); @@ -1820,11 +1823,45 @@ WaitReadBuffers(ReadBuffersOperation *operation) Assert(pgaio_wref_check_done(&operation->io_wref)); } - /* - * We now are sure the IO completed. Check the results. This - * includes reporting on errors if there were any. - */ - ProcessReadBuffersResult(operation); + if (unlikely(operation->foreign_io)) + { + Buffer buffer = operation->buffers[operation->nblocks_done]; + BufferDesc *desc = BufferIsLocal(buffer) ? + GetLocalBufferDescriptor(-buffer - 1) : + GetBufferDescriptor(buffer - 1); + uint64 buf_state = pg_atomic_read_u64(&desc->state); + + if (buf_state & BM_VALID) + { + BlockNumber blocknum = operation->blocknum + operation->nblocks_done; + + operation->nblocks_done += 1; + Assert(operation->nblocks_done <= operation->nblocks); + + /* + * Track this as a 'hit' for this backend. The backend + * performing the IO will track it as a 'read'. + */ + TrackBufferHit(io_object, io_context, + operation->rel, operation->persistence, + operation->smgr, operation->forknum, + blocknum); + } + + /* + * If the foreign IO failed and left the buffer invalid, + * nblocks_done is not incremented. The retry loop below will + * call AsyncReadBuffers() which will attempt the IO itself. + */ + } + else + { + /* + * We now are sure the IO completed. Check the results. This + * includes reporting on errors if there were any. + */ + ProcessReadBuffersResult(operation); + } } /* @@ -1870,7 +1907,8 @@ WaitReadBuffers(ReadBuffersOperation *operation) * affected by the call. If the first buffer is valid, *nblocks_progress is * set to 1 and operation->nblocks_done is incremented. * - * Returns true if IO was initiated, false if no IO was necessary. + * Returns true if IO was initiated or is already in progress (foreign IO), + * false if the buffer was already valid. */ static bool AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress) @@ -1943,8 +1981,9 @@ AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress) pgstat_prepare_report_checksum_failure(operation->smgr->smgr_rlocator.locator.dbOid); /* - * Get IO handle before StartBufferIO(), as pgaio_io_acquire() might - * block, which we don't want after setting IO_IN_PROGRESS. + * We must get an IO handle before StartBufferIO(), as pgaio_io_acquire() + * might block, which we don't want after setting IO_IN_PROGRESS. If we + * don't need to do the IO, we'll release the handle. * * If we need to wait for IO before we can get a handle, submit * already-staged IO first, so that other backends don't need to wait. @@ -1966,14 +2005,34 @@ AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress) ioh = pgaio_io_acquire(CurrentResourceOwner, &operation->io_return); } + operation->foreign_io = false; + pgaio_wref_clear(&operation->io_wref); + /* - * Check if we can start IO on the first to-be-read buffer. + * Try to start IO on the first buffer in a new run of blocks. If AIO is + * in progress, be it in this backend or another backend, we just + * associate the wait reference with the operation and wait in + * WaitReadBuffers(). This turns out to be important for performance in + * two workloads: + * + * 1) A read stream that has to read the same block multiple times within + * the readahead distance. This can happen e.g. for the table accesses of + * an index scan. + * + * 2) Concurrent scans by multiple backends on the same relation. + * + * If we were to synchronously wait for the in-progress IO, we'd not be + * able to keep enough I/O in flight. + * + * If we do find there is ongoing I/O for the buffer, we set up a 1-block + * ReadBuffersOperation that WaitReadBuffers then can wait on. * - * If an I/O is already in progress in another backend, we want to wait - * for the outcome: either done, or something went wrong and we will - * retry. + * It's possible that another backend has started IO on the buffer but not + * yet set its wait reference. In this case, we have no choice but to wait + * for either the wait reference to be valid or the IO to be done. */ - status = StartBufferIO(buffers[nblocks_done], true, true, NULL); + status = StartBufferIO(buffers[nblocks_done], true, true, + &operation->io_wref); if (status != BUFFER_IO_READY_FOR_IO) { pgaio_io_release(ioh); @@ -2006,6 +2065,8 @@ AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress) /* The IO is already in-progress */ Assert(status == BUFFER_IO_IN_PROGRESS); + Assert(pgaio_wref_valid(&operation->io_wref)); + operation->foreign_io = true; return true; } diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index 4017896f951..dd41b92f944 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -144,9 +144,11 @@ struct ReadBuffersOperation */ Buffer *buffers; BlockNumber blocknum; - int flags; + uint16 flags; int16 nblocks; int16 nblocks_done; + /* true if waiting on another backend's IO */ + bool foreign_io; PgAioWaitRef io_wref; PgAioReturn io_return; }; diff --git a/src/test/modules/test_aio/t/001_aio.pl b/src/test/modules/test_aio/t/001_aio.pl index 1f9e83690f4..63cadd64c15 100644 --- a/src/test/modules/test_aio/t/001_aio.pl +++ b/src/test/modules/test_aio/t/001_aio.pl @@ -79,6 +79,9 @@ sub psql_like return $output; } +# Issue query, wait for the specified wait event to be reached. If +# wait_current_session is true, we will wait for the event in the current +# session, otherwise we'll wait for any session. sub query_wait_block { local $Test::Builder::Level = $Test::Builder::Level + 1; @@ -88,16 +91,29 @@ sub query_wait_block my $name = shift; my $sql = shift; my $waitfor = shift; + my $wait_current_session = shift; my $pid = $psql->query_safe('SELECT pg_backend_pid()'); $psql->{stdin} .= qq($sql;\n); $psql->{run}->pump_nb(); + note "issued sql: $sql;\n"; ok(1, "$io_method: $name: issued sql"); - $node->poll_query_until('postgres', - qq(SELECT wait_event FROM pg_stat_activity WHERE pid = $pid), - $waitfor); + my $waitquery; + if ($wait_current_session) + { + $waitquery = + qq(SELECT wait_event FROM pg_stat_activity WHERE pid = $pid); + } + else + { + $waitquery = + qq(SELECT wait_event FROM pg_stat_activity WHERE wait_event = '$waitfor'); + } + + note "polling for completion with $waitquery"; + $node->poll_query_until('postgres', $waitquery, $waitfor); ok(1, "$io_method: $name: observed $waitfor wait event"); } @@ -410,7 +426,8 @@ sub test_startwait_io $psql_b, "blocking start buffer io", qq(SELECT buffer_call_start_io($buf_id, for_input=>true, wait=>true);), - "BufferIo"); + "BufferIo", + 1); # Terminate the IO, without marking it as success, this should trigger the # waiting session to be able to start the io @@ -449,7 +466,8 @@ sub test_startwait_io $psql_b, "blocking start buffer io", qq(SELECT buffer_call_start_io($buf_id, for_input=>true, wait=>true);), - "BufferIo"); + "BufferIo", + 1); # Terminate the IO, marking it as success psql_like( @@ -1560,6 +1578,10 @@ INSERT INTO tmp_ok SELECT generate_series(1, 5000); # Test encountering buffer IO we started in the first block of the # range. + # + # Depending on how quick the IO we start completes, the IO might be + # completed or we "join" the foreign IO. To hide that variability, the + # query below treats a foreign IO as not having needed to do IO. $psql_a->query_safe(qq|SELECT evict_rel('$table')|); $psql_a->query_safe( qq|SELECT read_rel_block_ll('$table', 1, wait_complete=>false)|); @@ -1567,7 +1589,7 @@ INSERT INTO tmp_ok SELECT generate_series(1, 5000); $io_method, $psql_a, "$persistency: read buffers, in-progress 1, read 1-3", - qq|SELECT blockoff, blocknum, io_reqd, nblocks FROM read_buffers('$table', 1, 3)|, + qq|SELECT blockoff, blocknum, io_reqd and not foreign_io, nblocks FROM read_buffers('$table', 1, 3)|, qr/^0\|1\|f\|1\n1\|2\|t\|2$/, qr/^$/); @@ -1579,7 +1601,7 @@ INSERT INTO tmp_ok SELECT generate_series(1, 5000); $io_method, $psql_a, "$persistency: read buffers, in-progress 2, read 1-3", - qq|SELECT blockoff, blocknum, io_reqd, nblocks FROM read_buffers('$table', 1, 3)|, + qq|SELECT blockoff, blocknum, io_reqd and not foreign_io, nblocks FROM read_buffers('$table', 1, 3)|, qr/^0\|1\|t\|1\n1\|2\|f\|1\n2\|3\|t\|1$/, qr/^$/); @@ -1591,8 +1613,7 @@ INSERT INTO tmp_ok SELECT generate_series(1, 5000); $io_method, $psql_a, "$persistency: read buffers, in-progress 3, read 1-3", - qq|SELECT blockoff, blocknum, io_reqd, nblocks FROM -read_buffers('$table', 1, 3)|, + qq|SELECT blockoff, blocknum, io_reqd and not foreign_io, nblocks FROM read_buffers('$table', 1, 3)|, qr/^0\|1\|t\|2\n2\|3\|f\|1$/, qr/^$/); } @@ -1620,14 +1641,15 @@ read_buffers('$table', 1, 3)|, $node, $psql_a, "$persistency: read buffers blocks waiting for concurrent IO", - qq|SELECT blockoff, blocknum, io_reqd, nblocks FROM read_buffers('$table', 1, 5);\n|, - "BufferIo"); + qq|SELECT blockoff, blocknum, io_reqd, foreign_io, nblocks FROM read_buffers('$table', 1, 5);\n|, + "BufferIo", + 1); $psql_b->query_safe( qq|SELECT buffer_call_terminate_io($buf_id, for_input=>true, succeed=>false, io_error=>false, release_aio=>false)| ); - pump_until( - $psql_a->{run}, $psql_a->{timeout}, - \$psql_a->{stdout}, qr/0\|1\|t\|2\n2\|3\|t\|3/); + # Because no IO wref was assigned, block 3 should not report foreign IO + pump_until($psql_a->{run}, $psql_a->{timeout}, \$psql_a->{stdout}, + qr/0\|1\|t\|f\|2\n2\|3\|t\|f\|3/); ok(1, "$io_method: $persistency: IO was split due to concurrent failed IO" ); @@ -1645,13 +1667,15 @@ read_buffers('$table', 1, 3)|, $node, $psql_a, "$persistency: read buffers blocks waiting for concurrent IO", - qq|SELECT blockoff, blocknum, io_reqd, nblocks FROM read_buffers('$table', 1, 5);\n|, - "BufferIo"); + qq|SELECT blockoff, blocknum, io_reqd, foreign_io, nblocks FROM read_buffers('$table', 1, 5);\n|, + "BufferIo", + 1); $psql_b->query_safe( qq|SELECT buffer_call_terminate_io($buf_id, for_input=>true, succeed=>true, io_error=>false, release_aio=>false)| ); + # Because no IO wref was assigned, block 3 should not report foreign IO pump_until($psql_a->{run}, $psql_a->{timeout}, \$psql_a->{stdout}, - qr/0\|1\|t\|2\n2\|3\|f\|1\n3\|4\|t\|2/); + qr/0\|1\|t\|f\|2\n2\|3\|f\|f\|1\n3\|4\|t\|f\|2/); ok(1, "$io_method: $persistency: IO was split due to concurrent successful IO" ); @@ -1662,6 +1686,162 @@ read_buffers('$table', 1, 3)|, } +# Tests for StartReadBuffers() that depend on injection point support +sub test_read_buffers_inject +{ + my $io_method = shift; + my $node = shift; + + my $psql_a = $node->background_psql('postgres', on_error_stop => 0); + my $psql_b = $node->background_psql('postgres', on_error_stop => 0); + my $psql_c = $node->background_psql('postgres', on_error_stop => 0); + + my $expected; + + # We can't easily test waiting for foreign IOs on temporary tables, as the + # waiting in the completion hook will just stall the backend. For worker + # that is because temporary table IO is executed synchronously, for + # io_uring the completion will be executed in the same process, but due to + # temporary tables not being shared, we can't do the wait in another + # backend. + my $table = 'tbl_ok'; + my $persistency = 'normal'; + + + ### + # Test if a read buffers encounters AIO in progress by another backend, it + # recognizes that other IO as a foreign IO. + ### + $psql_a->query_safe(qq|SELECT evict_rel('$table')|); + + # B: Trigger wait in the next AIO read for block 1. + $psql_b->query_safe( + qq/SELECT inj_io_completion_wait(pid=>pg_backend_pid(), + relfilenode=>pg_relation_filenode('$table'), + blockno=>1);/); + ok(1, + "$io_method: $persistency: configure wait in completion of block 1"); + + # B: Read block 1 and wait for the completion hook to be reached (which could + # be in B itself or in an IO worker) + query_wait_block( + $io_method, + $node, + $psql_b, + "$persistency: wait in completion of block 1", + qq|SELECT read_rel_block_ll('$table', blockno=>1, nblocks=>1)|, + 'completion_wait', + 0); + + # A: Start read, wait until we're waiting for IO completion + query_wait_block( + $io_method, + $node, + $psql_a, + "$persistency: read 1-4, blocked on in-progress 1", + qq|SELECT blockoff, blocknum, io_reqd, foreign_io, nblocks FROM read_buffers('$table', 1, 4)|, + "AioIoCompletion", + 1); + + # C: Release B from completion hook + $psql_c->query_safe(qq|SELECT inj_io_completion_continue()|); + ok(1, "$io_method: $persistency: continued completion of block 1"); + + # A: Check that we recognized the foreign IO wait, if possible + # + # Due to sync mode not actually issuing IO below StartReadBuffers(), we + # can't observe encountering foreign IO. It still seems worth exercising these + # paths however. + if ($io_method ne 'sync') + { + # A foreign IO covering block 1, and one IO covering blocks 2-4. + $expected = qr/0\|1\|t\|t\|1\n1\|2\|t\|f\|3/; + } + else + { + # One IO covering everything, as that's what StartReadBuffers() will + # return for something with misses in sync mode. + $expected = qr/0\|1\|t\|f\|4/; + } + pump_until($psql_a->{run}, $psql_a->{timeout}, \$psql_a->{stdout}, + $expected); + ok(1, + "$io_method: $persistency: read 1-3, blocked on in-progress 1, see expected result" + ); + $psql_a->{stdout} = ''; + + + ### + # Test if a read buffers encounters AIO in progress by another backend, it + # recognizes that other IO as a foreign IO. This time we encounter the + # foreign IO multiple times. + ### + $psql_a->query_safe(qq|SELECT evict_rel('$table')|); + + # B: Trigger wait in the next AIO read for block 3. + $psql_b->query_safe( + qq/SELECT inj_io_completion_wait(pid=>pg_backend_pid(), + relfilenode=>pg_relation_filenode('$table'), + blockno=>3);/); + ok(1, + "$io_method: $persistency: configure wait in completion of block 3"); + + # B: Read block 2-3 and wait for the completion hook to be reached (which + # could be in B itself or in an IO worker) + query_wait_block( + $io_method, + $node, + $psql_b, + "$persistency: wait in completion of block 2+3", + qq|SELECT read_rel_block_ll('$table', blockno=>2, nblocks=>2)|, + 'completion_wait', + 0); + + # A: Start read, wait until we're waiting for IO completion + # + # Note that we need to defer waiting for IO until the end of + # read_buffers(), to be able to see that the IO on 3 is still in progress. + query_wait_block( + $io_method, + $node, + $psql_a, + "$persistency: read 0-3, blocked on in-progress 2+3", + qq|SELECT blockoff, blocknum, io_reqd, foreign_io, nblocks FROM +read_buffers('$table', 0, 4)|, + "AioIoCompletion", 1); + + # C: Release B from completion hook + $psql_c->query_safe(qq|SELECT inj_io_completion_continue()|); + ok(1, "$io_method: $persistency: continued completion of block 2+3"); + + # A: Check that we recognized the foreign IO wait, if possible + # + # See comment further up about sync mode. + if ($io_method ne 'sync') + { + # One IO covering blocks 0-1, A foreign IO covering block 2, and a + # foreign IO covering block 3 (same wref as for block 2). + $expected = qr/0\|0\|t\|f\|2\n2\|2\|t\|t\|1\n3\|3\|t\|t\|1/; + } + else + { + # One IO covering everything, as that's what StartReadBuffers() will + # return for something with misses in sync mode. + $expected = qr/0\|0\|t\|f\|4/; + } + pump_until($psql_a->{run}, $psql_a->{timeout}, \$psql_a->{stdout}, + $expected); + ok(1, + "$io_method: $persistency: read 0-3, blocked on in-progress 2+3, see expected result" + ); + $psql_a->{stdout} = ''; + + + $psql_a->quit(); + $psql_b->quit(); + $psql_c->quit(); +} + # Run all tests that for the specified node / io_method sub test_io_method { @@ -1705,6 +1885,7 @@ CHECKPOINT; skip 'Injection points not supported by this build', 1 unless $ENV{enable_injection_points} eq 'yes'; test_inject($io_method, $node); + test_read_buffers_inject($io_method, $node); } # worker specific injection tests diff --git a/src/test/modules/test_aio/test_aio--1.0.sql b/src/test/modules/test_aio/test_aio--1.0.sql index 53c83b74e53..762ac29512f 100644 --- a/src/test/modules/test_aio/test_aio--1.0.sql +++ b/src/test/modules/test_aio/test_aio--1.0.sql @@ -53,7 +53,7 @@ CREATE FUNCTION buffer_call_terminate_io(buffer int, for_input bool, succeed boo RETURNS pg_catalog.void STRICT AS 'MODULE_PATHNAME' LANGUAGE C; -CREATE FUNCTION read_buffers(rel regclass, startblock int4, nblocks int4, OUT blockoff int4, OUT blocknum int4, OUT io_reqd bool, OUT nblocks int4, OUT buf int4[]) +CREATE FUNCTION read_buffers(rel regclass, startblock int4, nblocks int4, OUT blockoff int4, OUT blocknum int4, OUT io_reqd bool, OUT foreign_io bool, OUT nblocks int4, OUT buf int4[]) RETURNS SETOF record STRICT AS 'MODULE_PATHNAME' LANGUAGE C; diff --git a/src/test/modules/test_aio/test_aio.c b/src/test/modules/test_aio/test_aio.c index a0ad2313661..a8267192cb7 100644 --- a/src/test/modules/test_aio/test_aio.c +++ b/src/test/modules/test_aio/test_aio.c @@ -778,8 +778,8 @@ read_buffers(PG_FUNCTION_ARGS) { ReadBuffersOperation *operation = &operations[nio]; int nblocks_this_io = operation->nblocks; - Datum values[5] = {0}; - bool nulls[5] = {0}; + Datum values[6] = {0}; + bool nulls[6] = {0}; ArrayType *buffers_arr; /* convert buffer array to datum array */ @@ -809,14 +809,18 @@ read_buffers(PG_FUNCTION_ARGS) values[2] = BoolGetDatum(io_reqds[nio]); nulls[2] = false; - /* nblocks */ - values[3] = Int32GetDatum(nblocks_this_io); + /* foreign IO */ + values[3] = BoolGetDatum(operation->foreign_io); nulls[3] = false; - /* array of buffers */ - values[4] = PointerGetDatum(buffers_arr); + /* nblocks */ + values[4] = Int32GetDatum(nblocks_this_io); nulls[4] = false; + /* array of buffers */ + values[5] = PointerGetDatum(buffers_arr); + nulls[5] = false; + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); nblocks_disp += nblocks_this_io;