* b) reports some time as waiting, even if we never waited
*
* we first check if we already know the IO is complete.
+ *
+ * Note that operation->io_return is uninitialized for foreign IO,
+ * so we cannot use the cheaper PGAIO_RS_UNKNOWN pre-check.
*/
- if (aio_ret->result.status == PGAIO_RS_UNKNOWN &&
+ if ((operation->foreign_io || aio_ret->result.status == PGAIO_RS_UNKNOWN) &&
!pgaio_wref_check_done(&operation->io_wref))
{
instr_time io_start = pgstat_prepare_io_time(track_io_timing);
Assert(pgaio_wref_check_done(&operation->io_wref));
}
- /*
- * We now are sure the IO completed. Check the results. This
- * includes reporting on errors if there were any.
- */
- ProcessReadBuffersResult(operation);
+ if (unlikely(operation->foreign_io))
+ {
+ Buffer buffer = operation->buffers[operation->nblocks_done];
+ BufferDesc *desc = BufferIsLocal(buffer) ?
+ GetLocalBufferDescriptor(-buffer - 1) :
+ GetBufferDescriptor(buffer - 1);
+ uint64 buf_state = pg_atomic_read_u64(&desc->state);
+
+ if (buf_state & BM_VALID)
+ {
+ BlockNumber blocknum = operation->blocknum + operation->nblocks_done;
+
+ operation->nblocks_done += 1;
+ Assert(operation->nblocks_done <= operation->nblocks);
+
+ /*
+ * Track this as a 'hit' for this backend. The backend
+ * performing the IO will track it as a 'read'.
+ */
+ TrackBufferHit(io_object, io_context,
+ operation->rel, operation->persistence,
+ operation->smgr, operation->forknum,
+ blocknum);
+ }
+
+ /*
+ * If the foreign IO failed and left the buffer invalid,
+ * nblocks_done is not incremented. The retry loop below will
+ * call AsyncReadBuffers() which will attempt the IO itself.
+ */
+ }
+ else
+ {
+ /*
+ * We now are sure the IO completed. Check the results. This
+ * includes reporting on errors if there were any.
+ */
+ ProcessReadBuffersResult(operation);
+ }
}
/*
* affected by the call. If the first buffer is valid, *nblocks_progress is
* set to 1 and operation->nblocks_done is incremented.
*
- * Returns true if IO was initiated, false if no IO was necessary.
+ * Returns true if IO was initiated or is already in progress (foreign IO),
+ * false if the buffer was already valid.
*/
static bool
AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress)
pgstat_prepare_report_checksum_failure(operation->smgr->smgr_rlocator.locator.dbOid);
/*
- * Get IO handle before StartBufferIO(), as pgaio_io_acquire() might
- * block, which we don't want after setting IO_IN_PROGRESS.
+ * We must get an IO handle before StartBufferIO(), as pgaio_io_acquire()
+ * might block, which we don't want after setting IO_IN_PROGRESS. If we
+ * don't need to do the IO, we'll release the handle.
*
* If we need to wait for IO before we can get a handle, submit
* already-staged IO first, so that other backends don't need to wait.
ioh = pgaio_io_acquire(CurrentResourceOwner, &operation->io_return);
}
+ operation->foreign_io = false;
+ pgaio_wref_clear(&operation->io_wref);
+
/*
- * Check if we can start IO on the first to-be-read buffer.
+ * Try to start IO on the first buffer in a new run of blocks. If AIO is
+ * in progress, be it in this backend or another backend, we just
+ * associate the wait reference with the operation and wait in
+ * WaitReadBuffers(). This turns out to be important for performance in
+ * two workloads:
+ *
+ * 1) A read stream that has to read the same block multiple times within
+ * the readahead distance. This can happen e.g. for the table accesses of
+ * an index scan.
+ *
+ * 2) Concurrent scans by multiple backends on the same relation.
+ *
+ * If we were to synchronously wait for the in-progress IO, we'd not be
+ * able to keep enough I/O in flight.
+ *
+ * If we do find there is ongoing I/O for the buffer, we set up a 1-block
+ * ReadBuffersOperation that WaitReadBuffers then can wait on.
*
- * If an I/O is already in progress in another backend, we want to wait
- * for the outcome: either done, or something went wrong and we will
- * retry.
+ * It's possible that another backend has started IO on the buffer but not
+ * yet set its wait reference. In this case, we have no choice but to wait
+ * for either the wait reference to be valid or the IO to be done.
*/
- status = StartBufferIO(buffers[nblocks_done], true, true, NULL);
+ status = StartBufferIO(buffers[nblocks_done], true, true,
+ &operation->io_wref);
if (status != BUFFER_IO_READY_FOR_IO)
{
pgaio_io_release(ioh);
/* The IO is already in-progress */
Assert(status == BUFFER_IO_IN_PROGRESS);
+ Assert(pgaio_wref_valid(&operation->io_wref));
+ operation->foreign_io = true;
return true;
}
return $output;
}
+# Issue query, wait for the specified wait event to be reached. If
+# wait_current_session is true, we will wait for the event in the current
+# session, otherwise we'll wait for any session.
sub query_wait_block
{
local $Test::Builder::Level = $Test::Builder::Level + 1;
my $name = shift;
my $sql = shift;
my $waitfor = shift;
+ my $wait_current_session = shift;
my $pid = $psql->query_safe('SELECT pg_backend_pid()');
$psql->{stdin} .= qq($sql;\n);
$psql->{run}->pump_nb();
+ note "issued sql: $sql;\n";
ok(1, "$io_method: $name: issued sql");
- $node->poll_query_until('postgres',
- qq(SELECT wait_event FROM pg_stat_activity WHERE pid = $pid),
- $waitfor);
+ my $waitquery;
+ if ($wait_current_session)
+ {
+ $waitquery =
+ qq(SELECT wait_event FROM pg_stat_activity WHERE pid = $pid);
+ }
+ else
+ {
+ $waitquery =
+ qq(SELECT wait_event FROM pg_stat_activity WHERE wait_event = '$waitfor');
+ }
+
+ note "polling for completion with $waitquery";
+ $node->poll_query_until('postgres', $waitquery, $waitfor);
ok(1, "$io_method: $name: observed $waitfor wait event");
}
$psql_b,
"blocking start buffer io",
qq(SELECT buffer_call_start_io($buf_id, for_input=>true, wait=>true);),
- "BufferIo");
+ "BufferIo",
+ 1);
# Terminate the IO, without marking it as success, this should trigger the
# waiting session to be able to start the io
$psql_b,
"blocking start buffer io",
qq(SELECT buffer_call_start_io($buf_id, for_input=>true, wait=>true);),
- "BufferIo");
+ "BufferIo",
+ 1);
# Terminate the IO, marking it as success
psql_like(
# Test encountering buffer IO we started in the first block of the
# range.
+ #
+ # Depending on how quick the IO we start completes, the IO might be
+ # completed or we "join" the foreign IO. To hide that variability, the
+ # query below treats a foreign IO as not having needed to do IO.
$psql_a->query_safe(qq|SELECT evict_rel('$table')|);
$psql_a->query_safe(
qq|SELECT read_rel_block_ll('$table', 1, wait_complete=>false)|);
$io_method,
$psql_a,
"$persistency: read buffers, in-progress 1, read 1-3",
- qq|SELECT blockoff, blocknum, io_reqd, nblocks FROM read_buffers('$table', 1, 3)|,
+ qq|SELECT blockoff, blocknum, io_reqd and not foreign_io, nblocks FROM read_buffers('$table', 1, 3)|,
qr/^0\|1\|f\|1\n1\|2\|t\|2$/,
qr/^$/);
$io_method,
$psql_a,
"$persistency: read buffers, in-progress 2, read 1-3",
- qq|SELECT blockoff, blocknum, io_reqd, nblocks FROM read_buffers('$table', 1, 3)|,
+ qq|SELECT blockoff, blocknum, io_reqd and not foreign_io, nblocks FROM read_buffers('$table', 1, 3)|,
qr/^0\|1\|t\|1\n1\|2\|f\|1\n2\|3\|t\|1$/,
qr/^$/);
$io_method,
$psql_a,
"$persistency: read buffers, in-progress 3, read 1-3",
- qq|SELECT blockoff, blocknum, io_reqd, nblocks FROM
-read_buffers('$table', 1, 3)|,
+ qq|SELECT blockoff, blocknum, io_reqd and not foreign_io, nblocks FROM read_buffers('$table', 1, 3)|,
qr/^0\|1\|t\|2\n2\|3\|f\|1$/,
qr/^$/);
}
$node,
$psql_a,
"$persistency: read buffers blocks waiting for concurrent IO",
- qq|SELECT blockoff, blocknum, io_reqd, nblocks FROM read_buffers('$table', 1, 5);\n|,
- "BufferIo");
+ qq|SELECT blockoff, blocknum, io_reqd, foreign_io, nblocks FROM read_buffers('$table', 1, 5);\n|,
+ "BufferIo",
+ 1);
$psql_b->query_safe(
qq|SELECT buffer_call_terminate_io($buf_id, for_input=>true, succeed=>false, io_error=>false, release_aio=>false)|
);
- pump_until(
- $psql_a->{run}, $psql_a->{timeout},
- \$psql_a->{stdout}, qr/0\|1\|t\|2\n2\|3\|t\|3/);
+ # Because no IO wref was assigned, block 3 should not report foreign IO
+ pump_until($psql_a->{run}, $psql_a->{timeout}, \$psql_a->{stdout},
+ qr/0\|1\|t\|f\|2\n2\|3\|t\|f\|3/);
ok(1,
"$io_method: $persistency: IO was split due to concurrent failed IO"
);
$node,
$psql_a,
"$persistency: read buffers blocks waiting for concurrent IO",
- qq|SELECT blockoff, blocknum, io_reqd, nblocks FROM read_buffers('$table', 1, 5);\n|,
- "BufferIo");
+ qq|SELECT blockoff, blocknum, io_reqd, foreign_io, nblocks FROM read_buffers('$table', 1, 5);\n|,
+ "BufferIo",
+ 1);
$psql_b->query_safe(
qq|SELECT buffer_call_terminate_io($buf_id, for_input=>true, succeed=>true, io_error=>false, release_aio=>false)|
);
+ # Because no IO wref was assigned, block 3 should not report foreign IO
pump_until($psql_a->{run}, $psql_a->{timeout}, \$psql_a->{stdout},
- qr/0\|1\|t\|2\n2\|3\|f\|1\n3\|4\|t\|2/);
+ qr/0\|1\|t\|f\|2\n2\|3\|f\|f\|1\n3\|4\|t\|f\|2/);
ok(1,
"$io_method: $persistency: IO was split due to concurrent successful IO"
);
}
+# Tests for StartReadBuffers() that depend on injection point support
+sub test_read_buffers_inject
+{
+ my $io_method = shift;
+ my $node = shift;
+
+ my $psql_a = $node->background_psql('postgres', on_error_stop => 0);
+ my $psql_b = $node->background_psql('postgres', on_error_stop => 0);
+ my $psql_c = $node->background_psql('postgres', on_error_stop => 0);
+
+ my $expected;
+
+ # We can't easily test waiting for foreign IOs on temporary tables, as the
+ # waiting in the completion hook will just stall the backend. For worker
+ # that is because temporary table IO is executed synchronously, for
+ # io_uring the completion will be executed in the same process, but due to
+ # temporary tables not being shared, we can't do the wait in another
+ # backend.
+ my $table = 'tbl_ok';
+ my $persistency = 'normal';
+
+
+ ###
+ # Test if a read buffers encounters AIO in progress by another backend, it
+ # recognizes that other IO as a foreign IO.
+ ###
+ $psql_a->query_safe(qq|SELECT evict_rel('$table')|);
+
+ # B: Trigger wait in the next AIO read for block 1.
+ $psql_b->query_safe(
+ qq/SELECT inj_io_completion_wait(pid=>pg_backend_pid(),
+ relfilenode=>pg_relation_filenode('$table'),
+ blockno=>1);/);
+ ok(1,
+ "$io_method: $persistency: configure wait in completion of block 1");
+
+ # B: Read block 1 and wait for the completion hook to be reached (which could
+ # be in B itself or in an IO worker)
+ query_wait_block(
+ $io_method,
+ $node,
+ $psql_b,
+ "$persistency: wait in completion of block 1",
+ qq|SELECT read_rel_block_ll('$table', blockno=>1, nblocks=>1)|,
+ 'completion_wait',
+ 0);
+
+ # A: Start read, wait until we're waiting for IO completion
+ query_wait_block(
+ $io_method,
+ $node,
+ $psql_a,
+ "$persistency: read 1-4, blocked on in-progress 1",
+ qq|SELECT blockoff, blocknum, io_reqd, foreign_io, nblocks FROM read_buffers('$table', 1, 4)|,
+ "AioIoCompletion",
+ 1);
+
+ # C: Release B from completion hook
+ $psql_c->query_safe(qq|SELECT inj_io_completion_continue()|);
+ ok(1, "$io_method: $persistency: continued completion of block 1");
+
+ # A: Check that we recognized the foreign IO wait, if possible
+ #
+ # Due to sync mode not actually issuing IO below StartReadBuffers(), we
+ # can't observe encountering foreign IO. It still seems worth exercising these
+ # paths however.
+ if ($io_method ne 'sync')
+ {
+ # A foreign IO covering block 1, and one IO covering blocks 2-4.
+ $expected = qr/0\|1\|t\|t\|1\n1\|2\|t\|f\|3/;
+ }
+ else
+ {
+ # One IO covering everything, as that's what StartReadBuffers() will
+ # return for something with misses in sync mode.
+ $expected = qr/0\|1\|t\|f\|4/;
+ }
+ pump_until($psql_a->{run}, $psql_a->{timeout}, \$psql_a->{stdout},
+ $expected);
+ ok(1,
+ "$io_method: $persistency: read 1-3, blocked on in-progress 1, see expected result"
+ );
+ $psql_a->{stdout} = '';
+
+
+ ###
+ # Test if a read buffers encounters AIO in progress by another backend, it
+ # recognizes that other IO as a foreign IO. This time we encounter the
+ # foreign IO multiple times.
+ ###
+ $psql_a->query_safe(qq|SELECT evict_rel('$table')|);
+
+ # B: Trigger wait in the next AIO read for block 3.
+ $psql_b->query_safe(
+ qq/SELECT inj_io_completion_wait(pid=>pg_backend_pid(),
+ relfilenode=>pg_relation_filenode('$table'),
+ blockno=>3);/);
+ ok(1,
+ "$io_method: $persistency: configure wait in completion of block 3");
+
+ # B: Read block 2-3 and wait for the completion hook to be reached (which
+ # could be in B itself or in an IO worker)
+ query_wait_block(
+ $io_method,
+ $node,
+ $psql_b,
+ "$persistency: wait in completion of block 2+3",
+ qq|SELECT read_rel_block_ll('$table', blockno=>2, nblocks=>2)|,
+ 'completion_wait',
+ 0);
+
+ # A: Start read, wait until we're waiting for IO completion
+ #
+ # Note that we need to defer waiting for IO until the end of
+ # read_buffers(), to be able to see that the IO on 3 is still in progress.
+ query_wait_block(
+ $io_method,
+ $node,
+ $psql_a,
+ "$persistency: read 0-3, blocked on in-progress 2+3",
+ qq|SELECT blockoff, blocknum, io_reqd, foreign_io, nblocks FROM
+read_buffers('$table', 0, 4)|,
+ "AioIoCompletion", 1);
+
+ # C: Release B from completion hook
+ $psql_c->query_safe(qq|SELECT inj_io_completion_continue()|);
+ ok(1, "$io_method: $persistency: continued completion of block 2+3");
+
+ # A: Check that we recognized the foreign IO wait, if possible
+ #
+ # See comment further up about sync mode.
+ if ($io_method ne 'sync')
+ {
+ # One IO covering blocks 0-1, A foreign IO covering block 2, and a
+ # foreign IO covering block 3 (same wref as for block 2).
+ $expected = qr/0\|0\|t\|f\|2\n2\|2\|t\|t\|1\n3\|3\|t\|t\|1/;
+ }
+ else
+ {
+ # One IO covering everything, as that's what StartReadBuffers() will
+ # return for something with misses in sync mode.
+ $expected = qr/0\|0\|t\|f\|4/;
+ }
+ pump_until($psql_a->{run}, $psql_a->{timeout}, \$psql_a->{stdout},
+ $expected);
+ ok(1,
+ "$io_method: $persistency: read 0-3, blocked on in-progress 2+3, see expected result"
+ );
+ $psql_a->{stdout} = '';
+
+
+ $psql_a->quit();
+ $psql_b->quit();
+ $psql_c->quit();
+}
+
# Run all tests that for the specified node / io_method
sub test_io_method
{
skip 'Injection points not supported by this build', 1
unless $ENV{enable_injection_points} eq 'yes';
test_inject($io_method, $node);
+ test_read_buffers_inject($io_method, $node);
}
# worker specific injection tests