]> git.ipfire.org Git - thirdparty/postgresql.git/commitdiff
Test stabilization for online checksums
authorDaniel Gustafsson <dgustafsson@postgresql.org>
Mon, 6 Apr 2026 00:03:10 +0000 (02:03 +0200)
committerDaniel Gustafsson <dgustafsson@postgresql.org>
Mon, 6 Apr 2026 00:03:10 +0000 (02:03 +0200)
Postcommit review and buildfarm/CI failures revealed a few issues in
the test code which this commit attempts to resolve.  These failures
are verified using synthetic means.

  * Wait for launcher exit in enable/disable checksum tests

    When enabling or disabling data checksums in a test with waiting
    for an end state (on or off), the test typically want to perform
    more test against the cluster immediately. Make sure to wait for
    the launcher to exit in these cases before returning in order to
    know it can immediately be acted on.  This is a more generic way
    of implementating 0036232ba8f.

  * Refactor injection point tests to use the injection_points test
    extension. Two injection points added for online checksums were
    better expressed using the injection_points extension with the
    test code embedded in datachecksum_state.c.

  * Make tests less timing dependent and allow transitions to "on"
    and not just "inprogress-on" in case a test manages to finish
    before it's checked for state.

  * When waiting on a blocking background psql keeping a temporary
    table open, the test first closed the background session abd
    then the server.  This could cause data checksums to manage to
    get enabled in the brief window between dropping the temporary
    table and closing the server.  Fix by closing the server first
    before the background session.

  * Remove a few superfluous duplicate checks and general cleanup
    of comments as well as making LSN logging consistent.

These issues were reported by Andres as well as spotted in the
buildfarm and on CI.

Author: Daniel Gustafsson <daniel@yesql.se>
Reported-by: Andres Freund <andres@anarazel.de>
Discussion: https://postgr.es/m/92F25C14-801E-4198-994D-D83E31FEB0D8@yesql.se

src/backend/postmaster/datachecksum_state.c
src/test/modules/test_checksums/t/001_basic.pl
src/test/modules/test_checksums/t/003_standby_restarts.pl
src/test/modules/test_checksums/t/004_offline.pl
src/test/modules/test_checksums/t/005_injection.pl
src/test/modules/test_checksums/t/006_pgbench_single.pl
src/test/modules/test_checksums/t/007_pgbench_standby.pl
src/test/modules/test_checksums/t/008_pitr.pl
src/test/modules/test_checksums/t/DataChecksums/Utils.pm
src/test/modules/test_checksums/test_checksums--1.0.sql
src/test/modules/test_checksums/test_checksums.c

index 430e6e9a3c9bacf38c15d264952bdf9203e5d11b..1243949eacbfd9b47fe76ebf982a233cf5728f05 100644 (file)
@@ -1216,8 +1216,15 @@ ProcessAllDatabases(void)
 
                result = ProcessDatabase(db);
 
+#ifdef USE_INJECTION_POINTS
                /* Allow a test process to alter the result of the operation */
-               INJECTION_POINT("datachecksumsworker-modify-db-result", &result);
+               if (IS_INJECTION_POINT_ATTACHED("datachecksumsworker-fail-db-result"))
+               {
+                       result = DATACHECKSUMSWORKER_FAILED;
+                       INJECTION_POINT_CACHED("datachecksumsworker-fail-db-result",
+                                                                  db->dbname);
+               }
+#endif
 
                pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_DBS_DONE,
                                                                         ++cumulative_total);
@@ -1451,6 +1458,9 @@ DataChecksumsWorkerMain(Datum arg)
        BufferAccessStrategy strategy;
        bool            aborted = false;
        int64           rels_done;
+#ifdef USE_INJECTION_POINTS
+       bool            retried = false;
+#endif
 
        operation = ENABLE_DATACHECKSUMS;
 
@@ -1566,7 +1576,19 @@ DataChecksumsWorkerMain(Datum arg)
                }
                list_free(CurrentTempTables);
 
-               INJECTION_POINT("datachecksumsworker-fake-temptable-wait", &numleft);
+#ifdef USE_INJECTION_POINTS
+               if (IS_INJECTION_POINT_ATTACHED("datachecksumsworker-fake-temptable-wait"))
+               {
+                       /* Make sure to just cause one retry */
+                       if (!retried && numleft == 0)
+                       {
+                               numleft = 1;
+                               retried = true;
+
+                               INJECTION_POINT_CACHED("datachecksumsworker-fake-temptable-wait", NULL);
+                       }
+               }
+#endif
 
                if (numleft == 0)
                        break;
index 5933c730da1697ef8cc1d06f48a50e30340baa78..a78118320d5515c093dc0afd98388cecd34cc626 100644 (file)
@@ -35,11 +35,7 @@ my $result =
 is($result, '9999', 'ensure checksummed pages can be read back');
 
 # Enable data checksums again which should be a no-op so we explicitly don't
-# wait for any state transition as none should happen here. Make sure to let
-# any running launcher finish in case it's still wrapping up.
-$result = $node->poll_query_until('postgres',
-       "SELECT count(*) = 0 FROM pg_catalog.pg_stat_activity WHERE backend_type = 'datachecksum launcher';"
-);
+# wait for any state transition as none should happen here.
 enable_data_checksums($node);
 test_checksum_state($node, 'on');
 # ..and make sure we can still read/write data
index 6b01692565115e26d370bba5958654d3f96c5284..7ad11417ca6518611f4fe52a42fc480cdfe27344 100644 (file)
@@ -41,7 +41,7 @@ $node_standby->start;
 $node_primary->safe_psql('postgres',
        "CREATE TABLE t AS SELECT generate_series(1,10000) AS a;");
 
-# Wait for standbys to catch up
+# Wait for standby to catch up
 $node_primary->wait_for_catchup($node_standby, 'replay',
        $node_primary->lsn('insert'));
 
@@ -54,8 +54,14 @@ test_checksum_state($node_standby, 'off');
 # standby change state.
 #
 
-# Ensure that the primary switches to "inprogress-on"
-enable_data_checksums($node_primary, wait => 'inprogress-on');
+# Initiate enabling of checksums and ensure that the primary switches to
+# either "inprogress-on" or "on"
+enable_data_checksums($node_primary);
+my $result = $node_primary->poll_query_until(
+       'postgres',
+       "SELECT setting = 'off' FROM pg_catalog.pg_settings WHERE name = 'data_checksums';",
+       'f');
+is($result, 1, 'ensure primary has transitioned from off');
 # Wait for checksum enable to be replayed
 $node_primary->wait_for_catchup($node_standby, 'replay');
 
@@ -63,7 +69,7 @@ $node_primary->wait_for_catchup($node_standby, 'replay');
 # would be "inprogress-on", but it is theoretically possible for the primary to
 # complete the checksum enabling *and* have the standby replay that record
 # before we reach the check below.
-my $result = $node_standby->poll_query_until(
+$result = $node_standby->poll_query_until(
        'postgres',
        "SELECT setting = 'off' FROM pg_catalog.pg_settings WHERE name = 'data_checksums';",
        'f');
index f1972bddff1bf97b9db8d8ce62fa807ea59fb6d8..73c279e75e00f59b6c93cbf654c4162f91be83ed 100644 (file)
@@ -62,11 +62,15 @@ $result = $node->safe_psql('postgres',
        "SELECT relpersistence FROM pg_catalog.pg_class WHERE relname = 'tt';");
 is($result, 't', 'ensure we can see the temporary table');
 
+# Enable, but stop waiting at inprogress-on since it will sit there until the
+# above temporary table is removed.
 enable_data_checksums($node, wait => 'inprogress-on');
 
-# Turn the cluster off and enable checksums offline, then start back up
+# Turn the cluster off and enable checksums offline, then start back up.
+# Stop the cluster before exiting the background session since otherwise
+# checksums might have time to get enabled before shutting down the cluster.
+$node->stop('fast');
 $bsession->quit;
-$node->stop;
 $node->checksum_enable_offline;
 $node->start;
 
index 897f282a1f2f7f76381633378131783aa51186d1..a37a24dbbadb1b0ce8419371bf0a74544ccee96f 100644 (file)
@@ -32,6 +32,7 @@ $node->start;
 
 # Set up test environment
 $node->safe_psql('postgres', 'CREATE EXTENSION test_checksums;');
+$node->safe_psql('postgres', 'CREATE EXTENSION injection_points;');
 
 # ---------------------------------------------------------------------------
 # Inducing failures and crashes in processing
@@ -39,9 +40,12 @@ $node->safe_psql('postgres', 'CREATE EXTENSION test_checksums;');
 # Force enabling checksums to fail by marking one of the databases as having
 # failed in processing.
 disable_data_checksums($node, wait => 1);
-$node->safe_psql('postgres', 'SELECT dcw_inject_fail_database(true);');
+$node->safe_psql('postgres',
+       "SELECT injection_points_attach('datachecksumsworker-fail-db-result','notice');"
+);
 enable_data_checksums($node, wait => 'off');
-$node->safe_psql('postgres', 'SELECT dcw_inject_fail_database(false);');
+$node->safe_psql('postgres',
+       "SELECT injection_points_detach('datachecksumsworker-fail-db-result');");
 
 # Make sure that disabling after a failure works
 disable_data_checksums($node);
@@ -66,7 +70,9 @@ SKIP:
        # will force the processing to wait and retry in order to wait for it to
        # disappear.
        disable_data_checksums($node, wait => 1);
-       $node->safe_psql('postgres', 'SELECT dcw_fake_temptable(true);');
+       $node->safe_psql('postgres',
+               "SELECT injection_points_attach('datachecksumsworker-fake-temptable-wait', 'notice');"
+       );
        enable_data_checksums($node, wait => 'on');
 }
 
index 0ab5b04b9313b59008e9f90d3961628c78150555..f5ccc1b5b082d84eccf55e6177febeadcb3a4fda 100644 (file)
@@ -89,14 +89,21 @@ sub background_rw_pgbench
 # before and after state.
 sub flip_data_checksums
 {
+       my $temptablewait = 0;
+
        # First, make sure the cluster is in the state we expect it to be
        test_checksum_state($node, $data_checksum_state);
 
        if ($data_checksum_state eq 'off')
        {
                # Coin-toss to see if we are injecting a retry due to a temptable
-               $node->safe_psql('postgres', 'SELECT dcw_fake_temptable();')
-                 if cointoss();
+               if (cointoss())
+               {
+                       $node->safe_psql('postgres',
+                               "SELECT injection_points_attach('datachecksumsworker-fake-temptable-wait', 'notice');"
+                       );
+                       $temptablewait = 1;
+               }
 
                # log LSN right before we start changing checksums
                my $result =
@@ -117,7 +124,9 @@ sub flip_data_checksums
 
                random_sleep() if ($extended);
 
-               $node->safe_psql('postgres', 'SELECT dcw_fake_temptable(false);');
+               $node->safe_psql('postgres',
+                       "SELECT injection_points_detach('datachecksumsworker-fake-temptable-wait');"
+               ) if ($temptablewait);
                $data_checksum_state = 'on';
        }
        elsif ($data_checksum_state eq 'on')
@@ -165,6 +174,7 @@ log_statement = none
 ]);
 $node->start;
 $node->safe_psql('postgres', 'CREATE EXTENSION test_checksums;');
+$node->safe_psql('postgres', 'CREATE EXTENSION injection_points;');
 # Create some content to have un-checksummed data in the cluster
 $node->safe_psql('postgres',
        "CREATE TABLE t AS SELECT generate_series(1, 100000) AS a;");
index b0d40d24005f310f466e8252520415044f0536a3..f3611e7ce256bfbf33b16c91c2a94a5aeb67f9da 100644 (file)
@@ -99,6 +99,8 @@ sub background_pgbench
 # before and after state.
 sub flip_data_checksums
 {
+       my $temptablewait = 0;
+
        # First, make sure the cluster is in the state we expect it to be
        test_checksum_state($node_primary, $data_checksum_state);
        test_checksum_state($node_standby, $data_checksum_state);
@@ -106,8 +108,13 @@ sub flip_data_checksums
        if ($data_checksum_state eq 'off')
        {
                # Coin-toss to see if we are injecting a retry due to a temptable
-               $node_primary->safe_psql('postgres', 'SELECT dcw_fake_temptable();')
-                 if cointoss();
+               if (cointoss())
+               {
+                       $node_primary->safe_psql('postgres',
+                               "SELECT injection_points_attach('datachecksumsworker-fake-temptable-wait', 'notice');"
+                       );
+                       $temptablewait = 1;
+               }
 
                # log LSN right before we start changing checksums
                my $result =
@@ -154,7 +161,8 @@ sub flip_data_checksums
                wait_for_checksum_state($node_standby, 'on');
 
                $node_primary->safe_psql('postgres',
-                       'SELECT dcw_fake_temptable(false);');
+                       "SELECT injection_points_detach('datachecksumsworker-fake-temptable-wait');"
+               ) if ($temptablewait);
                $data_checksum_state = 'on';
        }
        elsif ($data_checksum_state eq 'on')
@@ -170,6 +178,7 @@ sub flip_data_checksums
                $node_primary->wait_for_catchup($node_standby, 'replay');
 
                # Wait for checksums disabled on the primary and standby
+               random_sleep() if ($extended);
                wait_for_checksum_state($node_primary, 'off');
                wait_for_checksum_state($node_standby, 'off');
 
@@ -178,9 +187,6 @@ sub flip_data_checksums
                  $node_primary->safe_psql('postgres', "SELECT pg_current_wal_lsn()");
                note("LSN after disabling: " . $result . "\n");
 
-               random_sleep() if ($extended);
-               wait_for_checksum_state($node_standby, 'off');
-
                $data_checksum_state = 'off';
        }
        else
@@ -207,6 +213,7 @@ log_statement = none
 ]);
 $node_primary->start;
 $node_primary->safe_psql('postgres', 'CREATE EXTENSION test_checksums;');
+$node_primary->safe_psql('postgres', 'CREATE EXTENSION injection_points;');
 # Create some content to have un-checksummed data in the cluster
 $node_primary->safe_psql('postgres',
        "CREATE TABLE t AS SELECT generate_series(1, 100000) AS a;");
index b9b89f414ab4724d6a6e423572a7a4c2cd345f4f..e8cb2b0ed96b61136ea102c53f1dbd4d332995b6 100644 (file)
@@ -67,15 +67,15 @@ sub flip_data_checksums
                # log LSN right before we start changing checksums
                $lsn_pre =
                  $node_primary->safe_psql('postgres', "SELECT pg_current_wal_lsn()");
+               note("LSN before disabling: " . $lsn_pre . "\n");
 
-               disable_data_checksums($node_primary);
-
-               # Wait for checksums disabled on the primary
-               wait_for_checksum_state($node_primary, 'off');
+               # Disable checksums on the primary and wait for completion
+               disable_data_checksums($node_primary, wait => 1);
 
                # log LSN right after the primary flips checksums to "off"
                $lsn_post =
                  $node_primary->safe_psql('postgres', "SELECT pg_current_wal_lsn()");
+               note("LSN after disabling: " . $lsn_post . "\n");
 
                $data_checksum_state = 'off';
        }
index 9a2269e8a922d1e08fe7d573e682d565cf7fa155..fb704623a6006511127790a9a7b5cf8bea894631 100644 (file)
@@ -175,8 +175,19 @@ EOQ
        $postgresnode->safe_psql('postgres',
                sprintf($query, $params{cost_delay}, $params{cost_limit}));
 
-       wait_for_checksum_state($postgresnode, $params{wait})
-         if (defined($params{wait}));
+       if (defined($params{wait}))
+       {
+               wait_for_checksum_state($postgresnode, $params{wait});
+               # If we are tasked with waiting for an end state, also wait for the
+               # launcher to exit.
+               if ($params{wait} eq 'on' || $params{wait} eq 'off')
+               {
+                       $postgresnode->poll_query_until('postgres',
+                                       "SELECT count(*) = 0 "
+                                 . "FROM pg_catalog.pg_stat_activity "
+                                 . "WHERE backend_type = 'datachecksum launcher';");
+               }
+       }
 }
 
 =item disable_data_checksums($node, %params)
@@ -204,7 +215,14 @@ sub disable_data_checksums
        $postgresnode->safe_psql('postgres',
                'SELECT pg_disable_data_checksums();');
 
-       wait_for_checksum_state($postgresnode, 'off') if (defined($params{wait}));
+       if (defined($params{wait}))
+       {
+               wait_for_checksum_state($postgresnode, 'off');
+               $postgresnode->poll_query_until('postgres',
+                               "SELECT count(*) = 0 "
+                         . "FROM pg_catalog.pg_stat_activity "
+                         . "WHERE backend_type = 'datachecksum launcher';");
+       }
 }
 
 =item cointoss
index 90642d247fa6597fcff65376f6418e3decd0a5d9..c674fee02bbb4f12bc63572ec1d0ad1b00b8893d 100644 (file)
@@ -14,11 +14,3 @@ CREATE FUNCTION dcw_inject_launcher_delay(attach boolean DEFAULT true)
 CREATE FUNCTION dcw_inject_startup_delay(attach boolean DEFAULT true)
        RETURNS pg_catalog.void
        AS 'MODULE_PATHNAME' LANGUAGE C;
-
-CREATE FUNCTION dcw_inject_fail_database(attach boolean DEFAULT true)
-       RETURNS pg_catalog.void
-       AS 'MODULE_PATHNAME' LANGUAGE C;
-
-CREATE FUNCTION dcw_fake_temptable(attach boolean DEFAULT true)
-       RETURNS pg_catalog.void
-       AS 'MODULE_PATHNAME' LANGUAGE C;
index e2b91f9c78c13a68d01bb474f8858074b003edd8..c2eabc2821c5ca636d81d904ffbf66b66293de3f 100644 (file)
@@ -25,8 +25,6 @@ extern PGDLLEXPORT void dc_delay_barrier(const char *name, const void *private_d
 extern PGDLLEXPORT void dc_modify_db_result(const char *name, const void *private_data, void *arg);
 extern PGDLLEXPORT void dc_fake_temptable(const char *name, const void *private_data, void *arg);
 
-extern PGDLLEXPORT void crash(const char *name, const void *private_data, void *arg);
-
 /*
  * Test for delaying emission of procsignalbarriers.
  */
@@ -107,80 +105,3 @@ dcw_inject_startup_delay(PG_FUNCTION_ARGS)
 #endif
        PG_RETURN_VOID();
 }
-
-#ifdef USE_INJECTION_POINTS
-static uint32 db_fail = DATACHECKSUMSWORKER_FAILED;
-#endif
-
-void
-dc_modify_db_result(const char *name, const void *private_data, void *arg)
-{
-       DataChecksumsWorkerResult *res = (DataChecksumsWorkerResult *) arg;
-       uint32          new_res = *(uint32 *) private_data;
-
-       *res = new_res;
-}
-
-PG_FUNCTION_INFO_V1(dcw_inject_fail_database);
-Datum
-dcw_inject_fail_database(PG_FUNCTION_ARGS)
-{
-#ifdef USE_INJECTION_POINTS
-       bool            attach = PG_GETARG_BOOL(0);
-
-       if (attach)
-               InjectionPointAttach("datachecksumsworker-modify-db-result",
-                                                        "test_checksums",
-                                                        "dc_modify_db_result",
-                                                        &db_fail,
-                                                        sizeof(uint32));
-       else
-               InjectionPointDetach("datachecksumsworker-modify-db-result");
-#else
-       elog(ERROR,
-                "test is not working as intended when injection points are disabled");
-#endif
-       PG_RETURN_VOID();
-}
-
-/*
- * Test to force waiting for existing temptables.
- */
-void
-dc_fake_temptable(const char *name, const void *private_data, void *arg)
-{
-       static bool first_pass = true;
-       int                *numleft = (int *) arg;
-
-       if (first_pass)
-               *numleft = 1;
-       first_pass = false;
-}
-
-PG_FUNCTION_INFO_V1(dcw_fake_temptable);
-Datum
-dcw_fake_temptable(PG_FUNCTION_ARGS)
-{
-#ifdef USE_INJECTION_POINTS
-       bool            attach = PG_GETARG_BOOL(0);
-
-       if (attach)
-               InjectionPointAttach("datachecksumsworker-fake-temptable-wait",
-                                                        "test_checksums",
-                                                        "dc_fake_temptable",
-                                                        NULL,
-                                                        0);
-       else
-               InjectionPointDetach("datachecksumsworker-fake-temptable-wait");
-#else
-       elog(ERROR,
-                "test is not working as intended when injection points are disabled");
-#endif
-       PG_RETURN_VOID();
-}
-
-void
-crash(const char *name, const void *private_data, void *arg)
-{
-       abort();
-}