From f19c0eccae9680f5785b11cdc58ef571998caec9 Mon Sep 17 00:00:00 2001 From: Daniel Gustafsson Date: Fri, 3 Apr 2026 22:58:51 +0200 Subject: [PATCH] Online enabling and disabling of data checksums This allows data checksums to be enabled, or disabled, in a running cluster without restricting access to the cluster during processing. Data checksums could prior to this only be enabled during initdb or when the cluster is offline using the pg_checksums app. This commit introduces functionality to enable, or disable, data checksums while the cluster is running regardless of how it was initialized. A background worker launcher process is responsible for launching a dynamic per-database background worker which will mark all buffers dirty for all relations with storage in order for them to have data checksums calculated on write. Once all relations in all databases have been processed, the data_checksums state will be set to on and the cluster will at that point be identical to one which had data checksums enabled during initialization or via offline processing. When data checksums are being enabled, concurrent I/O operations from backends other than the data checksums worker will write the checksums but not verify them on reading. Only when all backends have absorbed the procsignalbarrier for setting data_checksums to on will they also start verifying checksums on reading. The same process is repeated during disabling; all backends write checksums but do not verify them until the barrier for setting the state to off has been absorbed by all. This in-progress state is used to ensure there are no false negatives (or positives) due to reading a checksum which is not in sync with the page. A new testmodule, test_checksums, is introduced with an extensive set of tests covering both online and offline data checksum mode changes. The tests which run concurrent pgbench during online processing are gated behind the PG_TEST_EXTRA flag due to being very expensive to run. 
Two levels of PG_TEST_EXTRA flags exist to turn on a subset of the expensive tests, or the full suite of multiple runs. This work is based on an earlier version of this patch which was reviewed by among others Heikki Linnakangas, Robert Haas, Andres Freund, Tomas Vondra, Michael Banck and Andrey Borodin. During the work on this new version, Tomas Vondra has given invaluable assistance with not only coding and reviewing but very in-depth testing. Author: Daniel Gustafsson Author: Magnus Hagander Co-authored-by: Tomas Vondra Reviewed-by: Tomas Vondra Reviewed-by: Andres Freund Reviewed-by: Heikki Linnakangas Discussion: https://postgr.es/m/CABUevExz9hUUOLnJVr2kpw9Cx=o4MCr1SVKwbupzuxP7ckNutA@mail.gmail.com Discussion: https://postgr.es/m/20181030051643.elbxjww5jjgnjaxg@alap3.anarazel.de Discussion: https://postgr.es/m/CABUevEwE3urLtwxxqdgd5O2oQz9J717ZzMbh+ziCSa5YLLU_BA@mail.gmail.com --- doc/src/sgml/config.sgml | 1 + doc/src/sgml/func/func-admin.sgml | 78 + doc/src/sgml/glossary.sgml | 24 + doc/src/sgml/images/Makefile | 1 + doc/src/sgml/images/datachecksums.gv | 14 + doc/src/sgml/images/datachecksums.svg | 81 + doc/src/sgml/monitoring.sgml | 228 ++- doc/src/sgml/ref/pg_checksums.sgml | 6 + doc/src/sgml/regress.sgml | 14 + doc/src/sgml/wal.sgml | 126 +- src/backend/access/rmgrdesc/xlogdesc.c | 58 +- src/backend/access/transam/xlog.c | 502 ++++- src/backend/backup/basebackup.c | 31 +- src/backend/bootstrap/bootstrap.c | 1 + src/backend/catalog/system_views.sql | 19 + src/backend/commands/dbcommands.c | 7 + src/backend/postmaster/Makefile | 1 + src/backend/postmaster/auxprocess.c | 19 + src/backend/postmaster/bgworker.c | 10 +- src/backend/postmaster/datachecksum_state.c | 1612 +++++++++++++++++ src/backend/postmaster/meson.build | 1 + src/backend/postmaster/postmaster.c | 5 + src/backend/replication/logical/decode.c | 16 + src/backend/storage/buffer/bufmgr.c | 7 + src/backend/storage/ipc/ipci.c | 3 + src/backend/storage/ipc/procsignal.c | 8 + 
src/backend/storage/page/README | 4 +- src/backend/storage/page/bufpage.c | 23 +- src/backend/utils/activity/pgstat_backend.c | 2 + src/backend/utils/activity/pgstat_io.c | 2 + .../utils/activity/wait_event_names.txt | 3 + src/backend/utils/adt/pgstatfuncs.c | 8 +- src/backend/utils/init/miscinit.c | 3 +- src/backend/utils/init/postinit.c | 20 +- src/backend/utils/misc/guc_parameters.dat | 5 +- src/backend/utils/misc/guc_tables.c | 9 +- src/backend/utils/misc/postgresql.conf.sample | 10 +- src/bin/pg_checksums/pg_checksums.c | 4 +- src/bin/pg_controldata/pg_controldata.c | 2 + src/bin/pg_upgrade/controldata.c | 9 + src/bin/pg_waldump/t/001_basic.pl | 3 +- src/include/access/rmgrlist.h | 1 + src/include/access/xlog.h | 17 +- src/include/access/xlog_internal.h | 8 + src/include/catalog/catversion.h | 2 +- src/include/catalog/pg_control.h | 8 +- src/include/catalog/pg_proc.dat | 14 + src/include/commands/progress.h | 16 + src/include/miscadmin.h | 6 + src/include/postmaster/datachecksum_state.h | 58 + src/include/postmaster/proctypelist.h | 2 + src/include/replication/decode.h | 1 + src/include/storage/bufpage.h | 2 +- src/include/storage/checksum.h | 16 + src/include/storage/lwlocklist.h | 1 + src/include/storage/procsignal.h | 4 + src/include/utils/backend_progress.h | 1 + src/test/modules/Makefile | 1 + src/test/modules/meson.build | 1 + src/test/modules/test_checksums/.gitignore | 2 + src/test/modules/test_checksums/Makefile | 40 + src/test/modules/test_checksums/README | 30 + src/test/modules/test_checksums/meson.build | 38 + .../modules/test_checksums/t/001_basic.pl | 63 + .../modules/test_checksums/t/002_restarts.pl | 110 ++ .../test_checksums/t/003_standby_restarts.pl | 114 ++ .../modules/test_checksums/t/004_offline.pl | 82 + .../modules/test_checksums/t/005_injection.pl | 74 + .../test_checksums/t/006_pgbench_single.pl | 275 +++ .../test_checksums/t/007_pgbench_standby.pl | 400 ++++ src/test/modules/test_checksums/t/008_pitr.pl | 189 ++ 
src/test/modules/test_checksums/t/009_fpi.pl | 64 + .../test_checksums/t/DataChecksums/Utils.pm | 262 +++ .../test_checksums/test_checksums--1.0.sql | 24 + .../modules/test_checksums/test_checksums.c | 184 ++ .../test_checksums/test_checksums.control | 4 + src/test/perl/PostgreSQL/Test/Cluster.pm | 36 + src/test/regress/expected/rules.out | 35 + src/test/regress/expected/stats.out | 18 +- src/tools/pgindent/typedefs.list | 7 + 80 files changed, 5132 insertions(+), 58 deletions(-) create mode 100644 doc/src/sgml/images/datachecksums.gv create mode 100644 doc/src/sgml/images/datachecksums.svg create mode 100644 src/backend/postmaster/datachecksum_state.c create mode 100644 src/include/postmaster/datachecksum_state.h create mode 100644 src/test/modules/test_checksums/.gitignore create mode 100644 src/test/modules/test_checksums/Makefile create mode 100644 src/test/modules/test_checksums/README create mode 100644 src/test/modules/test_checksums/meson.build create mode 100644 src/test/modules/test_checksums/t/001_basic.pl create mode 100644 src/test/modules/test_checksums/t/002_restarts.pl create mode 100644 src/test/modules/test_checksums/t/003_standby_restarts.pl create mode 100644 src/test/modules/test_checksums/t/004_offline.pl create mode 100644 src/test/modules/test_checksums/t/005_injection.pl create mode 100644 src/test/modules/test_checksums/t/006_pgbench_single.pl create mode 100644 src/test/modules/test_checksums/t/007_pgbench_standby.pl create mode 100644 src/test/modules/test_checksums/t/008_pitr.pl create mode 100644 src/test/modules/test_checksums/t/009_fpi.pl create mode 100644 src/test/modules/test_checksums/t/DataChecksums/Utils.pm create mode 100644 src/test/modules/test_checksums/test_checksums--1.0.sql create mode 100644 src/test/modules/test_checksums/test_checksums.c create mode 100644 src/test/modules/test_checksums/test_checksums.control diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 422ba304982..d3fea738ca3 100644 --- 
a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -7187,6 +7187,7 @@ local0.* /var/log/postgresql bgworker bgwriter checkpointer + checksums ioworker postmaster slotsyncworker diff --git a/doc/src/sgml/func/func-admin.sgml b/doc/src/sgml/func/func-admin.sgml index 210b1118bdf..24ecb46542e 100644 --- a/doc/src/sgml/func/func-admin.sgml +++ b/doc/src/sgml/func/func-admin.sgml @@ -3123,4 +3123,82 @@ SELECT convert_from(pg_read_binary_file('file_in_utf8.txt'), 'UTF8'); + + Data Checksum Functions + + + The functions shown in can + be used to enable or disable data checksums in a running cluster. + + + Changing data checksums can be done in a cluster with concurrent activity + without blocking queries, but overall system performance will be affected. + See for further details on how changing the + data checksums state can affect a system and possible mitigations for how + to reduce the impact. + + + + Data Checksum Functions + + + + + Function + + + Description + + + + + + + + + pg_enable_data_checksums + + pg_enable_data_checksums ( cost_delay int, cost_limit int ) + void + + + Initiates the process of enabling data checksums for the cluster. This + will set the data checksums state to inprogress-on + as well as start a background worker that will process all pages in all + databases and enable data checksums on them. When all pages have + been processed, the cluster will automatically set data checksums state + to on. This operation is WAL logged and replicated + to all standby nodes. + + + If cost_delay and cost_limit are + specified, the process is throttled using the same principles as + Cost-based Vacuum Delay. + + + + + + + + pg_disable_data_checksums + + pg_disable_data_checksums () + void + + + Disables data checksum calculation and validation for the cluster. This + will set the data checksum state to inprogress-off + while data checksums are being disabled. 
When all active backends have + stopped validating data checksums, the data checksum state will be + set to off. + + + + + +
+ +
+ diff --git a/doc/src/sgml/glossary.sgml b/doc/src/sgml/glossary.sgml index 113d7640626..b881ae71198 100644 --- a/doc/src/sgml/glossary.sgml +++ b/doc/src/sgml/glossary.sgml @@ -199,6 +199,8 @@ (but not the autovacuum workers), the background writer, the checkpointer, + the data checksums worker, + the data checksums worker launcher, the logger, the startup process, the WAL archiver, @@ -574,6 +576,28 @@ + + Data Checksums Worker + + + A background worker + which enables data checksums in a specific database. + + + + + + Data Checksums Worker Launcher + + + A background worker + which starts data + checksum worker processes for enabling data checksums in each + database, or disables data checksums cluster-wide. + + + + Database diff --git a/doc/src/sgml/images/Makefile b/doc/src/sgml/images/Makefile index 38f8869d78d..7b8ac0fbb32 100644 --- a/doc/src/sgml/images/Makefile +++ b/doc/src/sgml/images/Makefile @@ -3,6 +3,7 @@ # see README in this directory about image handling ALL_IMAGES = \ + datachecksums.svg \ genetic-algorithm.svg \ gin.svg \ pagelayout.svg \ diff --git a/doc/src/sgml/images/datachecksums.gv b/doc/src/sgml/images/datachecksums.gv new file mode 100644 index 00000000000..dff3ff7340a --- /dev/null +++ b/doc/src/sgml/images/datachecksums.gv @@ -0,0 +1,14 @@ +digraph G { + A -> B [label="SELECT pg_enable_data_checksums()"]; + B -> C; + D -> A; + C -> D [label="SELECT pg_disable_data_checksums()"]; + E -> A [label=" --no-data-checksums"]; + E -> C [label=" --data-checksums"]; + + A [label="off"]; + B [label="inprogress-on"]; + C [label="on"]; + D [label="inprogress-off"]; + E [label="initdb"]; +} diff --git a/doc/src/sgml/images/datachecksums.svg b/doc/src/sgml/images/datachecksums.svg new file mode 100644 index 00000000000..8c58f42922e --- /dev/null +++ b/doc/src/sgml/images/datachecksums.svg @@ -0,0 +1,81 @@ + + + + + +G + + + +A + +off + + + +B + +inprogress-on + + + +A->B + + +SELECT pg_enable_data_checksums() + + + +C + +on + + + +B->C + + + + + +D 
+ +inprogress-off + + + +C->D + + +SELECT pg_disable_data_checksums() + + + +D->A + + + + + +E + +initdb + + + +E->A + + + --no-data-checksums + + + +E->C + + + --data-checksums + + + diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml index bb75ed1069b..312374da5e0 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -3885,9 +3885,14 @@ description | Waiting for a newly initialized WAL file to reach durable storage Number of data page checksum failures detected in this - database (or on a shared object), or NULL if data checksums are - disabled. - + database (or on a shared object). Detected failures are not reset if + the setting changes. Clusters + which are initialized without data checksums will show this as + 0. In PostgreSQL version + 18 and earlier, this was set to NULL for clusters + with data checksums disabled. + + @@ -3896,8 +3901,8 @@ description | Waiting for a newly initialized WAL file to reach durable storage Time at which the last data page checksum failure was detected in - this database (or on a shared object), or NULL if data checksums are - disabled. + this database (or on a shared object). Last failure is reported + regardless of the setting. @@ -7634,6 +7639,219 @@ FROM pg_stat_get_backend_idset() AS backendid; + + Data Checksum Progress Reporting + + + pg_stat_progress_data_checksums + + + + When data checksums are being enabled on a running cluster, the + pg_stat_progress_data_checksums view will contain + a row for the launcher process, and one row for each worker process which + is currently calculating and writing checksums for the data pages in a database. + The launcher provides overview of the overall progress (how many databases + have been processed, how many remain), while the workers track progress for + currently processed databases. 
+ + + + <structname>pg_stat_progress_data_checksums</structname> View + + + + + + Column Type + + + Description + + + + + + + + + + pid integer + + + Process ID of the data checksum process, launcher or worker. + + + + + + + + datid oid + + + OID of this database, or 0 for the launcher process. + + + + + + + + datname name + + + Name of this database, or NULL for the + launcher process. + + + + + + + + phase text + + + Current processing phase, see + for description of the phases. + + + + + + + + databases_total integer + + + The total number of databases which will be processed. Only the + launcher process has this value set, the worker processes have this + set to NULL. + + + + + + + + databases_done integer + + + The number of databases which have been processed. Only the launcher + process has this value set, the worker processes have this set to + NULL. + + + + + + + + relations_total integer + + + The total number of relations which will be processed, or + NULL if the worker process hasn't + calculated the number of relations yet. The launcher process has + this set to NULL since it isn't responsible for + processing relations, only launching worker processes. + + + + + + + + relations_done integer + + + The number of relations which have been processed. The launcher + process has this set to NULL. + + + + + + + + blocks_total integer + + + The number of blocks in the current relation which will be processed, + or NULL if the worker process hasn't + calculated the number of blocks yet. The launcher process has + this set to NULL. + + + + + + + + blocks_done integer + + + The number of blocks in the current relation which have been processed. + The launcher process has this set to NULL. + + + + + + +
+ + + Data Checksum Phases + + + + + + Phase + Description + + + + + enabling + + The command is currently enabling data checksums on the cluster. + + + + disabling + + The command is currently disabling data checksums on the cluster. + + + + done + + The command is done and the data checksum state in the cluster has + changed. + + + + waiting on barrier + + The command is currently waiting for the current active backends to + acknowledge the change in data checksum state. + + + + waiting on temporary tables + + The command is currently waiting for all temporary tables which existed + at the time the command was started to be removed. + + + + +
+
+ diff --git a/doc/src/sgml/ref/pg_checksums.sgml b/doc/src/sgml/ref/pg_checksums.sgml index b64393c813f..45890324075 100644 --- a/doc/src/sgml/ref/pg_checksums.sgml +++ b/doc/src/sgml/ref/pg_checksums.sgml @@ -45,6 +45,12 @@ PostgreSQL documentation exit status is nonzero if the operation failed. + + When enabling checksums, if checksums were in the process of being enabled + when the cluster was shut down, pg_checksums + will still process all relations regardless of the online processing. + + When verifying checksums, every file in the cluster is scanned. When enabling checksums, each relation file block with a changed checksum is diff --git a/doc/src/sgml/regress.sgml b/doc/src/sgml/regress.sgml index 873387ec168..c74941bfbf2 100644 --- a/doc/src/sgml/regress.sgml +++ b/doc/src/sgml/regress.sgml @@ -275,6 +275,20 @@ make check-world PG_TEST_EXTRA='kerberos ldap ssl load_balance libpq_encryption' The following values are currently supported: + + checksum, checksum_extended + + + Runs additional tests for enabling data checksums which inject faults + to cause re-tries in the processing, as well as tests that run pgbench + concurrently and randomly restarts the cluster. Some of these test + suites require injection points enabled in the installation. + checksum_extended is an extended version with + longer runtime, injected random delays and larger datasets. + + + + kerberos diff --git a/doc/src/sgml/wal.sgml b/doc/src/sgml/wal.sgml index f3b86b26be9..165af8a0cf2 100644 --- a/doc/src/sgml/wal.sgml +++ b/doc/src/sgml/wal.sgml @@ -246,9 +246,10 @@ Checksums can be disabled when the cluster is initialized using initdb. - They can also be enabled or disabled at a later time as an offline - operation. Data checksums are enabled or disabled at the full cluster - level, and cannot be specified individually for databases or tables. 
+ They can also be enabled or disabled at a later time either as an offline + operation or online in a running cluster allowing concurrent access. Data + checksums are enabled or disabled at the full cluster level, and cannot be + specified individually for databases, tables or replicated cluster members. @@ -265,7 +266,7 @@ - Off-line Enabling of Checksums + Offline Enabling of Checksums The pg_checksums @@ -274,6 +275,123 @@ + + + Online Enabling of Checksums + + + Checksums can be enabled or disabled online, by calling the appropriate + functions. + + + + Both enabling and disabling data checksums happens in two phases, separated + by a checkpoint to ensure durability. The different states, and their + transitions, are illustrated in + and discussed in further detail in this section. + + + +
+ data checksums states + + + + + +
+
+ + + Enabling checksums will set the cluster checksum state to + inprogress-on. During this time, checksums will be + written but not verified. In addition to this, a background worker process + is started that enables checksums on all existing data in the cluster. Once + this worker has completed processing all databases in the cluster, the + checksum state will automatically switch to on. The + processing will consume two background worker processes, make sure that + max_worker_processes allows for at least two more + additional processes. + + + + The process will initially wait for all open transactions to finish before + it starts, so that it can be certain that there are no tables that have been + created inside a transaction that has not committed yet and thus would not + be visible to the process enabling checksums. It will also, for each database, + wait for all pre-existing temporary tables to get removed before it finishes. + If long-lived temporary tables are used in an application it may be necessary + to terminate these application connections to allow the process to complete. + + + + If the cluster is stopped while in inprogress-on state, + for any reason, or processing was interrupted, then the checksum enable + process must be restarted manually. To do this, re-execute the function + pg_enable_data_checksums() once the cluster has been + restarted. The process will start over, there is no support for resuming + work from where it was interrupted. If the cluster is stopped while in + inprogress-off, then the checksum state will be set to + off when the cluster is restarted. + + + + Disabling data checksums will set the data checksum state to + inprogress-off. During this time, checksums will be + written but not verified. After all processes acknowledge the change, + the state will automatically be set to off. + + + + Disabling data checksums while data checksums are actively being enabled + will abort the current processing. 
+ + + + Impact on system of online operations + + Enabling data checksums can cause significant I/O to the system, as all of the + database pages will need to be rewritten, and will be written both to the + data files and the WAL. The impact may be limited by throttling using the + cost_delay and cost_limit + parameters of the pg_enable_data_checksums() function. + + + + + + I/O: all pages need to have data checksums calculated and written which + will generate a lot of dirty pages that will need to be flushed to disk, + as well as WAL logged. + + + Replication: When the standby receives the data checksum state change + in the WAL stream it will issue a + restartpoint in order to flush the current state into the + pg_control file. The restartpoint will flush the + current state to disk and will block redo until finished. This in turn + will induce replication lag, which on synchronous standbys also blocks + the primary. Reducing before the + process is started can help with reducing the time it takes for the + restartpoint to finish. + + + Shutdown/Restart: If the server is shut down or restarted when data + checksums are being enabled, the process will not resume and all pages + need to be recalculated and rewritten. Enabling data checksums should + be done when there is no need for regular maintenance or during a + service window. + + + + + + No I/O is incurred when disabling data checksums, but checkpoints are + still required. + + + +
diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c index 44194d3ea17..2468a7d2578 100644 --- a/src/backend/access/rmgrdesc/xlogdesc.c +++ b/src/backend/access/rmgrdesc/xlogdesc.c @@ -18,6 +18,7 @@ #include "access/xlog.h" #include "access/xlog_internal.h" #include "catalog/pg_control.h" +#include "storage/checksum.h" #include "utils/guc.h" #include "utils/timestamp.h" @@ -54,6 +55,40 @@ get_wal_level_string(int wal_level) return wal_level_str; } +const char * +get_checksum_state_string(uint32 state) +{ + switch (state) + { + case PG_DATA_CHECKSUM_VERSION: + return "on"; + case PG_DATA_CHECKSUM_INPROGRESS_OFF: + return "inprogress-off"; + case PG_DATA_CHECKSUM_INPROGRESS_ON: + return "inprogress-on"; + case PG_DATA_CHECKSUM_OFF: + return "off"; + } + + Assert(false); + return "?"; +} + +void +xlog2_desc(StringInfo buf, XLogReaderState *record) +{ + char *rec = XLogRecGetData(record); + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + if (info == XLOG2_CHECKSUMS) + { + xl_checksum_state xlrec; + + memcpy(&xlrec, rec, sizeof(xl_checksum_state)); + appendStringInfoString(buf, get_checksum_state_string(xlrec.new_checksum_state)); + } +} + void xlog_desc(StringInfo buf, XLogReaderState *record) { @@ -69,7 +104,8 @@ xlog_desc(StringInfo buf, XLogReaderState *record) "tli %u; prev tli %u; fpw %s; wal_level %s; logical decoding %s; xid %u:%u; oid %u; multi %u; offset %" PRIu64 "; " "oldest xid %u in DB %u; oldest multi %u in DB %u; " "oldest/newest commit timestamp xid: %u/%u; " - "oldest running xid %u; %s", + "oldest running xid %u; " + "checksums %s; %s", LSN_FORMAT_ARGS(checkpoint->redo), checkpoint->ThisTimeLineID, checkpoint->PrevTimeLineID, @@ -88,6 +124,7 @@ xlog_desc(StringInfo buf, XLogReaderState *record) checkpoint->oldestCommitTsXid, checkpoint->newestCommitTsXid, checkpoint->oldestActiveXid, + get_checksum_state_string(checkpoint->dataChecksumState), (info == XLOG_CHECKPOINT_SHUTDOWN) ? 
"shutdown" : "online"); } else if (info == XLOG_NEXTOID) @@ -166,7 +203,9 @@ xlog_desc(StringInfo buf, XLogReaderState *record) xl_checkpoint_redo xlrec; memcpy(&xlrec, rec, sizeof(xl_checkpoint_redo)); - appendStringInfo(buf, "wal_level %s", get_wal_level_string(xlrec.wal_level)); + appendStringInfo(buf, "wal_level %s; checksums %s", + get_wal_level_string(xlrec.wal_level), + get_checksum_state_string(xlrec.data_checksum_version)); } else if (info == XLOG_LOGICAL_DECODING_STATUS_CHANGE) { @@ -241,6 +280,21 @@ xlog_identify(uint8 info) return id; } +const char * +xlog2_identify(uint8 info) +{ + const char *id = NULL; + + switch (info & ~XLR_INFO_MASK) + { + case XLOG2_CHECKSUMS: + id = "CHECKSUMS"; + break; + } + + return id; +} + /* * Returns a string giving information about all the blocks in an * XLogRecord. diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 2c1c6f88b74..9e8999bbb61 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -75,6 +75,7 @@ #include "pgstat.h" #include "port/atomics.h" #include "postmaster/bgwriter.h" +#include "postmaster/datachecksum_state.h" #include "postmaster/startup.h" #include "postmaster/walsummarizer.h" #include "postmaster/walwriter.h" @@ -92,6 +93,7 @@ #include "storage/predicate.h" #include "storage/proc.h" #include "storage/procarray.h" +#include "storage/procsignal.h" #include "storage/reinit.h" #include "storage/spin.h" #include "storage/sync.h" @@ -553,6 +555,9 @@ typedef struct XLogCtlData */ XLogRecPtr lastFpwDisableRecPtr; + /* last data_checksum_version we've seen */ + uint32 data_checksum_version; + slock_t info_lck; /* locks shared variables shown above */ } XLogCtlData; @@ -650,6 +655,21 @@ static XLogRecPtr LocalMinRecoveryPoint; static TimeLineID LocalMinRecoveryPointTLI; static bool updateMinRecoveryPoint = true; +/* + * Local state for Controlfile data_checksum_version. 
After initialization + * this is only updated when absorbing a procsignal barrier during interrupt + * processing. The reason for keeping a copy in backend-private memory is to + * avoid locking for interrogating the data checksum state. Possible values + * are the data checksum versions defined in storage/checksum.h. + */ +static ChecksumStateType LocalDataChecksumState = 0; + +/* + * Variable backing the GUC, keep it in sync with LocalDataChecksumState. + * See SetLocalDataChecksumState(). + */ +int data_checksums = 0; + /* For WALInsertLockAcquire/Release functions */ static int MyLockNo = 0; static bool holdingAllLocks = false; @@ -717,6 +737,8 @@ static void WALInsertLockAcquireExclusive(void); static void WALInsertLockRelease(void); static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt); +static void XLogChecksums(uint32 new_type); + /* * Insert an XLOG record represented by an already-constructed chain of data * chunks. This is a low-level routine; to construct the WAL record header @@ -4254,6 +4276,12 @@ InitControlFile(uint64 sysidentifier, uint32 data_checksum_version) ControlFile->wal_log_hints = wal_log_hints; ControlFile->track_commit_timestamp = track_commit_timestamp; ControlFile->data_checksum_version = data_checksum_version; + + /* + * Set the data_checksum_version value into XLogCtl, which is where all + * processes get the current value from. + */ + XLogCtl->data_checksum_version = data_checksum_version; } static void @@ -4588,10 +4616,6 @@ ReadControlFile(void) (SizeOfXLogLongPHD - SizeOfXLogShortPHD); CalculateCheckpointSegments(); - - /* Make the initdb settings visible as GUC variables, too */ - SetConfigOption("data_checksums", DataChecksumsEnabled() ? "yes" : "no", - PGC_INTERNAL, PGC_S_DYNAMIC_DEFAULT); } /* @@ -4625,13 +4649,323 @@ GetMockAuthenticationNonce(void) } /* - * Are checksums enabled for data pages? 
+ * DataChecksumsNeedWrite + * Returns whether data checksums must be written or not + * + * Returns true if data checksums are enabled, or are in the process of being + * enabled. During "inprogress-on" and "inprogress-off" states checksums must + * be written even though they are not verified (see datachecksum_state.c for + * a longer discussion). + * + * This function is intended for callsites which are about to write a data page + * to storage, and need to know whether to re-calculate the checksum for the + * page header. Calling this function must be performed as close to the write + * operation as possible to keep the critical section short. + */ +bool +DataChecksumsNeedWrite(void) +{ + return (LocalDataChecksumState == PG_DATA_CHECKSUM_VERSION || + LocalDataChecksumState == PG_DATA_CHECKSUM_INPROGRESS_ON || + LocalDataChecksumState == PG_DATA_CHECKSUM_INPROGRESS_OFF); +} + +bool +DataChecksumsInProgressOn(void) +{ + return LocalDataChecksumState == PG_DATA_CHECKSUM_INPROGRESS_ON; +} + +/* + * DataChecksumsNeedVerify + * Returns whether data checksums must be verified or not + * + * Data checksums are only verified if they are fully enabled in the cluster. + * During the "inprogress-on" and "inprogress-off" states they are only + * updated, not verified (see datachecksum_state.c for a longer discussion). + * + * This function is intended for callsites which have read data and are about + * to perform checksum validation based on the result of this. Calling this + * function must be performed as close to the validation call as possible to + * keep the critical section short. This is in order to protect against time of + * check/time of use situations around data checksum validation. 
*/ bool -DataChecksumsEnabled(void) +DataChecksumsNeedVerify(void) { + return (LocalDataChecksumState == PG_DATA_CHECKSUM_VERSION); +} + +/* + * SetDataChecksumsOnInProgress + * Sets the data checksum state to "inprogress-on" to enable checksums + * + * To start the process of enabling data checksums in a running cluster the + * data_checksum_version state must be changed to "inprogress-on". See + * SetDataChecksumsOn below for a description on how this state change works. + * This function blocks until all backends in the cluster have acknowledged the + * state transition. + */ +void +SetDataChecksumsOnInProgress(void) +{ + uint64 barrier; + Assert(ControlFile != NULL); - return (ControlFile->data_checksum_version > 0); + + /* + * The state transition is performed in a critical section with + * checkpoints held off to provide crash safety. + */ + START_CRIT_SECTION(); + MyProc->delayChkptFlags |= DELAY_CHKPT_START; + + XLogChecksums(PG_DATA_CHECKSUM_INPROGRESS_ON); + + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->data_checksum_version = PG_DATA_CHECKSUM_INPROGRESS_ON; + SpinLockRelease(&XLogCtl->info_lck); + + barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_ON); + + MyProc->delayChkptFlags &= ~DELAY_CHKPT_START; + END_CRIT_SECTION(); + + /* + * Update the controlfile before waiting since if we have an immediate + * shutdown while waiting we want to come back up with checksums enabled. + */ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + ControlFile->data_checksum_version = PG_DATA_CHECKSUM_INPROGRESS_ON; + UpdateControlFile(); + LWLockRelease(ControlFileLock); + + /* + * Await state change in all backends to ensure that all backends are in + * "inprogress-on". Once done we know that all backends are writing data + * checksums. 
+ */ + WaitForProcSignalBarrier(barrier); +} + +/* + * SetDataChecksumsOn + * Set data checksums state to 'on' cluster-wide + * + * Enabling data checksums is performed using two barriers, the first one to + * set the state to "inprogress-on" (done by SetDataChecksumsOnInProgress()) + * and the second one to set the state to "on" (done here). Below is a short + * description of the processing, a more detailed write-up can be found in + * datachecksum_state.c. + * + * To start the process of enabling data checksums in a running cluster the + * data_checksum_version state must be changed to "inprogress-on". This state + * requires data checksums to be written but not verified. This ensures that + * all data pages can be checksummed without the risk of false negatives in + * validation during the process. When all existing pages are guaranteed to + * have checksums, and all new pages will be initiated with checksums, the + * state can be changed to "on". Once the state is "on" checksums will be both + * written and verified. + * + * This function blocks until all backends in the cluster have acknowledged the + * state transition. + */ +void +SetDataChecksumsOn(void) +{ + uint64 barrier; + + Assert(ControlFile != NULL); + + SpinLockAcquire(&XLogCtl->info_lck); + + /* + * The only allowed state transition to "on" is from "inprogress-on" since + * that state ensures that all pages will have data checksums written. No + * such state transition exists, if it does happen it's likely due to a + * programmer error. 
 */ + if (XLogCtl->data_checksum_version != PG_DATA_CHECKSUM_INPROGRESS_ON) + { + SpinLockRelease(&XLogCtl->info_lck); + elog(WARNING, + "cannot set data checksums to \"on\", current state is not \"inprogress-on\", disabling"); + SetDataChecksumsOff(); + return; + } + + SpinLockRelease(&XLogCtl->info_lck); + + INJECTION_POINT("datachecksums-enable-checksums-delay", NULL); + START_CRIT_SECTION(); + MyProc->delayChkptFlags |= DELAY_CHKPT_START; + + XLogChecksums(PG_DATA_CHECKSUM_VERSION); + + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->data_checksum_version = PG_DATA_CHECKSUM_VERSION; + SpinLockRelease(&XLogCtl->info_lck); + + barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_ON); + + MyProc->delayChkptFlags &= ~DELAY_CHKPT_START; + END_CRIT_SECTION(); + + /* + * Update the controlfile before waiting since if we have an immediate + * shutdown while waiting we want to come back up with checksums enabled. + */ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + ControlFile->data_checksum_version = PG_DATA_CHECKSUM_VERSION; + UpdateControlFile(); + LWLockRelease(ControlFileLock); + + RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT | CHECKPOINT_FAST); + + /* + * Await state transition to "on" in all backends. When done we know that + * data checksums are both written and verified in all backends. + */ + WaitForProcSignalBarrier(barrier); +} + +/* + * SetDataChecksumsOff + * Disables data checksums cluster-wide + * + * Disabling data checksums must be performed with two sets of barriers, each + * carrying a different state. The state is first set to "inprogress-off" + * during which checksums are still written but not verified. This ensures that + * backends which have yet to observe the state change from "on" won't get + * validation errors on concurrently modified pages. Once all backends have + * changed to "inprogress-off", the barrier for moving to "off" can be emitted. 
+ * This function blocks until all backends in the cluster have acknowledged the + * state transition. + */ +void +SetDataChecksumsOff(void) +{ + uint64 barrier; + + Assert(ControlFile != NULL); + + SpinLockAcquire(&XLogCtl->info_lck); + + /* If data checksums are already disabled there is nothing to do */ + if (XLogCtl->data_checksum_version == 0) + { + SpinLockRelease(&XLogCtl->info_lck); + return; + } + + /* + * If data checksums are currently enabled we first transition to the + * "inprogress-off" state during which backends continue to write + * checksums without verifying them. When all backends are in + * "inprogress-off" the next transition to "off" can be performed, after + * which all data checksum processing is disabled. + */ + if (XLogCtl->data_checksum_version == PG_DATA_CHECKSUM_VERSION) + { + SpinLockRelease(&XLogCtl->info_lck); + + START_CRIT_SECTION(); + MyProc->delayChkptFlags |= DELAY_CHKPT_START; + + XLogChecksums(PG_DATA_CHECKSUM_INPROGRESS_OFF); + + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->data_checksum_version = PG_DATA_CHECKSUM_INPROGRESS_OFF; + SpinLockRelease(&XLogCtl->info_lck); + + barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_OFF); + + MyProc->delayChkptFlags &= ~DELAY_CHKPT_START; + END_CRIT_SECTION(); + + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + ControlFile->data_checksum_version = PG_DATA_CHECKSUM_OFF; + UpdateControlFile(); + LWLockRelease(ControlFileLock); + + RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT | CHECKPOINT_FAST); + + /* + * Update local state in all backends to ensure that any backend in + * "on" state is changed to "inprogress-off". + */ + WaitForProcSignalBarrier(barrier); + + /* + * At this point we know that no backends are verifying data checksums + * during reading. Next, we can safely move to state "off" to also + * stop writing checksums. 
+ */ + } + else + { + /* + * Ending up here implies that the checksums state is "inprogress-on" + * or "inprogress-off" and we can transition directly to "off" from + * there. + */ + SpinLockRelease(&XLogCtl->info_lck); + } + + START_CRIT_SECTION(); + /* Ensure that we don't incur a checkpoint during disabling checksums */ + MyProc->delayChkptFlags |= DELAY_CHKPT_START; + + XLogChecksums(PG_DATA_CHECKSUM_OFF); + + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->data_checksum_version = 0; + SpinLockRelease(&XLogCtl->info_lck); + + barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_OFF); + + MyProc->delayChkptFlags &= ~DELAY_CHKPT_START; + END_CRIT_SECTION(); + + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + ControlFile->data_checksum_version = PG_DATA_CHECKSUM_OFF; + UpdateControlFile(); + LWLockRelease(ControlFileLock); + + RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT | CHECKPOINT_FAST); + + WaitForProcSignalBarrier(barrier); +} + +/* + * InitLocalDataChecksumState + * + * Set up backend local caches of controldata variables which may change at + * any point during runtime and thus require special cased locking. So far + * this only applies to data_checksum_version, but it's intended to be general + * purpose enough to handle future cases. 
+ */ +void +InitLocalDataChecksumState(void) +{ + SpinLockAcquire(&XLogCtl->info_lck); + SetLocalDataChecksumState(XLogCtl->data_checksum_version); + SpinLockRelease(&XLogCtl->info_lck); +} + +void +SetLocalDataChecksumState(uint32 data_checksum_version) +{ + LocalDataChecksumState = data_checksum_version; + + data_checksums = data_checksum_version; +} + +/* guc hook */ +const char * +show_data_checksums(void) +{ + return get_checksum_state_string(LocalDataChecksumState); } /* @@ -4925,6 +5259,7 @@ LocalProcessControlFile(bool reset) Assert(reset || ControlFile == NULL); ControlFile = palloc_object(ControlFileData); ReadControlFile(); + SetLocalDataChecksumState(ControlFile->data_checksum_version); } /* @@ -5094,6 +5429,11 @@ XLOGShmemInit(void) XLogCtl->InstallXLogFileSegmentActive = false; XLogCtl->WalWriterSleeping = false; + /* Use the checksum info from control file */ + XLogCtl->data_checksum_version = ControlFile->data_checksum_version; + + SetLocalDataChecksumState(XLogCtl->data_checksum_version); + SpinLockInit(&XLogCtl->Insert.insertpos_lck); SpinLockInit(&XLogCtl->info_lck); pg_atomic_init_u64(&XLogCtl->logInsertResult, InvalidXLogRecPtr); @@ -5168,6 +5508,7 @@ BootStrapXLOG(uint32 data_checksum_version) checkPoint.newestCommitTsXid = InvalidTransactionId; checkPoint.time = (pg_time_t) time(NULL); checkPoint.oldestActiveXid = InvalidTransactionId; + checkPoint.dataChecksumState = data_checksum_version; TransamVariables->nextXid = checkPoint.nextXid; TransamVariables->nextOid = checkPoint.nextOid; @@ -6244,6 +6585,47 @@ StartupXLOG(void) pfree(endOfRecoveryInfo->recoveryStopReason); pfree(endOfRecoveryInfo); + /* + * If we reach this point with checksums in the state inprogress-on, it + * means that data checksums were in the process of being enabled when the + * cluster shut down. Since processing didn't finish, the operation will + * have to be restarted from scratch since there is no capability to + * continue where it was when the cluster shut down. 
Thus, revert the + * state back to off, and inform the user with a warning message. Being + * able to restart processing is a TODO, but it wouldn't be possible to + * restart here since we cannot launch a dynamic background worker + * directly from here (it has to be from a regular backend). + */ + if (XLogCtl->data_checksum_version == PG_DATA_CHECKSUM_INPROGRESS_ON) + { + XLogChecksums(PG_DATA_CHECKSUM_OFF); + + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->data_checksum_version = 0; + SetLocalDataChecksumState(XLogCtl->data_checksum_version); + SpinLockRelease(&XLogCtl->info_lck); + + ereport(WARNING, + errmsg("enabling data checksums was interrupted"), + errhint("Data checksum processing must be manually restarted for checksums to be enabled")); + } + + /* + * If data checksums were being disabled when the cluster was shut down, + * we know that we have a state where all backends have stopped validating + * checksums and we can move to off instead of prompting the user to + * perform any action. + */ + if (XLogCtl->data_checksum_version == PG_DATA_CHECKSUM_INPROGRESS_OFF) + { + XLogChecksums(PG_DATA_CHECKSUM_OFF); + + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->data_checksum_version = 0; + SetLocalDataChecksumState(XLogCtl->data_checksum_version); + SpinLockRelease(&XLogCtl->info_lck); + } + /* * All done with end-of-recovery actions. * @@ -6549,7 +6931,7 @@ GetRedoRecPtr(void) XLogRecPtr ptr; /* - * The possibly not up-to-date copy in XlogCtl is enough. Even if we + * The possibly not up-to-date copy in XLogCtl is enough. Even if we * grabbed a WAL insertion lock to read the authoritative value in * Insert->RedoRecPtr, someone might update it just after we've released * the lock. @@ -7127,6 +7509,12 @@ CreateCheckPoint(int flags) checkPoint.fullPageWrites = Insert->fullPageWrites; checkPoint.wal_level = wal_level; + /* + * Get the current data_checksum_version value from xlogctl, valid at the + * time of the checkpoint. 
+ */ + checkPoint.dataChecksumState = XLogCtl->data_checksum_version; + if (shutdown) { XLogRecPtr curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos); @@ -7183,6 +7571,9 @@ CreateCheckPoint(int flags) WALInsertLockAcquire(); redo_rec.wal_level = wal_level; + SpinLockAcquire(&XLogCtl->info_lck); + redo_rec.data_checksum_version = XLogCtl->data_checksum_version; + SpinLockRelease(&XLogCtl->info_lck); WALInsertLockRelease(); /* Include WAL level in record for WAL summarizer's benefit. */ @@ -7243,6 +7634,10 @@ CreateCheckPoint(int flags) checkPoint.nextOid += TransamVariables->oidCount; LWLockRelease(OidGenLock); + SpinLockAcquire(&XLogCtl->info_lck); + checkPoint.dataChecksumState = XLogCtl->data_checksum_version; + SpinLockRelease(&XLogCtl->info_lck); + checkPoint.logicalDecodingEnabled = IsLogicalDecodingEnabled(); MultiXactGetCheckptMulti(shutdown, @@ -7392,6 +7787,9 @@ CreateCheckPoint(int flags) ControlFile->minRecoveryPoint = InvalidXLogRecPtr; ControlFile->minRecoveryPointTLI = 0; + /* make sure we start with the checksum version as of the checkpoint */ + ControlFile->data_checksum_version = checkPoint.dataChecksumState; + /* * Persist unloggedLSN value. 
It's reset on crash recovery, so this goes * unused on non-shutdown checkpoints, but seems useful to store it always @@ -7535,6 +7933,12 @@ CreateEndOfRecoveryRecord(void) LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); ControlFile->minRecoveryPoint = recptr; ControlFile->minRecoveryPointTLI = xlrec.ThisTimeLineID; + + /* start with the latest checksum version (as of the end of recovery) */ + SpinLockAcquire(&XLogCtl->info_lck); + ControlFile->data_checksum_version = XLogCtl->data_checksum_version; + SpinLockRelease(&XLogCtl->info_lck); + UpdateControlFile(); LWLockRelease(ControlFileLock); @@ -7876,6 +8280,10 @@ CreateRestartPoint(int flags) if (flags & CHECKPOINT_IS_SHUTDOWN) ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY; } + + /* we shall start with the latest checksum version */ + ControlFile->data_checksum_version = lastCheckPoint.dataChecksumState; + UpdateControlFile(); } LWLockRelease(ControlFileLock); @@ -8314,6 +8722,24 @@ XLogReportParameters(void) } } +/* + * Log the new state of checksums + */ +static void +XLogChecksums(uint32 new_type) +{ + xl_checksum_state xlrec; + XLogRecPtr recptr; + + xlrec.new_checksum_state = new_type; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xl_checksum_state)); + + recptr = XLogInsert(RM_XLOG2_ID, XLOG2_CHECKSUMS); + XLogFlush(recptr); +} + /* * Update full_page_writes in shared memory, and write an * XLOG_FPW_CHANGE record if necessary. @@ -8440,6 +8866,11 @@ xlog_redo(XLogReaderState *record) MultiXactAdvanceOldest(checkPoint.oldestMulti, checkPoint.oldestMultiDB); + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->data_checksum_version = checkPoint.dataChecksumState; + SetLocalDataChecksumState(checkPoint.dataChecksumState); + SpinLockRelease(&XLogCtl->info_lck); + /* * No need to set oldestClogXid here as well; it'll be set when we * redo an xl_clog_truncate if it changed since initialization. 
@@ -8499,6 +8930,7 @@ xlog_redo(XLogReaderState *record) /* ControlFile->checkPointCopy always tracks the latest ckpt XID */ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); ControlFile->checkPointCopy.nextXid = checkPoint.nextXid; + ControlFile->data_checksum_version = checkPoint.dataChecksumState; LWLockRelease(ControlFileLock); /* @@ -8525,6 +8957,8 @@ xlog_redo(XLogReaderState *record) { CheckPoint checkPoint; TimeLineID replayTLI; + bool new_state = false; + int old_state; memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint)); /* In an ONLINE checkpoint, treat the XID counter as a minimum */ @@ -8563,6 +8997,8 @@ xlog_redo(XLogReaderState *record) /* ControlFile->checkPointCopy always tracks the latest ckpt XID */ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); ControlFile->checkPointCopy.nextXid = checkPoint.nextXid; + old_state = ControlFile->data_checksum_version; + ControlFile->data_checksum_version = checkPoint.dataChecksumState; LWLockRelease(ControlFileLock); /* TLI should not change in an on-line checkpoint */ @@ -8574,6 +9010,18 @@ xlog_redo(XLogReaderState *record) RecoveryRestartPoint(&checkPoint, record); + /* + * If the data checksum state changed we need to emit a barrier. + */ + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->data_checksum_version = checkPoint.dataChecksumState; + if (checkPoint.dataChecksumState != old_state) + new_state = true; + SpinLockRelease(&XLogCtl->info_lck); + + if (new_state) + EmitAndWaitDataChecksumsBarrier(checkPoint.dataChecksumState); + + /* + * After replaying a checkpoint record, free all smgr objects. 
* Otherwise we would never do so for dropped relations, as the @@ -8735,7 +9183,19 @@ xlog_redo(XLogReaderState *record) } else if (info == XLOG_CHECKPOINT_REDO) { - /* nothing to do here, just for informational purposes */ + xl_checkpoint_redo redo_rec; + bool new_state = false; + + memcpy(&redo_rec, XLogRecGetData(record), sizeof(xl_checkpoint_redo)); + + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->data_checksum_version = redo_rec.data_checksum_version; + if (redo_rec.data_checksum_version != ControlFile->data_checksum_version) + new_state = true; + SpinLockRelease(&XLogCtl->info_lck); + + if (new_state) + EmitAndWaitDataChecksumsBarrier(redo_rec.data_checksum_version); } else if (info == XLOG_LOGICAL_DECODING_STATUS_CHANGE) { @@ -8788,6 +9248,30 @@ xlog_redo(XLogReaderState *record) } } +void +xlog2_redo(XLogReaderState *record) +{ + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + if (info == XLOG2_CHECKSUMS) + { + xl_checksum_state state; + + memcpy(&state, XLogRecGetData(record), sizeof(xl_checksum_state)); + + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->data_checksum_version = state.new_checksum_state; + SpinLockRelease(&XLogCtl->info_lck); + + /* + * Block on a procsignalbarrier to await all processes having seen the + * change to checksum status. Once the barrier has been passed we can + * initiate the corresponding processing. + */ + EmitAndWaitDataChecksumsBarrier(state.new_checksum_state); + } +} + /* * Return the extra open flags used for opening a file, depending on the * value of the GUCs wal_sync_method, fsync and debug_io_direct. 
diff --git a/src/backend/backup/basebackup.c b/src/backend/backup/basebackup.c index ab1fbae8001..9c79dadaacc 100644 --- a/src/backend/backup/basebackup.c +++ b/src/backend/backup/basebackup.c @@ -1613,10 +1613,11 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename, /* * If we weren't told not to verify checksums, and if checksums are * enabled for this cluster, and if this is a relation file, then verify - * the checksum. + * the checksum. We cannot at this point check if checksums are enabled + * or disabled as that might change, thus we check at each point where we + * could be validating a checksum. */ - if (!noverify_checksums && DataChecksumsEnabled() && - RelFileNumberIsValid(relfilenumber)) + if (!noverify_checksums && RelFileNumberIsValid(relfilenumber)) verify_checksum = true; /* @@ -1749,7 +1750,7 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename, * If the amount of data we were able to read was not a multiple of * BLCKSZ, we cannot verify checksums, which are block-level. */ - if (verify_checksum && (cnt % BLCKSZ != 0)) + if (verify_checksum && DataChecksumsNeedVerify() && (cnt % BLCKSZ != 0)) { ereport(WARNING, (errmsg("could not verify checksum in file \"%s\", block " @@ -1844,9 +1845,10 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename, * 'blkno' is the block number of the first page in the bbsink's buffer * relative to the start of the relation. * - * 'verify_checksum' indicates whether we should try to verify checksums - * for the blocks we read. If we do this, we'll update *checksum_failures - * and issue warnings as appropriate. + * 'verify_checksum' determines if the user has asked to verify checksums, but + * since data checksums can be disabled, or become disabled, we need to check + * state before verifying individual pages. If we do this, we'll update + * *checksum_failures and issue warnings as appropriate. 
*/ static off_t read_file_data_into_buffer(bbsink *sink, const char *readfilename, int fd, @@ -1872,6 +1874,13 @@ read_file_data_into_buffer(bbsink *sink, const char *readfilename, int fd, int reread_cnt; uint16 expected_checksum; + /* + * The data checksum state can change at any point, so we need to + * re-check before each page. + */ + if (!DataChecksumsNeedVerify()) + return cnt; + page = sink->bbs_buffer + BLCKSZ * i; /* If the page is OK, go on to the next one. */ @@ -1894,7 +1903,12 @@ read_file_data_into_buffer(bbsink *sink, const char *readfilename, int fd, * allows us to wait until we can be certain that no write to the * block is in progress. Since we don't have any such thing right now, * we just do this and hope for the best. + * + * The data checksum state may also have changed concurrently so check + * again. */ + if (!DataChecksumsNeedVerify()) + return cnt; reread_cnt = basebackup_read_file(fd, sink->bbs_buffer + BLCKSZ * i, BLCKSZ, offset + BLCKSZ * i, @@ -2009,6 +2023,9 @@ verify_page_checksum(Page page, XLogRecPtr start_lsn, BlockNumber blkno, if (PageIsNew(page) || PageGetLSN(page) >= start_lsn) return true; + if (!DataChecksumsNeedVerify()) + return true; + /* Perform the actual checksum calculation. 
*/ checksum = pg_checksum_page(page, blkno); diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c index 38ef683d4c7..c52c0a6023d 100644 --- a/src/backend/bootstrap/bootstrap.c +++ b/src/backend/bootstrap/bootstrap.c @@ -35,6 +35,7 @@ #include "port/pg_getopt_ctx.h" #include "postmaster/postmaster.h" #include "storage/bufpage.h" +#include "storage/checksum.h" #include "storage/fd.h" #include "storage/ipc.h" #include "storage/proc.h" diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index e54018004db..eba25aa3e4d 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -1451,6 +1451,25 @@ CREATE VIEW pg_stat_progress_copy AS FROM pg_stat_get_progress_info('COPY') AS S LEFT JOIN pg_database D ON S.datid = D.oid; +CREATE VIEW pg_stat_progress_data_checksums AS + SELECT + S.pid AS pid, S.datid, D.datname AS datname, + CASE S.param1 WHEN 0 THEN 'enabling' + WHEN 1 THEN 'disabling' + WHEN 2 THEN 'waiting on temporary tables' + WHEN 3 THEN 'waiting on barrier' + WHEN 4 THEN 'done' + END AS phase, + CASE S.param2 WHEN -1 THEN NULL ELSE S.param2 END AS databases_total, + S.param3 AS databases_done, + CASE S.param4 WHEN -1 THEN NULL ELSE S.param4 END AS relations_total, + CASE S.param5 WHEN -1 THEN NULL ELSE S.param5 END AS relations_done, + CASE S.param6 WHEN -1 THEN NULL ELSE S.param6 END AS blocks_total, + CASE S.param7 WHEN -1 THEN NULL ELSE S.param7 END AS blocks_done + FROM pg_stat_get_progress_info('DATACHECKSUMS') AS S + LEFT JOIN pg_database D ON S.datid = D.oid + ORDER BY S.datid; -- return the launcher process first + CREATE VIEW pg_user_mappings AS SELECT U.oid AS umid, diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index 9b18bb4a17e..f0819d15ab7 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -1044,7 +1044,14 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) if 
(pg_strcasecmp(strategy, "wal_log") == 0) dbstrategy = CREATEDB_WAL_LOG; else if (pg_strcasecmp(strategy, "file_copy") == 0) + { + if (DataChecksumsInProgressOn()) + ereport(ERROR, + errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("create database strategy \"%s\" not allowed when data checksums are being enabled", + strategy)); dbstrategy = CREATEDB_FILE_COPY; + } else ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), diff --git a/src/backend/postmaster/Makefile b/src/backend/postmaster/Makefile index 0f4435d2d97..55044b2bc6f 100644 --- a/src/backend/postmaster/Makefile +++ b/src/backend/postmaster/Makefile @@ -18,6 +18,7 @@ OBJS = \ bgworker.o \ bgwriter.o \ checkpointer.o \ + datachecksum_state.o \ fork_process.o \ interrupt.o \ launch_backend.o \ diff --git a/src/backend/postmaster/auxprocess.c b/src/backend/postmaster/auxprocess.c index cf24f662d27..8fdc518b3a1 100644 --- a/src/backend/postmaster/auxprocess.c +++ b/src/backend/postmaster/auxprocess.c @@ -15,6 +15,7 @@ #include #include +#include "access/xlog.h" #include "miscadmin.h" #include "pgstat.h" #include "postmaster/auxprocess.h" @@ -69,6 +70,24 @@ AuxiliaryProcessMainCommon(void) ProcSignalInit(NULL, 0); + /* + * Initialize a local cache of the data_checksum_version, to be updated by + * the procsignal-based barriers. + * + * This intentionally happens after initializing the procsignal, otherwise + * we might miss a state change. This means we can get a barrier for the + * state we've just initialized - but it can happen only once. + * + * The postmaster (which is what gets forked into the new child process) + * does not handle barriers, therefore it may not have the current value + * of LocalDataChecksumState (it'll have the value read from the + * control file, which may be arbitrarily old). + * + * NB: Even if the postmaster handled barriers, the value might still be + * stale, as it might have changed after this process forked. 
+ */ + InitLocalDataChecksumState(); + /* * Auxiliary processes don't run transactions, but they may need a * resource owner anyway to manage buffer pins acquired outside diff --git a/src/backend/postmaster/bgworker.c b/src/backend/postmaster/bgworker.c index f2a62489d9c..536aff7ca05 100644 --- a/src/backend/postmaster/bgworker.c +++ b/src/backend/postmaster/bgworker.c @@ -18,6 +18,7 @@ #include "pgstat.h" #include "port/atomics.h" #include "postmaster/bgworker_internals.h" +#include "postmaster/datachecksum_state.h" #include "postmaster/postmaster.h" #include "replication/logicallauncher.h" #include "replication/logicalworker.h" @@ -144,7 +145,14 @@ static const struct .fn_name = "TableSyncWorkerMain", .fn_addr = TableSyncWorkerMain }, - + { + .fn_name = "DataChecksumsWorkerLauncherMain", + .fn_addr = DataChecksumsWorkerLauncherMain + }, + { + .fn_name = "DataChecksumsWorkerMain", + .fn_addr = DataChecksumsWorkerMain + } }; /* Private functions. */ diff --git a/src/backend/postmaster/datachecksum_state.c b/src/backend/postmaster/datachecksum_state.c new file mode 100644 index 00000000000..76004bcedc6 --- /dev/null +++ b/src/backend/postmaster/datachecksum_state.c @@ -0,0 +1,1612 @@ +/*------------------------------------------------------------------------- + * + * datachecksum_state.c + * Background worker for enabling or disabling data checksums online as + * well as functionality for manipulating data checksum state + * + * When enabling data checksums on a cluster at initdb time or when shut down + * with pg_checksums, no extra process is required as each page is checksummed, + * and verified, when accessed. When enabling checksums on an already running + * cluster, this worker will ensure that all pages are checksummed before + * verification of the checksums is turned on. In the case of disabling + * checksums, the state transition is performed only in the control file, no + * changes are performed on the data pages. 
 + * + * Checksums can be either enabled or disabled cluster-wide, with on/off being + * the end state for data_checksums. + * + * 1. Enabling checksums + * --------------------- + * When enabling checksums in an online cluster, data_checksums will be set to + * "inprogress-on" which signals that write operations MUST compute and write + * the checksum on the data page, but during reading the checksum SHALL NOT be + * verified. This ensures that all objects created while checksums are + * being enabled will have checksums set, but reads won't fail due to missing or + * invalid checksums. Invalid checksums can be present in case the cluster had + * checksums enabled, then disabled them and updated the page while they were + * disabled. + * + * The DataChecksumsWorker will compile a list of all databases at the start, + * any databases created concurrently will see the in-progress state and will + * be checksummed automatically. All databases from the original list MUST BE + * successfully processed in order for data checksums to be enabled, the only + * exception are databases which are dropped before having been processed. + * + * For each database, all relations which have storage are read and every data + * page is marked dirty to force a write with the checksum. This will generate + * a lot of WAL as the entire database is read and written. + * + * If the processing is interrupted by a cluster crash or restart, it needs to + * be restarted from the beginning again as state isn't persisted. + * + * 2. Disabling checksums + * ---------------------- + * When disabling checksums, data_checksums will be set to "inprogress-off" + * which signals that checksums are written but no longer need to be verified. + * This ensures that backends which have not yet transitioned to the + * "inprogress-off" state will still see valid checksums on pages. + * + * 3. 
Synchronization and Correctness + * ---------------------------------- + * The processes involved in enabling or disabling data checksums in an + * online cluster must be properly synchronized with the normal backends + * serving concurrent queries to ensure correctness. Correctness is defined + * as the following: + * + * - Backends SHALL NOT violate the data_checksums state they have agreed to + * by acknowledging the procsignalbarrier: This means that all backends + * MUST calculate and write data checksums during all states except off; + * MUST validate checksums only in the 'on' state. + * - Data checksums SHALL NOT be considered enabled cluster-wide until all + * currently connected backends have state "on": This means that all + * backends must wait on the procsignalbarrier to be acknowledged by all + * before proceeding to validate data checksums. + * + * There are two steps of synchronization required for changing data_checksums + * in an online cluster: (i) changing state in the active backends ("on", + * "off", "inprogress-on" and "inprogress-off"), and (ii) ensuring no + * incompatible objects and processes are left in a database when workers end. + * The former deals with cluster-wide agreement on data checksum state and the + * latter with ensuring that any concurrent activity cannot break the data + * checksum contract during processing. + * + * Synchronizing the state change is done with procsignal barriers. Before + * updating the data_checksums state in the control file, all other backends must absorb the + * barrier. Barrier absorption will happen during interrupt processing, which + * means that connected backends will change state at different times. If + * waiting for a barrier is done during startup, for example during replay, it + * is important to realize that any locks held by the startup process might + * cause deadlocks if backends end up waiting for those locks while startup + * is waiting for a procsignalbarrier. 
 + * + * 3.1 When Enabling Data Checksums + * -------------------------------- + * A process which fails to observe data checksums being enabled can induce two + * types of errors: failing to write the checksum when modifying the page and + * failing to validate the data checksum on the page when reading it. + * + * When processing starts all backends belong to one of the below sets, with + * one of Bd and Bi being empty: + * + * Bg: Backend updating the global state and emitting the procsignalbarrier + * Bd: Backends in "off" state + * Bi: Backends in "inprogress-on" state + * + * If processing is started in an online cluster then all backends are in Bd. + * If processing was halted by the cluster shutting down (due to a crash or + * intentional restart), the controlfile state "inprogress-on" will be observed + * on system startup and all backends will be placed in Bd. The controlfile + * state will also be set to "off". + * + * Backends transition Bd -> Bi via a procsignalbarrier which is emitted by the + * DataChecksumsLauncher. When all backends have acknowledged the barrier then + * Bd will be empty and the next phase can begin: calculating and writing data + * checksums with DataChecksumsWorkers. When the DataChecksumsWorker processes + * have finished writing checksums on all pages, data checksums are enabled + * cluster-wide via another procsignalbarrier. There are four sets of backends + * where Bd shall be an empty set: + * + * Bg: Backend updating the global state and emitting the procsignalbarrier + * Bd: Backends in "off" state + * Be: Backends in "on" state + * Bi: Backends in "inprogress-on" state + * + * Backends in Bi and Be will write checksums when modifying a page, but only + * backends in Be will verify the checksum during reading. The Bg backend is + * blocked waiting for all backends in Bi to process interrupts and move to + * Be. 
Any backend starting while Bg is waiting on the procsignalbarrier will + * observe the global state being "on" and will thus automatically belong to + * Be. Checksums are enabled cluster-wide when Bi is an empty set. Bi and Be + * are compatible sets while still operating based on their local state as + * both write data checksums. + * + * 3.2 When Disabling Data Checksums + * --------------------------------- + * A process which fails to observe that data checksums have been disabled + * can induce two types of errors: writing the checksum when modifying the + * page and validating a data checksum which is no longer correct due to + * modifications to the page. The former is not an error per se as data + * integrity is maintained, but it is wasteful. The latter will cause errors + * in user operations. Assuming the following sets of backends: + * + * Bg: Backend updating the global state and emitting the procsignalbarrier + * Bd: Backends in "off" state + * Be: Backends in "on" state + * Bo: Backends in "inprogress-off" state + * Bi: Backends in "inprogress-on" state + * + * Backends transition from the Be state to Bd like so: Be -> Bo -> Bd. From + * all other states, the transition can be straight to Bd. + * + * The goal is to transition all backends to Bd making the others empty sets. + * Backends in Bo write data checksums, but don't validate them, such that + * backends still in Be can continue to validate pages until the barrier has + * been absorbed such that they are in Bo. Once all backends are in Bo, the + * barrier to transition to "off" can be raised and all backends can safely + * stop writing data checksums as no backend is enforcing data checksum + * validation any longer. + * + * 4. Future opportunities for optimizations + * ----------------------------------------- + * Below are some potential optimizations and improvements which were brought + * up during reviews of this feature, but which weren't implemented in the + * initial version. 
These are ideas listed without any validation on their + * feasibility or potential payoff. More discussion on (most of) these can be + * found on the -hackers threads linked to in the commit message of this + * feature. + * + * * Launching datachecksumsworker for resuming operation from the startup + * process: Currently users have to restart processing manually after a + * restart since dynamic background worker cannot be started from the + * postmaster. Changing the startup process could make restarting the + * processing automatic on cluster restart. + * * Avoid dirtying the page when checksums already match: Iff the checksum + * on the page happens to already match we still dirty the page. It should + * be enough to only do the log_newpage_buffer() call in that case. + * * Teach pg_checksums to avoid checksummed pages when pg_checksums is used + * to enable checksums on a cluster which is in inprogress-on state and + * may have checksummed pages (make pg_checksums be able to resume an + * online operation). This should only be attempted for wal_level minimal. + * * Restartability (not necessarily with page granularity). + * * Avoid processing databases which were created during inprogress-on. + * Right now all databases are processed regardless to be safe. + * * Teach CREATE DATABASE to calculate checksums for databases created + * during inprogress-on with a template database which has yet to be + * processed. 
+ * + * + * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/postmaster/datachecksum_state.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/genam.h" +#include "access/heapam.h" +#include "access/htup_details.h" +#include "access/xact.h" +#include "access/xlog.h" +#include "access/xloginsert.h" +#include "catalog/indexing.h" +#include "catalog/pg_class.h" +#include "catalog/pg_database.h" +#include "commands/progress.h" +#include "commands/vacuum.h" +#include "common/relpath.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "postmaster/bgworker.h" +#include "postmaster/bgwriter.h" +#include "postmaster/datachecksum_state.h" +#include "storage/bufmgr.h" +#include "storage/checksum.h" +#include "storage/ipc.h" +#include "storage/latch.h" +#include "storage/lmgr.h" +#include "storage/lwlock.h" +#include "storage/procarray.h" +#include "storage/smgr.h" +#include "tcop/tcopprot.h" +#include "utils/builtins.h" +#include "utils/fmgroids.h" +#include "utils/injection_point.h" +#include "utils/lsyscache.h" +#include "utils/ps_status.h" +#include "utils/syscache.h" +#include "utils/wait_event.h" + +/* + * Configuration of conditions which must match when absorbing a procsignal + * barrier during data checksum enable/disable operations. A single function + * is used for absorbing all barriers, and the current and target states must + * be defined as a from/to tuple in the checksum_barriers struct. 
+ */
+typedef struct ChecksumBarrierCondition
+{
+	/* Current state of data checksums */
+	int			from;
+	/* Target state for data checksums */
+	int			to;
+} ChecksumBarrierCondition;
+
+static const ChecksumBarrierCondition checksum_barriers[6] =
+{
+	/*
+	 * Disabling checksums: If checksums are currently enabled, disabling must
+	 * go through the 'inprogress-off' state.
+	 */
+	{PG_DATA_CHECKSUM_VERSION, PG_DATA_CHECKSUM_INPROGRESS_OFF},
+	{PG_DATA_CHECKSUM_INPROGRESS_OFF, PG_DATA_CHECKSUM_OFF},
+
+	/*
+	 * If checksums are in the process of being enabled, but are not yet being
+	 * verified, we can abort by going back to 'off' state.
+	 */
+	{PG_DATA_CHECKSUM_INPROGRESS_ON, PG_DATA_CHECKSUM_OFF},
+
+	/*
+	 * Enabling checksums must normally go through the 'inprogress-on' state.
+	 */
+	{PG_DATA_CHECKSUM_OFF, PG_DATA_CHECKSUM_INPROGRESS_ON},
+	{PG_DATA_CHECKSUM_INPROGRESS_ON, PG_DATA_CHECKSUM_VERSION},
+
+	/*
+	 * If checksums are being disabled but all backends are still computing
+	 * checksums, we can go straight back to 'on'.
+	 */
+	{PG_DATA_CHECKSUM_INPROGRESS_OFF, PG_DATA_CHECKSUM_VERSION},
+};
+
+/*
+ * Signaling between backends calling pg_enable/disable_data_checksums, the
+ * checksums launcher process, and the checksums worker process.
+ *
+ * This struct is protected by DataChecksumsWorkerLock.
+ */
+typedef struct DataChecksumsStateStruct
+{
+	/*
+	 * These are set by pg_{enable|disable}_data_checksums, to tell the
+	 * launcher what the target state is.
+	 */
+	DataChecksumsWorkerOperation launch_operation;
+	int			launch_cost_delay;
+	int			launch_cost_limit;
+
+	/*
+	 * Is a launcher process currently running? This is set by the main
+	 * launcher process, after it has read the above launch_* parameters.
+	 */
+	bool		launcher_running;
+
+	/*
+	 * Is a worker process currently running? This is set by the worker
+	 * launcher when it starts waiting for a worker process to finish.
+ */ + int worker_pid; + + /* + * These fields indicate the target state that the launcher is currently + * working towards. They can be different from the corresponding launch_* + * fields, if a new pg_enable/disable_data_checksums() call was made while + * the launcher/worker was already running. + * + * The below members are set when the launcher starts, and are only + * accessed read-only by the single worker. Thus, we can access these + * without a lock. If multiple workers, or dynamic cost parameters, are + * supported at some point then this would need to be revisited. + */ + DataChecksumsWorkerOperation operation; + int cost_delay; + int cost_limit; + + /* + * Signaling between the launcher and the worker process. + * + * As there is only a single worker, and the launcher won't read these + * until the worker exits, they can be accessed without the need for a + * lock. If multiple workers are supported then this will have to be + * revisited. + */ + + /* result, set by worker before exiting */ + DataChecksumsWorkerResult success; + + /* + * tells the worker process whether it should also process the shared + * catalogs + */ + bool process_shared_catalogs; +} DataChecksumsStateStruct; + +/* Shared memory segment for datachecksumsworker */ +static DataChecksumsStateStruct *DataChecksumState; + +typedef struct DataChecksumsWorkerDatabase +{ + Oid dboid; + char *dbname; +} DataChecksumsWorkerDatabase; + +/* Flag set by the interrupt handler */ +static volatile sig_atomic_t abort_requested = false; + +/* + * Have we set the DataChecksumsStateStruct->launcher_running flag? + * If we have, we need to clear it before exiting! + */ +static volatile sig_atomic_t launcher_running = false; + +/* Are we enabling data checksums, or disabling them? 
*/ +static DataChecksumsWorkerOperation operation; + +/* Prototypes */ +static bool DatabaseExists(Oid dboid); +static List *BuildDatabaseList(void); +static List *BuildRelationList(bool temp_relations, bool include_shared); +static void FreeDatabaseList(List *dblist); +static DataChecksumsWorkerResult ProcessDatabase(DataChecksumsWorkerDatabase *db); +static bool ProcessAllDatabases(void); +static bool ProcessSingleRelationFork(Relation reln, ForkNumber forkNum, BufferAccessStrategy strategy); +static void launcher_cancel_handler(SIGNAL_ARGS); +static void WaitForAllTransactionsToFinish(void); + +/***************************************************************************** + * Functionality for manipulating the data checksum state in the cluster + */ + +void +EmitAndWaitDataChecksumsBarrier(uint32 state) +{ + uint64 barrier; + + switch (state) + { + case PG_DATA_CHECKSUM_INPROGRESS_ON: + barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_ON); + WaitForProcSignalBarrier(barrier); + break; + + case PG_DATA_CHECKSUM_INPROGRESS_OFF: + barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_OFF); + WaitForProcSignalBarrier(barrier); + break; + + case PG_DATA_CHECKSUM_VERSION: + barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_ON); + WaitForProcSignalBarrier(barrier); + break; + + case PG_DATA_CHECKSUM_OFF: + barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_OFF); + WaitForProcSignalBarrier(barrier); + break; + + default: + Assert(false); + } +} + +/* + * AbsorbDataChecksumsBarrier + * Generic function for absorbing data checksum state changes + * + * All procsignalbarriers regarding data checksum state changes are absorbed + * with this function. The set of conditions required for the state change to + * be accepted are listed in the checksum_barriers struct, target_state is + * used to look up the relevant entry. 
+ */ +bool +AbsorbDataChecksumsBarrier(ProcSignalBarrierType barrier) +{ + uint32 target_state; + int current = data_checksums; + bool found = false; + + /* + * Translate the barrier condition to the target state, doing it here + * instead of in the procsignal code saves the latter from knowing about + * checksum states. + */ + switch (barrier) + { + case PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_ON: + target_state = PG_DATA_CHECKSUM_INPROGRESS_ON; + break; + case PROCSIGNAL_BARRIER_CHECKSUM_ON: + target_state = PG_DATA_CHECKSUM_VERSION; + break; + case PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_OFF: + target_state = PG_DATA_CHECKSUM_INPROGRESS_OFF; + break; + case PROCSIGNAL_BARRIER_CHECKSUM_OFF: + target_state = PG_DATA_CHECKSUM_OFF; + break; + default: + elog(ERROR, "incorrect barrier \"%i\" received", barrier); + } + + /* + * If the target state matches the current state then the barrier has been + * repeated. + */ + if (current == target_state) + return true; + + /* + * If the cluster is in recovery we skip the validation of current state + * since the replay is trusted. + */ + if (RecoveryInProgress()) + { + SetLocalDataChecksumState(target_state); + return true; + } + + /* + * Find the barrier condition definition for the target state. Not finding + * a condition would be a grave programmer error as the states are a + * discrete set. + */ + for (int i = 0; i < lengthof(checksum_barriers) && !found; i++) + { + if (checksum_barriers[i].from == current && checksum_barriers[i].to == target_state) + found = true; + } + + /* + * If the relevant state criteria aren't satisfied, throw an error which + * will be caught by the procsignal machinery for a later retry. + */ + if (!found) + ereport(ERROR, + errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("incorrect data checksum state %i for target state %i", + current, target_state)); + + SetLocalDataChecksumState(target_state); + return true; +} + + +/* + * Disables data checksums for the cluster, if applicable. 
Starts a background + * worker which turns off the data checksums. + */ +Datum +disable_data_checksums(PG_FUNCTION_ARGS) +{ + if (!superuser()) + ereport(ERROR, + errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("must be superuser to change data checksum state")); + + StartDataChecksumsWorkerLauncher(DISABLE_DATACHECKSUMS, 0, 0); + PG_RETURN_VOID(); +} + +/* + * Enables data checksums for the cluster, if applicable. Supports vacuum- + * like cost based throttling to limit system load. Starts a background worker + * which updates data checksums on existing data. + */ +Datum +enable_data_checksums(PG_FUNCTION_ARGS) +{ + int cost_delay = PG_GETARG_INT32(0); + int cost_limit = PG_GETARG_INT32(1); + + if (!superuser()) + ereport(ERROR, + errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("must be superuser to change data checksum state")); + + if (cost_delay < 0) + ereport(ERROR, + errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("cost delay cannot be a negative value")); + + if (cost_limit <= 0) + ereport(ERROR, + errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("cost limit must be greater than zero")); + + StartDataChecksumsWorkerLauncher(ENABLE_DATACHECKSUMS, cost_delay, cost_limit); + + PG_RETURN_VOID(); +} + + +/***************************************************************************** + * Functionality for running the datachecksumsworker and associated launcher + */ + +/* + * StartDataChecksumsWorkerLauncher + * Main entry point for datachecksumsworker launcher process + * + * The main entrypoint for starting data checksums processing for enabling as + * well as disabling. 
+ */ +void +StartDataChecksumsWorkerLauncher(DataChecksumsWorkerOperation op, + int cost_delay, + int cost_limit) +{ + BackgroundWorker bgw; + BackgroundWorkerHandle *bgw_handle; + bool launcher_running; + DataChecksumsWorkerOperation launcher_running_op; + +#ifdef USE_ASSERT_CHECKING + /* The cost delay settings have no effect when disabling */ + if (op == DISABLE_DATACHECKSUMS) + Assert(cost_delay == 0 && cost_limit == 0); +#endif + + INJECTION_POINT("datachecksumsworker-startup-delay", NULL); + + /* Store the desired state in shared memory */ + LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE); + + DataChecksumState->launch_operation = op; + DataChecksumState->launch_cost_delay = cost_delay; + DataChecksumState->launch_cost_limit = cost_limit; + + /* Is the launcher already running? If so, what is it doing? */ + launcher_running = DataChecksumState->launcher_running; + if (launcher_running) + launcher_running_op = DataChecksumState->operation; + + LWLockRelease(DataChecksumsWorkerLock); + + /* + * Launch a new launcher process, if it's not running already. + * + * If the launcher is currently busy enabling the checksums, and we want + * them disabled (or vice versa), the launcher will notice that at latest + * when it's about to exit, and will loop back process the new request. So + * if the launcher is already running, we don't need to do anything more + * here to abort it. + * + * If you call pg_enable/disable_data_checksums() twice in a row, before + * the launcher has had a chance to start up, we still end up launching it + * twice. That's OK, the second invocation will see that a launcher is + * already running and exit quickly. + * + * TODO: We could optimize here and skip launching the launcher, if we are + * already in the desired state, i.e. if the checksums are already enabled + * and you call pg_enable_data_checksums(). + */ + if (!launcher_running) + { + /* + * Prepare the BackgroundWorker and launch it. 
+ */ + memset(&bgw, 0, sizeof(bgw)); + bgw.bgw_flags = BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION; + bgw.bgw_start_time = BgWorkerStart_RecoveryFinished; + snprintf(bgw.bgw_library_name, BGW_MAXLEN, "postgres"); + snprintf(bgw.bgw_function_name, BGW_MAXLEN, "DataChecksumsWorkerLauncherMain"); + snprintf(bgw.bgw_name, BGW_MAXLEN, "datachecksum launcher"); + snprintf(bgw.bgw_type, BGW_MAXLEN, "datachecksum launcher"); + bgw.bgw_restart_time = BGW_NEVER_RESTART; + bgw.bgw_notify_pid = MyProcPid; + bgw.bgw_main_arg = (Datum) 0; + + if (!RegisterDynamicBackgroundWorker(&bgw, &bgw_handle)) + ereport(ERROR, + errcode(ERRCODE_INSUFFICIENT_RESOURCES), + errmsg("failed to start background worker to process data checksums")); + } + else + { + if (launcher_running_op == op) + ereport(ERROR, + errmsg("data checksum processing already running")); + } +} + +/* + * ProcessSingleRelationFork + * Enable data checksums in a single relation/fork. + * + * Returns true if successful, and false if *aborted*. On error, an actual + * error is raised in the lower levels. + */ +static bool +ProcessSingleRelationFork(Relation reln, ForkNumber forkNum, BufferAccessStrategy strategy) +{ + BlockNumber numblocks = RelationGetNumberOfBlocksInFork(reln, forkNum); + char activity[NAMEDATALEN * 2 + 128]; + char *relns; + + relns = get_namespace_name(RelationGetNamespace(reln)); + + /* Report the current relation to pgstat_activity */ + snprintf(activity, sizeof(activity) - 1, "processing: %s.%s (%s, %u blocks)", + (relns ? relns : ""), RelationGetRelationName(reln), forkNames[forkNum], numblocks); + pgstat_report_activity(STATE_RUNNING, activity); + pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_BLOCKS_TOTAL, numblocks); + if (relns) + pfree(relns); + + /* + * We are looping over the blocks which existed at the time of process + * start, which is safe since new blocks are created with checksums set + * already due to the state being "inprogress-on". 
+	 */
+	for (BlockNumber blknum = 0; blknum < numblocks; blknum++)
+	{
+		Buffer		buf = ReadBufferExtended(reln, forkNum, blknum, RBM_NORMAL, strategy);
+
+		/* Need to get an exclusive lock to mark the buffer as dirty */
+		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+
+		/*
+		 * Mark the buffer as dirty and force a full page write. We have to
+		 * re-write the page to WAL even if the checksum hasn't changed,
+		 * because if there is a replica it might have a slightly different
+		 * version of the page with an invalid checksum, caused by unlogged
+		 * changes (e.g. hintbits) on the primary happening while checksums
+		 * were off. This can happen if there was a valid checksum on the page
+		 * at one point in the past, so only when checksums are first on, then
+		 * off, and then turned on again. TODO: investigate if this could be
+		 * avoided if the checksum is calculated to be correct and wal_level
+		 * is set to "minimal".
+		 */
+		START_CRIT_SECTION();
+		MarkBufferDirty(buf);
+		log_newpage_buffer(buf, false);
+		END_CRIT_SECTION();
+
+		UnlockReleaseBuffer(buf);
+
+		/*
+		 * This is the only place where we check if we are asked to abort, the
+		 * abort will bubble up from here.
+		 */
+		Assert(operation == ENABLE_DATACHECKSUMS);
+		LWLockAcquire(DataChecksumsWorkerLock, LW_SHARED);
+		if (DataChecksumState->launch_operation == DISABLE_DATACHECKSUMS)
+			abort_requested = true;
+		LWLockRelease(DataChecksumsWorkerLock);
+
+		if (abort_requested)
+			return false;
+
+		/* update the block counter */
+		pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_BLOCKS_DONE,
+									 (blknum + 1));
+
+		/*
+		 * Processing is re-using the vacuum cost delay for process
+		 * throttling, hence why we call vacuum APIs here.
+		 */
+		vacuum_delay_point(false);
+	}
+
+	return true;
+}
+
+/*
+ * ProcessSingleRelationByOid
+ *		Process a single relation based on oid.
+ *
+ * Returns true if successful, and false if *aborted*. On error, an actual
+ * error is raised in the lower levels.
+ */ +static bool +ProcessSingleRelationByOid(Oid relationId, BufferAccessStrategy strategy) +{ + Relation rel; + bool aborted = false; + + StartTransactionCommand(); + + rel = try_relation_open(relationId, AccessShareLock); + if (rel == NULL) + { + /* + * Relation no longer exists. We don't consider this an error since + * there are no pages in it that need data checksums, and thus return + * true. The worker operates off a list of relations generated at the + * start of processing, so relations being dropped in the meantime is + * to be expected. + */ + CommitTransactionCommand(); + pgstat_report_activity(STATE_IDLE, NULL); + return true; + } + RelationGetSmgr(rel); + + for (ForkNumber fnum = 0; fnum <= MAX_FORKNUM; fnum++) + { + if (smgrexists(rel->rd_smgr, fnum)) + { + if (!ProcessSingleRelationFork(rel, fnum, strategy)) + { + aborted = true; + break; + } + } + } + relation_close(rel, AccessShareLock); + + CommitTransactionCommand(); + pgstat_report_activity(STATE_IDLE, NULL); + + return !aborted; +} + +/* + * ProcessDatabase + * Enable data checksums in a single database. + * + * We do this by launching a dynamic background worker into this database, and + * waiting for it to finish. We have to do this in a separate worker, since + * each process can only be connected to one database during its lifetime. 
+ */ +static DataChecksumsWorkerResult +ProcessDatabase(DataChecksumsWorkerDatabase *db) +{ + BackgroundWorker bgw; + BackgroundWorkerHandle *bgw_handle; + BgwHandleStatus status; + pid_t pid; + char activity[NAMEDATALEN + 64]; + + DataChecksumState->success = DATACHECKSUMSWORKER_FAILED; + + memset(&bgw, 0, sizeof(bgw)); + bgw.bgw_flags = BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION; + bgw.bgw_start_time = BgWorkerStart_RecoveryFinished; + snprintf(bgw.bgw_library_name, BGW_MAXLEN, "postgres"); + snprintf(bgw.bgw_function_name, BGW_MAXLEN, "%s", "DataChecksumsWorkerMain"); + snprintf(bgw.bgw_name, BGW_MAXLEN, "datachecksum worker"); + snprintf(bgw.bgw_type, BGW_MAXLEN, "datachecksum worker"); + bgw.bgw_restart_time = BGW_NEVER_RESTART; + bgw.bgw_notify_pid = MyProcPid; + bgw.bgw_main_arg = ObjectIdGetDatum(db->dboid); + + /* + * If there are no worker slots available, there is little we can do. If + * we retry in a bit it's still unlikely that the user has managed to + * reconfigure in the meantime and we'd be run through retries fast. + */ + if (!RegisterDynamicBackgroundWorker(&bgw, &bgw_handle)) + { + ereport(WARNING, + errmsg("could not start background worker for enabling data checksums in database \"%s\"", + db->dbname), + errhint("The \"%s\" setting might be too low.", "max_worker_processes")); + return DATACHECKSUMSWORKER_FAILED; + } + + status = WaitForBackgroundWorkerStartup(bgw_handle, &pid); + if (status == BGWH_STOPPED) + { + ereport(WARNING, + errmsg("could not start background worker for enabling data checksums in database \"%s\"", + db->dbname), + errhint("More details on the error might be found in the server log.")); + + /* + * Heuristic to see if the database was dropped, and if it was we can + * treat it as not an error, else treat as fatal and error out. TODO: + * this could probably be improved with a tighter check. 
+ */ + if (DatabaseExists(db->dboid)) + return DATACHECKSUMSWORKER_FAILED; + else + return DATACHECKSUMSWORKER_DROPDB; + } + + /* + * If the postmaster crashed we cannot end up with a processed database so + * we have no alternative other than exiting. When enabling checksums we + * won't at this time have changed the data checksums state in pg_control + * to enabled so when the cluster comes back up processing will have to be + * restarted. + */ + if (status == BGWH_POSTMASTER_DIED) + ereport(FATAL, + errcode(ERRCODE_ADMIN_SHUTDOWN), + errmsg("cannot enable data checksums without the postmaster process"), + errhint("Restart the database and restart data checksum processing by calling pg_enable_data_checksums().")); + + Assert(status == BGWH_STARTED); + ereport(LOG, + errmsg("initiating data checksum processing in database \"%s\"", + db->dbname)); + + /* Save the pid of the worker so we can signal it later */ + LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE); + DataChecksumState->worker_pid = pid; + LWLockRelease(DataChecksumsWorkerLock); + + snprintf(activity, sizeof(activity) - 1, + "Waiting for worker in database %s (pid %ld)", db->dbname, (long) pid); + pgstat_report_activity(STATE_RUNNING, activity); + + status = WaitForBackgroundWorkerShutdown(bgw_handle); + if (status == BGWH_POSTMASTER_DIED) + ereport(FATAL, + errcode(ERRCODE_ADMIN_SHUTDOWN), + errmsg("postmaster exited during data checksum processing in \"%s\"", + db->dbname), + errhint("Restart the database and restart data checksum processing by calling pg_enable_data_checksums().")); + + if (DataChecksumState->success == DATACHECKSUMSWORKER_ABORTED) + ereport(LOG, + errmsg("data checksums processing was aborted in database \"%s\"", + db->dbname)); + + pgstat_report_activity(STATE_IDLE, NULL); + LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE); + DataChecksumState->worker_pid = InvalidPid; + LWLockRelease(DataChecksumsWorkerLock); + + return DataChecksumState->success; +} + +/* + * launcher_exit 
+ *
+ * Internal routine for cleaning up state when the launcher process exits. We
+ * need to clean up the abort flag to ensure that processing can be started
+ * again if it was previously aborted (note: started again, *not* restarted
+ * from where it left off).
+ */
+static void
+launcher_exit(int code, Datum arg)
+{
+	abort_requested = false;
+
+	if (launcher_running)
+	{
+		LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
+		if (DataChecksumState->worker_pid != InvalidPid)
+		{
+			ereport(LOG,
+					errmsg("data checksums launcher exiting while worker is still running, signalling worker"));
+			kill(DataChecksumState->worker_pid, SIGTERM);
+		}
+		LWLockRelease(DataChecksumsWorkerLock);
+	}
+
+	/*
+	 * If the launcher is exiting before data checksums are enabled then set
+	 * the state to off since processing cannot be resumed.
+	 */
+	if (DataChecksumsInProgressOn())
+		SetDataChecksumsOff();
+
+	LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
+	launcher_running = false;
+	DataChecksumState->launcher_running = false;
+	LWLockRelease(DataChecksumsWorkerLock);
+}
+
+/*
+ * launcher_cancel_handler
+ *
+ * Internal routine for reacting to SIGINT and flagging the worker to abort.
+ * The worker won't be interrupted immediately but will check the abort flag
+ * between each block in a relation.
+ */
+static void
+launcher_cancel_handler(SIGNAL_ARGS)
+{
+	int			save_errno = errno;
+
+	abort_requested = true;
+
+	/*
+	 * There is no sleeping in the main loop, the flag will be checked
+	 * periodically in ProcessSingleRelationFork. The worker does however
+	 * sleep when waiting for concurrent transactions to end so we still need
+	 * to set the latch.
+	 */
+	SetLatch(MyLatch);
+
+	errno = save_errno;
+}
+
+/*
+ * WaitForAllTransactionsToFinish
+ *		Blocks awaiting all current transactions to finish
+ *
+ * Returns when all transactions which are active at the call of the function
+ * have ended, or if the postmaster dies while waiting.
If the postmaster dies + * the abort flag will be set to indicate that the caller of this shouldn't + * proceed. + * + * NB: this will return early, if aborted by SIGINT or if the target state + * is changed while we're running. + */ +static void +WaitForAllTransactionsToFinish(void) +{ + TransactionId waitforxid; + + LWLockAcquire(XidGenLock, LW_SHARED); + waitforxid = XidFromFullTransactionId(TransamVariables->nextXid); + LWLockRelease(XidGenLock); + + while (TransactionIdPrecedes(GetOldestActiveTransactionId(false, true), waitforxid)) + { + char activity[64]; + int rc; + + /* Oldest running xid is older than us, so wait */ + snprintf(activity, + sizeof(activity), + "Waiting for current transactions to finish (waiting for %u)", + waitforxid); + pgstat_report_activity(STATE_RUNNING, activity); + + /* Retry every 3 seconds */ + ResetLatch(MyLatch); + rc = WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, + 3000, + WAIT_EVENT_CHECKSUM_ENABLE_STARTCONDITION); + + /* + * If the postmaster died we won't be able to enable checksums + * cluster-wide so abort and hope to continue when restarted. + */ + if (rc & WL_POSTMASTER_DEATH) + ereport(FATAL, + errcode(ERRCODE_ADMIN_SHUTDOWN), + errmsg("postmaster exited during data checksums processing"), + errhint("Data checksums processing must be restarted manually after cluster restart.")); + + CHECK_FOR_INTERRUPTS(); + + LWLockAcquire(DataChecksumsWorkerLock, LW_SHARED); + if (DataChecksumState->launch_operation != operation) + abort_requested = true; + LWLockRelease(DataChecksumsWorkerLock); + if (abort_requested) + break; + } + + pgstat_report_activity(STATE_IDLE, NULL); + return; +} + +/* + * DataChecksumsWorkerLauncherMain + * + * Main function for launching dynamic background workers for processing data + * checksums in databases. This function has the bgworker management, with + * ProcessAllDatabases being responsible for looping over the databases and + * initiating processing. 
+ */ +void +DataChecksumsWorkerLauncherMain(Datum arg) +{ + on_shmem_exit(launcher_exit, 0); + + ereport(DEBUG1, + errmsg("background worker \"datachecksums launcher\" started")); + + pqsignal(SIGTERM, die); + pqsignal(SIGINT, launcher_cancel_handler); + pqsignal(SIGUSR1, procsignal_sigusr1_handler); + pqsignal(SIGUSR2, SIG_IGN); + + BackgroundWorkerUnblockSignals(); + + MyBackendType = B_DATACHECKSUMSWORKER_LAUNCHER; + init_ps_display(NULL); + + INJECTION_POINT("datachecksumsworker-launcher-delay", NULL); + + LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE); + + if (DataChecksumState->launcher_running) + { + ereport(LOG, + errmsg("background worker \"datachecksums launcher\" already running, exiting")); + /* Launcher was already running, let it finish */ + LWLockRelease(DataChecksumsWorkerLock); + return; + } + + launcher_running = true; + + /* Initialize a connection to shared catalogs only */ + BackgroundWorkerInitializeConnectionByOid(InvalidOid, InvalidOid, 0); + + operation = DataChecksumState->launch_operation; + DataChecksumState->launcher_running = true; + DataChecksumState->operation = operation; + DataChecksumState->cost_delay = DataChecksumState->launch_cost_delay; + DataChecksumState->cost_limit = DataChecksumState->launch_cost_limit; + LWLockRelease(DataChecksumsWorkerLock); + + /* + * The target state can change while we are busy enabling/disabling + * checksums, if the user calls pg_disable/enable_data_checksums() before + * we are finished with the previous request. In that case, we will loop + * back here, to process the new request. + */ +again: + + pgstat_progress_start_command(PROGRESS_COMMAND_DATACHECKSUMS, + InvalidOid); + + if (operation == ENABLE_DATACHECKSUMS) + { + /* + * If we are asked to enable checksums in a cluster which already has + * checksums enabled, exit immediately as there is nothing more to do. 
+ */ + if (DataChecksumsNeedVerify()) + goto done; + + ereport(LOG, + errmsg("enabling data checksums requested, starting data checksum calculation")); + + /* + * Set the state to inprogress-on and wait on the procsignal barrier. + */ + pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_PHASE, + PROGRESS_DATACHECKSUMS_PHASE_ENABLING); + SetDataChecksumsOnInProgress(); + + /* + * All backends are now in inprogress-on state and are writing data + * checksums. Start processing all data at rest. + */ + if (!ProcessAllDatabases()) + { + /* + * If the target state changed during processing then it's not a + * failure, so restart processing instead. + */ + LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE); + if (DataChecksumState->launch_operation != operation) + { + LWLockRelease(DataChecksumsWorkerLock); + goto done; + } + LWLockRelease(DataChecksumsWorkerLock); + ereport(ERROR, + errcode(ERRCODE_INSUFFICIENT_RESOURCES), + errmsg("unable to enable data checksums in cluster")); + } + + /* + * Data checksums have been set on all pages, set the state to on in + * order to instruct backends to validate checksums on reading. + */ + SetDataChecksumsOn(); + + ereport(LOG, + errmsg("data checksums are now enabled")); + } + else if (operation == DISABLE_DATACHECKSUMS) + { + ereport(LOG, + errmsg("disabling data checksums requested")); + + pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_PHASE, + PROGRESS_DATACHECKSUMS_PHASE_DISABLING); + SetDataChecksumsOff(); + ereport(LOG, + errmsg("data checksums are now disabled")); + } + else + Assert(false); + +done: + + /* + * This state will only be displayed for a fleeting moment, but for the + * sake of correctness it is still added before ending the command. + */ + pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_PHASE, + PROGRESS_DATACHECKSUMS_PHASE_DONE); + + /* + * All done. But before we exit, check if the target state was changed + * while we were running. In that case we will have to start all over + * again. 
+ */ + LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE); + if (DataChecksumState->launch_operation != operation) + { + DataChecksumState->operation = DataChecksumState->launch_operation; + operation = DataChecksumState->launch_operation; + DataChecksumState->cost_delay = DataChecksumState->launch_cost_delay; + DataChecksumState->cost_limit = DataChecksumState->launch_cost_limit; + LWLockRelease(DataChecksumsWorkerLock); + goto again; + } + + /* Shut down progress reporting as we are done */ + pgstat_progress_end_command(); + + launcher_running = false; + DataChecksumState->launcher_running = false; + LWLockRelease(DataChecksumsWorkerLock); +} + +/* + * ProcessAllDatabases + * Compute the list of all databases and process checksums in each + * + * This will generate a list of databases to process for enabling checksums. + * If a database encounters a failure then processing will end immediately and + * return an error. + */ +static bool +ProcessAllDatabases(void) +{ + List *DatabaseList; + int cumulative_total = 0; + + /* Set up so first run processes shared catalogs, not once in every db */ + DataChecksumState->process_shared_catalogs = true; + + /* Get a list of all databases to process */ + WaitForAllTransactionsToFinish(); + DatabaseList = BuildDatabaseList(); + + /* + * Update progress reporting with the total number of databases we need to + * process. This number should not be changed during processing, the + * columns for processed databases is instead increased such that it can + * be compared against the total. 
+ */ + { + const int index[] = { + PROGRESS_DATACHECKSUMS_DBS_TOTAL, + PROGRESS_DATACHECKSUMS_DBS_DONE, + PROGRESS_DATACHECKSUMS_RELS_TOTAL, + PROGRESS_DATACHECKSUMS_RELS_DONE, + PROGRESS_DATACHECKSUMS_BLOCKS_TOTAL, + PROGRESS_DATACHECKSUMS_BLOCKS_DONE, + }; + + int64 vals[6]; + + vals[0] = list_length(DatabaseList); + vals[1] = 0; + /* translated to NULL */ + vals[2] = -1; + vals[3] = -1; + vals[4] = -1; + vals[5] = -1; + + pgstat_progress_update_multi_param(6, index, vals); + } + + foreach_ptr(DataChecksumsWorkerDatabase, db, DatabaseList) + { + DataChecksumsWorkerResult result; + + result = ProcessDatabase(db); + + /* Allow a test process to alter the result of the operation */ + INJECTION_POINT("datachecksumsworker-modify-db-result", &result); + + pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_DBS_DONE, + ++cumulative_total); + + if (result == DATACHECKSUMSWORKER_FAILED) + { + /* + * Disable checksums on cluster, because we failed one of the + * databases and this is an all or nothing process. + */ + SetDataChecksumsOff(); + ereport(ERROR, + errcode(ERRCODE_INSUFFICIENT_RESOURCES), + errmsg("data checksums failed to get enabled in all databases, aborting"), + errhint("The server log might have more information on the cause of the error.")); + } + else if (result == DATACHECKSUMSWORKER_ABORTED || abort_requested) + { + /* Abort flag set, so exit the whole process */ + return false; + } + + /* + * When one database has completed, it will have done shared catalogs + * so we don't have to process them again. 
+ */ + DataChecksumState->process_shared_catalogs = false; + } + + FreeDatabaseList(DatabaseList); + + pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_PHASE, + PROGRESS_DATACHECKSUMS_PHASE_WAITING_BARRIER); + return true; +} + +/* + * DataChecksumStateSize + * Compute required space for datachecksumsworker-related shared memory + */ +Size +DataChecksumsShmemSize(void) +{ + Size size; + + size = sizeof(DataChecksumsStateStruct); + size = MAXALIGN(size); + + return size; +} + +/* + * DataChecksumStateInit + * Allocate and initialize datachecksumsworker-related shared memory + */ +void +DataChecksumsShmemInit(void) +{ + bool found; + + DataChecksumState = (DataChecksumsStateStruct *) + ShmemInitStruct("DataChecksumsWorker Data", + DataChecksumsShmemSize(), + &found); + if (!found) + MemSet(DataChecksumState, 0, DataChecksumsShmemSize()); +} + +/* + * DatabaseExists + * + * Scans the system catalog to check if a database with the given Oid exist + * and returns true if it is found, else false. + */ +static bool +DatabaseExists(Oid dboid) +{ + Relation rel; + ScanKeyData skey; + SysScanDesc scan; + bool found; + HeapTuple tuple; + + StartTransactionCommand(); + + rel = table_open(DatabaseRelationId, AccessShareLock); + ScanKeyInit(&skey, + Anum_pg_database_oid, + BTEqualStrategyNumber, F_OIDEQ, + dboid); + scan = systable_beginscan(rel, DatabaseOidIndexId, true, SnapshotSelf, + 1, &skey); + tuple = systable_getnext(scan); + found = HeapTupleIsValid(tuple); + + systable_endscan(scan); + table_close(rel, AccessShareLock); + + CommitTransactionCommand(); + + return found; +} + +/* + * BuildDatabaseList + * Compile a list of all currently available databases in the cluster + * + * This creates the list of databases for the datachecksumsworker workers to + * add checksums to. If the caller wants to ensure that no concurrently + * running CREATE DATABASE calls exist, this needs to be preceded by a call + * to WaitForAllTransactionsToFinish(). 
+ */ +static List * +BuildDatabaseList(void) +{ + List *DatabaseList = NIL; + Relation rel; + TableScanDesc scan; + HeapTuple tup; + MemoryContext ctx = CurrentMemoryContext; + MemoryContext oldctx; + + StartTransactionCommand(); + + rel = table_open(DatabaseRelationId, AccessShareLock); + scan = table_beginscan_catalog(rel, 0, NULL); + + while (HeapTupleIsValid(tup = heap_getnext(scan, ForwardScanDirection))) + { + Form_pg_database pgdb = (Form_pg_database) GETSTRUCT(tup); + DataChecksumsWorkerDatabase *db; + + oldctx = MemoryContextSwitchTo(ctx); + + db = (DataChecksumsWorkerDatabase *) palloc0(sizeof(DataChecksumsWorkerDatabase)); + + db->dboid = pgdb->oid; + db->dbname = pstrdup(NameStr(pgdb->datname)); + + DatabaseList = lappend(DatabaseList, db); + + MemoryContextSwitchTo(oldctx); + } + + table_endscan(scan); + table_close(rel, AccessShareLock); + + CommitTransactionCommand(); + + return DatabaseList; +} + +static void +FreeDatabaseList(List *dblist) +{ + if (!dblist) + return; + + foreach_ptr(DataChecksumsWorkerDatabase, db, dblist) + { + if (db->dbname != NULL) + pfree(db->dbname); + } + + list_free_deep(dblist); +} + +/* + * BuildRelationList + * Compile a list of relations in the database + * + * Returns a list of OIDs for the request relation types. If temp_relations + * is True then only temporary relations are returned. If temp_relations is + * False then non-temporary relations which have data checksums are returned. + * If include_shared is True then shared relations are included as well in a + * non-temporary list. include_shared has no relevance when building a list of + * temporary relations. 
+ */ +static List * +BuildRelationList(bool temp_relations, bool include_shared) +{ + List *RelationList = NIL; + Relation rel; + TableScanDesc scan; + HeapTuple tup; + MemoryContext ctx = CurrentMemoryContext; + MemoryContext oldctx; + + StartTransactionCommand(); + + rel = table_open(RelationRelationId, AccessShareLock); + scan = table_beginscan_catalog(rel, 0, NULL); + + while (HeapTupleIsValid(tup = heap_getnext(scan, ForwardScanDirection))) + { + Form_pg_class pgc = (Form_pg_class) GETSTRUCT(tup); + + /* Only include temporary relations when explicitly asked to */ + if (pgc->relpersistence == RELPERSISTENCE_TEMP) + { + if (!temp_relations) + continue; + } + else + { + /* + * If we are only interested in temp relations then continue + * immediately as the current relation isn't a temp relation. + */ + if (temp_relations) + continue; + + if (!RELKIND_HAS_STORAGE(pgc->relkind)) + continue; + + if (pgc->relisshared && !include_shared) + continue; + } + + oldctx = MemoryContextSwitchTo(ctx); + RelationList = lappend_oid(RelationList, pgc->oid); + MemoryContextSwitchTo(oldctx); + } + + table_endscan(scan); + table_close(rel, AccessShareLock); + + CommitTransactionCommand(); + + return RelationList; +} + +/* + * DataChecksumsWorkerMain + * + * Main function for enabling checksums in a single database, This is the + * function set as the bgw_function_name in the dynamic background worker + * process initiated for each database by the worker launcher. After enabling + * data checksums in each applicable relation in the database, it will wait for + * all temporary relations that were present when the function started to + * disappear before returning. This is required since we cannot rewrite + * existing temporary relations with data checksums. 
+ */ +void +DataChecksumsWorkerMain(Datum arg) +{ + Oid dboid = DatumGetObjectId(arg); + List *RelationList = NIL; + List *InitialTempTableList = NIL; + BufferAccessStrategy strategy; + bool aborted = false; + int64 rels_done; + + operation = ENABLE_DATACHECKSUMS; + + pqsignal(SIGTERM, die); + pqsignal(SIGUSR1, procsignal_sigusr1_handler); + + BackgroundWorkerUnblockSignals(); + + MyBackendType = B_DATACHECKSUMSWORKER_WORKER; + init_ps_display(NULL); + + BackgroundWorkerInitializeConnectionByOid(dboid, InvalidOid, + BGWORKER_BYPASS_ALLOWCONN); + + /* worker will have a separate entry in pg_stat_progress_data_checksums */ + pgstat_progress_start_command(PROGRESS_COMMAND_DATACHECKSUMS, + InvalidOid); + + /* + * Get a list of all temp tables present as we start in this database. We + * need to wait until they are all gone until we are done, since we cannot + * access these relations and modify them. + */ + InitialTempTableList = BuildRelationList(true, false); + + /* + * Enable vacuum cost delay, if any. While this process isn't doing any + * vacuuming, we are re-using the infrastructure that vacuum cost delay + * provides rather than inventing something bespoke. This is an internal + * implementation detail and care should be taken to avoid it bleeding + * through to the user to avoid confusion. + */ + Assert(DataChecksumState->operation == ENABLE_DATACHECKSUMS); + VacuumCostDelay = DataChecksumState->cost_delay; + VacuumCostLimit = DataChecksumState->cost_limit; + VacuumCostActive = (VacuumCostDelay > 0); + VacuumCostBalance = 0; + VacuumCostPageHit = 0; + VacuumCostPageMiss = 0; + VacuumCostPageDirty = 0; + + /* + * Create and set the vacuum strategy as our buffer strategy. + */ + strategy = GetAccessStrategy(BAS_VACUUM); + + RelationList = BuildRelationList(false, + DataChecksumState->process_shared_catalogs); + + /* Update the total number of relations to be processed in this DB. 
*/ + { + const int index[] = { + PROGRESS_DATACHECKSUMS_RELS_TOTAL, + PROGRESS_DATACHECKSUMS_RELS_DONE + }; + + int64 vals[2]; + + vals[0] = list_length(RelationList); + vals[1] = 0; + + pgstat_progress_update_multi_param(2, index, vals); + } + + /* Process the relations */ + rels_done = 0; + foreach_oid(reloid, RelationList) + { + CHECK_FOR_INTERRUPTS(); + + if (!ProcessSingleRelationByOid(reloid, strategy)) + { + aborted = true; + break; + } + + pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_RELS_DONE, + ++rels_done); + } + list_free(RelationList); + + if (aborted) + { + DataChecksumState->success = DATACHECKSUMSWORKER_ABORTED; + ereport(DEBUG1, + errmsg("data checksum processing aborted in database OID %u", + dboid)); + return; + } + + /* The worker is about to wait for temporary tables to go away. */ + pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_PHASE, + PROGRESS_DATACHECKSUMS_PHASE_WAITING_TEMPREL); + + /* + * Wait for all temp tables that existed when we started to go away. This + * is necessary since we cannot "reach" them to enable checksums. Any temp + * tables created after we started will already have checksums in them + * (due to the "inprogress-on" state), so no need to wait for those. + */ + for (;;) + { + List *CurrentTempTables; + int numleft; + char activity[64]; + + CurrentTempTables = BuildRelationList(true, false); + numleft = 0; + foreach_oid(tmptbloid, InitialTempTableList) + { + if (list_member_oid(CurrentTempTables, tmptbloid)) + numleft++; + } + list_free(CurrentTempTables); + + INJECTION_POINT("datachecksumsworker-fake-temptable-wait", &numleft); + + if (numleft == 0) + break; + + /* + * At least one temp table is left to wait for, indicate in pgstat + * activity and progress reporting. 
+ */ + snprintf(activity, + sizeof(activity), + "Waiting for %d temp tables to be removed", numleft); + pgstat_report_activity(STATE_RUNNING, activity); + + /* Retry every 3 seconds */ + ResetLatch(MyLatch); + (void) WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + 3000, + WAIT_EVENT_CHECKSUM_ENABLE_TEMPTABLE_WAIT); + + LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE); + aborted = DataChecksumState->launch_operation != operation; + LWLockRelease(DataChecksumsWorkerLock); + + CHECK_FOR_INTERRUPTS(); + + if (aborted || abort_requested) + { + DataChecksumState->success = DATACHECKSUMSWORKER_ABORTED; + ereport(LOG, + errmsg("data checksum processing aborted in database OID %u", + dboid)); + return; + } + } + + list_free(InitialTempTableList); + + /* worker done */ + pgstat_progress_end_command(); + + DataChecksumState->success = DATACHECKSUMSWORKER_SUCCESSFUL; +} diff --git a/src/backend/postmaster/meson.build b/src/backend/postmaster/meson.build index e1f70726604..6cba23bbeef 100644 --- a/src/backend/postmaster/meson.build +++ b/src/backend/postmaster/meson.build @@ -6,6 +6,7 @@ backend_sources += files( 'bgworker.c', 'bgwriter.c', 'checkpointer.c', + 'datachecksum_state.c', 'fork_process.c', 'interrupt.c', 'launch_backend.c', diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index abf0c97569e..eb4f3eb72d4 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -2991,6 +2991,11 @@ PostmasterStateMachine(void) B_INVALID, B_STANDALONE_BACKEND); + /* also add data checksums processes */ + remainMask = btmask_add(remainMask, + B_DATACHECKSUMSWORKER_LAUNCHER, + B_DATACHECKSUMSWORKER_WORKER); + /* All types should be included in targetMask or remainMask */ Assert((remainMask.mask | targetMask.mask) == BTYPE_MASK_ALL.mask); } diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c index 3c027bcb2f7..57aaef57c61 100644 --- 
a/src/backend/replication/logical/decode.c +++ b/src/backend/replication/logical/decode.c @@ -189,6 +189,22 @@ xlog_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) } } +void +xlog2_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + uint8 info = XLogRecGetInfo(buf->record) & ~XLR_INFO_MASK; + + ReorderBufferProcessXid(ctx->reorder, XLogRecGetXid(buf->record), buf->origptr); + + switch (info) + { + case XLOG2_CHECKSUMS: + break; + default: + elog(ERROR, "unexpected RM_XLOG2_ID record type: %u", info); + } +} + /* * Handle rmgr XACT_ID records for LogicalDecodingProcessRecord(). */ diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 5c64570020d..3cc0b0bdd92 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -8567,6 +8567,13 @@ buffer_readv_complete_one(PgAioTargetData *td, uint8 buf_off, Buffer buffer, if (flags & READ_BUFFERS_IGNORE_CHECKSUM_FAILURES) piv_flags |= PIV_IGNORE_CHECKSUM_FAILURE; + /* + * If the buffers are marked for zero on error, we want to log that in + * case of a checksum failure. + */ + if (flags & READ_BUFFERS_ZERO_ON_ERROR) + piv_flags |= PIV_ZERO_BUFFERS_ON_ERROR; + /* Check for garbage data. 
*/ if (!failed) { diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index d692d419846..7aab5da3386 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -31,6 +31,7 @@ #include "postmaster/autovacuum.h" #include "postmaster/bgworker_internals.h" #include "postmaster/bgwriter.h" +#include "postmaster/datachecksum_state.h" #include "postmaster/walsummarizer.h" #include "replication/logicallauncher.h" #include "replication/origin.h" @@ -142,6 +143,7 @@ CalculateShmemSize(void) size = add_size(size, AioShmemSize()); size = add_size(size, WaitLSNShmemSize()); size = add_size(size, LogicalDecodingCtlShmemSize()); + size = add_size(size, DataChecksumsShmemSize()); /* include additional requested shmem from preload libraries */ size = add_size(size, total_addin_request); @@ -310,6 +312,7 @@ CreateOrAttachShmemStructs(void) PgArchShmemInit(); ApplyLauncherShmemInit(); SlotSyncShmemInit(); + DataChecksumsShmemInit(); /* * Set up other modules that need some shared memory space diff --git a/src/backend/storage/ipc/procsignal.c b/src/backend/storage/ipc/procsignal.c index 7e017c8d53b..f1ab3aa3fe0 100644 --- a/src/backend/storage/ipc/procsignal.c +++ b/src/backend/storage/ipc/procsignal.c @@ -22,6 +22,7 @@ #include "miscadmin.h" #include "pgstat.h" #include "port/pg_bitutils.h" +#include "postmaster/datachecksum_state.h" #include "replication/logicalctl.h" #include "replication/logicalworker.h" #include "replication/walsender.h" @@ -582,6 +583,13 @@ ProcessProcSignalBarrier(void) case PROCSIGNAL_BARRIER_UPDATE_XLOG_LOGICAL_INFO: processed = ProcessBarrierUpdateXLogLogicalInfo(); break; + + case PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_ON: + case PROCSIGNAL_BARRIER_CHECKSUM_ON: + case PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_OFF: + case PROCSIGNAL_BARRIER_CHECKSUM_OFF: + processed = AbsorbDataChecksumsBarrier(type); + break; } /* diff --git a/src/backend/storage/page/README b/src/backend/storage/page/README index 
e30d7ac59ad..73c36a63908 100644 --- a/src/backend/storage/page/README +++ b/src/backend/storage/page/README @@ -10,7 +10,9 @@ http://www.cs.toronto.edu/~bianca/papers/sigmetrics09.pdf, discussed 2010/12/22 on -hackers list. Current implementation requires this be enabled system-wide at initdb time, or -by using the pg_checksums tool on an offline cluster. +by using the pg_checksums tool on an offline cluster. Checksums can also be +enabled at runtime using pg_enable_data_checksums(), and disabled by using +pg_disable_data_checksums(). The checksum is not valid at all times on a data page!! The checksum is valid when the page leaves the shared pool and is checked diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c index 56f1f7ae9fc..1fdfda59edd 100644 --- a/src/backend/storage/page/bufpage.c +++ b/src/backend/storage/page/bufpage.c @@ -107,7 +107,15 @@ PageIsVerified(PageData *page, BlockNumber blkno, int flags, bool *checksum_fail */ if (!PageIsNew(page)) { - if (DataChecksumsEnabled()) + /* + * There shouldn't be any check for interrupt calls happening in this + * codepath, but just to be on the safe side we hold interrupts since + * if they did happen the data checksum state could change during + * verifying checksums, which could lead to incorrect verification + * results. + */ + HOLD_INTERRUPTS(); + if (DataChecksumsNeedVerify()) { checksum = pg_checksum_page(page, blkno); @@ -118,6 +126,7 @@ PageIsVerified(PageData *page, BlockNumber blkno, int flags, bool *checksum_fail *checksum_failure_p = true; } } + RESUME_INTERRUPTS(); /* * The following checks don't prove the header is correct, only that @@ -151,8 +160,9 @@ PageIsVerified(PageData *page, BlockNumber blkno, int flags, bool *checksum_fail if ((flags & (PIV_LOG_WARNING | PIV_LOG_LOG)) != 0) ereport(flags & PIV_LOG_WARNING ? 
WARNING : LOG, (errcode(ERRCODE_DATA_CORRUPTED), - errmsg("page verification failed, calculated checksum %u but expected %u", - checksum, p->pd_checksum))); + errmsg("page verification failed, calculated checksum %u but expected %u%s", + checksum, p->pd_checksum, + (flags & PIV_ZERO_BUFFERS_ON_ERROR ? ", buffer will be zeroed" : "")))); if (header_sane && (flags & PIV_IGNORE_CHECKSUM_FAILURE)) return true; @@ -1507,9 +1517,14 @@ PageIndexTupleOverwrite(Page page, OffsetNumber offnum, void PageSetChecksum(Page page, BlockNumber blkno) { + HOLD_INTERRUPTS(); /* If we don't need a checksum, just return */ - if (PageIsNew(page) || !DataChecksumsEnabled()) + if (PageIsNew(page) || !DataChecksumsNeedWrite()) + { + RESUME_INTERRUPTS(); return; + } ((PageHeader) page)->pd_checksum = pg_checksum_page(page, blkno); + RESUME_INTERRUPTS(); } diff --git a/src/backend/utils/activity/pgstat_backend.c b/src/backend/utils/activity/pgstat_backend.c index 7727fed3bda..04fe13e64c6 100644 --- a/src/backend/utils/activity/pgstat_backend.c +++ b/src/backend/utils/activity/pgstat_backend.c @@ -380,6 +380,8 @@ pgstat_tracks_backend_bktype(BackendType bktype) case B_CHECKPOINTER: case B_IO_WORKER: case B_STARTUP: + case B_DATACHECKSUMSWORKER_LAUNCHER: + case B_DATACHECKSUMSWORKER_WORKER: return false; case B_AUTOVAC_WORKER: diff --git a/src/backend/utils/activity/pgstat_io.c b/src/backend/utils/activity/pgstat_io.c index 28de24538dc..2be26e92283 100644 --- a/src/backend/utils/activity/pgstat_io.c +++ b/src/backend/utils/activity/pgstat_io.c @@ -362,6 +362,8 @@ pgstat_tracks_io_bktype(BackendType bktype) case B_LOGGER: return false; + case B_DATACHECKSUMSWORKER_LAUNCHER: + case B_DATACHECKSUMSWORKER_WORKER: case B_AUTOVAC_LAUNCHER: case B_AUTOVAC_WORKER: case B_BACKEND: diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt index 6be80d2daad..0a6d16f8154 100644 --- a/src/backend/utils/activity/wait_event_names.txt +++ 
b/src/backend/utils/activity/wait_event_names.txt @@ -119,6 +119,8 @@ CHECKPOINT_DELAY_COMPLETE "Waiting for a backend that blocks a checkpoint from c CHECKPOINT_DELAY_START "Waiting for a backend that blocks a checkpoint from starting." CHECKPOINT_DONE "Waiting for a checkpoint to complete." CHECKPOINT_START "Waiting for a checkpoint to start." +CHECKSUM_ENABLE_STARTCONDITION "Waiting for data checksums enabling to start." +CHECKSUM_ENABLE_TEMPTABLE_WAIT "Waiting for temporary tables to be dropped for data checksums to be enabled." EXECUTE_GATHER "Waiting for activity from a child process while executing a Gather plan node." HASH_BATCH_ALLOCATE "Waiting for an elected Parallel Hash participant to allocate a hash table." HASH_BATCH_ELECT "Waiting to elect a Parallel Hash participant to allocate a hash table." @@ -365,6 +367,7 @@ SerialControl "Waiting to read or update shared pg_serial s AioWorkerSubmissionQueue "Waiting to access AIO worker submission queue." WaitLSN "Waiting to read or update shared Wait-for-LSN state." LogicalDecodingControl "Waiting to read or update logical decoding status information." +DataChecksumsWorker "Waiting for data checksums worker." 
# # END OF PREDEFINED LWLOCKS (DO NOT CHANGE THIS LINE) diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c index 9185a8e6b83..1408de387ea 100644 --- a/src/backend/utils/adt/pgstatfuncs.c +++ b/src/backend/utils/adt/pgstatfuncs.c @@ -297,6 +297,8 @@ pg_stat_get_progress_info(PG_FUNCTION_ARGS) cmdtype = PROGRESS_COMMAND_BASEBACKUP; else if (pg_strcasecmp(cmd, "COPY") == 0) cmdtype = PROGRESS_COMMAND_COPY; + else if (pg_strcasecmp(cmd, "DATACHECKSUMS") == 0) + cmdtype = PROGRESS_COMMAND_DATACHECKSUMS; else ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), @@ -1182,9 +1184,6 @@ pg_stat_get_db_checksum_failures(PG_FUNCTION_ARGS) int64 result; PgStat_StatDBEntry *dbentry; - if (!DataChecksumsEnabled()) - PG_RETURN_NULL(); - if ((dbentry = pgstat_fetch_stat_dbentry(dbid)) == NULL) result = 0; else @@ -1200,9 +1199,6 @@ pg_stat_get_db_checksum_last_failure(PG_FUNCTION_ARGS) TimestampTz result; PgStat_StatDBEntry *dbentry; - if (!DataChecksumsEnabled()) - PG_RETURN_NULL(); - if ((dbentry = pgstat_fetch_stat_dbentry(dbid)) == NULL) result = 0; else diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c index ba191977697..7ffc808073a 100644 --- a/src/backend/utils/init/miscinit.c +++ b/src/backend/utils/init/miscinit.c @@ -845,7 +845,8 @@ InitializeSessionUserIdStandalone(void) * workers, in slot sync worker and in background workers. 
*/ Assert(!IsUnderPostmaster || AmAutoVacuumWorkerProcess() || - AmLogicalSlotSyncWorkerProcess() || AmBackgroundWorkerProcess()); + AmLogicalSlotSyncWorkerProcess() || AmBackgroundWorkerProcess() || + AmDataChecksumsWorkerProcess()); /* call only once */ Assert(!OidIsValid(AuthenticatedUserId)); diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index 783a7400464..6f074013aa9 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -758,6 +758,24 @@ InitPostgres(const char *in_dbname, Oid dboid, ProcSignalInit(MyCancelKey, MyCancelKeyLength); + /* + * Initialize a local cache of the data_checksum_version, to be updated by + * the procsignal-based barriers. + * + * This intentionally happens after initializing the procsignal, otherwise + * we might miss a state change. This means we can get a barrier for the + * state we've just initialized. + * + * The postmaster (which is what gets forked into the new child process) + * does not handle barriers, therefore it may not have the current value + * of LocalDataChecksumVersion value (it'll have the value read from the + * control file, which may be arbitrarily old). + * + * NB: Even if the postmaster handled barriers, the value might still be + * stale, as it might have changed after this process forked. + */ + InitLocalDataChecksumState(); + /* * Also set up timeout handlers needed for backend operation. We need * these in every case except bootstrap. @@ -886,7 +904,7 @@ InitPostgres(const char *in_dbname, Oid dboid, errhint("You should immediately run CREATE USER \"%s\" SUPERUSER;.", username != NULL ? 
username : "postgres"))); } - else if (AmBackgroundWorkerProcess()) + else if (AmBackgroundWorkerProcess() || AmDataChecksumsWorkerProcess()) { if (username == NULL && !OidIsValid(useroid)) { diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat index fc0900efe5f..a315c4ab8ab 100644 --- a/src/backend/utils/misc/guc_parameters.dat +++ b/src/backend/utils/misc/guc_parameters.dat @@ -571,11 +571,12 @@ max => '1.0', }, -{ name => 'data_checksums', type => 'bool', context => 'PGC_INTERNAL', group => 'PRESET_OPTIONS', +{ name => 'data_checksums', type => 'enum', context => 'PGC_INTERNAL', group => 'PRESET_OPTIONS', short_desc => 'Shows whether data checksums are turned on for this cluster.', flags => 'GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE | GUC_RUNTIME_COMPUTED', variable => 'data_checksums', - boot_val => 'false', + boot_val => 'PG_DATA_CHECKSUM_OFF', + options => 'data_checksums_options', }, # Can't be set by ALTER SYSTEM as it can lead to recursive definition diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index 1e14b7b4af0..d9ca13baff9 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -501,6 +501,14 @@ static const struct config_enum_entry file_extend_method_options[] = { {NULL, 0, false} }; +static const struct config_enum_entry data_checksums_options[] = { + {"on", PG_DATA_CHECKSUM_VERSION, true}, + {"off", PG_DATA_CHECKSUM_OFF, true}, + {"inprogress-on", PG_DATA_CHECKSUM_INPROGRESS_ON, true}, + {"inprogress-off", PG_DATA_CHECKSUM_INPROGRESS_OFF, true}, + {NULL, 0, false} +}; + /* * Options for enum values stored in other modules */ @@ -629,7 +637,6 @@ static int shared_memory_size_in_huge_pages; static int wal_block_size; static int num_os_semaphores; static int effective_wal_level = WAL_LEVEL_REPLICA; -static bool data_checksums; static bool integer_datetimes; #ifdef USE_ASSERT_CHECKING diff --git 
a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index c8194c27aa7..6d0337853e0 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -543,11 +543,11 @@ # archiver autovacuum # backend bgworker # bgwriter checkpointer - # ioworker postmaster - # slotsyncworker startup - # syslogger walreceiver - # walsummarizer walwriter - # walsender + # checksums ioworker + # postmaster slotsyncworker + # startup syslogger + # walreceiver walsummarizer + # walwriter walsender # # Level values in order of decreasing # detail: diff --git a/src/bin/pg_checksums/pg_checksums.c b/src/bin/pg_checksums/pg_checksums.c index 301e256fbb1..2a38f1d688b 100644 --- a/src/bin/pg_checksums/pg_checksums.c +++ b/src/bin/pg_checksums/pg_checksums.c @@ -585,7 +585,7 @@ main(int argc, char *argv[]) ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY) pg_fatal("cluster must be shut down"); - if (ControlFile->data_checksum_version == 0 && + if (ControlFile->data_checksum_version != PG_DATA_CHECKSUM_VERSION && mode == PG_MODE_CHECK) pg_fatal("data checksums are not enabled in cluster"); @@ -593,7 +593,7 @@ main(int argc, char *argv[]) mode == PG_MODE_DISABLE) pg_fatal("data checksums are already disabled in cluster"); - if (ControlFile->data_checksum_version > 0 && + if (ControlFile->data_checksum_version == PG_DATA_CHECKSUM_VERSION && mode == PG_MODE_ENABLE) pg_fatal("data checksums are already enabled in cluster"); diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c index a4060309ae0..fe5fc5ec133 100644 --- a/src/bin/pg_controldata/pg_controldata.c +++ b/src/bin/pg_controldata/pg_controldata.c @@ -287,6 +287,8 @@ main(int argc, char *argv[]) ControlFile->checkPointCopy.oldestCommitTsXid); printf(_("Latest checkpoint's newestCommitTsXid:%u\n"), ControlFile->checkPointCopy.newestCommitTsXid); + printf(_("Latest checkpoint's data_checksum_version:%u\n"), + 
ControlFile->checkPointCopy.dataChecksumState); printf(_("Time of latest checkpoint: %s\n"), ckpttime_str); printf(_("Fake LSN counter for unlogged rels: %X/%08X\n"), diff --git a/src/bin/pg_upgrade/controldata.c b/src/bin/pg_upgrade/controldata.c index aa6e8b4de5d..79053d22dcc 100644 --- a/src/bin/pg_upgrade/controldata.c +++ b/src/bin/pg_upgrade/controldata.c @@ -15,6 +15,7 @@ #include "access/xlog_internal.h" #include "common/string.h" #include "pg_upgrade.h" +#include "storage/checksum.h" /* @@ -736,6 +737,14 @@ check_control_data(ControlData *oldctrl, * check_for_isn_and_int8_passing_mismatch(). */ + /* + * If data checksums are in any in-progress state then disallow the + * upgrade. The user should either let the process finish, or turn off + * data checksums, before retrying. + */ + if (oldctrl->data_checksum_version > PG_DATA_CHECKSUM_VERSION) + pg_fatal("checksums are being enabled in the old cluster"); + /* * We might eventually allow upgrades from checksum to no-checksum * clusters. 
diff --git a/src/bin/pg_waldump/t/001_basic.pl b/src/bin/pg_waldump/t/001_basic.pl index a268f0f1dd0..7dd1c3dd63e 100644 --- a/src/bin/pg_waldump/t/001_basic.pl +++ b/src/bin/pg_waldump/t/001_basic.pl @@ -79,7 +79,8 @@ BRIN CommitTs ReplicationOrigin Generic -LogicalMessage$/, +LogicalMessage +XLOG2$/, 'rmgr list'); diff --git a/src/include/access/rmgrlist.h b/src/include/access/rmgrlist.h index 3352b5f8532..ae32ef16d67 100644 --- a/src/include/access/rmgrlist.h +++ b/src/include/access/rmgrlist.h @@ -47,3 +47,4 @@ PG_RMGR(RM_COMMIT_TS_ID, "CommitTs", commit_ts_redo, commit_ts_desc, commit_ts_i PG_RMGR(RM_REPLORIGIN_ID, "ReplicationOrigin", replorigin_redo, replorigin_desc, replorigin_identify, NULL, NULL, NULL, NULL) PG_RMGR(RM_GENERIC_ID, "Generic", generic_redo, generic_desc, generic_identify, NULL, NULL, generic_mask, NULL) PG_RMGR(RM_LOGICALMSG_ID, "LogicalMessage", logicalmsg_redo, logicalmsg_desc, logicalmsg_identify, NULL, NULL, NULL, logicalmsg_decode) +PG_RMGR(RM_XLOG2_ID, "XLOG2", xlog2_redo, xlog2_desc, xlog2_identify, NULL, NULL, NULL, xlog2_decode) diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index dcc12eb8cbe..4af38e74ce4 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -57,6 +57,7 @@ extern PGDLLIMPORT int CommitDelay; extern PGDLLIMPORT int CommitSiblings; extern PGDLLIMPORT bool track_wal_io_timing; extern PGDLLIMPORT int wal_decode_buffer_size; +extern PGDLLIMPORT int data_checksums; extern PGDLLIMPORT int CheckPointSegments; @@ -119,7 +120,7 @@ extern PGDLLIMPORT bool XLogLogicalInfo; * of the bits make it to disk, but the checksum wouldn't match. Also WAL-log * them if forced by wal_log_hints=on. */ -#define XLogHintBitIsNeeded() (DataChecksumsEnabled() || wal_log_hints) +#define XLogHintBitIsNeeded() (wal_log_hints || DataChecksumsNeedWrite()) /* Do we need to WAL-log information required only for Hot Standby and logical replication? 
*/ #define XLogStandbyInfoActive() (wal_level >= WAL_LEVEL_REPLICA) @@ -229,8 +230,11 @@ extern void XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn); extern XLogRecPtr XLogGetReplicationSlotMinimumLSN(void); extern void xlog_redo(struct XLogReaderState *record); +extern void xlog2_redo(struct XLogReaderState *record); extern void xlog_desc(StringInfo buf, struct XLogReaderState *record); +extern void xlog2_desc(StringInfo buf, struct XLogReaderState *record); extern const char *xlog_identify(uint8 info); +extern const char *xlog2_identify(uint8 info); extern void issue_xlog_fsync(int fd, XLogSegNo segno, TimeLineID tli); @@ -243,7 +247,16 @@ extern XLogRecPtr GetXLogWriteRecPtr(void); extern uint64 GetSystemIdentifier(void); extern char *GetMockAuthenticationNonce(void); -extern bool DataChecksumsEnabled(void); +extern bool DataChecksumsNeedWrite(void); +extern bool DataChecksumsNeedVerify(void); +extern bool DataChecksumsInProgressOn(void); +extern void SetDataChecksumsOnInProgress(void); +extern void SetDataChecksumsOn(void); +extern void SetDataChecksumsOff(void); +extern const char *show_data_checksums(void); +extern const char *get_checksum_state_string(uint32 state); +extern void InitLocalDataChecksumState(void); +extern void SetLocalDataChecksumState(uint32 data_checksum_version); extern bool GetDefaultCharSignedness(void); extern XLogRecPtr GetFakeLSNForUnloggedRel(void); extern Size XLOGShmemSize(void); diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h index 755835d63bf..10c18d39ff8 100644 --- a/src/include/access/xlog_internal.h +++ b/src/include/access/xlog_internal.h @@ -25,6 +25,7 @@ #include "lib/stringinfo.h" #include "pgtime.h" #include "storage/block.h" +#include "storage/checksum.h" #include "storage/relfilelocator.h" @@ -287,6 +288,12 @@ typedef struct xl_restore_point char rp_name[MAXFNAMELEN]; } xl_restore_point; +/* Information logged when data checksum level is changed */ +typedef struct xl_checksum_state 
+{ + ChecksumStateType new_checksum_state; +} xl_checksum_state; + /* Overwrite of prior contrecord */ typedef struct xl_overwrite_contrecord { @@ -307,6 +314,7 @@ typedef struct xl_end_of_recovery typedef struct xl_checkpoint_redo { int wal_level; + uint32 data_checksum_version; } xl_checkpoint_redo; /* diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index b1c5afc15df..582bb2e2058 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -57,6 +57,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 202604021 +#define CATALOG_VERSION_NO 202604031 #endif diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h index 77a661e818b..80b3a730e03 100644 --- a/src/include/catalog/pg_control.h +++ b/src/include/catalog/pg_control.h @@ -22,7 +22,7 @@ /* Version identifier for this pg_control format */ -#define PG_CONTROL_VERSION 1901 +#define PG_CONTROL_VERSION 1902 /* Nonce key length, see below */ #define MOCK_AUTH_NONCE_LEN 32 @@ -63,6 +63,9 @@ typedef struct CheckPoint * set to InvalidTransactionId. */ TransactionId oldestActiveXid; + + /* data checksums state at the time of the checkpoint */ + uint32 dataChecksumState; } CheckPoint; /* XLOG info values for XLOG rmgr */ @@ -83,6 +86,9 @@ typedef struct CheckPoint #define XLOG_CHECKPOINT_REDO 0xE0 #define XLOG_LOGICAL_DECODING_STATUS_CHANGE 0xF0 +/* XLOG info values for XLOG2 rmgr */ +#define XLOG2_CHECKSUMS 0x00 + /* * System status indicator. 
Note this is stored in pg_control; if you change diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index acf16254b21..bd177aebfcb 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -12558,6 +12558,20 @@ proname => 'jsonb_subscript_handler', prorettype => 'internal', proargtypes => 'internal', prosrc => 'jsonb_subscript_handler' }, +# data checksum management functions +{ oid => '9258', + descr => 'disable data checksums', + proname => 'pg_disable_data_checksums', provolatile => 'v', prorettype => 'void', + proparallel => 'r', prosrc => 'disable_data_checksums', proargtypes => '', + proacl => '{POSTGRES=X}'}, +{ oid => '9257', + descr => 'enable data checksums', + proname => 'pg_enable_data_checksums', provolatile => 'v', prorettype => 'void', + proparallel => 'r', proargtypes => 'int4 int4', proallargtypes => '{int4,int4}', + proargmodes => '{i,i}', proargnames => '{cost_delay,cost_limit}', + proargdefaults => '{0,100}', prosrc => 'enable_data_checksums', + proacl => '{POSTGRES=X}'}, + # collation management functions { oid => '3445', descr => 'import collations from operating system', proname => 'pg_import_system_collations', procost => '100', diff --git a/src/include/commands/progress.h b/src/include/commands/progress.h index 9c40772706c..67948667a97 100644 --- a/src/include/commands/progress.h +++ b/src/include/commands/progress.h @@ -185,4 +185,20 @@ #define PROGRESS_COPY_TYPE_PIPE 3 #define PROGRESS_COPY_TYPE_CALLBACK 4 +/* Progress parameters for PROGRESS_DATACHECKSUMS */ +#define PROGRESS_DATACHECKSUMS_PHASE 0 +#define PROGRESS_DATACHECKSUMS_DBS_TOTAL 1 +#define PROGRESS_DATACHECKSUMS_DBS_DONE 2 +#define PROGRESS_DATACHECKSUMS_RELS_TOTAL 3 +#define PROGRESS_DATACHECKSUMS_RELS_DONE 4 +#define PROGRESS_DATACHECKSUMS_BLOCKS_TOTAL 5 +#define PROGRESS_DATACHECKSUMS_BLOCKS_DONE 6 + +/* Phases of datachecksumsworker operation */ +#define PROGRESS_DATACHECKSUMS_PHASE_ENABLING 0 +#define 
PROGRESS_DATACHECKSUMS_PHASE_DISABLING 1 +#define PROGRESS_DATACHECKSUMS_PHASE_WAITING_TEMPREL 2 +#define PROGRESS_DATACHECKSUMS_PHASE_WAITING_BARRIER 3 +#define PROGRESS_DATACHECKSUMS_PHASE_DONE 4 + #endif diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 04f29748be7..7277c37e779 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -367,6 +367,9 @@ typedef enum BackendType B_WAL_SUMMARIZER, B_WAL_WRITER, + B_DATACHECKSUMSWORKER_LAUNCHER, + B_DATACHECKSUMSWORKER_WORKER, + /* * Logger is not connected to shared memory and does not have a PGPROC * entry. @@ -392,6 +395,9 @@ extern PGDLLIMPORT BackendType MyBackendType; #define AmWalSummarizerProcess() (MyBackendType == B_WAL_SUMMARIZER) #define AmWalWriterProcess() (MyBackendType == B_WAL_WRITER) #define AmIoWorkerProcess() (MyBackendType == B_IO_WORKER) +#define AmDataChecksumsWorkerProcess() \ + (MyBackendType == B_DATACHECKSUMSWORKER_LAUNCHER || \ + MyBackendType == B_DATACHECKSUMSWORKER_WORKER) #define AmSpecialWorkerProcess() \ (AmAutoVacuumLauncherProcess() || \ diff --git a/src/include/postmaster/datachecksum_state.h b/src/include/postmaster/datachecksum_state.h new file mode 100644 index 00000000000..343494edcc8 --- /dev/null +++ b/src/include/postmaster/datachecksum_state.h @@ -0,0 +1,58 @@ +/*------------------------------------------------------------------------- + * + * datachecksum_state.h + * header file for data checksum helper background worker and data + * checksum state manipulation + * + * + * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/postmaster/datachecksum_state.h + * + *------------------------------------------------------------------------- + */ +#ifndef DATACHECKSUM_STATE_H +#define DATACHECKSUM_STATE_H + +#include "storage/procsignal.h" + +/* Shared memory */ +extern Size DataChecksumsShmemSize(void); +extern void 
DataChecksumsShmemInit(void); + +/* Possible operations the Datachecksumsworker can perform */ +typedef enum DataChecksumsWorkerOperation +{ + ENABLE_DATACHECKSUMS, + DISABLE_DATACHECKSUMS, +} DataChecksumsWorkerOperation; + +/* + * Possible states for a database entry which has been processed. Exported + * here since we want to be able to reference this from injection point tests. + */ +typedef enum +{ + DATACHECKSUMSWORKER_SUCCESSFUL = 0, + DATACHECKSUMSWORKER_ABORTED, + DATACHECKSUMSWORKER_FAILED, + DATACHECKSUMSWORKER_DROPDB, +} DataChecksumsWorkerResult; + +/* Prototypes for data checksum state manipulation */ +extern bool AbsorbDataChecksumsBarrier(ProcSignalBarrierType target_state); +extern void EmitAndWaitDataChecksumsBarrier(uint32 state); + +/* Prototypes for data checksum background worker */ + +/* Start the background processes for enabling or disabling checksums */ +extern void StartDataChecksumsWorkerLauncher(DataChecksumsWorkerOperation op, + int cost_delay, + int cost_limit); + +/* Background worker entrypoints */ +extern void DataChecksumsWorkerLauncherMain(Datum arg); +extern void DataChecksumsWorkerMain(Datum arg); + +#endif /* DATACHECKSUM_STATE_H */ diff --git a/src/include/postmaster/proctypelist.h b/src/include/postmaster/proctypelist.h index feac19ba207..b3477e6f17a 100644 --- a/src/include/postmaster/proctypelist.h +++ b/src/include/postmaster/proctypelist.h @@ -38,6 +38,8 @@ PG_PROCTYPE(B_BACKEND, "backend", gettext_noop("client backend"), BackendMain, t PG_PROCTYPE(B_BG_WORKER, "bgworker", gettext_noop("background worker"), BackgroundWorkerMain, true) PG_PROCTYPE(B_BG_WRITER, "bgwriter", gettext_noop("background writer"), BackgroundWriterMain, true) PG_PROCTYPE(B_CHECKPOINTER, "checkpointer", gettext_noop("checkpointer"), CheckpointerMain, true) +PG_PROCTYPE(B_DATACHECKSUMSWORKER_LAUNCHER, "checksums", gettext_noop("datachecksum launcher"), NULL, false) +PG_PROCTYPE(B_DATACHECKSUMSWORKER_WORKER, "checksums", gettext_noop("datachecksum worker"), NULL, false) 
PG_PROCTYPE(B_DEAD_END_BACKEND, "backend", gettext_noop("dead-end client backend"), BackendMain, true) PG_PROCTYPE(B_INVALID, "postmaster", gettext_noop("unrecognized"), NULL, false) PG_PROCTYPE(B_IO_WORKER, "ioworker", gettext_noop("io worker"), IoWorkerMain, true) diff --git a/src/include/replication/decode.h b/src/include/replication/decode.h index 49f00fc48b8..107e43ef750 100644 --- a/src/include/replication/decode.h +++ b/src/include/replication/decode.h @@ -22,6 +22,7 @@ typedef struct XLogRecordBuffer } XLogRecordBuffer; extern void xlog_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); +extern void xlog2_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); extern void heap_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); extern void heap2_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); extern void xact_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h index e5267b93fe6..634e1e49ee5 100644 --- a/src/include/storage/bufpage.h +++ b/src/include/storage/bufpage.h @@ -230,7 +230,6 @@ typedef PageHeaderData *PageHeader; * handling pages. */ #define PG_PAGE_LAYOUT_VERSION 4 -#define PG_DATA_CHECKSUM_VERSION 1 /* ---------------------------------------------------------------- * page support functions @@ -501,6 +500,7 @@ do { \ #define PIV_LOG_WARNING (1 << 0) #define PIV_LOG_LOG (1 << 1) #define PIV_IGNORE_CHECKSUM_FAILURE (1 << 2) +#define PIV_ZERO_BUFFERS_ON_ERROR (1 << 3) #define PageAddItem(page, item, size, offsetNumber, overwrite, is_heap) \ PageAddItemExtended(page, item, size, offsetNumber, \ diff --git a/src/include/storage/checksum.h b/src/include/storage/checksum.h index ff417d5ae3e..3b1440c0c95 100644 --- a/src/include/storage/checksum.h +++ b/src/include/storage/checksum.h @@ -15,6 +15,22 @@ #include "storage/block.h" +/* + * Checksum state 0 is used for when data checksums are disabled (OFF). 
+ * PG_DATA_CHECKSUM_INPROGRESS_{ON|OFF} defines that data checksums are either + * currently being enabled or disabled, and PG_DATA_CHECKSUM_VERSION defines + * that data checksums are enabled. The ChecksumStateType is stored in + * pg_control so changing requires a PG_CONTROL_VERSION bump, and the values + * cannot be reordered. New states must be added at the end. + */ +typedef enum ChecksumStateType +{ + PG_DATA_CHECKSUM_OFF = 0, + PG_DATA_CHECKSUM_VERSION = 1, + PG_DATA_CHECKSUM_INPROGRESS_OFF = 2, + PG_DATA_CHECKSUM_INPROGRESS_ON = 3, +} ChecksumStateType; + /* * Compute the checksum for a Postgres page. The page must be aligned on a * 4-byte boundary. diff --git a/src/include/storage/lwlocklist.h b/src/include/storage/lwlocklist.h index 59ee097977d..af8553bcb6c 100644 --- a/src/include/storage/lwlocklist.h +++ b/src/include/storage/lwlocklist.h @@ -87,6 +87,7 @@ PG_LWLOCK(52, SerialControl) PG_LWLOCK(53, AioWorkerSubmissionQueue) PG_LWLOCK(54, WaitLSN) PG_LWLOCK(55, LogicalDecodingControl) +PG_LWLOCK(56, DataChecksumsWorker) /* * There also exist several built-in LWLock tranches. 
As with the predefined diff --git a/src/include/storage/procsignal.h b/src/include/storage/procsignal.h index 348fba53a93..cc4f26aa33d 100644 --- a/src/include/storage/procsignal.h +++ b/src/include/storage/procsignal.h @@ -48,6 +48,10 @@ typedef enum PROCSIGNAL_BARRIER_SMGRRELEASE, /* ask smgr to close files */ PROCSIGNAL_BARRIER_UPDATE_XLOG_LOGICAL_INFO, /* ask to update * XLogLogicalInfo */ + PROCSIGNAL_BARRIER_CHECKSUM_OFF, + PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_ON, + PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_OFF, + PROCSIGNAL_BARRIER_CHECKSUM_ON, } ProcSignalBarrierType; /* diff --git a/src/include/utils/backend_progress.h b/src/include/utils/backend_progress.h index 6300dbd15d5..61e13c40e28 100644 --- a/src/include/utils/backend_progress.h +++ b/src/include/utils/backend_progress.h @@ -28,6 +28,7 @@ typedef enum ProgressCommandType PROGRESS_COMMAND_BASEBACKUP, PROGRESS_COMMAND_COPY, PROGRESS_COMMAND_REPACK, + PROGRESS_COMMAND_DATACHECKSUMS, } ProgressCommandType; #define PGSTAT_NUM_PROGRESS_PARAM 20 diff --git a/src/test/modules/Makefile b/src/test/modules/Makefile index 28ce3b35eda..864b407abcf 100644 --- a/src/test/modules/Makefile +++ b/src/test/modules/Makefile @@ -20,6 +20,7 @@ SUBDIRS = \ test_bitmapset \ test_bloomfilter \ + test_checksums \ test_cloexec \ test_copy_callbacks \ test_custom_rmgrs \ test_custom_stats \ diff --git a/src/test/modules/meson.build b/src/test/modules/meson.build index 3ac291656c1..e5acacd5083 100644 --- a/src/test/modules/meson.build +++ b/src/test/modules/meson.build @@ -20,6 +20,7 @@ subdir('test_binaryheap') subdir('test_bitmapset') subdir('test_bloomfilter') +subdir('test_checksums') subdir('test_cloexec') subdir('test_copy_callbacks') subdir('test_cplusplusext') subdir('test_custom_rmgrs') diff --git a/src/test/modules/test_checksums/.gitignore b/src/test/modules/test_checksums/.gitignore new file mode 100644 index 00000000000..871e943d50e --- /dev/null +++ b/src/test/modules/test_checksums/.gitignore @@ -0,0 +1,2 @@ +# 
Generated by test suite +/tmp_check/ diff --git a/src/test/modules/test_checksums/Makefile b/src/test/modules/test_checksums/Makefile new file mode 100644 index 00000000000..fa85b79ae57 --- /dev/null +++ b/src/test/modules/test_checksums/Makefile @@ -0,0 +1,40 @@ +#------------------------------------------------------------------------- +# +# Makefile for src/test/modules/test_checksums +# +# Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group +# Portions Copyright (c) 1994, Regents of the University of California +# +# src/test/modules/test_checksums/Makefile +# +#------------------------------------------------------------------------- + +EXTRA_INSTALL = src/test/modules/injection_points + +export enable_injection_points + +MODULE_big = test_checksums +OBJS = \ + $(WIN32RES) \ + test_checksums.o +PGFILEDESC = "test_checksums - test code for data checksums" + +EXTENSION = test_checksums +DATA = test_checksums--1.0.sql + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = src/test/modules/test_checksums +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif + +check: + $(prove_check) + +installcheck: + $(prove_installcheck) diff --git a/src/test/modules/test_checksums/README b/src/test/modules/test_checksums/README new file mode 100644 index 00000000000..6a23e4ff0ae --- /dev/null +++ b/src/test/modules/test_checksums/README @@ -0,0 +1,30 @@ +src/test/modules/test_checksums/README + +Regression tests for data checksums +=================================== +This directory contains a test suite for enabling, and disabling, data +checksums both offline as well as in a running cluster. 
+ +Running the tests with autoconf +=============================== + + make check + +or + + make installcheck + +Running the tests with meson +============================ +From your build directory, issue the following command: + + meson test -q --print-errorlogs --suite setup --suite test_checksums + +NOTE: This creates a temporary installation (in the case of "make check" or +"--suite setup"), with multiple nodes, be they primary or standby(s), for the +purpose of the tests. + +NOTE: This test suite requires TAP tests to be enabled; a subset of the tests +also requires injection points to function. In order to run the extended +tests, "checksum_extended" must be set in the PG_TEST_EXTRA environment +variable. diff --git a/src/test/modules/test_checksums/meson.build b/src/test/modules/test_checksums/meson.build new file mode 100644 index 00000000000..9b1421a9b91 --- /dev/null +++ b/src/test/modules/test_checksums/meson.build @@ -0,0 +1,38 @@ +# Copyright (c) 2026, PostgreSQL Global Development Group + +test_checksums_sources = files( + 'test_checksums.c', +) + +test_checksums = shared_module('test_checksums', + test_checksums_sources, + kwargs: pg_test_mod_args, +) +test_install_libs += test_checksums + +test_install_data += files( + 'test_checksums.control', + 'test_checksums--1.0.sql', +) + +tests += { + 'name': 'test_checksums', + 'sd': meson.current_source_dir(), + 'bd': meson.current_build_dir(), + 'tap': { + 'env': { + 'enable_injection_points': get_option('injection_points') ? 
'yes' : 'no', + }, + 'tests': [ + 't/001_basic.pl', + 't/002_restarts.pl', + 't/003_standby_restarts.pl', + 't/004_offline.pl', + 't/005_injection.pl', + 't/006_pgbench_single.pl', + 't/007_pgbench_standby.pl', + 't/008_pitr.pl', + 't/009_fpi.pl', + ], + }, +} diff --git a/src/test/modules/test_checksums/t/001_basic.pl b/src/test/modules/test_checksums/t/001_basic.pl new file mode 100644 index 00000000000..c008e95fbff --- /dev/null +++ b/src/test/modules/test_checksums/t/001_basic.pl @@ -0,0 +1,63 @@ + +# Copyright (c) 2026, PostgreSQL Global Development Group + +# Test suite for testing enabling data checksums in an online cluster +use strict; +use warnings FATAL => 'all'; + +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +use FindBin; +use lib $FindBin::RealBin; + +use DataChecksums::Utils; + +# Initialize node with checksums disabled. +my $node = PostgreSQL::Test::Cluster->new('basic_node'); +$node->init(no_data_checksums => 1); +$node->start; + +# Create some content to have un-checksummed data in the cluster +$node->safe_psql('postgres', + "CREATE TABLE t AS SELECT generate_series(1,10000) AS a;"); + +# Ensure that checksums are turned off +test_checksum_state($node, 'off'); + +# Enable data checksums and wait for the state transition to 'on' +enable_data_checksums($node, wait => 'on'); + +# Run a dummy query just to make sure we can read back data +my $result = + $node->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1 "); +is($result, '9999', 'ensure checksummed pages can be read back'); + +# Enable data checksums again which should be a no-op so we explicitly don't +# wait for any state transition as none should happen here +enable_data_checksums($node); +test_checksum_state($node, 'on'); +# ..and make sure we can still read/write data +$node->safe_psql('postgres', "UPDATE t SET a = a + 1;"); +$result = $node->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1"); +is($result, '10000', 'ensure checksummed pages 
can be read back'); + +# Disable checksums again and wait for the state transition +disable_data_checksums($node, wait => 1); + +# Test reading data again +$result = $node->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1"); +is($result, '10000', 'ensure previously checksummed pages can be read back'); + +# Re-enable checksums and make sure that the underlying data has changed to +# ensure that checksums will be different. +$node->safe_psql('postgres', "UPDATE t SET a = a + 1;"); +enable_data_checksums($node, wait => 'on'); + +# Run a dummy query just to make sure we can read back the data +$result = $node->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1"); +is($result, '10000', 'ensure checksummed pages can be read back'); + +$node->stop; +done_testing(); diff --git a/src/test/modules/test_checksums/t/002_restarts.pl b/src/test/modules/test_checksums/t/002_restarts.pl new file mode 100644 index 00000000000..bab59be82bd --- /dev/null +++ b/src/test/modules/test_checksums/t/002_restarts.pl @@ -0,0 +1,110 @@ + +# Copyright (c) 2026, PostgreSQL Global Development Group + +# Test suite for testing enabling data checksums in an online cluster with a +# restart which breaks processing. +use strict; +use warnings FATAL => 'all'; + +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +use FindBin; +use lib $FindBin::RealBin; + +use DataChecksums::Utils; + +# Initialize node with checksums disabled. 
+my $node = PostgreSQL::Test::Cluster->new('restarts_node'); +$node->init(no_data_checksums => 1); +$node->start; + +# Initialize result storage for queries +my $result; + +# Create some content to have un-checksummed data in the cluster +$node->safe_psql('postgres', + "CREATE TABLE t AS SELECT generate_series(1,10000) AS a;"); + +# Ensure that checksums are disabled +test_checksum_state($node, 'off'); + +SKIP: +{ + skip 'Data checksum delay tests not enabled in PG_TEST_EXTRA', 6 + if (!$ENV{PG_TEST_EXTRA} + || $ENV{PG_TEST_EXTRA} !~ /\bchecksum_extended\b/); + + # Create a barrier for checksum enablement to block on, in this case a pre- + # existing temporary table which is kept open while processing is started. + # We can accomplish this by setting up an interactive psql process which + # keeps the temporary table created as we enable checksums in another psql + # process. + # + # This is a similar test to the synthetic variant in 005_injection.pl + # which fakes this scenario. + my $bsession = $node->background_psql('postgres'); + $bsession->query_safe('CREATE TEMPORARY TABLE tt (a integer);'); + + # In another session, make sure we can see the blocking temp table but + # start processing anyways and check that we are blocked with a proper + # wait event. + $result = $node->safe_psql('postgres', + "SELECT relpersistence FROM pg_catalog.pg_class WHERE relname = 'tt';" + ); + is($result, 't', 'ensure we can see the temporary table'); + + # Enabling data checksums shouldn't work as the process is blocked on the + # temporary table held open by $bsession. Ensure that we reach inprogress- + # on before we do more tests. + enable_data_checksums($node, wait => 'inprogress-on'); + + # Wait for processing to finish and the worker waiting for leftover temp + # relations to be able to actually finish + $result = $node->poll_query_until( + 'postgres', + "SELECT wait_event FROM pg_catalog.pg_stat_activity " + . 
"WHERE backend_type = 'datachecksum worker';", + 'ChecksumEnableTemptableWait'); + + # The datachecksumsworker waits for temporary tables to disappear for 3 + # seconds before retrying, so sleep for 4 seconds to be guaranteed to see + # a retry cycle + sleep(4); + + # Re-check the wait event to ensure we are blocked on the right thing. + $result = $node->safe_psql('postgres', + "SELECT wait_event FROM pg_catalog.pg_stat_activity " + . "WHERE backend_type = 'datachecksum worker';"); + is($result, 'ChecksumEnableTemptableWait', + 'ensure the correct wait condition is set'); + test_checksum_state($node, 'inprogress-on'); + + # Stop the cluster while bsession is still attached. We can't close the + # session first since the brief period between closing and stopping might + # be enough for checksums to get enabled. + $node->stop; + $bsession->quit; + $node->start; + + # Ensure the checksums aren't enabled across the restart. This leaves the + # cluster in the same state as before we entered the SKIP block. 
+ test_checksum_state($node, 'off'); +} + +enable_data_checksums($node, wait => 'on'); + +$result = $node->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1"); +is($result, '9999', 'ensure checksummed pages can be read back'); + +$result = $node->poll_query_until( + 'postgres', + "SELECT count(*) FROM pg_stat_activity WHERE backend_type LIKE 'datachecksum%';", + '0'); +is($result, 1, 'await datachecksums worker/launcher termination'); + +disable_data_checksums($node, wait => 1); + +$node->stop; +done_testing(); diff --git a/src/test/modules/test_checksums/t/003_standby_restarts.pl b/src/test/modules/test_checksums/t/003_standby_restarts.pl new file mode 100644 index 00000000000..6b016925651 --- /dev/null +++ b/src/test/modules/test_checksums/t/003_standby_restarts.pl @@ -0,0 +1,114 @@ + +# Copyright (c) 2026, PostgreSQL Global Development Group + +# Test suite for testing enabling data checksums in an online cluster with +# streaming replication +use strict; +use warnings FATAL => 'all'; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +use FindBin; +use lib $FindBin::RealBin; + +use DataChecksums::Utils; + +# Initialize primary node +my $node_primary = PostgreSQL::Test::Cluster->new('standby_restarts_primary'); +$node_primary->init(allows_streaming => 1, no_data_checksums => 1); +$node_primary->start; + +my $slotname = 'physical_slot'; +$node_primary->safe_psql('postgres', + "SELECT pg_create_physical_replication_slot('$slotname')"); + +# Take backup +my $backup_name = 'my_backup'; +$node_primary->backup($backup_name); + +# Create streaming standby linking to primary +my $node_standby = PostgreSQL::Test::Cluster->new('standby_restarts_standby'); +$node_standby->init_from_backup($node_primary, $backup_name, + has_streaming => 1); +$node_standby->append_conf( + 'postgresql.conf', qq[ +primary_slot_name = '$slotname' +]); +$node_standby->start; + +# Create some content on the primary to have un-checksummed data in the cluster 
+$node_primary->safe_psql('postgres', + "CREATE TABLE t AS SELECT generate_series(1,10000) AS a;"); + +# Wait for standbys to catch up +$node_primary->wait_for_catchup($node_standby, 'replay', + $node_primary->lsn('insert')); + +# Check that checksums are turned off on all nodes +test_checksum_state($node_primary, 'off'); +test_checksum_state($node_standby, 'off'); + +# --------------------------------------------------------------------------- +# Enable checksums for the cluster, and make sure that both the primary and +# standby change state. +# + +# Ensure that the primary switches to "inprogress-on" +enable_data_checksums($node_primary, wait => 'inprogress-on'); +# Wait for checksum enable to be replayed +$node_primary->wait_for_catchup($node_standby, 'replay'); + +# Ensure that the standby has switched to "inprogress-on" or "on". Normally it +# would be "inprogress-on", but it is theoretically possible for the primary to +# complete the checksum enabling *and* have the standby replay that record +# before we reach the check below. 
+my $result = $node_standby->poll_query_until( + 'postgres', + "SELECT setting = 'off' FROM pg_catalog.pg_settings WHERE name = 'data_checksums';", + 'f'); +is($result, 1, 'ensure standby has absorbed the inprogress-on barrier'); +$result = $node_standby->safe_psql('postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';" +); + +is(($result eq 'inprogress-on' || $result eq 'on'), + 1, 'ensure checksums are on, or in progress, on standby_1'); + +# Insert some more data which should be checksummed on INSERT +$node_primary->safe_psql('postgres', + "INSERT INTO t VALUES (generate_series(1, 10000));"); + +# Wait for checksums enabled on the primary and standby +wait_for_checksum_state($node_primary, 'on'); +wait_for_checksum_state($node_standby, 'on'); + +$result = + $node_primary->safe_psql('postgres', "SELECT count(a) FROM t WHERE a > 1"); +is($result, '19998', 'ensure we can safely read all data with checksums'); + +$result = $node_primary->poll_query_until( + 'postgres', + "SELECT count(*) FROM pg_stat_activity WHERE backend_type LIKE 'datachecksum%';", + '0'); +is($result, 1, 'await datachecksums worker/launcher termination'); + +# +# Disable checksums and ensure it's propagated to standby and that we can +# still read all data +# + +# Disable checksums and wait for the operation to be replayed +disable_data_checksums($node_primary); +$node_primary->wait_for_catchup($node_standby, 'replay'); +# Ensure that the primary and standby has switched to off +wait_for_checksum_state($node_primary, 'off'); +wait_for_checksum_state($node_standby, 'off'); +# Doublecheck reading data without errors +$result = + $node_primary->safe_psql('postgres', "SELECT count(a) FROM t WHERE a > 1"); +is($result, "19998", 'ensure we can safely read all data without checksums'); + +$node_standby->stop; +$node_primary->stop; +done_testing(); diff --git a/src/test/modules/test_checksums/t/004_offline.pl b/src/test/modules/test_checksums/t/004_offline.pl new file 
mode 100644 index 00000000000..f1972bddff1 --- /dev/null +++ b/src/test/modules/test_checksums/t/004_offline.pl @@ -0,0 +1,82 @@ + +# Copyright (c) 2026, PostgreSQL Global Development Group + +# Test suite for testing enabling data checksums offline from various states +# of checksum processing +use strict; +use warnings FATAL => 'all'; + +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +use FindBin; +use lib $FindBin::RealBin; + +use DataChecksums::Utils; + +# Initialize node with checksums disabled. +my $node = PostgreSQL::Test::Cluster->new('offline_node'); +$node->init(no_data_checksums => 1); +$node->start; + +# Create some content to have un-checksummed data in the cluster +$node->safe_psql('postgres', + "CREATE TABLE t AS SELECT generate_series(1,10000) AS a;"); + +# Ensure that checksums are disabled +test_checksum_state($node, 'off'); + +# Enable checksums offline using pg_checksums +$node->stop; +$node->checksum_enable_offline; +$node->start; + +# Ensure that checksums are enabled +test_checksum_state($node, 'on'); + +# Run a dummy query just to make sure we can read back some data +my $result = + $node->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1"); +is($result, '9999', 'ensure checksummed pages can be read back'); + +# Disable checksums offline again using pg_checksums +$node->stop; +$node->checksum_disable_offline; +$node->start; + +# Ensure that checksums are disabled +test_checksum_state($node, 'off'); + +# Create a barrier for checksum enablement to block on, in this case a pre- +# existing temporary table which is kept open while processing is started. We +# can accomplish this by setting up an interactive psql process which keeps the +# temporary table created as we enable checksums in another psql process. 
+ +my $bsession = $node->background_psql('postgres'); +$bsession->query_safe('CREATE TEMPORARY TABLE tt (a integer);'); + +# In another session, make sure we can see the blocking temp table but start +# processing anyways and check that we are blocked with a proper wait event. +$result = $node->safe_psql('postgres', + "SELECT relpersistence FROM pg_catalog.pg_class WHERE relname = 'tt';"); +is($result, 't', 'ensure we can see the temporary table'); + +enable_data_checksums($node, wait => 'inprogress-on'); + +# Turn the cluster off and enable checksums offline, then start back up +$bsession->quit; +$node->stop; +$node->checksum_enable_offline; +$node->start; + +# Ensure that checksums are now enabled even though processing wasn't +# restarted +test_checksum_state($node, 'on'); + +# Run a dummy query just to make sure we can read back some data +$result = $node->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1"); +is($result, '9999', 'ensure checksummed pages can be read back'); + +$node->stop; +done_testing(); diff --git a/src/test/modules/test_checksums/t/005_injection.pl b/src/test/modules/test_checksums/t/005_injection.pl new file mode 100644 index 00000000000..897f282a1f2 --- /dev/null +++ b/src/test/modules/test_checksums/t/005_injection.pl @@ -0,0 +1,74 @@ + +# Copyright (c) 2026, PostgreSQL Global Development Group + +# Test suite for testing enabling data checksums in an online cluster with +# injection point tests injecting failures into the processing + +use strict; +use warnings FATAL => 'all'; + +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +use FindBin; +use lib $FindBin::RealBin; + +use DataChecksums::Utils; + +if ($ENV{enable_injection_points} ne 'yes') +{ + plan skip_all => 'Injection points not supported by this build'; +} + +# --------------------------------------------------------------------------- +# Test cluster setup +# + +# Initiate testcluster +my $node = 
PostgreSQL::Test::Cluster->new('injection_node'); +$node->init(no_data_checksums => 1); +$node->start; + +# Set up test environment +$node->safe_psql('postgres', 'CREATE EXTENSION test_checksums;'); + +# --------------------------------------------------------------------------- +# Inducing failures and crashes in processing + +# Force enabling checksums to fail by marking one of the databases as having +# failed in processing. +disable_data_checksums($node, wait => 1); +$node->safe_psql('postgres', 'SELECT dcw_inject_fail_database(true);'); +enable_data_checksums($node, wait => 'off'); +$node->safe_psql('postgres', 'SELECT dcw_inject_fail_database(false);'); + +# Make sure that disabling after a failure works +disable_data_checksums($node); +test_checksum_state($node, 'off'); + +# --------------------------------------------------------------------------- +# Timing and retry related tests +# + +SKIP: +{ + skip 'Data checksum delay tests not enabled in PG_TEST_EXTRA', 4 + if (!$ENV{PG_TEST_EXTRA} + || $ENV{PG_TEST_EXTRA} !~ /\bchecksum_extended\b/); + + # Inject a delay in the barrier for enabling checksums + disable_data_checksums($node, wait => 1); + $node->safe_psql('postgres', 'SELECT dcw_inject_delay_barrier();'); + enable_data_checksums($node, wait => 'on'); + + # Fake the existence of a temporary table at the start of processing, which + # will force the processing to wait and retry in order to wait for it to + # disappear. 
+ disable_data_checksums($node, wait => 1); + $node->safe_psql('postgres', 'SELECT dcw_fake_temptable(true);'); + enable_data_checksums($node, wait => 'on'); +} + +$node->stop; +done_testing(); diff --git a/src/test/modules/test_checksums/t/006_pgbench_single.pl b/src/test/modules/test_checksums/t/006_pgbench_single.pl new file mode 100644 index 00000000000..0ab5b04b931 --- /dev/null +++ b/src/test/modules/test_checksums/t/006_pgbench_single.pl @@ -0,0 +1,275 @@ + +# Copyright (c) 2026, PostgreSQL Global Development Group + +# Test suite for testing enabling data checksums in an online cluster with +# concurrent activity via pgbench runs + +use strict; +use warnings FATAL => 'all'; + +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +use FindBin; +use lib $FindBin::RealBin; + +use DataChecksums::Utils; + +# This test suite is expensive, or very expensive, to execute. There are two +# PG_TEST_EXTRA options for running it, "checksum" for a pared-down test suite +# an "checksum_extended" for the full suite. The full suite can run for hours +# on slow or constrained systems. +my $extended = undef; +if ($ENV{PG_TEST_EXTRA}) +{ + $extended = 1 if ($ENV{PG_TEST_EXTRA} =~ /\bchecksum_extended\b/); + plan skip_all => 'Expensive data checksums test disabled' + unless ($ENV{PG_TEST_EXTRA} =~ /\bchecksum(_extended)?\b/); +} +else +{ + plan skip_all => 'Expensive data checksums test disabled'; +} + +if ($ENV{enable_injection_points} ne 'yes') +{ + plan skip_all => 'Injection points not supported by this build'; +} + +my $node; +my $node_loglocation = 0; + +# The number of full test iterations which will be performed. The exact number +# of tests performed and the wall time taken is non-deterministic as the test +# performs a lot of randomized actions, but 10 iterations will be a long test +# run regardless. 
+my $TEST_ITERATIONS = 1; +$TEST_ITERATIONS = 10 if ($extended); + +# Variables which record the current state of the cluster +my $data_checksum_state = 'off'; +my $pgbench = undef; + +# Start a pgbench run in the background against the server specified via the +# port passed as parameter. +sub background_rw_pgbench +{ + my $port = shift; + + # If a previous pgbench is still running, start by shutting it down. + $pgbench->finish if $pgbench; + + my $clients = 1; + my $runtime = 2; + + if ($extended) + { + # Randomize the number of pgbench clients a bit (range 1-16) + $clients = 1 + int(rand(15)); + $runtime = 600; + } + my @cmd = ('pgbench', '-p', $port, '-T', $runtime, '-c', $clients); + + # Randomize whether we spawn connections or not + push(@cmd, '-C') if ($extended && cointoss); + # Finally add the database name to use + push(@cmd, 'postgres'); + + $pgbench = IPC::Run::start( + \@cmd, + '<' => '/dev/null', + '>' => '/dev/null', + '2>' => '/dev/null', + IPC::Run::timer($PostgreSQL::Test::Utils::timeout_default)); +} + +# Invert the state of data checksums in the cluster, if data checksums are on +# then disable them and vice versa. Also performs proper validation of the +# before and after state. +sub flip_data_checksums +{ + # First, make sure the cluster is in the state we expect it to be + test_checksum_state($node, $data_checksum_state); + + if ($data_checksum_state eq 'off') + { + # Coin-toss to see if we are injecting a retry due to a temptable + $node->safe_psql('postgres', 'SELECT dcw_fake_temptable();') + if cointoss(); + + # log LSN right before we start changing checksums + my $result = + $node->safe_psql('postgres', "SELECT pg_current_wal_lsn()"); + note("LSN before enabling: " . $result . 
"\n"); + + # Ensure that the primary switches to "inprogress-on" + enable_data_checksums($node, wait => 'inprogress-on'); + + random_sleep() if ($extended); + + # Wait for checksums enabled on the primary + wait_for_checksum_state($node, 'on'); + + # log LSN right after the primary flips checksums to "on" + $result = $node->safe_psql('postgres', "SELECT pg_current_wal_lsn()"); + note("LSN after enabling: " . $result . "\n"); + + random_sleep() if ($extended); + + $node->safe_psql('postgres', 'SELECT dcw_fake_temptable(false);'); + $data_checksum_state = 'on'; + } + elsif ($data_checksum_state eq 'on') + { + random_sleep() if ($extended); + + # log LSN right before we start changing checksums + my $result = + $node->safe_psql('postgres', "SELECT pg_current_wal_lsn()"); + note("LSN before disabling: " . $result . "\n"); + + disable_data_checksums($node); + + # Wait for checksums disabled on the primary + wait_for_checksum_state($node, 'off'); + + # log LSN right after the primary flips checksums to "off" + $result = $node->safe_psql('postgres', "SELECT pg_current_wal_lsn()"); + note("LSN after disabling: " . $result . "\n"); + + random_sleep() if ($extended); + + $data_checksum_state = 'off'; + } + else + { + # This should only happen due to programmer error when hacking on the + # test code, but since that might pass subtly we error out. + BAIL_OUT('data_checksum_state variable has invalid state:' + . $data_checksum_state); + } +} + +# Create and start a cluster with one node +$node = PostgreSQL::Test::Cluster->new('pgbench_single_main'); +$node->init(allows_streaming => 1, no_data_checksums => 1); +# max_connections need to be bumped in order to accommodate for pgbench clients +# and log_statement is dialled down since it otherwise will generate enormous +# amounts of logging. Page verification failures are still logged. 
+$node->append_conf( + 'postgresql.conf', + qq[ +max_connections = 100 +log_statement = none +]); +$node->start; +$node->safe_psql('postgres', 'CREATE EXTENSION test_checksums;'); +# Create some content to have un-checksummed data in the cluster +$node->safe_psql('postgres', + "CREATE TABLE t AS SELECT generate_series(1, 100000) AS a;"); +# Initialize pgbench +my $scalefactor = ($extended ? 10 : 1); +$node->command_ok( + [ + 'pgbench', '-p', $node->port, '-i', + '-s', $scalefactor, '-q', 'postgres' + ]); +# Start the test suite with pgbench running. +background_rw_pgbench($node->port); + +# Main test suite. This loop will start a pgbench run on the cluster and while +# that's running flip the state of data checksums concurrently. It will then +# randomly restart the cluster and then check for +# the desired state. The idea behind doing things randomly is to stress out +# any timing related issues by subjecting the cluster for varied workloads. +# A TODO is to generate a trace such that any test failure can be traced to +# its order of operations for debugging. +for (my $i = 0; $i < $TEST_ITERATIONS; $i++) +{ + note("iteration ", ($i + 1), " of ", $TEST_ITERATIONS); + + if (!$node->is_alive) + { + # Start, to do recovery, and stop + $node->start; + $node->stop('fast'); + + # Since the log isn't being written to now, parse the log and check + # for instances of checksum verification failures. + my $log = PostgreSQL::Test::Utils::slurp_file($node->logfile, + $node_loglocation); + unlike( + $log, + qr/page verification failed,.+\d$/, + "no checksum validation errors in primary log (during WAL recovery)" + ); + $node_loglocation = -s $node->logfile; + + # Randomize the WAL size, to trigger checkpoints less/more often + my $sb = 64 + int(rand(1024)); + $node->append_conf('postgresql.conf', qq[max_wal_size = $sb]); + note("changing max_wal_size to " . 
$sb); + + $node->start; + + # Start a pgbench in the background against the primary + background_rw_pgbench($node->port); + } + + $node->safe_psql('postgres', "UPDATE t SET a = a + 1;"); + + flip_data_checksums(); + random_sleep() if ($extended); + my $result = + $node->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1"); + is($result, '100000', 'ensure data pages can be read back on primary'); + + random_sleep() if ($extended); + + # Potentially powercycle the node + if (cointoss()) + { + $node->stop(stopmode()); + + PostgreSQL::Test::Utils::system_log("pg_controldata", + $node->data_dir); + + my $log = PostgreSQL::Test::Utils::slurp_file($node->logfile, + $node_loglocation); + unlike( + $log, + qr/page verification failed,.+\d$/, + "no checksum validation errors in primary log (outside WAL recovery)" + ); + $node_loglocation = -s $node->logfile; + } + + random_sleep() if ($extended); +} + +# Make sure the node is running +if (!$node->is_alive) +{ + $node->start; +} + +# Testrun is over, ensure that data reads back as expected and perform a final +# verification of the data checksum state. 
+my $result = + $node->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1"); +is($result, '100000', 'ensure data pages can be read back on primary'); +test_checksum_state($node, $data_checksum_state); + +# Perform one final pass over the logs and hunt for unexpected errors +my $log = + PostgreSQL::Test::Utils::slurp_file($node->logfile, $node_loglocation); +unlike( + $log, + qr/page verification failed,.+\d$/, + "no checksum validation errors in primary log"); +$node_loglocation = -s $node->logfile; + +$node->teardown_node; + +done_testing(); diff --git a/src/test/modules/test_checksums/t/007_pgbench_standby.pl b/src/test/modules/test_checksums/t/007_pgbench_standby.pl new file mode 100644 index 00000000000..b0d40d24005 --- /dev/null +++ b/src/test/modules/test_checksums/t/007_pgbench_standby.pl @@ -0,0 +1,400 @@ + +# Copyright (c) 2026, PostgreSQL Global Development Group + +# Test suite for testing enabling data checksums in an online cluster, +# comprising of a primary and a replicated standby, with concurrent activity +# via pgbench runs + +use strict; +use warnings FATAL => 'all'; + +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +use FindBin; +use lib $FindBin::RealBin; + +use DataChecksums::Utils; + +# This test suite is expensive, or very expensive, to execute. There are two +# PG_TEST_EXTRA options for running it, "checksum" for a pared-down test suite +# an "checksum_extended" for the full suite. The full suite can run for hours +# on slow or constrained systems. 
+my $extended = undef; +if ($ENV{PG_TEST_EXTRA}) +{ + $extended = 1 if ($ENV{PG_TEST_EXTRA} =~ /\bchecksum_extended\b/); + plan skip_all => 'Expensive data checksums test disabled' + unless ($ENV{PG_TEST_EXTRA} =~ /\bchecksum(_extended)?\b/); +} +else +{ + plan skip_all => 'Expensive data checksums test disabled'; +} + +if ($ENV{enable_injection_points} ne 'yes') +{ + plan skip_all => 'Injection points not supported by this build'; +} + +my $node_primary_slot = 'physical_slot'; +my $node_primary_backup = 'primary_backup'; +my $node_primary; +my $node_primary_loglocation = 0; +my $node_standby; +my $node_standby_loglocation = 0; + +# The number of full test iterations which will be performed. The exact number +# of tests performed and the wall time taken is non-deterministic as the test +# performs a lot of randomized actions, but 5 iterations will be a long test +# run regardless. +my $TEST_ITERATIONS = 5; +$TEST_ITERATIONS = 1 if ($extended); + +# Variables which record the current state of the cluster +my $data_checksum_state = 'off'; + +my $pgbench_primary = undef; +my $pgbench_standby = undef; + +# Start a pgbench run in the background against the server specified via the +# port passed as parameter +sub background_pgbench +{ + my ($port, $standby) = @_; + my $pgbench = ($standby ? 
\$pgbench_standby : \$pgbench_primary); + + # Terminate any currently running pgbench process before continuing + $$pgbench->finish if $$pgbench; + + my $clients = 1; + my $runtime = 5; + + if ($extended) + { + # Randomize the number of pgbench clients a bit (range 1-16) + $clients = 1 + int(rand(15)); + $runtime = 600; + } + + my @cmd = ('pgbench', '-p', $port, '-T', $runtime, '-c', $clients); + # Randomize whether we spawn connections or not + push(@cmd, '-C') if ($extended && cointoss()); + # If we run on a standby it needs to be a read-only benchmark + push(@cmd, '-S') if ($standby); + # Finally add the database name to use + push(@cmd, 'postgres'); + + $$pgbench = IPC::Run::start( + \@cmd, + '<' => '/dev/null', + '>' => '/dev/null', + '2>' => '/dev/null', + IPC::Run::timer($PostgreSQL::Test::Utils::timeout_default)); +} + +# Invert the state of data checksums in the cluster, if data checksums are on +# then disable them and vice versa. Also performs proper validation of the +# before and after state. +sub flip_data_checksums +{ + # First, make sure the cluster is in the state we expect it to be + test_checksum_state($node_primary, $data_checksum_state); + test_checksum_state($node_standby, $data_checksum_state); + + if ($data_checksum_state eq 'off') + { + # Coin-toss to see if we are injecting a retry due to a temptable + $node_primary->safe_psql('postgres', 'SELECT dcw_fake_temptable();') + if cointoss(); + + # log LSN right before we start changing checksums + my $result = + $node_primary->safe_psql('postgres', "SELECT pg_current_wal_lsn()"); + note("LSN before enabling: " . $result . "\n"); + + # Ensure that the primary switches to "inprogress-on" + enable_data_checksums($node_primary, wait => 'inprogress-on'); + + random_sleep() if ($extended); + + # Wait for checksum enable to be replayed + $node_primary->wait_for_catchup($node_standby, 'replay'); + + # Ensure that the standby has switched to "inprogress-on" or "on". 
+ # Normally it would be "inprogress-on", but it is theoretically + # possible for the primary to complete the checksum enabling *and* have + # the standby replay that record before we reach the check below. + $result = $node_standby->poll_query_until( + 'postgres', + "SELECT setting = 'off' " + . "FROM pg_catalog.pg_settings " + . "WHERE name = 'data_checksums';", + 'f'); + is($result, 1, + 'ensure standby has absorbed the inprogress-on barrier'); + $result = $node_standby->safe_psql('postgres', + "SELECT setting " + . "FROM pg_catalog.pg_settings " + . "WHERE name = 'data_checksums';"); + + is(($result eq 'inprogress-on' || $result eq 'on'), + 1, 'ensure checksums are on, or in progress, on standby_1'); + + # Wait for checksums enabled on the primary and standby + wait_for_checksum_state($node_primary, 'on'); + + # log LSN right after the primary flips checksums to "on" + $result = + $node_primary->safe_psql('postgres', "SELECT pg_current_wal_lsn()"); + note("LSN after enabling: " . $result . "\n"); + + random_sleep() if ($extended); + wait_for_checksum_state($node_standby, 'on'); + + $node_primary->safe_psql('postgres', + 'SELECT dcw_fake_temptable(false);'); + $data_checksum_state = 'on'; + } + elsif ($data_checksum_state eq 'on') + { + random_sleep() if ($extended); + + # log LSN right before we start changing checksums + my $result = + $node_primary->safe_psql('postgres', "SELECT pg_current_wal_lsn()"); + note("LSN before disabling: " . $result . "\n"); + + disable_data_checksums($node_primary); + $node_primary->wait_for_catchup($node_standby, 'replay'); + + # Wait for checksums disabled on the primary and standby + wait_for_checksum_state($node_primary, 'off'); + wait_for_checksum_state($node_standby, 'off'); + + # log LSN right after the primary flips checksums to "off" + $result = + $node_primary->safe_psql('postgres', "SELECT pg_current_wal_lsn()"); + note("LSN after disabling: " . $result . 
"\n"); + + random_sleep() if ($extended); + wait_for_checksum_state($node_standby, 'off'); + + $data_checksum_state = 'off'; + } + else + { + # This should only happen due to programmer error when hacking on the + # test code, but since that might pass subtly we error out. + BAIL_OUT('data_checksum_state variable has invalid state:' + . $data_checksum_state); + } +} + +# Create and start a cluster with one primary and one standby node, and ensure +# they are caught up and in sync. +$node_primary = PostgreSQL::Test::Cluster->new('pgbench_standby_main'); +$node_primary->init(allows_streaming => 1, no_data_checksums => 1); +# max_connections need to be bumped in order to accommodate for pgbench clients +# and log_statement is dialled down since it otherwise will generate enormous +# amounts of logging. Page verification failures are still logged. +$node_primary->append_conf( + 'postgresql.conf', + qq[ +max_connections = 30 +log_statement = none +]); +$node_primary->start; +$node_primary->safe_psql('postgres', 'CREATE EXTENSION test_checksums;'); +# Create some content to have un-checksummed data in the cluster +$node_primary->safe_psql('postgres', + "CREATE TABLE t AS SELECT generate_series(1, 100000) AS a;"); +$node_primary->safe_psql('postgres', + "SELECT pg_create_physical_replication_slot('$node_primary_slot');"); +$node_primary->backup($node_primary_backup); + +$node_standby = PostgreSQL::Test::Cluster->new('pgbench_standby_standby'); +$node_standby->init_from_backup($node_primary, $node_primary_backup, + has_streaming => 1); +$node_standby->append_conf( + 'postgresql.conf', qq[ +primary_slot_name = '$node_primary_slot' +]); +$node_standby->start; + +# Initialize pgbench and wait for the objects to be created on the standby +my $scalefactor = ($extended ? 
10 : 1); +$node_primary->command_ok( + [ + 'pgbench', '-p', $node_primary->port, '-i', '-s', $scalefactor, '-q', + 'postgres' + ]); +$node_primary->wait_for_catchup($node_standby, 'replay'); + +# Start the test suite with pgbench running on all nodes +background_pgbench($node_standby->port, 1); +background_pgbench($node_primary->port, 0); + +# Main test suite. This loop will start a pgbench run on the cluster and while +# that's running flip the state of data checksums concurrently. It will then +# randomly restart the cluster and then check for +# the desired state. The idea behind doing things randomly is to stress out +# any timing related issues by subjecting the cluster for varied workloads. +# A TODO is to generate a trace such that any test failure can be traced to +# its order of operations for debugging. +for (my $i = 0; $i < $TEST_ITERATIONS; $i++) +{ + note("iteration ", ($i + 1), " of ", $TEST_ITERATIONS); + + if (!$node_primary->is_alive) + { + # start, to do recovery, and stop + $node_primary->start; + $node_primary->stop('fast'); + + # Since the log isn't being written to now, parse the log and check + # for instances of checksum verification failures. + my $log = PostgreSQL::Test::Utils::slurp_file($node_primary->logfile, + $node_primary_loglocation); + unlike( + $log, + qr/page verification failed,.+\d$/, + "no checksum validation errors in primary log (during WAL recovery)" + ); + $node_primary_loglocation = -s $node_primary->logfile; + + # randomize the WAL size, to trigger checkpoints less/more often + my $sb = 32 + int(rand(960)); + $node_primary->append_conf('postgresql.conf', qq[max_wal_size = $sb]); + + note("changing primary max_wal_size to " . 
$sb); + + $node_primary->start; + + # Start a pgbench in the background against the primary + background_pgbench($node_primary->port, 0); + } + + if (!$node_standby->is_alive) + { + $node_standby->start; + $node_standby->stop('fast'); + + # Since the log isn't being written to now, parse the log and check + # for instances of checksum verification failures. + my $log = + PostgreSQL::Test::Utils::slurp_file($node_standby->logfile, + $node_standby_loglocation); + unlike( + $log, + qr/page verification failed,.+\d$/, + "no checksum validation errors in standby_1 log (during WAL recovery)" + ); + $node_standby_loglocation = -s $node_standby->logfile; + + # randomize the WAL size, to trigger checkpoints less/more often + my $sb = 32 + int(rand(960)); + $node_standby->append_conf('postgresql.conf', qq[max_wal_size = $sb]); + + note("changing standby max_wal_size to " . $sb); + + $node_standby->start; + + # Start a read-only pgbench in the background on the standby + background_pgbench($node_standby->port, 1); + } + + $node_primary->safe_psql('postgres', "UPDATE t SET a = a + 1;"); + $node_primary->wait_for_catchup($node_standby, 'write'); + + flip_data_checksums(); + random_sleep() if ($extended); + my $result = $node_primary->safe_psql('postgres', + "SELECT count(*) FROM t WHERE a > 1"); + is($result, '100000', 'ensure data pages can be read back on primary'); + random_sleep(); + + # Potentially powercycle the cluster (the nodes independently). A TODO is + # to randomly stop the nodes in the opposite order too. 
+ if ($extended && cointoss()) + { + $node_primary->stop(stopmode()); + + # print the contents of the control file on the primary + PostgreSQL::Test::Utils::system_log("pg_controldata", + $node_primary->data_dir); + + # slurp the file after shutdown, so that it doesn't interfere with the recovery + my $log = PostgreSQL::Test::Utils::slurp_file($node_primary->logfile, + $node_primary_loglocation); + unlike( + $log, + qr/page verification failed,.+\d$/, + "no checksum validation errors in primary log (outside WAL recovery)" + ); + $node_primary_loglocation = -s $node_primary->logfile; + } + + random_sleep() if ($extended); + + if ($extended && cointoss()) + { + $node_standby->stop(stopmode()); + + # print the contents of the control file on the standby + PostgreSQL::Test::Utils::system_log("pg_controldata", + $node_standby->data_dir); + + # slurp the file after shutdown, so that it doesn't interfere with the recovery + my $log = + PostgreSQL::Test::Utils::slurp_file($node_standby->logfile, + $node_standby_loglocation); + unlike( + $log, + qr/page verification failed,.+\d$/, + "no checksum validation errors in standby_1 log (outside WAL recovery)" + ); + $node_standby_loglocation = -s $node_standby->logfile; + } +} + +# make sure the nodes are running +if (!$node_primary->is_alive) +{ + $node_primary->start; +} + +if (!$node_standby->is_alive) +{ + $node_standby->start; +} + +# Testrun is over, ensure that data reads back as expected and perform a final +# verification of the data checksum state. 
+my $result = + $node_primary->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1"); +is($result, '100000', 'ensure data pages can be read back on primary'); +test_checksum_state($node_primary, $data_checksum_state); +test_checksum_state($node_standby, $data_checksum_state); + +# Perform one final pass over the logs and hunt for unexpected errors +my $log = PostgreSQL::Test::Utils::slurp_file($node_primary->logfile, + $node_primary_loglocation); +unlike( + $log, + qr/page verification failed,.+\d$/, + "no checksum validation errors in primary log"); +$node_primary_loglocation = -s $node_primary->logfile; +$log = PostgreSQL::Test::Utils::slurp_file($node_standby->logfile, + $node_standby_loglocation); +unlike( + $log, + qr/page verification failed,.+\d$/, + "no checksum validation errors in standby_1 log"); +$node_standby_loglocation = -s $node_standby->logfile; + +$node_standby->teardown_node; +$node_primary->teardown_node; + +done_testing(); diff --git a/src/test/modules/test_checksums/t/008_pitr.pl b/src/test/modules/test_checksums/t/008_pitr.pl new file mode 100644 index 00000000000..b9b89f414ab --- /dev/null +++ b/src/test/modules/test_checksums/t/008_pitr.pl @@ -0,0 +1,189 @@ + +# Copyright (c) 2026, PostgreSQL Global Development Group + +use strict; +use warnings FATAL => 'all'; + +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +use FindBin; +use lib $FindBin::RealBin; + +use DataChecksums::Utils; + +# This test suite is expensive, or very expensive, to execute. There are two +# PG_TEST_EXTRA options for running it, "checksum" for a pared-down test suite +# an "checksum_extended" for the full suite. 
+my $extended = undef; +if ($ENV{PG_TEST_EXTRA}) +{ + $extended = 1 if ($ENV{PG_TEST_EXTRA} =~ /\bchecksum_extended\b/); + plan skip_all => 'Expensive data checksums test disabled' + unless ($ENV{PG_TEST_EXTRA} =~ /\bchecksum(_extended)?\b/); +} +else +{ + plan skip_all => 'Expensive data checksums test disabled'; +} + + +my $pgbench = undef; +my $data_checksum_state = 'off'; + +my $node_primary; + +# Invert the state of data checksums in the cluster, if data checksums are on +# then disable them and vice versa. Also performs proper validation of the +# before and after state. +sub flip_data_checksums +{ + my $lsn_pre = undef; + my $lsn_post = undef; + + # First, make sure the cluster is in the state we expect it to be + test_checksum_state($node_primary, $data_checksum_state); + + if ($data_checksum_state eq 'off') + { + # log LSN right before we start changing checksums + $lsn_pre = + $node_primary->safe_psql('postgres', "SELECT pg_current_wal_lsn()"); + note("LSN before enabling: " . $lsn_pre . "\n"); + + # Wait for checksums enabled on the primary + enable_data_checksums($node_primary, wait => 'on'); + + # log LSN right after the primary flips checksums to "on" + $lsn_post = + $node_primary->safe_psql('postgres', "SELECT pg_current_wal_lsn()"); + note("LSN after enabling: " . $lsn_post . 
"\n"); + + $data_checksum_state = 'on'; + } + elsif ($data_checksum_state eq 'on') + { + # log LSN right before we start changing checksums + $lsn_pre = + $node_primary->safe_psql('postgres', "SELECT pg_current_wal_lsn()"); + + disable_data_checksums($node_primary); + + # Wait for checksums disabled on the primary + wait_for_checksum_state($node_primary, 'off'); + + # log LSN right after the primary flips checksums to "off" + $lsn_post = + $node_primary->safe_psql('postgres', "SELECT pg_current_wal_lsn()"); + + $data_checksum_state = 'off'; + } + else + { + # This should only happen due to programmer error when hacking on the + # test code, but since that might pass subtly we error out. + BAIL_OUT('data_checksum_state variable has invalid state:' + . $data_checksum_state); + } + + return ($lsn_pre, $lsn_post); +} +# Start a pgbench run in the background against the server specified via the +# port passed as parameter. +sub background_rw_pgbench +{ + my $port = shift; + + # If a previous pgbench is still running, start by shutting it down. + $pgbench->finish if $pgbench; + + # Randomize the number of pgbench clients in extended mode, else 1 client + my $clients = ($extended ? 1 + int(rand(15)) : 1); + my $runtime = ($extended ? 600 : 5); + + my @cmd = ('pgbench', '-p', $port, '-T', $runtime, '-c', $clients); + + # Randomize whether we spawn connections or not + push(@cmd, '-C') if ($extended && cointoss()); + # Finally add the database name to use + push(@cmd, 'postgres'); + + $pgbench = IPC::Run::start( + \@cmd, + '<' => '/dev/null', + '>' => '/dev/null', + '2>' => '/dev/null', + IPC::Run::timer($PostgreSQL::Test::Utils::timeout_default)); +} + +# Start a primary node with WAL archiving enabled and with enough connections +# available to handle pgbench clients. 
+$node_primary = PostgreSQL::Test::Cluster->new('pitr_main'); +$node_primary->init( + has_archiving => 1, + allows_streaming => 1, + no_data_checksums => 1); +$node_primary->append_conf( + 'postgresql.conf', + qq[ +max_connections = 100 +log_statement = none +]); +$node_primary->start; + +# Prime the cluster with a bit of known data which we can read back to check +# for data consistency as well as page verification faults in the logfile. +$node_primary->safe_psql('postgres', + 'CREATE TABLE t AS SELECT generate_series(1, 100000) AS a;'); +# Initialize and start pgbench in read/write mode against the cluster +my $scalefactor = ($extended ? 10 : 1); +$node_primary->command_ok( + [ + 'pgbench', '-p', $node_primary->port, '-i', '-s', $scalefactor, '-q', + 'postgres' + ]); +background_rw_pgbench($node_primary->port); + +# Take a backup to use for PITR +my $backup_name = 'my_backup'; +$node_primary->backup($backup_name); + +my ($pre_lsn, $post_lsn) = flip_data_checksums(); + +$node_primary->safe_psql('postgres', "UPDATE t SET a = a + 1;"); +$node_primary->safe_psql('postgres', "SELECT pg_create_restore_point('a');"); +$node_primary->safe_psql('postgres', "UPDATE t SET a = a + 1;"); +$node_primary->stop('immediate'); + +my $node_pitr = PostgreSQL::Test::Cluster->new('pitr_backup'); +$node_pitr->init_from_backup( + $node_primary, $backup_name, + standby => 0, + has_restoring => 1); +$node_pitr->append_conf( + 'postgresql.conf', qq{ +recovery_target_lsn = '$post_lsn' +recovery_target_action = 'promote' +recovery_target_inclusive = on +}); + +$node_pitr->start; + +$node_pitr->poll_query_until('postgres', "SELECT pg_is_in_recovery() = 'f';") + or die "Timed out while waiting for PITR promotion"; + +test_checksum_state($node_pitr, $data_checksum_state); +my $result = + $node_pitr->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1"); +is($result, '99999', 'ensure data pages can be read back on primary'); + +$node_pitr->stop; + +my $log = 
PostgreSQL::Test::Utils::slurp_file($node_pitr->logfile, 0); +unlike( + $log, + qr/page verification failed,.+\d$/, + "no checksum validation errors in pitr log"); + +done_testing(); diff --git a/src/test/modules/test_checksums/t/009_fpi.pl b/src/test/modules/test_checksums/t/009_fpi.pl new file mode 100644 index 00000000000..a1cea91f787 --- /dev/null +++ b/src/test/modules/test_checksums/t/009_fpi.pl @@ -0,0 +1,64 @@ + +# Copyright (c) 2026, PostgreSQL Global Development Group + +use strict; +use warnings FATAL => 'all'; + +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +use FindBin; +use lib $FindBin::RealBin; + +use DataChecksums::Utils; + +# Create and start a cluster with one node +my $node = PostgreSQL::Test::Cluster->new('fpi_node'); +$node->init(allows_streaming => 1, no_data_checksums => 1); +# max_connections need to be bumped in order to accommodate for pgbench clients +# and log_statement is dialled down since it otherwise will generate enormous +# amounts of logging. Page verification failures are still logged. 
+$node->append_conf( + 'postgresql.conf', + qq[ +max_connections = 100 +log_statement = none +]); +$node->start; +$node->safe_psql('postgres', 'CREATE EXTENSION test_checksums;'); +# Create some content to have un-checksummed data in the cluster +$node->safe_psql('postgres', + "CREATE TABLE t AS SELECT generate_series(1, 1000000) AS a;"); + +# Enable data checksums and wait for the state transition to 'on' +enable_data_checksums($node, wait => 'on'); + +$node->safe_psql('postgres', 'UPDATE t SET a = a + 1;'); + +disable_data_checksums($node, wait => 1); + +$node->append_conf('postgresql.conf', 'full_page_writes = off'); +$node->restart; +test_checksum_state($node, 'off'); + +$node->safe_psql('postgres', 'UPDATE t SET a = a + 1;'); +$node->safe_psql('postgres', 'DELETE FROM t WHERE a < 10000;'); + +$node->adjust_conf('postgresql.conf', 'full_page_writes', 'on'); +$node->restart; +test_checksum_state($node, 'off'); + +enable_data_checksums($node, wait => 'on'); + +my $result = $node->safe_psql('postgres', 'SELECT count(*) FROM t;'); +is($result, '990003', 'Reading back all data from table t'); + +$node->stop; +my $log = PostgreSQL::Test::Utils::slurp_file($node->logfile, 0); +unlike( + $log, + qr/page verification failed,.+\d$/, + "no checksum validation errors in server log"); + +done_testing(); diff --git a/src/test/modules/test_checksums/t/DataChecksums/Utils.pm b/src/test/modules/test_checksums/t/DataChecksums/Utils.pm new file mode 100644 index 00000000000..9a2269e8a92 --- /dev/null +++ b/src/test/modules/test_checksums/t/DataChecksums/Utils.pm @@ -0,0 +1,262 @@ + +# Copyright (c) 2026, PostgreSQL Global Development Group + +=pod + +=head1 NAME + +DataChecksums::Utils - Utility functions for testing data checksums in a running cluster + +=head1 SYNOPSIS + + use PostgreSQL::Test::Cluster; + use DataChecksums::Utils qw( .. 
); + + # Create, and start, a new cluster + my $node = PostgreSQL::Test::Cluster->new('primary'); + $node->init; + $node->start; + + test_checksum_state($node, 'off'); + + enable_data_checksums($node); + + wait_for_checksum_state($node, 'on'); + + +=cut + +package DataChecksums::Utils; + +use strict; +use warnings FATAL => 'all'; +use Exporter 'import'; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +our @EXPORT = qw( + cointoss + disable_data_checksums + enable_data_checksums + random_sleep + stopmode + test_checksum_state + wait_for_checksum_state + wait_for_cluster_crash +); + +=pod + +=head1 METHODS + +=over + +=item test_checksum_state(node, state) + +Test that the current value of the data checksum GUC in the server running +at B matches B. If the values differ, a test failure is logged. +Returns True if the values match, otherwise False. + +=cut + +sub test_checksum_state +{ + my ($postgresnode, $state) = @_; + + my $result = $postgresnode->safe_psql('postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';" + ); + is($result, $state, 'ensure checksums are set to ' . $state); + return $result eq $state; +} + +=item wait_for_checksum_state(node, state) + +Test the value of the data checksum GUC in the server running at B +repeatedly until it matches B or times out. Processing will run for +$PostgreSQL::Test::Utils::timeout_default seconds before timing out. If the +values differ when the process times out, False is returned and a test failure +is logged, otherwise True. + +=cut + +sub wait_for_checksum_state +{ + my ($postgresnode, $state) = @_; + + my $res = $postgresnode->poll_query_until( + 'postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';", + $state); + is($res, 1, 'ensure data checksums are transitioned to ' . 
$state); + return $res == 1; +} + +=item wait_for_cluster_crash(node, params) + +Repeatedly test if the cluster running at B<node> responds to connections +and return when it no longer does so, or when it times out. Processing will +run for $PostgreSQL::Test::Utils::timeout_default seconds unless a timeout +value is specified as a parameter. Returns True if the cluster crashed, else +False if the process timed out. + +=over + +=item timeout + +Approximate number of seconds to wait for cluster to crash, default is +$PostgreSQL::Test::Utils::timeout_default. There are no real-time guarantees +that the total process time won't exceed the timeout. + +=back + +=cut + +sub wait_for_cluster_crash +{ + my $postgresnode = shift; + my %params = @_; + my $crash = 0; + + $params{timeout} = $PostgreSQL::Test::Utils::timeout_default + unless (defined($params{timeout})); + + for (my $naps = 0; $naps < $params{timeout}; $naps++) + { + if (!$postgresnode->is_alive) + { + $crash = 1; + last; + } + sleep(1); + } + + return $crash == 1; +} + +=item enable_data_checksums($node, %params) + +Function for enabling data checksums in the cluster running at B<node>. + +=over + +=item cost_delay + +The B<cost_delay> to use when enabling data checksums, default is 0. + +=item cost_limit + +The B<cost_limit> to use when enabling data checksums, default is 100. + +=item wait + +If defined, the function will wait for the state defined in this parameter, +or until the wait times out, before returning. The function will wait for +$PostgreSQL::Test::Utils::timeout_default seconds before timing out. 
+ +=back + +=cut + +sub enable_data_checksums +{ + my $postgresnode = shift; + my %params = @_; + + # Set sane defaults for the parameters + $params{cost_delay} = 0 unless (defined($params{cost_delay})); + $params{cost_limit} = 100 unless (defined($params{cost_limit})); + + my $query = <<'EOQ'; +SELECT pg_enable_data_checksums(%s, %s); +EOQ + + $postgresnode->safe_psql('postgres', + sprintf($query, $params{cost_delay}, $params{cost_limit})); + + wait_for_checksum_state($postgresnode, $params{wait}) + if (defined($params{wait})); +} + +=item disable_data_checksums($node, %params) + +Function for disabling data checksums in the cluster running at B<node>. + +=over + +=item wait + +If defined, the function will wait for the state to turn to B<off>, or +until the wait times out, before returning. The function will wait for +$PostgreSQL::Test::Utils::timeout_default seconds before timing out. +Unlike in C<enable_data_checksums> the value of the parameter is discarded. + +=back + +=cut + +sub disable_data_checksums +{ + my $postgresnode = shift; + my %params = @_; + + $postgresnode->safe_psql('postgres', + 'SELECT pg_disable_data_checksums();'); + + wait_for_checksum_state($postgresnode, 'off') if (defined($params{wait})); +} + +=item cointoss + +Helper for retrieving a binary value with random distribution for deciding +whether to turn things off during testing. + +=cut + +sub cointoss +{ + return int(rand() < 0.5); +} + +=item random_sleep(max) + +Helper for injecting random sleeps here and there in the testrun. The sleep +duration will be in the range (0,B<max>), but won't be predictable in order to +avoid sleep patterns that manage to avoid race conditions and timing bugs. +The default B<max> is 3 seconds. + +=cut + +sub random_sleep +{ + my $max = shift; + return if (defined($max) && ($max == 0)); + sleep(int(rand(defined($max) ? $max : 3))) if cointoss; +} + +=item stopmode + +Small helper function for randomly selecting a valid stopmode. 
+ +=back + +=cut + +sub stopmode +{ + return 'immediate' if (cointoss); + return 'fast'; +} + +=pod + +=back + +=cut + +1; diff --git a/src/test/modules/test_checksums/test_checksums--1.0.sql b/src/test/modules/test_checksums/test_checksums--1.0.sql new file mode 100644 index 00000000000..90642d247fa --- /dev/null +++ b/src/test/modules/test_checksums/test_checksums--1.0.sql @@ -0,0 +1,24 @@ +/* src/test/modules/test_checksums/test_checksums--1.0.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION test_checksums" to load this file. \quit + +CREATE FUNCTION dcw_inject_delay_barrier(attach boolean DEFAULT true) + RETURNS pg_catalog.void + AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION dcw_inject_launcher_delay(attach boolean DEFAULT true) + RETURNS pg_catalog.void + AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION dcw_inject_startup_delay(attach boolean DEFAULT true) + RETURNS pg_catalog.void + AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION dcw_inject_fail_database(attach boolean DEFAULT true) + RETURNS pg_catalog.void + AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION dcw_fake_temptable(attach boolean DEFAULT true) + RETURNS pg_catalog.void + AS 'MODULE_PATHNAME' LANGUAGE C; diff --git a/src/test/modules/test_checksums/test_checksums.c b/src/test/modules/test_checksums/test_checksums.c new file mode 100644 index 00000000000..b087a3b4664 --- /dev/null +++ b/src/test/modules/test_checksums/test_checksums.c @@ -0,0 +1,184 @@ +/*-------------------------------------------------------------------------- + * + * test_checksums.c + * Test data checksums + * + * Copyright (c) 2026, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/test/modules/test_checksums/test_checksums.c + * + * ------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "funcapi.h" +#include "miscadmin.h" +#include "postmaster/datachecksum_state.h" 
+#include "storage/latch.h" +#include "utils/injection_point.h" +#include "utils/wait_event.h" + +PG_MODULE_MAGIC; + +extern PGDLLEXPORT void dc_delay_barrier(const char *name, const void *private_data, void *arg); +extern PGDLLEXPORT void dc_modify_db_result(const char *name, const void *private_data, void *arg); +extern PGDLLEXPORT void dc_fake_temptable(const char *name, const void *private_data, void *arg); + +extern PGDLLEXPORT void crash(const char *name, const void *private_data, void *arg); + +/* + * Test for delaying emission of procsignalbarriers. + */ +void +dc_delay_barrier(const char *name, const void *private_data, void *arg) +{ + (void) name; + (void) private_data; + + (void) WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + (3 * 1000), + WAIT_EVENT_PG_SLEEP); +} + +PG_FUNCTION_INFO_V1(dcw_inject_delay_barrier); +Datum +dcw_inject_delay_barrier(PG_FUNCTION_ARGS) +{ +#ifdef USE_INJECTION_POINTS + bool attach = PG_GETARG_BOOL(0); + + if (attach) + InjectionPointAttach("datachecksums-enable-checksums-delay", + "test_checksums", + "dc_delay_barrier", + NULL, + 0); + else + InjectionPointDetach("datachecksums-enable-checksums-delay"); +#else + elog(ERROR, + "test is not working as intended when injection points are disabled"); +#endif + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(dcw_inject_launcher_delay); +Datum +dcw_inject_launcher_delay(PG_FUNCTION_ARGS) +{ +#ifdef USE_INJECTION_POINTS + bool attach = PG_GETARG_BOOL(0); + + if (attach) + InjectionPointAttach("datachecksumsworker-launcher-delay", + "test_checksums", + "dc_delay_barrier", + NULL, + 0); + else + InjectionPointDetach("datachecksumsworker-launcher-delay"); +#else + elog(ERROR, + "test is not working as intended when injection points are disabled"); +#endif + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(dcw_inject_startup_delay); +Datum +dcw_inject_startup_delay(PG_FUNCTION_ARGS) +{ +#ifdef USE_INJECTION_POINTS + bool attach = PG_GETARG_BOOL(0); + + if (attach) + 
InjectionPointAttach("datachecksumsworker-startup-delay", + "test_checksums", + "dc_delay_barrier", + NULL, + 0); + else + InjectionPointDetach("datachecksumsworker-startup-delay"); +#else + elog(ERROR, + "test is not working as intended when injection points are disabled"); +#endif + PG_RETURN_VOID(); +} + +static uint32 db_fail = DATACHECKSUMSWORKER_FAILED; + +void +dc_modify_db_result(const char *name, const void *private_data, void *arg) +{ + DataChecksumsWorkerResult *res = (DataChecksumsWorkerResult *) arg; + uint32 new_res = *(uint32 *) private_data; + + *res = new_res; +} + +PG_FUNCTION_INFO_V1(dcw_inject_fail_database); +Datum +dcw_inject_fail_database(PG_FUNCTION_ARGS) +{ +#ifdef USE_INJECTION_POINTS + bool attach = PG_GETARG_BOOL(0); + + if (attach) + InjectionPointAttach("datachecksumsworker-modify-db-result", + "test_checksums", + "dc_modify_db_result", + &db_fail, + sizeof(uint32)); + else + InjectionPointDetach("datachecksumsworker-modify-db-result"); +#else + elog(ERROR, + "test is not working as intended when injection points are disabled"); +#endif + PG_RETURN_VOID(); +} + +/* + * Test to force waiting for existing temptables. 
+ */ +void +dc_fake_temptable(const char *name, const void *private_data, void *arg) +{ + static bool first_pass = true; + int *numleft = (int *) arg; + + if (first_pass) + *numleft = 1; + first_pass = false; +} + +PG_FUNCTION_INFO_V1(dcw_fake_temptable); +Datum +dcw_fake_temptable(PG_FUNCTION_ARGS) +{ +#ifdef USE_INJECTION_POINTS + bool attach = PG_GETARG_BOOL(0); + + if (attach) + InjectionPointAttach("datachecksumsworker-fake-temptable-wait", + "test_checksums", + "dc_fake_temptable", + NULL, + 0); + else + InjectionPointDetach("datachecksumsworker-fake-temptable-wait"); +#else + elog(ERROR, + "test is not working as intended when injection points are disabled"); +#endif + PG_RETURN_VOID(); +} + +void +crash(const char *name, const void *private_data, void *arg) +{ + abort(); +} diff --git a/src/test/modules/test_checksums/test_checksums.control b/src/test/modules/test_checksums/test_checksums.control new file mode 100644 index 00000000000..84b4cc035a7 --- /dev/null +++ b/src/test/modules/test_checksums/test_checksums.control @@ -0,0 +1,4 @@ +comment = 'Test code for data checksums' +default_version = '1.0' +module_pathname = '$libdir/test_checksums' +relocatable = true diff --git a/src/test/perl/PostgreSQL/Test/Cluster.pm b/src/test/perl/PostgreSQL/Test/Cluster.pm index f8dc732e66e..54e6b646e8f 100644 --- a/src/test/perl/PostgreSQL/Test/Cluster.pm +++ b/src/test/perl/PostgreSQL/Test/Cluster.pm @@ -3898,6 +3898,42 @@ sub advance_wal } } +=item $node->checksum_enable_offline() + +Enable data page checksums in an offline cluster with B. The +caller is responsible for ensuring that the cluster is in the right state for +this operation. + +=cut + +sub checksum_enable_offline +{ + my ($self) = @_; + + print "# Enabling checksums in \"$self->data_dir\"\n"; + PostgreSQL::Test::Utils::system_or_bail('pg_checksums', '-D', + $self->data_dir, '-e'); + return; +} + +=item $node->checksum_disable_offline() + +Disable data page checksums in an offline cluster with B. 
The +caller is responsible for ensuring that the cluster is in the right state for +this operation. + +=cut + +sub checksum_disable_offline +{ + my ($self) = @_; + + print "# Disabling checksums in \"$self->data_dir\"\n"; + PostgreSQL::Test::Utils::system_or_bail('pg_checksums', '-D', + $self->data_dir, '-d'); + return; +} + =pod =back diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index 2b3cf6d8569..81a73c426d2 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -2085,6 +2085,41 @@ pg_stat_progress_create_index| SELECT s.pid, s.param15 AS partitions_done FROM (pg_stat_get_progress_info('CREATE INDEX'::text) s(pid, datid, relid, param1, param2, param3, param4, param5, param6, param7, param8, param9, param10, param11, param12, param13, param14, param15, param16, param17, param18, param19, param20) LEFT JOIN pg_database d ON ((s.datid = d.oid))); +pg_stat_progress_data_checksums| SELECT s.pid, + s.datid, + d.datname, + CASE s.param1 + WHEN 0 THEN 'enabling'::text + WHEN 1 THEN 'disabling'::text + WHEN 2 THEN 'waiting on temporary tables'::text + WHEN 3 THEN 'waiting on barrier'::text + WHEN 4 THEN 'done'::text + ELSE NULL::text + END AS phase, + CASE s.param2 + WHEN '-1'::integer THEN NULL::bigint + ELSE s.param2 + END AS databases_total, + s.param3 AS databases_done, + CASE s.param4 + WHEN '-1'::integer THEN NULL::bigint + ELSE s.param4 + END AS relations_total, + CASE s.param5 + WHEN '-1'::integer THEN NULL::bigint + ELSE s.param5 + END AS relations_done, + CASE s.param6 + WHEN '-1'::integer THEN NULL::bigint + ELSE s.param6 + END AS blocks_total, + CASE s.param7 + WHEN '-1'::integer THEN NULL::bigint + ELSE s.param7 + END AS blocks_done + FROM (pg_stat_get_progress_info('DATACHECKSUMS'::text) s(pid, datid, relid, param1, param2, param3, param4, param5, param6, param7, param8, param9, param10, param11, param12, param13, param14, param15, param16, param17, param18, param19, param20) + 
LEFT JOIN pg_database d ON ((s.datid = d.oid))) + ORDER BY s.datid; pg_stat_progress_repack| SELECT s.pid, s.datid, d.datname, diff --git a/src/test/regress/expected/stats.out b/src/test/regress/expected/stats.out index ea7f7846895..35632f83052 100644 --- a/src/test/regress/expected/stats.out +++ b/src/test/regress/expected/stats.out @@ -51,6 +51,22 @@ client backend|relation|vacuum client backend|temp relation|normal client backend|wal|init client backend|wal|normal +datachecksum launcher|relation|bulkread +datachecksum launcher|relation|bulkwrite +datachecksum launcher|relation|init +datachecksum launcher|relation|normal +datachecksum launcher|relation|vacuum +datachecksum launcher|temp relation|normal +datachecksum launcher|wal|init +datachecksum launcher|wal|normal +datachecksum worker|relation|bulkread +datachecksum worker|relation|bulkwrite +datachecksum worker|relation|init +datachecksum worker|relation|normal +datachecksum worker|relation|vacuum +datachecksum worker|temp relation|normal +datachecksum worker|wal|init +datachecksum worker|wal|normal io worker|relation|bulkread io worker|relation|bulkwrite io worker|relation|init @@ -95,7 +111,7 @@ walsummarizer|wal|init walsummarizer|wal|normal walwriter|wal|init walwriter|wal|normal -(79 rows) +(95 rows) \a -- ensure that both seqscan and indexscan plans are allowed SET enable_seqscan TO on; diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 91b1225da82..ad999aa48dd 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -438,6 +438,8 @@ CheckPointStmt CheckpointStatsData CheckpointerRequest CheckpointerShmemStruct +ChecksumBarrierCondition +ChecksumStateType Chromosome CkptSortItem CkptTsStatus @@ -610,6 +612,7 @@ CustomScan CustomScanMethods CustomScanState CycleCtr +DataChecksumsWorkerOperation DBState DbOidName DCHCacheEntry @@ -628,6 +631,9 @@ DSMREntryType DSMRegistryCtxStruct DSMRegistryEntry DWORD +DataChecksumsWorkerDatabase 
+DataChecksumsWorkerResult +DataChecksumsStateStruct DataDirSyncMethod DataDumperPtr DataPageDeleteStack @@ -4405,6 +4411,7 @@ xl_btree_unlink_page xl_btree_update xl_btree_vacuum xl_checkpoint_redo +xl_checksum_state xl_clog_truncate xl_commit_ts_truncate xl_dbase_create_file_copy_rec -- 2.47.3