]> git.ipfire.org Git - thirdparty/postgresql.git/commitdiff
Add CONCURRENTLY option to REPACK
author: Álvaro Herrera <alvherre@kurilemu.de>
Mon, 6 Apr 2026 19:55:08 +0000 (21:55 +0200)
committer: Álvaro Herrera <alvherre@kurilemu.de>
Mon, 6 Apr 2026 19:55:08 +0000 (21:55 +0200)
When this flag is specified, REPACK no longer acquires access-exclusive
lock while the new copy of the table is being created; instead, it
creates the initial copy under share-update-exclusive lock only (same as
vacuum, etc), and it follows an MVCC snapshot; it sets up a replication
slot starting at that snapshot, and uses a concurrent background worker
to do logical decoding starting at the snapshot to populate a stash of
concurrent data changes.  Those changes can then be re-applied to the
new copy of the table just before swapping the relfilenodes.
Applications can continue to access the original copy of the table
normally until just before the swap, which is the only point at which
the access-exclusive lock is needed.

There are some loose ends in this commit:
1. concurrent repack needs its own replication slot in order to apply
   logical decoding; replication slots are a scarce resource and easy to
   run out of.
2. due to the way the historic snapshot is initially set up, only one
   REPACK process can be running at any one time on the whole system.
3. there's a danger of deadlocking (and thus aborting) due to the lock
   upgrade required in the final phase.

These issues will be addressed in upcoming commits.

The design and most of the code are by Antonin Houska, heavily based on
his own pg_squeeze third-party implementation.

Author: Antonin Houska <ah@cybertec.at>
Co-authored-by: Mihail Nikalayeu <mihailnikalayeu@gmail.com>
Co-authored-by: Álvaro Herrera <alvherre@kurilemu.de>
Reviewed-by: Matthias van de Meent <boekewurm+postgres@gmail.com>
Reviewed-by: Srinath Reddy Sadipiralla <srinath2133@gmail.com>
Reviewed-by: Amit Kapila <amit.kapila16@gmail.com>
Reviewed-by: Jim Jones <jim.jones@uni-muenster.de>
Reviewed-by: Robert Treat <rob@xzilla.net>
Reviewed-by: Noriyoshi Shinoda <noriyoshi.shinoda@hpe.com>
Reviewed-by: vignesh C <vignesh21@gmail.com>
Discussion: https://postgr.es/m/5186.1706694913@antos
Discussion: https://postgr.es/m/202507262156.sb455angijk6@alvherre.pgsql

46 files changed:
doc/src/sgml/monitoring.sgml
doc/src/sgml/mvcc.sgml
doc/src/sgml/ref/repack.sgml
src/Makefile
src/backend/access/heap/heapam.c
src/backend/access/heap/heapam_handler.c
src/backend/access/heap/rewriteheap.c
src/backend/catalog/system_views.sql
src/backend/commands/Makefile
src/backend/commands/matview.c
src/backend/commands/meson.build
src/backend/commands/repack.c
src/backend/commands/repack_worker.c [new file with mode: 0644]
src/backend/commands/tablecmds.c
src/backend/commands/vacuum.c
src/backend/libpq/pqmq.c
src/backend/meson.build
src/backend/postmaster/bgworker.c
src/backend/replication/logical/decode.c
src/backend/replication/pgrepack/Makefile [new file with mode: 0644]
src/backend/replication/pgrepack/meson.build [new file with mode: 0644]
src/backend/replication/pgrepack/pgrepack.c [new file with mode: 0644]
src/backend/storage/ipc/procsignal.c
src/backend/tcop/postgres.c
src/backend/utils/activity/wait_event_names.txt
src/bin/psql/tab-complete.in.c
src/include/access/heapam_xlog.h
src/include/access/tableam.h
src/include/commands/progress.h
src/include/commands/repack.h
src/include/commands/repack_internal.h [new file with mode: 0644]
src/include/replication/decode.h
src/include/storage/lockdefs.h
src/include/storage/procsignal.h
src/makefiles/Makefile.cygwin
src/makefiles/Makefile.win32
src/test/modules/injection_points/Makefile
src/test/modules/injection_points/expected/repack.out [new file with mode: 0644]
src/test/modules/injection_points/expected/repack_toast.out [new file with mode: 0644]
src/test/modules/injection_points/meson.build
src/test/modules/injection_points/specs/repack.spec [new file with mode: 0644]
src/test/modules/injection_points/specs/repack_toast.spec [new file with mode: 0644]
src/test/regress/expected/cluster.out
src/test/regress/expected/rules.out
src/test/regress/sql/cluster.sql
src/tools/pgindent/typedefs.list

index 312374da5e08f5846c87809289ae3b13f67a8d5e..9678877bf3d1a062a645514f56e30f0593a53863 100644 (file)
@@ -6990,14 +6990,35 @@ FROM pg_stat_get_backend_idset() AS backendid;
 
      <row>
       <entry role="catalog_table_entry"><para role="column_definition">
-       <structfield>heap_tuples_written</structfield> <type>bigint</type>
+       <structfield>heap_tuples_inserted</structfield> <type>bigint</type>
       </para>
       <para>
-       Number of heap tuples written.
+       Number of heap tuples inserted.
        This counter only advances when the phase is
        <literal>seq scanning heap</literal>,
-       <literal>index scanning heap</literal>
-       or <literal>writing new heap</literal>.
+       <literal>index scanning heap</literal>,
+       <literal>writing new heap</literal>
+       or <literal>catch-up</literal>.
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>heap_tuples_updated</structfield> <type>bigint</type>
+      </para>
+      <para>
+       Number of heap tuples updated.
+       This counter only advances when the phase is <literal>catch-up</literal>.
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>heap_tuples_deleted</structfield> <type>bigint</type>
+      </para>
+      <para>
+       Number of heap tuples deleted.
+       This counter only advances when the phase is <literal>catch-up</literal>.
       </para></entry>
      </row>
 
@@ -7078,6 +7099,14 @@ FROM pg_stat_get_backend_idset() AS backendid;
        <command>REPACK</command> is currently writing the new heap.
      </entry>
     </row>
+    <row>
+     <entry><literal>catch-up</literal></entry>
+     <entry>
+       <command>REPACK CONCURRENTLY</command> is currently processing the DML
+       commands that other transactions executed during any of the preceding
+       phases.
+     </entry>
+    </row>
     <row>
      <entry><literal>swapping relation files</literal></entry>
      <entry>
index e775260936aa3bf17281cee797cc1d37b09c8415..241caeb3593b9b5bec89c9e4fde0595bf2e668dc 100644 (file)
@@ -1845,15 +1845,17 @@ SELECT pg_advisory_lock(q.id) FROM
    <title>Caveats</title>
 
    <para>
-    Some DDL commands, currently only <link linkend="sql-truncate"><command>TRUNCATE</command></link> and the
-    table-rewriting forms of <link linkend="sql-altertable"><command>ALTER TABLE</command></link>, are not
+    Some commands, currently only <link linkend="sql-truncate"><command>TRUNCATE</command></link>, the
+    table-rewriting forms of <link linkend="sql-altertable"><command>ALTER
+    TABLE</command></link> and <command>REPACK</command> with
+    the <literal>CONCURRENTLY</literal> option, are not
     MVCC-safe.  This means that after the truncation or rewrite commits, the
     table will appear empty to concurrent transactions, if they are using a
-    snapshot taken before the DDL command committed.  This will only be an
+    snapshot taken before the command committed.  This will only be an
     issue for a transaction that did not access the table in question
-    before the DDL command started &mdash; any transaction that has done so
+    before the command started &mdash; any transaction that has done so
     would hold at least an <literal>ACCESS SHARE</literal> table lock,
-    which would block the DDL command until that transaction completes.
+    which would block the truncating or rewriting command until that transaction completes.
     So these commands will not cause any apparent inconsistency in the
     table contents for successive queries on the target table, but they
     could cause visible inconsistency between the contents of the target
index 8ccf7c7a417b5a0c00abf981def7ff7bbe5dc78c..e993dfb31086d7ccfcac0d3f49f6c322efe92ffe 100644 (file)
@@ -28,6 +28,7 @@ REPACK [ ( <replaceable class="parameter">option</replaceable> [, ...] ) ] USING
 
     VERBOSE [ <replaceable class="parameter">boolean</replaceable> ]
     ANALYZE [ <replaceable class="parameter">boolean</replaceable> ]
+    CONCURRENTLY [ <replaceable class="parameter">boolean</replaceable> ]
 
 <phrase>and <replaceable class="parameter">table_and_columns</replaceable> is:</phrase>
 
@@ -54,7 +55,8 @@ REPACK [ ( <replaceable class="parameter">option</replaceable> [, ...] ) ] USING
    processes every table and materialized view in the current database that
    the current user has the <literal>MAINTAIN</literal> privilege on. This
    form of <command>REPACK</command> cannot be executed inside a transaction
-   block.
+   block.  Also, this form is not allowed if
+   the <literal>CONCURRENTLY</literal> option is used.
   </para>
 
   <para>
@@ -67,7 +69,8 @@ REPACK [ ( <replaceable class="parameter">option</replaceable> [, ...] ) ] USING
    When a table is being repacked, an <literal>ACCESS EXCLUSIVE</literal> lock
    is acquired on it. This prevents any other database operations (both reads
    and writes) from operating on the table until the <command>REPACK</command>
-   is finished.
+   is finished. If you want to keep the table accessible during the repacking,
+   consider using the <literal>CONCURRENTLY</literal> option.
   </para>
 
   <refsect2 id="sql-repack-notes-on-clustering" xreflabel="Notes on Clustering">
@@ -198,6 +201,117 @@ REPACK [ ( <replaceable class="parameter">option</replaceable> [, ...] ) ] USING
     </listitem>
    </varlistentry>
 
+   <varlistentry>
+    <term><literal>CONCURRENTLY</literal></term>
+    <listitem>
+     <para>
+      Allow other transactions to use the table while it is being repacked.
+     </para>
+
+     <para>
+      Internally, <command>REPACK</command> copies the contents of the table
+      (ignoring dead tuples) into a new file, sorted by the specified index,
+      and also creates a new file for each index. Then it swaps the old and
+      new files for the table and all the indexes, and deletes the old
+      files. The <literal>ACCESS EXCLUSIVE</literal> lock is needed to make
+      sure that the old files do not change during the processing because the
+      changes would get lost due to the swap.
+     </para>
+
+     <para>
+      With the <literal>CONCURRENTLY</literal> option, the <literal>ACCESS
+      EXCLUSIVE</literal> lock is only acquired to swap the table and index
+      files. The data changes that took place during the creation of the new
+      table and index files are captured using logical decoding
+      (<xref linkend="logicaldecoding"/>) and applied before
+      the <literal>ACCESS EXCLUSIVE</literal> lock is requested. Thus the lock
+      is typically held only for the time needed to swap the files, which
+      should be pretty short. However, the time might still be noticeable if
+      too many data changes have been done to the table while
+      <command>REPACK</command> was waiting for the lock: those changes must
+      be processed just before the files are swapped, while the
+      <literal>ACCESS EXCLUSIVE</literal> lock is being held.
+     </para>
+
+     <para>
+      Note that <command>REPACK</command> with the
+      <literal>CONCURRENTLY</literal> option does not try to order the rows
+      inserted into the table after the repacking started. Also
+      note <command>REPACK</command> might fail to complete due to DDL
+      commands executed on the table by other transactions during the
+      repacking.
+     </para>
+
+     <note>
+      <para>
+       In addition to the temporary space requirements explained in
+       <xref linkend="sql-repack-notes-on-resources"/>,
+       the <literal>CONCURRENTLY</literal> option can add to the usage of
+       temporary space a bit more. The reason is that other transactions can
+       perform DML operations which cannot be applied to the new file until
+       <command>REPACK</command> has copied all the existing tuples from the
+       old file. Thus the tuples inserted into the old file during the copying
+       are also stored separately in a temporary file, until they can be
+       processed.
+      </para>
+     </note>
+
+     <para>
+      The <literal>CONCURRENTLY</literal> option cannot be used in the
+      following cases:
+
+      <itemizedlist>
+       <listitem>
+        <para>
+          The table is <literal>UNLOGGED</literal>.
+        </para>
+       </listitem>
+
+       <listitem>
+        <para>
+          The table is partitioned.
+        </para>
+       </listitem>
+
+       <listitem>
+        <para>
+         The table lacks a primary key and index-based replica identity.
+        </para>
+       </listitem>
+
+       <listitem>
+        <para>
+          The table is a system catalog or a <acronym>TOAST</acronym> table.
+        </para>
+       </listitem>
+
+       <listitem>
+        <para>
+         <command>REPACK</command> is executed inside a transaction block.
+        </para>
+       </listitem>
+
+       <listitem>
+        <para>
+         The <link linkend="guc-max-replication-slots"><varname>max_replication_slots</varname></link>
+         configuration parameter does not allow for creation of an additional
+         replication slot.
+        </para>
+       </listitem>
+      </itemizedlist>
+     </para>
+
+     <warning>
+      <para>
+       <command>REPACK</command> with the <literal>CONCURRENTLY</literal>
+       option is not MVCC-safe, see <xref linkend="mvcc-caveats"/> for
+       details.
+      </para>
+     </warning>
+
+    </listitem>
+   </varlistentry>
+
    <varlistentry>
     <term><literal>VERBOSE</literal></term>
     <listitem>
index 2f31a2f20a71379e575404d675db0f637562145d..a501de0f5cc5719fe9c117d1a83b80b30125dece 100644 (file)
@@ -23,6 +23,7 @@ SUBDIRS = \
        interfaces \
        backend/replication/libpqwalreceiver \
        backend/replication/pgoutput \
+       backend/replication/pgrepack \
        fe_utils \
        bin \
        pl \
index e06ce2db2cf968c85ef24df470231acfe5cb3464..f6ac5a0897ca85e4268fc7e0ee9dba6566108463 100644 (file)
@@ -61,7 +61,8 @@ static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup,
 static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf,
                                                                  Buffer newbuf, HeapTuple oldtup,
                                                                  HeapTuple newtup, HeapTuple old_key_tuple,
-                                                                 bool all_visible_cleared, bool new_all_visible_cleared);
+                                                                 bool all_visible_cleared, bool new_all_visible_cleared,
+                                                                 bool walLogical);
 #ifdef USE_ASSERT_CHECKING
 static void check_lock_if_inplace_updateable_rel(Relation relation,
                                                                                                 const ItemPointerData *otid,
@@ -2716,6 +2717,7 @@ heap_delete(Relation relation, const ItemPointerData *tid,
        uint16          new_infomask,
                                new_infomask2;
        bool            changingPart = (options & TABLE_DELETE_CHANGING_PARTITION) != 0;
+       bool            walLogical = (options & TABLE_DELETE_NO_LOGICAL) == 0;
        bool            have_tuple_lock = false;
        bool            iscombo;
        bool            all_visible_cleared = false;
@@ -2950,7 +2952,8 @@ l1:
         * Compute replica identity tuple before entering the critical section so
         * we don't PANIC upon a memory allocation failure.
         */
-       old_key_tuple = ExtractReplicaIdentity(relation, &tp, true, &old_key_copied);
+       old_key_tuple = walLogical ?
+               ExtractReplicaIdentity(relation, &tp, true, &old_key_copied) : NULL;
 
        /*
         * If this is the first possibly-multixact-able operation in the current
@@ -3040,6 +3043,16 @@ l1:
                                xlrec.flags |= XLH_DELETE_CONTAINS_OLD_KEY;
                }
 
+               /*
+                * Mark the change as not-for-logical-decoding if caller requested so.
+                *
+                * (This is used for changes that affect relations not visible to
+                * other transactions, such as the transient table during concurrent
+                * repack.)
+                */
+               if (!walLogical)
+                       xlrec.flags |= XLH_DELETE_NO_LOGICAL;
+
                XLogBeginInsert();
                XLogRegisterData(&xlrec, SizeOfHeapDelete);
 
@@ -3190,6 +3203,7 @@ heap_update(Relation relation, const ItemPointerData *otid, HeapTuple newtup,
        HeapTuple       heaptup;
        HeapTuple       old_key_tuple = NULL;
        bool            old_key_copied = false;
+       bool            walLogical = (options & TABLE_UPDATE_NO_LOGICAL) == 0;
        Page            page,
                                newpage;
        BlockNumber block;
@@ -4071,7 +4085,8 @@ l2:
                                                                 newbuf, &oldtup, heaptup,
                                                                 old_key_tuple,
                                                                 all_visible_cleared,
-                                                                all_visible_cleared_new);
+                                                                all_visible_cleared_new,
+                                                                walLogical);
                if (newbuf != buffer)
                {
                        PageSetLSN(newpage, recptr);
@@ -8747,7 +8762,8 @@ static XLogRecPtr
 log_heap_update(Relation reln, Buffer oldbuf,
                                Buffer newbuf, HeapTuple oldtup, HeapTuple newtup,
                                HeapTuple old_key_tuple,
-                               bool all_visible_cleared, bool new_all_visible_cleared)
+                               bool all_visible_cleared, bool new_all_visible_cleared,
+                               bool walLogical)
 {
        xl_heap_update xlrec;
        xl_heap_header xlhdr;
@@ -8758,7 +8774,7 @@ log_heap_update(Relation reln, Buffer oldbuf,
                                suffixlen = 0;
        XLogRecPtr      recptr;
        Page            page = BufferGetPage(newbuf);
-       bool            need_tuple_data = RelationIsLogicallyLogged(reln);
+       bool            need_tuple_data = walLogical && RelationIsLogicallyLogged(reln);
        bool            init;
        int                     bufflags;
 
index 07f07188d46c9f98236b206eaaebcd9d9a73b439..20d3b46e062074ef21ad5c1d115740ccb39b8a57 100644 (file)
 static void reform_and_rewrite_tuple(HeapTuple tuple,
                                                                         Relation OldHeap, Relation NewHeap,
                                                                         Datum *values, bool *isnull, RewriteState rwstate);
+static void heap_insert_for_repack(HeapTuple tuple, Relation OldHeap,
+                                                                  Relation NewHeap, Datum *values, bool *isnull,
+                                                                  BulkInsertState bistate);
+static HeapTuple reform_tuple(HeapTuple tuple, Relation OldHeap,
+                                                         Relation NewHeap, Datum *values, bool *isnull);
 
 static bool SampleHeapTupleVisible(TableScanDesc scan, Buffer buffer,
                                                                   HeapTuple tuple,
@@ -589,6 +594,7 @@ static void
 heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
                                                                 Relation OldIndex, bool use_sort,
                                                                 TransactionId OldestXmin,
+                                                                Snapshot snapshot,
                                                                 TransactionId *xid_cutoff,
                                                                 MultiXactId *multi_cutoff,
                                                                 double *num_tuples,
@@ -596,6 +602,7 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
                                                                 double *tups_recently_dead)
 {
        RewriteState rwstate;
+       BulkInsertState bistate;
        IndexScanDesc indexScan;
        TableScanDesc tableScan;
        HeapScanDesc heapScan;
@@ -609,6 +616,7 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
        bool       *isnull;
        BufferHeapTupleTableSlot *hslot;
        BlockNumber prev_cblock = InvalidBlockNumber;
+       bool            concurrent = snapshot != NULL;
 
        /* Remember if it's a system catalog */
        is_system_catalog = IsSystemRelation(OldHeap);
@@ -624,10 +632,21 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
        values = palloc_array(Datum, natts);
        isnull = palloc_array(bool, natts);
 
-       /* Initialize the rewrite operation */
-       rwstate = begin_heap_rewrite(OldHeap, NewHeap, OldestXmin, *xid_cutoff,
-                                                                *multi_cutoff);
+       /*
+        * In non-concurrent mode, initialize the rewrite operation.  This is not
+        * needed in concurrent mode.
+        */
+       if (!concurrent)
+               rwstate = begin_heap_rewrite(OldHeap, NewHeap, OldestXmin,
+                                                                        *xid_cutoff, *multi_cutoff);
+       else
+               rwstate = NULL;
 
+       /* In concurrent mode, prepare for bulk-insert operation. */
+       if (concurrent)
+               bistate = GetBulkInsertState();
+       else
+               bistate = NULL;
 
        /* Set up sorting if wanted */
        if (use_sort)
@@ -641,6 +660,9 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
         * Prepare to scan the OldHeap.  To ensure we see recently-dead tuples
         * that still need to be copied, we scan with SnapshotAny and use
         * HeapTupleSatisfiesVacuum for the visibility test.
+        *
+        * In the CONCURRENTLY case, we do regular MVCC visibility tests, using
+        * the snapshot passed by the caller.
         */
        if (OldIndex != NULL && !use_sort)
        {
@@ -657,7 +679,9 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
 
                tableScan = NULL;
                heapScan = NULL;
-               indexScan = index_beginscan(OldHeap, OldIndex, SnapshotAny, NULL, 0, 0,
+               indexScan = index_beginscan(OldHeap, OldIndex,
+                                                                       snapshot ? snapshot : SnapshotAny,
+                                                                       NULL, 0, 0,
                                                                        SO_NONE);
                index_rescan(indexScan, NULL, 0, NULL, 0);
        }
@@ -667,7 +691,9 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
                pgstat_progress_update_param(PROGRESS_REPACK_PHASE,
                                                                         PROGRESS_REPACK_PHASE_SEQ_SCAN_HEAP);
 
-               tableScan = table_beginscan(OldHeap, SnapshotAny, 0, (ScanKey) NULL,
+               tableScan = table_beginscan(OldHeap,
+                                                                       snapshot ? snapshot : SnapshotAny,
+                                                                       0, (ScanKey) NULL,
                                                                        SO_NONE);
                heapScan = (HeapScanDesc) tableScan;
                indexScan = NULL;
@@ -744,83 +770,94 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
                buf = hslot->buffer;
 
                /*
-                * To be able to guarantee that we can set the hint bit, acquire an
-                * exclusive lock on the old buffer. We need the hint bits, set in
-                * heapam_relation_copy_for_cluster() -> HeapTupleSatisfiesVacuum(),
-                * to be set, as otherwise reform_and_rewrite_tuple() ->
-                * rewrite_heap_tuple() will get confused. Specifically,
-                * rewrite_heap_tuple() checks for HEAP_XMAX_INVALID in the old tuple
-                * to determine whether to check the old-to-new mapping hash table.
-                *
-                * It'd be better if we somehow could avoid setting hint bits on the
-                * old page. One reason to use VACUUM FULL are very bloated tables -
-                * rewriting most of the old table during VACUUM FULL doesn't exactly
-                * help...
+                * In concurrent mode, our table or index scan has used regular MVCC
+                * visibility test against a snapshot passed by caller; therefore we
+                * don't need another visibility test.  In non-concurrent mode
+                * however, we must test the visibility of each tuple we read.
                 */
-               LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
-
-               switch (HeapTupleSatisfiesVacuum(tuple, OldestXmin, buf))
+               if (!concurrent)
                {
-                       case HEAPTUPLE_DEAD:
-                               /* Definitely dead */
-                               isdead = true;
-                               break;
-                       case HEAPTUPLE_RECENTLY_DEAD:
-                               *tups_recently_dead += 1;
-                               pg_fallthrough;
-                       case HEAPTUPLE_LIVE:
-                               /* Live or recently dead, must copy it */
-                               isdead = false;
-                               break;
-                       case HEAPTUPLE_INSERT_IN_PROGRESS:
+                       /*
+                        * To be able to guarantee that we can set the hint bit, acquire
+                        * an exclusive lock on the old buffer. We need the hint bits, set
+                        * in heapam_relation_copy_for_cluster() ->
+                        * HeapTupleSatisfiesVacuum(), to be set, as otherwise
+                        * reform_and_rewrite_tuple() -> rewrite_heap_tuple() will get
+                        * confused. Specifically, rewrite_heap_tuple() checks for
+                        * HEAP_XMAX_INVALID in the old tuple to determine whether to
+                        * check the old-to-new mapping hash table.
+                        *
+                        * It'd be better if we somehow could avoid setting hint bits on
+                        * the old page. One reason to use VACUUM FULL are very bloated
+                        * tables - rewriting most of the old table during VACUUM FULL
+                        * doesn't exactly help...
+                        */
+                       LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
 
-                               /*
-                                * Since we hold exclusive lock on the relation, normally the
-                                * only way to see this is if it was inserted earlier in our
-                                * own transaction.  However, it can happen in system
-                                * catalogs, since we tend to release write lock before commit
-                                * there.  Give a warning if neither case applies; but in any
-                                * case we had better copy it.
-                                */
-                               if (!is_system_catalog &&
-                                       !TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmin(tuple->t_data)))
-                                       elog(WARNING, "concurrent insert in progress within table \"%s\"",
-                                                RelationGetRelationName(OldHeap));
-                               /* treat as live */
-                               isdead = false;
-                               break;
-                       case HEAPTUPLE_DELETE_IN_PROGRESS:
+                       switch (HeapTupleSatisfiesVacuum(tuple, OldestXmin, buf))
+                       {
+                               case HEAPTUPLE_DEAD:
+                                       /* Definitely dead */
+                                       isdead = true;
+                                       break;
+                               case HEAPTUPLE_RECENTLY_DEAD:
+                                       *tups_recently_dead += 1;
+                                       pg_fallthrough;
+                               case HEAPTUPLE_LIVE:
+                                       /* Live or recently dead, must copy it */
+                                       isdead = false;
+                                       break;
+                               case HEAPTUPLE_INSERT_IN_PROGRESS:
 
-                               /*
-                                * Similar situation to INSERT_IN_PROGRESS case.
-                                */
-                               if (!is_system_catalog &&
-                                       !TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetUpdateXid(tuple->t_data)))
-                                       elog(WARNING, "concurrent delete in progress within table \"%s\"",
-                                                RelationGetRelationName(OldHeap));
-                               /* treat as recently dead */
-                               *tups_recently_dead += 1;
-                               isdead = false;
-                               break;
-                       default:
-                               elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
-                               isdead = false; /* keep compiler quiet */
-                               break;
-               }
+                                       /*
+                                        * As long as we hold exclusive lock on the relation,
+                                        * normally the only way to see this is if it was inserted
+                                        * earlier in our own transaction.  However, it can happen
+                                        * in system catalogs, since we tend to release write lock
+                                        * before commit there. Give a warning if neither case
+                                        * applies; but in any case we had better copy it.
+                                        */
+                                       if (!is_system_catalog &&
+                                               !TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmin(tuple->t_data)))
+                                               elog(WARNING, "concurrent insert in progress within table \"%s\"",
+                                                        RelationGetRelationName(OldHeap));
+                                       /* treat as live */
+                                       isdead = false;
+                                       break;
+                               case HEAPTUPLE_DELETE_IN_PROGRESS:
 
-               LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+                                       /*
+                                        * Similar situation to INSERT_IN_PROGRESS case.
+                                        */
+                                       if (!is_system_catalog &&
+                                               !TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetUpdateXid(tuple->t_data)))
+                                               elog(WARNING, "concurrent delete in progress within table \"%s\"",
+                                                        RelationGetRelationName(OldHeap));
+                                       /* treat as recently dead */
+                                       *tups_recently_dead += 1;
+                                       isdead = false;
+                                       break;
+                               default:
+                                       elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
+                                       isdead = false; /* keep compiler quiet */
+                                       break;
+                       }
 
-               if (isdead)
-               {
-                       *tups_vacuumed += 1;
-                       /* heap rewrite module still needs to see it... */
-                       if (rewrite_heap_dead_tuple(rwstate, tuple))
+                       LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+
+                       if (isdead)
                        {
-                               /* A previous recently-dead tuple is now known dead */
                                *tups_vacuumed += 1;
-                               *tups_recently_dead -= 1;
+                               /* heap rewrite module still needs to see it... */
+                               if (rewrite_heap_dead_tuple(rwstate, tuple))
+                               {
+                                       /* A previous recently-dead tuple is now known dead */
+                                       *tups_vacuumed += 1;
+                                       *tups_recently_dead -= 1;
+                               }
+
+                               continue;
                        }
-                       continue;
                }
 
                *num_tuples += 1;
@@ -839,12 +876,16 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
                {
                        const int       ct_index[] = {
                                PROGRESS_REPACK_HEAP_TUPLES_SCANNED,
-                               PROGRESS_REPACK_HEAP_TUPLES_WRITTEN
+                               PROGRESS_REPACK_HEAP_TUPLES_INSERTED
                        };
                        int64           ct_val[2];
 
-                       reform_and_rewrite_tuple(tuple, OldHeap, NewHeap,
-                                                                        values, isnull, rwstate);
+                       if (!concurrent)
+                               reform_and_rewrite_tuple(tuple, OldHeap, NewHeap,
+                                                                                values, isnull, rwstate);
+                       else
+                               heap_insert_for_repack(tuple, OldHeap, NewHeap,
+                                                                          values, isnull, bistate);
 
                        /*
                         * In indexscan mode and also VACUUM FULL, report increase in
@@ -892,12 +933,17 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
                                break;
 
                        n_tuples += 1;
-                       reform_and_rewrite_tuple(tuple,
-                                                                        OldHeap, NewHeap,
-                                                                        values, isnull,
-                                                                        rwstate);
+                       if (!concurrent)
+                               reform_and_rewrite_tuple(tuple,
+                                                                                OldHeap, NewHeap,
+                                                                                values, isnull,
+                                                                                rwstate);
+                       else
+                               heap_insert_for_repack(tuple, OldHeap, NewHeap,
+                                                                          values, isnull, bistate);
+
                        /* Report n_tuples */
-                       pgstat_progress_update_param(PROGRESS_REPACK_HEAP_TUPLES_WRITTEN,
+                       pgstat_progress_update_param(PROGRESS_REPACK_HEAP_TUPLES_INSERTED,
                                                                                 n_tuples);
                }
 
@@ -905,7 +951,10 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
        }
 
        /* Write out any remaining tuples, and fsync if needed */
-       end_heap_rewrite(rwstate);
+       if (rwstate)
+               end_heap_rewrite(rwstate);
+       if (bistate)
+               FreeBulkInsertState(bistate);
 
        /* Clean up */
        pfree(values);
@@ -2303,27 +2352,84 @@ static void
 reform_and_rewrite_tuple(HeapTuple tuple,
                                                 Relation OldHeap, Relation NewHeap,
                                                 Datum *values, bool *isnull, RewriteState rwstate)
+{
+       HeapTuple       newtuple;
+
+       /* Null out any dropped columns (or copy the tuple verbatim) */
+       newtuple = reform_tuple(tuple, OldHeap, NewHeap, values, isnull);
+
+       /* The heap rewrite module does the rest */
+       rewrite_heap_tuple(rwstate, tuple, newtuple);
+
+       heap_freetuple(newtuple);
+}
+
+/*
+ * Insert tuple when processing REPACK CONCURRENTLY.
+ *
+ * rewriteheap.c is not used in the CONCURRENTLY case because it'd be
+ * difficult to do the same in the catch-up phase (as the logical
+ * decoding does not provide us with sufficient visibility
+ * information). Thus we must use heap_insert() both during the
+ * catch-up and here.
+ *
+ * We pass the NO_LOGICAL flag to heap_insert() in order to skip logical
+ * decoding: as soon as REPACK CONCURRENTLY swaps the relation files, it drops
+ * this relation, so no logical replication subscription should need the data.
+ *
+ * BulkInsertState is used because many tuples are inserted in the typical
+ * case.
+ */
+static void
+heap_insert_for_repack(HeapTuple tuple, Relation OldHeap, Relation NewHeap,
+                                          Datum *values, bool *isnull, BulkInsertState bistate)
+{
+       HeapTuple       newtuple;
+
+       /* Null out any dropped columns (or copy the tuple verbatim) */
+       newtuple = reform_tuple(tuple, OldHeap, NewHeap, values, isnull);
+
+       /* Plain heap_insert, skipping logical decoding (see function header) */
+       heap_insert(NewHeap, newtuple, GetCurrentCommandId(true),
+                               HEAP_INSERT_NO_LOGICAL, bistate);
+
+       heap_freetuple(newtuple);
+}
+
+/*
+ * Subroutine for reform_and_rewrite_tuple and heap_insert_for_repack.
+ *
+ * Deform the given tuple, set values of dropped columns to NULL, form a new
+ * tuple and return it.  If no attributes need to be changed in this way, a
+ * copy of the original tuple is returned.  Caller is responsible for freeing
+ * the returned tuple.
+ *
+ * XXX this coding assumes that both relations have the same tupledesc.
+ */
+static HeapTuple
+reform_tuple(HeapTuple tuple, Relation OldHeap, Relation NewHeap,
+                        Datum *values, bool *isnull)
 {
        TupleDesc       oldTupDesc = RelationGetDescr(OldHeap);
        TupleDesc       newTupDesc = RelationGetDescr(NewHeap);
-       HeapTuple       copiedTuple;
-       int                     i;
+       bool            needs_reform = false;
+
+       /* Skip work if the tuple doesn't need any attributes changed */
+       for (int i = 0; i < newTupDesc->natts; i++)
+       {
+               /* NOTE(review): no early break here; fine while natts stays small */
+               if (TupleDescCompactAttr(newTupDesc, i)->attisdropped &&
+                       !heap_attisnull(tuple, i + 1, newTupDesc))
+                       needs_reform = true;
+       }
+       if (!needs_reform)
+               return heap_copytuple(tuple);
 
        heap_deform_tuple(tuple, oldTupDesc, values, isnull);
 
+       /* Be sure to null out any dropped columns */
+       for (int i = 0; i < newTupDesc->natts; i++)
        {
                if (TupleDescCompactAttr(newTupDesc, i)->attisdropped)
                        isnull[i] = true;
        }
 
-       copiedTuple = heap_form_tuple(newTupDesc, values, isnull);
-
-       /* The heap rewrite module does the rest */
-       rewrite_heap_tuple(rwstate, tuple, copiedTuple);
-
-       heap_freetuple(copiedTuple);
+       return heap_form_tuple(newTupDesc, values, isnull);
 }
 
 /*
index f707b102c7234323bc83f6674e94a12d7e2f2117..5a5398a76ae7d46cfa937da90c2961d7c8473af9 100644 (file)
@@ -621,9 +621,9 @@ raw_heap_insert(RewriteState state, HeapTuple tup)
                uint32          options = HEAP_INSERT_SKIP_FSM;
 
                /*
-                * While rewriting the heap for VACUUM FULL / CLUSTER, make sure data
-                * for the TOAST table are not logically decoded.  The main heap is
-                * WAL-logged as XLOG FPI records, which are not logically decoded.
+                * While rewriting the heap for REPACK, make sure data for the TOAST
+                * table are not logically decoded.  The main heap is WAL-logged as
+                * XLOG FPI records, which are not logically decoded.
                 */
                options |= HEAP_INSERT_NO_LOGICAL;
 
index eba25aa3e4d79d36bc0393f0c8f7d441c777bf3c..64ef4897571a3ced43369a60bfee8744f0771c4c 100644 (file)
@@ -1343,16 +1343,19 @@ CREATE VIEW pg_stat_progress_repack AS
                       WHEN 2 THEN 'index scanning heap'
                       WHEN 3 THEN 'sorting tuples'
                       WHEN 4 THEN 'writing new heap'
-                      WHEN 5 THEN 'swapping relation files'
-                      WHEN 6 THEN 'rebuilding index'
-                      WHEN 7 THEN 'performing final cleanup'
+                      WHEN 5 THEN 'catch-up'
+                      WHEN 6 THEN 'swapping relation files'
+                      WHEN 7 THEN 'rebuilding index'
+                      WHEN 8 THEN 'performing final cleanup'
                       END AS phase,
         CAST(S.param3 AS oid) AS repack_index_relid,
         S.param4 AS heap_tuples_scanned,
-        S.param5 AS heap_tuples_written,
-        S.param6 AS heap_blks_total,
-        S.param7 AS heap_blks_scanned,
-        S.param8 AS index_rebuild_count
+        S.param5 AS heap_tuples_inserted,
+        S.param6 AS heap_tuples_updated,
+        S.param7 AS heap_tuples_deleted,
+        S.param8 AS heap_blks_total,
+        S.param9 AS heap_blks_scanned,
+        S.param10 AS index_rebuild_count
     FROM pg_stat_get_progress_info('REPACK') AS S
         LEFT JOIN pg_database D ON S.datid = D.oid;
 
@@ -1370,7 +1373,7 @@ CREATE VIEW pg_stat_progress_cluster AS
         phase,
         repack_index_relid AS cluster_index_relid,
         heap_tuples_scanned,
-        heap_tuples_written,
+        heap_tuples_inserted + heap_tuples_updated AS heap_tuples_written,
         heap_blks_total,
         heap_blks_scanned,
         index_rebuild_count
index fe1bba3a9b93c647b6b424f0da6dafbecb732bfd..5b9d084977e48a3065f9545aa88032169dc61d39 100644 (file)
@@ -51,6 +51,7 @@ OBJS = \
        propgraphcmds.o \
        publicationcmds.o \
        repack.o \
+       repack_worker.o \
        schemacmds.o \
        seclabel.o \
        sequence.o \
index 5db4fe75dcecdd83b561e0aad483fd48e84741ea..f7d8007f796b0b162131eceb4cf5c004f4d35d26 100644 (file)
@@ -893,6 +893,7 @@ static void
 refresh_by_heap_swap(Oid matviewOid, Oid OIDNewHeap, char relpersistence)
 {
        finish_heap_swap(matviewOid, OIDNewHeap, false, false, true, true,
+                                        true,          /* reindex */
                                         RecentXmin, ReadNextMultiXactId(), relpersistence);
 }
 
index f624aae74af5a7143ae9e49f7ec66531b2b060aa..9f258d566ebf69d5217835e42bf0463538dc6b66 100644 (file)
@@ -39,6 +39,7 @@ backend_sources += files(
   'propgraphcmds.c',
   'publicationcmds.c',
   'repack.c',
+  'repack_worker.c',
   'schemacmds.c',
   'seclabel.c',
   'sequence.c',
index 20f0a572236151081f2368f554ef3f162af4fd59..17b639b3b4484068c2c14edfb33e9a483b05f3ec 100644 (file)
@@ -4,6 +4,22 @@
  *    REPACK a table; formerly known as CLUSTER.  VACUUM FULL also uses
  *    parts of this code.
  *
+ * There are two somewhat different ways to rewrite a table.  In non-
+ * concurrent mode, it's easy: take AccessExclusiveLock, create a new
+ * transient relation, copy the tuples over to the relfilenode of the new
+ * relation, swap the relfilenodes, then drop the old relation.
+ *
+ * In concurrent mode, we lock the table with only ShareUpdateExclusiveLock,
+ * then do an initial copy as above.  However, while the tuples are being
+ * copied, concurrent transactions could modify the table. To cope with those
+ * changes, we rely on logical decoding to obtain them from WAL.  A bgworker
+ * consumes WAL while the initial copy is ongoing (to prevent excessive WAL
+ * from being retained), and accumulates the changes in a file.  Once the
+ * initial copy is complete, we read the changes from the file and re-apply
+ * them on the new heap.  Then we upgrade our ShareUpdateExclusiveLock to
+ * AccessExclusiveLock and swap the relfilenodes.  This way, the time we hold
+ * a strong lock on the table is much reduced, and the bloat is eliminated.
+ *
  *
  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994-5, Regents of the University of California
 #include "catalog/namespace.h"
 #include "catalog/objectaccess.h"
 #include "catalog/pg_am.h"
+#include "catalog/pg_constraint.h"
 #include "catalog/pg_inherits.h"
 #include "catalog/toasting.h"
 #include "commands/defrem.h"
 #include "commands/progress.h"
 #include "commands/repack.h"
+#include "commands/repack_internal.h"
 #include "commands/tablecmds.h"
 #include "commands/vacuum.h"
+#include "executor/executor.h"
+#include "libpq/pqformat.h"
+#include "libpq/pqmq.h"
 #include "miscadmin.h"
 #include "optimizer/optimizer.h"
 #include "pgstat.h"
 #include "storage/bufmgr.h"
 #include "storage/lmgr.h"
 #include "storage/predicate.h"
+#include "storage/proc.h"
 #include "utils/acl.h"
 #include "utils/fmgroids.h"
 #include "utils/guc.h"
+#include "utils/injection_point.h"
 #include "utils/inval.h"
 #include "utils/lsyscache.h"
 #include "utils/memutils.h"
@@ -54,6 +77,7 @@
 #include "utils/relmapper.h"
 #include "utils/snapmgr.h"
 #include "utils/syscache.h"
+#include "utils/wait_event_types.h"
 
 /*
  * This struct is used to pass around the information on tables to be
@@ -66,12 +90,75 @@ typedef struct
        Oid                     indexOid;
 } RelToCluster;
 
+/*
+ * The first file exported by the decoding worker must contain a snapshot, the
+ * following ones contain the data changes.
+ */
+#define WORKER_FILE_SNAPSHOT   0
+
+/*
+ * Information needed to apply concurrent data changes.
+ */
+typedef struct ChangeContext
+{
+       /* The relation the changes are applied to. */
+       Relation        cc_rel;
+
+       /* Needed to update indexes of rel_dst. */
+       ResultRelInfo *cc_rri;
+       EState     *cc_estate;
+
+       /*
+        * Existing tuples to UPDATE and DELETE are located via this index. We
+        * keep the scankey in partially initialized state to avoid repeated work.
+        * sk_argument is completed on the fly.
+        */
+       Relation        cc_ident_index;
+       ScanKey         cc_ident_key;
+       int                     cc_ident_key_nentries;
+
+       /* Sequential number of the file containing the changes. */
+       int                     cc_file_seq;
+} ChangeContext;
+
+/*
+ * Backend-local information to control the decoding worker.
+ */
+typedef struct DecodingWorker
+{
+       /* The worker. */
+       BackgroundWorkerHandle *handle;
+
+       /* DecodingWorkerShared is in this segment. */
+       dsm_segment *seg;
+
+       /* Handle of the error queue. */
+       shm_mq_handle *error_mqh;
+} DecodingWorker;
+
+/* Pointer to currently running decoding worker. */
+static DecodingWorker *decoding_worker = NULL;
+
+/*
+ * Is there a message sent by a repack worker that the backend needs to
+ * receive?
+ */
+volatile sig_atomic_t RepackMessagePending = false;
+
+static LOCKMODE RepackLockLevel(bool concurrent);
 static bool cluster_rel_recheck(RepackCommand cmd, Relation OldHeap,
-                                                               Oid indexOid, Oid userid, int options);
-static void rebuild_relation(Relation OldHeap, Relation index, bool verbose);
+                                                               Oid indexOid, Oid userid, LOCKMODE lmode,
+                                                               int options);
+static void check_concurrent_repack_requirements(Relation rel,
+                                                                                                Oid *ident_idx_p);
+static void rebuild_relation(Relation OldHeap, Relation index, bool verbose,
+                                                        Oid ident_idx);
 static void copy_table_data(Relation NewHeap, Relation OldHeap, Relation OldIndex,
-                                                       bool verbose, bool *pSwapToastByContent,
-                                                       TransactionId *pFreezeXid, MultiXactId *pCutoffMulti);
+                                                       Snapshot snapshot,
+                                                       bool verbose,
+                                                       bool *pSwapToastByContent,
+                                                       TransactionId *pFreezeXid,
+                                                       MultiXactId *pCutoffMulti);
 static List *get_tables_to_repack(RepackCommand cmd, bool usingindex,
                                                                  MemoryContext permcxt);
 static List *get_tables_to_repack_partitioned(RepackCommand cmd,
@@ -79,10 +166,47 @@ static List *get_tables_to_repack_partitioned(RepackCommand cmd,
                                                                                          MemoryContext permcxt);
 static bool repack_is_permitted_for_relation(RepackCommand cmd,
                                                                                         Oid relid, Oid userid);
+
+static void apply_concurrent_changes(BufFile *file, ChangeContext *chgcxt);
+static void apply_concurrent_insert(Relation rel, TupleTableSlot *slot,
+                                                                       ChangeContext *chgcxt);
+static void apply_concurrent_update(Relation rel, TupleTableSlot *spilled_tuple,
+                                                                       TupleTableSlot *ondisk_tuple,
+                                                                       ChangeContext *chgcxt);
+static void apply_concurrent_delete(Relation rel, TupleTableSlot *slot);
+static void restore_tuple(BufFile *file, Relation relation,
+                                                 TupleTableSlot *slot);
+static void adjust_toast_pointers(Relation relation, TupleTableSlot *dest,
+                                                                 TupleTableSlot *src);
+static bool find_target_tuple(Relation rel, ChangeContext *chgcxt,
+                                                         TupleTableSlot *locator,
+                                                         TupleTableSlot *received);
+static void process_concurrent_changes(XLogRecPtr end_of_wal,
+                                                                          ChangeContext *chgcxt,
+                                                                          bool done);
+static void initialize_change_context(ChangeContext *chgcxt,
+                                                                         Relation relation,
+                                                                         Oid ident_index_id);
+static void release_change_context(ChangeContext *chgcxt);
+static void rebuild_relation_finish_concurrent(Relation NewHeap, Relation OldHeap,
+                                                                                          Oid identIdx,
+                                                                                          TransactionId frozenXid,
+                                                                                          MultiXactId cutoffMulti);
+static List *build_new_indexes(Relation NewHeap, Relation OldHeap, List *OldIndexes);
+static void copy_index_constraints(Relation old_index, Oid new_index_id,
+                                                                  Oid new_heap_id);
 static Relation process_single_relation(RepackStmt *stmt,
+                                                                               LOCKMODE lockmode,
+                                                                               bool isTopLevel,
                                                                                ClusterParams *params);
 static Oid     determine_clustered_index(Relation rel, bool usingindex,
                                                                          const char *indexname);
+
+static void start_repack_decoding_worker(Oid relid);
+static void stop_repack_decoding_worker(void);
+static Snapshot get_initial_snapshot(DecodingWorker *worker);
+
+static void ProcessRepackMessage(StringInfo msg);
 static const char *RepackCommandAsString(RepackCommand cmd);
 
 
@@ -115,6 +239,7 @@ ExecRepack(ParseState *pstate, RepackStmt *stmt, bool isTopLevel)
        ClusterParams params = {0};
        Relation        rel = NULL;
        MemoryContext repack_context;
+       LOCKMODE        lockmode;
        List       *rtcs;
 
        /* Parse option list */
@@ -125,6 +250,16 @@ ExecRepack(ParseState *pstate, RepackStmt *stmt, bool isTopLevel)
                else if (strcmp(opt->defname, "analyze") == 0 ||
                                 strcmp(opt->defname, "analyse") == 0)
                        params.options |= defGetBoolean(opt) ? CLUOPT_ANALYZE : 0;
+               else if (strcmp(opt->defname, "concurrently") == 0 &&
+                                defGetBoolean(opt))
+               {
+                       if (stmt->command != REPACK_COMMAND_REPACK)
+                               ereport(ERROR,
+                                               errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                                               errmsg("CONCURRENTLY option not supported for %s",
+                                                          RepackCommandAsString(stmt->command)));
+                       params.options |= CLUOPT_CONCURRENT;
+               }
                else
                        ereport(ERROR,
                                        errcode(ERRCODE_SYNTAX_ERROR),
@@ -134,13 +269,31 @@ ExecRepack(ParseState *pstate, RepackStmt *stmt, bool isTopLevel)
                                        parser_errposition(pstate, opt->location));
        }
 
+       /* Determine the lock mode to use. */
+       lockmode = RepackLockLevel((params.options & CLUOPT_CONCURRENT) != 0);
+
+       if ((params.options & CLUOPT_CONCURRENT) != 0)
+       {
+               /*
+                * Make sure we're not in a transaction block.
+                *
+                * The reason is that repack_setup_logical_decoding() could wait
+                * indefinitely for our XID to complete. (The deadlock detector would
+                * not recognize it because we'd be waiting for ourselves, i.e. no
+                * real lock conflict.) It would be possible to run in a transaction
+                * block if we had no XID, but this restriction is simpler for users
+                * to understand and we don't lose any functionality.
+                */
+               PreventInTransactionBlock(isTopLevel, "REPACK (CONCURRENTLY)");
+       }
+
        /*
         * If a single relation is specified, process it and we're done ... unless
         * the relation is a partitioned table, in which case we fall through.
         */
        if (stmt->relation != NULL)
        {
-               rel = process_single_relation(stmt, &params);
+               rel = process_single_relation(stmt, lockmode, isTopLevel, &params);
                if (rel == NULL)
                        return;                         /* all done */
        }
@@ -156,10 +309,31 @@ ExecRepack(ParseState *pstate, RepackStmt *stmt, bool isTopLevel)
                                           "REPACK (ANALYZE)"));
 
        /*
-        * By here, we know we are in a multi-table situation.  In order to avoid
-        * holding locks for too long, we want to process each table in its own
-        * transaction.  This forces us to disallow running inside a user
-        * transaction block.
+        * By here, we know we are in a multi-table situation.
+        *
+        * Concurrent processing is currently considered rather special (e.g. in
+        * terms of resources consumed) so it is not performed in bulk.
+        */
+       if (params.options & CLUOPT_CONCURRENT)
+       {
+               if (rel != NULL)
+               {
+                       Assert(rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE);
+                       ereport(ERROR,
+                                       errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                                       errmsg("REPACK (CONCURRENTLY) is not supported for partitioned tables"),
+                                       errhint("Consider running the command on individual partitions."));
+               }
+               else
+                       ereport(ERROR,
+                                       errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                                       errmsg("REPACK (CONCURRENTLY) requires an explicit table name"));
+       }
+
+       /*
+        * In order to avoid holding locks for too long, we want to process each
+        * table in its own transaction.  This forces us to disallow running
+        * inside a user transaction block.
         */
        PreventInTransactionBlock(isTopLevel, RepackCommandAsString(stmt->command));
 
@@ -168,6 +342,12 @@ ExecRepack(ParseState *pstate, RepackStmt *stmt, bool isTopLevel)
                                                                                   "Repack",
                                                                                   ALLOCSET_DEFAULT_SIZES);
 
+       /*
+        * Since we open a new transaction for each relation, we have to check
+        * that the relation still is what we think it is.
+        *
+        * In single-transaction CLUSTER, we don't need the overhead.
+        */
        params.options |= CLUOPT_RECHECK;
 
        /*
@@ -253,7 +433,7 @@ ExecRepack(ParseState *pstate, RepackStmt *stmt, bool isTopLevel)
                 * Open the target table, coping with the case where it has been
                 * dropped.
                 */
-               rel = try_table_open(rtc->tableOid, AccessExclusiveLock);
+               rel = try_table_open(rtc->tableOid, lockmode);
                if (rel == NULL)
                {
                        CommitTransactionCommand();
@@ -264,7 +444,7 @@ ExecRepack(ParseState *pstate, RepackStmt *stmt, bool isTopLevel)
                PushActiveSnapshot(GetTransactionSnapshot());
 
                /* Process this table */
-               cluster_rel(stmt->command, rel, rtc->indexOid, &params);
+               cluster_rel(stmt->command, rel, rtc->indexOid, &params, isTopLevel);
                /* cluster_rel closes the relation, but keeps lock */
 
                PopActiveSnapshot();
@@ -278,6 +458,22 @@ ExecRepack(ParseState *pstate, RepackStmt *stmt, bool isTopLevel)
        MemoryContextDelete(repack_context);
 }
 
+/*
+ * In the non-concurrent case, we obtain AccessExclusiveLock throughout the
+ * operation to avoid any lock-upgrade hazards.  In the concurrent case, we
+ * grab ShareUpdateExclusiveLock (just like VACUUM) for most of the
+ * processing and only acquire AccessExclusiveLock at the end, to swap the
+ * relation -- supposedly for a short time.
+ */
+static LOCKMODE
+RepackLockLevel(bool concurrent)
+{
+       /* Share-update-exclusive (as in VACUUM) lets concurrent DML proceed */
+       if (concurrent)
+               return ShareUpdateExclusiveLock;
+       else
+               return AccessExclusiveLock;
+}
+
 /*
  * cluster_rel
  *
@@ -293,22 +489,39 @@ ExecRepack(ParseState *pstate, RepackStmt *stmt, bool isTopLevel)
  * If indexOid is InvalidOid, the table will be rewritten in physical order
  * instead of index order.
  *
+ * Note that, in the concurrent case, the function releases the lock at some
+ * point, in order to get AccessExclusiveLock for the final steps (i.e. to
+ * swap the relation files). To make things simpler, the caller should expect
+ * OldHeap to be closed on return, regardless of CLUOPT_CONCURRENT. (The
+ * AccessExclusiveLock is kept till the end of the transaction.)
+ *
  * 'cmd' indicates which command is being executed, to be used for error
  * messages.
  */
 void
 cluster_rel(RepackCommand cmd, Relation OldHeap, Oid indexOid,
-                       ClusterParams *params)
+                       ClusterParams *params, bool isTopLevel)
 {
        Oid                     tableOid = RelationGetRelid(OldHeap);
+       Relation        index;
+       LOCKMODE        lmode;
        Oid                     save_userid;
        int                     save_sec_context;
        int                     save_nestlevel;
        bool            verbose = ((params->options & CLUOPT_VERBOSE) != 0);
        bool            recheck = ((params->options & CLUOPT_RECHECK) != 0);
-       Relation        index;
+       bool            concurrent = ((params->options & CLUOPT_CONCURRENT) != 0);
+       Oid                     ident_idx = InvalidOid;
 
-       Assert(CheckRelationLockedByMe(OldHeap, AccessExclusiveLock, false));
+       /* Determine the lock mode to use. */
+       lmode = RepackLockLevel(concurrent);
+
+       /*
+        * Check some preconditions in the concurrent case.  This also obtains the
+        * replica index OID.
+        */
+       if (concurrent)
+               check_concurrent_repack_requirements(OldHeap, &ident_idx);
 
        /* Check for user-requested abort. */
        CHECK_FOR_INTERRUPTS();
@@ -328,16 +541,15 @@ cluster_rel(RepackCommand cmd, Relation OldHeap, Oid indexOid,
        RestrictSearchPath();
 
        /*
-        * Since we may open a new transaction for each relation, we have to check
-        * that the relation still is what we think it is.
+        * Recheck that the relation is still what it was when we started.
         *
-        * If this is a single-transaction CLUSTER, we can skip these tests. We
-        * *must* skip the one on indisclustered since it would reject an attempt
-        * to cluster a not-previously-clustered index.
+        * Note that it's critical to skip this in single-relation CLUSTER;
+        * otherwise, we would reject an attempt to cluster using a
+        * not-previously-clustered index.
         */
        if (recheck &&
                !cluster_rel_recheck(cmd, OldHeap, indexOid, save_userid,
-                                                        params->options))
+                                                        lmode, params->options))
                goto out;
 
        /*
@@ -353,6 +565,12 @@ cluster_rel(RepackCommand cmd, Relation OldHeap, Oid indexOid,
                                errmsg("cannot execute %s on a shared catalog",
                                           RepackCommandAsString(cmd)));
 
+       /*
+        * The CONCURRENTLY case should have been rejected earlier because it does
+        * not support system catalogs.
+        */
+       Assert(!(OldHeap->rd_rel->relisshared && concurrent));
+
        /*
         * Don't process temp tables of other backends ... their local buffer
         * manager is not going to cope.
@@ -374,7 +592,7 @@ cluster_rel(RepackCommand cmd, Relation OldHeap, Oid indexOid,
        if (OidIsValid(indexOid))
        {
                /* verify the index is good and lock it */
-               check_index_is_clusterable(OldHeap, indexOid, AccessExclusiveLock);
+               check_index_is_clusterable(OldHeap, indexOid, lmode);
                /* also open it */
                index = index_open(indexOid, NoLock);
        }
@@ -409,7 +627,9 @@ cluster_rel(RepackCommand cmd, Relation OldHeap, Oid indexOid,
        if (OldHeap->rd_rel->relkind == RELKIND_MATVIEW &&
                !RelationIsPopulated(OldHeap))
        {
-               relation_close(OldHeap, AccessExclusiveLock);
+               if (index)
+                       index_close(index, lmode);
+               relation_close(OldHeap, lmode);
                goto out;
        }
 
@@ -422,11 +642,34 @@ cluster_rel(RepackCommand cmd, Relation OldHeap, Oid indexOid,
         * invalid, because we move tuples around.  Promote them to relation
         * locks.  Predicate locks on indexes will be promoted when they are
         * reindexed.
+        *
+        * During concurrent processing, the heap as well as its indexes stay in
+        * operation, so we postpone this step until they are locked using
+        * AccessExclusiveLock near the end of the processing.
         */
-       TransferPredicateLocksToHeapRelation(OldHeap);
+       if (!concurrent)
+               TransferPredicateLocksToHeapRelation(OldHeap);
 
        /* rebuild_relation does all the dirty work */
-       rebuild_relation(OldHeap, index, verbose);
+       PG_TRY();
+       {
+               rebuild_relation(OldHeap, index, verbose, ident_idx);
+       }
+       PG_FINALLY();
+       {
+               if (concurrent)
+               {
+                       /*
+                        * Since during normal operation the worker was already asked to
+                        * exit, stopping it explicitly is especially important on ERROR.
+                        * However it still seems a good practice to make sure that the
+                        * worker never survives the REPACK command.
+                        */
+                       stop_repack_decoding_worker();
+               }
+       }
+       PG_END_TRY();
+
        /* rebuild_relation closes OldHeap, and index if valid */
 
 out:
@@ -445,14 +688,14 @@ out:
  */
 static bool
 cluster_rel_recheck(RepackCommand cmd, Relation OldHeap, Oid indexOid,
-                                       Oid userid, int options)
+                                       Oid userid, LOCKMODE lmode, int options)
 {
        Oid                     tableOid = RelationGetRelid(OldHeap);
 
        /* Check that the user still has privileges for the relation */
        if (!repack_is_permitted_for_relation(cmd, tableOid, userid))
        {
-               relation_close(OldHeap, AccessExclusiveLock);
+               relation_close(OldHeap, lmode);
                return false;
        }
 
@@ -466,7 +709,7 @@ cluster_rel_recheck(RepackCommand cmd, Relation OldHeap, Oid indexOid,
         */
        if (RELATION_IS_OTHER_TEMP(OldHeap))
        {
-               relation_close(OldHeap, AccessExclusiveLock);
+               relation_close(OldHeap, lmode);
                return false;
        }
 
@@ -477,7 +720,7 @@ cluster_rel_recheck(RepackCommand cmd, Relation OldHeap, Oid indexOid,
                 */
                if (!SearchSysCacheExists1(RELOID, ObjectIdGetDatum(indexOid)))
                {
-                       relation_close(OldHeap, AccessExclusiveLock);
+                       relation_close(OldHeap, lmode);
                        return false;
                }
 
@@ -488,7 +731,7 @@ cluster_rel_recheck(RepackCommand cmd, Relation OldHeap, Oid indexOid,
                if ((options & CLUOPT_RECHECK_ISCLUSTERED) != 0 &&
                        !get_index_isclustered(indexOid))
                {
-                       relation_close(OldHeap, AccessExclusiveLock);
+                       relation_close(OldHeap, lmode);
                        return false;
                }
        }
@@ -500,7 +743,7 @@ cluster_rel_recheck(RepackCommand cmd, Relation OldHeap, Oid indexOid,
  * Verify that the specified heap and index are valid to cluster on
  *
  * Side effect: obtains lock on the index.  The caller may
- * in some cases already have AccessExclusiveLock on the table, but
+ * in some cases already have a lock of the same strength on the table, but
  * not in all cases so we can't rely on the table-level lock for
  * protection here.
  */
@@ -625,18 +868,98 @@ mark_index_clustered(Relation rel, Oid indexOid, bool is_internal)
        table_close(pg_index, RowExclusiveLock);
 }
 
+/*
+ * Check if the CONCURRENTLY option is legal for the relation.
+ *
+ * *ident_idx_p receives the OID of the identity index.
+ */
+static void
+check_concurrent_repack_requirements(Relation rel, Oid *ident_idx_p)
+{
+       char            relpersistence,
+                               replident;
+       Oid                     ident_idx;
+
+       /* Data changes in system relations are not logically decoded. */
+       if (IsCatalogRelation(rel))
+               ereport(ERROR,
+                               errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                               errmsg("cannot repack relation \"%s\"",
+                                          RelationGetRelationName(rel)),
+                               errhint("REPACK CONCURRENTLY is not supported for catalog relations."));
+
+       /*
+        * reorderbuffer.c does not seem to handle processing of TOAST relation
+        * alone.
+        */
+       if (IsToastRelation(rel))
+               ereport(ERROR,
+                               errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                               errmsg("cannot repack relation \"%s\"",
+                                          RelationGetRelationName(rel)),
+                               errhint("REPACK CONCURRENTLY is not supported for TOAST relations"));
+
+       relpersistence = rel->rd_rel->relpersistence;
+       if (relpersistence != RELPERSISTENCE_PERMANENT)
+               ereport(ERROR,
+                               errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                               errmsg("cannot repack relation \"%s\"",
+                                          RelationGetRelationName(rel)),
+                               errhint("REPACK CONCURRENTLY is only allowed for permanent relations."));
+
+       /* With NOTHING, WAL does not contain the old tuple. */
+       replident = rel->rd_rel->relreplident;
+       if (replident == REPLICA_IDENTITY_NOTHING)
+               ereport(ERROR,
+                               errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                               errmsg("cannot repack relation \"%s\"",
+                                          RelationGetRelationName(rel)),
+                               errhint("Relation \"%s\" has insufficient replication identity.",
+                                               RelationGetRelationName(rel)));
+
+       /*
+        * Obtain the replica identity index -- either one that has been set
+        * explicitly, or the primary key.  If none of these cases apply, the
+        * table cannot be repacked concurrently.  It might be possible to have
+        * repack work with a FULL replica identity; however that requires more
+        * work and is not implemented yet.
+        */
+       ident_idx = RelationGetReplicaIndex(rel);
+       if (!OidIsValid(ident_idx) && OidIsValid(rel->rd_pkindex))
+               ident_idx = rel->rd_pkindex;
+       if (!OidIsValid(ident_idx))
+               ereport(ERROR,
+                               errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                               errmsg("cannot process relation \"%s\"",
+                                          RelationGetRelationName(rel)),
+                               errhint("Relation \"%s\" has no identity index.",
+                                               RelationGetRelationName(rel)));
+
+       *ident_idx_p = ident_idx;
+}
+
+
 /*
  * rebuild_relation: rebuild an existing relation in index or physical order
  *
- * OldHeap: table to rebuild.
+ * OldHeap: table to rebuild.  See cluster_rel() for comments on the required
+ * lock strength.
+ *
  * index: index to cluster by, or NULL to rewrite in physical order.
  *
- * On entry, heap and index (if one is given) must be open, and
- * AccessExclusiveLock held on them.
- * On exit, they are closed, but locks on them are not released.
+ * ident_idx: identity index, to handle replaying of concurrent data changes
+ * to the new heap. InvalidOid if there's no CONCURRENTLY option.
+ *
+ * On entry, heap and index (if one is given) must be open, and the
+ * appropriate lock held on them -- AccessExclusiveLock for exclusive
+ * processing and ShareUpdateExclusiveLock for concurrent processing.
+ *
+ * On exit, they are closed, but still locked with AccessExclusiveLock.
+ * (The function handles the lock upgrade if 'concurrent' is true.)
  */
 static void
-rebuild_relation(Relation OldHeap, Relation index, bool verbose)
+rebuild_relation(Relation OldHeap, Relation index, bool verbose,
+                                Oid ident_idx)
 {
        Oid                     tableOid = RelationGetRelid(OldHeap);
        Oid                     accessMethod = OldHeap->rd_rel->relam;
@@ -644,13 +967,55 @@ rebuild_relation(Relation OldHeap, Relation index, bool verbose)
        Oid                     OIDNewHeap;
        Relation        NewHeap;
        char            relpersistence;
-       bool            is_system_catalog;
        bool            swap_toast_by_content;
        TransactionId frozenXid;
        MultiXactId cutoffMulti;
+       bool            concurrent = OidIsValid(ident_idx);
+       Snapshot        snapshot = NULL;
+#if USE_ASSERT_CHECKING
+       LOCKMODE        lmode;
+
+       lmode = RepackLockLevel(concurrent);
+
+       Assert(CheckRelationLockedByMe(OldHeap, lmode, false));
+       Assert(index == NULL || CheckRelationLockedByMe(index, lmode, false));
+#endif
+
+       if (concurrent)
+       {
+               /*
+                * The worker needs to be member of the locking group we're the leader
+                * of. We ought to become the leader before the worker starts. The
+                * worker will join the group as soon as it starts.
+                *
+                * This is to make sure that the deadlock described below is
+                * detectable by deadlock.c: if the worker waits for a transaction to
+                * complete and we are waiting for the worker output, then effectively
+                * we (i.e. this backend) are waiting for that transaction.
+                */
+               BecomeLockGroupLeader();
+
+               /*
+                * Start the worker that decodes data changes applied while we're
+                * copying the table contents.
+                *
+                * Note that the worker has to wait for all transactions with XID
+                * already assigned to finish. If any of those transactions is
+                * waiting for a lock conflicting with ShareUpdateExclusiveLock on our
+                * table (e.g. it runs CREATE INDEX), we can end up in a deadlock.
+                * Not sure this risk is worth unlocking/locking the table (and its
+                * clustering index) and checking again if it's still eligible for
+                * REPACK CONCURRENTLY.
+                */
+               start_repack_decoding_worker(tableOid);
+
+               /*
+                * Wait until the worker has the initial snapshot and retrieve it.
+                */
+               snapshot = get_initial_snapshot(decoding_worker);
 
-       Assert(CheckRelationLockedByMe(OldHeap, AccessExclusiveLock, false) &&
-                  (index == NULL || CheckRelationLockedByMe(index, AccessExclusiveLock, false)));
+               PushActiveSnapshot(snapshot);
+       }
 
        /* for CLUSTER or REPACK USING INDEX, mark the index as the one to use */
        if (index != NULL)
@@ -658,7 +1023,6 @@ rebuild_relation(Relation OldHeap, Relation index, bool verbose)
 
        /* Remember info about rel before closing OldHeap */
        relpersistence = OldHeap->rd_rel->relpersistence;
-       is_system_catalog = IsSystemRelation(OldHeap);
 
        /*
         * Create the transient table that will receive the re-ordered data.
@@ -674,30 +1038,59 @@ rebuild_relation(Relation OldHeap, Relation index, bool verbose)
        NewHeap = table_open(OIDNewHeap, NoLock);
 
        /* Copy the heap data into the new table in the desired order */
-       copy_table_data(NewHeap, OldHeap, index, verbose,
+       copy_table_data(NewHeap, OldHeap, index, snapshot, verbose,
                                        &swap_toast_by_content, &frozenXid, &cutoffMulti);
 
+       /* The historic snapshot won't be needed anymore. */
+       if (snapshot)
+       {
+               PopActiveSnapshot();
+               UpdateActiveSnapshotCommandId();
+       }
 
-       /* Close relcache entries, but keep lock until transaction commit */
-       table_close(OldHeap, NoLock);
-       if (index)
-               index_close(index, NoLock);
+       if (concurrent)
+       {
+               Assert(!swap_toast_by_content);
 
-       /*
-        * Close the new relation so it can be dropped as soon as the storage is
-        * swapped. The relation is not visible to others, so no need to unlock it
-        * explicitly.
-        */
-       table_close(NewHeap, NoLock);
+               /*
+                * Close the index, but keep the lock. Both heaps will be closed by
+                * the following call.
+                */
+               if (index)
+                       index_close(index, NoLock);
 
-       /*
-        * Swap the physical files of the target and transient tables, then
-        * rebuild the target's indexes and throw away the transient table.
-        */
-       finish_heap_swap(tableOid, OIDNewHeap, is_system_catalog,
-                                        swap_toast_by_content, false, true,
-                                        frozenXid, cutoffMulti,
-                                        relpersistence);
+               rebuild_relation_finish_concurrent(NewHeap, OldHeap, ident_idx,
+                                                                                  frozenXid, cutoffMulti);
+
+               pgstat_progress_update_param(PROGRESS_REPACK_PHASE,
+                                                                        PROGRESS_REPACK_PHASE_FINAL_CLEANUP);
+       }
+       else
+       {
+               bool            is_system_catalog = IsSystemRelation(OldHeap);
+
+               /* Close relcache entries, but keep lock until transaction commit */
+               table_close(OldHeap, NoLock);
+               if (index)
+                       index_close(index, NoLock);
+
+               /*
+                * Close the new relation so it can be dropped as soon as the storage
+                * is swapped. The relation is not visible to others, so no need to
+                * unlock it explicitly.
+                */
+               table_close(NewHeap, NoLock);
+
+               /*
+                * Swap the physical files of the target and transient tables, then
+                * rebuild the target's indexes and throw away the transient table.
+                */
+               finish_heap_swap(tableOid, OIDNewHeap, is_system_catalog,
+                                                swap_toast_by_content, false, true,
+                                                true,  /* reindex */
+                                                frozenXid, cutoffMulti,
+                                                relpersistence);
+       }
 }
 
 
@@ -832,15 +1225,18 @@ make_new_heap(Oid OIDOldHeap, Oid NewTableSpace, Oid NewAccessMethod,
 /*
  * Do the physical copying of table data.
  *
+ * 'snapshot': see table_relation_copy_for_cluster().  Pass a snapshot iff
+ * concurrent processing is required; pass NULL otherwise.
+ *
  * There are three output parameters:
  * *pSwapToastByContent is set true if toast tables must be swapped by content.
  * *pFreezeXid receives the TransactionId used as freeze cutoff point.
  * *pCutoffMulti receives the MultiXactId used as a cutoff point.
  */
 static void
-copy_table_data(Relation NewHeap, Relation OldHeap, Relation OldIndex, bool verbose,
-                               bool *pSwapToastByContent, TransactionId *pFreezeXid,
-                               MultiXactId *pCutoffMulti)
+copy_table_data(Relation NewHeap, Relation OldHeap, Relation OldIndex,
+                               Snapshot snapshot, bool verbose, bool *pSwapToastByContent,
+                               TransactionId *pFreezeXid, MultiXactId *pCutoffMulti)
 {
        Relation        relRelation;
        HeapTuple       reltup;
@@ -857,6 +1253,10 @@ copy_table_data(Relation NewHeap, Relation OldHeap, Relation OldIndex, bool verb
        int                     elevel = verbose ? INFO : DEBUG2;
        PGRUsage        ru0;
        char       *nspname;
+       bool            concurrent = snapshot != NULL;
+       LOCKMODE        lmode;
+
+       lmode = RepackLockLevel(concurrent);
 
        pg_rusage_init(&ru0);
 
@@ -885,7 +1285,7 @@ copy_table_data(Relation NewHeap, Relation OldHeap, Relation OldIndex, bool verb
         * will be held till end of transaction.
         */
        if (OldHeap->rd_rel->reltoastrelid)
-               LockRelationOid(OldHeap->rd_rel->reltoastrelid, AccessExclusiveLock);
+               LockRelationOid(OldHeap->rd_rel->reltoastrelid, lmode);
 
        /*
         * If both tables have TOAST tables, perform toast swap by content.  It is
@@ -894,7 +1294,8 @@ copy_table_data(Relation NewHeap, Relation OldHeap, Relation OldIndex, bool verb
         * swap by links.  This is okay because swap by content is only essential
         * for system catalogs, and we don't support schema changes for them.
         */
-       if (OldHeap->rd_rel->reltoastrelid && NewHeap->rd_rel->reltoastrelid)
+       if (OldHeap->rd_rel->reltoastrelid && NewHeap->rd_rel->reltoastrelid &&
+               !concurrent)
        {
                *pSwapToastByContent = true;
 
@@ -915,6 +1316,10 @@ copy_table_data(Relation NewHeap, Relation OldHeap, Relation OldIndex, bool verb
                 * follow the toast pointers to the wrong place.  (It would actually
                 * work for values copied over from the old toast table, but not for
                 * any values that we toast which were previously not toasted.)
+                *
+                * This would not work with CONCURRENTLY because we may need to delete
+                * TOASTed tuples from the new heap. With this hack, we'd delete them
+                * from the old heap.
                 */
                NewHeap->rd_toastoid = OldHeap->rd_rel->reltoastrelid;
        }
@@ -990,7 +1395,8 @@ copy_table_data(Relation NewHeap, Relation OldHeap, Relation OldIndex, bool verb
         * values (e.g. because the AM doesn't use freezing).
         */
        table_relation_copy_for_cluster(OldHeap, NewHeap, OldIndex, use_sort,
-                                                                       cutoffs.OldestXmin, &cutoffs.FreezeLimit,
+                                                                       cutoffs.OldestXmin, snapshot,
+                                                                       &cutoffs.FreezeLimit,
                                                                        &cutoffs.MultiXactCutoff,
                                                                        &num_tuples, &tups_vacuumed,
                                                                        &tups_recently_dead);
@@ -999,7 +1405,11 @@ copy_table_data(Relation NewHeap, Relation OldHeap, Relation OldIndex, bool verb
        *pFreezeXid = cutoffs.FreezeLimit;
        *pCutoffMulti = cutoffs.MultiXactCutoff;
 
-       /* Reset rd_toastoid just to be tidy --- it shouldn't be looked at again */
+       /*
+        * Reset rd_toastoid just to be tidy --- it shouldn't be looked at again.
+        * In the CONCURRENTLY case, we need to set it again before applying the
+        * concurrent changes.
+        */
        NewHeap->rd_toastoid = InvalidOid;
 
        num_pages = RelationGetNumberOfBlocks(NewHeap);
@@ -1457,14 +1867,13 @@ finish_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap,
                                 bool swap_toast_by_content,
                                 bool check_constraints,
                                 bool is_internal,
+                                bool reindex,
                                 TransactionId frozenXid,
                                 MultiXactId cutoffMulti,
                                 char newrelpersistence)
 {
        ObjectAddress object;
        Oid                     mapped_tables[4];
-       int                     reindex_flags;
-       ReindexParams reindex_params = {0};
        int                     i;
 
        /* Report that we are now swapping relation files */
@@ -1490,39 +1899,47 @@ finish_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap,
        if (is_system_catalog)
                CacheInvalidateCatalog(OIDOldHeap);
 
-       /*
-        * Rebuild each index on the relation (but not the toast table, which is
-        * all-new at this point).  It is important to do this before the DROP
-        * step because if we are processing a system catalog that will be used
-        * during DROP, we want to have its indexes available.  There is no
-        * advantage to the other order anyway because this is all transactional,
-        * so no chance to reclaim disk space before commit.  We do not need a
-        * final CommandCounterIncrement() because reindex_relation does it.
-        *
-        * Note: because index_build is called via reindex_relation, it will never
-        * set indcheckxmin true for the indexes.  This is OK even though in some
-        * sense we are building new indexes rather than rebuilding existing ones,
-        * because the new heap won't contain any HOT chains at all, let alone
-        * broken ones, so it can't be necessary to set indcheckxmin.
-        */
-       reindex_flags = REINDEX_REL_SUPPRESS_INDEX_USE;
-       if (check_constraints)
-               reindex_flags |= REINDEX_REL_CHECK_CONSTRAINTS;
+       if (reindex)
+       {
+               int                     reindex_flags;
+               ReindexParams reindex_params = {0};
 
-       /*
-        * Ensure that the indexes have the same persistence as the parent
-        * relation.
-        */
-       if (newrelpersistence == RELPERSISTENCE_UNLOGGED)
-               reindex_flags |= REINDEX_REL_FORCE_INDEXES_UNLOGGED;
-       else if (newrelpersistence == RELPERSISTENCE_PERMANENT)
-               reindex_flags |= REINDEX_REL_FORCE_INDEXES_PERMANENT;
+               /*
+                * Rebuild each index on the relation (but not the toast table, which
+                * is all-new at this point).  It is important to do this before the
+                * DROP step because if we are processing a system catalog that will
+                * be used during DROP, we want to have its indexes available.  There
+                * is no advantage to the other order anyway because this is all
+                * transactional, so no chance to reclaim disk space before commit. We
+                * do not need a final CommandCounterIncrement() because
+                * reindex_relation does it.
+                *
+                * Note: because index_build is called via reindex_relation, it will
+                * never set indcheckxmin true for the indexes.  This is OK even
+                * though in some sense we are building new indexes rather than
+                * rebuilding existing ones, because the new heap won't contain any
+                * HOT chains at all, let alone broken ones, so it can't be necessary
+                * to set indcheckxmin.
+                */
+               reindex_flags = REINDEX_REL_SUPPRESS_INDEX_USE;
+               if (check_constraints)
+                       reindex_flags |= REINDEX_REL_CHECK_CONSTRAINTS;
 
-       /* Report that we are now reindexing relations */
-       pgstat_progress_update_param(PROGRESS_REPACK_PHASE,
-                                                                PROGRESS_REPACK_PHASE_REBUILD_INDEX);
+               /*
+                * Ensure that the indexes have the same persistence as the parent
+                * relation.
+                */
+               if (newrelpersistence == RELPERSISTENCE_UNLOGGED)
+                       reindex_flags |= REINDEX_REL_FORCE_INDEXES_UNLOGGED;
+               else if (newrelpersistence == RELPERSISTENCE_PERMANENT)
+                       reindex_flags |= REINDEX_REL_FORCE_INDEXES_PERMANENT;
 
-       reindex_relation(NULL, OIDOldHeap, reindex_flags, &reindex_params);
+               /* Report that we are now reindexing relations */
+               pgstat_progress_update_param(PROGRESS_REPACK_PHASE,
+                                                                        PROGRESS_REPACK_PHASE_REBUILD_INDEX);
+
+               reindex_relation(NULL, OIDOldHeap, reindex_flags, &reindex_params);
+       }
 
        /* Report that we are now doing clean up */
        pgstat_progress_update_param(PROGRESS_REPACK_PHASE,
@@ -1566,6 +1983,17 @@ finish_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap,
        object.objectId = OIDNewHeap;
        object.objectSubId = 0;
 
+       if (!reindex)
+       {
+               /*
+                * Make sure the changes in pg_class are visible. This is especially
+                * important if !swap_toast_by_content, so that the correct TOAST
+                * relation is dropped.  (reindex_relation() above did not help in
+                * this case.)
+                */
+               CommandCounterIncrement();
+       }
+
        /*
         * The new relation is local to our transaction and we know nothing
         * depends on it, so DROP_RESTRICT should be OK.
@@ -1605,7 +2033,7 @@ finish_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap,
 
                        /* Get the associated valid index to be renamed */
                        toastidx = toast_get_valid_index(newrel->rd_rel->reltoastrelid,
-                                                                                        NoLock);
+                                                                                        AccessExclusiveLock);
 
                        /* rename the toast table ... */
                        snprintf(NewToastName, NAMEDATALEN, "pg_toast_%u",
@@ -1876,7 +2304,8 @@ repack_is_permitted_for_relation(RepackCommand cmd, Oid relid, Oid userid)
  * case, if an index name is given, it's up to the caller to resolve it.
  */
 static Relation
-process_single_relation(RepackStmt *stmt, ClusterParams *params)
+process_single_relation(RepackStmt *stmt, LOCKMODE lockmode, bool isTopLevel,
+                                               ClusterParams *params)
 {
        Relation        rel;
        Oid                     tableOid;
@@ -1893,13 +2322,9 @@ process_single_relation(RepackStmt *stmt, ClusterParams *params)
                                errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                                errmsg("ANALYZE option must be specified when a column list is provided"));
 
-       /*
-        * Find, lock, and check permissions on the table.  We obtain
-        * AccessExclusiveLock right away to avoid lock-upgrade hazard in the
-        * single-transaction case.
-        */
+       /* Find, lock, and check permissions on the table. */
        tableOid = RangeVarGetRelidExtended(stmt->relation->relation,
-                                                                               AccessExclusiveLock,
+                                                                               lockmode,
                                                                                0,
                                                                                RangeVarCallbackMaintainsTable,
                                                                                NULL);
@@ -1924,13 +2349,14 @@ process_single_relation(RepackStmt *stmt, ClusterParams *params)
                return rel;
        else
        {
-               Oid                     indexOid;
+               Oid                     indexOid = InvalidOid;
 
                indexOid = determine_clustered_index(rel, stmt->usingindex,
                                                                                         stmt->indexname);
                if (OidIsValid(indexOid))
-                       check_index_is_clusterable(rel, indexOid, AccessExclusiveLock);
-               cluster_rel(stmt->command, rel, indexOid, params);
+                       check_index_is_clusterable(rel, indexOid, lockmode);
+
+               cluster_rel(stmt->command, rel, indexOid, params, isTopLevel);
 
                /*
                 * Do an analyze, if requested.  We close the transaction and start a
@@ -2025,3 +2451,1196 @@ RepackCommandAsString(RepackCommand cmd)
        }
        return "???";                           /* keep compiler quiet */
 }
+
+/*
+ * Apply all the changes stored in 'file'.
+ *
+ * 'file' is a spill file produced by the decoding worker; 'chgcxt' describes
+ * the new copy of the table (relation, executor state, identity-index scan
+ * key) that the changes are applied to.
+ */
+static void
+apply_concurrent_changes(BufFile *file, ChangeContext *chgcxt)
+{
+       ConcurrentChangeKind kind = '\0';       /* means "no change read yet" */
+       Relation        rel = chgcxt->cc_rel;
+       TupleTableSlot *spilled_tuple;
+       TupleTableSlot *old_update_tuple;
+       TupleTableSlot *ondisk_tuple;
+       bool            have_old_tuple = false;
+       MemoryContext oldcxt;
+
+       /*
+        * Three slots: the change read from the file, the matching tuple found
+        * on disk, and the OLD tuple of an UPDATE awaiting its NEW tuple.
+        */
+       spilled_tuple = MakeSingleTupleTableSlot(RelationGetDescr(rel),
+                                                                                        &TTSOpsVirtual);
+       ondisk_tuple = MakeSingleTupleTableSlot(RelationGetDescr(rel),
+                                                                                       table_slot_callbacks(rel));
+       old_update_tuple = MakeSingleTupleTableSlot(RelationGetDescr(rel),
+                                                                                               &TTSOpsVirtual);
+
+       /* Per-tuple memory is reset at the bottom of each loop iteration. */
+       oldcxt = MemoryContextSwitchTo(GetPerTupleMemoryContext(chgcxt->cc_estate));
+
+       while (true)
+       {
+               size_t          nread;
+               ConcurrentChangeKind prevkind = kind;
+
+               CHECK_FOR_INTERRUPTS();
+
+               nread = BufFileReadMaybeEOF(file, &kind, 1, true);
+               if (nread == 0)                 /* done with the file? */
+                       break;
+
+               /*
+                * If this is the old tuple for an update, read it into the tuple slot
+                * and go to the next one.  The update itself will be executed on the
+                * next iteration, when we receive the NEW tuple.
+                */
+               if (kind == CHANGE_UPDATE_OLD)
+               {
+                       restore_tuple(file, rel, old_update_tuple);
+                       have_old_tuple = true;
+                       continue;
+               }
+
+               /*
+                * Just before an UPDATE or DELETE, we must update the command
+                * counter, because the change could refer to a tuple that we have
+                * just inserted; and before an INSERT, we have to do this also if the
+                * previous command was either update or delete.
+                *
+                * With this approach we don't spend so many CCIs for long strings of
+                * only INSERTs, which can't affect one another.
+                */
+               if (kind == CHANGE_UPDATE_NEW || kind == CHANGE_DELETE ||
+                       (kind == CHANGE_INSERT && (prevkind == CHANGE_UPDATE_NEW ||
+                                                                          prevkind == CHANGE_DELETE)))
+               {
+                       CommandCounterIncrement();
+                       UpdateActiveSnapshotCommandId();
+               }
+
+               /*
+                * Now restore the tuple into the slot and execute the change.
+                */
+               restore_tuple(file, rel, spilled_tuple);
+
+               if (kind == CHANGE_INSERT)
+               {
+                       apply_concurrent_insert(rel, spilled_tuple, chgcxt);
+               }
+               else if (kind == CHANGE_DELETE)
+               {
+                       bool            found;
+
+                       /* Find the tuple to be deleted */
+                       found = find_target_tuple(rel, chgcxt, spilled_tuple, ondisk_tuple);
+                       if (!found)
+                               elog(ERROR, "failed to find target tuple");
+                       apply_concurrent_delete(rel, ondisk_tuple);
+               }
+               else if (kind == CHANGE_UPDATE_NEW)
+               {
+                       TupleTableSlot *key;
+                       bool            found;
+
+                       /* Prefer the OLD tuple as locator if the decoder sent one. */
+                       if (have_old_tuple)
+                               key = old_update_tuple;
+                       else
+                               key = spilled_tuple;
+
+                       /* Find the tuple to be updated or deleted. */
+                       found = find_target_tuple(rel, chgcxt, key, ondisk_tuple);
+                       if (!found)
+                               elog(ERROR, "failed to find target tuple");
+
+                       /*
+                        * If 'tup' contains TOAST pointers, they point to the old
+                        * relation's toast. Copy the corresponding TOAST pointers for the
+                        * new relation from the existing tuple. (The fact that we
+                        * received a TOAST pointer here implies that the attribute hasn't
+                        * changed.)
+                        */
+                       adjust_toast_pointers(rel, spilled_tuple, ondisk_tuple);
+
+                       apply_concurrent_update(rel, spilled_tuple, ondisk_tuple, chgcxt);
+
+                       ExecClearTuple(old_update_tuple);
+                       have_old_tuple = false;
+               }
+               else
+                       elog(ERROR, "unrecognized kind of change: %d", kind);
+
+               ResetPerTupleExprContext(chgcxt->cc_estate);
+       }
+
+       /* Cleanup. */
+       ExecDropSingleTupleTableSlot(spilled_tuple);
+       ExecDropSingleTupleTableSlot(ondisk_tuple);
+       ExecDropSingleTupleTableSlot(old_update_tuple);
+
+       MemoryContextSwitchTo(oldcxt);
+}
+
+/*
+ * Apply an insert from the spill of concurrent changes to the new copy of the
+ * table.
+ *
+ * 'slot' holds the tuple to insert; 'chgcxt' supplies the executor state and
+ * ResultRelInfo needed to maintain the new table's indexes.
+ */
+static void
+apply_concurrent_insert(Relation rel, TupleTableSlot *slot,
+                                               ChangeContext *chgcxt)
+{
+       /* Put the tuple in the table, but make sure it won't be decoded */
+       table_tuple_insert(rel, slot, GetCurrentCommandId(true),
+                                          TABLE_INSERT_NO_LOGICAL, NULL);
+
+       /* Update indexes with this new tuple. */
+       ExecInsertIndexTuples(chgcxt->cc_rri,
+                                                 chgcxt->cc_estate,
+                                                 0,
+                                                 slot,
+                                                 NIL, NULL);
+       pgstat_progress_incr_param(PROGRESS_REPACK_HEAP_TUPLES_INSERTED, 1);
+}
+
+/*
+ * Apply an update from the spill of concurrent changes to the new copy of the
+ * table.
+ *
+ * 'spilled_tuple' is the new row version, read from the spill file;
+ * 'ondisk_tuple' is the existing tuple in the new heap (located via the
+ * identity index), whose TID identifies the tuple to update.
+ */
+static void
+apply_concurrent_update(Relation rel, TupleTableSlot *spilled_tuple,
+                                               TupleTableSlot *ondisk_tuple,
+                                               ChangeContext *chgcxt)
+{
+       LockTupleMode lockmode;
+       TM_FailureData tmfd;
+       TU_UpdateIndexes update_indexes;
+       TM_Result       res;
+
+       /*
+        * Carry out the update, skipping logical decoding for it.
+        *
+        * NOTE(review): the InvalidSnapshot arguments presumably disable the
+        * visibility and crosscheck snapshots -- confirm against the
+        * table_tuple_update() signature in this tree.
+        */
+       res = table_tuple_update(rel, &(ondisk_tuple->tts_tid), spilled_tuple,
+                                                        GetCurrentCommandId(true),
+                                                        TABLE_UPDATE_NO_LOGICAL,
+                                                        InvalidSnapshot,
+                                                        InvalidSnapshot,
+                                                        false,
+                                                        &tmfd, &lockmode, &update_indexes);
+       if (res != TM_Ok)
+               ereport(ERROR,
+                               errmsg("failed to apply concurrent UPDATE"));
+
+       /*
+        * Update the indexes unless the AM reported that none need it; for
+        * TU_Summarizing, restrict the work to summarizing indexes.
+        */
+       if (update_indexes != TU_None)
+       {
+               uint32          flags = EIIT_IS_UPDATE;
+
+               if (update_indexes == TU_Summarizing)
+                       flags |= EIIT_ONLY_SUMMARIZING;
+               ExecInsertIndexTuples(chgcxt->cc_rri,
+                                                         chgcxt->cc_estate,
+                                                         flags,
+                                                         spilled_tuple,
+                                                         NIL, NULL);
+       }
+
+       pgstat_progress_incr_param(PROGRESS_REPACK_HEAP_TUPLES_UPDATED, 1);
+}
+
+/*
+ * Apply a delete from the spill of concurrent changes to the new copy of the
+ * table.  'slot' must contain the on-disk tuple to remove.
+ */
+static void
+apply_concurrent_delete(Relation rel, TupleTableSlot *slot)
+{
+       TM_FailureData fail_data;
+       TM_Result       result;
+
+       /* Remove the tuple from the new heap; keep it out of logical decoding. */
+       result = table_tuple_delete(rel, &slot->tts_tid,
+                                                               GetCurrentCommandId(true),
+                                                               TABLE_DELETE_NO_LOGICAL,
+                                                               InvalidSnapshot, InvalidSnapshot,
+                                                               false,
+                                                               &fail_data);
+       if (result != TM_Ok)
+               ereport(ERROR,
+                               errmsg("failed to apply concurrent DELETE"));
+
+       pgstat_progress_incr_param(PROGRESS_REPACK_HEAP_TUPLES_DELETED, 1);
+}
+
+/*
+ * Read tuple from file and put it in the input slot.  All memory is allocated
+ * in the current memory context; caller is responsible for freeing it as
+ * appropriate.
+ *
+ * External attributes are stored in separate memory chunks, in order to avoid
+ * exceeding MaxAllocSize - that could happen if the individual attributes are
+ * smaller than MaxAllocSize but the whole tuple is bigger.
+ *
+ * The on-file layout must match repack_store_change(): tuple length, tuple
+ * data, the count of separately stored attributes, then one chunk per such
+ * attribute, in attribute-number order.
+ */
+static void
+restore_tuple(BufFile *file, Relation relation, TupleTableSlot *slot)
+{
+       uint32          t_len;
+       HeapTuple       tup;
+       int                     natt_ext;
+
+       /* Read the tuple. */
+       BufFileReadExact(file, &t_len, sizeof(t_len));
+       tup = (HeapTuple) palloc(HEAPTUPLESIZE + t_len);
+       tup->t_data = (HeapTupleHeader) ((char *) tup + HEAPTUPLESIZE);
+       BufFileReadExact(file, tup->t_data, t_len);
+       tup->t_len = t_len;
+       ItemPointerSetInvalid(&tup->t_self);
+       tup->t_tableOid = RelationGetRelid(relation);
+
+       /*
+        * Put the tuple we read in a slot. This deforms it, so that we can hack
+        * the external attributes in place.
+        */
+       ExecForceStoreHeapTuple(tup, slot, false);
+
+       /*
+        * Next, read any attributes we stored separately into the tts_values
+        * array elements expecting them, if any.  This matches
+        * repack_store_change.
+        */
+       BufFileReadExact(file, &natt_ext, sizeof(natt_ext));
+       if (natt_ext > 0)
+       {
+               TupleDesc       desc = slot->tts_tupleDescriptor;
+
+               for (int i = 0; i < desc->natts; i++)
+               {
+                       CompactAttribute *attr = TupleDescCompactAttr(desc, i);
+                       varlena    *varlen;
+                       union
+                       {
+                               alignas(int32) varlena hdr;
+                               char            data[sizeof(void *)];
+                       }                       chunk_header;
+                       void       *value;
+                       Size            varlensz;
+
+                       /* Only non-dropped varlena attributes can be external. */
+                       if (attr->attisdropped || attr->attlen != -1)
+                               continue;
+                       if (slot_attisnull(slot, i + 1))
+                               continue;
+
+                       /* Make sure tts_values[i] is valid before we read it. */
+                       slot_getsomeattrs(slot, i + 1);
+                       varlen = (varlena *) DatumGetPointer(slot->tts_values[i]);
+                       if (!VARATT_IS_EXTERNAL_INDIRECT(varlen))
+                               continue;
+
+                       /*
+                        * Validate the count before touching the file: if the slot
+                        * contained more external attributes than were stored, reading
+                        * a chunk here would consume unrelated file contents, and a
+                        * garbage size could trigger a huge allocation before the
+                        * corruption is reported.
+                        */
+                       if (natt_ext == 0)
+                               ereport(ERROR,
+                                               errcode(ERRCODE_DATA_CORRUPTED),
+                                               errmsg("insufficient number of attributes stored separately"));
+
+                       /* Read the chunk header first, to learn the full size. */
+                       BufFileReadExact(file, &chunk_header, VARHDRSZ);
+                       varlensz = VARSIZE_ANY(&chunk_header);
+
+                       value = palloc(varlensz);
+                       SET_VARSIZE(value, varlensz);
+                       BufFileReadExact(file, (char *) value + VARHDRSZ, varlensz - VARHDRSZ);
+
+                       slot->tts_values[i] = PointerGetDatum(value);
+                       natt_ext--;
+               }
+
+               /*
+                * If any chunks remain unconsumed, the file position would be out of
+                * sync for the next change, so complain now.
+                */
+               if (natt_ext != 0)
+                       ereport(ERROR,
+                                       errcode(ERRCODE_DATA_CORRUPTED),
+                                       errmsg("unexpected number of attributes stored separately"));
+       }
+}
+
+/*
+ * Adjust 'dest' replacing any EXTERNAL_ONDISK toast pointers with the
+ * corresponding ones from 'src'.
+ *
+ * Both slots are indexed with 'dest''s tuple descriptor, so they are assumed
+ * to share the same column layout.  Dropped, non-varlena, and NULL attributes
+ * are skipped.
+ */
+static void
+adjust_toast_pointers(Relation relation, TupleTableSlot *dest, TupleTableSlot *src)
+{
+       TupleDesc       desc = dest->tts_tupleDescriptor;
+
+       for (int i = 0; i < desc->natts; i++)
+       {
+               CompactAttribute *attr = TupleDescCompactAttr(desc, i);
+               varlena    *varlena_dst;
+
+               if (attr->attisdropped)
+                       continue;
+               if (attr->attlen != -1)
+                       continue;
+               if (slot_attisnull(dest, i + 1))
+                       continue;
+
+               slot_getsomeattrs(dest, i + 1);
+
+               varlena_dst = (varlena *) DatumGetPointer(dest->tts_values[i]);
+               if (!VARATT_IS_EXTERNAL_ONDISK(varlena_dst))
+                       continue;
+
+               /* Fetch the corresponding value from the existing tuple. */
+               slot_getsomeattrs(src, i + 1);
+
+               dest->tts_values[i] = src->tts_values[i];
+       }
+}
+
+/*
+ * Find the tuple to be updated or deleted by the given data change, whose
+ * tuple has already been loaded into locator.
+ *
+ * 'locator' is expected to be a virtual slot with fully populated tts_values
+ * (see restore_tuple), since its values are read directly below.
+ *
+ * If the tuple is found, put it in retrieved and return true.  If the tuple is
+ * not found, return false.
+ */
+static bool
+find_target_tuple(Relation rel, ChangeContext *chgcxt, TupleTableSlot *locator,
+                                 TupleTableSlot *retrieved)
+{
+       Form_pg_index idx = chgcxt->cc_ident_index->rd_index;
+       IndexScanDesc scan;
+       bool            retval;
+
+       /*
+        * The scan key was built once, in initialize_change_context(), with all
+        * fields initialized except sk_argument, so it does not have to be
+        * constructed multiple times.
+        *
+        * Use the incoming tuple to finalize the scan key.
+        */
+       for (int i = 0; i < chgcxt->cc_ident_key_nentries; i++)
+       {
+               ScanKey         entry = &chgcxt->cc_ident_key[i];
+               AttrNumber      attno = idx->indkey.values[i];
+
+               entry->sk_argument = locator->tts_values[attno - 1];
+               Assert(!locator->tts_isnull[attno - 1]);
+       }
+
+       /* XXX no instrumentation for now */
+       scan = index_beginscan(rel, chgcxt->cc_ident_index, GetActiveSnapshot(),
+                                                  NULL, chgcxt->cc_ident_key_nentries, 0, 0);
+       index_rescan(scan, chgcxt->cc_ident_key, chgcxt->cc_ident_key_nentries, NULL, 0);
+       retval = index_getnext_slot(scan, ForwardScanDirection, retrieved);
+       index_endscan(scan);
+
+       return retval;
+}
+
+/*
+ * Decode and apply concurrent changes, up to (and including) the record whose
+ * LSN is 'end_of_wal'.
+ *
+ * If 'done' is true, also tell the decoding worker that this is the last
+ * file we will request from it.
+ *
+ * XXX the names "process_concurrent_changes" and "apply_concurrent_changes"
+ * are far too similar to each other.
+ */
+static void
+process_concurrent_changes(XLogRecPtr end_of_wal, ChangeContext *chgcxt, bool done)
+{
+       DecodingWorkerShared *shared;
+       char            fname[MAXPGPATH];
+       BufFile    *file;
+
+       pgstat_progress_update_param(PROGRESS_REPACK_PHASE,
+                                                                PROGRESS_REPACK_PHASE_CATCH_UP);
+
+       /* Ask the worker for the file. */
+       shared = (DecodingWorkerShared *) dsm_segment_address(decoding_worker->seg);
+       SpinLockAcquire(&shared->mutex);
+       shared->lsn_upto = end_of_wal;
+       shared->done = done;
+       SpinLockRelease(&shared->mutex);
+
+       /*
+        * The worker needs to finish processing of the current WAL record. Even
+        * if it's idle, it'll need to close the output file. Thus we're likely to
+        * wait, so prepare for sleep.
+        */
+       ConditionVariablePrepareToSleep(&shared->cv);
+       for (;;)
+       {
+               int                     last_exported;
+
+               SpinLockAcquire(&shared->mutex);
+               last_exported = shared->last_exported;
+               SpinLockRelease(&shared->mutex);
+
+               /*
+                * Has the worker exported the file we are waiting for?
+                */
+               if (last_exported == chgcxt->cc_file_seq)
+                       break;
+
+               ConditionVariableSleep(&shared->cv, WAIT_EVENT_REPACK_WORKER_EXPORT);
+       }
+       ConditionVariableCancelSleep();
+
+       /*
+        * Open the file.
+        *
+        * NOTE(review): shared->relid and shared->sfs are read without the
+        * mutex here -- presumably fixed once the worker starts; confirm.
+        */
+       DecodingWorkerFileName(fname, shared->relid, chgcxt->cc_file_seq);
+       file = BufFileOpenFileSet(&shared->sfs.fs, fname, O_RDONLY, false);
+       apply_concurrent_changes(file, chgcxt);
+
+       BufFileClose(file);
+
+       /* Get ready for the next file. */
+       chgcxt->cc_file_seq++;
+}
+
+/*
+ * Initialize the ChangeContext struct for the given relation, with
+ * the given index as identity index.
+ *
+ * The caller is responsible for freeing the resources acquired here, via
+ * release_change_context().
+ */
+static void
+initialize_change_context(ChangeContext *chgcxt,
+                                                 Relation relation, Oid ident_index_id)
+{
+       chgcxt->cc_rel = relation;
+
+       /* Only initialize fields needed by ExecInsertIndexTuples(). */
+       chgcxt->cc_estate = CreateExecutorState();
+
+       chgcxt->cc_rri = (ResultRelInfo *) palloc(sizeof(ResultRelInfo));
+       InitResultRelInfo(chgcxt->cc_rri, relation, 0, 0, 0);
+       ExecOpenIndices(chgcxt->cc_rri, false);
+
+       /*
+        * The table's relcache entry already has the relcache entry for the
+        * identity index; find that.
+        */
+       chgcxt->cc_ident_index = NULL;
+       for (int i = 0; i < chgcxt->cc_rri->ri_NumIndices; i++)
+       {
+               Relation        ind_rel;
+
+               ind_rel = chgcxt->cc_rri->ri_IndexRelationDescs[i];
+               if (ind_rel->rd_id == ident_index_id)
+               {
+                       chgcxt->cc_ident_index = ind_rel;
+                       break;
+               }
+       }
+       if (chgcxt->cc_ident_index == NULL)
+               elog(ERROR, "failed to find identity index");
+
+       /* Set up for scanning said identity index */
+       {
+               Form_pg_index indexForm;
+
+               indexForm = chgcxt->cc_ident_index->rd_index;
+               chgcxt->cc_ident_key_nentries = indexForm->indnkeyatts;
+               chgcxt->cc_ident_key = (ScanKey) palloc_array(ScanKeyData, indexForm->indnkeyatts);
+               for (int i = 0; i < indexForm->indnkeyatts; i++)
+               {
+                       ScanKey         entry;
+                       Oid                     opfamily,
+                                               opcintype,
+                                               opno,
+                                               opcode;
+
+                       entry = &chgcxt->cc_ident_key[i];
+
+                       /* Look up the btree equality function for this key column. */
+                       opfamily = chgcxt->cc_ident_index->rd_opfamily[i];
+                       opcintype = chgcxt->cc_ident_index->rd_opcintype[i];
+                       opno = get_opfamily_member(opfamily, opcintype, opcintype,
+                                                                          BTEqualStrategyNumber);
+                       if (!OidIsValid(opno))
+                               elog(ERROR, "failed to find = operator for type %u", opcintype);
+                       opcode = get_opcode(opno);
+                       if (!OidIsValid(opcode))
+                               elog(ERROR, "failed to find = operator for operator %u", opno);
+
+                       /*
+                        * Initialize everything but argument; sk_argument is filled in
+                        * per change, in find_target_tuple().
+                        */
+                       ScanKeyInit(entry,
+                                               i + 1,
+                                               BTEqualStrategyNumber, opcode,
+                                               (Datum) NULL);
+                       entry->sk_collation = chgcxt->cc_ident_index->rd_indcollation[i];
+               }
+       }
+
+       /* Start with the first file after the worker's snapshot file. */
+       chgcxt->cc_file_seq = WORKER_FILE_SNAPSHOT + 1;
+}
+
+/*
+ * Free up resources taken by a ChangeContext.
+ *
+ * Note that cc_ident_index points into the index relations opened by
+ * ExecOpenIndices() (see initialize_change_context), so ExecCloseIndices()
+ * closes it as well.
+ */
+static void
+release_change_context(ChangeContext *chgcxt)
+{
+       ExecCloseIndices(chgcxt->cc_rri);
+       FreeExecutorState(chgcxt->cc_estate);
+       /* XXX are these pfrees necessary? */
+       pfree(chgcxt->cc_rri);
+       pfree(chgcxt->cc_ident_key);
+}
+
+/*
+ * The final steps of rebuild_relation() for concurrent processing.
+ *
+ * On entry, NewHeap is locked in AccessExclusiveLock mode. OldHeap and its
+ * clustering index (if one is passed) are still locked in a mode that allows
+ * concurrent data changes. On exit, both tables and their indexes are closed,
+ * but locked in AccessExclusiveLock mode.
+ */
+static void
+rebuild_relation_finish_concurrent(Relation NewHeap, Relation OldHeap,
+                                                                  Oid identIdx, TransactionId frozenXid,
+                                                                  MultiXactId cutoffMulti)
+{
+       List       *ind_oids_new;
+       Oid                     old_table_oid = RelationGetRelid(OldHeap);
+       Oid                     new_table_oid = RelationGetRelid(NewHeap);
+       List       *ind_oids_old = RelationGetIndexList(OldHeap);
+       ListCell   *lc,
+                          *lc2;
+       char            relpersistence;
+       bool            is_system_catalog;
+       Oid                     ident_idx_new;
+       XLogRecPtr      end_of_wal;
+       List       *indexrels;
+       ChangeContext chgcxt;
+
+       Assert(CheckRelationLockedByMe(OldHeap, ShareUpdateExclusiveLock, false));
+       Assert(CheckRelationLockedByMe(NewHeap, AccessExclusiveLock, false));
+
+       /*
+        * Unlike the exclusive case, we build new indexes for the new relation
+        * rather than swapping the storage and reindexing the old relation. The
+        * point is that the index build can take some time, so we do it before we
+        * get AccessExclusiveLock on the old heap and therefore we cannot swap
+        * the heap storage yet.
+        *
+        * index_create() will lock the new indexes using AccessExclusiveLock - no
+        * need to change that. At the same time, we use ShareUpdateExclusiveLock
+        * to lock the existing indexes - that should be enough to prevent others
+        * from changing them while we're repacking the relation. The lock on
+        * table should prevent others from changing the index column list, but
+        * might not be enough for commands like ALTER INDEX ... SET ... (Those
+        * are not necessarily dangerous, but can make user confused if the
+        * changes they do get lost due to REPACK.)
+        */
+       ind_oids_new = build_new_indexes(NewHeap, OldHeap, ind_oids_old);
+
+       /*
+        * The identity index in the new relation appears in the same relative
+        * position as the corresponding index in the old relation.  Find it.
+        */
+       ident_idx_new = InvalidOid;
+       foreach_oid(ind_old, ind_oids_old)
+       {
+               if (identIdx == ind_old)
+               {
+                       int                     pos = foreach_current_index(ind_old);
+
+                       if (unlikely(list_length(ind_oids_new) < pos))
+                               elog(ERROR, "list of new indexes too short");
+                       ident_idx_new = list_nth_oid(ind_oids_new, pos);
+                       break;
+               }
+       }
+       if (!OidIsValid(ident_idx_new))
+               elog(ERROR, "could not find index matching \"%s\" at the new relation",
+                        get_rel_name(identIdx));
+
+       /* Gather information to apply concurrent changes. */
+       initialize_change_context(&chgcxt, NewHeap, ident_idx_new);
+
+       /*
+        * During testing, wait for another backend to perform concurrent data
+        * changes which we will process below.
+        */
+       INJECTION_POINT("repack-concurrently-before-lock", NULL);
+
+       /*
+        * Flush all WAL records inserted so far (possibly except for the last
+        * incomplete page; see GetInsertRecPtr), to minimize the amount of data
+        * we need to flush while holding exclusive lock on the source table.
+        */
+       XLogFlush(GetXLogInsertEndRecPtr());
+       end_of_wal = GetFlushRecPtr(NULL);
+
+       /*
+        * Apply concurrent changes first time, to minimize the time we need to
+        * hold AccessExclusiveLock. (Quite some amount of WAL could have been
+        * written during the data copying and index creation.)
+        */
+       process_concurrent_changes(end_of_wal, &chgcxt, false);
+
+       /*
+        * Acquire AccessExclusiveLock on the table, its TOAST relation (if there
+        * is one), all its indexes, so that we can swap the files.
+        */
+       LockRelationOid(old_table_oid, AccessExclusiveLock);
+
+       /*
+        * Lock all indexes now, not only the clustering one: all indexes need to
+        * have their files swapped. While doing that, store their relation
+        * references in a zero-terminated array, to handle predicate locks below.
+        */
+       indexrels = NIL;
+       foreach_oid(ind_oid, ind_oids_old)
+       {
+               Relation        index;
+
+               index = index_open(ind_oid, AccessExclusiveLock);
+
+               /*
+                * Some things about the index may have changed before we locked the
+                * index, such as ALTER INDEX RENAME.  We don't need to do anything
+                * here to absorb those changes in the new index.
+                */
+               indexrels = lappend(indexrels, index);
+       }
+
+       /*
+        * Lock the OldHeap's TOAST relation exclusively - again, the lock is
+        * needed to swap the files.
+        */
+       if (OidIsValid(OldHeap->rd_rel->reltoastrelid))
+               LockRelationOid(OldHeap->rd_rel->reltoastrelid, AccessExclusiveLock);
+
+       /*
+        * Tuples and pages of the old heap will be gone, but the heap will stay.
+        */
+       TransferPredicateLocksToHeapRelation(OldHeap);
+       foreach_ptr(RelationData, index, indexrels)
+       {
+               TransferPredicateLocksToHeapRelation(index);
+               index_close(index, NoLock);
+       }
+       list_free(indexrels);
+
+       /*
+        * Flush WAL again, to make sure that all changes committed while we were
+        * waiting for the exclusive lock are available for decoding.
+        */
+       XLogFlush(GetXLogInsertEndRecPtr());
+       end_of_wal = GetFlushRecPtr(NULL);
+
+       /*
+        * Apply the concurrent changes again. Indicate that the decoding worker
+        * won't be needed anymore.
+        */
+       process_concurrent_changes(end_of_wal, &chgcxt, true);
+
+       /* Remember info about rel before closing OldHeap */
+       relpersistence = OldHeap->rd_rel->relpersistence;
+       is_system_catalog = IsSystemRelation(OldHeap);
+
+       pgstat_progress_update_param(PROGRESS_REPACK_PHASE,
+                                                                PROGRESS_REPACK_PHASE_SWAP_REL_FILES);
+
+       /*
+        * Even ShareUpdateExclusiveLock should have prevented others from
+        * creating / dropping indexes (even using the CONCURRENTLY option), so we
+        * do not need to check whether the lists match.
+        */
+       forboth(lc, ind_oids_old, lc2, ind_oids_new)
+       {
+               Oid                     ind_old = lfirst_oid(lc);
+               Oid                     ind_new = lfirst_oid(lc2);
+               Oid                     mapped_tables[4] = {0};
+
+               swap_relation_files(ind_old, ind_new,
+                                                       (old_table_oid == RelationRelationId),
+                                                       false,  /* swap_toast_by_content */
+                                                       true,
+                                                       InvalidTransactionId,
+                                                       InvalidMultiXactId,
+                                                       mapped_tables);
+
+#ifdef USE_ASSERT_CHECKING
+
+               /*
+                * Concurrent processing is not supported for system relations, so
+                * there should be no mapped tables.
+                */
+               for (int i = 0; i < 4; i++)
+                       Assert(!OidIsValid(mapped_tables[i]));
+#endif
+       }
+
+       /* The new indexes must be visible for deletion. */
+       CommandCounterIncrement();
+
+       /* Close the old heap but keep lock until transaction commit. */
+       table_close(OldHeap, NoLock);
+       /* Close the new heap. (We didn't have to open its indexes). */
+       table_close(NewHeap, NoLock);
+
+       /* Cleanup what we don't need anymore. (And close the identity index.) */
+       release_change_context(&chgcxt);
+
+       /*
+        * Swap the relations and their TOAST relations and TOAST indexes. This
+        * also drops the new relation and its indexes.
+        *
+        * (System catalogs are currently not supported.)
+        */
+       Assert(!is_system_catalog);
+       finish_heap_swap(old_table_oid, new_table_oid,
+                                        is_system_catalog,
+                                        false,         /* swap_toast_by_content */
+                                        false,
+                                        true,
+                                        false,         /* reindex */
+                                        frozenXid, cutoffMulti,
+                                        relpersistence);
+}
+
+/*
+ * Build indexes on NewHeap according to those on OldHeap.
+ *
+ * OldIndexes is the list of index OIDs on OldHeap. The contained indexes end
+ * up locked using ShareUpdateExclusiveLock.
+ *
+ * A list of OIDs of the corresponding indexes created on NewHeap is
+ * returned. The order of items does match, so we can use these arrays to swap
+ * index storage.
+ */
+static List *
+build_new_indexes(Relation NewHeap, Relation OldHeap, List *OldIndexes)
+{
+       List       *result = NIL;
+
+       pgstat_progress_update_param(PROGRESS_REPACK_PHASE,
+                                                                PROGRESS_REPACK_PHASE_REBUILD_INDEX);
+
+       foreach_oid(oldindex, OldIndexes)
+       {
+               Oid                     newindex;
+               char       *newName;
+               Relation        ind;
+
+               ind = index_open(oldindex, ShareUpdateExclusiveLock);
+
+               /* Pick a transient "repacknew" name in the table's namespace. */
+               newName = ChooseRelationName(get_rel_name(oldindex),
+                                                                        NULL,
+                                                                        "repacknew",
+                                                                        get_rel_namespace(ind->rd_index->indrelid),
+                                                                        false);
+               /* Create the copy, suppressing its own progress reporting. */
+               newindex = index_create_copy(NewHeap, INDEX_CREATE_SUPPRESS_PROGRESS,
+                                                                        oldindex, ind->rd_rel->reltablespace,
+                                                                        newName);
+               /* Also clone any constraints backed by this index. */
+               copy_index_constraints(ind, newindex, RelationGetRelid(NewHeap));
+               result = lappend_oid(result, newindex);
+
+               index_close(ind, NoLock);
+       }
+
+       return result;
+}
+
+/*
+ * Create a transient copy of a constraint -- supported by a transient
+ * copy of the index that supports the original constraint.
+ *
+ * When repacking a table that contains exclusion constraints, the executor
+ * relies on these constraints being properly catalogued.  These copies are
+ * to support that.
+ *
+ * We don't need the constraints for anything else (the original constraints
+ * will be there once repack completes), so we add pg_depend entries so that
+ * they are dropped when the transient table is dropped.
+ */
+static void
+copy_index_constraints(Relation old_index, Oid new_index_id, Oid new_heap_id)
+{
+       ScanKeyData skey;
+       Relation        rel;
+       TupleDesc       desc;
+       SysScanDesc scan;
+       HeapTuple       tup;
+       ObjectAddress objrel;
+
+       rel = table_open(ConstraintRelationId, RowExclusiveLock);
+       /* The referenced object of the dependencies we record below. */
+       ObjectAddressSet(objrel, RelationRelationId, new_heap_id);
+
+       /*
+        * Retrieve the constraints supported by the old index and create an
+        * identical one that points to the new index.
+        */
+       ScanKeyInit(&skey,
+                               Anum_pg_constraint_conrelid,
+                               BTEqualStrategyNumber, F_OIDEQ,
+                               ObjectIdGetDatum(old_index->rd_index->indrelid));
+       scan = systable_beginscan(rel, ConstraintRelidTypidNameIndexId, true,
+                                                         NULL, 1, &skey);
+       desc = RelationGetDescr(rel);
+       while (HeapTupleIsValid(tup = systable_getnext(scan)))
+       {
+               Form_pg_constraint conform = (Form_pg_constraint) GETSTRUCT(tup);
+               Oid                     oid;
+               Datum           values[Natts_pg_constraint] = {0};
+               bool            nulls[Natts_pg_constraint] = {0};
+               bool            replaces[Natts_pg_constraint] = {0};
+               HeapTuple       new_tup;
+               ObjectAddress objcon;
+
+               /* Ignore constraints supported by some other index of the table. */
+               if (conform->conindid != RelationGetRelid(old_index))
+                       continue;
+
+               /* Copy the old tuple, replacing only OID, conrelid and conindid. */
+               oid = GetNewOidWithIndex(rel, ConstraintOidIndexId,
+                                                                Anum_pg_constraint_oid);
+               values[Anum_pg_constraint_oid - 1] = ObjectIdGetDatum(oid);
+               replaces[Anum_pg_constraint_oid - 1] = true;
+               values[Anum_pg_constraint_conrelid - 1] = ObjectIdGetDatum(new_heap_id);
+               replaces[Anum_pg_constraint_conrelid - 1] = true;
+               values[Anum_pg_constraint_conindid - 1] = ObjectIdGetDatum(new_index_id);
+               replaces[Anum_pg_constraint_conindid - 1] = true;
+
+               new_tup = heap_modify_tuple(tup, desc, values, nulls, replaces);
+
+               /* Insert it into the catalog. */
+               CatalogTupleInsert(rel, new_tup);
+
+               /* Create a dependency so it's removed when we drop the new heap. */
+               ObjectAddressSet(objcon, ConstraintRelationId, oid);
+               recordDependencyOn(&objcon, &objrel, DEPENDENCY_AUTO);
+       }
+       systable_endscan(scan);
+
+       table_close(rel, RowExclusiveLock);
+
+       /* Make the new catalog entries visible to subsequent commands. */
+       CommandCounterIncrement();
+}
+
+/*
+ * Try to start a background worker to perform logical decoding of data
+ * changes applied to relation while REPACK CONCURRENTLY is copying its
+ * contents to a new table.
+ */
+static void
+start_repack_decoding_worker(Oid relid)
+{
+       Size            size;
+       dsm_segment *seg;
+       DecodingWorkerShared *shared;
+       shm_mq     *mq;
+       shm_mq_handle *mqh;
+       BackgroundWorker bgw;
+
+       /* Setup shared memory: the fixed struct followed by the error queue. */
+       size = BUFFERALIGN(offsetof(DecodingWorkerShared, error_queue)) +
+               BUFFERALIGN(REPACK_ERROR_QUEUE_SIZE);
+       seg = dsm_create(size, 0);
+       shared = (DecodingWorkerShared *) dsm_segment_address(seg);
+       shared->lsn_upto = InvalidXLogRecPtr;
+       shared->done = false;
+       SharedFileSetInit(&shared->sfs, seg);
+       shared->last_exported = -1;
+       SpinLockInit(&shared->mutex);
+       shared->dbid = MyDatabaseId;
+
+       /*
+        * This is the UserId set in cluster_rel(). Security context shouldn't be
+        * needed for decoding worker.
+        */
+       shared->roleid = GetUserId();
+       shared->relid = relid;
+       ConditionVariableInit(&shared->cv);
+       shared->backend_proc = MyProc;
+       shared->backend_pid = MyProcPid;
+       shared->backend_proc_number = MyProcNumber;
+
+       /* We are the receiver of the worker's error/notice messages. */
+       mq = shm_mq_create((char *) BUFFERALIGN(shared->error_queue),
+                                          REPACK_ERROR_QUEUE_SIZE);
+       shm_mq_set_receiver(mq, MyProc);
+       mqh = shm_mq_attach(mq, seg, NULL);
+
+       memset(&bgw, 0, sizeof(bgw));
+       snprintf(bgw.bgw_name, BGW_MAXLEN,
+                        "REPACK decoding worker for relation \"%s\"",
+                        get_rel_name(relid));
+       snprintf(bgw.bgw_type, BGW_MAXLEN, "REPACK decoding worker");
+       bgw.bgw_flags = BGWORKER_SHMEM_ACCESS |
+               BGWORKER_BACKEND_DATABASE_CONNECTION;
+       bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
+       bgw.bgw_restart_time = BGW_NEVER_RESTART;
+       /* "postgres" means the entry point lives in the core server binary. */
+       snprintf(bgw.bgw_library_name, MAXPGPATH, "postgres");
+       snprintf(bgw.bgw_function_name, BGW_MAXLEN, "RepackWorkerMain");
+       bgw.bgw_main_arg = UInt32GetDatum(dsm_segment_handle(seg));
+       bgw.bgw_notify_pid = MyProcPid;
+
+       decoding_worker = palloc0_object(DecodingWorker);
+       if (!RegisterDynamicBackgroundWorker(&bgw, &decoding_worker->handle))
+               ereport(ERROR,
+                               errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
+                               errmsg("out of background worker slots"),
+                               errhint("You might need to increase \"%s\".", "max_worker_processes"));
+
+       decoding_worker->seg = seg;
+       decoding_worker->error_mqh = mqh;
+
+       /*
+        * The decoding setup must be done before the caller can have XID assigned
+        * for any reason, otherwise the worker might end up in a deadlock,
+        * waiting for the caller's transaction to end. Therefore wait here until
+        * the worker indicates that it has the logical decoding initialized.
+        */
+       ConditionVariablePrepareToSleep(&shared->cv);
+       for (;;)
+       {
+               bool            initialized;
+
+               SpinLockAcquire(&shared->mutex);
+               initialized = shared->initialized;
+               SpinLockRelease(&shared->mutex);
+
+               if (initialized)
+                       break;
+
+               ConditionVariableSleep(&shared->cv, WAIT_EVENT_REPACK_WORKER_EXPORT);
+       }
+       ConditionVariableCancelSleep();
+}
+
+/*
+ * Stop the decoding worker and cleanup the related resources.
+ *
+ * The worker stops on its own when it knows there is no more work to do, but
+ * we need to stop it explicitly at least on ERROR in the launching backend.
+ *
+ * Safe to call when the worker was never started or failed to register.
+ */
+static void
+stop_repack_decoding_worker(void)
+{
+       BgwHandleStatus status;
+
+       /* Haven't reached the worker startup? */
+       if (decoding_worker == NULL)
+               return;
+
+       /* Could not register the worker? */
+       if (decoding_worker->handle == NULL)
+               return;
+
+       TerminateBackgroundWorker(decoding_worker->handle);
+       /* The worker should really exit before the REPACK command does. */
+       HOLD_INTERRUPTS();
+       status = WaitForBackgroundWorkerShutdown(decoding_worker->handle);
+       RESUME_INTERRUPTS();
+
+       if (status == BGWH_POSTMASTER_DIED)
+               ereport(FATAL,
+                               errcode(ERRCODE_ADMIN_SHUTDOWN),
+                               errmsg("postmaster exited during REPACK command"));
+
+       shm_mq_detach(decoding_worker->error_mqh);
+
+       /*
+        * If we could not cancel the current sleep due to ERROR, do that before
+        * we detach from the shared memory the condition variable is located in.
+        * If we did not, the bgworker ERROR handling code would try and fail
+        * badly.
+        */
+       ConditionVariableCancelSleep();
+
+       dsm_detach(decoding_worker->seg);
+       pfree(decoding_worker);
+       decoding_worker = NULL;
+}
+
+/*
+ * Get the initial snapshot from the decoding worker.
+ *
+ * The worker serializes the snapshot into a file in the shared fileset (see
+ * export_initial_snapshot()); wait for that file to appear, then read and
+ * restore the snapshot.
+ */
+static Snapshot
+get_initial_snapshot(DecodingWorker *worker)
+{
+       DecodingWorkerShared *shared;
+       char            fname[MAXPGPATH];
+       BufFile    *file;
+       Size            snap_size;
+       char       *snap_space;
+       Snapshot        snapshot;
+
+       shared = (DecodingWorkerShared *) dsm_segment_address(worker->seg);
+
+       /*
+        * The worker needs to initialize the logical decoding, which usually
+        * takes some time. Therefore it makes sense to prepare for the sleep
+        * first.
+        */
+       ConditionVariablePrepareToSleep(&shared->cv);
+       for (;;)
+       {
+               int                     last_exported;
+
+               SpinLockAcquire(&shared->mutex);
+               last_exported = shared->last_exported;
+               SpinLockRelease(&shared->mutex);
+
+               /*
+                * Has the worker exported the file we are waiting for?
+                */
+               if (last_exported == WORKER_FILE_SNAPSHOT)
+                       break;
+
+               ConditionVariableSleep(&shared->cv, WAIT_EVENT_REPACK_WORKER_EXPORT);
+       }
+       ConditionVariableCancelSleep();
+
+       /* Read the snapshot from a file.  Its size is stored first. */
+       DecodingWorkerFileName(fname, shared->relid, WORKER_FILE_SNAPSHOT);
+       file = BufFileOpenFileSet(&shared->sfs.fs, fname, O_RDONLY, false);
+       BufFileReadExact(file, &snap_size, sizeof(snap_size));
+       snap_space = (char *) palloc(snap_size);
+       BufFileReadExact(file, snap_space, snap_size);
+       BufFileClose(file);
+
+       /* Restore it. */
+       snapshot = RestoreSnapshot(snap_space);
+       pfree(snap_space);
+
+       return snapshot;
+}
+
+/*
+ * Generate worker's file name into 'fname', which must be of size MAXPGPATH.
+ *
+ * The name combines the relation OID with a sequence number 'seq'.  If
+ * relations of the same 'relid' happen to be processed at the same time,
+ * they must be from different databases and therefore different backends must
+ * be involved.
+ */
+void
+DecodingWorkerFileName(char *fname, Oid relid, uint32 seq)
+{
+       /* The PID is already present in the fileset name, so we needn't add it */
+       snprintf(fname, MAXPGPATH, "%u-%u", relid, seq);
+}
+
+/*
+ * Handle receipt of an interrupt indicating a repack worker message.
+ *
+ * Note: this is called within a signal handler!  All we can do is set
+ * a flag that will cause the next CHECK_FOR_INTERRUPTS() to invoke
+ * ProcessRepackMessages().
+ */
+void
+HandleRepackMessageInterrupt(void)
+{
+       InterruptPending = true;
+       RepackMessagePending = true;
+       /* Wake the backend in case it's currently waiting on its latch. */
+       SetLatch(MyLatch);
+}
+
+/*
+ * Process any queued protocol messages received from the repack worker.
+ *
+ * Reached via CHECK_FOR_INTERRUPTS() once RepackMessagePending has been set
+ * by HandleRepackMessageInterrupt().
+ */
+void
+ProcessRepackMessages(void)
+{
+       MemoryContext oldcontext;
+       static MemoryContext hpm_context = NULL;        /* survives across calls */
+
+       /*
+        * Nothing to do if we haven't launched the worker yet or have already
+        * terminated it.
+        */
+       if (decoding_worker == NULL)
+               return;
+
+       /*
+        * This is invoked from ProcessInterrupts(), and since some of the
+        * functions it calls contain CHECK_FOR_INTERRUPTS(), there is a potential
+        * for recursive calls if more signals are received while this runs.  It's
+        * unclear that recursive entry would be safe, and it doesn't seem useful
+        * even if it is safe, so let's block interrupts until done.
+        */
+       HOLD_INTERRUPTS();
+
+       /*
+        * Moreover, CurrentMemoryContext might be pointing almost anywhere.  We
+        * don't want to risk leaking data into long-lived contexts, so let's do
+        * our work here in a private context that we can reset on each use.
+        */
+       if (hpm_context == NULL)        /* first time through? */
+               hpm_context = AllocSetContextCreate(TopMemoryContext,
+                                                                                       "ProcessRepackMessages",
+                                                                                       ALLOCSET_DEFAULT_SIZES);
+       else
+               MemoryContextReset(hpm_context);
+
+       oldcontext = MemoryContextSwitchTo(hpm_context);
+
+       /* OK to process messages.  Reset the flag saying there are more to do. */
+       RepackMessagePending = false;
+
+       /*
+        * Read as many messages as we can from the worker, but stop when no more
+        * messages can be read from the worker without blocking.
+        */
+       while (true)
+       {
+               shm_mq_result res;
+               Size            nbytes;
+               void       *data;
+
+               res = shm_mq_receive(decoding_worker->error_mqh, &nbytes,
+                                                        &data, true);
+               if (res == SHM_MQ_WOULD_BLOCK)
+                       break;
+               else if (res == SHM_MQ_SUCCESS)
+               {
+                       StringInfoData msg;
+
+                       initStringInfo(&msg);
+                       appendBinaryStringInfo(&msg, data, nbytes);
+                       ProcessRepackMessage(&msg);
+                       pfree(msg.data);
+               }
+               else
+               {
+                       /*
+                        * The decoding worker is special in that it exits as soon as it
+                        * has its work done. Thus the DETACHED result code is fine.
+                        */
+                       Assert(res == SHM_MQ_DETACHED);
+
+                       break;
+               }
+       }
+
+       MemoryContextSwitchTo(oldcontext);
+
+       /* Might as well clear the context on our way out */
+       MemoryContextReset(hpm_context);
+
+       RESUME_INTERRUPTS();
+}
+
+/*
+ * Process a single protocol message received from the repack decoding
+ * worker.
+ */
+static void
+ProcessRepackMessage(StringInfo msg)
+{
+       char            msgtype;
+
+       msgtype = pq_getmsgbyte(msg);
+
+       switch (msgtype)
+       {
+               case PqMsg_ErrorResponse:
+               case PqMsg_NoticeResponse:
+                       {
+                               ErrorData       edata;
+
+                               /* Parse ErrorResponse or NoticeResponse. */
+                               pq_parse_errornotice(msg, &edata);
+
+                               /* Death of a worker isn't enough justification for suicide. */
+                               edata.elevel = Min(edata.elevel, ERROR);
+
+                               /*
+                                * Add a context line to show that this is a message
+                                * propagated from the worker.  Otherwise, it can sometimes be
+                                * confusing to understand what actually happened.
+                                */
+                               if (edata.context)
+                                       edata.context = psprintf("%s\n%s", edata.context,
+                                                                                        _("REPACK decoding worker"));
+                               else
+                                       edata.context = pstrdup(_("REPACK decoding worker"));
+
+                               /* Rethrow error or print notice. */
+                               ThrowErrorData(&edata);
+
+                               break;
+                       }
+
+               default:
+                       {
+                               elog(ERROR, "unrecognized message type received from decoding worker: %c (message length %d bytes)",
+                                        msgtype, msg->len);
+                       }
+       }
+}
diff --git a/src/backend/commands/repack_worker.c b/src/backend/commands/repack_worker.c
new file mode 100644 (file)
index 0000000..94d0349
--- /dev/null
@@ -0,0 +1,533 @@
+/*-------------------------------------------------------------------------
+ *
+ * repack_worker.c
+ *    Implementation of the background worker for ad-hoc logical decoding
+ *    during REPACK (CONCURRENTLY).
+ *
+ *
+ * Copyright (c) 2026, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ *       src/backend/commands/repack_worker.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/table.h"
+#include "access/xlog_internal.h"
+#include "access/xlogutils.h"
+#include "access/xlogwait.h"
+#include "commands/repack.h"
+#include "commands/repack_internal.h"
+#include "libpq/pqmq.h"
+#include "replication/snapbuild.h"
+#include "storage/ipc.h"
+#include "storage/proc.h"
+#include "tcop/tcopprot.h"
+#include "utils/memutils.h"
+
+#define REPL_PLUGIN_NAME   "pgrepack"
+
+static void RepackWorkerShutdown(int code, Datum arg);
+static LogicalDecodingContext *repack_setup_logical_decoding(Oid relid);
+static void repack_cleanup_logical_decoding(LogicalDecodingContext *ctx);
+static void export_initial_snapshot(Snapshot snapshot,
+                                                                       DecodingWorkerShared *shared);
+static bool decode_concurrent_changes(LogicalDecodingContext *ctx,
+                                                                         DecodingWorkerShared *shared);
+
+/* Is this process a REPACK worker? */
+static bool am_repack_worker = false;
+
+/* The WAL segment being decoded. */
+static XLogSegNo repack_current_segment = 0;
+
+/*
+ * Keep track of the table we're processing, to skip logical decoding of data
+ * from other relations.
+ */
+static RelFileLocator repacked_rel_locator = {.relNumber = InvalidOid};
+static RelFileLocator repacked_rel_toast_locator = {.relNumber = InvalidOid};
+
+
+/* REPACK decoding worker entry point */
+void
+RepackWorkerMain(Datum main_arg)
+{
+       dsm_segment *seg;
+       DecodingWorkerShared *shared;
+       shm_mq     *mq;
+       shm_mq_handle *mqh;
+       LogicalDecodingContext *decoding_ctx;
+       SharedFileSet *sfs;
+       Snapshot        snapshot;
+
+       am_repack_worker = true;
+
+       /*
+        * Override the default bgworker_die() with die() so we can use
+        * CHECK_FOR_INTERRUPTS().
+        */
+       pqsignal(SIGTERM, die);
+       BackgroundWorkerUnblockSignals();
+
+       seg = dsm_attach(DatumGetUInt32(main_arg));
+       if (seg == NULL)
+               ereport(ERROR,
+                               errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                               errmsg("could not map dynamic shared memory segment"));
+
+       shared = (DecodingWorkerShared *) dsm_segment_address(seg);
+       /* Stash our mapping so RepackWorkerShutdown() can detach it. */
+       shared->dsm_seg = seg;
+
+       /* Arrange to signal the leader if we exit. */
+       before_shmem_exit(RepackWorkerShutdown, PointerGetDatum(shared));
+
+       /*
+        * Join locking group - see the comments around the call of
+        * start_repack_decoding_worker().
+        */
+       if (!BecomeLockGroupMember(shared->backend_proc, shared->backend_pid))
+               return;                                 /* The leader is not running anymore. */
+
+       /*
+        * Setup a queue to send error messages to the backend that launched this
+        * worker.
+        */
+       mq = (shm_mq *) (char *) BUFFERALIGN(shared->error_queue);
+       shm_mq_set_sender(mq, MyProc);
+       mqh = shm_mq_attach(mq, seg, NULL);
+       pq_redirect_to_shm_mq(seg, mqh);
+       pq_set_parallel_leader(shared->backend_pid,
+                                                  shared->backend_proc_number);
+
+       /* Connect to the database. */
+       BackgroundWorkerInitializeConnectionByOid(shared->dbid, shared->roleid, 0);
+
+       /*
+        * Transaction is needed to open relation, and it also provides us with a
+        * resource owner.
+        */
+       StartTransactionCommand();
+
+       /* NOTE(review): 'shared' was already set above; this looks redundant. */
+       shared = (DecodingWorkerShared *) dsm_segment_address(seg);
+
+       /*
+        * Not sure the spinlock is needed here - the backend should not change
+        * anything in the shared memory until we have serialized the snapshot.
+        */
+       SpinLockAcquire(&shared->mutex);
+       Assert(!XLogRecPtrIsValid(shared->lsn_upto));
+       sfs = &shared->sfs;
+       SpinLockRelease(&shared->mutex);
+
+       SharedFileSetAttach(sfs, seg);
+
+       /*
+        * Prepare to capture the concurrent data changes ourselves.
+        */
+       decoding_ctx = repack_setup_logical_decoding(shared->relid);
+
+       /* Announce that we're ready. */
+       SpinLockAcquire(&shared->mutex);
+       shared->initialized = true;
+       SpinLockRelease(&shared->mutex);
+       ConditionVariableSignal(&shared->cv);
+
+       /* There doesn't seem to be a nice API to set these */
+       XactIsoLevel = XACT_REPEATABLE_READ;
+       XactReadOnly = true;
+
+       /* Build the initial snapshot and export it. */
+       snapshot = SnapBuildInitialSnapshot(decoding_ctx->snapshot_builder);
+       export_initial_snapshot(snapshot, shared);
+
+       /*
+        * Only historic snapshots should be used now. Do not let us restrict the
+        * progress of xmin horizon.
+        */
+       InvalidateCatalogSnapshot();
+
+       /* Decode until there's no more work (signalled via shared->lsn_upto). */
+       for (;;)
+       {
+               bool            stop = decode_concurrent_changes(decoding_ctx, shared);
+
+               if (stop)
+                       break;
+
+       }
+
+       /* Cleanup. */
+       repack_cleanup_logical_decoding(decoding_ctx);
+       CommitTransactionCommand();
+}
+
+/*
+ * Exit callback: let the launching backend know we're gone, then detach
+ * from the shared memory segment.  See ParallelWorkerShutdown for details.
+ */
+static void
+RepackWorkerShutdown(int code, Datum arg)
+{
+       DecodingWorkerShared *shared = (DecodingWorkerShared *) DatumGetPointer(arg);
+
+       SendProcSignal(shared->backend_pid,
+                                  PROCSIG_REPACK_MESSAGE,
+                                  shared->backend_proc_number);
+
+       dsm_detach(shared->dsm_seg);
+}
+
+/*
+ * Report whether the current process is a REPACK decoding worker.
+ */
+bool
+AmRepackWorker(void)
+{
+       return am_repack_worker;
+}
+
+/*
+ * This function is much like pg_create_logical_replication_slot() except that
+ * the new slot is neither released (if anyone else could read changes from
+ * our slot, we could miss changes other backends do while we copy the
+ * existing data into temporary table), nor persisted (it's easier to handle
+ * crash by restarting all the work from scratch).
+ *
+ * Returns the decoding context, with our RepackDecodingState stored in its
+ * output_writer_private field.
+ */
+static LogicalDecodingContext *
+repack_setup_logical_decoding(Oid relid)
+{
+       Relation        rel;
+       Oid                     toastrelid;
+       LogicalDecodingContext *ctx;
+       NameData        slotname;
+       RepackDecodingState *dstate;
+       MemoryContext oldcxt;
+
+       /*
+        * REPACK CONCURRENTLY is not allowed in a transaction block, so this
+        * should never fire.
+        */
+       Assert(!TransactionIdIsValid(GetTopTransactionIdIfAny()));
+
+       /*
+        * Make sure we can use logical decoding.
+        */
+       CheckSlotPermissions();
+       CheckLogicalDecodingRequirements();
+
+       /*
+        * A single backend should not execute multiple REPACK commands at a time,
+        * so use PID to make the slot unique.
+        *
+        * RS_TEMPORARY so that the slot gets cleaned up on ERROR.
+        */
+       snprintf(NameStr(slotname), NAMEDATALEN, "repack_%d", MyProcPid);
+       ReplicationSlotCreate(NameStr(slotname), true, RS_TEMPORARY, false, false,
+                                                 false);
+
+       EnsureLogicalDecodingEnabled();
+
+       /*
+        * Neither prepare_write nor do_write callback nor update_progress is
+        * useful for us.
+        */
+       ctx = CreateInitDecodingContext(REPL_PLUGIN_NAME,
+                                                                       NIL,    /* no output plugin options */
+                                                                       true,   /* need_full_snapshot */
+                                                                       InvalidXLogRecPtr,
+                                                                       XL_ROUTINE(.page_read = read_local_xlog_page,
+                                                                                          .segment_open = wal_segment_open,
+                                                                                          .segment_close = wal_segment_close),
+                                                                       NULL, NULL, NULL);
+
+       /*
+        * We don't have control on setting fast_forward, so at least check it.
+        */
+       Assert(!ctx->fast_forward);
+
+       /* Avoid logical decoding of other relations. */
+       rel = table_open(relid, AccessShareLock);
+       repacked_rel_locator = rel->rd_locator;
+       toastrelid = rel->rd_rel->reltoastrelid;
+       if (OidIsValid(toastrelid))
+       {
+               Relation        toastrel;
+
+               /* Avoid logical decoding of other TOAST relations. */
+               toastrel = table_open(toastrelid, AccessShareLock);
+               repacked_rel_toast_locator = toastrel->rd_locator;
+               table_close(toastrel, AccessShareLock);
+       }
+       table_close(rel, AccessShareLock);
+
+       DecodingContextFindStartpoint(ctx);
+
+       /*
+        * decode_concurrent_changes() needs non-blocking callback.
+        */
+       ctx->reader->routine.page_read = read_local_xlog_page_no_wait;
+
+       /* Some WAL records should have been read. */
+       Assert(ctx->reader->EndRecPtr != InvalidXLogRecPtr);
+
+       /*
+        * Initialize repack_current_segment so that we can notice WAL segment
+        * boundaries.
+        */
+       XLByteToSeg(ctx->reader->EndRecPtr, repack_current_segment,
+                               wal_segment_size);
+
+       /* Our private state belongs to the decoding context. */
+       oldcxt = MemoryContextSwitchTo(ctx->context);
+
+       /*
+        * read_local_xlog_page_no_wait() needs to be able to indicate the end of
+        * WAL.
+        */
+       ctx->reader->private_data = palloc0_object(ReadLocalXLogPageNoWaitPrivate);
+       dstate = palloc0_object(RepackDecodingState);
+       MemoryContextSwitchTo(oldcxt);
+
+#ifdef USE_ASSERT_CHECKING
+       dstate->relid = relid;
+#endif
+
+       dstate->change_cxt = AllocSetContextCreate(ctx->context,
+                                                                                          "REPACK - change",
+                                                                                          ALLOCSET_DEFAULT_SIZES);
+
+       /* The file will be set as soon as we have it opened. */
+       dstate->file = NULL;
+
+       /*
+        * Memory context and resource owner for long-lived resources.
+        */
+       dstate->worker_cxt = CurrentMemoryContext;
+       dstate->worker_resowner = CurrentResourceOwner;
+
+       ctx->output_writer_private = dstate;
+
+       return ctx;
+}
+
+/*
+ * Release the resources set up by repack_setup_logical_decoding(): the
+ * tuple slot (if one was created), the decoding context, and the
+ * replication slot we still hold acquired.
+ */
+static void
+repack_cleanup_logical_decoding(LogicalDecodingContext *ctx)
+{
+       RepackDecodingState *dstate;
+
+       dstate = (RepackDecodingState *) ctx->output_writer_private;
+       if (dstate->slot)
+               ExecDropSingleTupleTableSlot(dstate->slot);
+
+       FreeDecodingContext(ctx);
+       ReplicationSlotDropAcquired();
+}
+
+/*
+ * Make snapshot available to the backend that launched the decoding worker.
+ *
+ * The snapshot is serialized into a file in the shared fileset; the backend
+ * reads it back in get_initial_snapshot().
+ */
+static void
+export_initial_snapshot(Snapshot snapshot, DecodingWorkerShared *shared)
+{
+       char            fname[MAXPGPATH];
+       BufFile    *file;
+       Size            snap_size;
+       char       *snap_space;
+
+       snap_size = EstimateSnapshotSpace(snapshot);
+       snap_space = (char *) palloc(snap_size);
+       SerializeSnapshot(snapshot, snap_space);
+
+       DecodingWorkerFileName(fname, shared->relid, shared->last_exported + 1);
+       file = BufFileCreateFileSet(&shared->sfs.fs, fname);
+       /* To make restoration easier, write the snapshot size first. */
+       BufFileWrite(file, &snap_size, sizeof(snap_size));
+       BufFileWrite(file, snap_space, snap_size);
+       pfree(snap_space);
+       BufFileClose(file);
+
+       /* Increase the counter to tell the backend that the file is available. */
+       SpinLockAcquire(&shared->mutex);
+       shared->last_exported++;
+       SpinLockRelease(&shared->mutex);
+       ConditionVariableSignal(&shared->cv);
+}
+
+/*
+ * Decode logical changes from the WAL sequence and store them to a file.
+ *
+ * If true is returned, there is no more work for the worker.
+ */
+static bool
+decode_concurrent_changes(LogicalDecodingContext *ctx,
+                                                 DecodingWorkerShared *shared)
+{
+       RepackDecodingState *dstate;
+       XLogRecPtr      lsn_upto;
+       bool            done;
+       char            fname[MAXPGPATH];
+
+       dstate = (RepackDecodingState *) ctx->output_writer_private;
+
+       /* Open the output file. */
+       DecodingWorkerFileName(fname, shared->relid, shared->last_exported + 1);
+       dstate->file = BufFileCreateFileSet(&shared->sfs.fs, fname);
+
+       SpinLockAcquire(&shared->mutex);
+       lsn_upto = shared->lsn_upto;
+       done = shared->done;
+       SpinLockRelease(&shared->mutex);
+
+       while (true)
+       {
+               XLogRecord *record;
+               XLogSegNo       segno_new;
+               char       *errm = NULL;
+               XLogRecPtr      end_lsn;
+
+               CHECK_FOR_INTERRUPTS();
+
+               record = XLogReadRecord(ctx->reader, &errm);
+               if (record)
+               {
+                       LogicalDecodingProcessRecord(ctx, ctx->reader);
+
+                       /*
+                        * If WAL segment boundary has been crossed, inform the decoding
+                        * system that the catalog_xmin can advance.
+                        */
+                       end_lsn = ctx->reader->EndRecPtr;
+                       XLByteToSeg(end_lsn, segno_new, wal_segment_size);
+                       if (segno_new != repack_current_segment)
+                       {
+                               LogicalConfirmReceivedLocation(end_lsn);
+                               elog(DEBUG1, "REPACK: confirmed receive location %X/%X",
+                                        (uint32) (end_lsn >> 32), (uint32) end_lsn);
+                               repack_current_segment = segno_new;
+                       }
+               }
+               else
+               {
+                       ReadLocalXLogPageNoWaitPrivate *priv;
+
+                       if (errm)
+                               ereport(ERROR,
+                                               errmsg("%s", errm));
+
+                       /*
+                        * In the decoding loop we do not want to get blocked when there
+                        * is no more WAL available, otherwise the loop would become
+                        * uninterruptible.
+                        */
+                       priv = (ReadLocalXLogPageNoWaitPrivate *) ctx->reader->private_data;
+                       if (priv->end_of_wal)
+                               /* Do not miss the end of WAL condition next time. */
+                               priv->end_of_wal = false;
+                       else
+                               ereport(ERROR,
+                                               errmsg("could not read WAL record"));
+               }
+
+               /*
+                * Whether we could read new record or not, keep checking if
+                * 'lsn_upto' was specified.
+                */
+               if (!XLogRecPtrIsValid(lsn_upto))
+               {
+                       SpinLockAcquire(&shared->mutex);
+                       lsn_upto = shared->lsn_upto;
+                       /* 'done' should be set at the same time as 'lsn_upto' */
+                       done = shared->done;
+                       SpinLockRelease(&shared->mutex);
+               }
+               if (XLogRecPtrIsValid(lsn_upto) &&
+                       ctx->reader->EndRecPtr >= lsn_upto)
+                       break;
+
+               if (record == NULL)
+               {
+                       int64           timeout = 0;
+                       WaitLSNResult res;
+
+                       /*
+                        * Before we retry reading, wait until new WAL is flushed.
+                        *
+                        * There is a race condition such that the backend executing
+                        * REPACK determines 'lsn_upto', but before it sets the shared
+                        * variable, we reach the end of WAL. In that case we'd need to
+                        * wait until the next WAL flush (unrelated to REPACK). Although
+                        * that should not be a problem in a busy system, it might be
+                        * noticeable in other cases, including regression tests (which
+                        * are not necessarily executed in parallel). Therefore it makes
+                        * sense to use timeout.
+                        *
+                        * If lsn_upto is valid, WAL records having LSN lower than that
+                        * should already have been flushed to disk.
+                        */
+                       if (!XLogRecPtrIsValid(lsn_upto))
+                               timeout = 100L;
+                       res = WaitForLSN(WAIT_LSN_TYPE_PRIMARY_FLUSH,
+                                                        ctx->reader->EndRecPtr + 1,
+                                                        timeout);
+                       if (res != WAIT_LSN_RESULT_SUCCESS &&
+                               res != WAIT_LSN_RESULT_TIMEOUT)
+                               ereport(ERROR,
+                                               errmsg("waiting for WAL failed"));
+               }
+       }
+
+       /*
+        * Close the file so we can make it available to the backend.
+        */
+       BufFileClose(dstate->file);
+       dstate->file = NULL;
+       SpinLockAcquire(&shared->mutex);
+       shared->lsn_upto = InvalidXLogRecPtr;
+       shared->last_exported++;
+       SpinLockRelease(&shared->mutex);
+       ConditionVariableSignal(&shared->cv);
+
+       return done;
+}
+
+/*
+ * Does the WAL record describe a data change that this backend can skip
+ * while decoding on behalf of REPACK (CONCURRENTLY)?  True means the
+ * change affects neither the repacked table nor its TOAST relation.
+ */
+bool
+change_useless_for_repack(XLogRecordBuffer *buf)
+{
+       XLogReaderState *record = buf->record;
+       RelFileLocator block_locator;
+
+       /* The TOAST locator must not be set unless the main one is. */
+       Assert(!OidIsValid(repacked_rel_toast_locator.relNumber) ||
+                  OidIsValid(repacked_rel_locator.relNumber));
+
+       /*
+        * Only backends involved in REPACK (CONCURRENTLY) perform this
+        * filtering; everyone else must decode all changes.
+        */
+       if (!OidIsValid(repacked_rel_locator.relNumber))
+               return false;
+
+       /*
+        * A record lacking block 0 is probably not an INSERT / UPDATE /
+        * DELETE; either way, there is not enough information here to filter
+        * the change out.
+        */
+       if (!XLogRecGetBlockTagExtended(record, 0, &block_locator,
+                                                                       NULL, NULL, NULL))
+               return false;
+
+       /* Keep changes belonging to the table being repacked ... */
+       if (RelFileLocatorEquals(block_locator, repacked_rel_locator))
+               return false;
+
+       /* ... as well as those of its TOAST relation, if any. */
+       if (OidIsValid(repacked_rel_toast_locator.relNumber) &&
+               RelFileLocatorEquals(block_locator, repacked_rel_toast_locator))
+               return false;
+
+       /* Changes of all other tables are useless for this backend. */
+       return true;
+}
index e2882a50b3b724720665731865554560a3877e23..eec09ba1ded4ca517d74b13a47f791f269855fbf 100644 (file)
@@ -6058,6 +6058,7 @@ ATRewriteTables(AlterTableStmt *parsetree, List **wqueue, LOCKMODE lockmode,
                        finish_heap_swap(tab->relid, OIDNewHeap,
                                                         false, false, true,
                                                         !OidIsValid(tab->newTableSpace),
+                                                        true,  /* reindex */
                                                         RecentXmin,
                                                         ReadNextMultiXactId(),
                                                         persistence);
index a67a70df297979661d3514b9c55753fe9054dade..99d0db82ed7f48d3eb930d3915108bb70bbc2f24 100644 (file)
@@ -127,7 +127,7 @@ static void vac_truncate_clog(TransactionId frozenXID,
                                                          TransactionId lastSaneFrozenXid,
                                                          MultiXactId lastSaneMinMulti);
 static bool vacuum_rel(Oid relid, RangeVar *relation, VacuumParams params,
-                                          BufferAccessStrategy bstrategy);
+                                          BufferAccessStrategy bstrategy, bool isTopLevel);
 static double compute_parallel_delay(void);
 static VacOptValue get_vacoptval_from_boolean(DefElem *def);
 static bool vac_tid_reaped(ItemPointer itemptr, void *state);
@@ -630,7 +630,8 @@ vacuum(List *relations, const VacuumParams *params, BufferAccessStrategy bstrate
 
                        if (params->options & VACOPT_VACUUM)
                        {
-                               if (!vacuum_rel(vrel->oid, vrel->relation, *params, bstrategy))
+                               if (!vacuum_rel(vrel->oid, vrel->relation, *params, bstrategy,
+                                                               isTopLevel))
                                        continue;
                        }
 
@@ -2004,7 +2005,7 @@ vac_truncate_clog(TransactionId frozenXID,
  */
 static bool
 vacuum_rel(Oid relid, RangeVar *relation, VacuumParams params,
-                  BufferAccessStrategy bstrategy)
+                  BufferAccessStrategy bstrategy, bool isTopLevel)
 {
        LOCKMODE        lmode;
        Relation        rel;
@@ -2295,7 +2296,7 @@ vacuum_rel(Oid relid, RangeVar *relation, VacuumParams params,
 
                        /* VACUUM FULL is a variant of REPACK; see repack.c */
                        cluster_rel(REPACK_COMMAND_VACUUMFULL, rel, InvalidOid,
-                                               &cluster_params);
+                                               &cluster_params, isTopLevel);
                        /* cluster_rel closes the relation, but keeps lock */
 
                        rel = NULL;
@@ -2338,7 +2339,8 @@ vacuum_rel(Oid relid, RangeVar *relation, VacuumParams params,
                toast_vacuum_params.options |= VACOPT_PROCESS_MAIN;
                toast_vacuum_params.toast_parent = relid;
 
-               vacuum_rel(toast_relid, NULL, toast_vacuum_params, bstrategy);
+               vacuum_rel(toast_relid, NULL, toast_vacuum_params, bstrategy,
+                                  isTopLevel);
        }
 
        /*
index 22e5164adbf75746db02e5f9404f50f87d020d48..21ce180c78ddff0c5b17f1db187863ca6a0f40e3 100644 (file)
@@ -14,6 +14,7 @@
 #include "postgres.h"
 
 #include "access/parallel.h"
+#include "commands/repack.h"
 #include "libpq/libpq.h"
 #include "libpq/pqformat.h"
 #include "libpq/pqmq.h"
@@ -177,6 +178,10 @@ mq_putmessage(char msgtype, const char *s, size_t len)
                                SendProcSignal(pq_mq_parallel_leader_pid,
                                                           PROCSIG_PARALLEL_APPLY_MESSAGE,
                                                           pq_mq_parallel_leader_proc_number);
+                       else if (AmRepackWorker())
+                               SendProcSignal(pq_mq_parallel_leader_pid,
+                                                          PROCSIG_REPACK_MESSAGE,
+                                                          pq_mq_parallel_leader_proc_number);
                        else
                        {
                                Assert(IsParallelWorker());
index 4f5292d8f88431e945655bf1d30ba7720f1a21bc..f737d799c610f952c9a059fe8da1776f45feafc6 100644 (file)
@@ -219,5 +219,6 @@ pg_test_mod_args = pg_mod_args + {
 subdir('jit/llvm')
 subdir('replication/libpqwalreceiver')
 subdir('replication/pgoutput')
+subdir('replication/pgrepack')
 subdir('snowball')
 subdir('utils/mb/conversion_procs')
index 0992b9b63536bd881e215c803ecc259bbff68fc4..3914d22a51479572d0ee91b563bc8e53302f8b0e 100644 (file)
@@ -13,6 +13,7 @@
 #include "postgres.h"
 
 #include "access/parallel.h"
+#include "commands/repack.h"
 #include "libpq/pqsignal.h"
 #include "miscadmin.h"
 #include "pgstat.h"
@@ -146,6 +147,10 @@ static const struct
                .fn_name = "ParallelWorkerMain",
                .fn_addr = ParallelWorkerMain
        },
+       {
+               .fn_name = "RepackWorkerMain",
+               .fn_addr = RepackWorkerMain
+       },
        {
                .fn_name = "SequenceSyncWorkerMain",
                .fn_addr = SequenceSyncWorkerMain
index 57aaef57c61ebd2592aedd5b574a848bfc8f5b29..c9fea8cad2829cec5cc78ec2927df2cfb828076d 100644 (file)
@@ -33,6 +33,7 @@
 #include "access/xlogreader.h"
 #include "access/xlogrecord.h"
 #include "catalog/pg_control.h"
+#include "commands/repack.h"
 #include "replication/decode.h"
 #include "replication/logical.h"
 #include "replication/message.h"
@@ -436,7 +437,8 @@ heap2_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
        {
                case XLOG_HEAP2_MULTI_INSERT:
                        if (SnapBuildProcessChange(builder, xid, buf->origptr) &&
-                               !ctx->fast_forward)
+                               !ctx->fast_forward &&
+                               !change_useless_for_repack(buf))
                                DecodeMultiInsert(ctx, buf);
                        break;
                case XLOG_HEAP2_NEW_CID:
@@ -498,7 +500,8 @@ heap_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
        {
                case XLOG_HEAP_INSERT:
                        if (SnapBuildProcessChange(builder, xid, buf->origptr) &&
-                               !ctx->fast_forward)
+                               !ctx->fast_forward &&
+                               !change_useless_for_repack(buf))
                                DecodeInsert(ctx, buf);
                        break;
 
@@ -510,19 +513,22 @@ heap_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
                case XLOG_HEAP_HOT_UPDATE:
                case XLOG_HEAP_UPDATE:
                        if (SnapBuildProcessChange(builder, xid, buf->origptr) &&
-                               !ctx->fast_forward)
+                               !ctx->fast_forward &&
+                               !change_useless_for_repack(buf))
                                DecodeUpdate(ctx, buf);
                        break;
 
                case XLOG_HEAP_DELETE:
                        if (SnapBuildProcessChange(builder, xid, buf->origptr) &&
-                               !ctx->fast_forward)
+                               !ctx->fast_forward &&
+                               !change_useless_for_repack(buf))
                                DecodeDelete(ctx, buf);
                        break;
 
                case XLOG_HEAP_TRUNCATE:
                        if (SnapBuildProcessChange(builder, xid, buf->origptr) &&
-                               !ctx->fast_forward)
+                               !ctx->fast_forward &&
+                               !change_useless_for_repack(buf))
                                DecodeTruncate(ctx, buf);
                        break;
 
@@ -538,7 +544,8 @@ heap_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
 
                case XLOG_HEAP_CONFIRM:
                        if (SnapBuildProcessChange(builder, xid, buf->origptr) &&
-                               !ctx->fast_forward)
+                               !ctx->fast_forward &&
+                               !change_useless_for_repack(buf))
                                DecodeSpecConfirm(ctx, buf);
                        break;
 
@@ -1035,6 +1042,15 @@ DecodeDelete(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
 
        xlrec = (xl_heap_delete *) XLogRecGetData(r);
 
+       /*
+        * Skip changes that were marked as ignorable at origin.
+        *
+        * (This is used for changes that affect relations not visible to other
+        * transactions, such as the transient table during concurrent repack.)
+        */
+       if (xlrec->flags & XLH_DELETE_NO_LOGICAL)
+               return;
+
        /* only interested in our database */
        XLogRecGetBlockTag(r, 0, &target_locator, NULL, NULL);
        if (target_locator.dbOid != ctx->slot->data.database)
diff --git a/src/backend/replication/pgrepack/Makefile b/src/backend/replication/pgrepack/Makefile
new file mode 100644 (file)
index 0000000..d3d3140
--- /dev/null
@@ -0,0 +1,32 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+#    Makefile for src/backend/replication/pgrepack
+#
+# IDENTIFICATION
+#    src/backend/replication/pgrepack/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/replication/pgrepack
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = \
+	$(WIN32RES) \
+	pgrepack.o
+PGFILEDESC = "pgrepack - logical replication output plugin for REPACK"
+NAME = pgrepack
+
+all: all-shared-lib
+
+include $(top_srcdir)/src/Makefile.shlib
+
+install: all installdirs install-lib
+
+installdirs: installdirs-lib
+
+uninstall: uninstall-lib
+
+clean distclean: clean-lib
+	rm -f $(OBJS)
diff --git a/src/backend/replication/pgrepack/meson.build b/src/backend/replication/pgrepack/meson.build
new file mode 100644 (file)
index 0000000..2c7de96
--- /dev/null
@@ -0,0 +1,18 @@
+# Copyright (c) 2026, PostgreSQL Global Development Group
+
+pgrepack_sources = files(
+  'pgrepack.c',
+)
+
+if host_system == 'windows'
+  pgrepack_sources += rc_lib_gen.process(win32ver_rc, extra_args: [
+    '--NAME', 'pgrepack',
+    '--FILEDESC', 'pgrepack - logical replication output plugin for REPACK',])
+endif
+
+# Build the output plugin as a dynamically loadable module.
+pgrepack = shared_module('pgrepack',
+  pgrepack_sources,
+  kwargs: pg_mod_args,
+)
+
+backend_targets += pgrepack
diff --git a/src/backend/replication/pgrepack/pgrepack.c b/src/backend/replication/pgrepack/pgrepack.c
new file mode 100644 (file)
index 0000000..457b651
--- /dev/null
@@ -0,0 +1,287 @@
+/*-------------------------------------------------------------------------
+ *
+ * pgrepack.c
+ *             Logical Replication output plugin for REPACK command
+ *
+ * Copyright (c) 2026, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *               src/backend/replication/pgrepack/pgrepack.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/detoast.h"
+#include "commands/repack_internal.h"
+#include "replication/snapbuild.h"
+#include "utils/memutils.h"
+
+PG_MODULE_MAGIC;
+
+static void repack_startup(LogicalDecodingContext *ctx,
+                                                  OutputPluginOptions *opt, bool is_init);
+static void repack_shutdown(LogicalDecodingContext *ctx);
+static void repack_begin_txn(LogicalDecodingContext *ctx,
+                                                        ReorderBufferTXN *txn);
+static void repack_commit_txn(LogicalDecodingContext *ctx,
+                                                         ReorderBufferTXN *txn, XLogRecPtr commit_lsn);
+static void repack_process_change(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
+                                                                 Relation rel, ReorderBufferChange *change);
+static void repack_store_change(LogicalDecodingContext *ctx, Relation relation,
+                                                               ConcurrentChangeKind kind, HeapTuple tuple);
+
+void
+_PG_output_plugin_init(OutputPluginCallbacks *cb)
+{
+       /* Wire up the callbacks implemented by this plugin. */
+       cb->begin_cb = repack_begin_txn;
+       cb->change_cb = repack_process_change;
+       cb->commit_cb = repack_commit_txn;
+       cb->startup_cb = repack_startup;
+       cb->shutdown_cb = repack_shutdown;
+}
+
+
+/* Startup callback: initialize this plugin. */
+static void
+repack_startup(LogicalDecodingContext *ctx, OutputPluginOptions *opt,
+                          bool is_init)
+{
+       /* This plugin keeps no private state. */
+       ctx->output_plugin_private = NULL;
+
+       /* Likely moot, since we don't use the SQL interface ... */
+       opt->output_type = OUTPUT_PLUGIN_BINARY_OUTPUT;
+
+       /* No options are recognized; reject any that were supplied. */
+       if (ctx->output_plugin_options != NIL)
+               ereport(ERROR,
+                               errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                               errmsg("this plugin does not expect any options"));
+}
+
+/* Shutdown callback: nothing to release, since startup allocated no state. */
+static void
+repack_shutdown(LogicalDecodingContext *ctx)
+{
+}
+
+/*
+ * As we don't release the slot during processing of particular table, there's
+ * no room for SQL interface, even for debugging purposes. Therefore we need
+ * neither OutputPluginPrepareWrite() nor OutputPluginWrite() in the plugin
+ * callbacks. (Although we might want to write custom callbacks, this API
+ * seems to be unnecessarily generic for our purposes.)
+ */
+
+/* BEGIN callback */
+static void
+repack_begin_txn(LogicalDecodingContext *ctx, ReorderBufferTXN *txn)
+{
+       /* Transaction boundaries are of no interest here; nothing to do. */
+}
+
+/* COMMIT callback */
+static void
+repack_commit_txn(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
+                                 XLogRecPtr commit_lsn)
+{
+       /* Transaction boundaries are of no interest here; nothing to do. */
+}
+
+/*
+ * Callback for individual changed tuples
+ */
+static void
+repack_process_change(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
+                                         Relation relation, ReorderBufferChange *change)
+{
+       RepackDecodingState *dstate PG_USED_FOR_ASSERTS_ONLY =
+               (RepackDecodingState *) ctx->output_writer_private;
+
+       /* Changes of other relations should not have been decoded at all. */
+       Assert(RelationGetRelid(relation) == dstate->relid);
+
+       /* Dispatch on the kind of change. */
+       switch (change->action)
+       {
+               case REORDER_BUFFER_CHANGE_INSERT:
+                       {
+                               HeapTuple       newtuple = change->data.tp.newtuple;
+
+                               /*
+                                * The identity checks in the main function should have made
+                                * this impossible.
+                                */
+                               if (newtuple == NULL)
+                                       elog(ERROR, "incomplete insert info");
+
+                               repack_store_change(ctx, relation, CHANGE_INSERT, newtuple);
+                               break;
+                       }
+               case REORDER_BUFFER_CHANGE_UPDATE:
+                       {
+                               HeapTuple       oldtuple = change->data.tp.oldtuple;
+                               HeapTuple       newtuple = change->data.tp.newtuple;
+
+                               if (newtuple == NULL)
+                                       elog(ERROR, "incomplete update info");
+
+                               /* The old tuple may legitimately be absent. */
+                               if (oldtuple != NULL)
+                                       repack_store_change(ctx, relation, CHANGE_UPDATE_OLD,
+                                                                               oldtuple);
+
+                               repack_store_change(ctx, relation, CHANGE_UPDATE_NEW,
+                                                                       newtuple);
+                               break;
+                       }
+               case REORDER_BUFFER_CHANGE_DELETE:
+                       {
+                               HeapTuple       oldtuple = change->data.tp.oldtuple;
+
+                               if (oldtuple == NULL)
+                                       elog(ERROR, "incomplete delete info");
+
+                               repack_store_change(ctx, relation, CHANGE_DELETE, oldtuple);
+                               break;
+                       }
+               default:
+
+                       /*
+                        * Should not come here.  This includes TRUNCATE of the table
+                        * being processed: heap_decode() cannot check the file locator
+                        * easily, but we assume that TRUNCATE takes AccessExclusiveLock
+                        * on the table, so it should not occur during REPACK
+                        * (CONCURRENTLY).
+                        */
+                       Assert(false);
+                       break;
+       }
+}
+
+/*
+ * Write the given tuple, with the given change kind, to the repack spill
+ * file.  Later, the repack decoding worker can read these and replay
+ * the operations on the new copy of the table.
+ *
+ * For each change affecting the table being repacked, we store enough
+ * information about each tuple in it, so that it can be replayed in the
+ * new copy of the table.
+ *
+ * Record layout in the spill file: one kind byte, then the heap tuple
+ * (length followed by raw t_data), then the count of external-indirect
+ * attributes, then the detoasted contents of each such attribute.
+ */
+static void
+repack_store_change(LogicalDecodingContext *ctx, Relation relation,
+                                       ConcurrentChangeKind kind, HeapTuple tuple)
+{
+       RepackDecodingState *dstate;
+       MemoryContext oldcxt;
+       BufFile    *file;
+       List       *attrs_ext = NIL;
+       int                     natt_ext;
+
+       dstate = (RepackDecodingState *) ctx->output_writer_private;
+       file = dstate->file;
+
+       /*
+        * Store the change kind as a single byte.  NOTE(review): this relies on
+        * the first byte of the enum carrying the value (true on little-endian
+        * builds) and on the reader consuming exactly one byte -- verify
+        * against the reading side.
+        */
+       BufFileWrite(file, &kind, 1);
+
+       /* Use a frequently-reset context to avoid dealing with leaks manually */
+       oldcxt = MemoryContextSwitchTo(dstate->change_cxt);
+
+       /*
+        * If the tuple contains "external indirect" attributes, we need to write
+        * the contents to the file because we have no control over that memory.
+        */
+       if (HeapTupleHasExternal(tuple))
+       {
+               TupleDesc       desc = RelationGetDescr(relation);
+               TupleTableSlot *slot;
+
+               /*
+                * Initialize the slot, if not done already.  It is allocated in the
+                * long-lived worker context and resource owner (not change_cxt, which
+                * is reset below), so it can be reused for subsequent changes.
+                */
+               if (dstate->slot == NULL)
+               {
+                       ResourceOwner saveResourceOwner;
+
+                       MemoryContextSwitchTo(dstate->worker_cxt);
+                       saveResourceOwner = CurrentResourceOwner;
+                       CurrentResourceOwner = dstate->worker_resowner;
+                       dstate->slot = MakeSingleTupleTableSlot(desc, &TTSOpsHeapTuple);
+                       MemoryContextSwitchTo(dstate->change_cxt);
+                       CurrentResourceOwner = saveResourceOwner;
+               }
+
+               slot = dstate->slot;
+               /* shouldFree=false: the caller still owns the tuple's memory */
+               ExecStoreHeapTuple(tuple, slot, false);
+
+               /*
+                * Loop over all attributes, and find out which ones we need to spill
+                * separately, to wit: each one that's a non-null varlena and stored
+                * out of line.
+                */
+               for (int i = 0; i < desc->natts; i++)
+               {
+                       CompactAttribute *attr = TupleDescCompactAttr(desc, i);
+                       varlena    *varlen;
+
+                       /* attlen == -1 identifies varlena attributes */
+                       if (attr->attisdropped || attr->attlen != -1 ||
+                               slot_attisnull(slot, i + 1))
+                               continue;
+
+                       /* Deform the tuple far enough to reach this attribute. */
+                       slot_getsomeattrs(slot, i + 1);
+
+                       /*
+                        * This is a non-null varlena datum, but we only care if it's
+                        * out-of-line
+                        */
+                       varlen = (varlena *) DatumGetPointer(slot->tts_values[i]);
+                       if (!VARATT_IS_EXTERNAL(varlen))
+                               continue;
+
+                       /*
+                        * We spill any indirect-external attributes separately from the
+                        * heap tuple.  Anything else is written as is.
+                        */
+                       if (VARATT_IS_EXTERNAL_INDIRECT(varlen))
+                               attrs_ext = lappend(attrs_ext, varlen);
+                       else
+                       {
+                               /*
+                                * Logical decoding should not produce "external expanded"
+                                * attributes (those actually should never appear on disk), so
+                                * only TOASTed attribute can be seen here.
+                                *
+                                * We get here if the table has external values but only
+                                * in-line values are being updated now.
+                                */
+                               Assert(VARATT_IS_EXTERNAL_ONDISK(varlen));
+                       }
+               }
+
+               ExecClearTuple(slot);
+       }
+
+       /*
+        * Write the original heap tuple, prefixed by its length.  Note that the
+        * external-toast tag for each toasted attribute will be present in what
+        * we write, so that we know where to restore each one later.
+        */
+       BufFileWrite(file, &tuple->t_len, sizeof(tuple->t_len));
+       BufFileWrite(file, tuple->t_data, tuple->t_len);
+
+       /* Then, write the number of external attributes we found. */
+       natt_ext = list_length(attrs_ext);
+       BufFileWrite(file, &natt_ext, sizeof(natt_ext));
+
+       /* Finally, the attributes themselves, if any */
+       foreach_ptr(varlena, attr_val, attrs_ext)
+       {
+               attr_val = detoast_external_attr(attr_val);
+               BufFileWrite(file, attr_val, VARSIZE_ANY(attr_val));
+               /* These attributes could be large, so free them right away */
+               pfree(attr_val);
+       }
+
+       /* Cleanup: restore caller's context and discard per-change allocations. */
+       MemoryContextSwitchTo(oldcxt);
+       MemoryContextReset(dstate->change_cxt);
+}
index 3c44a2fd082a56c2e9fc9c6716449cb745456d01..4e3ee27a058e98ed6250faed6868bcc70f3f2f61 100644 (file)
@@ -707,6 +707,9 @@ procsignal_sigusr1_handler(SIGNAL_ARGS)
        if (CheckProcSignal(PROCSIG_PARALLEL_APPLY_MESSAGE))
                HandleParallelApplyMessageInterrupt();
 
+       if (CheckProcSignal(PROCSIG_REPACK_MESSAGE))
+               HandleRepackMessageInterrupt();
+
        if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT))
                HandleRecoveryConflictInterrupt();
 
index 808aa145b352e59cceee95eee0b38cbca7fde2f2..cb353f2ed462b7d2c6213076641952c3c64276a9 100644 (file)
@@ -3578,6 +3578,9 @@ ProcessInterrupts(void)
 
        if (ParallelApplyMessagePending)
                ProcessParallelApplyMessages();
+
+       if (RepackMessagePending)
+               ProcessRepackMessages();
 }
 
 /*
index 0a6d16f8154cb0748977ecb2f6eec5f374c1e86e..7bda5298558b68b667a5d2f84fcb77fe1aaf822b 100644 (file)
@@ -156,6 +156,7 @@ RECOVERY_CONFLICT_SNAPSHOT  "Waiting for recovery conflict resolution for a vacuu
 RECOVERY_CONFLICT_TABLESPACE   "Waiting for recovery conflict resolution for dropping a tablespace."
 RECOVERY_END_COMMAND   "Waiting for <xref linkend="guc-recovery-end-command"/> to complete."
 RECOVERY_PAUSE "Waiting for recovery to be resumed."
+REPACK_WORKER_EXPORT   "Waiting for decoding worker to export a new output file."
 REPLICATION_ORIGIN_DROP        "Waiting for a replication origin to become inactive so it can be dropped."
 REPLICATION_SLOT_DROP  "Waiting for a replication slot to become inactive so it can be dropped."
 RESTORE_COMMAND        "Waiting for <xref linkend="guc-restore-command"/> to complete."
index ed15c1b4f836cd7aa7fc779362d05314e423fda4..9990f818942ecfd622773903e56690b3960497bc 100644 (file)
@@ -5236,8 +5236,8 @@ match_previous_words(int pattern_id,
                 * one word, so the above test is correct.
                 */
                if (ends_with(prev_wd, '(') || ends_with(prev_wd, ','))
-                       COMPLETE_WITH("ANALYZE", "VERBOSE");
-               else if (TailMatches("ANALYZE", "VERBOSE"))
+                       COMPLETE_WITH("ANALYZE", "CONCURRENTLY", "VERBOSE");
+               else if (TailMatches("ANALYZE", "CONCURRENTLY", "VERBOSE"))
                        COMPLETE_WITH("ON", "OFF");
        }
 
index 516806fcca2233857ce3f140e8bdd7c5156856b2..fdca7d821c87cdb415ebf023689fa929e61e7e39 100644 (file)
 #define XLH_DELETE_CONTAINS_OLD_KEY                            (1<<2)
 #define XLH_DELETE_IS_SUPER                                            (1<<3)
 #define XLH_DELETE_IS_PARTITION_MOVE                   (1<<4)
+/* See heap_delete() */
+#define XLH_DELETE_NO_LOGICAL                                  (1<<5)
 
 /* convenience macro for checking whether any form of old tuple was logged */
 #define XLH_DELETE_CONTAINS_OLD                                                \
index 4647785fd353a2ed2085f5df3e085bf0392050b4..a21c7db543904cab899cd23ed6c16c1b01e00e3d 100644 (file)
@@ -284,9 +284,10 @@ typedef struct TM_IndexDeleteOp
 
 /* "options" flag bits for table_tuple_delete */
 #define TABLE_DELETE_CHANGING_PARTITION                        (1 << 0)
+#define TABLE_DELETE_NO_LOGICAL                                        (1 << 1)
 
 /* "options" flag bits for table_tuple_update */
-/* XXX none at present */
+#define TABLE_UPDATE_NO_LOGICAL                                        (1 << 0)
 
 /* flag bits for table_tuple_lock */
 /* Follow tuples whose update is in progress if lock modes don't conflict  */
@@ -662,6 +663,7 @@ typedef struct TableAmRoutine
                                                                                          Relation OldIndex,
                                                                                          bool use_sort,
                                                                                          TransactionId OldestXmin,
+                                                                                         Snapshot snapshot,
                                                                                          TransactionId *xid_cutoff,
                                                                                          MultiXactId *multi_cutoff,
                                                                                          double *num_tuples,
@@ -1563,7 +1565,12 @@ table_tuple_delete(Relation rel, ItemPointer tid, CommandId cid,
  *             cmax/cmin if successful)
  *     options - bitmask of options.  No values are currently recognized.
  *     crosscheck - if not InvalidSnapshot, also check old tuple against this
- *     wait - true if should wait for any conflicting update to commit/abort
+ *     options - These allow the caller to specify options that may change the
+ *     behavior of the AM. The AM will ignore options that it does not support.
+ *             TABLE_UPDATE_WAIT -- set if should wait for any conflicting update to
+ *             commit/abort
+ *             TABLE_UPDATE_NO_LOGICAL -- force-disables the emitting of logical
+ *             decoding information for the tuple.
  *
  * Output parameters:
  *     slot - newly constructed tuple data to store
@@ -1725,6 +1732,8 @@ table_relation_copy_data(Relation rel, const RelFileLocator *newrlocator)
  *   not needed for the relation's AM
  * - *xid_cutoff - ditto
  * - *multi_cutoff - ditto
+ * - snapshot - if != NULL, ignore data changes done by transactions that this
+ *      (MVCC) snapshot considers still in-progress or in the future.
  *
  * Output parameters:
  * - *xid_cutoff - rel's new relfrozenxid value, may be invalid
@@ -1737,6 +1746,7 @@ table_relation_copy_for_cluster(Relation OldTable, Relation NewTable,
                                                                Relation OldIndex,
                                                                bool use_sort,
                                                                TransactionId OldestXmin,
+                                                               Snapshot snapshot,
                                                                TransactionId *xid_cutoff,
                                                                MultiXactId *multi_cutoff,
                                                                double *num_tuples,
@@ -1745,6 +1755,7 @@ table_relation_copy_for_cluster(Relation OldTable, Relation NewTable,
 {
        OldTable->rd_tableam->relation_copy_for_cluster(OldTable, NewTable, OldIndex,
                                                                                                        use_sort, OldestXmin,
+                                                                                                       snapshot,
                                                                                                        xid_cutoff, multi_cutoff,
                                                                                                        num_tuples, tups_vacuumed,
                                                                                                        tups_recently_dead);
index 67948667a97a16618bb88be32ea84e38bd7029d2..2a12920c75f2c103aee8b8ad12fd269581855509 100644 (file)
 #define PROGRESS_REPACK_PHASE                                  1
 #define PROGRESS_REPACK_INDEX_RELID                            2
 #define PROGRESS_REPACK_HEAP_TUPLES_SCANNED            3
-#define PROGRESS_REPACK_HEAP_TUPLES_WRITTEN            4
-#define PROGRESS_REPACK_TOTAL_HEAP_BLKS                        5
-#define PROGRESS_REPACK_HEAP_BLKS_SCANNED              6
-#define PROGRESS_REPACK_INDEX_REBUILD_COUNT            7
+#define PROGRESS_REPACK_HEAP_TUPLES_INSERTED   4
+#define PROGRESS_REPACK_HEAP_TUPLES_UPDATED            5
+#define PROGRESS_REPACK_HEAP_TUPLES_DELETED            6
+#define PROGRESS_REPACK_TOTAL_HEAP_BLKS                        7
+#define PROGRESS_REPACK_HEAP_BLKS_SCANNED              8
+#define PROGRESS_REPACK_INDEX_REBUILD_COUNT            9
 
 /*
  * Phases of repack (as advertised via PROGRESS_REPACK_PHASE).
 #define PROGRESS_REPACK_PHASE_INDEX_SCAN_HEAP  2
 #define PROGRESS_REPACK_PHASE_SORT_TUPLES              3
 #define PROGRESS_REPACK_PHASE_WRITE_NEW_HEAP   4
-#define PROGRESS_REPACK_PHASE_SWAP_REL_FILES   5
-#define PROGRESS_REPACK_PHASE_REBUILD_INDEX            6
-#define PROGRESS_REPACK_PHASE_FINAL_CLEANUP            7
+#define PROGRESS_REPACK_PHASE_CATCH_UP                 5
+#define PROGRESS_REPACK_PHASE_SWAP_REL_FILES   6
+#define PROGRESS_REPACK_PHASE_REBUILD_INDEX            7
+#define PROGRESS_REPACK_PHASE_FINAL_CLEANUP            8
 
 /* Progress parameters for CREATE INDEX */
 /* 3, 4 and 5 reserved for "waitfor" metrics */
index 85061158b0c8c6bb7f9ca39777607d655142cec3..fd16e74b179fadb38c6d2c61bd26ba4e8f776b8a 100644 (file)
@@ -13,6 +13,8 @@
 #ifndef REPACK_H
 #define REPACK_H
 
+#include <signal.h>
+
 #include "nodes/parsenodes.h"
 #include "parser/parse_node.h"
 #include "storage/lockdefs.h"
@@ -25,6 +27,7 @@
 #define CLUOPT_RECHECK_ISCLUSTERED 0x04 /* recheck relation state for
                                                                                 * indisclustered */
 #define CLUOPT_ANALYZE 0x08            /* do an ANALYZE */
+#define CLUOPT_CONCURRENT 0x10 /* allow concurrent data changes */
 
 /* options for CLUSTER */
 typedef struct ClusterParams
@@ -32,11 +35,13 @@ typedef struct ClusterParams
        uint32          options;                /* bitmask of CLUOPT_* */
 } ClusterParams;
 
+extern PGDLLIMPORT volatile sig_atomic_t RepackMessagePending;
+
 
 extern void ExecRepack(ParseState *pstate, RepackStmt *stmt, bool isTopLevel);
 
 extern void cluster_rel(RepackCommand command, Relation OldHeap, Oid indexOid,
-                                               ClusterParams *params);
+                                               ClusterParams *params, bool isTopLevel);
 extern void check_index_is_clusterable(Relation OldHeap, Oid indexOid,
                                                                           LOCKMODE lockmode);
 extern void mark_index_clustered(Relation rel, Oid indexOid, bool is_internal);
@@ -48,8 +53,16 @@ extern void finish_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap,
                                                         bool swap_toast_by_content,
                                                         bool check_constraints,
                                                         bool is_internal,
+                                                        bool reindex,
                                                         TransactionId frozenXid,
                                                         MultiXactId cutoffMulti,
                                                         char newrelpersistence);
 
+extern void HandleRepackMessageInterrupt(void);
+extern void ProcessRepackMessages(void);
+
+/* in repack_worker.c */
+extern void RepackWorkerMain(Datum main_arg);
+extern bool AmRepackWorker(void);
+
 #endif                                                 /* REPACK_H */
diff --git a/src/include/commands/repack_internal.h b/src/include/commands/repack_internal.h
new file mode 100644 (file)
index 0000000..3ff6444
--- /dev/null
@@ -0,0 +1,125 @@
+/*-------------------------------------------------------------------------
+ *
+ * repack_internal.h
+ *       header for REPACK internals
+ *
+ * Copyright (c) 2026, PostgreSQL Global Development Group
+ *
+ * src/include/commands/repack_internal.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef REPACK_INTERNAL_H
+#define REPACK_INTERNAL_H
+
+#include "nodes/execnodes.h"
+#include "replication/decode.h"
+#include "postmaster/bgworker.h"
+#include "replication/logical.h"
+#include "storage/buffile.h"
+#include "storage/sharedfileset.h"
+#include "storage/shm_mq.h"
+#include "utils/resowner.h"
+
+/*
+ * The type of a change stored in the output files.
+ */
+typedef char ConcurrentChangeKind;
+
+#define                CHANGE_INSERT           'i'
+#define                CHANGE_UPDATE_OLD       'u'
+#define                CHANGE_UPDATE_NEW       'U'
+#define                CHANGE_DELETE           'd'
+
+/*
+ * Logical decoding state.
+ *
+ * The output plugin uses it to store the data changes that it decodes from
+ * WAL while the table contents are being copied to new storage.
+ */
+typedef struct RepackDecodingState
+{
+#ifdef USE_ASSERT_CHECKING
+       /* The relation whose changes we're decoding. */
+       Oid                     relid;
+#endif
+
+       /* Per-change memory context. */
+       MemoryContext change_cxt;
+
+       /* A tuple slot used to pass tuples back and forth */
+       TupleTableSlot *slot;
+
+       /*
+        * Memory context and resource owner of the decoding worker's transaction.
+        */
+       MemoryContext worker_cxt;
+       ResourceOwner worker_resowner;
+
+       /* The current output file. */
+       BufFile    *file;
+} RepackDecodingState;
+
+/*
+ * Shared memory used for communication between the backend running REPACK and
+ * the worker that performs logical decoding of data changes.
+ */
+typedef struct DecodingWorkerShared
+{
+       /* Is the decoding initialized? */
+       bool            initialized;
+
+       /*
+        * Once the worker has reached this LSN, it should close the current
+        * output file and either create a new one or exit, according to the field
+        * 'done'. If the value is InvalidXLogRecPtr, the worker should decode all
+        * the WAL available and keep checking this field. It is ok if the worker
+        * had already decoded records whose LSN is >= lsn_upto before this field
+        * has been set.
+        */
+       XLogRecPtr      lsn_upto;
+
+       /* Exit after closing the current file? */
+       bool            done;
+
+       /* The output is stored here. */
+       SharedFileSet sfs;
+
+       /* Number of the last file exported by the worker. */
+       int                     last_exported;
+
+       /* Synchronize access to the fields above. */
+       slock_t         mutex;
+
+       /* Database to connect to. */
+       Oid                     dbid;
+
+       /* Role to connect as. */
+       Oid                     roleid;
+
+       /* Relation whose data changes are to be decoded. */
+       Oid                     relid;
+
+       /* CV the backend waits on */
+       ConditionVariable cv;
+
+       /* Info to signal the backend. */
+       PGPROC     *backend_proc;
+       pid_t           backend_pid;
+       ProcNumber      backend_proc_number;
+       dsm_segment *dsm_seg;
+
+       /*
+        * Memory the queue is located in.
+        *
+        * For considerations on the value see the comments of
+        * PARALLEL_ERROR_QUEUE_SIZE.
+        */
+#define REPACK_ERROR_QUEUE_SIZE                        16384
+       char            error_queue[FLEXIBLE_ARRAY_MEMBER];
+} DecodingWorkerShared;
+
+extern void DecodingWorkerFileName(char *fname, Oid relid, uint32 seq);
+
+
+#endif                                                 /* REPACK_INTERNAL_H */
index 107e43ef750cd8815e0167c3a21f8f61d6c64a3d..2ef177902058df105a917061643babd08a8b9881 100644 (file)
@@ -32,4 +32,8 @@ extern void logicalmsg_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf
 extern void LogicalDecodingProcessRecord(LogicalDecodingContext *ctx,
                                                                                 XLogReaderState *record);
 
+/* in commands/repack_worker.c */
+extern bool change_useless_for_repack(XLogRecordBuffer *buf);
+
+
 #endif
index b73bb5618e6f6c938ac316556011898e93167fb2..3785b009808d88fa87e83ce6774736a9d98c5d16 100644 (file)
@@ -36,8 +36,8 @@ typedef int LOCKMODE;
 #define AccessShareLock                        1       /* SELECT */
 #define RowShareLock                   2       /* SELECT FOR UPDATE/FOR SHARE */
 #define RowExclusiveLock               3       /* INSERT, UPDATE, DELETE */
-#define ShareUpdateExclusiveLock 4     /* VACUUM (non-FULL), ANALYZE, CREATE
-                                                                        * INDEX CONCURRENTLY */
+#define ShareUpdateExclusiveLock 4     /* VACUUM (non-exclusive), ANALYZE, CREATE
+                                                                        * INDEX CONCURRENTLY, REPACK CONCURRENTLY */
 #define ShareLock                              5       /* CREATE INDEX (WITHOUT CONCURRENTLY) */
 #define ShareRowExclusiveLock  6       /* like EXCLUSIVE MODE, but allows ROW
                                                                         * SHARE */
index 7f855971b5a7ebb0cf04629c67d982e403f68607..480c02203b0cba6c0380490c95348d946acde474 100644 (file)
@@ -36,6 +36,7 @@ typedef enum
        PROCSIG_BARRIER,                        /* global barrier interrupt  */
        PROCSIG_LOG_MEMORY_CONTEXT, /* ask backend to log the memory contexts */
        PROCSIG_PARALLEL_APPLY_MESSAGE, /* Message from parallel apply workers */
+       PROCSIG_REPACK_MESSAGE,         /* Message from repack worker */
        PROCSIG_RECOVERY_CONFLICT,      /* backend is blocking recovery, check
                                                                 * PGPROC->pendingRecoveryConflicts for the
                                                                 * reason */
index 7759397263821738f8557c6cc60bb42adbfb2443..5ca6f872ae4a82c028f386fd2088f36ee8e67647 100644 (file)
@@ -16,6 +16,7 @@ ifneq (,$(findstring backend,$(subdir)))
 ifeq (,$(findstring conversion_procs,$(subdir)))
 ifeq (,$(findstring libpqwalreceiver,$(subdir)))
 ifeq (,$(findstring replication/pgoutput,$(subdir)))
+ifeq (,$(findstring replication/pgrepack,$(subdir)))
 ifeq (,$(findstring snowball,$(subdir)))
 override CPPFLAGS+= -DBUILDING_DLL
 endif
@@ -23,6 +24,7 @@ endif
 endif
 endif
 endif
+endif
 
 ifneq (,$(findstring src/common,$(subdir)))
 override CPPFLAGS+= -DBUILDING_DLL
index dc1aafa115ae04209119d8d2ade21da20e355cdb..fc12b845788a24cfa20a24bf0a897ff5acdafac8 100644 (file)
@@ -14,6 +14,7 @@ ifneq (,$(findstring backend,$(subdir)))
 ifeq (,$(findstring conversion_procs,$(subdir)))
 ifeq (,$(findstring libpqwalreceiver,$(subdir)))
 ifeq (,$(findstring replication/pgoutput,$(subdir)))
+ifeq (,$(findstring replication/pgrepack,$(subdir)))
 ifeq (,$(findstring snowball,$(subdir)))
 override CPPFLAGS+= -DBUILDING_DLL
 endif
@@ -21,6 +22,7 @@ endif
 endif
 endif
 endif
+endif
 
 ifneq (,$(findstring src/common,$(subdir)))
 override CPPFLAGS+= -DBUILDING_DLL
index a41d781f8c9a671ae2c61ebf0049552ddfd72ae2..2cd7d87c53346b34c19dc2755821dd026367dbe5 100644 (file)
@@ -14,6 +14,8 @@ REGRESS_OPTS = --dlpath=$(top_builddir)/src/test/regress
 
 ISOLATION = basic \
            inplace \
+           repack \
+           repack_toast \
            syscache-update-pruned \
            heap_lock_update
 
diff --git a/src/test/modules/injection_points/expected/repack.out b/src/test/modules/injection_points/expected/repack.out
new file mode 100644 (file)
index 0000000..b575e90
--- /dev/null
@@ -0,0 +1,113 @@
+Parsed test spec with 2 sessions
+
+starting permutation: wait_before_lock change_existing change_new change_subxact1 change_subxact2 check2 wakeup_before_lock check1
+injection_points_attach
+-----------------------
+                       
+(1 row)
+
+step wait_before_lock: 
+       REPACK (CONCURRENTLY) repack_test USING INDEX repack_test_pkey;
+ <waiting ...>
+step change_existing: 
+       UPDATE repack_test SET i=10 where i=1;
+       UPDATE repack_test SET j=20 where i=2;
+       UPDATE repack_test SET i=30 where i=3;
+       UPDATE repack_test SET i=40 where i=30;
+       DELETE FROM repack_test WHERE i=4;
+
+step change_new: 
+       INSERT INTO repack_test(i, j) VALUES (5, 5), (6, 6), (7, 7), (8, 8);
+       UPDATE repack_test SET i=50 where i=5;
+       UPDATE repack_test SET j=60 where i=6;
+       DELETE FROM repack_test WHERE i=7;
+
+step change_subxact1: 
+       BEGIN;
+       INSERT INTO repack_test(i, j) VALUES (100, 100);
+       SAVEPOINT s1;
+       UPDATE repack_test SET i=101 where i=100;
+       SAVEPOINT s2;
+       UPDATE repack_test SET i=102 where i=101;
+       COMMIT;
+
+step change_subxact2: 
+       BEGIN;
+       SAVEPOINT s1;
+       INSERT INTO repack_test(i, j) VALUES (110, 110);
+       ROLLBACK TO SAVEPOINT s1;
+       INSERT INTO repack_test(i, j) VALUES (110, 111);
+       COMMIT;
+
+step check2: 
+       INSERT INTO relfilenodes(node)
+       SELECT relfilenode FROM pg_class WHERE relname='repack_test';
+
+       SELECT i, j FROM repack_test ORDER BY i, j;
+
+       INSERT INTO data_s2(i, j)
+       SELECT i, j FROM repack_test;
+
+  i|  j
+---+---
+  2| 20
+  6| 60
+  8|  8
+ 10|  1
+ 40|  3
+ 50|  5
+102|100
+110|111
+(8 rows)
+
+step wakeup_before_lock: 
+       SELECT injection_points_wakeup('repack-concurrently-before-lock');
+
+injection_points_wakeup
+-----------------------
+                       
+(1 row)
+
+step wait_before_lock: <... completed>
+step check1: 
+       INSERT INTO relfilenodes(node)
+       SELECT relfilenode FROM pg_class WHERE relname='repack_test';
+
+       SELECT count(DISTINCT node) FROM relfilenodes;
+
+       SELECT i, j FROM repack_test ORDER BY i, j;
+
+       INSERT INTO data_s1(i, j)
+       SELECT i, j FROM repack_test;
+
+       SELECT count(*)
+       FROM data_s1 d1 FULL JOIN data_s2 d2 USING (i, j)
+       WHERE d1.i ISNULL OR d2.i ISNULL;
+
+count
+-----
+    2
+(1 row)
+
+  i|  j
+---+---
+  2| 20
+  6| 60
+  8|  8
+ 10|  1
+ 40|  3
+ 50|  5
+102|100
+110|111
+(8 rows)
+
+count
+-----
+    0
+(1 row)
+
+injection_points_detach
+-----------------------
+                       
+(1 row)
+
diff --git a/src/test/modules/injection_points/expected/repack_toast.out b/src/test/modules/injection_points/expected/repack_toast.out
new file mode 100644 (file)
index 0000000..b56dde1
--- /dev/null
@@ -0,0 +1,65 @@
+Parsed test spec with 2 sessions
+
+starting permutation: wait_before_lock change check2 wakeup_before_lock check1
+injection_points_attach
+-----------------------
+                       
+(1 row)
+
+step wait_before_lock: 
+       REPACK (CONCURRENTLY) repack_test;
+ <waiting ...>
+step change: 
+       UPDATE repack_test SET j=get_long_string() where i=2;
+       DELETE FROM repack_test WHERE i=3;
+       INSERT INTO repack_test(i, j) VALUES (4, get_long_string());
+       UPDATE repack_test SET i=3 where i=1;
+
+step check2: 
+       INSERT INTO relfilenodes(node)
+       SELECT c2.relfilenode
+       FROM pg_class c1 JOIN pg_class c2 ON c2.oid = c1.oid OR c2.oid = c1.reltoastrelid
+       WHERE c1.relname='repack_test';
+
+       INSERT INTO data_s2(i, j)
+       SELECT i, j FROM repack_test;
+
+step wakeup_before_lock: 
+       SELECT injection_points_wakeup('repack-concurrently-before-lock');
+
+injection_points_wakeup
+-----------------------
+                       
+(1 row)
+
+step wait_before_lock: <... completed>
+step check1: 
+       INSERT INTO relfilenodes(node)
+       SELECT c2.relfilenode
+       FROM pg_class c1 JOIN pg_class c2 ON c2.oid = c1.oid OR c2.oid = c1.reltoastrelid
+       WHERE c1.relname='repack_test';
+
+       SELECT count(DISTINCT node) FROM relfilenodes;
+
+       INSERT INTO data_s1(i, j)
+       SELECT i, j FROM repack_test;
+
+       SELECT count(*)
+       FROM data_s1 d1 FULL JOIN data_s2 d2 USING (i, j)
+       WHERE d1.i ISNULL OR d2.i ISNULL;
+
+count
+-----
+    4
+(1 row)
+
+count
+-----
+    0
+(1 row)
+
+injection_points_detach
+-----------------------
+                       
+(1 row)
+
index fcc85414515d84d85ee68b77fb6b30074e8f56f8..a414abb924b6393ae448db5e8078f1fd189ff65e 100644 (file)
@@ -45,6 +45,8 @@ tests += {
     'specs': [
       'basic',
       'inplace',
+      'repack',
+      'repack_toast',
       'syscache-update-pruned',
       'heap_lock_update',
     ],
diff --git a/src/test/modules/injection_points/specs/repack.spec b/src/test/modules/injection_points/specs/repack.spec
new file mode 100644 (file)
index 0000000..d727a9b
--- /dev/null
@@ -0,0 +1,142 @@
+# REPACK (CONCURRENTLY) ... USING INDEX ...;
+setup
+{
+       CREATE EXTENSION injection_points;
+
+       CREATE TABLE repack_test(i int PRIMARY KEY, j int);
+       INSERT INTO repack_test(i, j) VALUES (1, 1), (2, 2), (3, 3), (4, 4);
+
+       CREATE TABLE relfilenodes(node oid);
+
+       CREATE TABLE data_s1(i int, j int);
+       CREATE TABLE data_s2(i int, j int);
+}
+
+teardown
+{
+       DROP TABLE repack_test;
+       DROP EXTENSION injection_points;
+
+       DROP TABLE relfilenodes;
+       DROP TABLE data_s1;
+       DROP TABLE data_s2;
+}
+
+session s1
+setup
+{
+       SELECT injection_points_set_local();
+       SELECT injection_points_attach('repack-concurrently-before-lock', 'wait');
+}
+# Perform the initial load and wait for s2 to do some data changes.
+step wait_before_lock
+{
+       REPACK (CONCURRENTLY) repack_test USING INDEX repack_test_pkey;
+}
+# Check the table from the perspective of s1.
+#
+# Besides the contents, we also check that relfilenode has changed.
+
+# Have each session write the contents into a table and use FULL JOIN to check
+# if the outputs are identical.
+step check1
+{
+       INSERT INTO relfilenodes(node)
+       SELECT relfilenode FROM pg_class WHERE relname='repack_test';
+
+       SELECT count(DISTINCT node) FROM relfilenodes;
+
+       SELECT i, j FROM repack_test ORDER BY i, j;
+
+       INSERT INTO data_s1(i, j)
+       SELECT i, j FROM repack_test;
+
+       SELECT count(*)
+       FROM data_s1 d1 FULL JOIN data_s2 d2 USING (i, j)
+       WHERE d1.i ISNULL OR d2.i ISNULL;
+}
+teardown
+{
+       SELECT injection_points_detach('repack-concurrently-before-lock');
+}
+
+session s2
+# Change the existing data. UPDATE changes both key and non-key columns. Also
+# update one row twice to test whether tuple version generated by this session
+# can be found.
+step change_existing
+{
+       UPDATE repack_test SET i=10 where i=1;
+       UPDATE repack_test SET j=20 where i=2;
+       UPDATE repack_test SET i=30 where i=3;
+       UPDATE repack_test SET i=40 where i=30;
+       DELETE FROM repack_test WHERE i=4;
+}
+# Insert new rows and UPDATE / DELETE some of them. Again, update both key and
+# non-key column.
+step change_new
+{
+       INSERT INTO repack_test(i, j) VALUES (5, 5), (6, 6), (7, 7), (8, 8);
+       UPDATE repack_test SET i=50 where i=5;
+       UPDATE repack_test SET j=60 where i=6;
+       DELETE FROM repack_test WHERE i=7;
+}
+
+# When applying concurrent data changes, we should see the effects of an
+# in-progress subtransaction.
+#
+# XXX Not sure this test is useful now - it was designed for the patch that
+# preserves tuple visibility and which therefore modifies
+# TransactionIdIsCurrentTransactionId().
+step change_subxact1
+{
+       BEGIN;
+       INSERT INTO repack_test(i, j) VALUES (100, 100);
+       SAVEPOINT s1;
+       UPDATE repack_test SET i=101 where i=100;
+       SAVEPOINT s2;
+       UPDATE repack_test SET i=102 where i=101;
+       COMMIT;
+}
+
+# When applying concurrent data changes, we should not see the effects of a
+# rolled back subtransaction.
+#
+# XXX Is this test useful? See above.
+step change_subxact2
+{
+       BEGIN;
+       SAVEPOINT s1;
+       INSERT INTO repack_test(i, j) VALUES (110, 110);
+       ROLLBACK TO SAVEPOINT s1;
+       INSERT INTO repack_test(i, j) VALUES (110, 111);
+       COMMIT;
+}
+
+# Check the table from the perspective of s2.
+step check2
+{
+       INSERT INTO relfilenodes(node)
+       SELECT relfilenode FROM pg_class WHERE relname='repack_test';
+
+       SELECT i, j FROM repack_test ORDER BY i, j;
+
+       INSERT INTO data_s2(i, j)
+       SELECT i, j FROM repack_test;
+}
+step wakeup_before_lock
+{
+       SELECT injection_points_wakeup('repack-concurrently-before-lock');
+}
+
+# Test if data changes introduced while one session is performing REPACK
+# CONCURRENTLY find their way into the table.
+permutation
+       wait_before_lock
+       change_existing
+       change_new
+       change_subxact1
+       change_subxact2
+       check2
+       wakeup_before_lock
+       check1
diff --git a/src/test/modules/injection_points/specs/repack_toast.spec b/src/test/modules/injection_points/specs/repack_toast.spec
new file mode 100644 (file)
index 0000000..b878b19
--- /dev/null
@@ -0,0 +1,112 @@
+# REPACK (CONCURRENTLY);
+#
+# Test handling of TOAST. At the same time, no tuplesort.
+setup
+{
+       CREATE EXTENSION injection_points;
+
+       -- Return a string that needs to be TOASTed.
+       CREATE FUNCTION get_long_string()
+       RETURNS text
+       LANGUAGE sql as $$
+               SELECT string_agg(chr(65 + trunc(25 * random())::int), '')
+               FROM generate_series(1, 2048) s(x);
+       $$;
+
+       CREATE TABLE repack_test(i int PRIMARY KEY, j text);
+       INSERT INTO repack_test(i, j) VALUES (1, get_long_string()),
+               (2, get_long_string()), (3, get_long_string());
+
+       CREATE TABLE relfilenodes(node oid);
+
+       CREATE TABLE data_s1(i int, j text);
+       CREATE TABLE data_s2(i int, j text);
+}
+
+teardown
+{
+       DROP TABLE repack_test;
+       DROP EXTENSION injection_points;
+       DROP FUNCTION get_long_string();
+
+       DROP TABLE relfilenodes;
+       DROP TABLE data_s1;
+       DROP TABLE data_s2;
+}
+
+session s1
+setup
+{
+       SELECT injection_points_set_local();
+       SELECT injection_points_attach('repack-concurrently-before-lock', 'wait');
+}
+# Perform the initial load and wait for s2 to do some data changes.
+step wait_before_lock
+{
+       REPACK (CONCURRENTLY) repack_test;
+}
+# Check the table from the perspective of s1.
+#
+# Besides the contents, we also check that relfilenode has changed.
+
+# Have each session write the contents into a table and use FULL JOIN to check
+# if the outputs are identical.
+step check1
+{
+       INSERT INTO relfilenodes(node)
+       SELECT c2.relfilenode
+       FROM pg_class c1 JOIN pg_class c2 ON c2.oid = c1.oid OR c2.oid = c1.reltoastrelid
+       WHERE c1.relname='repack_test';
+
+       SELECT count(DISTINCT node) FROM relfilenodes;
+
+       INSERT INTO data_s1(i, j)
+       SELECT i, j FROM repack_test;
+
+       SELECT count(*)
+       FROM data_s1 d1 FULL JOIN data_s2 d2 USING (i, j)
+       WHERE d1.i ISNULL OR d2.i ISNULL;
+}
+teardown
+{
+    SELECT injection_points_detach('repack-concurrently-before-lock');
+}
+
+session s2
+step change
+# Separately test UPDATE of both plain ("i") and TOASTed ("j") attribute. In
+# the first case, the new tuple we get from reorderbuffer.c contains "j" as a
+# TOAST pointer, which we need to update so it points to the new heap. In the
+# latter case, we receive "j" as "external indirect" value - here we test that
+# the decoding worker writes the tuple to a file correctly and that the
+# backend executing REPACK manages to restore it.
+{
+       UPDATE repack_test SET j=get_long_string() where i=2;
+       DELETE FROM repack_test WHERE i=3;
+       INSERT INTO repack_test(i, j) VALUES (4, get_long_string());
+       UPDATE repack_test SET i=3 where i=1;
+}
+# Check the table from the perspective of s2.
+step check2
+{
+       INSERT INTO relfilenodes(node)
+       SELECT c2.relfilenode
+       FROM pg_class c1 JOIN pg_class c2 ON c2.oid = c1.oid OR c2.oid = c1.reltoastrelid
+       WHERE c1.relname='repack_test';
+
+       INSERT INTO data_s2(i, j)
+       SELECT i, j FROM repack_test;
+}
+step wakeup_before_lock
+{
+       SELECT injection_points_wakeup('repack-concurrently-before-lock');
+}
+
+# Test if data changes introduced while one session is performing REPACK
+# CONCURRENTLY find their way into the table.
+permutation
+       wait_before_lock
+       change
+       check2
+       wakeup_before_lock
+       check1
index 269f163efa6f8cac187f21f32e12782a7968ee3b..6127b215a868eb428068f5773867bd820ca2a555 100644 (file)
@@ -537,6 +537,10 @@ SELECT relname, old.level, old.relkind, old.relfilenode = new.relfilenode FROM o
  clstrpart33 |     2 | r       | f
 (7 rows)
 
+-- CONCURRENTLY doesn't like partitioned tables
+REPACK (CONCURRENTLY) clstrpart;
+ERROR:  REPACK (CONCURRENTLY) is not supported for partitioned tables
+HINT:  Consider running the command on individual partitions.
 DROP TABLE clstrpart;
 -- Ownership of partitions is checked
 CREATE TABLE ptnowner(i int unique) PARTITION BY LIST (i);
@@ -802,6 +806,10 @@ ORDER BY o.relname;
  clstr_3
 (2 rows)
 
+-- concurrently
+REPACK (CONCURRENTLY) pg_class;
+ERROR:  cannot repack relation "pg_class"
+HINT:  REPACK CONCURRENTLY is not supported for catalog relations.
 -- clean up
 DROP TABLE clustertest;
 DROP TABLE clstr_1;
index 81a73c426d274fff04f7fca0c6c9a444821be9e3..45994ff0222c26096cbe9802e7c93f1a93498d12 100644 (file)
@@ -2021,7 +2021,7 @@ pg_stat_progress_cluster| SELECT pid,
     phase,
     repack_index_relid AS cluster_index_relid,
     heap_tuples_scanned,
-    heap_tuples_written,
+    (heap_tuples_inserted + heap_tuples_updated) AS heap_tuples_written,
     heap_blks_total,
     heap_blks_scanned,
     index_rebuild_count
@@ -2136,17 +2136,20 @@ pg_stat_progress_repack| SELECT s.pid,
             WHEN 2 THEN 'index scanning heap'::text
             WHEN 3 THEN 'sorting tuples'::text
             WHEN 4 THEN 'writing new heap'::text
-            WHEN 5 THEN 'swapping relation files'::text
-            WHEN 6 THEN 'rebuilding index'::text
-            WHEN 7 THEN 'performing final cleanup'::text
+            WHEN 5 THEN 'catch-up'::text
+            WHEN 6 THEN 'swapping relation files'::text
+            WHEN 7 THEN 'rebuilding index'::text
+            WHEN 8 THEN 'performing final cleanup'::text
             ELSE NULL::text
         END AS phase,
     (s.param3)::oid AS repack_index_relid,
     s.param4 AS heap_tuples_scanned,
-    s.param5 AS heap_tuples_written,
-    s.param6 AS heap_blks_total,
-    s.param7 AS heap_blks_scanned,
-    s.param8 AS index_rebuild_count
+    s.param5 AS heap_tuples_inserted,
+    s.param6 AS heap_tuples_updated,
+    s.param7 AS heap_tuples_deleted,
+    s.param8 AS heap_blks_total,
+    s.param9 AS heap_blks_scanned,
+    s.param10 AS index_rebuild_count
    FROM (pg_stat_get_progress_info('REPACK'::text) s(pid, datid, relid, param1, param2, param3, param4, param5, param6, param7, param8, param9, param10, param11, param12, param13, param14, param15, param16, param17, param18, param19, param20)
      LEFT JOIN pg_database d ON ((s.datid = d.oid)));
 pg_stat_progress_vacuum| SELECT s.pid,
index f90c6ec200b4ab6ebb0f683d565e96443c3bee98..d14063a9683754fee486999fcf37ff435d795d03 100644 (file)
@@ -248,6 +248,9 @@ REPACK clstrpart;
 CREATE TEMP TABLE new_cluster_info AS SELECT relname, level, relfilenode, relkind FROM pg_partition_tree('clstrpart'::regclass) AS tree JOIN pg_class c ON c.oid=tree.relid ;
 SELECT relname, old.level, old.relkind, old.relfilenode = new.relfilenode FROM old_cluster_info AS old JOIN new_cluster_info AS new USING (relname) ORDER BY relname COLLATE "C";
 
+-- CONCURRENTLY doesn't like partitioned tables
+REPACK (CONCURRENTLY) clstrpart;
+
 DROP TABLE clstrpart;
 
 -- Ownership of partitions is checked
@@ -383,6 +386,9 @@ JOIN relnodes_new n ON o.relname = n.relname
 WHERE o.relfilenode <> n.relfilenode
 ORDER BY o.relname;
 
+-- concurrently
+REPACK (CONCURRENTLY) pg_class;
+
 -- clean up
 DROP TABLE clustertest;
 DROP TABLE clstr_1;
index 7515682fe9f9dca6a0bd1211e437d2440d50ce20..9e6a39f560833b566a616e4f23679988293f6191 100644 (file)
@@ -430,6 +430,7 @@ CatCacheHeader
 CatalogId
 CatalogIdMapEntry
 CatalogIndexState
+ChangeContext
 ChangeVarNodes_callback
 ChangeVarNodes_context
 ChannelName
@@ -509,6 +510,7 @@ CompressFileHandle
 CompressionLocation
 CompressorState
 ComputeXidHorizonsResult
+ConcurrentChangeKind
 ConditionVariable
 ConditionVariableMinimallyPadded
 ConditionalStack
@@ -655,6 +657,8 @@ DeclareCursorStmt
 DecodedBkpBlock
 DecodedXLogRecord
 DecodingOutputState
+DecodingWorker
+DecodingWorkerShared
 DefElem
 DefElemAction
 DefaultACLInfo
@@ -1318,6 +1322,7 @@ IndexElem
 IndexFetchHeapData
 IndexFetchTableData
 IndexInfo
+IndexInsertState
 IndexList
 IndexOnlyScan
 IndexOnlyScanState
@@ -2635,6 +2640,7 @@ ReorderBufferTupleCidKey
 ReorderBufferUpdateProgressTxnCB
 ReorderTuple
 RepackCommand
+RepackDecodingState
 RepackStmt
 ReparameterizeForeignPathByChild_function
 ReplOriginId