git.ipfire.org Git - thirdparty/postgresql.git/commitdiff
pgbench: Add --continue-on-error option.
author: Fujii Masao <fujii@postgresql.org>
Fri, 7 Nov 2025 10:17:37 +0000 (19:17 +0900)
committer: Fujii Masao <fujii@postgresql.org>
Fri, 7 Nov 2025 10:17:37 +0000 (19:17 +0900)
This commit adds the --continue-on-error option, allowing pgbench clients
to continue running even when SQL statements fail for reasons other than
serialization or deadlock errors. Without this option (by default),
the clients abort in such cases, which was the only available behavior
previously.

This option is useful for benchmarks using custom scripts that may
raise errors, such as unique constraint violations, where users want
pgbench to complete the run despite individual statement failures.

Author: Rintaro Ikeda <ikedarintarof@oss.nttdata.com>
Co-authored-by: Yugo Nagata <nagata@sraoss.co.jp>
Co-authored-by: Fujii Masao <masao.fujii@gmail.com>
Reviewed-by: Stepan Neretin <slpmcf@gmail.com>
Reviewed-by: Matthias van de Meent <boekewurm+postgres@gmail.com>
Reviewed-by: Dilip Kumar <dilipbalaut@gmail.com>
Reviewed-by: Srinath Reddy Sadipiralla <srinath2133@gmail.com>
Reviewed-by: Hayato Kuroda <kuroda.hayato@fujitsu.com>
Reviewed-by: Anthonin Bonnefoy <anthonin.bonnefoy@datadoghq.com>
Reviewed-by: Chao Li <lic@highgo.com>
Discussion: https://postgr.es/m/44334231a4d214fac382a69cceb7d9fc@oss.nttdata.com

doc/src/sgml/ref/pgbench.sgml
src/bin/pgbench/pgbench.c
src/bin/pgbench/t/001_pgbench_with_server.pl

index a5edf6124437b8d6b34e8d62848e26039b3336b3..ecfc3d2f2b764271593bf85e6efeec990f592299 100644 (file)
@@ -76,9 +76,8 @@ tps = 896.967014 (without initial connection time)
   and number of transactions per client); these will be equal unless the run
   failed before completion or some SQL command(s) failed.  (In
   <option>-T</option> mode, only the actual number of transactions is printed.)
-  The next line reports the number of failed transactions due to
-  serialization or deadlock errors (see <xref linkend="failures-and-retries"/>
-  for more information).
+  The next line reports the number of failed transactions (see
+  <xref linkend="failures-and-retries"/> for more information).
   The last line reports the number of transactions per second.
  </para>
 
@@ -759,6 +758,26 @@ pgbench <optional> <replaceable>options</replaceable> </optional> <replaceable>d
       </listitem>
      </varlistentry>
 
+     <varlistentry id="pgbench-option-continue-on-error">
+      <term><option>--continue-on-error</option></term>
+      <listitem>
+       <para>
+        Allows clients to continue running even if an SQL statement fails
+        due to errors other than serialization or deadlock.  By default,
+        clients abort after such errors, but with this option enabled,
+        they proceed to the next transaction instead.  Note that
+        clients still abort even with this option if an error causes
+        the connection to fail.
+        See <xref linkend="failures-and-retries"/> for more information.
+       </para>
+       <para>
+        This option is useful when your custom script may raise errors
+        such as unique constraint violations, but you want the benchmark
+        to continue and measure performance including those failures.
+       </para>
+      </listitem>
+     </varlistentry>
+
      <varlistentry id="pgbench-option-exit-on-abort">
       <term><option>--exit-on-abort</option></term>
       <listitem>
@@ -790,6 +809,9 @@ pgbench <optional> <replaceable>options</replaceable> </optional> <replaceable>d
          <listitem>
           <para>deadlock failures;</para>
          </listitem>
+         <listitem>
+          <para>other failures;</para>
+         </listitem>
         </itemizedlist>
         See <xref linkend="failures-and-retries"/> for more information.
        </para>
@@ -2408,8 +2430,8 @@ END;
    will be reported as <literal>failed</literal>. If you use the
    <option>--failures-detailed</option> option, the
    <replaceable>time</replaceable> of the failed transaction will be reported as
-   <literal>serialization</literal> or
-   <literal>deadlock</literal> depending on the type of failure (see
+   <literal>serialization</literal>, <literal>deadlock</literal>, or
+   <literal>other</literal> depending on the type of failure (see
    <xref linkend="failures-and-retries"/> for more information).
   </para>
 
@@ -2637,6 +2659,17 @@ END;
       </para>
      </listitem>
     </varlistentry>
+
+    <varlistentry>
+     <term><replaceable>other_sql_failures</replaceable></term>
+     <listitem>
+      <para>
+       number of transactions that got an SQL error
+       (zero unless both <option>--failures-detailed</option> and
+       <option>--continue-on-error</option> are specified)
+      </para>
+     </listitem>
+    </varlistentry>
    </variablelist>
   </para>
 
@@ -2645,8 +2678,8 @@ END;
 <screen>
 <userinput>pgbench --aggregate-interval=10 --time=20 --client=10 --log --rate=1000 --latency-limit=10 --failures-detailed --max-tries=10 test</userinput>
 
-1650260552 5178 26171317 177284491527 1136 44462 2647617 7321113867 0 9866 64 7564 28340 4148 0
-1650260562 4808 25573984 220121792172 1171 62083 3037380 9666800914 0 9998 598 7392 26621 4527 0
+1650260552 5178 26171317 177284491527 1136 44462 2647617 7321113867 0 9866 64 7564 28340 4148 0 0
+1650260562 4808 25573984 220121792172 1171 62083 3037380 9666800914 0 9998 598 7392 26621 4527 0 0
 </screen>
   </para>
 
@@ -2850,10 +2883,20 @@ statement latencies in milliseconds, failures and retries:
   <para>
    A client's run is aborted in case of a serious error; for example, the
    connection with the database server was lost or the end of script was reached
-   without completing the last transaction. In addition, if execution of an SQL
-   or meta command fails for reasons other than serialization or deadlock errors,
-   the client is aborted. Otherwise, if an SQL command fails with serialization or
-   deadlock errors, the client is not aborted. In such cases, the current
+   without completing the last transaction.  The client also aborts
+   if a meta command fails, or if an SQL command fails for reasons other than
+   serialization or deadlock errors when <option>--continue-on-error</option>
+   is not specified.  With <option>--continue-on-error</option>,
+   the client does not abort on such SQL errors and instead proceeds to
+   the next transaction.  These cases are reported as
+   <literal>other failures</literal> in the output.  If the error occurs
+   in a meta command, however, the client still aborts even when this option
+   is specified.
+  </para>
+  <para>
+   If an SQL command fails due to serialization or deadlock errors, the
+   client does not abort, regardless of whether
+   <option>--continue-on-error</option> is used.  Instead, the current
    transaction is rolled back, which also includes setting the client variables
    as they were before the run of this transaction (it is assumed that one
    transaction script contains only one transaction; see
index 1515ed405ba7cb02c52fa3a08a90be4484f788ae..d8764ba6fe028dde7841cac22047dab74c64ea51 100644 (file)
@@ -402,14 +402,15 @@ typedef struct StatsData
         *   directly successful transactions (they were successfully completed on
         *                                     the first try).
         *
-        * A failed transaction is defined as unsuccessfully retried transactions.
-        * It can be one of two types:
-        *
-        * failed (the number of failed transactions) =
+        * 'failed' (the number of failed transactions) =
         *   'serialization_failures' (they got a serialization error and were not
-        *                             successfully retried) +
+        *                        successfully retried) +
         *   'deadlock_failures' (they got a deadlock error and were not
-        *                        successfully retried).
+        *                        successfully retried) +
+        *   'other_sql_failures'  (they failed on the first try or after retries
+        *                        due to an SQL error other than serialization or
+        *                        deadlock; they are counted as failed transactions
+        *                        only when --continue-on-error is specified).
         *
         * If the transaction was retried after a serialization or a deadlock
         * error this does not guarantee that this retry was successful. Thus
@@ -421,7 +422,7 @@ typedef struct StatsData
         *
         * 'retried' (number of all retried transactions) =
         *   successfully retried transactions +
-        *   failed transactions.
+        *   unsuccessfully retried transactions.
         *----------
         */
        int64           cnt;                    /* number of successful transactions, not
@@ -440,6 +441,11 @@ typedef struct StatsData
        int64           deadlock_failures;      /* number of transactions that were not
                                                                         * successfully retried after a deadlock
                                                                         * error */
+       int64           other_sql_failures; /* number of failed transactions for
+                                                                        * reasons other than
+                                                                        * serialization/deadlock failure, which
+                                                                        * is counted if --continue-on-error is
+                                                                        * specified */
        SimpleStats latency;
        SimpleStats lag;
 } StatsData;
@@ -457,6 +463,7 @@ typedef enum EStatus
 {
        ESTATUS_NO_ERROR = 0,
        ESTATUS_META_COMMAND_ERROR,
+       ESTATUS_CONN_ERROR,
 
        /* SQL errors */
        ESTATUS_SERIALIZATION_ERROR,
@@ -770,6 +777,7 @@ static int64 total_weight = 0;
 static bool verbose_errors = false; /* print verbose messages of all errors */
 
 static bool exit_on_abort = false;     /* exit when any client is aborted */
+static bool continue_on_error = false; /* continue after errors */
 
 /* Builtin test scripts */
 typedef struct BuiltinScript
@@ -949,6 +957,7 @@ usage(void)
                   "  -T, --time=NUM           duration of benchmark test in seconds\n"
                   "  -v, --vacuum-all         vacuum all four standard tables before tests\n"
                   "  --aggregate-interval=NUM aggregate data over NUM seconds\n"
+                  "  --continue-on-error      continue running after an SQL error\n"
                   "  --exit-on-abort          exit when any client is aborted\n"
                   "  --failures-detailed      report the failures grouped by basic types\n"
                   "  --log-prefix=PREFIX      prefix for transaction time log file\n"
@@ -1467,6 +1476,7 @@ initStats(StatsData *sd, pg_time_usec_t start)
        sd->retried = 0;
        sd->serialization_failures = 0;
        sd->deadlock_failures = 0;
+       sd->other_sql_failures = 0;
        initSimpleStats(&sd->latency);
        initSimpleStats(&sd->lag);
 }
@@ -1516,6 +1526,9 @@ accumStats(StatsData *stats, bool skipped, double lat, double lag,
                case ESTATUS_DEADLOCK_ERROR:
                        stats->deadlock_failures++;
                        break;
+               case ESTATUS_OTHER_SQL_ERROR:
+                       stats->other_sql_failures++;
+                       break;
                default:
                        /* internal error which should never occur */
                        pg_fatal("unexpected error status: %d", estatus);
@@ -3231,11 +3244,43 @@ sendCommand(CState *st, Command *command)
 }
 
 /*
- * Get the error status from the error code.
+ * Read and discard all available results from the connection.
+ */
+static void
+discardAvailableResults(CState *st)
+{
+       PGresult   *res = NULL;
+
+       for (;;)
+       {
+               res = PQgetResult(st->con);
+
+               /*
+                * Read and discard results until PQgetResult() returns NULL (no more
+                * results) or a connection failure is detected. If the pipeline
+                * status is PQ_PIPELINE_ABORTED, more results may still be available
+                * even after PQgetResult() returns NULL, so continue reading in that
+                * case.
+                */
+               if ((res == NULL && PQpipelineStatus(st->con) != PQ_PIPELINE_ABORTED) ||
+                       PQstatus(st->con) == CONNECTION_BAD)
+                       break;
+
+               PQclear(res);
+       }
+       PQclear(res);
+}
+
+/*
+ * Determine the error status based on the connection status and error code.
  */
 static EStatus
-getSQLErrorStatus(const char *sqlState)
+getSQLErrorStatus(CState *st, const char *sqlState)
 {
+       discardAvailableResults(st);
+       if (PQstatus(st->con) == CONNECTION_BAD)
+               return ESTATUS_CONN_ERROR;
+
        if (sqlState != NULL)
        {
                if (strcmp(sqlState, ERRCODE_T_R_SERIALIZATION_FAILURE) == 0)
@@ -3257,6 +3302,17 @@ canRetryError(EStatus estatus)
                        estatus == ESTATUS_DEADLOCK_ERROR);
 }
 
+/*
+ * Returns true if --continue-on-error is specified and this error allows
+ * processing to continue.
+ */
+static bool
+canContinueOnError(EStatus estatus)
+{
+       return (continue_on_error &&
+                       estatus == ESTATUS_OTHER_SQL_ERROR);
+}
+
 /*
  * Process query response from the backend.
  *
@@ -3375,9 +3431,9 @@ readCommandResponse(CState *st, MetaCommand meta, char *varprefix)
 
                        case PGRES_NONFATAL_ERROR:
                        case PGRES_FATAL_ERROR:
-                               st->estatus = getSQLErrorStatus(PQresultErrorField(res,
-                                                                                                                                  PG_DIAG_SQLSTATE));
-                               if (canRetryError(st->estatus))
+                               st->estatus = getSQLErrorStatus(st, PQresultErrorField(res,
+                                                                                                                                          PG_DIAG_SQLSTATE));
+                               if (canRetryError(st->estatus) || canContinueOnError(st->estatus))
                                {
                                        if (verbose_errors)
                                                commandError(st, PQresultErrorMessage(res));
@@ -3409,11 +3465,7 @@ readCommandResponse(CState *st, MetaCommand meta, char *varprefix)
 error:
        PQclear(res);
        PQclear(next_res);
-       do
-       {
-               res = PQgetResult(st->con);
-               PQclear(res);
-       } while (res);
+       discardAvailableResults(st);
 
        return false;
 }
@@ -4041,7 +4093,7 @@ advanceConnectionState(TState *thread, CState *st, StatsData *agg)
                                        if (PQpipelineStatus(st->con) != PQ_PIPELINE_ON)
                                                st->state = CSTATE_END_COMMAND;
                                }
-                               else if (canRetryError(st->estatus))
+                               else if (canRetryError(st->estatus) || canContinueOnError(st->estatus))
                                        st->state = CSTATE_ERROR;
                                else
                                        st->state = CSTATE_ABORTED;
@@ -4562,7 +4614,8 @@ static int64
 getFailures(const StatsData *stats)
 {
        return (stats->serialization_failures +
-                       stats->deadlock_failures);
+                       stats->deadlock_failures +
+                       stats->other_sql_failures);
 }
 
 /*
@@ -4582,6 +4635,8 @@ getResultString(bool skipped, EStatus estatus)
                                return "serialization";
                        case ESTATUS_DEADLOCK_ERROR:
                                return "deadlock";
+                       case ESTATUS_OTHER_SQL_ERROR:
+                               return "other";
                        default:
                                /* internal error which should never occur */
                                pg_fatal("unexpected error status: %d", estatus);
@@ -4637,6 +4692,7 @@ doLog(TState *thread, CState *st,
                        int64           skipped = 0;
                        int64           serialization_failures = 0;
                        int64           deadlock_failures = 0;
+                       int64           other_sql_failures = 0;
                        int64           retried = 0;
                        int64           retries = 0;
 
@@ -4677,10 +4733,12 @@ doLog(TState *thread, CState *st,
                        {
                                serialization_failures = agg->serialization_failures;
                                deadlock_failures = agg->deadlock_failures;
+                               other_sql_failures = agg->other_sql_failures;
                        }
-                       fprintf(logfile, " " INT64_FORMAT " " INT64_FORMAT,
+                       fprintf(logfile, " " INT64_FORMAT " " INT64_FORMAT " " INT64_FORMAT,
                                        serialization_failures,
-                                       deadlock_failures);
+                                       deadlock_failures,
+                                       other_sql_failures);
 
                        fputc('\n', logfile);
 
@@ -6319,6 +6377,7 @@ printProgressReport(TState *threads, int64 test_start, pg_time_usec_t now,
                cur.serialization_failures +=
                        threads[i].stats.serialization_failures;
                cur.deadlock_failures += threads[i].stats.deadlock_failures;
+               cur.other_sql_failures += threads[i].stats.other_sql_failures;
        }
 
        /* we count only actually executed transactions */
@@ -6461,7 +6520,8 @@ printResults(StatsData *total,
 
        /*
         * Remaining stats are nonsensical if we failed to execute any xacts due
-        * to others than serialization or deadlock errors
+        * to errors other than serialization or deadlock errors while
+        * --continue-on-error is not set.
         */
        if (total_cnt <= 0)
                return;
@@ -6477,6 +6537,9 @@ printResults(StatsData *total,
                printf("number of deadlock failures: " INT64_FORMAT " (%.3f%%)\n",
                           total->deadlock_failures,
                           100.0 * total->deadlock_failures / total_cnt);
+               printf("number of other failures: " INT64_FORMAT " (%.3f%%)\n",
+                          total->other_sql_failures,
+                          100.0 * total->other_sql_failures / total_cnt);
        }
 
        /* it can be non-zero only if max_tries is not equal to one */
@@ -6580,6 +6643,10 @@ printResults(StatsData *total,
                                                           sstats->deadlock_failures,
                                                           (100.0 * sstats->deadlock_failures /
                                                                script_total_cnt));
+                                               printf(" - number of other failures: " INT64_FORMAT " (%.3f%%)\n",
+                                                          sstats->other_sql_failures,
+                                                          (100.0 * sstats->other_sql_failures /
+                                                               script_total_cnt));
                                        }
 
                                        /*
@@ -6739,6 +6806,7 @@ main(int argc, char **argv)
                {"verbose-errors", no_argument, NULL, 15},
                {"exit-on-abort", no_argument, NULL, 16},
                {"debug", no_argument, NULL, 17},
+               {"continue-on-error", no_argument, NULL, 18},
                {NULL, 0, NULL, 0}
        };
 
@@ -7092,6 +7160,10 @@ main(int argc, char **argv)
                        case 17:                        /* debug */
                                pg_logging_increase_verbosity();
                                break;
+                       case 18:                        /* continue-on-error */
+                               benchmarking_option_set = true;
+                               continue_on_error = true;
+                               break;
                        default:
                                /* getopt_long already emitted a complaint */
                                pg_log_error_hint("Try \"%s --help\" for more information.", progname);
@@ -7447,6 +7519,7 @@ main(int argc, char **argv)
                stats.retried += thread->stats.retried;
                stats.serialization_failures += thread->stats.serialization_failures;
                stats.deadlock_failures += thread->stats.deadlock_failures;
+               stats.other_sql_failures += thread->stats.other_sql_failures;
                latency_late += thread->latency_late;
                conn_total_duration += thread->conn_duration;
 
index f820e88abe4492e616a1f233b1483fa758c8dbe6..581e9af7907322ec99fdf75e785aabed69a9d8b9 100644 (file)
@@ -1835,6 +1835,28 @@ $node->pgbench(
 # Clean up
 $node->safe_psql('postgres', 'DROP TABLE counter;');
 
+# Test --continue-on-error
+$node->safe_psql('postgres',
+       'CREATE TABLE unique_table(i int unique);');
+
+$node->pgbench(
+       '-n -t 10 --continue-on-error --failures-detailed',
+       0,
+       [
+               qr{processed: 1/10\b},
+               qr{other failures: 9\b}
+       ],
+       [],
+       'test --continue-on-error',
+       {
+               '001_continue_on_error' => q{
+               INSERT INTO unique_table VALUES(0);
+               }
+       });
+
+# Clean up
+$node->safe_psql('postgres', 'DROP TABLE unique_table;');
+
 # done
 $node->safe_psql('postgres', 'DROP TABLESPACE regress_pgbench_tap_1_ts');
 $node->stop;