]> git.ipfire.org Git - thirdparty/postgresql.git/commitdiff
instrumentation: Use Time-Stamp Counter on x86-64 to lower overhead
authorAndres Freund <andres@anarazel.de>
Tue, 7 Apr 2026 16:48:07 +0000 (12:48 -0400)
committerAndres Freund <andres@anarazel.de>
Tue, 7 Apr 2026 17:00:24 +0000 (13:00 -0400)
This allows the direct use of the Time-Stamp Counter (TSC) value retrieved
from the CPU using RDTSC/RDTSCP instructions, instead of APIs like
clock_gettime() on POSIX systems.

This reduces the overhead of EXPLAIN with ANALYZE and TIMING ON. Tests showed
that, when instrumenting queries that move lots of rows through the plan, the
slowdown relative to the uninstrumented runtime can be reduced from 2x to
1.2x. More complex workloads such as TPC-H queries have also shown ~20% gains
when instrumented compared to before.

To control use of the TSC, the new "timing_clock_source" GUC is introduced,
whose default ("auto") automatically uses the TSC when reliable, for example
when running on modern Intel CPUs, or when running on Linux and the system
clocksource is reported as "tsc". The use of the operating system clock source
can be enforced by setting "system", or on x86-64 architectures the use of TSC
can be enforced by explicitly setting "tsc".

In order to use the TSC the frequency is first determined by use of CPUID, and
if not available, by running a short calibration loop at program start,
falling back to the system clock source if TSC values are not stable.

Note that we split TSC usage into the RDTSC CPU instruction, which does not
wait for out-of-order execution (faster, less precise), and the RDTSCP
instruction, which waits for outstanding instructions to retire. RDTSCP is
deemed to have little benefit in the typical InstrStartNode() /
InstrStopNode() use case of EXPLAIN, and can be up to twice as slow. To
separate these use cases, the new macro INSTR_TIME_SET_CURRENT_FAST() is
introduced, which uses RDTSC.

The original macro INSTR_TIME_SET_CURRENT() uses RDTSCP and is supposed to be
used when precision is more important than performance. When the system timing
clock source is used both of these macros instead utilize the system
APIs (clock_gettime / QueryPerformanceCounter) like before.

Additional users of interval timing, such as track_io_timing and
track_wal_io_timing could also benefit from being converted to use
INSTR_TIME_SET_CURRENT_FAST() but are left for future changes.

Author: Lukas Fittl <lukas@fittl.com>
Author: Andres Freund <andres@anarazel.de>
Author: David Geier <geidav.pg@gmail.com>
Reviewed-by: Andres Freund <andres@anarazel.de>
Reviewed-by: David Geier <geidav.pg@gmail.com>
Reviewed-by: Lukas Fittl <lukas@fittl.com>
Reviewed-by: Zsolt Parragi <zsolt.parragi@percona.com>
Reviewed-by: Thomas Munro <thomas.munro@gmail.com> (in an earlier version)
Reviewed-by: Maciek Sakrejda <m.sakrejda@gmail.com> (in an earlier version)
Reviewed-by: Robert Haas <robertmhaas@gmail.com> (in an earlier version)
Reviewed-by: Jakub Wartak <jakub.wartak@enterprisedb.com> (in an earlier version)
Discussion: https://postgr.es/m/20200612232810.f46nbqkdhbutzqdg@alap3.anarazel.de

doc/src/sgml/config.sgml
src/backend/executor/instrument.c
src/backend/postmaster/launch_backend.c
src/backend/utils/misc/guc_parameters.dat
src/backend/utils/misc/guc_tables.c
src/backend/utils/misc/postgresql.conf.sample
src/common/instr_time.c
src/include/portability/instr_time.h
src/include/utils/guc_hooks.h
src/include/utils/guc_tables.h
src/tools/pgindent/typedefs.list

index 584bc9f49ddbed582ee7190aaa6d48f955fec3b2..8bdbb6db0f9ed99ecacc8056127bcee33bbafa7f 100644 (file)
@@ -2533,6 +2533,72 @@ include_dir 'conf.d'
      </variablelist>
     </sect2>
 
+    <sect2 id="runtime-config-resource-time">
+     <title>Timing</title>
+
+     <variablelist>
+      <varlistentry id="guc-timing-clock-source" xreflabel="timing_clock_source">
+       <term><varname>timing_clock_source</varname> (<type>enum</type>)
+       <indexterm>
+        <primary><varname>timing_clock_source</varname> configuration parameter</primary>
+       </indexterm>
+       </term>
+       <listitem>
+        <para>
+         Selects the method for making timing measurements using the OS or
+         specialized CPU instructions. Possible values are:
+         <itemizedlist>
+          <listitem>
+           <para>
+            <literal>auto</literal> (automatically chooses <acronym>TSC</acronym>
+            clock source on supported x86-64 CPUs, otherwise uses the OS system
+            clock)
+           </para>
+          </listitem>
+          <listitem>
+           <para>
+            <literal>system</literal> (measures timing using the OS system clock)
+           </para>
+          </listitem>
+          <listitem>
+           <para>
+            <literal>tsc</literal> (measures timing with a CPU instruction, e.g.
+            using <command>RDTSC</command>/<command>RDTSCP</command> on x86-64)
+           </para>
+          </listitem>
+         </itemizedlist>
+         The default is <literal>auto</literal>. Only superusers and users with
+         the appropriate <literal>SET</literal> privilege can change this setting.
+         Changing the setting during query execution is not recommended and may
+         cause interval timings to jump significantly or produce negative values.
+        </para>
+        <para>
+         <indexterm>
+          <primary>Time-Stamp Counter</primary>
+          <see><acronym>TSC</acronym></see>
+         </indexterm>
+         <indexterm><primary><acronym>TSC</acronym></primary></indexterm>
+         If enabled, the <acronym>TSC</acronym> clock source, named after the
+         Time-Stamp Counter on x86-64, will use specialized CPU instructions when
+         measuring time intervals. This lowers timing overhead compared to reading
+         the OS system clock, and reduces the measurement error on top of the
+         actual runtime, for example with <command>EXPLAIN ANALYZE</command>.
+        </para>
+        <para>
+         <indexterm><primary><acronym>RDTSC</acronym></primary></indexterm>
+         On x86-64 CPUs the <acronym>TSC</acronym> clock source utilizes the
+         <command>RDTSC</command> instruction for <command>EXPLAIN ANALYZE</command>.
+         For timings that require higher precision the <command>RDTSCP</command>
+         instruction is used, which avoids inaccuracies due to CPU instruction
+         re-ordering. Use of the <acronym>TSC</acronym> clock source is not
+         supported on older x86-64 CPUs and other architectures, and is not
+         advised on systems that utilize an emulated <acronym>TSC</acronym>, as it
+         is likely slower than the system clock source.
+        </para>
+       </listitem>
+      </varlistentry>
+     </variablelist>
+    </sect2>
 
     <sect2 id="runtime-config-resource-background-writer">
      <title>Background Writer</title>
index 011a9684df0d5ddf66fb297044e0c344d1ae4a17..4c3aec7fdeef65af9273a470a2f37c7486dde4e8 100644 (file)
@@ -16,6 +16,8 @@
 #include <unistd.h>
 
 #include "executor/instrument.h"
+#include "portability/instr_time.h"
+#include "utils/guc_hooks.h"
 
 BufferUsage pgBufferUsage;
 static BufferUsage save_pgBufferUsage;
@@ -52,7 +54,7 @@ InstrStart(Instrumentation *instr)
                if (!INSTR_TIME_IS_ZERO(instr->starttime))
                        elog(ERROR, "InstrStart called twice in a row");
                else
-                       INSTR_TIME_SET_CURRENT(instr->starttime);
+                       INSTR_TIME_SET_CURRENT_FAST(instr->starttime);
        }
 
        /* save buffer usage totals at start, if needed */
@@ -78,7 +80,7 @@ InstrStopCommon(Instrumentation *instr, instr_time *accum_time)
                if (INSTR_TIME_IS_ZERO(instr->starttime))
                        elog(ERROR, "InstrStop called without start");
 
-               INSTR_TIME_SET_CURRENT(endtime);
+               INSTR_TIME_SET_CURRENT_FAST(endtime);
                INSTR_TIME_ACCUM_DIFF(*accum_time, endtime, instr->starttime);
 
                INSTR_TIME_SET_ZERO(instr->starttime);
@@ -345,3 +347,75 @@ WalUsageAccumDiff(WalUsage *dst, const WalUsage *add, const WalUsage *sub)
        dst->wal_fpi_bytes += add->wal_fpi_bytes - sub->wal_fpi_bytes;
        dst->wal_buffers_full += add->wal_buffers_full - sub->wal_buffers_full;
 }
+
+/* GUC hooks for timing_clock_source */
+
+bool
+check_timing_clock_source(int *newval, void **extra, GucSource source)
+{
+       /*
+        * Do nothing if timing is not initialized. This is only expected on child
+        * processes in EXEC_BACKEND builds, as GUC hooks can be called during
+        * InitializeGUCOptions() before InitProcessGlobals() has had a chance to
+        * run pg_initialize_timing(). Instead, TSC will be initialized via
+        * restore_backend_variables.
+        */
+#ifdef EXEC_BACKEND
+       if (!timing_initialized)
+               return true;
+#else
+       Assert(timing_initialized);
+#endif
+
+#if PG_INSTR_TSC_CLOCK
+       pg_initialize_timing_tsc();
+
+       if (*newval == TIMING_CLOCK_SOURCE_TSC && timing_tsc_frequency_khz <= 0)
+       {
+               GUC_check_errdetail("TSC is not supported as timing clock source");
+               return false;
+       }
+#endif
+
+       return true;
+}
+
+void
+assign_timing_clock_source(int newval, void *extra)
+{
+#ifdef EXEC_BACKEND
+       if (!timing_initialized)
+               return;
+#else
+       Assert(timing_initialized);
+#endif
+
+       /*
+        * Ignore the return code since the check hook already verified TSC is
+        * usable if it's explicitly requested.
+        */
+       pg_set_timing_clock_source(newval);
+}
+
+const char *
+show_timing_clock_source(void)
+{
+       switch (timing_clock_source)
+       {
+               case TIMING_CLOCK_SOURCE_AUTO:
+#if PG_INSTR_TSC_CLOCK
+                       if (pg_current_timing_clock_source() == TIMING_CLOCK_SOURCE_TSC)
+                               return "auto (tsc)";
+#endif
+                       return "auto (system)";
+               case TIMING_CLOCK_SOURCE_SYSTEM:
+                       return "system";
+#if PG_INSTR_TSC_CLOCK
+               case TIMING_CLOCK_SOURCE_TSC:
+                       return "tsc";
+#endif
+       }
+
+       /* unreachable */
+       return "?";
+}
index ed0f4f2d2343678ad8d10f3389c485a40a173d90..8f3cfea880c3c11565fcc37d8361a0c2838147ed 100644 (file)
@@ -57,6 +57,7 @@
 
 #ifdef EXEC_BACKEND
 #include "nodes/queryjumble.h"
+#include "portability/instr_time.h"
 #include "storage/pg_shmem.h"
 #include "storage/spin.h"
 #endif
@@ -129,6 +130,8 @@ typedef struct
 
        int                     MyPMChildSlot;
 
+       int32           timing_tsc_frequency_khz;
+
        /*
         * These are only used by backend processes, but are here because passing
         * a socket needs some special handling on Windows. 'client_sock' is an
@@ -750,6 +753,8 @@ save_backend_variables(BackendParameters *param,
        param->MaxBackends = MaxBackends;
        param->num_pmchild_slots = num_pmchild_slots;
 
+       param->timing_tsc_frequency_khz = timing_tsc_frequency_khz;
+
 #ifdef WIN32
        param->PostmasterHandle = PostmasterHandle;
        if (!write_duplicated_handle(&param->initial_signal_pipe,
@@ -1004,6 +1009,12 @@ restore_backend_variables(BackendParameters *param)
        MaxBackends = param->MaxBackends;
        num_pmchild_slots = param->num_pmchild_slots;
 
+       timing_tsc_frequency_khz = param->timing_tsc_frequency_khz;
+
+       /* Re-run logic usually done by assign_timing_clock_source */
+       pg_initialize_timing();
+       pg_set_timing_clock_source(timing_clock_source);
+
 #ifdef WIN32
        PostmasterHandle = param->PostmasterHandle;
        pgwin32_initial_signal_pipe = param->initial_signal_pipe;
index 632f3ba4989c9ff5a58b95f27c90bd373ffc5fbd..86c1eba5dab41c83dd5a3db723e6ac0251d5547e 100644 (file)
   assign_hook => 'assign_timezone_abbreviations',
 },
 
+{ name => 'timing_clock_source', type => 'enum', context => 'PGC_SUSET', group => 'RESOURCES_TIME',
+  short_desc => 'Controls the clock source used for collecting timing measurements.',
+  long_desc => 'This enables the use of specialized clock sources, specifically the RDTSC clock source on x86-64 systems (if available), to support timing measurements with lower overhead during EXPLAIN and other instrumentation.',
+  variable => 'timing_clock_source',
+  boot_val => 'TIMING_CLOCK_SOURCE_AUTO',
+  options => 'timing_clock_source_options',
+  check_hook => 'check_timing_clock_source',
+  assign_hook => 'assign_timing_clock_source',
+  show_hook => 'show_timing_clock_source',
+},
+
 { name => 'trace_connection_negotiation', type => 'bool', context => 'PGC_POSTMASTER', group => 'DEVELOPER_OPTIONS',
   short_desc => 'Logs details of pre-authentication connection handshake.',
   flags => 'GUC_NOT_IN_SAMPLE',
index d9ca13baff97dfc4752d43a138eea6aaa7aa9a8e..290ccbc543e2578df50156e597c30a6a0f1fd566 100644 (file)
@@ -90,6 +90,7 @@
 #include "storage/standby.h"
 #include "tcop/backend_startup.h"
 #include "tcop/tcopprot.h"
+#include "portability/instr_time.h"
 #include "tsearch/ts_cache.h"
 #include "utils/builtins.h"
 #include "utils/bytea.h"
@@ -373,6 +374,15 @@ static const struct config_enum_entry huge_pages_options[] = {
        {NULL, 0, false}
 };
 
+static const struct config_enum_entry timing_clock_source_options[] = {
+       {"auto", TIMING_CLOCK_SOURCE_AUTO, false},
+       {"system", TIMING_CLOCK_SOURCE_SYSTEM, false},
+#if PG_INSTR_TSC_CLOCK
+       {"tsc", TIMING_CLOCK_SOURCE_TSC, false},
+#endif
+       {NULL, 0, false}
+};
+
 static const struct config_enum_entry huge_pages_status_options[] = {
        {"off", HUGE_PAGES_OFF, false},
        {"on", HUGE_PAGES_ON, false},
@@ -731,6 +741,7 @@ const char *const config_group_names[] =
        [CONN_AUTH_TCP] = gettext_noop("Connections and Authentication / TCP Settings"),
        [CONN_AUTH_AUTH] = gettext_noop("Connections and Authentication / Authentication"),
        [CONN_AUTH_SSL] = gettext_noop("Connections and Authentication / SSL"),
+       [RESOURCES_TIME] = gettext_noop("Resource Usage / Time"),
        [RESOURCES_MEM] = gettext_noop("Resource Usage / Memory"),
        [RESOURCES_DISK] = gettext_noop("Resource Usage / Disk"),
        [RESOURCES_KERNEL] = gettext_noop("Resource Usage / Kernel Resources"),
index 2e10eb4a36a91cd9e86b931363eb05a7958dba98..4f2bbf0529558aad1deabc6cae552d756e772c8e 100644 (file)
 #max_files_per_process = 1000           # min 64
                                         # (change requires restart)
 
+# - Time -
+
+#timing_clock_source = auto             # auto, system, tsc (if supported)
+
 # - Background Writer -
 
 #bgwriter_delay = 200ms                 # 10-10000ms between rounds
index 9271113a287c0dc1bb063aa7b5b2f76ba2de65f2..fc6e1852c30b233e35547ff07bf09e8b51cd1c8f 100644 (file)
 #include "postgres_fe.h"
 #endif
 
+#include <math.h>
+
+#include "port/pg_cpu.h"
 #include "portability/instr_time.h"
 
 /*
  * Stores what the number of ticks needs to be multiplied with to end up
  * with nanoseconds using integer math.
  *
- * On certain platforms (currently Windows) the ticks to nanoseconds conversion
- * requires floating point math because:
+ * In certain cases (TSC on x86-64, and QueryPerformanceCounter on Windows)
+ * the ticks to nanoseconds conversion requires floating point math because:
  *
  * sec = ticks / frequency_hz
  * ns  = ticks / frequency_hz * 1,000,000,000
  * value to encourage compilers to generate better assembly, since we can be
  * sure these values are not negative.
  *
- * On all other platforms we are using clock_gettime(), which uses nanoseconds
+ * In all other cases we are using clock_gettime(), which uses nanoseconds
  * as ticks. Hence, we set the multiplier to zero, which causes pg_ticks_to_ns
  * to return the original value.
  */
 uint64         ticks_per_ns_scaled = 0;
 uint64         max_ticks_no_overflow = 0;
 bool           timing_initialized = false;
+int                    timing_clock_source = TIMING_CLOCK_SOURCE_AUTO;
+
+bool           timing_tsc_enabled = false;
+int32          timing_tsc_frequency_khz = -1;
 
+static void set_ticks_per_ns(void);
 static void set_ticks_per_ns_system(void);
 
+#if PG_INSTR_TSC_CLOCK
+static bool tsc_use_by_default(void);
+static void set_ticks_per_ns_for_tsc(void);
+#endif
+
 /*
  * Initializes timing infrastructure. Must be called before making any use
  * of INSTR* macros.
@@ -75,6 +88,49 @@ pg_initialize_timing(void)
        timing_initialized = true;
 }
 
+bool
+pg_set_timing_clock_source(TimingClockSourceType source)
+{
+       Assert(timing_initialized);
+
+#if PG_INSTR_TSC_CLOCK
+       pg_initialize_timing_tsc();
+
+       switch (source)
+       {
+               case TIMING_CLOCK_SOURCE_AUTO:
+                       timing_tsc_enabled = (timing_tsc_frequency_khz > 0) && tsc_use_by_default();
+                       break;
+               case TIMING_CLOCK_SOURCE_SYSTEM:
+                       timing_tsc_enabled = false;
+                       break;
+               case TIMING_CLOCK_SOURCE_TSC:
+                       /* Tell caller TSC is not usable */
+                       if (timing_tsc_frequency_khz <= 0)
+                               return false;
+                       timing_tsc_enabled = true;
+                       break;
+       }
+#endif
+
+       set_ticks_per_ns();
+       timing_clock_source = source;
+       return true;
+}
+
+static void
+set_ticks_per_ns(void)
+{
+#if PG_INSTR_TSC_CLOCK
+       if (timing_tsc_enabled)
+       {
+               set_ticks_per_ns_for_tsc();
+               return;
+       }
+#endif
+       set_ticks_per_ns_system();
+}
+
 #ifndef WIN32
 
 static void
@@ -104,3 +160,213 @@ set_ticks_per_ns_system(void)
 }
 
 #endif                                                 /* WIN32 */
+
+/* TSC specific logic */
+
+#if PG_INSTR_TSC_CLOCK
+
+static void tsc_detect_frequency(void);
+
+/*
+ * Initialize the TSC clock source by determining its usability and frequency.
+ *
+ * Detection only runs while timing_tsc_frequency_khz is still negative (its
+ * initial value of -1 means "not yet initialized"). tsc_detect_frequency()
+ * stores 0 when the TSC is not usable and a positive frequency in kHz
+ * otherwise, so this can be called multiple times without causing repeated
+ * work. On EXEC_BACKEND (Windows), the TSC frequency may also be set by
+ * restore_backend_variables; if that value is non-negative, no detection
+ * happens here either.
+ */
+void
+pg_initialize_timing_tsc(void)
+{
+       if (timing_tsc_frequency_khz < 0)
+               tsc_detect_frequency();
+}
+
+static void
+set_ticks_per_ns_for_tsc(void)
+{
+       ticks_per_ns_scaled = ((NS_PER_S / 1000) << TICKS_TO_NS_SHIFT) / timing_tsc_frequency_khz;
+       max_ticks_no_overflow = PG_INT64_MAX / ticks_per_ns_scaled;
+}
+
+/*
+ * Detect the TSC frequency and whether RDTSCP is available on x86-64.
+ *
+ * The outcome is stored in timing_tsc_frequency_khz: 0 if the TSC is not
+ * usable (RDTSCP or an invariant TSC is missing, or neither frequency source
+ * below produced a positive value), otherwise the TSC frequency in kHz. The
+ * variable is set to 0 up front, which also satisfies the non-negative
+ * precondition asserted by pg_tsc_calibrate_frequency().
+ *
+ * This can't be reliably determined at compile time, since the
+ * availability of an "invariant" TSC (that is not affected by CPU
+ * frequency changes) is dependent on the CPU architecture. Additionally,
+ * there are cases where TSC availability is impacted by virtualization,
+ * where a simple cpuid feature check would not be enough.
+ */
+static void
+tsc_detect_frequency(void)
+{
+       timing_tsc_frequency_khz = 0;
+
+       /* We require RDTSCP support and an invariant TSC, bail if not available */
+       if (!x86_feature_available(PG_RDTSCP) || !x86_feature_available(PG_TSC_INVARIANT))
+               return;
+
+       /* First ask CPUID for the speed at which the TSC advances */
+       timing_tsc_frequency_khz = x86_tsc_frequency_khz();
+       if (timing_tsc_frequency_khz > 0)
+               return;
+
+       /*
+        * CPUID did not give us the TSC frequency. We can instead measure the
+        * frequency by comparing ticks against walltime in a calibration loop.
+        * That loop returns 0 if it does not converge, leaving the TSC marked
+        * as not usable.
+        */
+       timing_tsc_frequency_khz = pg_tsc_calibrate_frequency();
+}
+
+/*
+ * Decides whether to use the TSC clock source if the user did not specify it
+ * one way or the other, and it is available (checked separately).
+ *
+ * Inspired by the Linux kernel's clocksource watchdog disable logic as updated
+ * in 2021 to reflect the reliability of the TSC on Intel platforms, see
+ * check_system_tsc_reliable() in arch/x86/kernel/tsc.c, as well as discussion
+ * in https://lore.kernel.org/lkml/87eekfk8bd.fsf@nanos.tec.linutronix.de/
+ * and https://lore.kernel.org/lkml/87a6pimt1f.ffs@nanos.tec.linutronix.de/
+ * for reference.
+ *
+ * When tsc_detect_frequency determines the TSC is viable (invariant, etc.), and
+ * we're on an Intel platform (determined via TSC_ADJUST), we consider the TSC
+ * trustworthy by default, matching the Linux kernel.
+ *
+ * On other CPU platforms (e.g. AMD), or in some virtual machines, we don't have
+ * an easy way to determine the TSC's reliability. If on Linux, we can check if
+ * TSC is the active clocksource, based on it having run the watchdog logic to
+ * monitor TSC correctness. For other platforms the user must explicitly enable
+ * it via GUC instead.
+ */
+static bool
+tsc_use_by_default(void)
+{
+       if (x86_feature_available(PG_TSC_ADJUST))
+               return true;
+
+#if defined(__linux__)
+       {
+               FILE       *fp;
+               char            buf[128];
+
+               fp = fopen("/sys/devices/system/clocksource/clocksource0/current_clocksource", "r");
+               if (fp)
+               {
+                       bool            is_tsc = (fgets(buf, sizeof(buf), fp) != NULL &&
+                                                                 strcmp(buf, "tsc\n") == 0);
+
+                       fclose(fp);
+                       if (is_tsc)
+                               return true;
+               }
+       }
+#endif
+
+       return false;
+}
+
+/*
+ * Calibrate the TSC frequency by comparing TSC ticks against walltime.
+ *
+ * Takes initial TSC and system clock snapshots, then loops, recomputing the
+ * frequency every TSC_CALIBRATION_SKIPS iterations from cumulative TSC
+ * ticks divided by elapsed time.
+ *
+ * Once the frequency estimate stabilizes (consecutive iterations agree), we
+ * consider it converged and the frequency in KHz is returned. If either too
+ * many iterations or a time limit passes without convergence, 0 is returned.
+ */
+#define TSC_CALIBRATION_MAX_NS         (50 * NS_PER_MS)
+#define TSC_CALIBRATION_ITERATIONS     1000000
+#define TSC_CALIBRATION_SKIPS          100
+#define TSC_CALIBRATION_STABLE_CYCLES  10
+uint32
+pg_tsc_calibrate_frequency(void)
+{
+       instr_time      initial_wall;
+       int64           initial_tsc;
+       double          freq_khz = 0;
+       double          prev_freq_khz = 0;
+       int                     stable_count = 0;
+       int64           prev_tsc;
+       int                     saved_clock_source = timing_clock_source;
+
+       /*
+        * Frequency must be initialized to avoid recursion via
+        * pg_set_timing_clock_source.
+        */
+       Assert(timing_tsc_frequency_khz >= 0);
+
+       /* Ensure INSTR_* calls below work on system time */
+       pg_set_timing_clock_source(TIMING_CLOCK_SOURCE_SYSTEM);
+
+       INSTR_TIME_SET_CURRENT(initial_wall);
+
+       initial_tsc = pg_rdtscp();
+       prev_tsc = initial_tsc;
+
+       for (int i = 0; i < TSC_CALIBRATION_ITERATIONS; i++)
+       {
+               instr_time      now_wall;
+               int64           now_tsc;
+               int64           elapsed_ns;
+               int64           elapsed_ticks;
+
+               INSTR_TIME_SET_CURRENT(now_wall);
+
+               now_tsc = pg_rdtscp();
+
+               INSTR_TIME_SUBTRACT(now_wall, initial_wall);
+               elapsed_ns = INSTR_TIME_GET_NANOSEC(now_wall);
+
+               /* Safety: bail out if we've taken too long */
+               if (elapsed_ns >= TSC_CALIBRATION_MAX_NS)
+                       break;
+
+               elapsed_ticks = now_tsc - initial_tsc;
+
+               /*
+                * Skip if TSC hasn't advanced, or we walked backwards for some
+                * reason.
+                */
+               if (now_tsc == prev_tsc || elapsed_ns <= 0 || elapsed_ticks <= 0)
+                       continue;
+
+               /*
+                * We only measure frequency every TSC_CALIBRATION_SKIPS to avoid
+                * stabilizing based on just a handful of RDTSC instructions.
+                */
+               if (i % TSC_CALIBRATION_SKIPS != 0)
+                       continue;
+
+               freq_khz = ((double) elapsed_ticks / elapsed_ns) * 1000 * 1000;
+
+               /*
+                * Once freq_khz differs from prev_freq_khz by less than 0.01%, check if
+                * it stays that way. If it does for long enough, we've got a winner.
+                */
+               if (prev_freq_khz != 0 && fabs(1 - freq_khz / prev_freq_khz) < 0.0001)
+               {
+                       stable_count++;
+                       if (stable_count >= TSC_CALIBRATION_STABLE_CYCLES)
+                               break;
+               }
+               else
+                       stable_count = 0;
+
+               prev_tsc = now_tsc;
+               prev_freq_khz = freq_khz;
+       }
+
+       /* Restore the previous clock source */
+       pg_set_timing_clock_source(saved_clock_source);
+
+       if (stable_count < TSC_CALIBRATION_STABLE_CYCLES)
+               return 0;                               /* did not converge */
+
+       return (uint32) freq_khz;
+}
+
+#endif                                                 /* PG_INSTR_TSC_CLOCK */
index 115f5176317ec7c21d9fb738c8d3e32d05d0bb55..5da5eb2c0575123e58880d00eadbce6eafdd114b 100644 (file)
@@ -4,9 +4,10 @@
  *       portable high-precision interval timing
  *
  * This file provides an abstraction layer to hide portability issues in
- * interval timing.  On Unix we use clock_gettime(), and on Windows we use
- * QueryPerformanceCounter().  These macros also give some breathing room to
- * use other high-precision-timing APIs.
+ * interval timing. On x86 we use the RDTSC/RDTSCP instruction directly in
+ * certain cases, or alternatively clock_gettime() on Unix-like systems and
+ * QueryPerformanceCounter() on Windows. These macros also give some breathing
+ * room to use other high-precision-timing APIs.
  *
  * The basic data type is instr_time, which all callers should treat as an
  * opaque typedef.  instr_time can store either an absolute time (of
  *
  * INSTR_TIME_SET_ZERO(t)                      set t to zero (memset is acceptable too)
  *
- * INSTR_TIME_SET_CURRENT(t)           set t to current time
+ * INSTR_TIME_SET_CURRENT_FAST(t)      set t to current time without waiting
+ *                                                                     for instructions in out-of-order window
+ *
+ * INSTR_TIME_SET_CURRENT(t)           set t to current time after waiting for
+ *                                                                     in-flight out-of-order instructions to retire
  *
  *
  * INSTR_TIME_ADD(x, y)                                x += y
@@ -86,28 +91,99 @@ typedef struct instr_time
 /*
  * PG_INSTR_TICKS_TO_NS controls whether pg_ticks_to_ns/pg_ns_to_ticks needs to
  * check ticks_per_ns_scaled and potentially convert ticks <=> nanoseconds.
+ *
+ * PG_INSTR_TSC_CLOCK controls whether the TSC clock source is compiled in, and
+ * potentially used based on timing_tsc_enabled.
  */
-#ifdef WIN32
+#if defined(__x86_64__) || defined(_M_X64)
+#define PG_INSTR_TICKS_TO_NS 1
+#define PG_INSTR_TSC_CLOCK 1
+#elif defined(WIN32)
 #define PG_INSTR_TICKS_TO_NS 1
+#define PG_INSTR_TSC_CLOCK 0
 #else
 #define PG_INSTR_TICKS_TO_NS 0
+#define PG_INSTR_TSC_CLOCK 0
 #endif
 
 /*
  * Variables used to translate ticks to nanoseconds, initialized by
- * pg_initialize_timing.
+ * pg_initialize_timing and adjusted by pg_set_timing_clock_source calls or
+ * changes of the "timing_clock_source" GUC.
+ *
+ * Note that changing these values after setting an instr_time and before
+ * reading/converting it will lead to incorrect results. This is technically
+ * possible because the GUC can be changed at runtime, but unlikely, and we
+ * allow changing this at runtime to simplify testing of different sources.
  */
 extern PGDLLIMPORT uint64 ticks_per_ns_scaled;
 extern PGDLLIMPORT uint64 max_ticks_no_overflow;
 extern PGDLLIMPORT bool timing_initialized;
 
+typedef enum
+{
+       TIMING_CLOCK_SOURCE_AUTO,
+       TIMING_CLOCK_SOURCE_SYSTEM,
+#if PG_INSTR_TSC_CLOCK
+       TIMING_CLOCK_SOURCE_TSC
+#endif
+} TimingClockSourceType;
+
+extern int     timing_clock_source;
+
 /*
  * Initialize timing infrastructure
  *
- * This must be called at least once before using INSTR_TIME_SET_CURRENT* macros.
+ * This must be called at least once before using INSTR_TIME_SET_CURRENT*
+ * macros.
+ *
+ * If you want to use the TSC clock source in a client program,
+ * pg_set_timing_clock_source() needs to also be called.
  */
 extern void pg_initialize_timing(void);
 
+/*
+ * Sets the time source to be used. Mainly intended for frontend programs,
+ * the backend should set it via the timing_clock_source GUC instead.
+ *
+ * Returns false if the clock source could not be set, for example when TSC
+ * is not available despite being explicitly set.
+ */
+extern bool pg_set_timing_clock_source(TimingClockSourceType source);
+
+/* Whether to actually use TSC based on availability and GUC settings. */
+extern PGDLLIMPORT bool timing_tsc_enabled;
+
+/*
+ * TSC frequency in kHz, set during initialization.
+ *
+ * -1 = not yet initialized, 0 = TSC not usable, >0 = frequency in kHz.
+ */
+extern PGDLLIMPORT int32 timing_tsc_frequency_khz;
+
+#if PG_INSTR_TSC_CLOCK
+
+extern void pg_initialize_timing_tsc(void);
+
+extern uint32 pg_tsc_calibrate_frequency(void);
+
+#endif                                                 /* PG_INSTR_TSC_CLOCK */
+
+/*
+ * Returns the current timing clock source effectively in use, resolving
+ * TIMING_CLOCK_SOURCE_AUTO to either TIMING_CLOCK_SOURCE_SYSTEM or
+ * TIMING_CLOCK_SOURCE_TSC.
+ */
+static inline TimingClockSourceType
+pg_current_timing_clock_source(void)
+{
+#if PG_INSTR_TSC_CLOCK
+       if (timing_tsc_enabled)
+               return TIMING_CLOCK_SOURCE_TSC;
+#endif
+       return TIMING_CLOCK_SOURCE_SYSTEM;
+}
+
 #ifndef WIN32
 
 /* On POSIX, use clock_gettime() for system clock source */
@@ -125,24 +201,27 @@ extern void pg_initialize_timing(void);
  * than CLOCK_MONOTONIC.  In particular, as of macOS 10.12, Apple provides
  * CLOCK_MONOTONIC_RAW which is both faster to read and higher resolution than
  * their version of CLOCK_MONOTONIC.
+ *
+ * Note this does not get used in case the TSC clock source logic is used,
+ * which directly calls architecture specific timing instructions (e.g. RDTSC).
  */
 #if defined(__darwin__) && defined(CLOCK_MONOTONIC_RAW)
-#define PG_INSTR_CLOCK CLOCK_MONOTONIC_RAW
+#define PG_INSTR_SYSTEM_CLOCK  CLOCK_MONOTONIC_RAW
 #elif defined(CLOCK_MONOTONIC)
-#define PG_INSTR_CLOCK CLOCK_MONOTONIC
+#define PG_INSTR_SYSTEM_CLOCK  CLOCK_MONOTONIC
 #else
-#define PG_INSTR_CLOCK CLOCK_REALTIME
+#define PG_INSTR_SYSTEM_CLOCK  CLOCK_REALTIME
 #endif
 
 static inline instr_time
-pg_get_ticks(void)
+pg_get_ticks_system(void)
 {
        instr_time      now;
        struct timespec tmp;
 
        Assert(timing_initialized);
 
-       clock_gettime(PG_INSTR_CLOCK, &tmp);
+       clock_gettime(PG_INSTR_SYSTEM_CLOCK, &tmp);
        now.ticks = tmp.tv_sec * NS_PER_S + tmp.tv_nsec;
 
        return now;
@@ -153,7 +232,7 @@ pg_get_ticks(void)
 /* On Windows, use QueryPerformanceCounter() for system clock source */
 
 static inline instr_time
-pg_get_ticks(void)
+pg_get_ticks_system(void)
 {
        instr_time      now;
        LARGE_INTEGER tmp;
@@ -248,6 +327,84 @@ pg_ns_to_ticks(int64 ns)
 #endif                                                 /* PG_INSTR_TICKS_TO_NS */
 }
 
+#if PG_INSTR_TSC_CLOCK
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif                                                 /* defined(_MSC_VER) */
+
+/* Helpers to abstract compiler differences for reading the x86 TSC. */
+static inline int64
+pg_rdtsc(void)
+{
+#ifdef _MSC_VER
+       return __rdtsc();
+#else
+       return __builtin_ia32_rdtsc();
+#endif                                                 /* defined(_MSC_VER) */
+}
+
+static inline int64
+pg_rdtscp(void)
+{
+       uint32          unused;
+
+#ifdef _MSC_VER
+       return __rdtscp(&unused);
+#else
+       return __builtin_ia32_rdtscp(&unused);
+#endif                                                 /* defined(_MSC_VER) */
+}
+
+/*
+ * Return the current time in ticks of the clock source currently in use.
+ * This backs INSTR_TIME_SET_CURRENT().
+ *
+ * When timing_tsc_enabled is set the value is read with pg_rdtscp();
+ * otherwise the system clock source is read via pg_get_ticks_system().
+ *
+ * Marked always_inline due to a shortcoming in gcc's heuristics leading to
+ * only inlining the function partially.
+ * See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=124795
+ */
+static pg_attribute_always_inline instr_time
+pg_get_ticks(void)
+{
+       if (likely(timing_tsc_enabled))
+       {
+               instr_time      now;
+
+               now.ticks = pg_rdtscp();
+               return now;
+       }
+
+       return pg_get_ticks_system();
+}
+
+static pg_attribute_always_inline instr_time
+pg_get_ticks_fast(void)
+{
+       if (likely(timing_tsc_enabled))
+       {
+               instr_time      now;
+
+               now.ticks = pg_rdtsc();
+               return now;
+       }
+
+       return pg_get_ticks_system();
+}
+
+#else
+
+static pg_attribute_always_inline instr_time
+pg_get_ticks(void)
+{
+       return pg_get_ticks_system();
+}
+
+static pg_attribute_always_inline instr_time
+pg_get_ticks_fast(void)
+{
+       return pg_get_ticks_system();
+}
+
+#endif                                                 /* PG_INSTR_TSC_CLOCK */
+
 /*
  * Common macros
  */
@@ -256,6 +413,9 @@ pg_ns_to_ticks(int64 ns)
 
 #define INSTR_TIME_SET_ZERO(t) ((t).ticks = 0)
 
+#define INSTR_TIME_SET_CURRENT_FAST(t) \
+       ((t) = pg_get_ticks_fast())
+
 #define INSTR_TIME_SET_CURRENT(t) \
        ((t) = pg_get_ticks())
 
index b01697c1f606d80a64055b8f2df9676bb48d3f88..307f4fbaefe08f4ac1f2441dab29664d90501f14 100644 (file)
@@ -163,6 +163,9 @@ extern const char *show_timezone(void);
 extern bool check_timezone_abbreviations(char **newval, void **extra,
                                                                                 GucSource source);
 extern void assign_timezone_abbreviations(const char *newval, void *extra);
+extern void assign_timing_clock_source(int newval, void *extra);
+extern bool check_timing_clock_source(int *newval, void **extra, GucSource source);
+extern const char *show_timing_clock_source(void);
 extern bool check_transaction_buffers(int *newval, void **extra, GucSource source);
 extern bool check_transaction_deferrable(bool *newval, void **extra, GucSource source);
 extern bool check_transaction_isolation(int *newval, void **extra, GucSource source);
index 71a8016196138016b155627f30ed58ebfc5a1eb7..63440b8e36c837a27274c105754b6eb8bd5e24b9 100644 (file)
@@ -60,6 +60,7 @@ enum config_group
        CONN_AUTH_TCP,
        CONN_AUTH_AUTH,
        CONN_AUTH_SSL,
+       RESOURCES_TIME,
        RESOURCES_MEM,
        RESOURCES_DISK,
        RESOURCES_KERNEL,
index 637c669a1463063fbf6c59f390d81e206b9b405f..a998bb5e8823732a3ed92c9b14fc3d2abd6147a8 100644 (file)
@@ -3185,6 +3185,7 @@ TimeoutId
 TimeoutType
 Timestamp
 TimestampTz
+TimingClockSourceType
 TmFromChar
 TmToChar
 ToastAttrInfo