From: Andres Freund <andres@anarazel.de>
Date: Tue, 7 Apr 2026 16:48:07 +0000 (-0400)
Subject: instrumentation: Standardize ticks to nanosecond conversion method
X-Git-Tag: REL_19_BETA1~405
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=0022622c93d9e6419cb47110c58af87a74994ea6;p=thirdparty%2Fpostgresql.git

instrumentation: Standardize ticks to nanosecond conversion method

The timing infrastructure (INSTR_* macros) measures time elapsed using
clock_gettime() on POSIX systems, which returns the time as nanoseconds,
and QueryPerformanceCounter() on Windows, which is a specialized timing
clock source that returns a tick counter that needs to be converted to
nanoseconds using the result of QueryPerformanceFrequency().

This conversion currently happens ad-hoc on Windows, e.g. when calling
INSTR_TIME_GET_NANOSEC, which calls QueryPerformanceFrequency() on every
invocation, despite the frequency being stable after program start,
incurring unnecessary overhead. It also causes a fractured implementation
where macros are defined differently between platforms.

To ease code readability, and prepare for a future change that intends
to use a ticks-to-nanosecond conversion on x86-64 for TSC use, introduce
new pg_ticks_to_ns() / pg_ns_to_ticks() functions that get called from
INSTR_* macros on all platforms.

These functions rely on a separately initialized ticks_per_ns_scaled
value, that represents the conversion ratio. This value is initialized
from QueryPerformanceFrequency() on Windows, and set to zero on x86-64
POSIX systems, which results in the ticks being treated as nanoseconds.
Other architectures always directly return the original ticks.

To support this, pg_initialize_timing() is introduced, and is now
mandatory for both the backend and any frontend programs to call before
utilizing INSTR_* macros.

In passing, fix variable names in comment documenting INSTR_TIME_ADD_NANOSEC().

Author: Lukas Fittl <lukas@fittl.com>
Author: David Geier <geidav.pg@gmail.com>
Author: Andres Freund <andres@anarazel.de>
Reviewed-by: Andres Freund <andres@anarazel.de>
Reviewed-by: David Geier <geidav.pg@gmail.com>
Reviewed-by: Lukas Fittl <lukas@fittl.com>
Reviewed-by: Zsolt Parragi <zsolt.parragi@percona.com>
Discussion: https://www.postgresql.org/message-id/flat/20200612232810.f46nbqkdhbutzqdg%40alap3.anarazel.de
---

diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index 6f13e8f40a0..ae829747004 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -1954,6 +1954,9 @@ InitProcessGlobals(void)
 	MyStartTimestamp = GetCurrentTimestamp();
 	MyStartTime = timestamptz_to_time_t(MyStartTimestamp);
 
+	/* initialize timing infrastructure (required for INSTR_* calls) */
+	pg_initialize_timing();
+
 	/*
 	 * Set a different global seed in every process.  We want something
 	 * unpredictable, so if possible, use high-quality random bits for the
diff --git a/src/bin/pg_test_timing/pg_test_timing.c b/src/bin/pg_test_timing/pg_test_timing.c
index aee41dbe3f9..513ae88cafc 100644
--- a/src/bin/pg_test_timing/pg_test_timing.c
+++ b/src/bin/pg_test_timing/pg_test_timing.c
@@ -43,6 +43,9 @@ main(int argc, char *argv[])
 
 	handle_args(argc, argv);
 
+	/* initialize timing infrastructure (required for INSTR_* calls) */
+	pg_initialize_timing();
+
 	loop_count = test_timing(test_duration);
 
 	output(loop_count);
diff --git a/src/bin/pgbench/pgbench.c b/src/bin/pgbench/pgbench.c
index 1dae918cc09..c969afab3a5 100644
--- a/src/bin/pgbench/pgbench.c
+++ b/src/bin/pgbench/pgbench.c
@@ -6820,6 +6820,9 @@ main(int argc, char **argv)
 	int			exit_code = 0;
 	struct timeval tv;
 
+	/* initialize timing infrastructure (required for INSTR_* calls) */
+	pg_initialize_timing();
+
 	/*
 	 * Record difference between Unix time and instr_time time.  We'll use
 	 * this for logging and aggregation.
diff --git a/src/bin/psql/startup.c b/src/bin/psql/startup.c
index 9a397ec87b7..69d044d405d 100644
--- a/src/bin/psql/startup.c
+++ b/src/bin/psql/startup.c
@@ -24,6 +24,7 @@
 #include "help.h"
 #include "input.h"
 #include "mainloop.h"
+#include "portability/instr_time.h"
 #include "settings.h"
 
 /*
@@ -327,6 +328,9 @@ main(int argc, char *argv[])
 
 	PQsetNoticeProcessor(pset.db, NoticeProcessor, NULL);
 
+	/* initialize timing infrastructure (required for INSTR_* calls) */
+	pg_initialize_timing();
+
 	SyncVariables();
 
 	if (options.list_dbs)
diff --git a/src/common/Makefile b/src/common/Makefile
index 2c720caa509..1a2fbbe887f 100644
--- a/src/common/Makefile
+++ b/src/common/Makefile
@@ -59,6 +59,7 @@ OBJS_COMMON = \
 	file_perm.o \
 	file_utils.o \
 	hashfn.o \
+	instr_time.o \
 	ip.o \
 	jsonapi.o \
 	keywords.o \
diff --git a/src/common/instr_time.c b/src/common/instr_time.c
new file mode 100644
index 00000000000..9271113a287
--- /dev/null
+++ b/src/common/instr_time.c
@@ -0,0 +1,106 @@
+/*-------------------------------------------------------------------------
+ *
+ * instr_time.c
+ *	   Non-inline parts of the portable high-precision interval timing
+ *	 implementation
+ *
+ * Portions Copyright (c) 2026, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ *	  src/common/instr_time.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef FRONTEND
+#include "postgres.h"
+#else
+#include "postgres_fe.h"
+#endif
+
+#include "portability/instr_time.h"
+
+/*
+ * Stores what the number of ticks needs to be multiplied with to end up
+ * with nanoseconds using integer math.
+ *
+ * On certain platforms (currently Windows) the ticks to nanoseconds conversion
+ * requires floating point math because:
+ *
+ * sec = ticks / frequency_hz
+ * ns  = ticks / frequency_hz * 1,000,000,000
+ * ns  = ticks * (1,000,000,000 / frequency_hz)
+ * ns  = ticks * (1,000,000 / frequency_khz) <-- now in kilohertz
+ *
+ * Here, 'ns' is usually a floating point number. For example for a 2.5 GHz CPU
+ * the scaling factor becomes 1,000,000 / 2,500,000 = 0.4.
+ *
+ * To be able to use integer math we work around the lack of precision. We
+ * first scale the integer up (left shift by TICKS_TO_NS_SHIFT) and after the
+ * multiplication by the number of ticks in pg_ticks_to_ns() we shift right by
+ * the same amount.
+ *
+ * We remember the maximum number of ticks that can be multiplied by the scale
+ * factor without overflowing so we can check via a * b > max <=> a > max / b.
+ *
+ * However, as this is meant for interval measurements, it is unlikely that the
+ * overflow path is actually taken in typical scenarios, since overflows would
+ * only occur for intervals longer than 6.5 days.
+ *
+ * Note we utilize unsigned integers even though ticks are stored as a signed
+ * value to encourage compilers to generate better assembly, since we can be
+ * sure these values are not negative.
+ *
+ * On all other platforms we are using clock_gettime(), which uses nanoseconds
+ * as ticks. Hence, we set the multiplier to zero, which causes pg_ticks_to_ns
+ * to return the original value.
+ */
+uint64		ticks_per_ns_scaled = 0;
+uint64		max_ticks_no_overflow = 0;
+bool		timing_initialized = false;
+
+static void set_ticks_per_ns_system(void);
+
+/*
+ * Initializes timing infrastructure; repeated calls return immediately. Must
+ * be called before making any use of INSTR_* macros.
+ */
+void
+pg_initialize_timing(void)
+{
+	if (timing_initialized)
+		return;
+
+	set_ticks_per_ns_system();
+	timing_initialized = true;
+}
+
+#ifndef WIN32
+
+static void
+set_ticks_per_ns_system(void)
+{
+	ticks_per_ns_scaled = 0;	/* zero: ticks already are nanoseconds */
+	max_ticks_no_overflow = 0;	/* not consulted when the multiplier is zero */
+}
+
+#else							/* WIN32 */
+
+/* Returns the QueryPerformanceCounter() frequency in counts per second */
+static inline double
+GetTimerFrequency(void)
+{
+	LARGE_INTEGER f;
+
+	QueryPerformanceFrequency(&f);
+	return (double) f.QuadPart;
+}
+
+static void
+set_ticks_per_ns_system(void)
+{
+	ticks_per_ns_scaled = (NS_PER_S << TICKS_TO_NS_SHIFT) / GetTimerFrequency();
+	max_ticks_no_overflow = PG_INT64_MAX / ticks_per_ns_scaled;
+}
+
+#endif							/* WIN32 */
diff --git a/src/common/meson.build b/src/common/meson.build
index 4f9b8b8263d..9bd55cda95b 100644
--- a/src/common/meson.build
+++ b/src/common/meson.build
@@ -13,6 +13,7 @@ common_sources = files(
   'file_perm.c',
   'file_utils.c',
   'hashfn.c',
+  'instr_time.c',
   'ip.c',
   'jsonapi.c',
   'keywords.c',
diff --git a/src/include/portability/instr_time.h b/src/include/portability/instr_time.h
index 0a1fff7c487..115f5176317 100644
--- a/src/include/portability/instr_time.h
+++ b/src/include/portability/instr_time.h
@@ -22,7 +22,7 @@
  *
  * INSTR_TIME_ADD(x, y)				x += y
  *
- * INSTR_TIME_ADD_NANOSEC(t, n)		x += y in nanoseconds (converts to ticks)
+ * INSTR_TIME_ADD_NANOSEC(t, n)		t += n in nanoseconds (converts to ticks)
  *
  * INSTR_TIME_SUBTRACT(x, y)		x -= y
  *
@@ -80,11 +80,37 @@ typedef struct instr_time
 #define NS_PER_MS	INT64CONST(1000000)
 #define NS_PER_US	INT64CONST(1000)
 
+/* Shift amount for fixed-point ticks-to-nanoseconds conversion. */
+#define TICKS_TO_NS_SHIFT 14
 
-#ifndef WIN32
+/*
+ * PG_INSTR_TICKS_TO_NS controls whether pg_ticks_to_ns/pg_ns_to_ticks needs to
+ * check ticks_per_ns_scaled and potentially convert ticks <=> nanoseconds.
+ */
+#ifdef WIN32
+#define PG_INSTR_TICKS_TO_NS 1
+#else
+#define PG_INSTR_TICKS_TO_NS 0
+#endif
 
+/*
+ * Variables used to translate ticks to nanoseconds, initialized by
+ * pg_initialize_timing.
+ */
+extern PGDLLIMPORT uint64 ticks_per_ns_scaled;
+extern PGDLLIMPORT uint64 max_ticks_no_overflow;
+extern PGDLLIMPORT bool timing_initialized;
 
-/* Use clock_gettime() */
+/*
+ * Initialize timing infrastructure
+ *
+ * This must be called at least once before using INSTR_* macros.
+ */
+extern void pg_initialize_timing(void);
+
+#ifndef WIN32
+
+/* On POSIX, use clock_gettime() for system clock source */
 
 #include <time.h>
 
@@ -108,67 +134,119 @@ typedef struct instr_time
 #define PG_INSTR_CLOCK	CLOCK_REALTIME
 #endif
 
-/* helper for INSTR_TIME_SET_CURRENT */
 static inline instr_time
-pg_clock_gettime_ns(void)
+pg_get_ticks(void)
 {
 	instr_time	now;
 	struct timespec tmp;
 
+	Assert(timing_initialized);
+
 	clock_gettime(PG_INSTR_CLOCK, &tmp);
 	now.ticks = tmp.tv_sec * NS_PER_S + tmp.tv_nsec;
 
 	return now;
 }
 
-#define INSTR_TIME_SET_CURRENT(t) \
-	((t) = pg_clock_gettime_ns())
-
-#define INSTR_TIME_GET_NANOSEC(t) \
-	((int64) (t).ticks)
-
-#define INSTR_TIME_ADD_NANOSEC(t, n) \
-	((t).ticks += (n))
-
-
 #else							/* WIN32 */
 
+/* On Windows, use QueryPerformanceCounter() for system clock source */
 
-/* Use QueryPerformanceCounter() */
-
-/* helper for INSTR_TIME_SET_CURRENT */
 static inline instr_time
-pg_query_performance_counter(void)
+pg_get_ticks(void)
 {
 	instr_time	now;
 	LARGE_INTEGER tmp;
 
+	Assert(timing_initialized);
+
 	QueryPerformanceCounter(&tmp);
 	now.ticks = tmp.QuadPart;
 
 	return now;
 }
 
-static inline double
-GetTimerFrequency(void)
+#endif							/* WIN32 */
+
+static inline int64
+pg_ticks_to_ns(int64 ticks)
 {
-	LARGE_INTEGER f;
+#if PG_INSTR_TICKS_TO_NS
+	int64		ns = 0;
+
+	Assert(timing_initialized);
+
+	/*
+	 * Avoid doing work if we don't use scaled ticks, e.g. system clock on
+	 * Unix (in that case ticks are counted in nanoseconds)
+	 */
+	if (ticks_per_ns_scaled == 0)
+		return ticks;
+
+	/*
+	 * Would ticks * ticks_per_ns_scaled overflow? If so, compute in two parts.
+	 */
+	if (unlikely(ticks > (int64) max_ticks_no_overflow))
+	{
+		/*
+		 * To avoid overflow, first scale total ticks down by the fixed
+		 * factor (the right shift by TICKS_TO_NS_SHIFT), and *afterwards*
+		 * multiply them by the frequency-based scale factor.
+		 *
+		 * The remaining ticks (fewer than 1 << TICKS_TO_NS_SHIFT) follow the
+		 * regular formula, since they won't overflow.
+		 */
+		int64		count = ticks >> TICKS_TO_NS_SHIFT;
+
+		ns = count * ticks_per_ns_scaled;
+		ticks -= (count << TICKS_TO_NS_SHIFT);
+	}
+
+	ns += (ticks * ticks_per_ns_scaled) >> TICKS_TO_NS_SHIFT;
+
+	return ns;
+#else
+	Assert(timing_initialized);
 
-	QueryPerformanceFrequency(&f);
-	return (double) f.QuadPart;
+	return ticks;
+#endif							/* PG_INSTR_TICKS_TO_NS */
 }
 
-#define INSTR_TIME_SET_CURRENT(t) \
-	((t) = pg_query_performance_counter())
+static inline int64
+pg_ns_to_ticks(int64 ns)
+{
+#if PG_INSTR_TICKS_TO_NS
+	int64		ticks = 0;
 
-#define INSTR_TIME_GET_NANOSEC(t) \
-	((int64) ((t).ticks * ((double) NS_PER_S / GetTimerFrequency())))
+	Assert(timing_initialized);
 
-#define INSTR_TIME_ADD_NANOSEC(t, n) \
-	((t).ticks += ((n) / ((double) NS_PER_S / GetTimerFrequency())))
+	/*
+	 * If ticks_per_ns_scaled is zero, ticks are already in nanoseconds (e.g.
+	 * system clock on Unix).
+	 */
+	if (ticks_per_ns_scaled == 0)
+		return ns;
 
-#endif							/* WIN32 */
+	/*
+	 * Would ns << TICKS_TO_NS_SHIFT overflow? If so, split like pg_ticks_to_ns.
+	 */
+	if (unlikely(ns > (INT64_MAX >> TICKS_TO_NS_SHIFT)))
+	{
+		int64		count = ns / ticks_per_ns_scaled;
+
+		ticks = count << TICKS_TO_NS_SHIFT;
+		ns -= count * ticks_per_ns_scaled;
+	}
 
+	ticks += (ns << TICKS_TO_NS_SHIFT) / ticks_per_ns_scaled;
+
+	return ticks;
+#else
+	Assert(timing_initialized);
+
+	return ns;
+#endif							/* PG_INSTR_TICKS_TO_NS */
+}
 
 /*
  * Common macros
@@ -178,10 +256,16 @@ GetTimerFrequency(void)
 
 #define INSTR_TIME_SET_ZERO(t)	((t).ticks = 0)
 
+#define INSTR_TIME_SET_CURRENT(t) \
+	((t) = pg_get_ticks())
+
 
 #define INSTR_TIME_ADD(x,y) \
 	((x).ticks += (y).ticks)
 
+#define INSTR_TIME_ADD_NANOSEC(t, n) \
+	((t).ticks += pg_ns_to_ticks(n))
+
 #define INSTR_TIME_SUBTRACT(x,y) \
 	((x).ticks -= (y).ticks)
 
@@ -191,6 +275,9 @@ GetTimerFrequency(void)
 #define INSTR_TIME_GT(x,y) \
 	((x).ticks > (y).ticks)
 
+#define INSTR_TIME_GET_NANOSEC(t) \
+	(pg_ticks_to_ns((t).ticks))
+
 #define INSTR_TIME_GET_DOUBLE(t) \
 	((double) INSTR_TIME_GET_NANOSEC(t) / NS_PER_S)
 
diff --git a/src/test/regress/expected/misc_functions.out b/src/test/regress/expected/misc_functions.out
index cf55cdf3688..c3261bff209 100644
--- a/src/test/regress/expected/misc_functions.out
+++ b/src/test/regress/expected/misc_functions.out
@@ -850,3 +850,14 @@ SELECT oldest_multixact IS NULL AS null_result FROM pg_get_multixact_stats();
 
 RESET ROLE;
 DROP ROLE regress_multixact_funcs;
+-- test instr_time nanosecond<->ticks conversion
+CREATE FUNCTION test_instr_time()
+    RETURNS bool
+    AS :'regresslib'
+    LANGUAGE C;
+SELECT test_instr_time();
+ test_instr_time 
+-----------------
+ t
+(1 row)
+
diff --git a/src/test/regress/pg_regress.c b/src/test/regress/pg_regress.c
index 9a918156437..0c062056982 100644
--- a/src/test/regress/pg_regress.c
+++ b/src/test/regress/pg_regress.c
@@ -2181,6 +2181,8 @@ regression_main(int argc, char *argv[],
 	progname = get_progname(argv[0]);
 	set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_regress"));
 
+	pg_initialize_timing();
+
 	get_restricted_token();
 
 	atexit(stop_postmaster);
diff --git a/src/test/regress/regress.c b/src/test/regress/regress.c
index 68a01a1dde0..c2eaa96f086 100644
--- a/src/test/regress/regress.c
+++ b/src/test/regress/regress.c
@@ -38,6 +38,7 @@
 #include "optimizer/plancat.h"
 #include "parser/parse_coerce.h"
 #include "port/atomics.h"
+#include "portability/instr_time.h"
 #include "postmaster/postmaster.h"	/* for MAX_BACKENDS */
 #include "storage/spin.h"
 #include "tcop/tcopprot.h"
@@ -1384,3 +1385,38 @@ test_translation(PG_FUNCTION_ARGS)
 
 	PG_RETURN_VOID();
 }
+
+/* Verify the ns -> ticks -> ns roundtrip behaves correctly, including overflow */
+PG_FUNCTION_INFO_V1(test_instr_time);
+Datum
+test_instr_time(PG_FUNCTION_ARGS)
+{
+	instr_time	t;
+	int64		test_ns[] = {0, 1000, INT64CONST(1000000000000000)};
+	int64		max_err;
+
+	/*
+	 * The ns-to-ticks-to-ns roundtrip truncates in both directions, so the
+	 * result may fall short of the input but never exceed it. The maximum
+	 * shortfall depends on ticks_per_ns_scaled relative to the shift factor.
+	 */
+	max_err = (ticks_per_ns_scaled >> TICKS_TO_NS_SHIFT) + 1;
+
+	for (int i = 0; i < lengthof(test_ns); i++)
+	{
+		int64		result;
+
+		INSTR_TIME_SET_ZERO(t);
+		INSTR_TIME_ADD_NANOSEC(t, test_ns[i]);
+		result = INSTR_TIME_GET_NANOSEC(t);
+
+		if (result < test_ns[i] - max_err || result > test_ns[i])
+			elog(ERROR,
+				 "INSTR_TIME_GET_NANOSEC(t) yielded " INT64_FORMAT
+				 ", expected " INT64_FORMAT " (max_err " INT64_FORMAT
+				 ") in file \"%s\" line %u",
+				 result, test_ns[i], max_err, __FILE__, __LINE__);
+	}
+
+	PG_RETURN_BOOL(true);
+}
diff --git a/src/test/regress/sql/misc_functions.sql b/src/test/regress/sql/misc_functions.sql
index c8226652f2c..946ee5726cd 100644
--- a/src/test/regress/sql/misc_functions.sql
+++ b/src/test/regress/sql/misc_functions.sql
@@ -349,3 +349,10 @@ SET ROLE regress_multixact_funcs;
 SELECT oldest_multixact IS NULL AS null_result FROM pg_get_multixact_stats();
 RESET ROLE;
 DROP ROLE regress_multixact_funcs;
+
+-- test instr_time nanosecond<->ticks conversion
+CREATE FUNCTION test_instr_time()
+    RETURNS bool
+    AS :'regresslib'
+    LANGUAGE C;
+SELECT test_instr_time();