From: Andres Freund Date: Tue, 7 Apr 2026 16:48:07 +0000 (-0400) Subject: instrumentation: Standardize ticks to nanosecond conversion method X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=0022622c93d9e6419cb47110c58af87a74994ea6;p=thirdparty%2Fpostgresql.git instrumentation: Standardize ticks to nanosecond conversion method The timing infrastructure (INSTR_* macros) measures time elapsed using clock_gettime() on POSIX systems, which returns the time as nanoseconds, and QueryPerformanceCounter() on Windows, which is a specialized timing clock source that returns a tick counter that needs to be converted to nanoseconds using the result of QueryPerformanceFrequency(). This conversion currently happens ad-hoc on Windows, e.g. when calling INSTR_TIME_GET_NANOSEC, which calls QueryPerformanceFrequency() on every invocation, despite the frequency being stable after program start, incurring unnecessary overhead. It also causes a fractured implementation where macros are defined differently between platforms. To ease code readability, and prepare for a future change that intends to use a ticks-to-nanosecond conversion on x86-64 for TSC use, introduce new pg_ticks_to_ns() / pg_ns_to_ticks() functions that get called from INSTR_* macros on all platforms. These functions rely on a separately initialized ticks_per_ns_scaled value, that represents the conversion ratio. This value is initialized from QueryPerformanceFrequency() on Windows, and set to zero on x86-64 POSIX systems, which results in the ticks being treated as nanoseconds. Other architectures always directly return the original ticks. To support this, pg_initialize_timing() is introduced, and is now mandatory for both the backend and any frontend programs to call before utilizing INSTR_* macros. In passing, fix variable names in comment documenting INSTR_TIME_ADD_NANOSEC(). 
Author: Lukas Fittl Author: David Geier Author: Andres Freund Reviewed-by: Andres Freund Reviewed-by: David Geier Reviewed-by: Lukas Fittl Reviewed-by: Zsolt Parragi Discussion: https://www.postgresql.org/message-id/flat/20200612232810.f46nbqkdhbutzqdg%40alap3.anarazel.de --- diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 6f13e8f40a0..ae829747004 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -1954,6 +1954,9 @@ InitProcessGlobals(void) MyStartTimestamp = GetCurrentTimestamp(); MyStartTime = timestamptz_to_time_t(MyStartTimestamp); + /* initialize timing infrastructure (required for INSTR_* calls) */ + pg_initialize_timing(); + /* * Set a different global seed in every process. We want something * unpredictable, so if possible, use high-quality random bits for the diff --git a/src/bin/pg_test_timing/pg_test_timing.c b/src/bin/pg_test_timing/pg_test_timing.c index aee41dbe3f9..513ae88cafc 100644 --- a/src/bin/pg_test_timing/pg_test_timing.c +++ b/src/bin/pg_test_timing/pg_test_timing.c @@ -43,6 +43,9 @@ main(int argc, char *argv[]) handle_args(argc, argv); + /* initialize timing infrastructure (required for INSTR_* calls) */ + pg_initialize_timing(); + loop_count = test_timing(test_duration); output(loop_count); diff --git a/src/bin/pgbench/pgbench.c b/src/bin/pgbench/pgbench.c index 1dae918cc09..c969afab3a5 100644 --- a/src/bin/pgbench/pgbench.c +++ b/src/bin/pgbench/pgbench.c @@ -6820,6 +6820,9 @@ main(int argc, char **argv) int exit_code = 0; struct timeval tv; + /* initialize timing infrastructure (required for INSTR_* calls) */ + pg_initialize_timing(); + /* * Record difference between Unix time and instr_time time. We'll use * this for logging and aggregation. 
diff --git a/src/bin/psql/startup.c b/src/bin/psql/startup.c index 9a397ec87b7..69d044d405d 100644 --- a/src/bin/psql/startup.c +++ b/src/bin/psql/startup.c @@ -24,6 +24,7 @@ #include "help.h" #include "input.h" #include "mainloop.h" +#include "portability/instr_time.h" #include "settings.h" /* @@ -327,6 +328,9 @@ main(int argc, char *argv[]) PQsetNoticeProcessor(pset.db, NoticeProcessor, NULL); + /* initialize timing infrastructure (required for INSTR_* calls) */ + pg_initialize_timing(); + SyncVariables(); if (options.list_dbs) diff --git a/src/common/Makefile b/src/common/Makefile index 2c720caa509..1a2fbbe887f 100644 --- a/src/common/Makefile +++ b/src/common/Makefile @@ -59,6 +59,7 @@ OBJS_COMMON = \ file_perm.o \ file_utils.o \ hashfn.o \ + instr_time.o \ ip.o \ jsonapi.o \ keywords.o \ diff --git a/src/common/instr_time.c b/src/common/instr_time.c new file mode 100644 index 00000000000..9271113a287 --- /dev/null +++ b/src/common/instr_time.c @@ -0,0 +1,106 @@ +/*------------------------------------------------------------------------- + * + * instr_time.c + * Non-inline parts of the portable high-precision interval timing + * implementation + * + * Portions Copyright (c) 2026, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/common/instr_time.c + * + *------------------------------------------------------------------------- + */ +#ifndef FRONTEND +#include "postgres.h" +#else +#include "postgres_fe.h" +#endif + +#include "portability/instr_time.h" + +/* + * Stores what the number of ticks needs to be multiplied with to end up + * with nanoseconds using integer math. + * + * On certain platforms (currently Windows) the ticks to nanoseconds conversion + * requires floating point math because: + * + * sec = ticks / frequency_hz + * ns = ticks / frequency_hz * 1,000,000,000 + * ns = ticks * (1,000,000,000 / frequency_hz) + * ns = ticks * (1,000,000 / frequency_khz) <-- now in kilohertz + * + * Here, 'ns' is usually a floating point number. 
For example for a 2.5 GHz CPU + * the scaling factor becomes 1,000,000 / 2,500,000 = 0.4. + * + * To be able to use integer math we work around the lack of precision. We + * first scale the integer up (left shift by TICKS_TO_NS_SHIFT) and after the + * multiplication by the number of ticks in pg_ticks_to_ns() we shift right by + * the same amount. + * + * We remember the maximum number of ticks that can be multiplied by the scale + * factor without overflowing so we can check via a * b > max <=> a > max / b. + * + * However, as this is meant for interval measurements, it is unlikely that the + * overflow path is actually taken in typical scenarios, since overflows would + * only occur for intervals longer than 6.5 days. + * + * Note we utilize unsigned integers even though ticks are stored as a signed + * value to encourage compilers to generate better assembly, since we can be + * sure these values are not negative. + * + * On all other platforms we are using clock_gettime(), which uses nanoseconds + * as ticks. Hence, we set the multiplier to zero, which causes pg_ticks_to_ns + * to return the original value. + */ +uint64 ticks_per_ns_scaled = 0; +uint64 max_ticks_no_overflow = 0; +bool timing_initialized = false; + +static void set_ticks_per_ns_system(void); + +/* + * Initializes timing infrastructure. Must be called before making any use + * of INSTR* macros. 
+ */ +void +pg_initialize_timing(void) +{ + if (timing_initialized) + return; + + set_ticks_per_ns_system(); + timing_initialized = true; +} + +#ifndef WIN32 + +static void +set_ticks_per_ns_system(void) +{ + ticks_per_ns_scaled = 0; + max_ticks_no_overflow = 0; +} + +#else /* WIN32 */ + +/* GetTimerFrequency returns counts per second */ +static inline double +GetTimerFrequency(void) +{ + LARGE_INTEGER f; + + QueryPerformanceFrequency(&f); + return (double) f.QuadPart; +} + +static void +set_ticks_per_ns_system(void) +{ + ticks_per_ns_scaled = (NS_PER_S << TICKS_TO_NS_SHIFT) / GetTimerFrequency(); + max_ticks_no_overflow = PG_INT64_MAX / ticks_per_ns_scaled; +} + +#endif /* WIN32 */ diff --git a/src/common/meson.build b/src/common/meson.build index 4f9b8b8263d..9bd55cda95b 100644 --- a/src/common/meson.build +++ b/src/common/meson.build @@ -13,6 +13,7 @@ common_sources = files( 'file_perm.c', 'file_utils.c', 'hashfn.c', + 'instr_time.c', 'ip.c', 'jsonapi.c', 'keywords.c', diff --git a/src/include/portability/instr_time.h b/src/include/portability/instr_time.h index 0a1fff7c487..115f5176317 100644 --- a/src/include/portability/instr_time.h +++ b/src/include/portability/instr_time.h @@ -22,7 +22,7 @@ * * INSTR_TIME_ADD(x, y) x += y * - * INSTR_TIME_ADD_NANOSEC(t, n) x += y in nanoseconds (converts to ticks) + * INSTR_TIME_ADD_NANOSEC(t, n) t += n in nanoseconds (converts to ticks) * * INSTR_TIME_SUBTRACT(x, y) x -= y * @@ -80,11 +80,37 @@ typedef struct instr_time #define NS_PER_MS INT64CONST(1000000) #define NS_PER_US INT64CONST(1000) +/* Shift amount for fixed-point ticks-to-nanoseconds conversion. */ +#define TICKS_TO_NS_SHIFT 14 -#ifndef WIN32 +/* + * PG_INSTR_TICKS_TO_NS controls whether pg_ticks_to_ns/pg_ns_to_ticks needs to + * check ticks_per_ns_scaled and potentially convert ticks <=> nanoseconds. 
+ */ +#ifdef WIN32 +#define PG_INSTR_TICKS_TO_NS 1 +#else +#define PG_INSTR_TICKS_TO_NS 0 +#endif +/* + * Variables used to translate ticks to nanoseconds, initialized by + * pg_initialize_timing. + */ +extern PGDLLIMPORT uint64 ticks_per_ns_scaled; +extern PGDLLIMPORT uint64 max_ticks_no_overflow; +extern PGDLLIMPORT bool timing_initialized; -/* Use clock_gettime() */ +/* + * Initialize timing infrastructure + * + * This must be called at least once before using INSTR_TIME_SET_CURRENT* macros. + */ +extern void pg_initialize_timing(void); + +#ifndef WIN32 + +/* On POSIX, use clock_gettime() for system clock source */ #include <time.h> @@ -108,67 +134,119 @@ typedef struct instr_time #define PG_INSTR_CLOCK CLOCK_REALTIME #endif -/* helper for INSTR_TIME_SET_CURRENT */ static inline instr_time -pg_clock_gettime_ns(void) +pg_get_ticks(void) { instr_time now; struct timespec tmp; + Assert(timing_initialized); + clock_gettime(PG_INSTR_CLOCK, &tmp); now.ticks = tmp.tv_sec * NS_PER_S + tmp.tv_nsec; return now; } -#define INSTR_TIME_SET_CURRENT(t) \ - ((t) = pg_clock_gettime_ns()) - -#define INSTR_TIME_GET_NANOSEC(t) \ - ((int64) (t).ticks) - -#define INSTR_TIME_ADD_NANOSEC(t, n) \ - ((t).ticks += (n)) - - #else /* WIN32 */ +/* On Windows, use QueryPerformanceCounter() for system clock source */ -/* Use QueryPerformanceCounter() */ - -/* helper for INSTR_TIME_SET_CURRENT */ static inline instr_time -pg_query_performance_counter(void) +pg_get_ticks(void) { instr_time now; LARGE_INTEGER tmp; + Assert(timing_initialized); + QueryPerformanceCounter(&tmp); now.ticks = tmp.QuadPart; return now; } -static inline double -GetTimerFrequency(void) +#endif /* WIN32 */ + +static inline int64 +pg_ticks_to_ns(int64 ticks) { - LARGE_INTEGER f; +#if PG_INSTR_TICKS_TO_NS + int64 ns = 0; + + Assert(timing_initialized); + + /* + * Avoid doing work if we don't use scaled ticks, e.g.
system clock on + * Unix (in that case ticks is counted in nanoseconds) + */ + if (ticks_per_ns_scaled == 0) + return ticks; + + /* + * Would multiplication overflow? If so perform computation in two parts. + */ + if (unlikely(ticks > (int64) max_ticks_no_overflow)) + { + /* + * To avoid overflow, first scale total ticks down by the fixed + * factor, and *afterwards* multiply them by the frequency-based scale + * factor. + * + * The remaining ticks can follow the regular formula, since they + * won't overflow. + */ + int64 count = ticks >> TICKS_TO_NS_SHIFT; + + ns = count * ticks_per_ns_scaled; + ticks -= (count << TICKS_TO_NS_SHIFT); + } + + ns += (ticks * ticks_per_ns_scaled) >> TICKS_TO_NS_SHIFT; + + return ns; +#else + Assert(timing_initialized); - QueryPerformanceFrequency(&f); - return (double) f.QuadPart; + return ticks; +#endif /* PG_INSTR_TICKS_TO_NS */ } -#define INSTR_TIME_SET_CURRENT(t) \ - ((t) = pg_query_performance_counter()) +static inline int64 +pg_ns_to_ticks(int64 ns) +{ +#if PG_INSTR_TICKS_TO_NS + int64 ticks = 0; -#define INSTR_TIME_GET_NANOSEC(t) \ - ((int64) ((t).ticks * ((double) NS_PER_S / GetTimerFrequency()))) + Assert(timing_initialized); -#define INSTR_TIME_ADD_NANOSEC(t, n) \ - ((t).ticks += ((n) / ((double) NS_PER_S / GetTimerFrequency()))) + /* + * If ticks_per_ns_scaled is zero, ticks are already in nanoseconds (e.g. + * system clock on Unix). + */ + if (ticks_per_ns_scaled == 0) + return ns; -#endif /* WIN32 */ + /* + * The reverse of pg_ticks_to_ns to avoid a similar overflow problem. 
+ */ + if (unlikely(ns > (INT64_MAX >> TICKS_TO_NS_SHIFT))) + { + int64 count = ns / ticks_per_ns_scaled; + + ticks = count << TICKS_TO_NS_SHIFT; + ns -= count * ticks_per_ns_scaled; + } + ticks += (ns << TICKS_TO_NS_SHIFT) / ticks_per_ns_scaled; + + return ticks; +#else + Assert(timing_initialized); + + return ns; +#endif /* PG_INSTR_TICKS_TO_NS */ +} /* * Common macros @@ -178,10 +256,16 @@ GetTimerFrequency(void) #define INSTR_TIME_SET_ZERO(t) ((t).ticks = 0) +#define INSTR_TIME_SET_CURRENT(t) \ + ((t) = pg_get_ticks()) + #define INSTR_TIME_ADD(x,y) \ ((x).ticks += (y).ticks) +#define INSTR_TIME_ADD_NANOSEC(t, n) \ + ((t).ticks += pg_ns_to_ticks(n)) + #define INSTR_TIME_SUBTRACT(x,y) \ ((x).ticks -= (y).ticks) @@ -191,6 +275,9 @@ GetTimerFrequency(void) #define INSTR_TIME_GT(x,y) \ ((x).ticks > (y).ticks) +#define INSTR_TIME_GET_NANOSEC(t) \ + (pg_ticks_to_ns((t).ticks)) + #define INSTR_TIME_GET_DOUBLE(t) \ ((double) INSTR_TIME_GET_NANOSEC(t) / NS_PER_S) diff --git a/src/test/regress/expected/misc_functions.out b/src/test/regress/expected/misc_functions.out index cf55cdf3688..c3261bff209 100644 --- a/src/test/regress/expected/misc_functions.out +++ b/src/test/regress/expected/misc_functions.out @@ -850,3 +850,14 @@ SELECT oldest_multixact IS NULL AS null_result FROM pg_get_multixact_stats(); RESET ROLE; DROP ROLE regress_multixact_funcs; +-- test instr_time nanosecond<->ticks conversion +CREATE FUNCTION test_instr_time() + RETURNS bool + AS :'regresslib' + LANGUAGE C; +SELECT test_instr_time(); + test_instr_time +----------------- + t +(1 row) + diff --git a/src/test/regress/pg_regress.c b/src/test/regress/pg_regress.c index 9a918156437..0c062056982 100644 --- a/src/test/regress/pg_regress.c +++ b/src/test/regress/pg_regress.c @@ -2181,6 +2181,8 @@ regression_main(int argc, char *argv[], progname = get_progname(argv[0]); set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_regress")); + pg_initialize_timing(); + get_restricted_token(); atexit(stop_postmaster); diff 
--git a/src/test/regress/regress.c b/src/test/regress/regress.c index 68a01a1dde0..c2eaa96f086 100644 --- a/src/test/regress/regress.c +++ b/src/test/regress/regress.c @@ -38,6 +38,7 @@ #include "optimizer/plancat.h" #include "parser/parse_coerce.h" #include "port/atomics.h" +#include "portability/instr_time.h" #include "postmaster/postmaster.h" /* for MAX_BACKENDS */ #include "storage/spin.h" #include "tcop/tcopprot.h" @@ -1384,3 +1385,38 @@ test_translation(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } + +/* Verify that the ns -> ticks -> ns round trip (pg_ns_to_ticks, pg_ticks_to_ns) behaves correctly, including overflow */ +PG_FUNCTION_INFO_V1(test_instr_time); +Datum +test_instr_time(PG_FUNCTION_ARGS) +{ + instr_time t; + int64 test_ns[] = {0, 1000, INT64CONST(1000000000000000)}; + int64 max_err; + + /* + * The ns-to-ticks-to-ns roundtrip may lose precision due to integer + * truncation in the fixed-point conversion. The maximum error depends on + * ticks_per_ns_scaled relative to the shift factor. + */ + max_err = (ticks_per_ns_scaled >> TICKS_TO_NS_SHIFT) + 1; + + for (int i = 0; i < lengthof(test_ns); i++) + { + int64 result; + + INSTR_TIME_SET_ZERO(t); + INSTR_TIME_ADD_NANOSEC(t, test_ns[i]); + result = INSTR_TIME_GET_NANOSEC(t); + + if (result < test_ns[i] - max_err || result > test_ns[i]) + elog(ERROR, + "INSTR_TIME_GET_NANOSEC(t) yielded " INT64_FORMAT + ", expected " INT64_FORMAT " (max_err " INT64_FORMAT + ") in file \"%s\" line %u", + result, test_ns[i], max_err, __FILE__, __LINE__); + } + + PG_RETURN_BOOL(true); +} diff --git a/src/test/regress/sql/misc_functions.sql b/src/test/regress/sql/misc_functions.sql index c8226652f2c..946ee5726cd 100644 --- a/src/test/regress/sql/misc_functions.sql +++ b/src/test/regress/sql/misc_functions.sql @@ -349,3 +349,10 @@ SET ROLE regress_multixact_funcs; SELECT oldest_multixact IS NULL AS null_result FROM pg_get_multixact_stats(); RESET ROLE; DROP ROLE regress_multixact_funcs; + +-- test instr_time nanosecond<->ticks conversion +CREATE FUNCTION test_instr_time() + RETURNS
bool + AS :'regresslib' + LANGUAGE C; +SELECT test_instr_time();