git.ipfire.org Git - thirdparty/postgresql.git/commitdiff
Allow retrieving x86 TSC frequency/flags from CPUID
author: Andres Freund <andres@anarazel.de>
Tue, 7 Apr 2026 16:48:07 +0000 (12:48 -0400)
committer: Andres Freund <andres@anarazel.de>
Tue, 7 Apr 2026 17:00:24 +0000 (13:00 -0400)
This adds additional x86 specific CPUID checks for flags needed for
determining whether the Time-Stamp Counter (TSC) is usable on a given system,
as well as a helper function to retrieve the TSC frequency from CPUID.

This is intended for a future patch that will utilize the TSC to lower the
overhead of timing instrumentation.

In passing, always make pg_cpuid_subleaf reset the variables used for its
result, to avoid accidentally using stale results if __get_cpuid_count errors
out and the caller doesn't check for it.

Author: Lukas Fittl <lukas@fittl.com>
Author: David Geier <geidav.pg@gmail.com>
Author: Andres Freund <andres@anarazel.de>
Reviewed-by: Andres Freund <andres@anarazel.de>
Reviewed-by: David Geier <geidav.pg@gmail.com>
Reviewed-by: John Naylor <john.naylor@postgresql.org>
Reviewed-by: Jakub Wartak <jakub.wartak@enterprisedb.com> (in an earlier version)
Discussion: https://www.postgresql.org/message-id/flat/20200612232810.f46nbqkdhbutzqdg%40alap3.anarazel.de

src/include/port/pg_cpu.h
src/port/pg_cpu_x86.c

index c5d96bb4f479ff8a5bd559987f11a0414c7b9c5b..a5d42f1b68d10e01966f1a3aa068c536dce3aac1 100644 (file)
@@ -32,8 +32,16 @@ typedef enum X86FeatureId
        PG_AVX512_VL,
        PG_AVX512_VPCLMULQDQ,
        PG_AVX512_VPOPCNTDQ,
+
+       /* identification */
+       PG_HYPERVISOR,
+
+       /* Time-Stamp Counter (TSC) flags */
+       PG_RDTSCP,
+       PG_TSC_INVARIANT,
+       PG_TSC_ADJUST,
 } X86FeatureId;
-#define X86FeaturesSize (PG_AVX512_VPOPCNTDQ + 1)
+#define X86FeaturesSize (PG_TSC_ADJUST + 1)
 
 extern PGDLLIMPORT bool X86Features[];
 
@@ -48,6 +56,8 @@ x86_feature_available(X86FeatureId feature)
        return X86Features[feature];
 }
 
+extern uint32 x86_tsc_frequency_khz(void);
+
 #endif                                                 /* defined(USE_SSE2) || defined(__i386__) */
 
 #endif                                                 /* PG_CPU_H */
index 40ff78633ca3fb256a35d1472ce8c26e81a33848..3844da511fd3ddabb952de94d4fa6b152736ab25 100644 (file)
@@ -80,13 +80,13 @@ pg_cpuid(int leaf, unsigned int *reg)
 static inline bool
 pg_cpuid_subleaf(int leaf, int subleaf, unsigned int *reg)
 {
+       memset(reg, 0, 4 * sizeof(unsigned int));
 #if defined(HAVE__GET_CPUID_COUNT)
        return __get_cpuid_count(leaf, subleaf, &reg[EAX], &reg[EBX], &reg[ECX], &reg[EDX]) == 1;
 #elif defined(HAVE__CPUIDEX)
        __cpuidex((int *) reg, leaf, subleaf);
        return true;
 #else
-       memset(reg, 0, 4 * sizeof(unsigned int));
        return false;
 #endif
 }
@@ -101,19 +101,24 @@ void
 set_x86_features(void)
 {
        unsigned int reg[4] = {0};
+       bool            have_osxsave;
 
        pg_cpuid(0x01, reg);
 
        X86Features[PG_SSE4_2] = reg[ECX] >> 20 & 1;
        X86Features[PG_POPCNT] = reg[ECX] >> 23 & 1;
+       X86Features[PG_HYPERVISOR] = reg[ECX] >> 31 & 1;
+       have_osxsave = reg[ECX] >> 27 & 1;
+
+       pg_cpuid_subleaf(0x07, 0, reg);
+
+       X86Features[PG_TSC_ADJUST] = reg[EBX] >> 1 & 1;
 
        /* leaf 7 features that depend on OSXSAVE */
-       if (reg[ECX] & (1 << 27))
+       if (have_osxsave)
        {
                uint32          xcr0_val = 0;
 
-               pg_cpuid_subleaf(0x07, 0, reg);
-
 #ifdef HAVE_XSAVE_INTRINSICS
                /* get value of Extended Control Register */
                xcr0_val = _xgetbv(0);
@@ -135,7 +140,132 @@ set_x86_features(void)
                }
        }
 
+       /* Check for other TSC related flags */
+       pg_cpuid(0x80000001, reg);
+       X86Features[PG_RDTSCP] = reg[EDX] >> 27 & 1;
+
+       pg_cpuid(0x80000007, reg);
+       X86Features[PG_TSC_INVARIANT] = reg[EDX] >> 8 & 1;
+
        X86Features[INIT_PG_X86] = true;
 }
 
+/* TSC (Time-stamp Counter) handling code */
+
+static uint32 x86_hypervisor_tsc_frequency_khz(void);
+
+/*
+ * Determine the TSC frequency of the CPU through CPUID, where supported.
+ *
+ * Needed to interpret the tick value returned by RDTSC/RDTSCP. Return value of
+ * 0 indicates the frequency information was not accessible via CPUID.
+ */
+uint32
+x86_tsc_frequency_khz(void)
+{
+       unsigned int reg[4] = {0};
+
+       if (x86_feature_available(PG_HYPERVISOR))
+       {
+               uint32          freq = x86_hypervisor_tsc_frequency_khz();
+
+               /*
+                * If the hypervisor specific logic didn't figure out the frequency,
+                * it's possible (although not likely, as often that's hidden from
+                * guests) that the non-virtualized logic can figure out the
+                * frequency.
+                */
+               if (freq > 0)
+                       return freq;
+       }
+
+       /*
+        * On modern Intel CPUs, the TSC is implemented by invariant timekeeping
+        * hardware, also called "Always Running Timer", or ART. The ART stays
+        * consistent even if the CPU changes frequency due to changing power
+        * levels.
+        *
+        * As documented in "Determining the Processor Base Frequency" in the
+        * "Intel® 64 and IA-32 Architectures Software Developer's Manual",
+        * February 2026 Edition, we can get the TSC frequency as follows:
+        *
+        * Nominal TSC frequency = ( CPUID.15H:ECX[31:0] * CPUID.15H:EBX[31:0] ) /
+        * CPUID.15H:EAX[31:0]
+        *
+        * With CPUID.15H:ECX representing the nominal core crystal clock
+        * frequency, and EAX/EBX representing values used to translate the TSC
+        * value to that frequency, see Chapter 20.17 "Time-Stamp Counter" of
+        * that manual.
+        *
+        * Older Intel CPUs and other vendors do not set CPUID.15H:ECX, and as
+        * such we fall back to alternate approaches.
+        */
+       pg_cpuid(0x15, reg);
+       if (reg[ECX] > 0)
+       {
+               /*
+                * EBX not being set indicates invariant TSC is not available. Require
+                * EAX being non-zero too, to avoid a theoretical divide by zero.
+                */
+               if (reg[EAX] == 0 || reg[EBX] == 0)
+                       return 0;
+
+               return reg[ECX] / 1000 * reg[EBX] / reg[EAX];
+       }
+
+       /*
+        * When CPUID.15H is not available/incomplete, we can instead try to get
+        * the processor base frequency in MHz from CPUID.16H:EAX, the "Processor
+        * Frequency Information Leaf".
+        */
+       pg_cpuid(0x16, reg);
+       if (reg[EAX] > 0)
+               return reg[EAX] * 1000;
+
+       return 0;
+}
+
+/*
+ * Support for reading TSC frequency for hypervisors passing it to a guest VM.
+ *
+ * Two hypervisors (VMware and KVM) are known to make TSC frequency in kHz
+ * available at the vendor-specific 0x40000010 leaf in the EAX register.
+ *
+ * For some other hypervisors that have an invariant TSC, e.g. Hyper-V, we would
+ * need to access a model-specific register (MSR) to get the frequency. MSRs are
+ * separate from CPUID and typically not available for unprivileged processes,
+ * so we can't get the frequency this way.
+ */
+#define CPUID_HYPERVISOR_VMWARE(r) (r[EBX] == 0x61774d56 && r[ECX] == 0x4d566572 && r[EDX] == 0x65726177)      /* VMwareVMware */
+#define CPUID_HYPERVISOR_KVM(r) (r[EBX] == 0x4b4d564b && r[ECX] == 0x564b4d56 && r[EDX] == 0x0000004d) /* KVMKVMKVM */
+static uint32
+x86_hypervisor_tsc_frequency_khz(void)
+{
+       unsigned int reg[4] = {0};
+
+#if defined(HAVE__CPUIDEX)
+
+       /*
+        * The hypervisor is determined using the 0x40000000 Hypervisor
+        * information leaf, which requires use of __cpuidex to set ECX to 0 to
+        * access it.
+        *
+        * The similar __get_cpuid_count function does not work as expected since
+        * it contains a check for __get_cpuid_max, which has been observed to be
+        * lower than the special Hypervisor leaf, despite it being available.
+        */
+       __cpuidex((int *) reg, 0x40000000, 0);
+
+       if (reg[EAX] >= 0x40000010 && (CPUID_HYPERVISOR_VMWARE(reg) || CPUID_HYPERVISOR_KVM(reg)))
+       {
+               __cpuidex((int *) reg, 0x40000010, 0);
+               if (reg[EAX] > 0)
+                       return reg[EAX];
+       }
+#endif                                                 /* HAVE__CPUIDEX */
+
+       return 0;
+}
+
+
 #endif                                                 /* defined(USE_SSE2) || defined(__i386__) */