From 0b096e379e6f9bd49d38020d880a7da337e570ad Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Tue, 8 Jul 2025 11:23:15 -0400 Subject: [PATCH] Change pg_test_timing to measure in nanoseconds not microseconds. Most of our platforms have better-than-microsecond timing resolution, so the original definition of this program is getting less and less useful. Make it report nanoseconds not microseconds. Also, add a second output table that reports the exact observed timing durations, up to a limit of 1024 ns; and be sure to report the largest observed duration. The documentation for this program included a lot of system-specific details that now seem largely obsolete. Move all that text to the PG wiki, where perhaps it will be easier to maintain and update. Also, improve the TAP test so that it actually runs a short standard run, allowing most of the code to be exercised; its coverage before was abysmal. Author: Hannu Krosing Co-authored-by: Tom Lane Discussion: https://postgr.es/m/be0339cc-1ae1-4892-9445-8e6d8995a44d@eisentraut.org --- doc/src/sgml/ref/pgtesttiming.sgml | 279 ++++++++---------------- src/bin/pg_test_timing/pg_test_timing.c | 172 ++++++++++++--- src/bin/pg_test_timing/t/001_basic.pl | 17 ++ 3 files changed, 251 insertions(+), 217 deletions(-) diff --git a/doc/src/sgml/ref/pgtesttiming.sgml b/doc/src/sgml/ref/pgtesttiming.sgml index a5eb3aa25e0..1fcdbf7f06e 100644 --- a/doc/src/sgml/ref/pgtesttiming.sgml +++ b/doc/src/sgml/ref/pgtesttiming.sgml @@ -30,11 +30,23 @@ PostgreSQL documentation Description - pg_test_timing is a tool to measure the timing overhead - on your system and confirm that the system time never moves backwards. + pg_test_timing is a tool to measure the + timing overhead on your system and confirm that the system time never + moves backwards. It simply reads the system clock over and over again + as fast as it can for a specified length of time, and then prints + statistics about the observed differences in successive clock readings. + + + Smaller (but not zero) differences are better, since they imply both + more-precise clock hardware and less overhead to collect a clock reading. Systems that are slow to collect timing data can give less accurate EXPLAIN ANALYZE results. + + This tool is also helpful to determine if + the track_io_timing configuration parameter is likely + to produce useful results. + @@ -59,6 +71,21 @@ PostgreSQL documentation + + + + + + Specifies the cutoff percentage for the list of exact observed + timing durations (that is, the changes in the system clock value + from one reading to the next). The list will end once the running + percentage total reaches or exceeds this value, except that the + largest observed duration will always be printed. The default + cutoff is 99.99. + + + + @@ -92,205 +119,83 @@ PostgreSQL documentation Interpreting Results - Good results will show most (>90%) individual timing calls take less than - one microsecond. Average per loop overhead will be even lower, below 100 - nanoseconds. This example from an Intel i7-860 system using a TSC clock - source shows excellent performance: - - + The first block of output has four columns, with rows showing a + shifted-by-one log2(ns) histogram of timing durations (that is, the + differences between successive clock readings). This is not the + classic log2(n+1) histogram as it counts zeros separately and then + switches to log2(ns) starting from value 1. - - Note that different units are used for the per loop time than the - histogram. The loop can have resolution within a few nanoseconds (ns), - while the individual timing calls can only resolve down to one microsecond - (us). + The columns are: + + + nanosecond value that is >= the durations in this + bucket + + + percentage of durations in this bucket + + + running-sum percentage of durations in this and previous + buckets + + + count of durations in this bucket + + - - - - Measuring Executor Timing Overhead - - When the query executor is running a statement using - EXPLAIN ANALYZE, individual operations are timed as well - as showing a summary. The overhead of your system can be checked by - counting rows with the psql program: - - -CREATE TABLE t AS SELECT * FROM generate_series(1,100000); -\timing -SELECT COUNT(*) FROM t; -EXPLAIN ANALYZE SELECT COUNT(*) FROM t; - + The second block of output goes into more detail, showing the exact + timing differences observed. For brevity this list is cut off when the + running-sum percentage exceeds the user-selectable cutoff value. + However, the largest observed difference is always shown. - - The i7-860 system measured runs the count query in 9.8 ms while - the EXPLAIN ANALYZE version takes 16.6 ms, each - processing just over 100,000 rows. That 6.8 ms difference means the timing - overhead per row is 68 ns, about twice what pg_test_timing estimated it - would be. Even that relatively small amount of overhead is making the fully - timed count statement take almost 70% longer. On more substantial queries, - the timing overhead would be less problematic. + The example results below show that 99.99% of timing loops took between + 8 and 31 nanoseconds, with the worst case somewhere between 32768 and + 65535 nanoseconds. In the second block, we can see that typical loop + time is 16 nanoseconds, and the readings appear to have full nanosecond + precision. - - - - Changing Time Sources - On some newer Linux systems, it's possible to change the clock source used - to collect timing data at any time. A second example shows the slowdown - possible from switching to the slower acpi_pm time source, on the same - system used for the fast results above: - /sys/devices/system/clocksource/clocksource0/current_clocksource -# pg_test_timing -Per loop time including overhead: 722.92 ns +Testing timing overhead for 3 seconds. +Per loop time including overhead: 16.40 ns Histogram of timing durations: - < us % of total count - 1 27.84870 1155682 - 2 72.05956 2990371 - 4 0.07810 3241 - 8 0.01357 563 - 16 0.00007 3 + <= ns % of total running % count + 0 0.0000 0.0000 0 + 1 0.0000 0.0000 0 + 3 0.0000 0.0000 0 + 7 0.0000 0.0000 0 + 15 4.5452 4.5452 8313178 + 31 95.4527 99.9979 174581501 + 63 0.0001 99.9981 253 + 127 0.0001 99.9982 165 + 255 0.0000 99.9982 35 + 511 0.0000 99.9982 1 + 1023 0.0013 99.9994 2300 + 2047 0.0004 99.9998 690 + 4095 0.0000 99.9998 9 + 8191 0.0000 99.9998 8 + 16383 0.0002 100.0000 337 + 32767 0.0000 100.0000 2 + 65535 0.0000 100.0000 1 + +Observed timing durations up to 99.9900%: + ns % of total running % count + 15 4.5452 4.5452 8313178 + 16 58.3785 62.9237 106773354 + 17 33.6840 96.6078 61607584 + 18 3.1151 99.7229 5697480 + 19 0.2638 99.9867 482570 + 20 0.0093 99.9960 17054 +... + 38051 0.0000 100.0000 1 ]]> - - In this configuration, the sample EXPLAIN ANALYZE above - takes 115.9 ms. That's 1061 ns of timing overhead, again a small multiple - of what's measured directly by this utility. That much timing overhead - means the actual query itself is only taking a tiny fraction of the - accounted for time, most of it is being consumed in overhead instead. In - this configuration, any EXPLAIN ANALYZE totals involving - many timed operations would be inflated significantly by timing overhead. - - - - FreeBSD also allows changing the time source on the fly, and it logs - information about the timer selected during boot: - - -# dmesg | grep "Timecounter" -Timecounter "ACPI-fast" frequency 3579545 Hz quality 900 -Timecounter "i8254" frequency 1193182 Hz quality 0 -Timecounters tick every 10.000 msec -Timecounter "TSC" frequency 2531787134 Hz quality 800 -# sysctl kern.timecounter.hardware=TSC -kern.timecounter.hardware: ACPI-fast -> TSC - - - - - Other systems may only allow setting the time source on boot. On older - Linux systems the "clock" kernel setting is the only way to make this sort - of change. And even on some more recent ones, the only option you'll see - for a clock source is "jiffies". Jiffies are the older Linux software clock - implementation, which can have good resolution when it's backed by fast - enough timing hardware, as in this example: - - - - - - Clock Hardware and Timing Accuracy - - - Collecting accurate timing information is normally done on computers using - hardware clocks with various levels of accuracy. With some hardware the - operating systems can pass the system clock time almost directly to - programs. A system clock can also be derived from a chip that simply - provides timing interrupts, periodic ticks at some known time interval. In - either case, operating system kernels provide a clock source that hides - these details. But the accuracy of that clock source and how quickly it can - return results varies based on the underlying hardware. - - - - Inaccurate time keeping can result in system instability. Test any change - to the clock source very carefully. Operating system defaults are sometimes - made to favor reliability over best accuracy. And if you are using a virtual - machine, look into the recommended time sources compatible with it. Virtual - hardware faces additional difficulties when emulating timers, and there are - often per operating system settings suggested by vendors. - - - - The Time Stamp Counter (TSC) clock source is the most accurate one available - on current generation CPUs. It's the preferred way to track the system time - when it's supported by the operating system and the TSC clock is - reliable. There are several ways that TSC can fail to provide an accurate - timing source, making it unreliable. Older systems can have a TSC clock that - varies based on the CPU temperature, making it unusable for timing. Trying - to use TSC on some older multicore CPUs can give a reported time that's - inconsistent among multiple cores. This can result in the time going - backwards, a problem this program checks for. And even the newest systems - can fail to provide accurate TSC timing with very aggressive power saving - configurations. - - - - Newer operating systems may check for the known TSC problems and switch to a - slower, more stable clock source when they are seen. If your system - supports TSC time but doesn't default to that, it may be disabled for a good - reason. And some operating systems may not detect all the possible problems - correctly, or will allow using TSC even in situations where it's known to be - inaccurate. - - - - The High Precision Event Timer (HPET) is the preferred timer on systems - where it's available and TSC is not accurate. The timer chip itself is - programmable to allow up to 100 nanosecond resolution, but you may not see - that much accuracy in your system clock. - - - - Advanced Configuration and Power Interface (ACPI) provides a Power - Management (PM) Timer, which Linux refers to as the acpi_pm. The clock - derived from acpi_pm will at best provide 300 nanosecond resolution. - - - - Timers used on older PC hardware include the 8254 Programmable Interval - Timer (PIT), the real-time clock (RTC), the Advanced Programmable Interrupt - Controller (APIC) timer, and the Cyclone timer. These timers aim for - millisecond resolution. - - @@ -298,6 +203,8 @@ Histogram of timing durations: + Wiki + discussion about timing diff --git a/src/bin/pg_test_timing/pg_test_timing.c b/src/bin/pg_test_timing/pg_test_timing.c index ce7aad4b25a..64d080335eb 100644 --- a/src/bin/pg_test_timing/pg_test_timing.c +++ b/src/bin/pg_test_timing/pg_test_timing.c @@ -9,19 +9,30 @@ #include #include "getopt_long.h" +#include "port/pg_bitutils.h" #include "portability/instr_time.h" static const char *progname; static unsigned int test_duration = 3; +static double max_rprct = 99.99; + +/* record duration in powers of 2 nanoseconds */ +static long long int histogram[32]; + +/* record counts of first 1024 durations directly */ +#define NUM_DIRECT 1024 +static long long int direct_histogram[NUM_DIRECT]; + +/* separately record highest observed duration */ +static int32 largest_diff; +static long long int largest_diff_count; + static void handle_args(int argc, char *argv[]); static uint64 test_timing(unsigned int duration); static void output(uint64 loop_count); -/* record duration in powers of 2 microseconds */ -static long long int histogram[32]; - int main(int argc, char *argv[]) { @@ -44,6 +55,7 @@ handle_args(int argc, char *argv[]) { static struct option long_options[] = { {"duration", required_argument, NULL, 'd'}, + {"cutoff", required_argument, NULL, 'c'}, {NULL, 0, NULL, 0} }; @@ -56,7 +68,7 @@ handle_args(int argc, char *argv[]) { if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0) { - printf(_("Usage: %s [-d DURATION]\n"), progname); + printf(_("Usage: %s [-d DURATION] [-c CUTOFF]\n"), progname); exit(0); } if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0) @@ -66,7 +78,7 @@ handle_args(int argc, char *argv[]) } } - while ((option = getopt_long(argc, argv, "d:", + while ((option = getopt_long(argc, argv, "d:c:", long_options, &optindex)) != -1) { switch (option) @@ -93,6 +105,26 @@ handle_args(int argc, char *argv[]) } break; + case 'c': + errno = 0; + max_rprct = strtod(optarg, &endptr); + + if (endptr == optarg || *endptr != '\0' || errno != 0) + { + fprintf(stderr, _("%s: invalid argument for option %s\n"), + progname, "--cutoff"); + fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname); + exit(1); + } + + if (max_rprct < 0 || max_rprct > 100) + { + fprintf(stderr, _("%s: %s must be in range %u..%u\n"), + progname, "--cutoff", 0, 100); + exit(1); + } + break; + default: fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname); @@ -111,7 +143,6 @@ handle_args(int argc, char *argv[]) exit(1); } - printf(ngettext("Testing timing overhead for %u second.\n", "Testing timing overhead for %u seconds.\n", test_duration), @@ -130,19 +161,19 @@ test_timing(unsigned int duration) end_time, temp; - total_time = duration > 0 ? duration * INT64CONST(1000000) : 0; + total_time = duration > 0 ? duration * INT64CONST(1000000000) : 0; INSTR_TIME_SET_CURRENT(start_time); - cur = INSTR_TIME_GET_MICROSEC(start_time); + cur = INSTR_TIME_GET_NANOSEC(start_time); while (time_elapsed < total_time) { int32 diff, - bits = 0; + bits; prev = cur; INSTR_TIME_SET_CURRENT(temp); - cur = INSTR_TIME_GET_MICROSEC(temp); + cur = INSTR_TIME_GET_NANOSEC(temp); diff = cur - prev; /* Did time go backwards? */ @@ -154,18 +185,30 @@ test_timing(unsigned int duration) } /* What is the highest bit in the time diff? */ - while (diff) - { - diff >>= 1; - bits++; - } + if (diff > 0) + bits = pg_leftmost_one_pos32(diff) + 1; + else + bits = 0; /* Update appropriate duration bucket */ histogram[bits]++; + /* Update direct histogram of time diffs */ + if (diff < NUM_DIRECT) + direct_histogram[diff]++; + + /* Also track the largest observed duration, even if >= NUM_DIRECT */ + if (diff > largest_diff) + { + largest_diff = diff; + largest_diff_count = 1; + } + else if (diff == largest_diff) + largest_diff_count++; + loop_count++; INSTR_TIME_SUBTRACT(temp, start_time); - time_elapsed = INSTR_TIME_GET_MICROSEC(temp); + time_elapsed = INSTR_TIME_GET_NANOSEC(temp); } INSTR_TIME_SET_CURRENT(end_time); @@ -181,28 +224,95 @@ test_timing(unsigned int duration) static void output(uint64 loop_count) { - int64 max_bit = 31, - i; - char *header1 = _("< us"); - char *header2 = /* xgettext:no-c-format */ _("% of total"); - char *header3 = _("count"); + int max_bit = 31; + const char *header1 = _("<= ns"); + const char *header1b = _("ns"); + const char *header2 = /* xgettext:no-c-format */ _("% of total"); + const char *header3 = /* xgettext:no-c-format */ _("running %"); + const char *header4 = _("count"); int len1 = strlen(header1); int len2 = strlen(header2); int len3 = strlen(header3); + int len4 = strlen(header4); + double rprct; + bool stopped = false; /* find highest bit value */ while (max_bit > 0 && histogram[max_bit] == 0) max_bit--; + /* set minimum column widths */ + len1 = Max(8, len1); + len2 = Max(10, len2); + len3 = Max(10, len3); + len4 = Max(10, len4); + printf(_("Histogram of timing durations:\n")); - printf("%*s %*s %*s\n", - Max(6, len1), header1, - Max(10, len2), header2, - Max(10, len3), header3); - - for (i = 0; i <= max_bit; i++) - printf("%*ld %*.5f %*lld\n", - Max(6, len1), 1l << i, - Max(10, len2) - 1, (double) histogram[i] * 100 / loop_count, - Max(10, len3), histogram[i]); + printf("%*s %*s %*s %*s\n", + len1, header1, + len2, header2, + len3, header3, + len4, header4); + + rprct = 0; + for (int i = 0; i <= max_bit; i++) + { + double prct = (double) histogram[i] * 100 / loop_count; + + rprct += prct; + printf("%*ld %*.4f %*.4f %*lld\n", + len1, (1L << i) - 1, + len2, prct, + len3, rprct, + len4, histogram[i]); + } + + printf(_("\nObserved timing durations up to %.4f%%:\n"), max_rprct); + printf("%*s %*s %*s %*s\n", + len1, header1b, + len2, header2, + len3, header3, + len4, header4); + + rprct = 0; + for (int i = 0; i < NUM_DIRECT; i++) + { + if (direct_histogram[i]) + { + double prct = (double) direct_histogram[i] * 100 / loop_count; + bool print_it = !stopped; + + rprct += prct; + + /* if largest diff is < NUM_DIRECT, be sure we print it */ + if (i == largest_diff) + { + if (stopped) + printf("...\n"); + print_it = true; + } + + if (print_it) + printf("%*d %*.4f %*.4f %*lld\n", + len1, i, + len2, prct, + len3, rprct, + len4, direct_histogram[i]); + if (rprct >= max_rprct) + stopped = true; + } + } + + /* print largest diff when it's outside the array range */ + if (largest_diff >= NUM_DIRECT) + { + double prct = (double) largest_diff_count * 100 / loop_count; + + printf("...\n"); + printf("%*d %*.4f %*.4f %*lld\n", + len1, largest_diff, + len2, prct, + len3, 100.0, + len4, largest_diff_count); + } } diff --git a/src/bin/pg_test_timing/t/001_basic.pl b/src/bin/pg_test_timing/t/001_basic.pl index 6554cd981af..9912acc052a 100644 --- a/src/bin/pg_test_timing/t/001_basic.pl +++ b/src/bin/pg_test_timing/t/001_basic.pl @@ -25,5 +25,22 @@ command_fails_like( [ 'pg_test_timing', '--duration' => '0' ], qr/\Qpg_test_timing: --duration must be in range 1..4294967295\E/, 'pg_test_timing: --duration must be in range'); +command_fails_like( + [ 'pg_test_timing', '--cutoff' => '101' ], + qr/\Qpg_test_timing: --cutoff must be in range 0..100\E/, + 'pg_test_timing: --cutoff must be in range'); + +######################################### +# We obviously can't check for specific output, but we can +# do a simple run and make sure it produces something. + +command_like( + [ 'pg_test_timing', '--duration' => '1' ], + qr/ +\QTesting timing overhead for 1 second.\E.* +\QHistogram of timing durations:\E.* +\QObserved timing durations up to 99.9900%:\E +/sx, + 'pg_test_timing: sanity check'); done_testing(); -- 2.39.5