From: John Naylor Date: Sat, 4 Apr 2026 11:07:15 +0000 (+0700) Subject: Use AVX2 for calculating page checksums where available X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=5e13b0f24;p=thirdparty%2Fpostgresql.git Use AVX2 for calculating page checksums where available We already rely on autovectorization for computing page checksums, but on x86 we can get a further several-fold performance increase by annotating pg_checksum_block() with a function target attribute for the AVX2 instruction set extension. Not only does that use 256-bit registers, it can also use vector multiplication rather than the vector shifts and adds used in SSE2. Similar to other hardware-specific paths, we set a function pointer on first use. We don't bother to avoid this on platforms without AVX2 since the overhead of indirect calls doesn't matter for multi-kilobyte inputs. However, we do arrange so that only core has the function pointer mechanism. External programs will continue to build a normal static function and don't need to be aware of this. This matters most when using io_uring since in that case the checksum computation is not done in parallel by IO workers. Co-authored-by: Matthew Sterrett Co-authored-by: Andrew Kim Reviewed-by: Oleg Tselebrovskiy Tested-by: Ants Aasma Tested-by: Stepan Neretin (earlier version) Discussion: https://postgr.es/m/CA+vA85_5GTu+HHniSbvvP+8k3=xZO=WE84NPwiKyxztqvpfZ3Q@mail.gmail.com Discussion: https://postgr.es/m/20250911054220.3784-1-root%40ip-172-31-36-228.ec2.internal --- diff --git a/config/c-compiler.m4 b/config/c-compiler.m4 index 629572ee350..e2e7015f1bb 100644 --- a/config/c-compiler.m4 +++ b/config/c-compiler.m4 @@ -687,6 +687,31 @@ fi undefine([Ac_cachevar])dnl ])# PGAC_SSE42_CRC32_INTRINSICS +# PGAC_AVX2_SUPPORT +# --------------------------- +# Check if the compiler supports AVX2 as a target +# +# If AVX2 target attribute is supported, sets pgac_avx2_support. 
+# +# There is deliberately not a guard for __has_attribute here +AC_DEFUN([PGAC_AVX2_SUPPORT], +[define([Ac_cachevar], [AS_TR_SH([pgac_cv_avx2_support])])dnl +AC_CACHE_CHECK([for AVX2 target attribute support], [Ac_cachevar], +[AC_COMPILE_IFELSE([AC_LANG_PROGRAM([ + __attribute__((target("avx2"))) + static int avx2_test(void) + { + return 0; + }], + [return avx2_test();])], + [Ac_cachevar=yes], + [Ac_cachevar=no])]) +if test x"$Ac_cachevar" = x"yes"; then + pgac_avx2_support=yes +fi +undefine([Ac_cachevar])dnl +])# PGAC_AVX2_SUPPORT + # PGAC_AVX512_PCLMUL_INTRINSICS # --------------------------- # Check if the compiler supports AVX-512 carryless multiplication diff --git a/configure b/configure index fe22bc71d0c..1182c3dc92e 100755 --- a/configure +++ b/configure @@ -17820,6 +17820,50 @@ $as_echo "#define HAVE__CPUIDEX 1" >>confdefs.h fi +# Check for AVX2 target support +# +if test x"$host_cpu" = x"x86_64"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for AVX2 target attribute support" >&5 +$as_echo_n "checking for AVX2 target attribute support... " >&6; } +if ${pgac_cv_avx2_support+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ + + __attribute__((target("avx2"))) + static int avx2_test(void) + { + return 0; + } +int +main () +{ +return avx2_test(); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + pgac_cv_avx2_support=yes +else + pgac_cv_avx2_support=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx2_support" >&5 +$as_echo "$pgac_cv_avx2_support" >&6; } +if test x"$pgac_cv_avx2_support" = x"yes"; then + pgac_avx2_support=yes +fi + + if test x"$pgac_avx2_support" = x"yes"; then + +$as_echo "#define USE_AVX2_WITH_RUNTIME_CHECK 1" >>confdefs.h + + fi +fi + # Check for XSAVE intrinsics # { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _xgetbv" >&5 diff --git a/configure.ac b/configure.ac index 6873b7546dd..39d8fe0e77b 100644 --- a/configure.ac +++ b/configure.ac @@ -2135,6 +2135,15 @@ if test x"$pgac_cv__cpuidex" = x"yes"; then AC_DEFINE(HAVE__CPUIDEX, 1, [Define to 1 if you have __cpuidex.]) fi +# Check for AVX2 target support +# +if test x"$host_cpu" = x"x86_64"; then + PGAC_AVX2_SUPPORT() + if test x"$pgac_avx2_support" = x"yes"; then + AC_DEFINE(USE_AVX2_WITH_RUNTIME_CHECK, 1, [Define to 1 to use AVX2 instructions with a runtime check.]) + fi +fi + # Check for XSAVE intrinsics # PGAC_XSAVE_INTRINSICS() diff --git a/meson.build b/meson.build index 6bc74c2ba79..1cecd7d1b84 100644 --- a/meson.build +++ b/meson.build @@ -2494,6 +2494,33 @@ int main(void) endif +############################################################### +# Check if the compiler supports AVX2 as a target +# There is deliberately not a guard for __has_attribute here +############################################################### + +if host_cpu == 'x86_64' + + prog = ''' +__attribute__((target("avx2"))) +static int avx2_test(void) +{ + return 0; +} + +int main(void) +{ + return avx2_test(); +} +''' + + if cc.links(prog, name: 'AVX2 support', args: test_c_args) + cdata.set('USE_AVX2_WITH_RUNTIME_CHECK', 1) + 
endif + +endif + + ############################################################### # Check for the availability of AVX-512 popcount intrinsics. ############################################################### diff --git a/src/backend/storage/page/checksum.c b/src/backend/storage/page/checksum.c index 8716651c8b5..7ce51fe9d2e 100644 --- a/src/backend/storage/page/checksum.c +++ b/src/backend/storage/page/checksum.c @@ -13,10 +13,52 @@ */ #include "postgres.h" +#include "port/pg_cpu.h" #include "storage/checksum.h" /* * The actual code is in storage/checksum_impl.h. This is done so that * external programs can incorporate the checksum code by #include'ing - * that file from the exported Postgres headers. (Compare our CRC code.) + * that file from the exported Postgres headers. (Compare our legacy + * CRC code in pg_crc.h.) + * The PG_CHECKSUM_INTERNAL symbol allows core to use hardware-specific + * coding without affecting external programs. */ +#define PG_CHECKSUM_INTERNAL #include "storage/checksum_impl.h" /* IWYU pragma: keep */ + + +static uint32 +pg_checksum_block_fallback(const PGChecksummablePage *page) +{ +#include "storage/checksum_block.inc.c" +} + +/* + * AVX2-optimized block checksum algorithm. + */ +#ifdef USE_AVX2_WITH_RUNTIME_CHECK +pg_attribute_target("avx2") +static uint32 +pg_checksum_block_avx2(const PGChecksummablePage *page) +{ +#include "storage/checksum_block.inc.c" +} +#endif /* USE_AVX2_WITH_RUNTIME_CHECK */ + +/* + * Choose the best available checksum implementation. 
+ */ +static uint32 +pg_checksum_choose(const PGChecksummablePage *page) +{ + pg_checksum_block = pg_checksum_block_fallback; + +#ifdef USE_AVX2_WITH_RUNTIME_CHECK + if (x86_feature_available(PG_AVX2)) + pg_checksum_block = pg_checksum_block_avx2; +#endif + + return pg_checksum_block(page); +} + +static uint32 (*pg_checksum_block) (const PGChecksummablePage *page) = pg_checksum_choose; diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index d8d61918aff..f624bda32b4 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -674,6 +674,9 @@ /* Define to 1 to build with assertion checks. (--enable-cassert) */ #undef USE_ASSERT_CHECKING +/* Define to 1 to use AVX2 instructions with a runtime check. */ +#undef USE_AVX2_WITH_RUNTIME_CHECK + /* Define to 1 to use AVX-512 CRC algorithms with a runtime check. */ #undef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK diff --git a/src/include/port/pg_cpu.h b/src/include/port/pg_cpu.h index b93b828d3ac..c5d96bb4f47 100644 --- a/src/include/port/pg_cpu.h +++ b/src/include/port/pg_cpu.h @@ -24,6 +24,9 @@ typedef enum X86FeatureId PG_SSE4_2, PG_POPCNT, + /* 256-bit YMM registers */ + PG_AVX2, + /* 512-bit ZMM registers */ PG_AVX512_BW, PG_AVX512_VL, diff --git a/src/include/storage/checksum_block.inc.c b/src/include/storage/checksum_block.inc.c new file mode 100644 index 00000000000..6ef8a911145 --- /dev/null +++ b/src/include/storage/checksum_block.inc.c @@ -0,0 +1,42 @@ +/*------------------------------------------------------------------------- + * + * checksum_block.inc.c + * Core algorithm for page checksums, semi-private to checksum_impl.h + * and checksum.c. 
+ * + * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/checksum_block.inc.c + * + *------------------------------------------------------------------------- + */ + +/* there is deliberately not an #ifndef CHECKSUM_BLOCK_INC_C here */ + +uint32 sums[N_SUMS]; +uint32 result = 0; +uint32 i, + j; + +/* ensure that the size is compatible with the algorithm */ +Assert(sizeof(PGChecksummablePage) == BLCKSZ); + +/* initialize partial checksums to their corresponding offsets */ +memcpy(sums, checksumBaseOffsets, sizeof(checksumBaseOffsets)); + +/* main checksum calculation */ +for (i = 0; i < (uint32) (BLCKSZ / (sizeof(uint32) * N_SUMS)); i++) + for (j = 0; j < N_SUMS; j++) + CHECKSUM_COMP(sums[j], page->data[i][j]); + +/* finally add in two rounds of zeroes for additional mixing */ +for (i = 0; i < 2; i++) + for (j = 0; j < N_SUMS; j++) + CHECKSUM_COMP(sums[j], 0); + +/* xor fold partial checksums together */ +for (i = 0; i < N_SUMS; i++) + result ^= sums[i]; + +return result; diff --git a/src/include/storage/checksum_impl.h b/src/include/storage/checksum_impl.h index 5c2dcbc63e7..57a2e43c687 100644 --- a/src/include/storage/checksum_impl.h +++ b/src/include/storage/checksum_impl.h @@ -72,12 +72,13 @@ * random segments of page with 0x00, 0xFF and random data all show optimal * 2e-16 false positive rate within margin of error. * - * Vectorization of the algorithm requires 32bit x 32bit -> 32bit integer - * multiplication instruction. As of 2013 the corresponding instruction is - * available on x86 SSE4.1 extensions (pmulld) and ARM NEON (vmul.i32). - * Vectorization requires a compiler to do the vectorization for us. For recent - * GCC versions the flags -msse4.1 -funroll-loops -ftree-vectorize are enough - * to achieve vectorization. 
+ * Vectorization of the algorithm works best with a 32bit x 32bit -> 32bit + * vector integer multiplication instruction. Examples include x86 AVX2 + * extensions (vpmulld) and ARM NEON (vmul.i32). Without that, vectorization + * is still possible if the compiler can turn multiplication by FNV_PRIME + * into a sequence of vectorized shifts and adds. For simplicity we rely + * on the compiler to do the vectorization for us. For GCC and clang the + * flags -funroll-loops -ftree-vectorize are enough to achieve vectorization. * * The optimal amount of parallelism to use depends on CPU specific instruction * latency, SIMD instruction width, throughput and the amount of registers @@ -89,8 +90,9 @@ * * The parallelism number 32 was chosen based on the fact that it is the * largest state that fits into architecturally visible x86 SSE registers while - * leaving some free registers for intermediate values. For future processors - * with 256bit vector registers this will leave some performance on the table. + * leaving some free registers for intermediate values. For processors + * with 256-bit vector registers this leaves some performance on the table. + * * When vectorization is not available it might be beneficial to restructure * the computation to calculate a subset of the columns at a time and perform * multiple passes to avoid register spilling. This optimization opportunity @@ -142,37 +144,20 @@ do { \ * Block checksum algorithm. The page must be adequately aligned * (at least on 4-byte boundary). 
*/ +#ifdef PG_CHECKSUM_INTERNAL +/* definitions in src/backend/storage/page/checksum.c */ +static uint32 (*pg_checksum_block) (const PGChecksummablePage *page); + +#else +/* static definition for external programs */ static uint32 pg_checksum_block(const PGChecksummablePage *page) { - uint32 sums[N_SUMS]; - uint32 result = 0; - uint32 i, - j; - - /* ensure that the size is compatible with the algorithm */ - Assert(sizeof(PGChecksummablePage) == BLCKSZ); - - /* initialize partial checksums to their corresponding offsets */ - memcpy(sums, checksumBaseOffsets, sizeof(checksumBaseOffsets)); - - /* main checksum calculation */ - for (i = 0; i < (uint32) (BLCKSZ / (sizeof(uint32) * N_SUMS)); i++) - for (j = 0; j < N_SUMS; j++) - CHECKSUM_COMP(sums[j], page->data[i][j]); - - /* finally add in two rounds of zeroes for additional mixing */ - for (i = 0; i < 2; i++) - for (j = 0; j < N_SUMS; j++) - CHECKSUM_COMP(sums[j], 0); - - /* xor fold partial checksums together */ - for (i = 0; i < N_SUMS; i++) - result ^= sums[i]; - - return result; +#include "storage/checksum_block.inc.c" } +#endif + /* * Compute the checksum for a Postgres page. * diff --git a/src/port/pg_cpu_x86.c b/src/port/pg_cpu_x86.c index 1331f3f4eb8..40ff78633ca 100644 --- a/src/port/pg_cpu_x86.c +++ b/src/port/pg_cpu_x86.c @@ -119,6 +119,10 @@ set_x86_features(void) xcr0_val = _xgetbv(0); #endif + /* Are YMM registers enabled? */ + if (mask_available(xcr0_val, XMM | YMM)) + X86Features[PG_AVX2] = reg[EBX] >> 5 & 1; + /* Are ZMM registers enabled? */ if (mask_available(xcr0_val, XMM | YMM | OPMASK | ZMM0_15 | ZMM16_31))