We already rely on autovectorization for computing page checksums,
but on x86 we can get a further several-fold performance increase by
annotating pg_checksum_block() with a function target attribute for
the AVX2 instruction set extension. Not only does that use 256-bit
registers, it can also use vector multiplication rather than the
vector shifts and adds used in SSE2.
Similar to other hardware-specific paths, we set a function pointer
on first use. We don't bother to avoid this on platforms without AVX2
since the overhead of indirect calls doesn't matter for multi-kilobyte
inputs. However, we do arrange so that only core has the function
pointer mechanism. External programs will continue to build a normal
static function and don't need to be aware of this.
This matters most when using io_uring since in that case the checksum
computation is not done in parallel by IO workers.
Co-authored-by: Matthew Sterrett <matthewsterrett2@gmail.com>
Co-authored-by: Andrew Kim <andrew.kim@intel.com>
Reviewed-by: Oleg Tselebrovskiy <o.tselebrovskiy@postgrespro.ru>
Tested-by: Ants Aasma <ants.aasma@cybertec.at>
Tested-by: Stepan Neretin <slpmcf@gmail.com> (earlier version)
Discussion: https://postgr.es/m/CA+vA85_5GTu+HHniSbvvP+8k3=xZO=WE84NPwiKyxztqvpfZ3Q@mail.gmail.com
Discussion: https://postgr.es/m/
20250911054220.3784-1-root%40ip-172-31-36-228.ec2.internal
undefine([Ac_cachevar])dnl
])# PGAC_SSE42_CRC32_INTRINSICS
+# PGAC_AVX2_SUPPORT
+# ---------------------------
+# Check if the compiler supports AVX2 as a target
+#
+# If AVX2 target attribute is supported, sets pgac_avx2_support.
+#
+# There is deliberately not a guard for __has_attribute here
+AC_DEFUN([PGAC_AVX2_SUPPORT],
+[define([Ac_cachevar], [AS_TR_SH([pgac_cv_avx2_support])])dnl
+AC_CACHE_CHECK([for AVX2 target attribute support], [Ac_cachevar],
+[AC_COMPILE_IFELSE([AC_LANG_PROGRAM([
+ __attribute__((target("avx2")))
+ static int avx2_test(void)
+ {
+ return 0;
+ }],
+ [return avx2_test();])],
+ [Ac_cachevar=yes],
+ [Ac_cachevar=no])])
+if test x"$Ac_cachevar" = x"yes"; then
+ pgac_avx2_support=yes
+fi
+undefine([Ac_cachevar])dnl
+])# PGAC_AVX2_SUPPORT
+
# PGAC_AVX512_PCLMUL_INTRINSICS
# ---------------------------
# Check if the compiler supports AVX-512 carryless multiplication
fi
+# Check for AVX2 target support
+#
+if test x"$host_cpu" = x"x86_64"; then
+ { $as_echo "$as_me:${as_lineno-$LINENO}: checking for AVX2 target attribute support" >&5
+$as_echo_n "checking for AVX2 target attribute support... " >&6; }
+if ${pgac_cv_avx2_support+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+
+ __attribute__((target("avx2")))
+ static int avx2_test(void)
+ {
+ return 0;
+ }
+int
+main ()
+{
+return avx2_test();
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+ pgac_cv_avx2_support=yes
+else
+ pgac_cv_avx2_support=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx2_support" >&5
+$as_echo "$pgac_cv_avx2_support" >&6; }
+if test x"$pgac_cv_avx2_support" = x"yes"; then
+ pgac_avx2_support=yes
+fi
+
+ if test x"$pgac_avx2_support" = x"yes"; then
+
+$as_echo "#define USE_AVX2_WITH_RUNTIME_CHECK 1" >>confdefs.h
+
+ fi
+fi
+
# Check for XSAVE intrinsics
#
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _xgetbv" >&5
AC_DEFINE(HAVE__CPUIDEX, 1, [Define to 1 if you have __cpuidex.])
fi
+# Check for AVX2 target support
+#
+if test x"$host_cpu" = x"x86_64"; then
+ PGAC_AVX2_SUPPORT()
+ if test x"$pgac_avx2_support" = x"yes"; then
+ AC_DEFINE(USE_AVX2_WITH_RUNTIME_CHECK, 1, [Define to 1 to use AVX2 instructions with a runtime check.])
+ fi
+fi
+
# Check for XSAVE intrinsics
#
PGAC_XSAVE_INTRINSICS()
endif
+###############################################################
+# Check if the compiler supports AVX2 as a target
+# There is deliberately not a guard for __has_attribute here
+###############################################################
+
+if host_cpu == 'x86_64'
+
+ prog = '''
+__attribute__((target("avx2")))
+static int avx2_test(void)
+{
+ return 0;
+}
+
+int main(void)
+{
+ return avx2_test();
+}
+'''
+
+ if cc.links(prog, name: 'AVX2 support', args: test_c_args)
+ cdata.set('USE_AVX2_WITH_RUNTIME_CHECK', 1)
+ endif
+
+endif
+
+
###############################################################
# Check for the availability of AVX-512 popcount intrinsics.
###############################################################
*/
#include "postgres.h"
+#include "port/pg_cpu.h"
#include "storage/checksum.h"
/*
* The actual code is in storage/checksum_impl.h. This is done so that
* external programs can incorporate the checksum code by #include'ing
- * that file from the exported Postgres headers. (Compare our CRC code.)
+ * that file from the exported Postgres headers. (Compare our legacy
+ * CRC code in pg_crc.h.)
+ * The PG_CHECKSUM_INTERNAL symbol allows core to use hardware-specific
+ * coding without affecting external programs.
*/
+#define PG_CHECKSUM_INTERNAL
#include "storage/checksum_impl.h" /* IWYU pragma: keep */
+
+
+static uint32
+pg_checksum_block_fallback(const PGChecksummablePage *page)
+{
+#include "storage/checksum_block.inc.c"
+}
+
+/*
+ * AVX2-optimized block checksum algorithm.
+ */
+#ifdef USE_AVX2_WITH_RUNTIME_CHECK
+pg_attribute_target("avx2")
+static uint32
+pg_checksum_block_avx2(const PGChecksummablePage *page)
+{
+#include "storage/checksum_block.inc.c"
+}
+#endif /* USE_AVX2_WITH_RUNTIME_CHECK */
+
+/*
+ * Choose the best available checksum implementation.
+ */
+static uint32
+pg_checksum_choose(const PGChecksummablePage *page)
+{
+ pg_checksum_block = pg_checksum_block_fallback;
+
+#ifdef USE_AVX2_WITH_RUNTIME_CHECK
+ if (x86_feature_available(PG_AVX2))
+ pg_checksum_block = pg_checksum_block_avx2;
+#endif
+
+ return pg_checksum_block(page);
+}
+
+static uint32 (*pg_checksum_block) (const PGChecksummablePage *page) = pg_checksum_choose;
/* Define to 1 to build with assertion checks. (--enable-cassert) */
#undef USE_ASSERT_CHECKING
+/* Define to 1 to use AVX2 instructions with a runtime check. */
+#undef USE_AVX2_WITH_RUNTIME_CHECK
+
/* Define to 1 to use AVX-512 CRC algorithms with a runtime check. */
#undef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK
PG_SSE4_2,
PG_POPCNT,
+ /* 256-bit YMM registers */
+ PG_AVX2,
+
/* 512-bit ZMM registers */
PG_AVX512_BW,
PG_AVX512_VL,
--- /dev/null
+/*-------------------------------------------------------------------------
+ *
+ * checksum_block.inc.c
+ * Core algorithm for page checksums, semi-private to checksum_impl.h
+ * and checksum.c.
+ *
+ * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/checksum_block.inc.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+/* there is deliberately not an #ifndef CHECKSUM_BLOCK_INC_C here */
+
+uint32 sums[N_SUMS];
+uint32 result = 0;
+uint32 i,
+ j;
+
+/* ensure that the size is compatible with the algorithm */
+Assert(sizeof(PGChecksummablePage) == BLCKSZ);
+
+/* initialize partial checksums to their corresponding offsets */
+memcpy(sums, checksumBaseOffsets, sizeof(checksumBaseOffsets));
+
+/* main checksum calculation */
+for (i = 0; i < (uint32) (BLCKSZ / (sizeof(uint32) * N_SUMS)); i++)
+ for (j = 0; j < N_SUMS; j++)
+ CHECKSUM_COMP(sums[j], page->data[i][j]);
+
+/* finally add in two rounds of zeroes for additional mixing */
+for (i = 0; i < 2; i++)
+ for (j = 0; j < N_SUMS; j++)
+ CHECKSUM_COMP(sums[j], 0);
+
+/* xor fold partial checksums together */
+for (i = 0; i < N_SUMS; i++)
+ result ^= sums[i];
+
+return result;
* random segments of page with 0x00, 0xFF and random data all show optimal
* 2e-16 false positive rate within margin of error.
*
- * Vectorization of the algorithm requires 32bit x 32bit -> 32bit integer
- * multiplication instruction. As of 2013 the corresponding instruction is
- * available on x86 SSE4.1 extensions (pmulld) and ARM NEON (vmul.i32).
- * Vectorization requires a compiler to do the vectorization for us. For recent
- * GCC versions the flags -msse4.1 -funroll-loops -ftree-vectorize are enough
- * to achieve vectorization.
+ * Vectorization of the algorithm works best with a 32bit x 32bit -> 32bit
+ * vector integer multiplication instruction, Examples include x86 AVX2
+ * extensions (vpmulld) and ARM NEON (vmul.i32). Without that, vectorization
+ * is still possible if the compiler can turn multiplication by FNV_PRIME
+ * into a sequence of vectorized shifts and adds. For simplicity we rely
+ * on the compiler to do the vectorization for us. For GCC and clang the
+ * flags -funroll-loops -ftree-vectorize are enough to achieve vectorization.
*
* The optimal amount of parallelism to use depends on CPU specific instruction
* latency, SIMD instruction width, throughput and the amount of registers
*
* The parallelism number 32 was chosen based on the fact that it is the
* largest state that fits into architecturally visible x86 SSE registers while
- * leaving some free registers for intermediate values. For future processors
- * with 256bit vector registers this will leave some performance on the table.
+ * leaving some free registers for intermediate values. For processors
+ * with 256-bit vector registers this leaves some performance on the table.
+ *
* When vectorization is not available it might be beneficial to restructure
* the computation to calculate a subset of the columns at a time and perform
* multiple passes to avoid register spilling. This optimization opportunity
* Block checksum algorithm. The page must be adequately aligned
* (at least on 4-byte boundary).
*/
+#ifdef PG_CHECKSUM_INTERNAL
+/* definitions in src/backend/storage/page/checksum.c */
+static uint32 (*pg_checksum_block) (const PGChecksummablePage *page);
+
+#else
+/* static definition for external programs */
static uint32
pg_checksum_block(const PGChecksummablePage *page)
{
- uint32 sums[N_SUMS];
- uint32 result = 0;
- uint32 i,
- j;
-
- /* ensure that the size is compatible with the algorithm */
- Assert(sizeof(PGChecksummablePage) == BLCKSZ);
-
- /* initialize partial checksums to their corresponding offsets */
- memcpy(sums, checksumBaseOffsets, sizeof(checksumBaseOffsets));
-
- /* main checksum calculation */
- for (i = 0; i < (uint32) (BLCKSZ / (sizeof(uint32) * N_SUMS)); i++)
- for (j = 0; j < N_SUMS; j++)
- CHECKSUM_COMP(sums[j], page->data[i][j]);
-
- /* finally add in two rounds of zeroes for additional mixing */
- for (i = 0; i < 2; i++)
- for (j = 0; j < N_SUMS; j++)
- CHECKSUM_COMP(sums[j], 0);
-
- /* xor fold partial checksums together */
- for (i = 0; i < N_SUMS; i++)
- result ^= sums[i];
-
- return result;
+#include "storage/checksum_block.inc.c"
}
+#endif
+
/*
* Compute the checksum for a Postgres page.
*
xcr0_val = _xgetbv(0);
#endif
+ /* Are YMM registers enabled? */
+ if (mask_available(xcr0_val, XMM | YMM))
+ X86Features[PG_AVX2] = reg[EBX] >> 5 & 1;
+
/* Are ZMM registers enabled? */
if (mask_available(xcr0_val, XMM | YMM |
OPMASK | ZMM0_15 | ZMM16_31))