Use AVX2 for calculating page checksums where available

author John Naylor <john.naylor@postgresql.org>

Sat, 4 Apr 2026 11:07:15 +0000 (18:07 +0700)

committer John Naylor <john.naylor@postgresql.org>

Sat, 4 Apr 2026 11:07:15 +0000 (18:07 +0700)
author John Naylor <john.naylor@postgresql.org>
Sat, 4 Apr 2026 11:07:15 +0000 (18:07 +0700)
committer John Naylor <john.naylor@postgresql.org>
Sat, 4 Apr 2026 11:07:15 +0000 (18:07 +0700)
diff --git a/config/c-compiler.m4 b/config/c-compiler.m4

index 629572ee350dee0a23430ad7a1dcd45b82bc1b88..e2e7015f1bbcf09e35205406093c05f3c261e847 100644 (file)
--- a/config/c-compiler.m4
+++ b/config/c-compiler.m4
@@ -687,6 +687,31 @@ fi
  undefine([Ac_cachevar])dnl
  ])# PGAC_SSE42_CRC32_INTRINSICS
  
+# PGAC_AVX2_SUPPORT
+# ---------------------------
+# Check if the compiler supports AVX2 as a target
+#
+# If the AVX2 target attribute is supported, sets pgac_avx2_support to yes.
+#
+# There is deliberately not a guard for __has_attribute here
+AC_DEFUN([PGAC_AVX2_SUPPORT],
+[define([Ac_cachevar], [AS_TR_SH([pgac_cv_avx2_support])])dnl
+AC_CACHE_CHECK([for AVX2 target attribute support], [Ac_cachevar],
+[AC_COMPILE_IFELSE([AC_LANG_PROGRAM([
+    __attribute__((target("avx2")))
+    static int avx2_test(void)
+    {
+      return 0;
+    }],
+  [return avx2_test();])],
+  [Ac_cachevar=yes],
+  [Ac_cachevar=no])])
+if test x"$Ac_cachevar" = x"yes"; then
+  pgac_avx2_support=yes
+fi
+undefine([Ac_cachevar])dnl
+])# PGAC_AVX2_SUPPORT
+
  # PGAC_AVX512_PCLMUL_INTRINSICS
  # ---------------------------
  # Check if the compiler supports AVX-512 carryless multiplication
diff --git a/configure b/configure

index fe22bc71d0cd18cc489843ce5b6f781c2b034ae4..1182c3dc92e50e91263ddaba66671e1edb16fa29 100755 (executable)
--- a/configure
+++ b/configure
@@ -17820,6 +17820,50 @@ $as_echo "#define HAVE__CPUIDEX 1" >>confdefs.h
  
  fi
  
+# Check for AVX2 target support
+#
+if test x"$host_cpu" = x"x86_64"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for AVX2 target attribute support" >&5
+$as_echo_n "checking for AVX2 target attribute support... " >&6; }
+if ${pgac_cv_avx2_support+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+    __attribute__((target("avx2")))
+    static int avx2_test(void)
+    {
+      return 0;
+    }
+int
+main ()
+{
+return avx2_test();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  pgac_cv_avx2_support=yes
+else
+  pgac_cv_avx2_support=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx2_support" >&5
+$as_echo "$pgac_cv_avx2_support" >&6; }
+if test x"$pgac_cv_avx2_support" = x"yes"; then
+  pgac_avx2_support=yes
+fi
+
+  if test x"$pgac_avx2_support" = x"yes"; then
+
+$as_echo "#define USE_AVX2_WITH_RUNTIME_CHECK 1" >>confdefs.h
+
+  fi
+fi
+
  # Check for XSAVE intrinsics
  #
  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _xgetbv" >&5
diff --git a/configure.ac b/configure.ac

index 6873b7546dd5f8f61a3723ca0ad40d2c57db20d8..39d8fe0e77b36727459985480f066eaa335cdbff 100644 (file)
--- a/configure.ac
+++ b/configure.ac
@@ -2135,6 +2135,15 @@ if test x"$pgac_cv__cpuidex" = x"yes"; then
    AC_DEFINE(HAVE__CPUIDEX, 1, [Define to 1 if you have __cpuidex.])
  fi
  
+# Check for AVX2 target support
+#
+if test x"$host_cpu" = x"x86_64"; then
+  PGAC_AVX2_SUPPORT()
+  if test x"$pgac_avx2_support" = x"yes"; then
+    AC_DEFINE(USE_AVX2_WITH_RUNTIME_CHECK, 1, [Define to 1 to use AVX2 instructions with a runtime check.])
+  fi
+fi
+
  # Check for XSAVE intrinsics
  #
  PGAC_XSAVE_INTRINSICS()
diff --git a/meson.build b/meson.build

index 6bc74c2ba7908501313d1f420eb002593a66fee9..1cecd7d1b849c86d78d67a77c661ee7a361e0e2d 100644 (file)
--- a/meson.build
+++ b/meson.build
@@ -2494,6 +2494,33 @@ int main(void)
  endif
  
  
+###############################################################
+# Check if the compiler supports AVX2 as a target
+# There is deliberately not a guard for __has_attribute here
+###############################################################
+
+if host_cpu == 'x86_64'
+
+  prog = '''
+__attribute__((target("avx2")))
+static int avx2_test(void)
+{
+    return 0;
+}
+
+int main(void)
+{
+    return avx2_test();
+}
+'''
+
+  if cc.links(prog, name: 'AVX2 support', args: test_c_args)
+    cdata.set('USE_AVX2_WITH_RUNTIME_CHECK', 1)
+  endif
+
+endif
+
+
  ###############################################################
  # Check for the availability of AVX-512 popcount intrinsics.
  ###############################################################
diff --git a/src/backend/storage/page/checksum.c b/src/backend/storage/page/checksum.c

index 8716651c8b55573fc3f56560dc8f3394d0d21101..7ce51fe9d2e8a93a8beab48a2ff81eedfd70ab9d 100644 (file)
--- a/src/backend/storage/page/checksum.c
+++ b/src/backend/storage/page/checksum.c
@@ -13,10 +13,52 @@
   */
  #include "postgres.h"
  
+#include "port/pg_cpu.h"
  #include "storage/checksum.h"
  /*
   * The actual code is in storage/checksum_impl.h.  This is done so that
   * external programs can incorporate the checksum code by #include'ing
- * that file from the exported Postgres headers.  (Compare our CRC code.)
+ * that file from the exported Postgres headers.  (Compare our legacy
+ * CRC code in pg_crc.h.)
+ * The PG_CHECKSUM_INTERNAL symbol allows core to use hardware-specific
+ * coding without affecting external programs.
   */
+#define PG_CHECKSUM_INTERNAL
  #include "storage/checksum_impl.h"     /* IWYU pragma: keep */
+
+
+static uint32
+pg_checksum_block_fallback(const PGChecksummablePage *page)
+{
+#include "storage/checksum_block.inc.c"
+}
+
+/*
+ * AVX2-optimized block checksum algorithm.
+ */
+#ifdef USE_AVX2_WITH_RUNTIME_CHECK
+pg_attribute_target("avx2")
+static uint32
+pg_checksum_block_avx2(const PGChecksummablePage *page)
+{
+#include "storage/checksum_block.inc.c"
+}
+#endif                                                 /* USE_AVX2_WITH_RUNTIME_CHECK */
+
+/*
+ * On first use, point pg_checksum_block at the best implementation and run it.
+ */
+static uint32
+pg_checksum_choose(const PGChecksummablePage *page)
+{
+       pg_checksum_block = pg_checksum_block_fallback;
+
+#ifdef USE_AVX2_WITH_RUNTIME_CHECK
+       if (x86_feature_available(PG_AVX2))
+               pg_checksum_block = pg_checksum_block_avx2;
+#endif
+
+       return pg_checksum_block(page);
+}
+
+static uint32 (*pg_checksum_block) (const PGChecksummablePage *page) = pg_checksum_choose;
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in

index d8d61918affec5822a8198493346d2f25d522a92..f624bda32b446314a34a0a9eff5aba9ac165fb7c 100644 (file)
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -674,6 +674,9 @@
  /* Define to 1 to build with assertion checks. (--enable-cassert) */
  #undef USE_ASSERT_CHECKING
  
+/* Define to 1 to use AVX2 instructions with a runtime check. */
+#undef USE_AVX2_WITH_RUNTIME_CHECK
+
  /* Define to 1 to use AVX-512 CRC algorithms with a runtime check. */
  #undef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK
  
diff --git a/src/include/port/pg_cpu.h b/src/include/port/pg_cpu.h

index b93b828d3ac27c91f950396e30ab6621b9c7ce64..c5d96bb4f479ff8a5bd559987f11a0414c7b9c5b 100644 (file)
--- a/src/include/port/pg_cpu.h
+++ b/src/include/port/pg_cpu.h
@@ -24,6 +24,9 @@ typedef enum X86FeatureId
         PG_SSE4_2,
         PG_POPCNT,
  
+       /* 256-bit YMM registers */
+       PG_AVX2,
+
         /* 512-bit ZMM registers */
         PG_AVX512_BW,
         PG_AVX512_VL,
diff --git a/src/include/storage/checksum_block.inc.c b/src/include/storage/checksum_block.inc.c

new file mode 100644 (file)

index 0000000..6ef8a91
--- /dev/null
+++ b/src/include/storage/checksum_block.inc.c
@@ -0,0 +1,42 @@
+/*-------------------------------------------------------------------------
+ *
+ * checksum_block.inc.c
+ *       Core algorithm for page checksums, semi-private to checksum_impl.h
+ *       and checksum.c.
+ *
+ * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/checksum_block.inc.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+/* there is deliberately not an #ifndef CHECKSUM_BLOCK_INC_C here */
+
+uint32         sums[N_SUMS];
+uint32         result = 0;
+uint32         i,
+                       j;
+
+/* ensure that the size is compatible with the algorithm */
+Assert(sizeof(PGChecksummablePage) == BLCKSZ);
+
+/* initialize partial checksums to their corresponding offsets */
+memcpy(sums, checksumBaseOffsets, sizeof(checksumBaseOffsets));
+
+/* main checksum calculation */
+for (i = 0; i < (uint32) (BLCKSZ / (sizeof(uint32) * N_SUMS)); i++)
+       for (j = 0; j < N_SUMS; j++)
+               CHECKSUM_COMP(sums[j], page->data[i][j]);
+
+/* finally add in two rounds of zeroes for additional mixing */
+for (i = 0; i < 2; i++)
+       for (j = 0; j < N_SUMS; j++)
+               CHECKSUM_COMP(sums[j], 0);
+
+/* xor fold partial checksums together */
+for (i = 0; i < N_SUMS; i++)
+       result ^= sums[i];
+
+return result;
diff --git a/src/include/storage/checksum_impl.h b/src/include/storage/checksum_impl.h

index 5c2dcbc63e747ba36e3cde6ad7bfda6c151c4788..57a2e43c68748ce71d61356b1871ca3e84a9f981 100644 (file)
--- a/src/include/storage/checksum_impl.h
+++ b/src/include/storage/checksum_impl.h
@@ -72,12 +72,13 @@
   * random segments of page with 0x00, 0xFF and random data all show optimal
   * 2e-16 false positive rate within margin of error.
   *
- * Vectorization of the algorithm requires 32bit x 32bit -> 32bit integer
- * multiplication instruction. As of 2013 the corresponding instruction is
- * available on x86 SSE4.1 extensions (pmulld) and ARM NEON (vmul.i32).
- * Vectorization requires a compiler to do the vectorization for us. For recent
- * GCC versions the flags -msse4.1 -funroll-loops -ftree-vectorize are enough
- * to achieve vectorization.
+ * Vectorization of the algorithm works best with a 32bit x 32bit -> 32bit
+ * vector integer multiplication instruction. Examples include x86 AVX2
+ * extensions (vpmulld) and ARM NEON (vmul.i32). Without that, vectorization
+ * is still possible if the compiler can turn multiplication by FNV_PRIME
+ * into a sequence of vectorized shifts and adds.  For simplicity we rely
+ * on the compiler to do the vectorization for us. For GCC and clang the
+ * flags -funroll-loops -ftree-vectorize are enough to achieve vectorization.
   *
   * The optimal amount of parallelism to use depends on CPU specific instruction
   * latency, SIMD instruction width, throughput and the amount of registers
@@ -89,8 +90,9 @@
   *
   * The parallelism number 32 was chosen based on the fact that it is the
   * largest state that fits into architecturally visible x86 SSE registers while
- * leaving some free registers for intermediate values. For future processors
- * with 256bit vector registers this will leave some performance on the table.
+ * leaving some free registers for intermediate values. For processors
+ * with 256-bit vector registers this leaves some performance on the table.
+ *
   * When vectorization is not available it might be beneficial to restructure
   * the computation to calculate a subset of the columns at a time and perform
   * multiple passes to avoid register spilling. This optimization opportunity
@@ -142,37 +144,20 @@ do { \
   * Block checksum algorithm.  The page must be adequately aligned
   * (at least on 4-byte boundary).
   */
+#ifdef PG_CHECKSUM_INTERNAL
+/* definitions in src/backend/storage/page/checksum.c */
+static uint32 (*pg_checksum_block) (const PGChecksummablePage *page);
+
+#else
+/* static definition for external programs */
  static uint32
  pg_checksum_block(const PGChecksummablePage *page)
  {
-       uint32          sums[N_SUMS];
-       uint32          result = 0;
-       uint32          i,
-                               j;
-
-       /* ensure that the size is compatible with the algorithm */
-       Assert(sizeof(PGChecksummablePage) == BLCKSZ);
-
-       /* initialize partial checksums to their corresponding offsets */
-       memcpy(sums, checksumBaseOffsets, sizeof(checksumBaseOffsets));
-
-       /* main checksum calculation */
-       for (i = 0; i < (uint32) (BLCKSZ / (sizeof(uint32) * N_SUMS)); i++)
-               for (j = 0; j < N_SUMS; j++)
-                       CHECKSUM_COMP(sums[j], page->data[i][j]);
-
-       /* finally add in two rounds of zeroes for additional mixing */
-       for (i = 0; i < 2; i++)
-               for (j = 0; j < N_SUMS; j++)
-                       CHECKSUM_COMP(sums[j], 0);
-
-       /* xor fold partial checksums together */
-       for (i = 0; i < N_SUMS; i++)
-               result ^= sums[i];
-
-       return result;
+#include "storage/checksum_block.inc.c"
  }
  
+#endif
+
  /*
   * Compute the checksum for a Postgres page.
   *
diff --git a/src/port/pg_cpu_x86.c b/src/port/pg_cpu_x86.c

index 1331f3f4eb8f937bdfe800bc54d9b126c8610ec9..40ff78633ca3fb256a35d1472ce8c26e81a33848 100644 (file)
--- a/src/port/pg_cpu_x86.c
+++ b/src/port/pg_cpu_x86.c
@@ -119,6 +119,10 @@ set_x86_features(void)
                 xcr0_val = _xgetbv(0);
  #endif
  
+               /* Are YMM registers enabled? */
+               if (mask_available(xcr0_val, XMM | YMM))
+                       X86Features[PG_AVX2] = reg[EBX] >> 5 & 1;
+
                 /* Are ZMM registers enabled? */
                 if (mask_available(xcr0_val, XMM | YMM |
                                                    OPMASK | ZMM0_15 | ZMM16_31))
author	John Naylor <john.naylor@postgresql.org>
	Sat, 4 Apr 2026 11:07:15 +0000 (18:07 +0700)
committer	John Naylor <john.naylor@postgresql.org>
	Sat, 4 Apr 2026 11:07:15 +0000 (18:07 +0700)
config/c-compiler.m4		patch \| blob \| blame \| history
configure		patch \| blob \| blame \| history
configure.ac		patch \| blob \| blame \| history
meson.build		patch \| blob \| blame \| history
src/backend/storage/page/checksum.c		patch \| blob \| blame \| history
src/include/pg_config.h.in		patch \| blob \| blame \| history
src/include/port/pg_cpu.h		patch \| blob \| blame \| history
src/include/storage/checksum_block.inc.c	[new file with mode: 0644]	patch \| blob
src/include/storage/checksum_impl.h		patch \| blob \| blame \| history
src/port/pg_cpu_x86.c		patch \| blob \| blame \| history