git.ipfire.org Git - thirdparty/postgresql.git/commitdiff
Compute CRC32C on ARM using the Crypto Extension where available
author: John Naylor <john.naylor@postgresql.org>
	Sat, 4 Apr 2026 13:47:01 +0000 (20:47 +0700)
committer: John Naylor <john.naylor@postgresql.org>
	Sat, 4 Apr 2026 13:47:01 +0000 (20:47 +0700)
In similar vein to commit 3c6e8c123, the ARMv8 cryptography extension
has 64x64 -> 128-bit carryless multiplication instructions suitable
for computing CRC. This was tested to be around twice as fast as
scalar CRC instructions for longer inputs.

We now do a runtime check, even for builds that target "armv8-a+crc",
but those builds can still use a direct call for constant inputs,
which we assume are short.

As for x86, the MIT-licensed implementation was generated with the
"generate" program from

https://github.com/corsix/fast-crc32/

Reviewed-by: Nathan Bossart <nathandbossart@gmail.com>
Discussion: https://postgr.es/m/CANWCAZaKhE+RD5KKouUFoxx1EbUNrNhcduM1VQ=DkSDadNEFng@mail.gmail.com

config/c-compiler.m4
configure
configure.ac
meson.build
src/include/pg_config.h.in
src/include/port/pg_crc32c.h
src/port/meson.build
src/port/pg_crc32c_armv8.c
src/port/pg_crc32c_armv8_choose.c

index e2e7015f1bbcf09e35205406093c05f3c261e847..3eab0da9cb6c2420f0c4aeba6e0981b4c0bb0a8a 100644 (file)
@@ -784,6 +784,44 @@ fi
 undefine([Ac_cachevar])dnl
 ])# PGAC_ARMV8_CRC32C_INTRINSICS
 
+# PGAC_ARM_PLMULL
+# ---------------------------
+# Check if the compiler supports Arm CRYPTO PMULL (carryless multiplication)
+# instructions used for vectorized CRC.
+#
+# If the instructions are supported, sets pgac_arm_pmull.
+AC_DEFUN([PGAC_ARM_PLMULL],
+[define([Ac_cachevar], [AS_TR_SH([pgac_cv_arm_pmull_$1])])dnl
+AC_CACHE_CHECK([for pmull and pmull2], [Ac_cachevar],
+[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <arm_acle.h>
+#include <arm_neon.h>
+uint64x2_t  a;
+uint64x2_t  b;
+uint64x2_t  c;
+uint64x2_t  r1;
+uint64x2_t  r2;
+
+  #if defined(__has_attribute) && __has_attribute (target)
+  __attribute__((target("+crypto")))
+  #endif
+  static int pmull_test(void)
+  {
+    __asm("pmull  %0.1q, %2.1d, %3.1d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r1), "+w"(c):"w"(a), "w"(b));
+    __asm("pmull2 %0.1q, %2.2d, %3.2d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r2), "+w"(c):"w"(a), "w"(b));
+
+    r1 = veorq_u64(r1, r2);
+    /* return computed value, to prevent the above being optimized away */
+    return (int) vgetq_lane_u64(r1, 0);
+  }],
+  [return pmull_test();])],
+  [Ac_cachevar=yes],
+  [Ac_cachevar=no])])
+if test x"$Ac_cachevar" = x"yes"; then
+  pgac_arm_pmull=yes
+fi
+undefine([Ac_cachevar])dnl
+])# PGAC_ARM_PLMULL
+
 # PGAC_LOONGARCH_CRC32C_INTRINSICS
 # ---------------------------
 # Check if the compiler supports the LoongArch CRCC instructions, using
index 1182c3dc92e50e91263ddaba66671e1edb16fa29..c56ef60226dc5602db5275cb1ad87c3b05b2ef0b 100755 (executable)
--- a/configure
+++ b/configure
@@ -18358,7 +18358,7 @@ $as_echo "SSE 4.2 with runtime check" >&6; }
 
 $as_echo "#define USE_ARMV8_CRC32C 1" >>confdefs.h
 
-      PG_CRC32C_OBJS="pg_crc32c_armv8.o"
+      PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_armv8_choose.o"
       { $as_echo "$as_me:${as_lineno-$LINENO}: result: ARMv8 CRC instructions" >&5
 $as_echo "ARMv8 CRC instructions" >&6; }
     else
@@ -18443,6 +18443,58 @@ if test x"$pgac_cv_avx512_pclmul_intrinsics" = x"yes"; then
   pgac_avx512_pclmul_intrinsics=yes
 fi
 
+else
+  if test x"$host_cpu" = x"aarch64"; then
+    { $as_echo "$as_me:${as_lineno-$LINENO}: checking for pmull and pmull2" >&5
+$as_echo_n "checking for pmull and pmull2... " >&6; }
+if ${pgac_cv_arm_pmull_+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <arm_acle.h>
+#include <arm_neon.h>
+uint64x2_t  a;
+uint64x2_t  b;
+uint64x2_t  c;
+uint64x2_t  r1;
+uint64x2_t  r2;
+
+  #if defined(__has_attribute) && __has_attribute (target)
+  __attribute__((target("+crypto")))
+  #endif
+  static int pmull_test(void)
+  {
+    __asm("pmull  %0.1q, %2.1d, %3.1d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r1), "+w"(c):"w"(a), "w"(b));
+    __asm("pmull2 %0.1q, %2.2d, %3.2d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r2), "+w"(c):"w"(a), "w"(b));
+
+    r1 = veorq_u64(r1, r2);
+    /* return computed value, to prevent the above being optimized away */
+    return (int) vgetq_lane_u64(r1, 0);
+  }
+int
+main ()
+{
+return pmull_test();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  pgac_cv_arm_pmull_=yes
+else
+  pgac_cv_arm_pmull_=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_arm_pmull_" >&5
+$as_echo "$pgac_cv_arm_pmull_" >&6; }
+if test x"$pgac_cv_arm_pmull_" = x"yes"; then
+  pgac_arm_pmull=yes
+fi
+
+  fi
 fi
 
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for vectorized CRC-32C" >&5
@@ -18454,8 +18506,16 @@ $as_echo "#define USE_AVX512_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h
   { $as_echo "$as_me:${as_lineno-$LINENO}: result: AVX-512 with runtime check" >&5
 $as_echo "AVX-512 with runtime check" >&6; }
 else
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: none" >&5
+  if test x"$pgac_arm_pmull" = x"yes"; then
+
+$as_echo "#define USE_PMULL_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h
+
+    { $as_echo "$as_me:${as_lineno-$LINENO}: result: CRYPTO PMULL with runtime check" >&5
+$as_echo "CRYPTO PMULL with runtime check" >&6; }
+  else
+    { $as_echo "$as_me:${as_lineno-$LINENO}: result: none" >&5
 $as_echo "none" >&6; }
+  fi
 fi
 
 # Select semaphore implementation type.
index 39d8fe0e77b36727459985480f066eaa335cdbff..ff5dd64468edc574831a67c1b29a359d5244e30a 100644 (file)
@@ -2277,7 +2277,7 @@ else
   else
     if test x"$USE_ARMV8_CRC32C" = x"1"; then
       AC_DEFINE(USE_ARMV8_CRC32C, 1, [Define to 1 to use ARMv8 CRC Extension.])
-      PG_CRC32C_OBJS="pg_crc32c_armv8.o"
+      PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_armv8_choose.o"
       AC_MSG_RESULT(ARMv8 CRC instructions)
     else
       if test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then
@@ -2304,6 +2304,10 @@ AC_SUBST(PG_CRC32C_OBJS)
 #
 if test x"$host_cpu" = x"x86_64"; then
   PGAC_AVX512_PCLMUL_INTRINSICS()
+else
+  if test x"$host_cpu" = x"aarch64"; then
+    PGAC_ARM_PLMULL()
+  fi
 fi
 
 AC_MSG_CHECKING([for vectorized CRC-32C])
@@ -2311,7 +2315,12 @@ if test x"$pgac_avx512_pclmul_intrinsics" = x"yes"; then
   AC_DEFINE(USE_AVX512_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use AVX-512 CRC algorithms with a runtime check.])
   AC_MSG_RESULT(AVX-512 with runtime check)
 else
-  AC_MSG_RESULT(none)
+  if test x"$pgac_arm_pmull" = x"yes"; then
+    AC_DEFINE(USE_PMULL_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use Arm PMULL CRC algorithms with a runtime check.])
+    AC_MSG_RESULT(CRYPTO PMULL with runtime check)
+  else
+    AC_MSG_RESULT(none)
+  fi
 fi
 
 # Select semaphore implementation type.
index 1cecd7d1b849c86d78d67a77c661ee7a361e0e2d..43d5ffc30b12906ec6a3b401bea387b79175caf3 100644 (file)
@@ -2747,6 +2747,39 @@ int main(void)
     have_optimized_crc = true
   endif
 
+    # Check if the compiler supports Arm CRYPTO PMULL (carryless multiplication)
+    # instructions used for vectorized CRC.
+    prog = '''
+#include <arm_acle.h>
+#include <arm_neon.h>
+uint64x2_t     a;
+uint64x2_t     b;
+uint64x2_t     c;
+
+#if defined(__has_attribute) && __has_attribute (target)
+__attribute__((target("+crypto")))
+#endif
+int main(void)
+{
+    uint64x2_t r1;
+    uint64x2_t r2;
+
+       __asm("pmull  %0.1q, %2.1d, %3.1d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r1), "+w"(c):"w"(a), "w"(b));
+       __asm("pmull2 %0.1q, %2.2d, %3.2d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r2), "+w"(c):"w"(a), "w"(b));
+
+       r1 = veorq_u64(r1, r2);
+       /* return computed value, to prevent the above being optimized away */
+       return (int) vgetq_lane_u64(r1, 0);
+}
+'''
+
+  if cc.links(prog,
+      name: 'CRYPTO CRC32C',
+      args: test_c_args)
+    # Use ARM CRYPTO Extension, with runtime check
+    cdata.set('USE_PMULL_CRC32C_WITH_RUNTIME_CHECK', 1)
+  endif
+
 elif host_cpu == 'loongarch64'
 
   prog = '''
index f624bda32b446314a34a0a9eff5aba9ac165fb7c..9f6d512347e7c08b66f222430382a0c21a04d44b 100644 (file)
 /* Define to 1 to build with PAM support. (--with-pam) */
 #undef USE_PAM
 
+/* Define to 1 to use Arm PMULL CRC algorithms with a runtime check. */
+#undef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK
+
 /* Define to 1 to use software CRC-32C implementation (slicing-by-8). */
 #undef USE_SLICING_BY_8_CRC32C
 
index 1f8e837d11912e619658493379f5075acccd5228..10518614664d4f058f7fe75330680bf686cb71ef 100644 (file)
@@ -111,13 +111,22 @@ extern pg_crc32c pg_comp_crc32c_avx512(pg_crc32c crc, const void *data, size_t l
 #endif
 
 #elif defined(USE_ARMV8_CRC32C)
-/* Use ARMv8 CRC Extension instructions. */
-
+/*
+ * Use either ARMv8 CRC Extension or CRYPTO Extension (PMULL) instructions.
+ * This build targets the CRC instructions directly, so they need no runtime
+ * check; for constant inputs, which we assume are small, we can therefore
+ * avoid an indirect function call.
+ */
 #define COMP_CRC32C(crc, data, len)                                                    \
-       ((crc) = pg_comp_crc32c_armv8((crc), (data), (len)))
+       ((crc) = __builtin_constant_p(len) ?                                    \
+               pg_comp_crc32c_armv8((crc), (data), (len)) :            \
+               pg_comp_crc32c((crc), (data), (len)))
 #define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF)
 
+extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len);
 extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len);
+#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK
+extern pg_crc32c pg_comp_crc32c_pmull(pg_crc32c crc, const void *data, size_t len);
+#endif
 
 #elif defined(USE_LOONGARCH_CRC32C)
 /* Use LoongArch CRCC instructions. */
@@ -131,8 +140,8 @@ extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_
 #elif defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK)
 
 /*
- * Use ARMv8 instructions, but perform a runtime check first
- * to check that they are available.
+ * Use either ARMv8 CRC Extension or CRYPTO Extension (PMULL) instructions,
+ * but perform a runtime check first to check that they are available.
  */
 #define COMP_CRC32C(crc, data, len) \
        ((crc) = pg_comp_crc32c((crc), (data), (len)))
@@ -141,6 +150,9 @@ extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_
 extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len);
 extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len);
 extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len);
+#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK
+extern pg_crc32c pg_comp_crc32c_pmull(pg_crc32c crc, const void *data, size_t len);
+#endif
 
 #else
 /*
index d55cb0424f33404ae12d6881d682c1985dbee929..922b3f646768d48599f19c03276ba7c2e160474e 100644 (file)
@@ -93,6 +93,7 @@ replace_funcs_pos = [
   # arm / aarch64
   ['pg_crc32c_armv8', 'USE_ARMV8_CRC32C'],
   ['pg_crc32c_armv8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK', 'crc'],
+  ['pg_crc32c_armv8_choose', 'USE_ARMV8_CRC32C'],
   ['pg_crc32c_armv8_choose', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'],
   ['pg_crc32c_sb8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'],
 
index 9ca0f728d398520bed505ff420d3b3742c8773de..b404e6c373e571418ad18459644f27ff63cb685b 100644 (file)
 #include <arm_acle.h>
 #endif
 
+#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK
+#include <arm_neon.h>
+#endif
+
 #include "port/pg_crc32c.h"
 
 pg_crc32c
@@ -77,3 +81,127 @@ pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len)
 
        return crc;
 }
+
+#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK
+
+/*
+ * Note: There is no copyright notice in the following generated code.
+ *
+ * We have modified the output to
+ *   - match our function declaration
+ *   - match whitespace to our project style
+ *   - be more friendly for pgindent
+ */
+
+/* Generated by https://github.com/corsix/fast-crc32/ using: */
+/* ./generate -i neon -p crc32c -a v4e */
+/* MIT licensed */
+
+pg_attribute_target("+crypto")
+static inline
+uint64x2_t
+clmul_lo_e(uint64x2_t a, uint64x2_t b, uint64x2_t c)
+{
+       uint64x2_t      r;
+
+__asm("pmull %0.1q, %2.1d, %3.1d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r), "+w"(c):"w"(a), "w"(b));
+       return r;
+}
+
+pg_attribute_target("+crypto")
+static inline
+uint64x2_t
+clmul_hi_e(uint64x2_t a, uint64x2_t b, uint64x2_t c)
+{
+       uint64x2_t      r;
+
+__asm("pmull2 %0.1q, %2.2d, %3.2d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r), "+w"(c):"w"(a), "w"(b));
+       return r;
+}
+
+pg_attribute_target("+crypto")
+pg_crc32c
+pg_comp_crc32c_pmull(pg_crc32c crc, const void *data, size_t len)
+{
+       /* adjust names to match generated code */
+       pg_crc32c       crc0 = crc;
+       const char *buf = data;
+
+       /* align to 16 bytes */
+       for (; len && ((uintptr_t) buf & 7); --len)
+       {
+               crc0 = __crc32cb(crc0, *buf++);
+       }
+       if (((uintptr_t) buf & 8) && len >= 8)
+       {
+               crc0 = __crc32cd(crc0, *(const uint64_t *) buf);
+               buf += 8;
+               len -= 8;
+       }
+
+       if (len >= 64)
+       {
+               const char *end = buf + len;
+               const char *limit = buf + len - 64;
+
+               /* First vector chunk. */
+               uint64x2_t      x0 = vld1q_u64((const uint64_t *) buf),
+                                       y0;
+               uint64x2_t      x1 = vld1q_u64((const uint64_t *) (buf + 16)),
+                                       y1;
+               uint64x2_t      x2 = vld1q_u64((const uint64_t *) (buf + 32)),
+                                       y2;
+               uint64x2_t      x3 = vld1q_u64((const uint64_t *) (buf + 48)),
+                                       y3;
+               uint64x2_t      k;
+
+               {
+                       static const uint64_t pg_attribute_aligned(16) k_[] = {0x740eef02, 0x9e4addf8};
+
+                       k = vld1q_u64(k_);
+               }
+
+               /*
+                * pgindent complained of unmatched parens, so the following has
+                * been re-written with intrinsics:
+                *
+                * x0 = veorq_u64((uint64x2_t) {crc0, 0}, x0);
+                */
+               x0 = veorq_u64((uint64x2_t) vsetq_lane_u64(crc0, vdupq_n_u64(0), 0), x0);
+               buf += 64;
+
+               /* Main loop. */
+               while (buf <= limit)
+               {
+                       y0 = clmul_lo_e(x0, k, vld1q_u64((const uint64_t *) buf)), x0 = clmul_hi_e(x0, k, y0);
+                       y1 = clmul_lo_e(x1, k, vld1q_u64((const uint64_t *) (buf + 16))), x1 = clmul_hi_e(x1, k, y1);
+                       y2 = clmul_lo_e(x2, k, vld1q_u64((const uint64_t *) (buf + 32))), x2 = clmul_hi_e(x2, k, y2);
+                       y3 = clmul_lo_e(x3, k, vld1q_u64((const uint64_t *) (buf + 48))), x3 = clmul_hi_e(x3, k, y3);
+                       buf += 64;
+               }
+
+               /* Reduce x0 ... x3 to just x0. */
+               {
+                       static const uint64_t pg_attribute_aligned(16) k_[] = {0xf20c0dfe, 0x493c7d27};
+
+                       k = vld1q_u64(k_);
+               }
+               y0 = clmul_lo_e(x0, k, x1), x0 = clmul_hi_e(x0, k, y0);
+               y2 = clmul_lo_e(x2, k, x3), x2 = clmul_hi_e(x2, k, y2);
+               {
+                       static const uint64_t pg_attribute_aligned(16) k_[] = {0x3da6d0cb, 0xba4fc28e};
+
+                       k = vld1q_u64(k_);
+               }
+               y0 = clmul_lo_e(x0, k, x2), x0 = clmul_hi_e(x0, k, y0);
+
+               /* Reduce 128 bits to 32 bits, and multiply by x^32. */
+               crc0 = __crc32cd(0, vgetq_lane_u64(x0, 0));
+               crc0 = __crc32cd(crc0, vgetq_lane_u64(x0, 1));
+               len = end - buf;
+       }
+
+       return pg_comp_crc32c_armv8(crc0, buf, len);
+}
+
+#endif
index a1f0e540c6b4905a6207883f5355e76b9172de6f..72d70aea1e16468b4a9dff4c2bd7e724b8d2f651 100644 (file)
@@ -108,6 +108,27 @@ pg_crc32c_armv8_available(void)
 #endif
 }
 
+static inline bool
+pg_pmull_available(void)
+{
+#if defined(__aarch64__) && defined(HWCAP_PMULL)
+
+#ifdef HAVE_ELF_AUX_INFO
+       unsigned long value;
+
+       return elf_aux_info(AT_HWCAP, &value, sizeof(value)) == 0 &&
+               (value & HWCAP_PMULL) != 0;
+#elif defined(HAVE_GETAUXVAL)
+       return (getauxval(AT_HWCAP) & HWCAP_PMULL) != 0;
+#else
+       return false;
+#endif
+
+#else
+       return false;
+#endif
+}
+
 /*
  * This gets called on the first call. It replaces the function pointer
  * so that subsequent calls are routed directly to the chosen implementation.
@@ -115,10 +136,23 @@ pg_crc32c_armv8_available(void)
 static pg_crc32c
 pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len)
 {
+       /* set fallbacks */
+#ifdef USE_ARMV8_CRC32C
+	/* On e.g. macOS, our runtime feature detection doesn't work */
+       pg_comp_crc32c = pg_comp_crc32c_armv8;
+#else
+       pg_comp_crc32c = pg_comp_crc32c_sb8;
+#endif
+
        if (pg_crc32c_armv8_available())
+       {
                pg_comp_crc32c = pg_comp_crc32c_armv8;
-       else
-               pg_comp_crc32c = pg_comp_crc32c_sb8;
+
+#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK
+               if (pg_pmull_available())
+                       pg_comp_crc32c = pg_comp_crc32c_pmull;
+#endif
+       }
 
        return pg_comp_crc32c(crc, data, len);
 }