undefine([Ac_cachevar])dnl
])# PGAC_ARMV8_CRC32C_INTRINSICS
+# PGAC_ARM_PLMULL
+# ---------------------------
+# Check if the compiler supports Arm CRYPTO PMULL (carryless multiplication)
+# instructions used for vectorized CRC.
+#
+# NOTE(review): the macro name spells "PLMULL" while the instruction, the
+# cache variable and the result variable all spell "PMULL" -- looks like a
+# typo carried through to the configure.ac call site; if renamed, both the
+# definition and the call site must change together.
+#
+# The test program uses inline asm for pmull/pmull2 (matching the CRC
+# implementation) rather than intrinsics; the target("+crypto") attribute,
+# where available, enables the crypto instructions for just the test
+# function so no global -march change is needed.
+#
+# If the instructions are supported, sets pgac_arm_pmull.
+AC_DEFUN([PGAC_ARM_PLMULL],
+[define([Ac_cachevar], [AS_TR_SH([pgac_cv_arm_pmull_$1])])dnl
+AC_CACHE_CHECK([for pmull and pmull2], [Ac_cachevar],
+[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <arm_acle.h>
+#include <arm_neon.h>
+uint64x2_t a;
+uint64x2_t b;
+uint64x2_t c;
+uint64x2_t r1;
+uint64x2_t r2;
+
+	#if defined(__has_attribute) && __has_attribute (target)
+	__attribute__((target("+crypto")))
+	#endif
+	static int pmull_test(void)
+	{
+		__asm("pmull %0.1q, %2.1d, %3.1d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r1), "+w"(c):"w"(a), "w"(b));
+		__asm("pmull2 %0.1q, %2.2d, %3.2d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r2), "+w"(c):"w"(a), "w"(b));
+
+		r1 = veorq_u64(r1, r2);
+		/* return computed value, to prevent the above being optimized away */
+		return (int) vgetq_lane_u64(r1, 0);
+	}],
+  [return pmull_test();])],
+  [Ac_cachevar=yes],
+  [Ac_cachevar=no])])
+if test x"$Ac_cachevar" = x"yes"; then
+  pgac_arm_pmull=yes
+fi
+undefine([Ac_cachevar])dnl
+])# PGAC_ARM_PLMULL
+
# PGAC_LOONGARCH_CRC32C_INTRINSICS
# ---------------------------
# Check if the compiler supports the LoongArch CRCC instructions, using
$as_echo "#define USE_ARMV8_CRC32C 1" >>confdefs.h
- PG_CRC32C_OBJS="pg_crc32c_armv8.o"
+ PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_armv8_choose.o"
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: ARMv8 CRC instructions" >&5
$as_echo "ARMv8 CRC instructions" >&6; }
else
pgac_avx512_pclmul_intrinsics=yes
fi
+else
+ if test x"$host_cpu" = x"aarch64"; then
+ { $as_echo "$as_me:${as_lineno-$LINENO}: checking for pmull and pmull2" >&5
+$as_echo_n "checking for pmull and pmull2... " >&6; }
+if ${pgac_cv_arm_pmull_+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+#include <arm_acle.h>
+#include <arm_neon.h>
+uint64x2_t a;
+uint64x2_t b;
+uint64x2_t c;
+uint64x2_t r1;
+uint64x2_t r2;
+
+ #if defined(__has_attribute) && __has_attribute (target)
+ __attribute__((target("+crypto")))
+ #endif
+ static int pmull_test(void)
+ {
+ __asm("pmull %0.1q, %2.1d, %3.1d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r1), "+w"(c):"w"(a), "w"(b));
+ __asm("pmull2 %0.1q, %2.2d, %3.2d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r2), "+w"(c):"w"(a), "w"(b));
+
+ r1 = veorq_u64(r1, r2);
+ /* return computed value, to prevent the above being optimized away */
+ return (int) vgetq_lane_u64(r1, 0);
+ }
+int
+main ()
+{
+return pmull_test();
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+ pgac_cv_arm_pmull_=yes
+else
+ pgac_cv_arm_pmull_=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+ conftest$ac_exeext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_arm_pmull_" >&5
+$as_echo "$pgac_cv_arm_pmull_" >&6; }
+if test x"$pgac_cv_arm_pmull_" = x"yes"; then
+ pgac_arm_pmull=yes
+fi
+
+ fi
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for vectorized CRC-32C" >&5
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: AVX-512 with runtime check" >&5
$as_echo "AVX-512 with runtime check" >&6; }
else
- { $as_echo "$as_me:${as_lineno-$LINENO}: result: none" >&5
+ if test x"$pgac_arm_pmull" = x"yes"; then
+
+$as_echo "#define USE_PMULL_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h
+
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: CRYPTO PMULL with runtime check" >&5
+$as_echo "CRYPTO PMULL with runtime check" >&6; }
+ else
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: none" >&5
$as_echo "none" >&6; }
+ fi
fi
# Select semaphore implementation type.
else
if test x"$USE_ARMV8_CRC32C" = x"1"; then
AC_DEFINE(USE_ARMV8_CRC32C, 1, [Define to 1 to use ARMv8 CRC Extension.])
- PG_CRC32C_OBJS="pg_crc32c_armv8.o"
+ PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_armv8_choose.o"
AC_MSG_RESULT(ARMv8 CRC instructions)
else
if test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then
#
if test x"$host_cpu" = x"x86_64"; then
PGAC_AVX512_PCLMUL_INTRINSICS()
+else
+ if test x"$host_cpu" = x"aarch64"; then
+ PGAC_ARM_PLMULL()
+ fi
fi
AC_MSG_CHECKING([for vectorized CRC-32C])
AC_DEFINE(USE_AVX512_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use AVX-512 CRC algorithms with a runtime check.])
AC_MSG_RESULT(AVX-512 with runtime check)
else
- AC_MSG_RESULT(none)
+ if test x"$pgac_arm_pmull" = x"yes"; then
+ AC_DEFINE(USE_PMULL_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use Arm PMULL CRC algorithms with a runtime check.])
+ AC_MSG_RESULT(CRYPTO PMULL with runtime check)
+ else
+ AC_MSG_RESULT(none)
+ fi
fi
# Select semaphore implementation type.
have_optimized_crc = true
endif
+ # Check if the compiler supports Arm CRYPTO PMULL (carryless multiplication)
+ # instructions used for vectorized CRC.
+ prog = '''
+#include <arm_acle.h>
+#include <arm_neon.h>
+uint64x2_t a;
+uint64x2_t b;
+uint64x2_t c;
+
+#if defined(__has_attribute) && __has_attribute (target)
+__attribute__((target("+crypto")))
+#endif
+int main(void)
+{
+ uint64x2_t r1;
+ uint64x2_t r2;
+
+ __asm("pmull %0.1q, %2.1d, %3.1d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r1), "+w"(c):"w"(a), "w"(b));
+ __asm("pmull2 %0.1q, %2.2d, %3.2d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r2), "+w"(c):"w"(a), "w"(b));
+
+ r1 = veorq_u64(r1, r2);
+ /* return computed value, to prevent the above being optimized away */
+ return (int) vgetq_lane_u64(r1, 0);
+}
+'''
+
+ if cc.links(prog,
+ name: 'CRYPTO CRC32C',
+ args: test_c_args)
+ # Use ARM CRYPTO Extension, with runtime check
+ cdata.set('USE_PMULL_CRC32C_WITH_RUNTIME_CHECK', 1)
+ endif
+
elif host_cpu == 'loongarch64'
prog = '''
/* Define to 1 to build with PAM support. (--with-pam) */
#undef USE_PAM
+/* Define to 1 to use Arm PMULL CRC algorithms with a runtime check. */
+#undef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK
+
/* Define to 1 to use software CRC-32C implementation (slicing-by-8). */
#undef USE_SLICING_BY_8_CRC32C
#endif
#elif defined(USE_ARMV8_CRC32C)
-/* Use ARMv8 CRC Extension instructions. */
-
+/*
+ * Use either ARMv8 CRC Extension or CRYPTO Extension (PMULL) instructions.
+ * We don't need a runtime check for CRC, so for constant inputs, where
+ * we assume the input is small, we can avoid an indirect function call.
+ */
#define COMP_CRC32C(crc, data, len) \
- ((crc) = pg_comp_crc32c_armv8((crc), (data), (len)))
+ ((crc) = __builtin_constant_p(len) ? \
+ pg_comp_crc32c_armv8((crc), (data), (len)) : \
+ pg_comp_crc32c((crc), (data), (len)))
#define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF)
+extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len);
extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len);
+#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK
+extern pg_crc32c pg_comp_crc32c_pmull(pg_crc32c crc, const void *data, size_t len);
+#endif
#elif defined(USE_LOONGARCH_CRC32C)
/* Use LoongArch CRCC instructions. */
#elif defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK)
/*
- * Use ARMv8 instructions, but perform a runtime check first
- * to check that they are available.
+ * Use either ARMv8 CRC Extension or CRYPTO Extension (PMULL) instructions,
+ * but perform a runtime check first to check that they are available.
*/
#define COMP_CRC32C(crc, data, len) \
((crc) = pg_comp_crc32c((crc), (data), (len)))
extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len);
extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len);
extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len);
+#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK
+extern pg_crc32c pg_comp_crc32c_pmull(pg_crc32c crc, const void *data, size_t len);
+#endif
#else
/*
# arm / aarch64
['pg_crc32c_armv8', 'USE_ARMV8_CRC32C'],
['pg_crc32c_armv8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK', 'crc'],
+ ['pg_crc32c_armv8_choose', 'USE_ARMV8_CRC32C'],
['pg_crc32c_armv8_choose', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'],
['pg_crc32c_sb8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'],
#include <arm_acle.h>
#endif
+#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK
+#include <arm_neon.h>
+#endif
+
#include "port/pg_crc32c.h"
pg_crc32c
return crc;
}
+
+#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK
+
+/*
+ * Note: There is no copyright notice in the following generated code.
+ *
+ * We have modified the output to
+ * - match our function declaration
+ * - match whitespace to our project style
+ * - be more friendly for pgindent
+ */
+
+/* Generated by https://github.com/corsix/fast-crc32/ using: */
+/* ./generate -i neon -p crc32c -a v4e */
+/* MIT licensed */
+
+/* Generated helper: r = (carryless product of low 64-bit lanes of a, b) ^ c */
+pg_attribute_target("+crypto")
+static inline
+uint64x2_t
+clmul_lo_e(uint64x2_t a, uint64x2_t b, uint64x2_t c)
+{
+	uint64x2_t	r;
+
+	/*
+	 * pmull carrylessly multiplies the low 64-bit lanes of a and b into a
+	 * 128-bit product; the following eor XORs that product with c.
+	 */
+__asm("pmull %0.1q, %2.1d, %3.1d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r), "+w"(c):"w"(a), "w"(b));
+	return r;
+}
+
+/* Generated helper: r = (carryless product of high 64-bit lanes of a, b) ^ c */
+pg_attribute_target("+crypto")
+static inline
+uint64x2_t
+clmul_hi_e(uint64x2_t a, uint64x2_t b, uint64x2_t c)
+{
+	uint64x2_t	r;
+
+	/*
+	 * pmull2 carrylessly multiplies the high 64-bit lanes of a and b into a
+	 * 128-bit product; the following eor XORs that product with c.
+	 */
+__asm("pmull2 %0.1q, %2.2d, %3.2d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r), "+w"(c):"w"(a), "w"(b));
+	return r;
+}
+
+/*
+ * Compute CRC-32C using PMULL carryless multiplication for the bulk
+ * (>= 64-byte) portion of the input, processing the unaligned head with
+ * the ARMv8 CRC instructions (__crc32cb/__crc32cd) and delegating the
+ * remaining tail to pg_comp_crc32c_armv8.
+ */
+pg_attribute_target("+crypto")
+pg_crc32c
+pg_comp_crc32c_pmull(pg_crc32c crc, const void *data, size_t len)
+{
+	/* adjust names to match generated code */
+	pg_crc32c	crc0 = crc;
+	const char *buf = data;
+
+	/* align to 16 bytes */
+	for (; len && ((uintptr_t) buf & 7); --len)
+	{
+		crc0 = __crc32cb(crc0, *buf++);
+	}
+	if (((uintptr_t) buf & 8) && len >= 8)
+	{
+		crc0 = __crc32cd(crc0, *(const uint64_t *) buf);
+		buf += 8;
+		len -= 8;
+	}
+
+	if (len >= 64)
+	{
+		const char *end = buf + len;
+		const char *limit = buf + len - 64;
+
+		/* First vector chunk. */
+		uint64x2_t	x0 = vld1q_u64((const uint64_t *) buf),
+					y0;
+		uint64x2_t	x1 = vld1q_u64((const uint64_t *) (buf + 16)),
+					y1;
+		uint64x2_t	x2 = vld1q_u64((const uint64_t *) (buf + 32)),
+					y2;
+		uint64x2_t	x3 = vld1q_u64((const uint64_t *) (buf + 48)),
+					y3;
+		uint64x2_t	k;
+
+		{
+			/* folding constants come from the generator; not re-derived here */
+			static const uint64_t pg_attribute_aligned(16) k_[] = {0x740eef02, 0x9e4addf8};
+
+			k = vld1q_u64(k_);
+		}
+
+		/*
+		 * pgindent complained of unmatched parens, so the following has
+		 * been re-written with intrinsics:
+		 *
+		 * x0 = veorq_u64((uint64x2_t) {crc0, 0}, x0);
+		 */
+		x0 = veorq_u64((uint64x2_t) vsetq_lane_u64(crc0, vdupq_n_u64(0), 0), x0);
+		buf += 64;
+
+		/* Main loop. */
+		while (buf <= limit)
+		{
+			y0 = clmul_lo_e(x0, k, vld1q_u64((const uint64_t *) buf)), x0 = clmul_hi_e(x0, k, y0);
+			y1 = clmul_lo_e(x1, k, vld1q_u64((const uint64_t *) (buf + 16))), x1 = clmul_hi_e(x1, k, y1);
+			y2 = clmul_lo_e(x2, k, vld1q_u64((const uint64_t *) (buf + 32))), x2 = clmul_hi_e(x2, k, y2);
+			y3 = clmul_lo_e(x3, k, vld1q_u64((const uint64_t *) (buf + 48))), x3 = clmul_hi_e(x3, k, y3);
+			buf += 64;
+		}
+
+		/* Reduce x0 ... x3 to just x0. */
+		{
+			static const uint64_t pg_attribute_aligned(16) k_[] = {0xf20c0dfe, 0x493c7d27};
+
+			k = vld1q_u64(k_);
+		}
+		y0 = clmul_lo_e(x0, k, x1), x0 = clmul_hi_e(x0, k, y0);
+		y2 = clmul_lo_e(x2, k, x3), x2 = clmul_hi_e(x2, k, y2);
+		{
+			static const uint64_t pg_attribute_aligned(16) k_[] = {0x3da6d0cb, 0xba4fc28e};
+
+			k = vld1q_u64(k_);
+		}
+		y0 = clmul_lo_e(x0, k, x2), x0 = clmul_hi_e(x0, k, y0);
+
+		/* Reduce 128 bits to 32 bits, and multiply by x^32. */
+		crc0 = __crc32cd(0, vgetq_lane_u64(x0, 0));
+		crc0 = __crc32cd(crc0, vgetq_lane_u64(x0, 1));
+		len = end - buf;
+	}
+
+	/* remaining 0..63 bytes go through the ARMv8 CRC instruction path */
+	return pg_comp_crc32c_armv8(crc0, buf, len);
+}
+
+#endif
#endif
}
+/*
+ * Test at runtime whether the CPU advertises the PMULL (carryless
+ * multiplication) capability in the AT_HWCAP auxiliary vector.  Returns
+ * false when not building for aarch64, when HWCAP_PMULL is not defined,
+ * or when neither elf_aux_info() nor getauxval() is available to query it.
+ */
+static inline bool
+pg_pmull_available(void)
+{
+#if defined(__aarch64__) && defined(HWCAP_PMULL)
+
+#ifdef HAVE_ELF_AUX_INFO
+	unsigned long value;
+
+	return elf_aux_info(AT_HWCAP, &value, sizeof(value)) == 0 &&
+		(value & HWCAP_PMULL) != 0;
+#elif defined(HAVE_GETAUXVAL)
+	return (getauxval(AT_HWCAP) & HWCAP_PMULL) != 0;
+#else
+	return false;
+#endif
+
+#else
+	return false;
+#endif
+}
+
/*
* This gets called on the first call. It replaces the function pointer
* so that subsequent calls are routed directly to the chosen implementation.
static pg_crc32c
pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len)
{
+	/* set fallbacks */
+#ifdef USE_ARMV8_CRC32C
+	/* On e.g. MacOS, our runtime feature detection doesn't work */
+	pg_comp_crc32c = pg_comp_crc32c_armv8;
+#else
+	pg_comp_crc32c = pg_comp_crc32c_sb8;
+#endif
+
	if (pg_crc32c_armv8_available())
+	{
		pg_comp_crc32c = pg_comp_crc32c_armv8;
-	else
-		pg_comp_crc32c = pg_comp_crc32c_sb8;
+
+#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK
+		/*
+		 * The PMULL variant is only selected when the ARMv8 CRC
+		 * instructions are also available, because pg_comp_crc32c_pmull
+		 * delegates head/tail processing to pg_comp_crc32c_armv8.
+		 */
+		if (pg_pmull_available())
+			pg_comp_crc32c = pg_comp_crc32c_pmull;
+#endif
+	}
	return pg_comp_crc32c(crc, data, len);
}