Add compile-time native feature detection macros

author Nathan Moinvaziri <nathan@nathanm.com>

Sat, 7 Feb 2026 08:00:44 +0000 (00:00 -0800)

committer Hans Kristian Rosbach <hk-github@circlestorm.org>

Sun, 8 Mar 2026 22:33:57 +0000 (23:33 +0100)
author Nathan Moinvaziri <nathan@nathanm.com>
Sat, 7 Feb 2026 08:00:44 +0000 (00:00 -0800)
committer Hans Kristian Rosbach <hk-github@circlestorm.org>
Sun, 8 Mar 2026 22:33:57 +0000 (23:33 +0100)
diff --git a/arch/arm/arm_functions.h b/arch/arm/arm_functions.h

index 34ba87b067e5045220fc9ff2e6679a7cd0d9c435..bc77adb977d69e65d2d298857592cfd199071ec4 100644 (file)
--- a/arch/arm/arm_functions.h
+++ b/arch/arm/arm_functions.h
@@ -5,6 +5,8 @@
  #ifndef ARM_FUNCTIONS_H_
  #define ARM_FUNCTIONS_H_
  
+#include "arm_natives.h"
+
  #ifdef ARM_NEON
  uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, size_t len);
  uint32_t adler32_copy_neon(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
@@ -29,15 +31,14 @@ uint32_t crc32_copy_armv8_pmull_eor3(uint32_t crc, uint8_t *dst, const uint8_t *
  void slide_hash_armv6(deflate_state *s);
  #endif
  
-
  #ifdef DISABLE_RUNTIME_CPU_DETECTION
  // ARM - SIMD
-#  if (defined(ARM_SIMD) && defined(__ARM_FEATURE_SIMD32)) || defined(ARM_NOCHECK_SIMD)
+#  ifdef ARM_SIMD_NATIVE
  #    undef native_slide_hash
  #    define native_slide_hash slide_hash_armv6
  #  endif
  // ARM - NEON
-#  if (defined(ARM_NEON) && (defined(__ARM_NEON__) || defined(__ARM_NEON))) || ARM_NOCHECK_NEON
+#  ifdef ARM_NEON_NATIVE
  #    undef native_adler32
  #    define native_adler32 adler32_neon
  #    undef native_adler32_copy
@@ -56,14 +57,14 @@ void slide_hash_armv6(deflate_state *s);
  #    define native_slide_hash slide_hash_neon
  #  endif
  // ARM - CRC32
-#  if (defined(ARM_CRC32) && defined(__ARM_FEATURE_CRC32))
+#  ifdef ARM_CRC32_NATIVE
  #    undef native_crc32
  #    define native_crc32 crc32_armv8
  #    undef native_crc32_copy
  #    define native_crc32_copy crc32_copy_armv8
  #  endif
  // ARM - PMULL EOR3
-#  if (defined(ARM_PMULL_EOR3) && defined(__ARM_FEATURE_CRC32) && defined(__ARM_FEATURE_CRYPTO) && defined(__ARM_FEATURE_SHA3))
+#  ifdef ARM_PMULL_EOR3_NATIVE
  #    undef native_crc32
  #    define native_crc32 crc32_armv8_pmull_eor3
  #    undef native_crc32_copy
diff --git a/arch/arm/arm_natives.h b/arch/arm/arm_natives.h

new file mode 100644 (file)

index 0000000..311e33e
--- /dev/null
+++ b/arch/arm/arm_natives.h
@@ -0,0 +1,31 @@
+/* arm_natives.h -- ARM compile-time feature detection macros.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef ARM_NATIVES_H_
+#define ARM_NATIVES_H_
+
+#if defined(__ARM_FEATURE_SIMD32)
+#  ifdef ARM_SIMD
+#    define ARM_SIMD_NATIVE
+#  endif
+#endif
+/* NEON is guaranteed on ARM64 (like SSE2 on x86-64) */
+#if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(ARCH_64BIT)
+#  ifdef ARM_NEON
+#    define ARM_NEON_NATIVE
+#  endif
+#endif
+/* CRC32 is optional in ARMv8.0, mandatory in ARMv8.1+ */
+#if defined(__ARM_FEATURE_CRC32) || (defined(__ARM_ARCH) && __ARM_ARCH >= 801)
+#  ifdef ARM_CRC32
+#    define ARM_CRC32_NATIVE
+#  endif
+#endif
+#if defined(__ARM_FEATURE_CRC32) && defined(__ARM_FEATURE_CRYPTO) && defined(__ARM_FEATURE_SHA3)
+#  ifdef ARM_PMULL_EOR3
+#    define ARM_PMULL_EOR3_NATIVE
+#  endif
+#endif
+
+#endif /* ARM_NATIVES_H_ */
diff --git a/arch/loongarch/loongarch_functions.h b/arch/loongarch/loongarch_functions.h

index 922c6c41653152dc0200ae5171921892431d0c56..0ec8bd66d79367ba124b784054579b92c9b0a17b 100644 (file)
--- a/arch/loongarch/loongarch_functions.h
+++ b/arch/loongarch/loongarch_functions.h
@@ -8,6 +8,8 @@
  #ifndef LOONGARCH_FUNCTIONS_H_
  #define LOONGARCH_FUNCTIONS_H_
  
+#include "loongarch_natives.h"
+
  #ifdef LOONGARCH_CRC
  uint32_t crc32_loongarch64(uint32_t crc, const uint8_t *buf, size_t len);
  uint32_t crc32_copy_loongarch64(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
@@ -36,14 +38,14 @@ void slide_hash_lasx(deflate_state *s);
  #endif
  
  #ifdef DISABLE_RUNTIME_CPU_DETECTION
-// LOONGARCH - CRC32 - All known CPUs has crc instructions
-#  if defined(LOONGARCH_CRC)
+// LOONGARCH - CRC32
+#  ifdef LOONGARCH_CRC_NATIVE
  #    undef native_crc32
  #    define native_crc32 crc32_loongarch64
  #    undef native_crc32_copy
  #    define native_crc32_copy crc32_copy_loongarch64
  #  endif
-#  if defined(LOONGARCH_LSX) && defined(__loongarch_sx)
+#  ifdef LOONGARCH_LSX_NATIVE
  #    undef native_adler32
  #    define native_adler32 adler32_lsx
  #    undef native_adler32_copy
@@ -61,7 +63,7 @@ void slide_hash_lasx(deflate_state *s);
  #    undef native_slide_hash
  #    define native_slide_hash slide_hash_lsx
  #  endif
-#  if defined(LOONGARCH_LASX) && defined(__loongarch_asx)
+#  ifdef LOONGARCH_LASX_NATIVE
  #    undef native_adler32
  #    define native_adler32 adler32_lasx
  #    undef native_adler32_copy
diff --git a/arch/loongarch/loongarch_natives.h b/arch/loongarch/loongarch_natives.h

new file mode 100644 (file)

index 0000000..35f6d3c
--- /dev/null
+++ b/arch/loongarch/loongarch_natives.h
@@ -0,0 +1,25 @@
+/* loongarch_natives.h -- LoongArch compile-time feature detection macros.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef LOONGARCH_NATIVES_H_
+#define LOONGARCH_NATIVES_H_
+
+#if defined(__loongarch__)
+// All known CPUs have crc instructions
+#  ifdef LOONGARCH_CRC
+#    define LOONGARCH_CRC_NATIVE
+#  endif
+#endif
+#if defined(__loongarch_sx)
+#  ifdef LOONGARCH_LSX
+#    define LOONGARCH_LSX_NATIVE
+#  endif
+#endif
+#if defined(__loongarch_asx)
+#  ifdef LOONGARCH_LASX
+#    define LOONGARCH_LASX_NATIVE
+#  endif
+#endif
+
+#endif /* LOONGARCH_NATIVES_H_ */
diff --git a/arch/power/power_functions.h b/arch/power/power_functions.h

index 49ea89e819e0284adb41c985afcd35a3dc4ce509..ccc7754a4c7a8befcebc675c6eb06c569b95c72e 100644 (file)
--- a/arch/power/power_functions.h
+++ b/arch/power/power_functions.h
@@ -7,6 +7,8 @@
  #ifndef POWER_FUNCTIONS_H_
  #define POWER_FUNCTIONS_H_
  
+#include "power_natives.h"
+
  #ifdef PPC_VMX
  uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len);
  uint32_t adler32_copy_vmx(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
@@ -29,10 +31,9 @@ uint32_t longest_match_power9(deflate_state *const s, uint32_t cur_match);
  uint32_t longest_match_slow_power9(deflate_state *const s, uint32_t cur_match);
  #endif
  
-
  #ifdef DISABLE_RUNTIME_CPU_DETECTION
  // Power - VMX
-#  if defined(PPC_VMX) && defined(__ALTIVEC__)
+#  ifdef PPC_VMX_NATIVE
  #    undef native_adler32
  #    define native_adler32 adler32_vmx
  #    undef native_adler32_copy
@@ -41,7 +42,7 @@ uint32_t longest_match_slow_power9(deflate_state *const s, uint32_t cur_match);
  #    define native_slide_hash slide_hash_vmx
  #  endif
  // Power8 - VSX
-#  if defined(POWER8_VSX) && defined(_ARCH_PWR8) && defined(__VSX__)
+#  ifdef POWER8_VSX_NATIVE
  #    undef native_adler32
  #    define native_adler32 adler32_power8
  #    undef native_adler32_copy
@@ -53,14 +54,14 @@ uint32_t longest_match_slow_power9(deflate_state *const s, uint32_t cur_match);
  #    undef native_slide_hash
  #    define native_slide_hash slide_hash_power8
  #  endif
-#  if defined(POWER8_VSX_CRC32) && defined(_ARCH_PWR8) && defined(__VSX__)
+#  ifdef POWER8_VSX_CRC32_NATIVE
  #    undef native_crc32
  #    define native_crc32 crc32_power8
  #    undef native_crc32_copy
  #    define native_crc32_copy crc32_copy_power8
  #  endif
  // Power9
-#  if defined(POWER9) && defined(_ARCH_PWR9)
+#  ifdef POWER9_NATIVE
  #    undef native_compare256
  #    define native_compare256 compare256_power9
  #    undef native_longest_match
diff --git a/arch/power/power_natives.h b/arch/power/power_natives.h

new file mode 100644 (file)

index 0000000..59ec8a8
--- /dev/null
+++ b/arch/power/power_natives.h
@@ -0,0 +1,27 @@
+/* power_natives.h -- POWER compile-time feature detection macros.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef POWER_NATIVES_H_
+#define POWER_NATIVES_H_
+
+#if defined(__ALTIVEC__)
+#  ifdef PPC_VMX
+#    define PPC_VMX_NATIVE
+#  endif
+#endif
+#if defined(_ARCH_PWR8) && defined(__VSX__)
+#  ifdef POWER8_VSX
+#    define POWER8_VSX_NATIVE
+#  endif
+#  ifdef POWER8_VSX_CRC32
+#    define POWER8_VSX_CRC32_NATIVE
+#  endif
+#endif
+#if defined(_ARCH_PWR9)
+#  ifdef POWER9
+#    define POWER9_NATIVE
+#  endif
+#endif
+
+#endif /* POWER_NATIVES_H_ */
diff --git a/arch/riscv/riscv_functions.h b/arch/riscv/riscv_functions.h

index 9e641966a021cfe7553de3d8cf75fba50ae5978b..89120ffabf2420ebbfc49e3785b7a68620f2ec7a 100644 (file)
--- a/arch/riscv/riscv_functions.h
+++ b/arch/riscv/riscv_functions.h
@@ -9,6 +9,8 @@
  #ifndef RISCV_FUNCTIONS_H_
  #define RISCV_FUNCTIONS_H_
  
+#include "riscv_natives.h"
+
  #ifdef RISCV_RVV
  uint32_t adler32_rvv(uint32_t adler, const uint8_t *buf, size_t len);
  uint32_t adler32_copy_rvv(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
@@ -28,7 +30,7 @@ uint32_t crc32_copy_riscv64_zbc(uint32_t crc, uint8_t *dst, const uint8_t *src,
  
  #ifdef DISABLE_RUNTIME_CPU_DETECTION
  // RISCV - RVV
-#  if defined(RISCV_RVV) && defined(__riscv_v) && defined(__linux__)
+#  ifdef RISCV_RVV_NATIVE
  #    undef native_adler32
  #    define native_adler32 adler32_rvv
  #    undef native_adler32_copy
@@ -46,9 +48,8 @@ uint32_t crc32_copy_riscv64_zbc(uint32_t crc, uint8_t *dst, const uint8_t *src,
  #    undef native_slide_hash
  #    define native_slide_hash slide_hash_rvv
  #  endif
-
  // RISCV - CRC32
-#  if (defined(RISCV_CRC32_ZBC) && defined (__riscv_zbc))
+#  ifdef RISCV_ZBC_NATIVE
  #    undef native_crc32
  #    define native_crc32 crc32_riscv64_zbc
  #    undef native_crc32_copy
diff --git a/arch/riscv/riscv_natives.h b/arch/riscv/riscv_natives.h

new file mode 100644 (file)

index 0000000..38d7aba
--- /dev/null
+++ b/arch/riscv/riscv_natives.h
@@ -0,0 +1,19 @@
+/* riscv_natives.h -- RISCV compile-time feature detection macros.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef RISCV_NATIVES_H_
+#define RISCV_NATIVES_H_
+
+#if defined(__riscv_v) && defined(__linux__)
+#  ifdef RISCV_RVV
+#    define RISCV_RVV_NATIVE
+#  endif
+#endif
+#if defined(__riscv_zbc)
+#  ifdef RISCV_CRC32_ZBC
+#    define RISCV_ZBC_NATIVE
+#  endif
+#endif
+
+#endif /* RISCV_NATIVES_H_ */
diff --git a/arch/s390/s390_functions.h b/arch/s390/s390_functions.h

index 7de83abb6e989bf554f01930847bc3d7d26c2454..30647051f408899fb0bc76c089d9a369773af6fd 100644 (file)
--- a/arch/s390/s390_functions.h
+++ b/arch/s390/s390_functions.h
@@ -5,6 +5,8 @@
  #ifndef S390_FUNCTIONS_H_
  #define S390_FUNCTIONS_H_
  
+#include "s390_natives.h"
+
  #ifdef S390_CRC32_VX
  uint32_t crc32_s390_vx(uint32_t crc, const uint8_t *buf, size_t len);
  uint32_t crc32_copy_s390_vx(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
@@ -19,7 +21,8 @@ uint32_t crc32_copy_s390_vx(uint32_t crc, uint8_t *dst, const uint8_t *src, size
  #endif
  
  #ifdef DISABLE_RUNTIME_CPU_DETECTION
-#  if defined(S390_CRC32_VX) && defined(__zarch__) && __ARCH__ >= 11 && defined(__VX__)
+// S390 - CRC32 VX
+#  ifdef S390_CRC32_VX_NATIVE
  #    undef native_crc32
  #    define native_crc32 crc32_s390_vx
  #    undef native_crc32_copy
diff --git a/arch/s390/s390_natives.h b/arch/s390/s390_natives.h

new file mode 100644 (file)

index 0000000..5da913d
--- /dev/null
+++ b/arch/s390/s390_natives.h
@@ -0,0 +1,14 @@
+/* s390_natives.h -- s390 compile-time feature detection macros.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef S390_NATIVES_H_
+#define S390_NATIVES_H_
+
+#if defined(__zarch__) && __ARCH__ >= 11 && defined(__VX__)
+#  ifdef S390_CRC32_VX
+#    define S390_CRC32_VX_NATIVE
+#  endif
+#endif
+
+#endif /* S390_NATIVES_H_ */
diff --git a/arch/x86/x86_functions.h b/arch/x86/x86_functions.h

index f6ec9a137c9ab00d27e94ec5deaaab09735d1fb1..7b628a851af73ebf985e73307829b655b6195e62 100644 (file)
--- a/arch/x86/x86_functions.h
+++ b/arch/x86/x86_functions.h
@@ -6,6 +6,8 @@
  #ifndef X86_FUNCTIONS_H_
  #define X86_FUNCTIONS_H_
  
+#include "x86_natives.h"
+
  /* So great news, your compiler is broken and causes stack smashing. Rather than
   * notching out its compilation we'll just remove the assignment in the functable.
   * Further context:
@@ -80,7 +82,7 @@ uint32_t crc32_copy_vpclmulqdq(uint32_t crc, uint8_t *dst, const uint8_t *src, s
  
  #ifdef DISABLE_RUNTIME_CPU_DETECTION
  // X86 - SSE2
-#  if (defined(X86_SSE2) && defined(__SSE2__)) || (defined(ARCH_X86) && defined(ARCH_64BIT))
+#  ifdef X86_SSE2_NATIVE
  #    undef native_chunkmemset_safe
  #    define native_chunkmemset_safe chunkmemset_safe_sse2
  #    undef native_compare256
@@ -101,7 +103,7 @@ uint32_t crc32_copy_vpclmulqdq(uint32_t crc, uint8_t *dst, const uint8_t *src, s
  #    define native_slide_hash slide_hash_sse2
  #  endif
  // X86 - SSSE3
-#  if defined(X86_SSSE3) && defined(__SSSE3__)
+#  ifdef X86_SSSE3_NATIVE
  #    undef native_adler32
  #    define native_adler32 adler32_ssse3
  #    undef native_adler32_copy
@@ -112,26 +114,26 @@ uint32_t crc32_copy_vpclmulqdq(uint32_t crc, uint8_t *dst, const uint8_t *src, s
  #    define native_inflate_fast inflate_fast_ssse3
  #  endif
  // X86 - SSE4.1
-#  if defined(X86_SSE41) && defined(__SSE4_1__) && !defined(WITHOUT_CHORBA_SSE)
-#   undef native_crc32
-#   define native_crc32 crc32_chorba_sse41
-#   undef native_crc32_copy
-#   define native_crc32_copy crc32_copy_chorba_sse41
+#  if defined(X86_SSE41_NATIVE) && !defined(WITHOUT_CHORBA_SSE)
+#    undef native_crc32
+#    define native_crc32 crc32_chorba_sse41
+#    undef native_crc32_copy
+#    define native_crc32_copy crc32_copy_chorba_sse41
  #  endif
  // X86 - SSE4.2
-#  if defined(X86_SSE42) && defined(__SSE4_2__)
+#  ifdef X86_SSE42_NATIVE
  #    undef native_adler32_copy
  #    define native_adler32_copy adler32_copy_sse42
  #  endif
  // X86 - PCLMUL
-#  if defined(X86_PCLMULQDQ_CRC) && defined(__PCLMUL__)
+#  ifdef X86_PCLMULQDQ_NATIVE
  #    undef native_crc32
  #    define native_crc32 crc32_pclmulqdq
  #    undef native_crc32_copy
  #    define native_crc32_copy crc32_copy_pclmulqdq
  #  endif
  // X86 - AVX2
-#  if defined(X86_AVX2) && defined(__AVX2__)
+#  ifdef X86_AVX2_NATIVE
  #    undef native_adler32
  #    define native_adler32 adler32_avx2
  #    undef native_adler32_copy
@@ -150,7 +152,7 @@ uint32_t crc32_copy_vpclmulqdq(uint32_t crc, uint8_t *dst, const uint8_t *src, s
  #    define native_slide_hash slide_hash_avx2
  #  endif
  // X86 - AVX512 (F,DQ,BW,Vl)
-#  if defined(X86_AVX512) && defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__)
+#  ifdef X86_AVX512_NATIVE
  #    undef native_adler32
  #    define native_adler32 adler32_avx512
  #    undef native_adler32_copy
@@ -166,14 +168,14 @@ uint32_t crc32_copy_vpclmulqdq(uint32_t crc, uint8_t *dst, const uint8_t *src, s
  #    undef native_longest_match_slow
  #    define native_longest_match_slow longest_match_slow_avx512
  // X86 - AVX512 (VNNI)
-#    if defined(X86_AVX512VNNI) && defined(__AVX512VNNI__)
+#    ifdef X86_AVX512VNNI_NATIVE
  #      undef native_adler32
  #      define native_adler32 adler32_avx512_vnni
  #      undef native_adler32_copy
  #      define native_adler32_copy adler32_copy_avx512_vnni
  #    endif
  // X86 - VPCLMULQDQ
-#    if defined(__PCLMUL__) && defined(__AVX512F__) && defined(__VPCLMULQDQ__)
+#    ifdef X86_VPCLMULQDQ_NATIVE
  #      undef native_crc32
  #      define native_crc32 crc32_vpclmulqdq
  #      undef native_crc32_copy
diff --git a/arch/x86/x86_natives.h b/arch/x86/x86_natives.h

new file mode 100644 (file)

index 0000000..75f249d
--- /dev/null
+++ b/arch/x86/x86_natives.h
@@ -0,0 +1,52 @@
+/* x86_natives.h -- x86 compile-time feature detection macros.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef X86_NATIVES_H_
+#define X86_NATIVES_H_
+
+#if defined(__SSE2__) || (defined(ARCH_X86) && defined(ARCH_64BIT))
+#  ifdef X86_SSE2
+#    define X86_SSE2_NATIVE
+#  endif
+#endif
+#if defined(__SSSE3__)
+#  ifdef X86_SSSE3
+#    define X86_SSSE3_NATIVE
+#  endif
+#endif
+#if defined(__SSE4_1__)
+#  ifdef X86_SSE41
+#    define X86_SSE41_NATIVE
+#  endif
+#endif
+#if defined(__SSE4_2__)
+#  ifdef X86_SSE42
+#    define X86_SSE42_NATIVE
+#  endif
+#endif
+#if defined(__PCLMUL__)
+#  ifdef X86_PCLMULQDQ_CRC
+#    define X86_PCLMULQDQ_NATIVE
+#  endif
+#  if defined(__AVX512F__) && defined(__VPCLMULQDQ__)
+#    define X86_VPCLMULQDQ_NATIVE
+#  endif
+#endif
+#if defined(__AVX2__)
+#  ifdef X86_AVX2
+#    define X86_AVX2_NATIVE
+#  endif
+#endif
+#if defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__)
+#  ifdef X86_AVX512
+#    define X86_AVX512_NATIVE
+#  endif
+#endif
+#if defined(__AVX512VNNI__)
+#  ifdef X86_AVX512VNNI
+#    define X86_AVX512VNNI_NATIVE
+#  endif
+#endif
+
+#endif /* X86_NATIVES_H_ */
diff --git a/arch_natives.h b/arch_natives.h

new file mode 100644 (file)

index 0000000..5fe4451
--- /dev/null
+++ b/arch_natives.h
@@ -0,0 +1,24 @@
+/* arch_natives.h -- Compile-time feature detection macros for all architectures.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef ARCH_NATIVES_H_
+#define ARCH_NATIVES_H_
+
+#include "zbuild.h"
+
+#if defined(X86_FEATURES)
+#  include "arch/x86/x86_natives.h"
+#elif defined(ARM_FEATURES)
+#  include "arch/arm/arm_natives.h"
+#elif defined(PPC_FEATURES) || defined(POWER_FEATURES)
+#  include "arch/power/power_natives.h"
+#elif defined(S390_FEATURES)
+#  include "arch/s390/s390_natives.h"
+#elif defined(RISCV_FEATURES)
+#  include "arch/riscv/riscv_natives.h"
+#elif defined(LOONGARCH_FEATURES)
+#  include "arch/loongarch/loongarch_natives.h"
+#endif
+
+#endif /* ARCH_NATIVES_H_ */
diff --git a/functable.c b/functable.c

index b976e725090961b36635f569de872e2a8c13ba04..478051f40da35ddcbd6f925dbae10ad3298710a4 100644 (file)
--- a/functable.c
+++ b/functable.c
@@ -77,12 +77,47 @@ static int init_functable(void) {
  
      // Set up generic C code fallbacks
  #ifndef WITH_ALL_FALLBACKS
-#  if defined(ARCH_X86) && defined(ARCH_64BIT) && defined(X86_SSE2)
-    // x86_64 always has SSE2, so we can use SSE2 functions as fallbacks where available.
+    // Only use necessary generic functions when no suitable simd versions are available.
+#  ifdef X86_SSE2_NATIVE
+    // x86_64 always has SSE2
      ft.adler32 = &adler32_c;
      ft.adler32_copy = &adler32_copy_c;
      ft.crc32 = &crc32_braid;
      ft.crc32_copy = &crc32_copy_braid;
+#  elif defined(ARM_NEON_NATIVE)
+#    ifndef ARM_CRC32_NATIVE
+    ft.crc32 = &crc32_braid;
+    ft.crc32_copy = &crc32_copy_braid;
+#    endif
+#  elif defined(POWER8_VSX_NATIVE)
+#    ifndef POWER9_NATIVE
+    ft.compare256 = &compare256_c;
+    ft.longest_match = &longest_match_c;
+    ft.longest_match_slow = &longest_match_slow_c;
+#    endif
+#    ifndef POWER8_VSX_CRC32_NATIVE
+    ft.crc32 = &crc32_braid;
+    ft.crc32_copy = &crc32_copy_braid;
+#    endif
+#  elif defined(LOONGARCH_LSX_NATIVE)
+#    ifndef LOONGARCH_CRC
+    ft.crc32 = &crc32_braid;
+    ft.crc32_copy = &crc32_copy_braid;
+#    endif
+#  elif defined(RISCV_RVV_NATIVE)
+#    ifndef RISCV_ZBC_NATIVE
+    ft.crc32 = &crc32_braid;
+    ft.crc32_copy = &crc32_copy_braid;
+#    endif
+#  elif defined(S390_CRC32_VX_NATIVE)
+    ft.adler32 = &adler32_c;
+    ft.adler32_copy = &adler32_copy_c;
+    ft.chunkmemset_safe = &chunkmemset_safe_c;
+    ft.compare256 = &compare256_c;
+    ft.inflate_fast = &inflate_fast_c;
+    ft.longest_match = &longest_match_c;
+    ft.longest_match_slow = &longest_match_slow_c;
+    ft.slide_hash = &slide_hash_c;
  #  endif
  #else // WITH_ALL_FALLBACKS
      ft.adler32 = &adler32_c;
@@ -108,60 +143,82 @@ static int init_functable(void) {
  
      // X86 - SSE2
  #ifdef X86_SSE2
-#  ifdef ARCH_32BIT
+#  ifndef X86_SSE2_NATIVE
      if (cf.x86.has_sse2)
  #  endif
      {
+#  ifndef X86_AVX2_NATIVE
          ft.chunkmemset_safe = &chunkmemset_safe_sse2;
          ft.compare256 = &compare256_sse2;
-#  if !defined(WITHOUT_CHORBA_SSE)
-        ft.crc32 = &crc32_chorba_sse2;
-        ft.crc32_copy = &crc32_copy_chorba_sse2;
-#  endif
          ft.inflate_fast = &inflate_fast_sse2;
          ft.longest_match = &longest_match_sse2;
          ft.longest_match_slow = &longest_match_slow_sse2;
          ft.slide_hash = &slide_hash_sse2;
+#  endif
+#  if !defined(WITHOUT_CHORBA_SSE) && !defined(X86_PCLMULQDQ_NATIVE)
+        ft.crc32 = &crc32_chorba_sse2;
+        ft.crc32_copy = &crc32_copy_chorba_sse2;
+#  endif
      }
  #endif
      // X86 - SSSE3
  #ifdef X86_SSSE3
-    if (cf.x86.has_ssse3) {
+#  ifndef X86_SSSE3_NATIVE
+    if (cf.x86.has_ssse3)
+#  endif
+    {
          ft.adler32 = &adler32_ssse3;
          ft.adler32_copy = &adler32_copy_ssse3;
+#  ifndef X86_AVX2_NATIVE
          ft.chunkmemset_safe = &chunkmemset_safe_ssse3;
          ft.inflate_fast = &inflate_fast_ssse3;
+#  endif
      }
  #endif
  
      // X86 - SSE4.1
-#if defined(X86_SSE41) && !defined(WITHOUT_CHORBA_SSE)
-    if (cf.x86.has_sse41) {
+#if defined(X86_SSE41) && !defined(X86_PCLMULQDQ_NATIVE)
+#  ifndef X86_SSE41_NATIVE
+    if (cf.x86.has_sse41)
+#  endif
+    {
+#  ifndef WITHOUT_CHORBA_SSE
          ft.crc32 = &crc32_chorba_sse41;
          ft.crc32_copy = &crc32_copy_chorba_sse41;
+#  endif
      }
  #endif
  
      // X86 - SSE4.2
-#ifdef X86_SSE42
-    if (cf.x86.has_sse42) {
+#if defined(X86_SSE42) && !defined(X86_AVX512_NATIVE)
+#  ifndef X86_SSE42_NATIVE
+    if (cf.x86.has_sse42)
+#  endif
+    {
          ft.adler32_copy = &adler32_copy_sse42;
      }
  #endif
      // X86 - PCLMUL
-#ifdef X86_PCLMULQDQ_CRC
-    if (cf.x86.has_pclmulqdq) {
+#if defined(X86_PCLMULQDQ_CRC) && !defined(X86_VPCLMULQDQ_NATIVE)
+#  ifndef X86_PCLMULQDQ_NATIVE
+    if (cf.x86.has_pclmulqdq)
+#  endif
+    {
          ft.crc32 = &crc32_pclmulqdq;
          ft.crc32_copy = &crc32_copy_pclmulqdq;
      }
  #endif
-    // X86 - AVX
+    // X86 - AVX2
  #ifdef X86_AVX2
      /* BMI2 support is all but implicit with AVX2 but let's sanity check this just in case. Enabling BMI2 allows for
       * flagless shifts, resulting in fewer flag stalls for the pipeline, and allows us to set destination registers
       * for the shift results as an operand, eliminating several register-register moves when the original value needs
       * to remain intact. They also allow for a count operand that isn't the CL register, avoiding contention there */
-    if (cf.x86.has_avx2 && cf.x86.has_bmi2) {
+#  ifndef X86_AVX2_NATIVE
+    if (cf.x86.has_avx2 && cf.x86.has_bmi2)
+#  endif
+    {
+#  ifndef X86_AVX512_NATIVE
          ft.adler32 = &adler32_avx2;
          ft.adler32_copy = &adler32_copy_avx2;
          ft.chunkmemset_safe = &chunkmemset_safe_avx2;
@@ -169,14 +226,20 @@ static int init_functable(void) {
          ft.inflate_fast = &inflate_fast_avx2;
          ft.longest_match = &longest_match_avx2;
          ft.longest_match_slow = &longest_match_slow_avx2;
+#  endif
          ft.slide_hash = &slide_hash_avx2;
      }
  #endif
      // X86 - AVX512 (F,DQ,BW,Vl)
  #ifdef X86_AVX512
-    if (cf.x86.has_avx512_common) {
+#  ifndef X86_AVX512_NATIVE
+    if (cf.x86.has_avx512_common)
+#  endif
+    {
+#  ifndef X86_AVX512VNNI_NATIVE
          ft.adler32 = &adler32_avx512;
          ft.adler32_copy = &adler32_copy_avx512;
+#  endif
          ft.chunkmemset_safe = &chunkmemset_safe_avx512;
          ft.compare256 = &compare256_avx512;
          ft.inflate_fast = &inflate_fast_avx512;
@@ -185,14 +248,20 @@ static int init_functable(void) {
      }
  #endif
  #ifdef X86_AVX512VNNI
-    if (cf.x86.has_avx512vnni) {
+#  ifndef X86_AVX512VNNI_NATIVE
+    if (cf.x86.has_avx512vnni)
+#  endif
+    {
          ft.adler32 = &adler32_avx512_vnni;
          ft.adler32_copy = &adler32_copy_avx512_vnni;
      }
  #endif
      // X86 - VPCLMULQDQ
  #ifdef X86_VPCLMULQDQ_CRC
-    if (cf.x86.has_pclmulqdq && cf.x86.has_avx512_common && cf.x86.has_vpclmulqdq) {
+#  ifndef X86_VPCLMULQDQ_NATIVE
+    if (cf.x86.has_pclmulqdq && cf.x86.has_avx512_common && cf.x86.has_vpclmulqdq)
+#  endif
+    {
          ft.crc32 = &crc32_vpclmulqdq;
          ft.crc32_copy = &crc32_copy_vpclmulqdq;
      }
@@ -200,8 +269,8 @@ static int init_functable(void) {
  
  
      // ARM - SIMD
-#ifdef ARM_SIMD
-#  ifndef ARM_NOCHECK_SIMD
+#if defined(ARM_SIMD) && !defined(ARM_NEON_NATIVE)
+#  ifndef ARM_SIMD_NATIVE
      if (cf.arm.has_simd)
  #  endif
      {
@@ -210,7 +279,7 @@ static int init_functable(void) {
  #endif
      // ARM - NEON
  #ifdef ARM_NEON
-#  ifndef ARM_NOCHECK_NEON
+#  ifndef ARM_NEON_NATIVE
      if (cf.arm.has_neon)
  #  endif
      {
@@ -225,15 +294,21 @@ static int init_functable(void) {
      }
  #endif
      // ARM - CRC32
-#ifdef ARM_CRC32
-    if (cf.arm.has_crc32) {
+#if defined(ARM_CRC32) && !defined(ARM_PMULL_EOR3_NATIVE)
+#  ifndef ARM_CRC32_NATIVE
+    if (cf.arm.has_crc32)
+#  endif
+    {
          ft.crc32 = &crc32_armv8;
          ft.crc32_copy = &crc32_copy_armv8;
      }
  #endif
      // ARM - PMULL EOR3
  #ifdef ARM_PMULL_EOR3
-    if (cf.arm.has_crc32 && cf.arm.has_pmull && cf.arm.has_eor3 && cf.arm.has_fast_pmull) {
+#  ifndef ARM_PMULL_EOR3_NATIVE
+    if (cf.arm.has_crc32 && cf.arm.has_pmull && cf.arm.has_eor3 && cf.arm.has_fast_pmull)
+#  endif
+    {
          ft.crc32 = &crc32_armv8_pmull_eor3;
          ft.crc32_copy = &crc32_copy_armv8_pmull_eor3;
      }
@@ -241,7 +316,10 @@ static int init_functable(void) {
  
      // Power - VMX
  #ifdef PPC_VMX
-    if (cf.power.has_altivec) {
+#  ifndef PPC_VMX_NATIVE
+    if (cf.power.has_altivec)
+#  endif
+    {
          ft.adler32 = &adler32_vmx;
          ft.adler32_copy = &adler32_copy_vmx;
          ft.slide_hash = &slide_hash_vmx;
@@ -249,7 +327,10 @@ static int init_functable(void) {
  #endif
      // Power8 - VSX
  #ifdef POWER8_VSX
-    if (cf.power.has_arch_2_07) {
+#  ifndef POWER8_VSX_NATIVE
+    if (cf.power.has_arch_2_07)
+#  endif
+    {
          ft.adler32 = &adler32_power8;
          ft.adler32_copy = &adler32_copy_power8;
          ft.chunkmemset_safe = &chunkmemset_safe_power8;
@@ -258,14 +339,20 @@ static int init_functable(void) {
      }
  #endif
  #ifdef POWER8_VSX_CRC32
-    if (cf.power.has_arch_2_07) {
+#  ifndef POWER8_VSX_CRC32_NATIVE
+    if (cf.power.has_arch_2_07)
+#  endif
+    {
          ft.crc32 = &crc32_power8;
          ft.crc32_copy = &crc32_copy_power8;
      }
  #endif
      // Power9
  #ifdef POWER9
-    if (cf.power.has_arch_3_00) {
+#  ifndef POWER9_NATIVE
+    if (cf.power.has_arch_3_00)
+#  endif
+    {
          ft.compare256 = &compare256_power9;
          ft.longest_match = &longest_match_power9;
          ft.longest_match_slow = &longest_match_slow_power9;
@@ -275,7 +362,10 @@ static int init_functable(void) {
  
      // RISCV - RVV
  #ifdef RISCV_RVV
-    if (cf.riscv.has_rvv) {
+#  ifndef RISCV_RVV_NATIVE
+    if (cf.riscv.has_rvv)
+#  endif
+    {
          ft.adler32 = &adler32_rvv;
          ft.adler32_copy = &adler32_copy_rvv;
          ft.chunkmemset_safe = &chunkmemset_safe_rvv;
@@ -289,7 +379,10 @@ static int init_functable(void) {
  
      // RISCV - ZBC
  #ifdef RISCV_CRC32_ZBC
-    if (cf.riscv.has_zbc) {
+#  ifndef RISCV_ZBC_NATIVE
+    if (cf.riscv.has_zbc)
+#  endif
+    {
          ft.crc32 = &crc32_riscv64_zbc;
          ft.crc32_copy = &crc32_copy_riscv64_zbc;
      }
@@ -297,7 +390,10 @@ static int init_functable(void) {
  
      // S390
  #ifdef S390_CRC32_VX
-    if (cf.s390.has_vx) {
+#  ifndef S390_CRC32_VX_NATIVE
+    if (cf.s390.has_vx)
+#  endif
+    {
          ft.crc32 = &crc32_s390_vx;
          ft.crc32_copy = &crc32_copy_s390_vx;
      }
@@ -305,13 +401,19 @@ static int init_functable(void) {
  
      // LOONGARCH
  #ifdef LOONGARCH_CRC
-    if (cf.loongarch.has_crc) {
+#  ifndef LOONGARCH_CRC_NATIVE
+    if (cf.loongarch.has_crc)
+#  endif
+    {
          ft.crc32 = &crc32_loongarch64;
          ft.crc32_copy = &crc32_copy_loongarch64;
      }
  #endif
-#ifdef LOONGARCH_LSX
-    if (cf.loongarch.has_lsx) {
+#if defined(LOONGARCH_LSX) && !defined(LOONGARCH_LASX_NATIVE)
+#  ifndef LOONGARCH_LSX_NATIVE
+    if (cf.loongarch.has_lsx)
+#  endif
+    {
          ft.adler32 = &adler32_lsx;
          ft.adler32_copy = &adler32_copy_lsx;
          ft.chunkmemset_safe = &chunkmemset_safe_lsx;
@@ -323,7 +425,10 @@ static int init_functable(void) {
      }
  #endif
  #ifdef LOONGARCH_LASX
-    if (cf.loongarch.has_lasx) {
+#  ifndef LOONGARCH_LASX_NATIVE
+    if (cf.loongarch.has_lasx)
+#  endif
+    {
          ft.adler32 = &adler32_lasx;
          ft.adler32_copy = &adler32_copy_lasx;
          ft.chunkmemset_safe = &chunkmemset_safe_lasx;
author	Nathan Moinvaziri <nathan@nathanm.com>
	Sat, 7 Feb 2026 08:00:44 +0000 (00:00 -0800)
committer	Hans Kristian Rosbach <hk-github@circlestorm.org>
	Sun, 8 Mar 2026 22:33:57 +0000 (23:33 +0100)
arch/arm/arm_functions.h		patch \| blob \| blame \| history
arch/arm/arm_natives.h	[new file with mode: 0644]	patch \| blob
arch/loongarch/loongarch_functions.h		patch \| blob \| blame \| history
arch/loongarch/loongarch_natives.h	[new file with mode: 0644]	patch \| blob
arch/power/power_functions.h		patch \| blob \| blame \| history
arch/power/power_natives.h	[new file with mode: 0644]	patch \| blob
arch/riscv/riscv_functions.h		patch \| blob \| blame \| history
arch/riscv/riscv_natives.h	[new file with mode: 0644]	patch \| blob
arch/s390/s390_functions.h		patch \| blob \| blame \| history
arch/s390/s390_natives.h	[new file with mode: 0644]	patch \| blob
arch/x86/x86_functions.h		patch \| blob \| blame \| history
arch/x86/x86_natives.h	[new file with mode: 0644]	patch \| blob
arch_natives.h	[new file with mode: 0644]	patch \| blob
functable.c		patch \| blob \| blame \| history