From: Hans Kristian Rosbach Date: Mon, 6 Feb 2023 13:41:32 +0000 (+0100) Subject: Reduce the amount of different defines required for arch-specific optimizations. X-Git-Tag: 2.1.0-beta1~40 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=7e1d80742e344b294cf3802e5ef13be42672b32c;p=thirdparty%2Fzlib-ng.git Reduce the amount of different defines required for arch-specific optimizations. Also removed a reference to a nonexistent adler32_sse41 in test/test_adler32.cc. --- diff --git a/CMakeLists.txt b/CMakeLists.txt index 8aad597f..df5cc03e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -616,7 +616,7 @@ if(WITH_OPTIM) if(WITH_ACLE AND NOT "${ARCH}" MATCHES "armv[2-7]") check_acle_compiler_flag() if(HAVE_ACLE_FLAG) - add_definitions(-DARM_ACLE_CRC_HASH) + add_definitions(-DARM_ACLE) set(ACLE_SRCS ${ARCHDIR}/crc32_acle.c ${ARCHDIR}/insert_string_acle.c) set_property(SOURCE ${ACLE_SRCS} PROPERTY COMPILE_FLAGS "${ACLEFLAG} ${NOLTOFLAG}") list(APPEND ZLIB_ARCH_SRCS ${ACLE_SRCS}) @@ -630,7 +630,7 @@ if(WITH_OPTIM) if(WITH_NEON) check_neon_compiler_flag() if(MFPU_NEON_AVAILABLE) - add_definitions(-DARM_NEON -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH) + add_definitions(-DARM_NEON) set(NEON_SRCS ${ARCHDIR}/adler32_neon.c ${ARCHDIR}/chunkset_neon.c ${ARCHDIR}/compare256_neon.c ${ARCHDIR}/slide_hash_neon.c) list(APPEND ZLIB_ARCH_SRCS ${NEON_SRCS}) @@ -668,8 +668,7 @@ if(WITH_OPTIM) if(HAVE_VMX) add_definitions(-DPPC_FEATURES) if(HAVE_ALTIVEC) - add_definitions(-DPPC_VMX_ADLER32) - add_definitions(-DPPC_VMX_SLIDEHASH) + add_definitions(-DPPC_VMX) set(PPC_SRCS ${ARCHDIR}/adler32_vmx.c ${ARCHDIR}/slide_hash_vmx.c) list(APPEND ZLIB_ARCH_SRCS ${PPC_SRCS}) add_feature_info(ALTIVEC 1 "Support the AltiVec instruction set, using \"-maltivec\"") @@ -682,11 +681,8 @@ if(WITH_OPTIM) # Power8 specific options and files if(WITH_POWER8) if(HAVE_POWER8_INTRIN) - add_definitions(-DPOWER8) + add_definitions(-DPOWER8_VSX) add_definitions(-DPOWER_FEATURES) - 
add_definitions(-DPOWER8_VSX_ADLER32) - add_definitions(-DPOWER8_VSX_CHUNKSET) - add_definitions(-DPOWER8_VSX_SLIDEHASH) set(POWER8_SRCS ${ARCHDIR}/adler32_power8.c ${ARCHDIR}/chunkset_power8.c ${ARCHDIR}/slide_hash_power8.c) if("${ARCH}" MATCHES "powerpc64(le)?") add_definitions(-DPOWER8_VSX_CRC32) @@ -748,7 +744,7 @@ if(WITH_OPTIM) if(WITH_AVX2) check_avx2_intrinsics() if(HAVE_AVX2_INTRIN) - add_definitions(-DX86_AVX2 -DX86_AVX2_ADLER32 -DX86_AVX_CHUNKSET) + add_definitions(-DX86_AVX2) set(AVX2_SRCS ${ARCHDIR}/slide_hash_avx2.c) add_feature_info(AVX2_SLIDEHASH 1 "Support AVX2 optimized slide_hash, using \"${AVX2FLAG}\"") list(APPEND AVX2_SRCS ${ARCHDIR}/chunkset_avx.c) @@ -766,7 +762,7 @@ if(WITH_OPTIM) if(WITH_AVX512) check_avx512_intrinsics() if(HAVE_AVX512_INTRIN) - add_definitions(-DX86_AVX512 -DX86_AVX512_ADLER32) + add_definitions(-DX86_AVX512) list(APPEND AVX512_SRCS ${ARCHDIR}/adler32_avx512.c) add_feature_info(AVX512_ADLER32 1 "Support AVX512-accelerated adler32, using \"${AVX512FLAG}\"") list(APPEND ZLIB_ARCH_SRCS ${AVX512_SRCS}) @@ -782,7 +778,7 @@ if(WITH_OPTIM) if(WITH_AVX512VNNI) check_avx512vnni_intrinsics() if(HAVE_AVX512VNNI_INTRIN) - add_definitions(-DX86_AVX512VNNI -DX86_AVX512VNNI_ADLER32) + add_definitions(-DX86_AVX512VNNI) add_feature_info(AVX512VNNI_ADLER32 1 "Support AVX512VNNI adler32, using \"${AVX512VNNIFLAG}\"") list(APPEND AVX512VNNI_SRCS ${ARCHDIR}/adler32_avx512_vnni.c) list(APPEND ZLIB_ARCH_SRCS ${AVX512VNNI_SRCS}) @@ -805,7 +801,7 @@ if(WITH_OPTIM) if(WITH_SSE42) check_sse42_intrinsics() if(HAVE_SSE42CRC_INLINE_ASM OR HAVE_SSE42CRC_INTRIN) - add_definitions(-DX86_SSE42_CRC_HASH -DX86_SSE42_ADLER32) + add_definitions(-DX86_SSE42) set(SSE42_SRCS ${ARCHDIR}/adler32_sse42.c ${ARCHDIR}/insert_string_sse42.c) add_feature_info(SSE42_CRC 1 "Support SSE4.2 optimized CRC hash generation, using \"${SSE42FLAG}\"") list(APPEND ZLIB_ARCH_SRCS ${SSE42_SRCS}) @@ -821,7 +817,7 @@ if(WITH_OPTIM) if(WITH_SSE2) check_sse2_intrinsics() 
if(HAVE_SSE2_INTRIN) - add_definitions(-DX86_SSE2 -DX86_SSE2_CHUNKSET -DX86_SSE2_SLIDEHASH) + add_definitions(-DX86_SSE2) set(SSE2_SRCS ${ARCHDIR}/chunkset_sse2.c ${ARCHDIR}/compare256_sse2.c ${ARCHDIR}/slide_hash_sse2.c) list(APPEND ZLIB_ARCH_SRCS ${SSE2_SRCS}) if(NOT ${ARCH} MATCHES "x86_64") @@ -838,7 +834,7 @@ if(WITH_OPTIM) if(WITH_SSSE3) check_ssse3_intrinsics() if(HAVE_SSSE3_INTRIN) - add_definitions(-DX86_SSSE3 -DX86_SSSE3_ADLER32) + add_definitions(-DX86_SSSE3) set(SSSE3_SRCS ${ARCHDIR}/adler32_ssse3.c) add_feature_info(SSSE3_ADLER32 1 "Support SSSE3-accelerated adler32, using \"${SSSE3FLAG}\"") list(APPEND ZLIB_ARCH_SRCS ${SSSE3_SRCS}) diff --git a/arch/arm/adler32_neon.c b/arch/arm/adler32_neon.c index 7f898d18..f1c43ff0 100644 --- a/arch/arm/adler32_neon.c +++ b/arch/arm/adler32_neon.c @@ -5,7 +5,7 @@ * Adam Stylinski * For conditions of distribution and use, see copyright notice in zlib.h */ -#ifdef ARM_NEON_ADLER32 +#ifdef ARM_NEON #include "neon_intrins.h" #include "../../zbuild.h" #include "../../adler32_p.h" diff --git a/arch/arm/chunkset_neon.c b/arch/arm/chunkset_neon.c index b119f212..668c0019 100644 --- a/arch/arm/chunkset_neon.c +++ b/arch/arm/chunkset_neon.c @@ -2,7 +2,7 @@ * For conditions of distribution and use, see copyright notice in zlib.h */ -#ifdef ARM_NEON_CHUNKSET +#ifdef ARM_NEON #include "neon_intrins.h" #include "../../zbuild.h" #include "../generic/chunk_permute_table.h" diff --git a/arch/arm/crc32_acle.c b/arch/arm/crc32_acle.c index 445c370a..a4e54d71 100644 --- a/arch/arm/crc32_acle.c +++ b/arch/arm/crc32_acle.c @@ -5,7 +5,7 @@ * */ -#ifdef ARM_ACLE_CRC_HASH +#ifdef ARM_ACLE #ifdef _MSC_VER # include #else diff --git a/arch/arm/insert_string_acle.c b/arch/arm/insert_string_acle.c index de990238..9ac3ccb4 100644 --- a/arch/arm/insert_string_acle.c +++ b/arch/arm/insert_string_acle.c @@ -5,7 +5,7 @@ * */ -#ifdef ARM_ACLE_CRC_HASH +#ifdef ARM_ACLE #ifndef _MSC_VER # include #endif diff --git a/arch/arm/neon_intrins.h 
b/arch/arm/neon_intrins.h index 06e310c9..d6b57f64 100644 --- a/arch/arm/neon_intrins.h +++ b/arch/arm/neon_intrins.h @@ -7,7 +7,7 @@ # include #endif -#if defined(ARM_NEON_ADLER32) && !defined(__aarch64__) && !defined(_M_ARM64) +#if defined(ARM_NEON) && !defined(__aarch64__) && !defined(_M_ARM64) /* Compatibility shim for the _high family of functions */ #define vmull_high_u8(a, b) vmull_u8(vget_high_u8(a), vget_high_u8(b)) #define vmlal_high_u8(a, b, c) vmlal_u8(a, vget_high_u8(b), vget_high_u8(c)) @@ -15,7 +15,7 @@ #define vaddw_high_u8(a, b) vaddw_u8(a, vget_high_u8(b)) #endif -#ifdef ARM_NEON_SLIDEHASH +#ifdef ARM_NEON #define vqsubq_u16_x4_x1(out, a, b) do { \ out.val[0] = vqsubq_u16(a.val[0], b); \ @@ -24,9 +24,8 @@ out.val[3] = vqsubq_u16(a.val[3], b); \ } while (0) -#endif -#if !defined(ARM_NEON_HASLD4) && (defined(ARM_NEON_ADLER32) || defined(ARM_NEON_SLIDEHASH)) +# ifndef ARM_NEON_HASLD4 static inline uint16x8x4_t vld1q_u16_x4(uint16_t const *a) { uint16x8x4_t ret = (uint16x8x4_t) {{ @@ -52,6 +51,7 @@ static inline void vst1q_u16_x4(uint16_t *p, uint16x8x4_t a) { vst1q_u16(p + 16, a.val[2]); vst1q_u16(p + 24, a.val[3]); } -#endif // HASLD4 check +# endif // HASLD4 check +#endif #endif // include guard ARM_NEON_INTRINS_H diff --git a/arch/arm/slide_hash_neon.c b/arch/arm/slide_hash_neon.c index 5bb4dc50..a96ca117 100644 --- a/arch/arm/slide_hash_neon.c +++ b/arch/arm/slide_hash_neon.c @@ -8,7 +8,7 @@ * For conditions of distribution and use, see copyright notice in zlib.h */ -#if defined(ARM_NEON_SLIDEHASH) +#ifdef ARM_NEON #include "neon_intrins.h" #include "../../zbuild.h" #include "../../deflate.h" diff --git a/arch/power/adler32_power8.c b/arch/power/adler32_power8.c index 737c6f2f..4aaea9f5 100644 --- a/arch/power/adler32_power8.c +++ b/arch/power/adler32_power8.c @@ -36,7 +36,7 @@ * https://www.ietf.org/rfc/rfc1950.txt */ -#ifdef POWER8_VSX_ADLER32 +#ifdef POWER8_VSX #include #include "zbuild.h" @@ -150,4 +150,4 @@ Z_INTERNAL uint32_t 
adler32_power8(uint32_t adler, const uint8_t *buf, size_t le return adler32_len_16(s1, buf, len, s2); } -#endif /* POWER8_VSX_ADLER32 */ +#endif /* POWER8_VSX */ diff --git a/arch/power/adler32_vmx.c b/arch/power/adler32_vmx.c index 47193286..ef1649b5 100644 --- a/arch/power/adler32_vmx.c +++ b/arch/power/adler32_vmx.c @@ -5,7 +5,7 @@ * For conditions of distribution and use, see copyright notice in zlib.h */ -#ifdef PPC_VMX_ADLER32 +#ifdef PPC_VMX #include #include "zbuild.h" #include "adler32_p.h" diff --git a/arch/power/chunkset_power8.c b/arch/power/chunkset_power8.c index 389be081..443aae92 100644 --- a/arch/power/chunkset_power8.c +++ b/arch/power/chunkset_power8.c @@ -2,7 +2,7 @@ * For conditions of distribution and use, see copyright notice in zlib.h */ -#ifdef POWER8_VSX_CHUNKSET +#ifdef POWER8_VSX #include #include "../../zbuild.h" diff --git a/arch/power/slide_hash_power8.c b/arch/power/slide_hash_power8.c index 5b078ec9..d01e0acd 100644 --- a/arch/power/slide_hash_power8.c +++ b/arch/power/slide_hash_power8.c @@ -4,9 +4,9 @@ * For conditions of distribution and use, see copyright notice in zlib.h */ -#ifdef POWER8_VSX_SLIDEHASH +#ifdef POWER8_VSX #define SLIDE_PPC slide_hash_power8 #include "slide_ppc_tpl.h" -#endif /* POWER8_VSX_SLIDEHASH */ +#endif /* POWER8_VSX */ diff --git a/arch/power/slide_hash_vmx.c b/arch/power/slide_hash_vmx.c index cf9bd7b7..5a87ef7d 100644 --- a/arch/power/slide_hash_vmx.c +++ b/arch/power/slide_hash_vmx.c @@ -2,9 +2,9 @@ * Copyright (C) 2017-2021 Mika T. 
Lindqvist * For conditions of distribution and use, see copyright notice in zlib.h */ -#ifdef PPC_VMX_SLIDEHASH +#ifdef PPC_VMX #define SLIDE_PPC slide_hash_vmx #include "slide_ppc_tpl.h" -#endif /* PPC_VMX_SLIDEHASH */ +#endif /* PPC_VMX */ diff --git a/arch/x86/adler32_avx2.c b/arch/x86/adler32_avx2.c index dcd1166f..797d299e 100644 --- a/arch/x86/adler32_avx2.c +++ b/arch/x86/adler32_avx2.c @@ -7,7 +7,7 @@ #include -#ifdef X86_AVX2_ADLER32 +#ifdef X86_AVX2 #include "adler32_avx2_tpl.h" diff --git a/arch/x86/adler32_avx2_p.h b/arch/x86/adler32_avx2_p.h index f7079bf3..f0f8a4a8 100644 --- a/arch/x86/adler32_avx2_p.h +++ b/arch/x86/adler32_avx2_p.h @@ -6,7 +6,7 @@ #ifndef ADLER32_AVX2_P_H_ #define ADLER32_AVX2_P_H_ -#if defined(X86_AVX2_ADLER32) || defined(X86_AVX512VNNI_ADLER32) +#if defined(X86_AVX2) || defined(X86_AVX512VNNI) /* 32 bit horizontal sum, adapted from Agner Fog's vector library. */ static inline uint32_t hsum256(__m256i x) { diff --git a/arch/x86/adler32_avx2_tpl.h b/arch/x86/adler32_avx2_tpl.h index 0b2e89be..a94f44b4 100644 --- a/arch/x86/adler32_avx2_tpl.h +++ b/arch/x86/adler32_avx2_tpl.h @@ -10,7 +10,7 @@ #include "../../fallback_builtins.h" #include "adler32_avx2_p.h" -#ifdef X86_SSE42_ADLER32 +#ifdef X86_SSE42 extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); extern uint32_t adler32_ssse3(uint32_t adler, const uint8_t *src, size_t len); diff --git a/arch/x86/adler32_avx512.c b/arch/x86/adler32_avx512.c index c0bf0721..e6ebb05d 100644 --- a/arch/x86/adler32_avx512.c +++ b/arch/x86/adler32_avx512.c @@ -6,7 +6,7 @@ * For conditions of distribution and use, see copyright notice in zlib.h */ -#ifdef X86_AVX512_ADLER32 +#ifdef X86_AVX512 #include "adler32_avx512_tpl.h" diff --git a/arch/x86/adler32_avx512_tpl.h b/arch/x86/adler32_avx512_tpl.h index 6ed39b45..7546afef 100644 --- a/arch/x86/adler32_avx512_tpl.h +++ b/arch/x86/adler32_avx512_tpl.h @@ -11,7 +11,7 @@ #include #include 
"adler32_avx512_p.h" -#ifdef X86_AVX512_ADLER32 +#ifdef X86_AVX512 #ifdef COPY Z_INTERNAL uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { @@ -35,9 +35,9 @@ rem_peel: _mm512_mask_storeu_epi8(dst, storemask, copy_vec); #endif -#ifdef X86_AVX2_ADLER32 +#ifdef X86_AVX2 return adler32_avx2(adler, src, len); -#elif defined(X86_SSSE3_ADLER32) +#elif defined(X86_SSSE3) return adler32_ssse3(adler, src, len); #else return adler32_len_16(adler0, src, len, adler1); diff --git a/arch/x86/adler32_avx512_vnni.c b/arch/x86/adler32_avx512_vnni.c index 42a16606..8dcc93d0 100644 --- a/arch/x86/adler32_avx512_vnni.c +++ b/arch/x86/adler32_avx512_vnni.c @@ -7,7 +7,7 @@ * For conditions of distribution and use, see copyright notice in zlib.h */ -#ifdef X86_AVX512VNNI_ADLER32 +#ifdef X86_AVX512VNNI #include "../../zbuild.h" #include "../../adler32_p.h" @@ -28,16 +28,16 @@ Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *src, size rem_peel: if (len < 32) -#if defined(X86_SSSE3_ADLER32) +#if defined(X86_SSSE3) return adler32_ssse3(adler, src, len); #else return adler32_len_16(adler0, src, len, adler1); #endif if (len < 64) -#ifdef X86_AVX2_ADLER32 +#ifdef X86_AVX2 return adler32_avx2(adler, src, len); -#elif defined(X86_SSE3_ADLER32) +#elif defined(X86_SSE3) return adler32_ssse3(adler, src, len); #else return adler32_len_16(adler0, src, len, adler1); @@ -135,7 +135,7 @@ rem_peel_copy: __m256i copy_vec = _mm256_maskz_loadu_epi8(storemask, src); _mm256_mask_storeu_epi8(dst, storemask, copy_vec); -#if defined(X86_SSSE3_ADLER32) +#if defined(X86_SSSE3) return adler32_ssse3(adler, src, len); #else return adler32_len_16(adler0, src, len, adler1); diff --git a/arch/x86/adler32_sse42.c b/arch/x86/adler32_sse42.c index ec051340..257a3609 100644 --- a/arch/x86/adler32_sse42.c +++ b/arch/x86/adler32_sse42.c @@ -12,7 +12,7 @@ #include "adler32_ssse3_p.h" #include -#ifdef X86_SSE42_ADLER32 +#ifdef X86_SSE42 Z_INTERNAL uint32_t 
adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { uint32_t adler0, adler1; diff --git a/arch/x86/adler32_ssse3.c b/arch/x86/adler32_ssse3.c index 1f4abba5..99ce7958 100644 --- a/arch/x86/adler32_ssse3.c +++ b/arch/x86/adler32_ssse3.c @@ -10,7 +10,7 @@ #include "../../adler32_p.h" #include "adler32_ssse3_p.h" -#ifdef X86_SSSE3_ADLER32 +#ifdef X86_SSSE3 #include diff --git a/arch/x86/adler32_ssse3_p.h b/arch/x86/adler32_ssse3_p.h index 0b7ddcf9..d7ec3fe0 100644 --- a/arch/x86/adler32_ssse3_p.h +++ b/arch/x86/adler32_ssse3_p.h @@ -6,7 +6,7 @@ #ifndef ADLER32_SSSE3_P_H_ #define ADLER32_SSSE3_P_H_ -#ifdef X86_SSSE3_ADLER32 +#ifdef X86_SSSE3 #include #include diff --git a/arch/x86/chunkset_avx.c b/arch/x86/chunkset_avx.c index e128e8f7..c2df2322 100644 --- a/arch/x86/chunkset_avx.c +++ b/arch/x86/chunkset_avx.c @@ -3,7 +3,7 @@ */ #include "zbuild.h" -#ifdef X86_AVX_CHUNKSET +#ifdef X86_AVX2 #include #include "../generic/chunk_permute_table.h" diff --git a/arch/x86/insert_string_sse42.c b/arch/x86/insert_string_sse42.c index 6fe4c81e..2668f0ea 100644 --- a/arch/x86/insert_string_sse42.c +++ b/arch/x86/insert_string_sse42.c @@ -45,6 +45,6 @@ #define INSERT_STRING insert_string_sse4 #define QUICK_INSERT_STRING quick_insert_string_sse4 -#ifdef X86_SSE42_CRC_HASH +#ifdef X86_SSE42 # include "../../insert_string_tpl.h" #endif diff --git a/configure b/configure index c0f9524b..fdb5b69d 100755 --- a/configure +++ b/configure @@ -1545,8 +1545,8 @@ case "${ARCH}" in check_avx2_intrinsics if test ${HAVE_AVX2_INTRIN} -eq 1; then - CFLAGS="${CFLAGS} -DX86_AVX2 -DX86_AVX2_ADLER32 -DX86_AVX_CHUNKSET" - SFLAGS="${SFLAGS} -DX86_AVX2 -DX86_AVX2_ADLER32 -DX86_AVX_CHUNKSET" + CFLAGS="${CFLAGS} -DX86_AVX2" + SFLAGS="${SFLAGS} -DX86_AVX2" ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} slide_hash_avx2.o chunkset_avx.o compare256_avx2.o adler32_avx2.o" ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} slide_hash_avx2.lo chunkset_avx.lo compare256_avx2.lo adler32_avx2.lo" fi @@ 
-1554,8 +1554,8 @@ case "${ARCH}" in check_avx512_intrinsics if test ${HAVE_AVX512_INTRIN} -eq 1; then - CFLAGS="${CFLAGS} -DX86_AVX512 -DX86_AVX512_ADLER32" - SFLAGS="${SFLAGS} -DX86_AVX512 -DX86_AVX512_ADLER32" + CFLAGS="${CFLAGS} -DX86_AVX512" + SFLAGS="${SFLAGS} -DX86_AVX512" ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_avx512.o" ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_avx512.lo" @@ -1570,8 +1570,8 @@ case "${ARCH}" in check_avx512vnni_intrinsics if test ${HAVE_AVX512VNNI_INTRIN} -eq 1; then - CFLAGS="${CFLAGS} -DX86_AVX512VNNI -DX86_AVX512VNNI_ADLER32" - SFLAGS="${SFLAGS} -DX86_AVX512VNNI -DX86_AVX512VNNI_ADLER32" + CFLAGS="${CFLAGS} -DX86_AVX512VNNI" + SFLAGS="${SFLAGS} -DX86_AVX512VNNI" ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_avx512_vnni.o" ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_avx512_vnni.lo" fi @@ -1589,8 +1589,8 @@ case "${ARCH}" in check_sse42_intrinsics if test ${HAVE_SSE42CRC_INTRIN} -eq 1 || test ${HAVE_SSE42CRC_INLINE_ASM} -eq 1; then - CFLAGS="${CFLAGS} -DX86_SSE42_CRC_HASH -DX86_SSE42_ADLER32" - SFLAGS="${SFLAGS} -DX86_SSE42_CRC_HASH -DX86_SSE42_ADLER32" + CFLAGS="${CFLAGS} -DX86_SSE42" + SFLAGS="${SFLAGS} -DX86_SSE42" if test ${HAVE_SSE42CRC_INTRIN} -eq 1; then CFLAGS="${CFLAGS} -DX86_SSE42_CRC_INTRIN" @@ -1604,8 +1604,8 @@ case "${ARCH}" in check_sse2_intrinsics if test ${HAVE_SSE2_INTRIN} -eq 1; then - CFLAGS="${CFLAGS} -DX86_SSE2 -DX86_SSE2_CHUNKSET" - SFLAGS="${SFLAGS} -DX86_SSE2 -DX86_SSE2_CHUNKSET" + CFLAGS="${CFLAGS} -DX86_SSE2" + SFLAGS="${SFLAGS} -DX86_SSE2" ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} chunkset_sse2.o compare256_sse2.o slide_hash_sse2.o" ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} chunkset_sse2.lo compare256_sse2.lo slide_hash_sse2.lo" @@ -1618,8 +1618,8 @@ case "${ARCH}" in check_ssse3_intrinsics if test ${HAVE_SSSE3_INTRIN} -eq 1; then - CFLAGS="${CFLAGS} -DX86_SSSE3 -DX86_SSSE3_ADLER32" - SFLAGS="${SFLAGS} -DX86_SSSE3 -DX86_SSSE3_ADLER32" + CFLAGS="${CFLAGS} -DX86_SSSE3" + SFLAGS="${SFLAGS} -DX86_SSSE3" 
ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_ssse3.o" ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_ssse3.lo" fi @@ -1785,9 +1785,6 @@ EOF SFLAGS="${SFLAGS} -DARM_NEON_HASLD4" fi - CFLAGS="${CFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH" - SFLAGS="${SFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH" - ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o chunkset_neon.o compare256_neon.o slide_hash_neon.o" ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo chunkset_neon.lo compare256_neon.lo slide_hash_neon.lo" fi @@ -1812,9 +1809,6 @@ EOF SFLAGS="${SFLAGS} -DARM_NEON_HASLD4" fi - CFLAGS="${CFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH" - SFLAGS="${SFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH" - ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o chunkset_neon.o compare256_neon.o slide_hash_neon.o" ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo chunkset_neon.lo compare256_neon.lo slide_hash_neon.lo" fi @@ -1825,8 +1819,8 @@ EOF if test $without_optimizations -eq 0; then if test $ACLE_AVAILABLE -eq 1; then - CFLAGS="${CFLAGS} -DARM_ACLE_CRC_HASH" - SFLAGS="${SFLAGS} -DARM_ACLE_CRC_HASH" + CFLAGS="${CFLAGS} -DARM_ACLE" + SFLAGS="${SFLAGS} -DARM_ACLE" ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} crc32_acle.o insert_string_acle.o" ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} crc32_acle.lo insert_string_acle.lo" @@ -1845,9 +1839,6 @@ EOF SFLAGS="${SFLAGS} -DARM_NEON_HASLD4" fi - CFLAGS="${CFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH" - SFLAGS="${SFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH" - ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o chunkset_neon.o compare256_neon.o slide_hash_neon.o" ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo chunkset_neon.lo compare256_neon.lo slide_hash_neon.lo" fi @@ -1899,8 +1890,8 @@ EOF if test $native -eq 0; then ARCH="${ARCH}+crc" fi - CFLAGS="${CFLAGS} -DARM_ACLE_CRC_HASH" - SFLAGS="${SFLAGS} 
-DARM_ACLE_CRC_HASH" + CFLAGS="${CFLAGS} -DARM_ACLE" + SFLAGS="${SFLAGS} -DARM_ACLE" ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} crc32_acle.o insert_string_acle.o" ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} crc32_acle.lo insert_string_acle.lo" fi @@ -1909,8 +1900,8 @@ EOF if test $native -eq 0; then ARCH="${ARCH}+simd" fi - CFLAGS="${CFLAGS} -DARM_NEON -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH" - SFLAGS="${SFLAGS} -DARM_NEON -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH" + CFLAGS="${CFLAGS} -DARM_NEON" + SFLAGS="${SFLAGS} -DARM_NEON" ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o chunkset_neon.o compare256_neon.o slide_hash_neon.o" ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo chunkset_neon.lo compare256_neon.lo slide_hash_neon.lo" fi @@ -1949,15 +1940,15 @@ EOF ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} power_features.lo" fi if test $HAVE_VMX -eq 1 -a $HAVE_ALTIVEC_INTRIN -eq 1; then - CFLAGS="${CFLAGS} -DPPC_VMX_ADLER32 -DPPC_VMX_SLIDEHASH" - SFLAGS="${SFLAGS} -DPPC_VMX_ADLER32 -DPPC_VMX_SLIDEHASH" + CFLAGS="${CFLAGS} -DPPC_VMX" + SFLAGS="${SFLAGS} -DPPC_VMX" ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_vmx.o slide_hash_vmx.o" ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_vmx.lo slide_hash_vmx.lo" fi if test $HAVE_POWER8_INTRIN -eq 1; then - CFLAGS="${CFLAGS} -DPOWER8 -DPOWER_FEATURES -DPOWER8_VSX_ADLER32 -DPOWER8_VSX_CHUNKSET -DPOWER8_VSX_SLIDEHASH" - SFLAGS="${SFLAGS} -DPOWER8 -DPOWER_FEATURES -DPOWER8_VSX_ADLER32 -DPOWER8_VSX_CHUNKSET -DPOWER8_VSX_SLIDEHASH" + CFLAGS="${CFLAGS} -DPOWER8_VSX -DPOWER_FEATURES" + SFLAGS="${SFLAGS} -DPOWER8_VSX -DPOWER_FEATURES" ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_power8.o chunkset_power8.o slide_hash_power8.o" ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_power8.lo chunkset_power8.lo slide_hash_power8.lo" diff --git a/cpu_features.h b/cpu_features.h index f71372dd..c098ee2d 100644 --- a/cpu_features.h +++ b/cpu_features.h @@ -26,39 +26,39 @@ extern void cpu_check_features(void); typedef 
uint32_t (*adler32_func)(uint32_t adler, const uint8_t *buf, size_t len); extern uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len); -#ifdef ARM_NEON_ADLER32 +#ifdef ARM_NEON extern uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, size_t len); #endif -#ifdef PPC_VMX_ADLER32 +#ifdef PPC_VMX extern uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len); #endif -#ifdef X86_SSSE3_ADLER32 +#ifdef X86_SSSE3 extern uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len); #endif -#ifdef X86_AVX2_ADLER32 +#ifdef X86_AVX2 extern uint32_t adler32_avx2(uint32_t adler, const uint8_t *buf, size_t len); #endif -#ifdef X86_AVX512_ADLER32 +#ifdef X86_AVX512 extern uint32_t adler32_avx512(uint32_t adler, const uint8_t *buf, size_t len); #endif -#ifdef X86_AVX512VNNI_ADLER32 +#ifdef X86_AVX512VNNI extern uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *buf, size_t len); #endif -#ifdef POWER8_VSX_ADLER32 +#ifdef POWER8_VSX extern uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, size_t len); #endif /* adler32 folding */ -#ifdef X86_SSE42_ADLER32 +#ifdef X86_SSE42 extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); #endif -#ifdef X86_AVX2_ADLER32 +#ifdef X86_AVX2 extern uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); #endif -#ifdef X86_AVX512_ADLER32 +#ifdef X86_AVX512 extern uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); #endif -#ifdef X86_AVX512VNNI_ADLER32 +#ifdef X86_AVX512VNNI extern uint32_t adler32_fold_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); #endif @@ -74,22 +74,22 @@ extern uint32_t crc32_pclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len); /* memory chunking */ extern uint32_t chunksize_c(void); extern uint8_t* chunkmemset_safe_c(uint8_t *out, unsigned dist, unsigned len, unsigned left); -#ifdef 
X86_SSE2_CHUNKSET +#ifdef X86_SSE2 extern uint32_t chunksize_sse2(void); extern uint8_t* chunkmemset_safe_sse2(uint8_t *out, unsigned dist, unsigned len, unsigned left); #endif #ifdef X86_SSE41 extern uint8_t* chunkmemset_safe_sse41(uint8_t *out, unsigned dist, unsigned len, unsigned left); #endif -#ifdef X86_AVX_CHUNKSET +#ifdef X86_AVX2 extern uint32_t chunksize_avx(void); extern uint8_t* chunkmemset_safe_avx(uint8_t *out, unsigned dist, unsigned len, unsigned left); #endif -#ifdef ARM_NEON_CHUNKSET +#ifdef ARM_NEON extern uint32_t chunksize_neon(void); extern uint8_t* chunkmemset_safe_neon(uint8_t *out, unsigned dist, unsigned len, unsigned left); #endif -#ifdef POWER8_VSX_CHUNKSET +#ifdef POWER8_VSX extern uint32_t chunksize_power8(void); extern uint8_t* chunkmemset_safe_power8(uint8_t *out, unsigned dist, unsigned len, unsigned left); #endif @@ -102,19 +102,19 @@ typedef struct zng_stream_s zng_stream; /* inflate fast loop */ extern void inflate_fast_c(PREFIX3(stream) *strm, uint32_t start); -#ifdef X86_SSE2_CHUNKSET +#ifdef X86_SSE2 extern void inflate_fast_sse2(PREFIX3(stream) *strm, uint32_t start); #endif #ifdef X86_SSE41 extern void inflate_fast_sse41(PREFIX3(stream) *strm, uint32_t start); #endif -#ifdef X86_AVX_CHUNKSET +#ifdef X86_AVX2 extern void inflate_fast_avx(PREFIX3(stream) *strm, uint32_t start); #endif -#ifdef ARM_NEON_CHUNKSET +#ifdef ARM_NEON extern void inflate_fast_neon(PREFIX3(stream) *strm, uint32_t start); #endif -#ifdef POWER8_VSX_CHUNKSET +#ifdef POWER8_VSX extern void inflate_fast_power8(PREFIX3(stream) *strm, uint32_t start); #endif @@ -122,9 +122,9 @@ extern void inflate_fast_power8(PREFIX3(stream) *strm, uint32_t start); typedef uint32_t (*crc32_func)(uint32_t crc32, const uint8_t *buf, size_t len); extern uint32_t PREFIX(crc32_braid)(uint32_t crc, const uint8_t *buf, size_t len); -#ifdef ARM_ACLE_CRC_HASH +#ifdef ARM_ACLE extern uint32_t crc32_acle(uint32_t crc, const uint8_t *buf, size_t len); -#elif defined(POWER8_VSX_CRC32) 
+#elif defined(POWER8_VSX) extern uint32_t crc32_power8(uint32_t crc, const uint8_t *buf, size_t len); #elif defined(S390_CRC32_VX) extern uint32_t PREFIX(s390_crc32_vx)(uint32_t crc, const uint8_t *buf, size_t len); @@ -159,9 +159,9 @@ extern uint32_t compare256_power9(const uint8_t *src0, const uint8_t *src1); #ifdef DEFLATE_H_ /* insert_string */ extern void insert_string_c(deflate_state *const s, const uint32_t str, uint32_t count); -#ifdef X86_SSE42_CRC_HASH +#ifdef X86_SSE42 extern void insert_string_sse4(deflate_state *const s, const uint32_t str, uint32_t count); -#elif defined(ARM_ACLE_CRC_HASH) +#elif defined(ARM_ACLE) extern void insert_string_acle(deflate_state *const s, const uint32_t str, uint32_t count); #endif @@ -213,9 +213,9 @@ extern uint32_t longest_match_slow_power9(deflate_state *const s, Pos cur_match) /* quick_insert_string */ extern Pos quick_insert_string_c(deflate_state *const s, const uint32_t str); -#ifdef X86_SSE42_CRC_HASH +#ifdef X86_SSE42 extern Pos quick_insert_string_sse4(deflate_state *const s, const uint32_t str); -#elif defined(ARM_ACLE_CRC_HASH) +#elif defined(ARM_ACLE) extern Pos quick_insert_string_acle(deflate_state *const s, const uint32_t str); #endif @@ -224,13 +224,13 @@ typedef void (*slide_hash_func)(deflate_state *s); #ifdef X86_SSE2 extern void slide_hash_sse2(deflate_state *s); -#elif defined(ARM_NEON_SLIDEHASH) +#elif defined(ARM_NEON) extern void slide_hash_neon(deflate_state *s); #endif -#if defined(PPC_VMX_SLIDEHASH) +#if defined(PPC_VMX) extern void slide_hash_vmx(deflate_state *s); #endif -#if defined(POWER8_VSX_SLIDEHASH) +#if defined(POWER8_VSX) extern void slide_hash_power8(deflate_state *s); #endif #ifdef X86_AVX2 @@ -239,9 +239,9 @@ extern void slide_hash_avx2(deflate_state *s); /* update_hash */ extern uint32_t update_hash_c(deflate_state *const s, uint32_t h, uint32_t val); -#ifdef X86_SSE42_CRC_HASH +#ifdef X86_SSE42 extern uint32_t update_hash_sse4(deflate_state *const s, uint32_t h, uint32_t val); 
-#elif defined(ARM_ACLE_CRC_HASH) +#elif defined(ARM_ACLE) extern uint32_t update_hash_acle(deflate_state *const s, uint32_t h, uint32_t val); #endif #endif diff --git a/functable.c b/functable.c index b64b4bd6..a02aae77 100644 --- a/functable.c +++ b/functable.c @@ -56,28 +56,24 @@ static void init_functable(void) { // Select arch-optimized functions // X86 - SSE2 -#if defined(X86_SSE2) || defined(X86_SSE2_CHUNKSET) +#ifdef X86_SSE2 # if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2) if (x86_cpu_has_sse2) # endif { -# ifdef X86_SSE2 ft.slide_hash = &slide_hash_sse2; -# ifdef HAVE_BUILTIN_CTZ - ft.longest_match = &longest_match_sse2; - ft.longest_match_slow = &longest_match_slow_sse2; - ft.compare256 = &compare256_sse2; -# endif -# endif -# ifdef X86_SSE2_CHUNKSET ft.chunksize = &chunksize_sse2; ft.chunkmemset_safe = &chunkmemset_safe_sse2; ft.inflate_fast = &inflate_fast_sse2; +# ifdef HAVE_BUILTIN_CTZ + ft.longest_match = &longest_match_sse2; + ft.longest_match_slow = &longest_match_slow_sse2; + ft.compare256 = &compare256_sse2; # endif } #endif // X86 - SSSE3 -#ifdef X86_SSSE3_ADLER32 +#ifdef X86_SSSE3 if (x86_cpu_has_ssse3) ft.adler32 = &adler32_ssse3; #endif @@ -88,12 +84,9 @@ static void init_functable(void) { ft.inflate_fast = &inflate_fast_sse41; } #endif -#ifdef X86_SSE42_ADLER32 - if (x86_cpu_has_sse42) - ft.adler32_fold_copy = &adler32_fold_copy_sse42; -#endif -#ifdef X86_SSE42_CRC_HASH +#ifdef X86_SSE42 if (x86_cpu_has_sse42) { + ft.adler32_fold_copy = &adler32_fold_copy_sse42; ft.update_hash = &update_hash_sse4; ft.insert_string = &insert_string_sse4; ft.quick_insert_string = &quick_insert_string_sse4; @@ -110,16 +103,14 @@ static void init_functable(void) { } #endif // X86 - AVX -#ifdef X86_AVX_CHUNKSET +#ifdef X86_AVX2 if (x86_cpu_has_avx2) { ft.chunksize = &chunksize_avx; ft.chunkmemset_safe = &chunkmemset_safe_avx; ft.inflate_fast = &inflate_fast_avx; - } -#endif -#ifdef X86_AVX2 - if (x86_cpu_has_avx2) { ft.slide_hash = 
&slide_hash_avx2; + ft.adler32 = &adler32_avx2; + ft.adler32_fold_copy = &adler32_fold_copy_avx2; # ifdef HAVE_BUILTIN_CTZ ft.longest_match = &longest_match_avx2; ft.longest_match_slow = &longest_match_slow_avx2; @@ -127,19 +118,13 @@ static void init_functable(void) { # endif } #endif -#ifdef X86_AVX2_ADLER32 - if (x86_cpu_has_avx2) { - ft.adler32 = &adler32_avx2; - ft.adler32_fold_copy = &adler32_fold_copy_avx2; - } -#endif -#ifdef X86_AVX512_ADLER32 +#ifdef X86_AVX512 if (x86_cpu_has_avx512) { ft.adler32 = &adler32_avx512; ft.adler32_fold_copy = &adler32_fold_copy_avx512; } #endif -#ifdef X86_AVX512VNNI_ADLER32 +#ifdef X86_AVX512VNNI if (x86_cpu_has_avx512vnni) { ft.adler32 = &adler32_avx512_vnni; ft.adler32_fold_copy = &adler32_fold_copy_avx512_vnni; @@ -148,34 +133,25 @@ static void init_functable(void) { // ARM - NEON -#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL) - if (arm_cpu_has_neon) { - ft.compare256 = &compare256_neon; - ft.longest_match = &longest_match_neon; - ft.longest_match_slow = &longest_match_slow_neon; - } -#endif -#ifdef ARM_NEON_ADLER32 +#ifdef ARM_NEON # ifndef ARM_NOCHECK_NEON if (arm_cpu_has_neon) # endif + { ft.adler32 = &adler32_neon; -#endif -#ifdef ARM_NEON_SLIDEHASH -# ifndef ARM_NOCHECK_NEON - if (arm_cpu_has_neon) -# endif ft.slide_hash = &slide_hash_neon; -#endif -#ifdef ARM_NEON_CHUNKSET - if (arm_cpu_has_neon) { ft.chunksize = &chunksize_neon; ft.chunkmemset_safe = &chunkmemset_safe_neon; ft.inflate_fast = &inflate_fast_neon; +# ifdef HAVE_BUILTIN_CTZLL + ft.compare256 = &compare256_neon; + ft.longest_match = &longest_match_neon; + ft.longest_match_slow = &longest_match_slow_neon; +# endif } #endif // ARM - ACLE -#ifdef ARM_ACLE_CRC_HASH +#ifdef ARM_ACLE if (arm_cpu_has_crc32) { ft.crc32 = &crc32_acle; ft.update_hash = &update_hash_acle; @@ -184,34 +160,27 @@ static void init_functable(void) { } #endif + // Power - VMX -#ifdef PPC_VMX_SLIDEHASH - if (power_cpu_has_altivec) - ft.slide_hash = &slide_hash_vmx; -#endif -#ifdef 
PPC_VMX_ADLER32 - if (power_cpu_has_altivec) +#ifdef PPC_VMX + if (power_cpu_has_altivec) { ft.adler32 = &adler32_vmx; + ft.slide_hash = &slide_hash_vmx; + } #endif // Power8 - VSX -#ifdef POWER8_VSX_SLIDEHASH - if (power_cpu_has_arch_2_07) - ft.slide_hash = &slide_hash_power8; -#endif -#ifdef POWER8_VSX_ADLER32 - if (power_cpu_has_arch_2_07) +#ifdef POWER8_VSX + if (power_cpu_has_arch_2_07) { ft.adler32 = &adler32_power8; + ft.chunkmemset_safe = &chunkmemset_safe_power8; + ft.chunksize = &chunksize_power8; + ft.inflate_fast = &inflate_fast_power8; + ft.slide_hash = &slide_hash_power8; + } #endif #ifdef POWER8_VSX_CRC32 if (power_cpu_has_arch_2_07) ft.crc32 = &crc32_power8; -#endif -#ifdef POWER8_VSX_CHUNKSET - if (power_cpu_has_arch_2_07) { - ft.chunksize = &chunksize_power8; - ft.chunkmemset_safe = &chunkmemset_safe_power8; - ft.inflate_fast = &inflate_fast_power8; - } #endif // Power9 #ifdef POWER9 @@ -222,6 +191,7 @@ static void init_functable(void) { } #endif + // S390 #ifdef S390_CRC32_VX if (PREFIX(s390_cpu_has_vx)) diff --git a/test/benchmarks/benchmark_adler32.cc b/test/benchmarks/benchmark_adler32.cc index b94912ac..19691376 100644 --- a/test/benchmarks/benchmark_adler32.cc +++ b/test/benchmarks/benchmark_adler32.cc @@ -64,26 +64,26 @@ public: BENCHMARK_ADLER32(c, adler32_c, 1); -#ifdef ARM_NEON_ADLER32 +#ifdef ARM_NEON BENCHMARK_ADLER32(neon, adler32_neon, arm_cpu_has_neon); #endif -#ifdef PPC_VMX_ADLER32 +#ifdef PPC_VMX BENCHMARK_ADLER32(vmx, adler32_vmx, power_cpu_has_altivec); #endif -#ifdef POWER8_VSX_ADLER32 +#ifdef POWER8_VSX BENCHMARK_ADLER32(power8, adler32_power8, power_cpu_has_arch_2_07); #endif -#ifdef X86_SSSE3_ADLER32 +#ifdef X86_SSSE3 BENCHMARK_ADLER32(ssse3, adler32_ssse3, x86_cpu_has_ssse3); #endif -#ifdef X86_AVX2_ADLER32 +#ifdef X86_AVX2 BENCHMARK_ADLER32(avx2, adler32_avx2, x86_cpu_has_avx2); #endif -#ifdef X86_AVX512_ADLER32 +#ifdef X86_AVX512 BENCHMARK_ADLER32(avx512, adler32_avx512, x86_cpu_has_avx512); #endif -#ifdef 
X86_AVX512VNNI_ADLER32 +#ifdef X86_AVX512VNNI BENCHMARK_ADLER32(avx512_vnni, adler32_avx512_vnni, x86_cpu_has_avx512vnni); #endif diff --git a/test/benchmarks/benchmark_adler32_copy.cc b/test/benchmarks/benchmark_adler32_copy.cc index 62998d41..d508a004 100644 --- a/test/benchmarks/benchmark_adler32_copy.cc +++ b/test/benchmarks/benchmark_adler32_copy.cc @@ -85,34 +85,34 @@ public: BENCHMARK_ADLER32_BASELINE_COPY(c, adler32_c, 1); -#ifdef ARM_NEON_ADLER32 +#ifdef ARM_NEON /* If we inline this copy for neon, the function would go here */ //BENCHMARK_ADLER32_COPY(neon, adler32_neon, arm_cpu_has_neon); BENCHMARK_ADLER32_BASELINE_COPY(neon_copy_baseline, adler32_neon, arm_cpu_has_neon); #endif -#ifdef PPC_VMX_ADLER32 +#ifdef PPC_VMX //BENCHMARK_ADLER32_COPY(vmx_inline_copy, adler32_fold_copy_vmx, power_cpu_has_altivec); BENCHMARK_ADLER32_BASELINE_COPY(vmx_copy_baseline, adler32_vmx, power_cpu_has_altivec); #endif -#ifdef POWER8_VSX_ADLER32 +#ifdef POWER8_VSX //BENCHMARK_ADLER32_COPY(power8_inline_copy, adler32_fold_copy_power8, power_cpu_has_arch_2_07); BENCHMARK_ADLER32_BASELINE_COPY(power8, adler32_power8, power_cpu_has_arch_2_07); #endif -#ifdef X86_SSE42_ADLER32 +#ifdef X86_SSE42 BENCHMARK_ADLER32_BASELINE_COPY(sse42_baseline, adler32_ssse3, x86_cpu_has_ssse3); BENCHMARK_ADLER32_COPY(sse42, adler32_fold_copy_sse42, x86_cpu_has_sse42); #endif -#ifdef X86_AVX2_ADLER32 +#ifdef X86_AVX2 BENCHMARK_ADLER32_BASELINE_COPY(avx2_baseline, adler32_avx2, x86_cpu_has_avx2); BENCHMARK_ADLER32_COPY(avx2, adler32_fold_copy_avx2, x86_cpu_has_avx2); #endif -#ifdef X86_AVX512_ADLER32 +#ifdef X86_AVX512 BENCHMARK_ADLER32_BASELINE_COPY(avx512_baseline, adler32_avx512, x86_cpu_has_avx512); BENCHMARK_ADLER32_COPY(avx512, adler32_fold_copy_avx512, x86_cpu_has_avx512); #endif -#ifdef X86_AVX512VNNI_ADLER32 +#ifdef X86_AVX512VNNI BENCHMARK_ADLER32_BASELINE_COPY(avx512_vnni_baseline, adler32_avx512_vnni, x86_cpu_has_avx512vnni); BENCHMARK_ADLER32_COPY(avx512_vnni, 
adler32_fold_copy_avx512_vnni, x86_cpu_has_avx512vnni); #endif diff --git a/test/benchmarks/benchmark_crc32.cc b/test/benchmarks/benchmark_crc32.cc index f17ebf6c..b5ecda51 100644 --- a/test/benchmarks/benchmark_crc32.cc +++ b/test/benchmarks/benchmark_crc32.cc @@ -57,9 +57,9 @@ public: BENCHMARK_CRC32(braid, PREFIX(crc32_braid), 1); -#ifdef ARM_ACLE_CRC_HASH +#ifdef ARM_ACLE BENCHMARK_CRC32(acle, crc32_acle, arm_cpu_has_crc32); -#elif defined(POWER8_VSX_CRC32) +#elif defined(POWER8_VSX) BENCHMARK_CRC32(power8, crc32_power8, power_cpu_has_arch_2_07); #elif defined(S390_CRC32_VX) BENCHMARK_CRC32(vx, PREFIX(s390_crc32_vx), PREFIX(s390_cpu_has_vx)); diff --git a/test/benchmarks/benchmark_slidehash.cc b/test/benchmarks/benchmark_slidehash.cc index 4ec87b6d..5ffa7039 100644 --- a/test/benchmarks/benchmark_slidehash.cc +++ b/test/benchmarks/benchmark_slidehash.cc @@ -68,13 +68,13 @@ public: BENCHMARK_SLIDEHASH(c, slide_hash_c, 1); -#ifdef ARM_NEON_SLIDEHASH +#ifdef ARM_NEON BENCHMARK_SLIDEHASH(neon, slide_hash_neon, arm_cpu_has_neon); #endif -#ifdef POWER8_VSX_SLIDEHASH +#ifdef POWER8_VSX BENCHMARK_SLIDEHASH(power8, slide_hash_power8, power_cpu_has_arch_2_07); #endif -#ifdef PPC_VMX_SLIDEHASH +#ifdef PPC_VMX BENCHMARK_SLIDEHASH(vmx, slide_hash_vmx, power_cpu_has_altivec); #endif diff --git a/test/test_adler32.cc b/test/test_adler32.cc index fa113da5..7f88f255 100644 --- a/test/test_adler32.cc +++ b/test/test_adler32.cc @@ -364,26 +364,23 @@ INSTANTIATE_TEST_SUITE_P(adler32, adler32_variant, testing::ValuesIn(tests)); TEST_ADLER32(c, adler32_c, 1) -#ifdef ARM_NEON_ADLER32 +#ifdef ARM_NEON TEST_ADLER32(neon, adler32_neon, arm_cpu_has_neon) -#elif defined(POWER8_VSX_ADLER32) +#elif defined(POWER8_VSX) TEST_ADLER32(power8, adler32_power8, power_cpu_has_arch_2_07) -#elif defined(PPC_VMX_ADLER32) +#elif defined(PPC_VMX) TEST_ADLER32(vmx, adler32_vmx, power_cpu_has_altivec) #endif -#ifdef X86_SSSE3_ADLER32 +#ifdef X86_SSSE3 TEST_ADLER32(ssse3, adler32_ssse3, x86_cpu_has_ssse3) 
#endif -#ifdef X86_SSE41_ADLER32 -TEST_ADLER32(sse41, adler32_sse41, x86_cpu_has_sse41) -#endif -#ifdef X86_AVX2_ADLER32 +#ifdef X86_AVX2 TEST_ADLER32(avx2, adler32_avx2, x86_cpu_has_avx2) #endif -#ifdef X86_AVX512_ADLER32 +#ifdef X86_AVX512 TEST_ADLER32(avx512, adler32_avx512, x86_cpu_has_avx512) #endif -#ifdef X86_AVX512VNNI_ADLER32 +#ifdef X86_AVX512VNNI TEST_ADLER32(avx512_vnni, adler32_avx512_vnni, x86_cpu_has_avx512vnni) #endif diff --git a/test/test_crc32.cc b/test/test_crc32.cc index 6b6af4bc..c46d0eb8 100644 --- a/test/test_crc32.cc +++ b/test/test_crc32.cc @@ -208,7 +208,7 @@ INSTANTIATE_TEST_SUITE_P(crc32, crc32_variant, testing::ValuesIn(tests)); TEST_CRC32(braid, PREFIX(crc32_braid), 1) -#ifdef ARM_ACLE_CRC_HASH +#ifdef ARM_ACLE TEST_CRC32(acle, crc32_acle, arm_cpu_has_crc32) #elif defined(POWER8_VSX_CRC32) TEST_CRC32(power8, crc32_power8, power_cpu_has_arch_2_07) diff --git a/win32/Makefile.a64 b/win32/Makefile.a64 index 8537bd5f..2a0f3cfe 100644 --- a/win32/Makefile.a64 +++ b/win32/Makefile.a64 @@ -91,12 +91,9 @@ OBJS = $(OBJS) gzlib.obj gzread.obj gzwrite.obj !endif WFLAGS = $(WFLAGS) \ - -DARM_ACLE_CRC_HASH \ + -DARM_ACLE \ -D__ARM_NEON__=1 \ -DARM_NEON \ - -DARM_NEON_ADLER32 \ - -DARM_NEON_CHUNKSET \ - -DARM_NEON_SLIDEHASH \ -DARM_NOCHECK_NEON \ # OBJS = $(OBJS) crc32_acle.obj insert_string_acle.obj adler32_neon.obj chunkset_neon.obj compare256_neon.obj slide_hash_neon.obj diff --git a/win32/Makefile.arm b/win32/Makefile.arm index 58a7fc86..7d3f1b58 100644 --- a/win32/Makefile.arm +++ b/win32/Makefile.arm @@ -95,7 +95,7 @@ OBJS = $(OBJS) gzlib.obj gzread.obj gzwrite.obj !endif !if "$(WITH_ACLE)" != "" -WFLAGS = $(WFLAGS) -DARM_ACLE_CRC_HASH +WFLAGS = $(WFLAGS) -DARM_ACLE OBJS = $(OBJS) crc32_acle.obj insert_string_acle.obj !endif !if "$(WITH_VFPV3)" != "" @@ -106,9 +106,6 @@ CFLAGS = $(CFLAGS) $(NEON_ARCH) WFLAGS = $(WFLAGS) \ -D__ARM_NEON__=1 \ -DARM_NEON \ - -DARM_NEON_ADLER32 \ - -DARM_NEON_CHUNKSET \ - -DARM_NEON_SLIDEHASH \ -DARM_NOCHECK_NEON 
\ # OBJS = $(OBJS) adler32_neon.obj chunkset_neon.obj compare256_neon.obj slide_hash_neon.obj diff --git a/win32/Makefile.msc b/win32/Makefile.msc index 9c00737a..f2f0631a 100644 --- a/win32/Makefile.msc +++ b/win32/Makefile.msc @@ -30,14 +30,10 @@ WFLAGS = \ -DX86_FEATURES \ -DX86_PCLMULQDQ_CRC \ -DX86_SSE2 \ - -DX86_SSE42_ADLER32 \ + -DX86_SSE42 \ -DX86_SSE42_CRC_INTRIN \ - -DX86_SSE42_CRC_HASH \ - -DX86_SSSE3_ADLER32 \ - -DX86_AVX2 \ - -DX86_AVX2_ADLER32 \ - -DX86_AVX_CHUNKSET \ - -DX86_SSE2_CHUNKSET + -DX86_SSSE3 \ + -DX86_AVX2 LDFLAGS = -nologo -debug -incremental:no -opt:ref -manifest ARFLAGS = -nologo