From: Cameron Cawley Date: Thu, 29 Sep 2022 18:55:34 +0000 (+0100) Subject: Move the NEON compatibility defines into a separate file X-Git-Tag: 2.1.0-beta1~151 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=9c839540ed522d10a7519d78881ca7666b042db6;p=thirdparty%2Fzlib-ng.git Move the NEON compatibility defines into a separate file --- diff --git a/arch/arm/adler32_neon.c b/arch/arm/adler32_neon.c index 9b55ca0c0..9b9d65ddd 100644 --- a/arch/arm/adler32_neon.c +++ b/arch/arm/adler32_neon.c @@ -6,14 +6,9 @@ * For conditions of distribution and use, see copyright notice in zlib.h */ #ifdef ARM_NEON_ADLER32 -#ifdef _M_ARM64 -# include -#else -# include -#endif +#include "neon_intrins.h" #include "../../zbuild.h" #include "../../adler32_p.h" -#include "../../fallback_builtins.h" static void NEON_accum32(uint32_t *s, const uint8_t *buf, uint64_t len) { static const uint16_t ALIGNED_(16) taps[64] = { diff --git a/arch/arm/chunkset_neon.c b/arch/arm/chunkset_neon.c index 3b8d2c001..2c64ce59e 100644 --- a/arch/arm/chunkset_neon.c +++ b/arch/arm/chunkset_neon.c @@ -3,11 +3,7 @@ */ #ifdef ARM_NEON_CHUNKSET -#ifdef _M_ARM64 -# include -#else -# include -#endif +#include "neon_intrins.h" #include "../../zbuild.h" #include "../generic/chunk_permute_table.h" diff --git a/arch/arm/compare256_neon.c b/arch/arm/compare256_neon.c index 53a088cc0..7daeba411 100644 --- a/arch/arm/compare256_neon.c +++ b/arch/arm/compare256_neon.c @@ -3,14 +3,13 @@ * For conditions of distribution and use, see copyright notice in zlib.h */ -#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL) -#ifdef _M_ARM64 -# include -#else -# include -#endif #include "../../zbuild.h" +#include "fallback_builtins.h" + +#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL) +#include "neon_intrins.h" + static inline uint32_t compare256_neon_static(const uint8_t *src0, const uint8_t *src1) { uint32_t len = 0; diff --git a/arch/arm/neon_intrins.h b/arch/arm/neon_intrins.h new file mode 100644 index 000000000..5885779ee --- /dev/null +++ b/arch/arm/neon_intrins.h @@ -0,0 +1,59 @@ +#ifndef ARM_NEON_INTRINS_H +#define ARM_NEON_INTRINS_H + +#ifdef _M_ARM64 +# include +#else +# include +#endif + +#if defined(ARM_NEON_ADLER32) && !defined(__aarch64__) && !defined(_M_ARM64) +/* Compatibility shim for the _high family of functions */ +#define vmull_high_u8(a, b) vmull_u8(vget_high_u8(a), vget_high_u8(b)) +#define vmlal_high_u8(a, b, c) vmlal_u8(a, vget_high_u8(b), vget_high_u8(c)) +#define vmlal_high_u16(a, b, c) vmlal_u16(a, vget_high_u16(b), vget_high_u16(c)) +#define vaddw_high_u8(a, b) vaddw_u8(a, vget_high_u8(b)) +#endif + +#ifdef ARM_NEON_SLIDEHASH + +#define vqsubq_u16_x4_x1(out, a, b) do { \ + out.val[0] = vqsubq_u16(a.val[0], b); \ + out.val[1] = vqsubq_u16(a.val[1], b); \ + out.val[2] = vqsubq_u16(a.val[2], b); \ + out.val[3] = vqsubq_u16(a.val[3], b); \ +} while (0) + +/* Have to check for hard float ABI on GCC/clang, but not + * on MSVC (we don't compile for the soft float ABI on windows) + */ +#if !defined(ARM_NEON_HASLD4) && (defined(__ARM_FP) || defined(_MSC_VER)) + +static inline uint16x8x4_t vld1q_u16_x4(uint16_t *a) { + uint16x8x4_t ret = (uint16x8x4_t) {{ + vld1q_u16(a), + vld1q_u16(a+8), + vld1q_u16(a+16), + vld1q_u16(a+24)}}; + return ret; +} + +static inline uint8x16x4_t vld1q_u8_x4(uint8_t *a) { + uint8x16x4_t ret = (uint8x16x4_t) {{ + vld1q_u8(a), + vld1q_u8(a+16), + vld1q_u8(a+32), + vld1q_u8(a+48)}}; + return ret; +} + +static inline void vst1q_u16_x4(uint16_t *p, uint16x8x4_t a) { + vst1q_u16(p, a.val[0]); + vst1q_u16(p + 8, a.val[1]); + vst1q_u16(p + 16, a.val[2]); + vst1q_u16(p + 24, a.val[3]); +} +#endif // HASLD4 check and hard float +#endif // ARM_NEON_SLIDEHASH + +#endif // include guard ARM_NEON_INTRINS_H diff --git a/arch/arm/slide_hash_neon.c b/arch/arm/slide_hash_neon.c index 8dc887379..5bb4dc505 100644 --- a/arch/arm/slide_hash_neon.c +++ b/arch/arm/slide_hash_neon.c @@ -9,14 +9,9 @@ */ #if defined(ARM_NEON_SLIDEHASH) -#ifdef _M_ARM64 -# include -#else -# include -#endif +#include "neon_intrins.h" #include "../../zbuild.h" #include "../../deflate.h" -#include "../../fallback_builtins.h" /* SIMD version of hash_chain rebase */ static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) { diff --git a/fallback_builtins.h b/fallback_builtins.h index 3e5894207..32bb54dc6 100644 --- a/fallback_builtins.h +++ b/fallback_builtins.h @@ -71,59 +71,4 @@ static inline __m512i _mm512_zextsi128_si512(__m128i a) { #endif // __AVX2__ -#if defined(ARM_NEON_ADLER32) && !defined(__aarch64__) && !defined(_M_ARM64) -/* Compatibility shim for the _high family of functions */ -#define vmull_high_u8(a, b) vmull_u8(vget_high_u8(a), vget_high_u8(b)) -#define vmlal_high_u8(a, b, c) vmlal_u8(a, vget_high_u8(b), vget_high_u8(c)) -#define vmlal_high_u16(a, b, c) vmlal_u16(a, vget_high_u16(b), vget_high_u16(c)) -#define vaddw_high_u8(a, b) vaddw_u8(a, vget_high_u8(b)) -#endif - -#ifdef ARM_NEON_SLIDEHASH - -#define vqsubq_u16_x4_x1(out, a, b) do { \ - out.val[0] = vqsubq_u16(a.val[0], b); \ - out.val[1] = vqsubq_u16(a.val[1], b); \ - out.val[2] = vqsubq_u16(a.val[2], b); \ - out.val[3] = vqsubq_u16(a.val[3], b); \ -} while (0) - -/* Have to check for hard float ABI on GCC/clang, but not - * on MSVC (we don't compile for the soft float ABI on windows) - */ -#if !defined(ARM_NEON_HASLD4) && (defined(__ARM_FP) || defined(_MSC_VER)) - -#ifdef _M_ARM64 -# include -#else -# include -#endif - -static inline uint16x8x4_t vld1q_u16_x4(uint16_t *a) { - uint16x8x4_t ret = (uint16x8x4_t) {{ - vld1q_u16(a), - vld1q_u16(a+8), - vld1q_u16(a+16), - vld1q_u16(a+24)}}; - return ret; -} - -static inline uint8x16x4_t vld1q_u8_x4(uint8_t *a) { - uint8x16x4_t ret = (uint8x16x4_t) {{ - vld1q_u8(a), - vld1q_u8(a+16), - vld1q_u8(a+32), - vld1q_u8(a+48)}}; - return ret; -} - -static inline void vst1q_u16_x4(uint16_t *p, uint16x8x4_t a) { - vst1q_u16(p, a.val[0]); - vst1q_u16(p + 8, a.val[1]); - vst1q_u16(p + 16, a.val[2]); - vst1q_u16(p + 24, a.val[3]); -} -#endif // HASLD4 check and hard float -#endif // ARM_NEON_SLIDEHASH - #endif // include guard FALLBACK_BUILTINS_H