Move the NEON compatibility defines into a separate file

author Cameron Cawley <ccawley2011@gmail.com>

Thu, 29 Sep 2022 18:55:34 +0000 (19:55 +0100)

committer Hans Kristian Rosbach <hk-github@circlestorm.org>

Tue, 11 Oct 2022 10:22:33 +0000 (12:22 +0200)
author Cameron Cawley <ccawley2011@gmail.com>
Thu, 29 Sep 2022 18:55:34 +0000 (19:55 +0100)
committer Hans Kristian Rosbach <hk-github@circlestorm.org>
Tue, 11 Oct 2022 10:22:33 +0000 (12:22 +0200)
diff --git a/arch/arm/adler32_neon.c b/arch/arm/adler32_neon.c

index 9b55ca0c0802c62ab5c608bf41921a5740d49e02..9b9d65ddd47d4dd7632fff0b2ecde59a755926eb 100644 (file)
--- a/arch/arm/adler32_neon.c
+++ b/arch/arm/adler32_neon.c
@@ -6,14 +6,9 @@
   * For conditions of distribution and use, see copyright notice in zlib.h
   */
  #ifdef ARM_NEON_ADLER32
-#ifdef _M_ARM64
-#  include <arm64_neon.h>
-#else
-#  include <arm_neon.h>
-#endif
+#include "neon_intrins.h"
  #include "../../zbuild.h"
  #include "../../adler32_p.h"
-#include "../../fallback_builtins.h"
  
  static void NEON_accum32(uint32_t *s, const uint8_t *buf, uint64_t len) {
      static const uint16_t ALIGNED_(16) taps[64] = {
diff --git a/arch/arm/chunkset_neon.c b/arch/arm/chunkset_neon.c

index 3b8d2c0010ad9de7aa2cc7a4b2440237fc5b0f0c..2c64ce59e3c8436f5283f38f22661cf97707e179 100644 (file)
--- a/arch/arm/chunkset_neon.c
+++ b/arch/arm/chunkset_neon.c
@@ -3,11 +3,7 @@
   */
  
  #ifdef ARM_NEON_CHUNKSET
-#ifdef _M_ARM64
-#  include <arm64_neon.h>
-#else
-#  include <arm_neon.h>
-#endif
+#include "neon_intrins.h"
  #include "../../zbuild.h"
  #include "../generic/chunk_permute_table.h"
  
diff --git a/arch/arm/compare256_neon.c b/arch/arm/compare256_neon.c

index 53a088cc0094d5da5074d766c44b66deb63e0f97..7daeba411ecea7d0c3fbee707ac7c57a5acb2b2c 100644 (file)
--- a/arch/arm/compare256_neon.c
+++ b/arch/arm/compare256_neon.c
@@ -3,14 +3,13 @@
   * For conditions of distribution and use, see copyright notice in zlib.h
   */
  
-#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
-#ifdef _M_ARM64
-#  include <arm64_neon.h>
-#else
-#  include <arm_neon.h>
-#endif
  #include "../../zbuild.h"
  
+#include "fallback_builtins.h"
+
+#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
+#include "neon_intrins.h"
+
  static inline uint32_t compare256_neon_static(const uint8_t *src0, const uint8_t *src1) {
      uint32_t len = 0;
  
diff --git a/arch/arm/neon_intrins.h b/arch/arm/neon_intrins.h

new file mode 100644 (file)

index 0000000..5885779
--- /dev/null
+++ b/arch/arm/neon_intrins.h
@@ -0,0 +1,59 @@
+#ifndef ARM_NEON_INTRINS_H
+#define ARM_NEON_INTRINS_H
+
+#ifdef _M_ARM64
+#  include <arm64_neon.h>
+#else
+#  include <arm_neon.h>
+#endif
+
+#if defined(ARM_NEON_ADLER32) && !defined(__aarch64__) && !defined(_M_ARM64)
+/* Compatibility shim for the _high family of functions */
+#define vmull_high_u8(a, b) vmull_u8(vget_high_u8(a), vget_high_u8(b))
+#define vmlal_high_u8(a, b, c) vmlal_u8(a, vget_high_u8(b), vget_high_u8(c))
+#define vmlal_high_u16(a, b, c) vmlal_u16(a, vget_high_u16(b), vget_high_u16(c))
+#define vaddw_high_u8(a, b) vaddw_u8(a, vget_high_u8(b))
+#endif
+
+#ifdef ARM_NEON_SLIDEHASH
+
+#define vqsubq_u16_x4_x1(out, a, b) do { \
+    out.val[0] = vqsubq_u16(a.val[0], b); \
+    out.val[1] = vqsubq_u16(a.val[1], b); \
+    out.val[2] = vqsubq_u16(a.val[2], b); \
+    out.val[3] = vqsubq_u16(a.val[3], b); \
+} while (0)
+
+/* Have to check for hard float ABI on GCC/clang, but not
+ * on MSVC (we don't compile for the soft float ABI on windows)
+ */
+#if !defined(ARM_NEON_HASLD4) && (defined(__ARM_FP) || defined(_MSC_VER))
+
+static inline uint16x8x4_t vld1q_u16_x4(uint16_t *a) {
+    uint16x8x4_t ret = (uint16x8x4_t) {{
+                          vld1q_u16(a),
+                          vld1q_u16(a+8),
+                          vld1q_u16(a+16),
+                          vld1q_u16(a+24)}};
+    return ret;
+}
+
+static inline uint8x16x4_t vld1q_u8_x4(uint8_t *a) {
+    uint8x16x4_t ret = (uint8x16x4_t) {{
+                          vld1q_u8(a),
+                          vld1q_u8(a+16),
+                          vld1q_u8(a+32),
+                          vld1q_u8(a+48)}};
+    return ret;
+}
+
+static inline void vst1q_u16_x4(uint16_t *p, uint16x8x4_t a) {
+    vst1q_u16(p, a.val[0]);
+    vst1q_u16(p + 8, a.val[1]);
+    vst1q_u16(p + 16, a.val[2]);
+    vst1q_u16(p + 24, a.val[3]);
+}
+#endif // HASLD4 check and hard float
+#endif // ARM_NEON_SLIDEHASH
+
+#endif // include guard ARM_NEON_INTRINS_H
diff --git a/arch/arm/slide_hash_neon.c b/arch/arm/slide_hash_neon.c

index 8dc887379c8e777c92ea02756d0ca4d9b5431cf0..5bb4dc505c6cd6c4c1f4bbe4288125053ff40203 100644 (file)
--- a/arch/arm/slide_hash_neon.c
+++ b/arch/arm/slide_hash_neon.c
@@ -9,14 +9,9 @@
   */
  
  #if defined(ARM_NEON_SLIDEHASH)
-#ifdef _M_ARM64
-#  include <arm64_neon.h>
-#else
-#  include <arm_neon.h>
-#endif
+#include "neon_intrins.h"
  #include "../../zbuild.h"
  #include "../../deflate.h"
-#include "../../fallback_builtins.h"
  
  /* SIMD version of hash_chain rebase */
  static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
diff --git a/fallback_builtins.h b/fallback_builtins.h

index 3e589420789b838dd014f2d4c0a770becbd8e010..32bb54dc6fb85a1ae3d201780dab14cb10f67cde 100644 (file)
--- a/fallback_builtins.h
+++ b/fallback_builtins.h
@@ -71,59 +71,4 @@ static inline __m512i _mm512_zextsi128_si512(__m128i a) {
  
  #endif // __AVX2__
  
-#if defined(ARM_NEON_ADLER32) && !defined(__aarch64__) && !defined(_M_ARM64)
-/* Compatibility shim for the _high family of functions */
-#define vmull_high_u8(a, b) vmull_u8(vget_high_u8(a), vget_high_u8(b))
-#define vmlal_high_u8(a, b, c) vmlal_u8(a, vget_high_u8(b), vget_high_u8(c))
-#define vmlal_high_u16(a, b, c) vmlal_u16(a, vget_high_u16(b), vget_high_u16(c))
-#define vaddw_high_u8(a, b) vaddw_u8(a, vget_high_u8(b))
-#endif
-
-#ifdef ARM_NEON_SLIDEHASH
-
-#define vqsubq_u16_x4_x1(out, a, b) do { \
-    out.val[0] = vqsubq_u16(a.val[0], b); \
-    out.val[1] = vqsubq_u16(a.val[1], b); \
-    out.val[2] = vqsubq_u16(a.val[2], b); \
-    out.val[3] = vqsubq_u16(a.val[3], b); \
-} while (0)
-
-/* Have to check for hard float ABI on GCC/clang, but not
- * on MSVC (we don't compile for the soft float ABI on windows)
- */
-#if !defined(ARM_NEON_HASLD4) && (defined(__ARM_FP) || defined(_MSC_VER))
-
-#ifdef _M_ARM64
-#  include <arm64_neon.h>
-#else
-#  include <arm_neon.h>
-#endif
-
-static inline uint16x8x4_t vld1q_u16_x4(uint16_t *a) {
-    uint16x8x4_t ret = (uint16x8x4_t) {{
-                          vld1q_u16(a),
-                          vld1q_u16(a+8),
-                          vld1q_u16(a+16),
-                          vld1q_u16(a+24)}};
-    return ret;
-}
-
-static inline uint8x16x4_t vld1q_u8_x4(uint8_t *a) {
-    uint8x16x4_t ret = (uint8x16x4_t) {{
-                          vld1q_u8(a),
-                          vld1q_u8(a+16),
-                          vld1q_u8(a+32),
-                          vld1q_u8(a+48)}};
-    return ret;
-}
-
-static inline void vst1q_u16_x4(uint16_t *p, uint16x8x4_t a) {
-    vst1q_u16(p, a.val[0]);
-    vst1q_u16(p + 8, a.val[1]);
-    vst1q_u16(p + 16, a.val[2]);
-    vst1q_u16(p + 24, a.val[3]);
-}
-#endif // HASLD4 check and hard float
-#endif // ARM_NEON_SLIDEHASH
-
  #endif // include guard FALLBACK_BUILTINS_H
author	Cameron Cawley <ccawley2011@gmail.com>
	Thu, 29 Sep 2022 18:55:34 +0000 (19:55 +0100)
committer	Hans Kristian Rosbach <hk-github@circlestorm.org>
	Tue, 11 Oct 2022 10:22:33 +0000 (12:22 +0200)
arch/arm/adler32_neon.c		patch \| blob \| blame \| history
arch/arm/chunkset_neon.c		patch \| blob \| blame \| history
arch/arm/compare256_neon.c		patch \| blob \| blame \| history
arch/arm/neon_intrins.h	[new file with mode: 0644]	patch \| blob
arch/arm/slide_hash_neon.c		patch \| blob \| blame \| history
fallback_builtins.h		patch \| blob \| blame \| history