)
set(ZLIB_PRIVATE_HDRS
arch/generic/chunk_permute_table.h
- arch/generic/compare256_p.h
arch/generic/generic_functions.h
adler32_p.h
arch_functions.h
arch/generic/adler32_c.c
arch/generic/crc32_braid_c.c
)
-
- # x86_64 does not need compare256 fallback if we have BUILTIN_CTZ
- if(NOT HAVE_BUILTIN_CTZ)
- list(APPEND ZLIB_GENERIC_SRCS arch/generic/compare256_c.c)
- endif()
else()
list(APPEND ZLIB_GENERIC_SRCS ${ZLIB_ALL_FALLBACK_SRCS})
add_definitions(-DWITH_ALL_FALLBACKS)
uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t adler32_copy_neon(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
uint8_t* chunkmemset_safe_neon(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
-
-# ifdef HAVE_BUILTIN_CTZLL
uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1);
+void inflate_fast_neon(PREFIX3(stream) *strm, uint32_t start);
uint32_t longest_match_neon(deflate_state *const s, uint32_t cur_match);
uint32_t longest_match_slow_neon(deflate_state *const s, uint32_t cur_match);
-# endif
void slide_hash_neon(deflate_state *s);
-void inflate_fast_neon(PREFIX3(stream) *strm, uint32_t start);
#endif
#ifdef ARM_CRC32
# define native_adler32_copy adler32_copy_neon
# undef native_chunkmemset_safe
# define native_chunkmemset_safe chunkmemset_safe_neon
+# undef native_compare256
+# define native_compare256 compare256_neon
# undef native_inflate_fast
# define native_inflate_fast inflate_fast_neon
+# undef native_longest_match
+# define native_longest_match longest_match_neon
+# undef native_longest_match_slow
+# define native_longest_match_slow longest_match_slow_neon
# undef native_slide_hash
# define native_slide_hash slide_hash_neon
-# ifdef HAVE_BUILTIN_CTZLL
-# undef native_compare256
-# define native_compare256 compare256_neon
-# undef native_longest_match
-# define native_longest_match longest_match_neon
-# undef native_longest_match_slow
-# define native_longest_match_slow longest_match_slow_neon
-# endif
# endif
// ARM - CRC32
# if (defined(ARM_CRC32) && defined(__ARM_FEATURE_CRC32))
#include "deflate.h"
#include "fallback_builtins.h"
-#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
+#if defined(ARM_NEON)
#include "neon_intrins.h"
static inline uint32_t compare256_neon_static(const uint8_t *src0, const uint8_t *src1) {
chunkset_c.lo: $(SRCDIR)/chunkset_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/chunkset_tpl.h $(SRCTOP)/inffast_tpl.h
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_c.c
-compare256_c.o: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCDIR)/compare256_p.h $(SRCTOP)/zmemory.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h
+compare256_c.o: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/zendian.h $(SRCTOP)/zmemory.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_c.c
-compare256_c.lo: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCDIR)/compare256_p.h $(SRCTOP)/zmemory.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h
+compare256_c.lo: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/zendian.h $(SRCTOP)/zmemory.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_c.c
crc32_braid_c.o: $(SRCDIR)/crc32_braid_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/crc32_braid_p.h $(SRCTOP)/crc32_braid_tbl.h
*/
#include "zbuild.h"
-#include "compare256_p.h"
+#include "zendian.h"
+#include "deflate.h"
+#include "fallback_builtins.h"
+
+/* 8-bit integer comparison for hardware without unaligned loads */
+static inline uint32_t compare256_8_static(const uint8_t *src0, const uint8_t *src1) {
+ uint32_t len = 0;
+
+ do {
+ if (src0[0] != src1[0])
+ return len;
+ if (src0[1] != src1[1])
+ return len + 1;
+ if (src0[2] != src1[2])
+ return len + 2;
+ if (src0[3] != src1[3])
+ return len + 3;
+ if (src0[4] != src1[4])
+ return len + 4;
+ if (src0[5] != src1[5])
+ return len + 5;
+ if (src0[6] != src1[6])
+ return len + 6;
+ if (src0[7] != src1[7])
+ return len + 7;
+ src0 += 8, src1 += 8, len += 8;
+ } while (len < 256);
+
+ return 256;
+}
+
+/* 64-bit integer comparison for hardware with unaligned loads */
+static inline uint32_t compare256_64_static(const uint8_t *src0, const uint8_t *src1) {
+ uint32_t len = 0;
+
+ do {
+ uint64_t sv = zng_memread_8(src0);
+ uint64_t mv = zng_memread_8(src1);
+ uint64_t diff = sv ^ mv;
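+ /* Byte-swap on big-endian hosts so the trailing zero count of the XOR gives the number of matching leading bytes */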
+ if (diff)
+ return len + zng_ctz64(Z_U64_TO_LE(diff)) / 8;
+ src0 += 8, src1 += 8, len += 8;
+
+ sv = zng_memread_8(src0);
+ mv = zng_memread_8(src1);
+ diff = sv ^ mv;
+ if (diff)
+ return len + zng_ctz64(Z_U64_TO_LE(diff)) / 8;
+ src0 += 8, src1 += 8, len += 8;
+ } while (len < 256);
+
+ return 256;
+}
-// Set optimal COMPARE256 function variant
#if OPTIMAL_CMP == 8
-# define COMPARE256 compare256_8
-#elif defined(HAVE_BUILTIN_CTZLL)
-# define COMPARE256 compare256_64
-#elif defined(HAVE_BUILTIN_CTZ)
-# define COMPARE256 compare256_32
+# define COMPARE256 compare256_8_static
#else
-# define COMPARE256 compare256_16
+# define COMPARE256 compare256_64_static
+#endif
+
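+/* Out-of-line copies of the static variants, exported only with WITH_ALL_FALLBACKS so the compare256 tests and benchmarks can exercise each one directly */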
+#ifdef WITH_ALL_FALLBACKS
+Z_INTERNAL uint32_t compare256_8(const uint8_t *src0, const uint8_t *src1) {
+ return compare256_8_static(src0, src1);
+}
+
+Z_INTERNAL uint32_t compare256_64(const uint8_t *src0, const uint8_t *src1) {
+ return compare256_64_static(src0, src1);
+}
#endif
Z_INTERNAL uint32_t compare256_c(const uint8_t *src0, const uint8_t *src1) {
-/* compare256_p.h -- 256 byte memory comparison with match length return
- * Copyright (C) 2020 Nathan Moinvaziri
- * For conditions of distribution and use, see copyright notice in zlib.h
- */
-
-#include "zmemory.h"
-#include "deflate.h"
-#include "fallback_builtins.h"
-
-/* 8-bit integer comparison */
-static inline uint32_t compare256_8(const uint8_t *src0, const uint8_t *src1) {
- uint32_t len = 0;
-
- do {
- if (*src0 != *src1)
- return len;
- src0 += 1, src1 += 1, len += 1;
- if (*src0 != *src1)
- return len;
- src0 += 1, src1 += 1, len += 1;
- if (*src0 != *src1)
- return len;
- src0 += 1, src1 += 1, len += 1;
- if (*src0 != *src1)
- return len;
- src0 += 1, src1 += 1, len += 1;
- if (*src0 != *src1)
- return len;
- src0 += 1, src1 += 1, len += 1;
- if (*src0 != *src1)
- return len;
- src0 += 1, src1 += 1, len += 1;
- if (*src0 != *src1)
- return len;
- src0 += 1, src1 += 1, len += 1;
- if (*src0 != *src1)
- return len;
- src0 += 1, src1 += 1, len += 1;
- } while (len < 256);
-
- return 256;
-}
-
-/* 16-bit integer comparison */
-static inline uint32_t compare256_16(const uint8_t *src0, const uint8_t *src1) {
- uint32_t len = 0;
-
- do {
- if (zng_memcmp_2(src0, src1) != 0)
- return len + (*src0 == *src1);
- src0 += 2, src1 += 2, len += 2;
-
- if (zng_memcmp_2(src0, src1) != 0)
- return len + (*src0 == *src1);
- src0 += 2, src1 += 2, len += 2;
-
- if (zng_memcmp_2(src0, src1) != 0)
- return len + (*src0 == *src1);
- src0 += 2, src1 += 2, len += 2;
-
- if (zng_memcmp_2(src0, src1) != 0)
- return len + (*src0 == *src1);
- src0 += 2, src1 += 2, len += 2;
- } while (len < 256);
-
- return 256;
-}
-
-#ifdef HAVE_BUILTIN_CTZ
-/* 32-bit integer comparison */
-static inline uint32_t compare256_32(const uint8_t *src0, const uint8_t *src1) {
- uint32_t len = 0;
-
- do {
- uint32_t sv, mv, diff;
-
- sv = zng_memread_4(src0);
- mv = zng_memread_4(src1);
-
- diff = sv ^ mv;
- if (diff)
- return len + zng_ctz32(Z_U32_FROM_LE(diff)) / 8;
-
- src0 += 4, src1 += 4, len += 4;
- } while (len < 256);
-
- return 256;
-}
-#endif
-
-#ifdef HAVE_BUILTIN_CTZLL
-/* 64-bit integer comparison */
-static inline uint32_t compare256_64(const uint8_t *src0, const uint8_t *src1) {
- uint32_t len = 0;
-
- do {
- uint64_t sv, mv, diff;
-
- sv = zng_memread_8(src0);
- mv = zng_memread_8(src1);
-
- diff = sv ^ mv;
- if (diff)
- return len + zng_ctz64(Z_U64_FROM_LE(diff)) / 8;
-
- src0 += 8, src1 += 8, len += 8;
- } while (len < 256);
-
- return 256;
-}
-#endif
uint8_t* chunkmemset_safe_c(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
+#ifdef WITH_ALL_FALLBACKS
+uint32_t compare256_8(const uint8_t *src0, const uint8_t *src1);
+uint32_t compare256_64(const uint8_t *src0, const uint8_t *src1);
+#endif
uint32_t compare256_c(const uint8_t *src0, const uint8_t *src1);
uint32_t crc32_braid(uint32_t crc, const uint8_t *buf, size_t len);
*/
#include "zbuild.h"
+#include "zendian.h"
#include "zmemory.h"
#include "deflate.h"
#include "fallback_builtins.h"
-#if defined(LOONGARCH_LASX) && defined(HAVE_BUILTIN_CTZ)
+#ifdef LOONGARCH_LASX
#include <lasxintrin.h>
#include "lasxintrin_ext.h"
*/
#include "zbuild.h"
+#include "zendian.h"
#include "zmemory.h"
#include "deflate.h"
#include "fallback_builtins.h"
-#if defined(LOONGARCH_LSX) && defined(HAVE_BUILTIN_CTZ)
+#ifdef LOONGARCH_LSX
#include <lsxintrin.h>
#include "lsxintrin_ext.h"
#ifdef LOONGARCH_LSX
uint32_t adler32_lsx(uint32_t adler, const uint8_t *src, size_t len);
uint32_t adler32_copy_lsx(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
-void slide_hash_lsx(deflate_state *s);
-# ifdef HAVE_BUILTIN_CTZ
- uint32_t compare256_lsx(const uint8_t *src0, const uint8_t *src1);
- uint32_t longest_match_lsx(deflate_state *const s, uint32_t cur_match);
- uint32_t longest_match_slow_lsx(deflate_state *const s, uint32_t cur_match);
-# endif
uint8_t* chunkmemset_safe_lsx(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
+uint32_t compare256_lsx(const uint8_t *src0, const uint8_t *src1);
void inflate_fast_lsx(PREFIX3(stream) *strm, uint32_t start);
+uint32_t longest_match_lsx(deflate_state *const s, uint32_t cur_match);
+uint32_t longest_match_slow_lsx(deflate_state *const s, uint32_t cur_match);
+void slide_hash_lsx(deflate_state *s);
#endif
#ifdef LOONGARCH_LASX
uint32_t adler32_lasx(uint32_t adler, const uint8_t *src, size_t len);
uint32_t adler32_copy_lasx(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
-void slide_hash_lasx(deflate_state *s);
-# ifdef HAVE_BUILTIN_CTZ
- uint32_t compare256_lasx(const uint8_t *src0, const uint8_t *src1);
- uint32_t longest_match_lasx(deflate_state *const s, uint32_t cur_match);
- uint32_t longest_match_slow_lasx(deflate_state *const s, uint32_t cur_match);
-# endif
uint8_t* chunkmemset_safe_lasx(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
+uint32_t compare256_lasx(const uint8_t *src0, const uint8_t *src1);
void inflate_fast_lasx(PREFIX3(stream) *strm, uint32_t start);
+uint32_t longest_match_lasx(deflate_state *const s, uint32_t cur_match);
+uint32_t longest_match_slow_lasx(deflate_state *const s, uint32_t cur_match);
+void slide_hash_lasx(deflate_state *s);
#endif
#ifdef DISABLE_RUNTIME_CPU_DETECTION
# define native_adler32 adler32_lsx
# undef native_adler32_copy
# define native_adler32_copy adler32_copy_lsx
-# undef native_slide_hash
-# define native_slide_hash slide_hash_lsx
# undef native_chunkmemset_safe
# define native_chunkmemset_safe chunkmemset_safe_lsx
+# undef native_compare256
+# define native_compare256 compare256_lsx
# undef native_inflate_fast
# define native_inflate_fast inflate_fast_lsx
-# ifdef HAVE_BUILTIN_CTZ
-# undef native_compare256
-# define native_compare256 compare256_lsx
-# undef native_longest_match
-# define native_longest_match longest_match_lsx
-# undef native_longest_match_slow
-# define native_longest_match_slow longest_match_slow_lsx
-# endif
+# undef native_longest_match
+# define native_longest_match longest_match_lsx
+# undef native_longest_match_slow
+# define native_longest_match_slow longest_match_slow_lsx
+# undef native_slide_hash
+# define native_slide_hash slide_hash_lsx
# endif
# if defined(LOONGARCH_LASX) && defined(__loongarch_asx)
# undef native_adler32
# define native_adler32 adler32_lasx
# undef native_adler32_copy
# define native_adler32_copy adler32_copy_lasx
-# undef native_slide_hash
-# define native_slide_hash slide_hash_lasx
# undef native_chunkmemset_safe
# define native_chunkmemset_safe chunkmemset_safe_lasx
+# undef native_compare256
+# define native_compare256 compare256_lasx
# undef native_inflate_fast
# define native_inflate_fast inflate_fast_lasx
-# ifdef HAVE_BUILTIN_CTZ
-# undef native_compare256
-# define native_compare256 compare256_lasx
-# undef native_longest_match
-# define native_longest_match longest_match_lasx
-# undef native_longest_match_slow
-# define native_longest_match_slow longest_match_slow_lasx
-# endif
+# undef native_longest_match
+# define native_longest_match longest_match_lasx
+# undef native_longest_match_slow
+# define native_longest_match_slow longest_match_slow_lasx
+# undef native_slide_hash
+# define native_slide_hash slide_hash_lasx
# endif
#endif
#include "zbuild.h"
#include "zmemory.h"
#include "deflate.h"
-#include "fallback_builtins.h"
#include <riscv_vector.h>
*/
#include "zbuild.h"
+#include "zendian.h"
#include "zmemory.h"
#include "deflate.h"
#include "fallback_builtins.h"
-#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
+#ifdef X86_AVX2
#include <immintrin.h>
#ifdef _MSC_VER
*/
#include "zbuild.h"
+#include "zendian.h"
#include "zmemory.h"
#include "deflate.h"
#include "fallback_builtins.h"
-#if defined(X86_AVX512) && defined(HAVE_BUILTIN_CTZLL)
+#ifdef X86_AVX512
#include <immintrin.h>
#ifdef _MSC_VER
// 16 bytes
xmm_src0_0 = _mm_loadu_si128((__m128i*)src0);
xmm_src1_0 = _mm_loadu_si128((__m128i*)src1);
- mask_0 = (uint32_t)_mm_cmpeq_epu8_mask(xmm_src0_0, xmm_src1_0); // zero-extended to use __builtin_ctz
+ mask_0 = (uint32_t)_mm_cmpeq_epu8_mask(xmm_src0_0, xmm_src1_0);
if (mask_0 != 0x0000FFFF)
return zng_ctz32(~mask_0); /* Invert bits so identical = 0 */
*/
#include "zbuild.h"
+#include "zendian.h"
#include "zmemory.h"
#include "deflate.h"
#include "fallback_builtins.h"
-#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
+#ifdef X86_SSE2
#include <emmintrin.h>
#ifdef X86_SSE2
uint8_t* chunkmemset_safe_sse2(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
+uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1);
+void inflate_fast_sse2(PREFIX3(stream)* strm, uint32_t start);
+uint32_t longest_match_sse2(deflate_state *const s, uint32_t cur_match);
+uint32_t longest_match_slow_sse2(deflate_state *const s, uint32_t cur_match);
+void slide_hash_sse2(deflate_state *s);
-# ifdef HAVE_BUILTIN_CTZ
- uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1);
- uint32_t longest_match_sse2(deflate_state *const s, uint32_t cur_match);
- uint32_t longest_match_slow_sse2(deflate_state *const s, uint32_t cur_match);
-# endif
- void slide_hash_sse2(deflate_state *s);
- void inflate_fast_sse2(PREFIX3(stream)* strm, uint32_t start);
# if !defined(WITHOUT_CHORBA_SSE)
uint32_t crc32_chorba_sse2(uint32_t crc, const uint8_t *buf, size_t len);
uint32_t crc32_copy_chorba_sse2(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
uint32_t adler32_avx2(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t adler32_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
uint8_t* chunkmemset_safe_avx2(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
-
-# ifdef HAVE_BUILTIN_CTZ
- uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1);
- uint32_t longest_match_avx2(deflate_state *const s, uint32_t cur_match);
- uint32_t longest_match_slow_avx2(deflate_state *const s, uint32_t cur_match);
-# endif
- void slide_hash_avx2(deflate_state *s);
- void inflate_fast_avx2(PREFIX3(stream)* strm, uint32_t start);
+uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1);
+void inflate_fast_avx2(PREFIX3(stream)* strm, uint32_t start);
+uint32_t longest_match_avx2(deflate_state *const s, uint32_t cur_match);
+uint32_t longest_match_slow_avx2(deflate_state *const s, uint32_t cur_match);
+void slide_hash_avx2(deflate_state *s);
#endif
#ifdef X86_AVX512
uint32_t adler32_avx512(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t adler32_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
uint8_t* chunkmemset_safe_avx512(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
+uint32_t compare256_avx512(const uint8_t *src0, const uint8_t *src1);
void inflate_fast_avx512(PREFIX3(stream)* strm, uint32_t start);
-# ifdef HAVE_BUILTIN_CTZLL
- uint32_t compare256_avx512(const uint8_t *src0, const uint8_t *src1);
- uint32_t longest_match_avx512(deflate_state *const s, uint32_t cur_match);
- uint32_t longest_match_slow_avx512(deflate_state *const s, uint32_t cur_match);
-# endif
+uint32_t longest_match_avx512(deflate_state *const s, uint32_t cur_match);
+uint32_t longest_match_slow_avx512(deflate_state *const s, uint32_t cur_match);
#endif
#ifdef X86_AVX512VNNI
uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *buf, size_t len);
# if (defined(X86_SSE2) && defined(__SSE2__)) || (defined(ARCH_X86) && defined(ARCH_64BIT))
# undef native_chunkmemset_safe
# define native_chunkmemset_safe chunkmemset_safe_sse2
+# undef native_compare256
+# define native_compare256 compare256_sse2
# undef native_inflate_fast
# define native_inflate_fast inflate_fast_sse2
+# undef native_longest_match
+# define native_longest_match longest_match_sse2
+# undef native_longest_match_slow
+# define native_longest_match_slow longest_match_slow_sse2
+# if !defined(WITHOUT_CHORBA_SSE)
+# undef native_crc32
+# define native_crc32 crc32_chorba_sse2
+# endif
# undef native_slide_hash
# define native_slide_hash slide_hash_sse2
-# ifdef HAVE_BUILTIN_CTZ
-# undef native_compare256
-# define native_compare256 compare256_sse2
-# undef native_longest_match
-# define native_longest_match longest_match_sse2
-# undef native_longest_match_slow
-# define native_longest_match_slow longest_match_slow_sse2
-# if !defined(WITHOUT_CHORBA_SSE)
-# undef native_crc32
-# define native_crc32 crc32_chorba_sse2
-# endif
-# endif
# endif
// X86 - SSSE3
# if defined(X86_SSSE3) && defined(__SSSE3__)
# define native_adler32_copy adler32_copy_avx2
# undef native_chunkmemset_safe
# define native_chunkmemset_safe chunkmemset_safe_avx2
+# undef native_compare256
+# define native_compare256 compare256_avx2
# undef native_inflate_fast
# define native_inflate_fast inflate_fast_avx2
+# undef native_longest_match
+# define native_longest_match longest_match_avx2
+# undef native_longest_match_slow
+# define native_longest_match_slow longest_match_slow_avx2
# undef native_slide_hash
# define native_slide_hash slide_hash_avx2
-# ifdef HAVE_BUILTIN_CTZ
-# undef native_compare256
-# define native_compare256 compare256_avx2
-# undef native_longest_match
-# define native_longest_match longest_match_avx2
-# undef native_longest_match_slow
-# define native_longest_match_slow longest_match_slow_avx2
-# endif
# endif
// X86 - AVX512 (F,DQ,BW,Vl)
# if defined(X86_AVX512) && defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__)
# define native_adler32_copy adler32_copy_avx512
# undef native_chunkmemset_safe
# define native_chunkmemset_safe chunkmemset_safe_avx512
+# undef native_compare256
+# define native_compare256 compare256_avx512
# undef native_inflate_fast
# define native_inflate_fast inflate_fast_avx512
-# ifdef HAVE_BUILTIN_CTZLL
-# undef native_compare256
-# define native_compare256 compare256_avx512
-# undef native_longest_match
-# define native_longest_match longest_match_avx512
-# undef native_longest_match_slow
-# define native_longest_match_slow longest_match_slow_avx512
-# endif
+# undef native_longest_match
+# define native_longest_match longest_match_avx512
+# undef native_longest_match_slow
+# define native_longest_match_slow longest_match_slow_avx512
// X86 - AVX512 (VNNI)
# if defined(X86_AVX512VNNI) && defined(__AVX512VNNI__)
# undef native_adler32
*/
#include "zbuild.h"
+#include "zendian.h"
#include "zmemory.h"
#include "fallback_builtins.h"
typedef uint32_t (*compare256_rle_func)(const uint8_t* src0, const uint8_t* src1);
-/* 8-bit integer comparison */
+/* 8-bit RLE comparison for hardware without unaligned loads */
static inline uint32_t compare256_rle_8(const uint8_t *src0, const uint8_t *src1) {
uint32_t len = 0;
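+ /* In RLE mode every byte of src1 is compared against this single source byte */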
+ uint8_t val = *src0;
do {
- if (*src0 != *src1)
- return len;
- src1 += 1, len += 1;
- if (*src0 != *src1)
- return len;
- src1 += 1, len += 1;
- if (*src0 != *src1)
- return len;
- src1 += 1, len += 1;
- if (*src0 != *src1)
- return len;
- src1 += 1, len += 1;
- if (*src0 != *src1)
- return len;
- src1 += 1, len += 1;
- if (*src0 != *src1)
- return len;
- src1 += 1, len += 1;
- if (*src0 != *src1)
- return len;
- src1 += 1, len += 1;
- if (*src0 != *src1)
- return len;
- src1 += 1, len += 1;
- } while (len < 256);
-
- return 256;
-}
-
-/* 16-bit integer comparison */
-static inline uint32_t compare256_rle_16(const uint8_t *src0, const uint8_t *src1) {
- uint32_t len = 0;
- uint16_t src0_cmp;
-
- src0_cmp = zng_memread_2(src0);
-
- do {
- if (src0_cmp != zng_memread_2(src1))
- return len + (*src0 == *src1);
- src1 += 2, len += 2;
- if (src0_cmp != zng_memread_2(src1))
- return len + (*src0 == *src1);
- src1 += 2, len += 2;
- if (src0_cmp != zng_memread_2(src1))
- return len + (*src0 == *src1);
- src1 += 2, len += 2;
- if (src0_cmp != zng_memread_2(src1))
- return len + (*src0 == *src1);
- src1 += 2, len += 2;
- } while (len < 256);
-
- return 256;
-}
-
-#ifdef HAVE_BUILTIN_CTZ
-/* 32-bit integer comparison */
-static inline uint32_t compare256_rle_32(const uint8_t *src0, const uint8_t *src1) {
- uint32_t sv, len = 0;
- uint16_t src0_cmp;
-
- src0_cmp = zng_memread_2(src0);
- sv = ((uint32_t)src0_cmp << 16) | src0_cmp;
-
- do {
- uint32_t mv, diff;
-
- mv = zng_memread_4(src1);
-
- diff = sv ^ mv;
- if (diff)
- return len + zng_ctz32(Z_U32_TO_LE(diff)) / 8;
-
- src1 += 4, len += 4;
+ if (val != src1[0])
+ return len;
+ if (val != src1[1])
+ return len + 1;
+ if (val != src1[2])
+ return len + 2;
+ if (val != src1[3])
+ return len + 3;
+ if (val != src1[4])
+ return len + 4;
+ if (val != src1[5])
+ return len + 5;
+ if (val != src1[6])
+ return len + 6;
+ if (val != src1[7])
+ return len + 7;
+ src1 += 8, len += 8;
} while (len < 256);
return 256;
}
-#endif
-#ifdef HAVE_BUILTIN_CTZLL
-/* 64-bit integer comparison */
+/* 64-bit RLE comparison for hardware with unaligned loads */
static inline uint32_t compare256_rle_64(const uint8_t *src0, const uint8_t *src1) {
uint32_t src0_cmp32, len = 0;
uint16_t src0_cmp;
- uint64_t sv;
+ uint64_t sv, mv, diff;
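+ /* Broadcast the 2-byte source pattern into a 64-bit word so eight bytes are compared per load */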
src0_cmp = zng_memread_2(src0);
src0_cmp32 = ((uint32_t)src0_cmp << 16) | src0_cmp;
sv = ((uint64_t)src0_cmp32 << 32) | src0_cmp32;
do {
- uint64_t mv, diff;
-
mv = zng_memread_8(src1);
-
diff = sv ^ mv;
if (diff)
return len + zng_ctz64(Z_U64_TO_LE(diff)) / 8;
return 256;
}
-#endif
#if OPTIMAL_CMP == 8
# define compare256_rle compare256_rle_8
-#elif defined(HAVE_BUILTIN_CTZLL)
-# define compare256_rle compare256_rle_64
-#elif defined(HAVE_BUILTIN_CTZ)
-# define compare256_rle compare256_rle_32
#else
-# define compare256_rle compare256_rle_16
+# define compare256_rle compare256_rle_64
#endif
/* ===========================================================================
ft.adler32_copy = &adler32_copy_c;
ft.crc32 = &crc32_braid;
ft.crc32_copy = &crc32_copy_braid;
-# ifndef HAVE_BUILTIN_CTZ
- ft.longest_match = &longest_match_c;
- ft.longest_match_slow = &longest_match_slow_c;
- ft.compare256 = &compare256_c;
-# endif
# endif
#else // WITH_ALL_FALLBACKS
ft.adler32 = &adler32_c;
ft.adler32_copy = &adler32_copy_c;
ft.chunkmemset_safe = &chunkmemset_safe_c;
+ ft.compare256 = &compare256_c;
ft.crc32 = &crc32_braid;
ft.crc32_copy = &crc32_copy_braid;
ft.inflate_fast = &inflate_fast_c;
- ft.slide_hash = &slide_hash_c;
ft.longest_match = &longest_match_c;
ft.longest_match_slow = &longest_match_slow_c;
- ft.compare256 = &compare256_c;
+ ft.slide_hash = &slide_hash_c;
#endif
// Select arch-optimized functions
# endif
{
ft.chunkmemset_safe = &chunkmemset_safe_sse2;
+ ft.compare256 = &compare256_sse2;
# if !defined(WITHOUT_CHORBA_SSE)
ft.crc32 = &crc32_chorba_sse2;
ft.crc32_copy = &crc32_copy_chorba_sse2;
# endif
ft.inflate_fast = &inflate_fast_sse2;
- ft.slide_hash = &slide_hash_sse2;
-# ifdef HAVE_BUILTIN_CTZ
- ft.compare256 = &compare256_sse2;
ft.longest_match = &longest_match_sse2;
ft.longest_match_slow = &longest_match_slow_sse2;
-# endif
+ ft.slide_hash = &slide_hash_sse2;
}
#endif
// X86 - SSSE3
ft.adler32 = &adler32_avx2;
ft.adler32_copy = &adler32_copy_avx2;
ft.chunkmemset_safe = &chunkmemset_safe_avx2;
- ft.inflate_fast = &inflate_fast_avx2;
- ft.slide_hash = &slide_hash_avx2;
-# ifdef HAVE_BUILTIN_CTZ
ft.compare256 = &compare256_avx2;
+ ft.inflate_fast = &inflate_fast_avx2;
ft.longest_match = &longest_match_avx2;
ft.longest_match_slow = &longest_match_slow_avx2;
-# endif
+ ft.slide_hash = &slide_hash_avx2;
}
#endif
// X86 - AVX512 (F,DQ,BW,Vl)
ft.adler32 = &adler32_avx512;
ft.adler32_copy = &adler32_copy_avx512;
ft.chunkmemset_safe = &chunkmemset_safe_avx512;
- ft.inflate_fast = &inflate_fast_avx512;
-# ifdef HAVE_BUILTIN_CTZLL
ft.compare256 = &compare256_avx512;
+ ft.inflate_fast = &inflate_fast_avx512;
ft.longest_match = &longest_match_avx512;
ft.longest_match_slow = &longest_match_slow_avx512;
-# endif
}
#endif
#ifdef X86_AVX512VNNI
ft.adler32 = &adler32_neon;
ft.adler32_copy = &adler32_copy_neon;
ft.chunkmemset_safe = &chunkmemset_safe_neon;
- ft.inflate_fast = &inflate_fast_neon;
- ft.slide_hash = &slide_hash_neon;
-# ifdef HAVE_BUILTIN_CTZLL
ft.compare256 = &compare256_neon;
+ ft.inflate_fast = &inflate_fast_neon;
ft.longest_match = &longest_match_neon;
ft.longest_match_slow = &longest_match_slow_neon;
-# endif
+ ft.slide_hash = &slide_hash_neon;
}
#endif
// ARM - CRC32
if (cf.loongarch.has_lsx) {
ft.adler32 = &adler32_lsx;
ft.adler32_copy = &adler32_copy_lsx;
- ft.slide_hash = slide_hash_lsx;
-# ifdef HAVE_BUILTIN_CTZ
+ ft.chunkmemset_safe = &chunkmemset_safe_lsx;
ft.compare256 = &compare256_lsx;
+ ft.inflate_fast = &inflate_fast_lsx;
ft.longest_match = &longest_match_lsx;
ft.longest_match_slow = &longest_match_slow_lsx;
-# endif
- ft.chunkmemset_safe = &chunkmemset_safe_lsx;
- ft.inflate_fast = &inflate_fast_lsx;
+ ft.slide_hash = &slide_hash_lsx;
}
#endif
#ifdef LOONGARCH_LASX
if (cf.loongarch.has_lasx) {
ft.adler32 = &adler32_lasx;
ft.adler32_copy = &adler32_copy_lasx;
- ft.slide_hash = slide_hash_lasx;
-# ifdef HAVE_BUILTIN_CTZ
+ ft.chunkmemset_safe = &chunkmemset_safe_lasx;
ft.compare256 = &compare256_lasx;
+ ft.inflate_fast = &inflate_fast_lasx;
ft.longest_match = &longest_match_lasx;
ft.longest_match_slow = &longest_match_slow_lasx;
-# endif
- ft.chunkmemset_safe = &chunkmemset_safe_lasx;
- ft.inflate_fast = &inflate_fast_lasx;
+ ft.slide_hash = &slide_hash_lasx;
}
#endif
# include "zbuild.h"
# include "arch_functions.h"
# include "../test_cpu_features.h"
-# include "arch/generic/compare256_p.h"
}
#define MAX_COMPARE_SIZE (256 + 64)
BENCHMARK_COMPARE256(native, native_compare256, 1);
#else
+#ifdef WITH_ALL_FALLBACKS
BENCHMARK_COMPARE256(8, compare256_8, 1);
-BENCHMARK_COMPARE256(16, compare256_16, 1);
-#if defined(HAVE_BUILTIN_CTZ)
-BENCHMARK_COMPARE256(32, compare256_32, 1);
-#endif
-#if defined(HAVE_BUILTIN_CTZLL)
BENCHMARK_COMPARE256(64, compare256_64, 1);
#endif
-#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
+#ifdef X86_SSE2
BENCHMARK_COMPARE256(sse2, compare256_sse2, test_cpu_features.x86.has_sse2);
#endif
-#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
+#ifdef X86_AVX2
BENCHMARK_COMPARE256(avx2, compare256_avx2, test_cpu_features.x86.has_avx2);
#endif
-#if defined(X86_AVX512) && defined(HAVE_BUILTIN_CTZLL)
+#ifdef X86_AVX512
BENCHMARK_COMPARE256(avx512, compare256_avx512, test_cpu_features.x86.has_avx512_common);
#endif
-#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
+#ifdef ARM_NEON
BENCHMARK_COMPARE256(neon, compare256_neon, test_cpu_features.arm.has_neon);
#endif
#ifdef POWER9
#ifdef RISCV_RVV
BENCHMARK_COMPARE256(rvv, compare256_rvv, test_cpu_features.riscv.has_rvv);
#endif
-#if defined(LOONGARCH_LSX) && defined(HAVE_BUILTIN_CTZ)
+#ifdef LOONGARCH_LSX
BENCHMARK_COMPARE256(lsx, compare256_lsx, test_cpu_features.loongarch.has_lsx);
#endif
-#if defined(LOONGARCH_LASX) && defined(HAVE_BUILTIN_CTZ)
+#ifdef LOONGARCH_LASX
BENCHMARK_COMPARE256(lasx, compare256_lasx, test_cpu_features.loongarch.has_lasx);
#endif
BENCHMARK_REGISTER_F(compare256_rle, name)->Arg(1)->Arg(10)->Arg(40)->Arg(80)->Arg(100)->Arg(175)->Arg(256);;
BENCHMARK_COMPARE256_RLE(8, compare256_rle_8, 1);
-BENCHMARK_COMPARE256_RLE(16, compare256_rle_16, 1);
-#if defined(HAVE_BUILTIN_CTZ)
-BENCHMARK_COMPARE256_RLE(32, compare256_rle_32, 1);
-#endif
-#if defined(HAVE_BUILTIN_CTZLL)
BENCHMARK_COMPARE256_RLE(64, compare256_rle_64, 1);
-#endif
# include "zutil.h"
# include "arch_functions.h"
# include "test_cpu_features.h"
-# include "arch/generic/compare256_p.h"
}
#include <gtest/gtest.h>
TEST_COMPARE256(native, native_compare256, 1)
#else
+#ifdef WITH_ALL_FALLBACKS
TEST_COMPARE256(8, compare256_8, 1)
-TEST_COMPARE256(16, compare256_16, 1)
-#if defined(HAVE_BUILTIN_CTZ)
-TEST_COMPARE256(32, compare256_32, 1)
-#endif
-#if defined(HAVE_BUILTIN_CTZLL)
TEST_COMPARE256(64, compare256_64, 1)
#endif
-#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
+#ifdef X86_SSE2
TEST_COMPARE256(sse2, compare256_sse2, test_cpu_features.x86.has_sse2)
#endif
-#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
+#ifdef X86_AVX2
TEST_COMPARE256(avx2, compare256_avx2, test_cpu_features.x86.has_avx2)
#endif
-#if defined(X86_AVX512) && defined(HAVE_BUILTIN_CTZLL)
+#ifdef X86_AVX512
TEST_COMPARE256(avx512, compare256_avx512, test_cpu_features.x86.has_avx512_common)
#endif
-#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
+#ifdef ARM_NEON
TEST_COMPARE256(neon, compare256_neon, test_cpu_features.arm.has_neon)
#endif
#ifdef POWER9
#ifdef RISCV_RVV
TEST_COMPARE256(rvv, compare256_rvv, test_cpu_features.riscv.has_rvv)
#endif
-#if defined(LOONGARCH_LSX) && defined(HAVE_BUILTIN_CTZ)
+#ifdef LOONGARCH_LSX
TEST_COMPARE256(lsx, compare256_lsx, test_cpu_features.loongarch.has_lsx)
#endif
-#if defined(LOONGARCH_LASX) && defined(HAVE_BUILTIN_CTZ)
+#ifdef LOONGARCH_LASX
TEST_COMPARE256(lasx, compare256_lasx, test_cpu_features.loongarch.has_lasx)
#endif
}
TEST_COMPARE256_RLE(8, compare256_rle_8, 1)
-TEST_COMPARE256_RLE(16, compare256_rle_16, 1)
-#if defined(HAVE_BUILTIN_CTZ)
-TEST_COMPARE256_RLE(32, compare256_rle_32, 1)
-#endif
-#if defined(HAVE_BUILTIN_CTZLL)
TEST_COMPARE256_RLE(64, compare256_rle_64, 1)
-#endif