From: Hans Kristian Rosbach
Date: Fri, 20 Dec 2024 22:31:37 +0000 (+0100)
Subject: Continued cleanup of old UNALIGNED_OK checks
X-Git-Tag: 2.2.3~1
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=bf05e882b807e35ab4d4ac2b6456bd380d68f67d;p=thirdparty%2Fzlib-ng.git

Continued cleanup of old UNALIGNED_OK checks
- Remove obsolete checks
- Fix checks that are inconsistent
- Stop compiling compare256/longest_match variants that never get called
- Improve how the generic compare256 functions are handled
- Allow overriding OPTIMAL_CMP

This simplifies the code and avoids having a lot of code in the compiled
library that can never get executed.
---
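One practical consequence of the last bullet: zbuild.h now only computes
OPTIMAL_CMP when it is not already defined, so the comparison width can be
pinned from the build system. A minimal sketch, assuming a CMake build where
the define is passed through the C flags (the value 16 is illustrative):

    # Force 16-bit compares regardless of the detected architecture
    cmake -B build -DCMAKE_C_FLAGS="-DOPTIMAL_CMP=16" .
    cmake --build build

The accepted values are 8, 16, 32 and 64, documented in the zbuild.h hunk
near the end of this patch.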
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 01edc153..b97c2021 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1074,6 +1074,9 @@ set(ZLIB_PUBLIC_HDRS
     ${CMAKE_CURRENT_BINARY_DIR}/zlib${SUFFIX}.h
 )
 set(ZLIB_PRIVATE_HDRS
+    arch/generic/chunk_permute_table.h
+    arch/generic/compare256_p.h
+    arch/generic/generic_functions.h
     adler32_p.h
     chunkset_tpl.h
     compare256_rle.h
diff --git a/arch/generic/Makefile.in b/arch/generic/Makefile.in
index 15d51d31..2522d7d0 100644
--- a/arch/generic/Makefile.in
+++ b/arch/generic/Makefile.in
@@ -40,10 +40,10 @@ chunkset_c.o: $(SRCDIR)/chunkset_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/chunkset_tpl.
 chunkset_c.lo: $(SRCDIR)/chunkset_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/chunkset_tpl.h $(SRCTOP)/inffast_tpl.h
 	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_c.c
 
-compare256_c.o: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/zmemory.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h
+compare256_c.o: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCDIR)/compare256_p.h $(SRCTOP)/zmemory.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h
 	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_c.c
 
-compare256_c.lo: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/zmemory.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h
+compare256_c.lo: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCDIR)/compare256_p.h $(SRCTOP)/zmemory.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h
 	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_c.c
 
 crc32_braid_c.o: $(SRCDIR)/crc32_braid_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/crc32_braid_p.h $(SRCTOP)/crc32_braid_tbl.h
diff --git a/arch/generic/compare256_c.c b/arch/generic/compare256_c.c
index bdcb8139..ad535523 100644
--- a/arch/generic/compare256_c.c
+++ b/arch/generic/compare256_c.c
@@ -4,187 +4,28 @@
  */
 
 #include "zbuild.h"
-#include "zmemory.h"
-#include "deflate.h"
-#include "fallback_builtins.h"
-
-/* ALIGNED, byte comparison */
-static inline uint32_t compare256_c_static(const uint8_t *src0, const uint8_t *src1) {
-    uint32_t len = 0;
-
-    do {
-        if (*src0 != *src1)
-            return len;
-        src0 += 1, src1 += 1, len += 1;
-        if (*src0 != *src1)
-            return len;
-        src0 += 1, src1 += 1, len += 1;
-        if (*src0 != *src1)
-            return len;
-        src0 += 1, src1 += 1, len += 1;
-        if (*src0 != *src1)
-            return len;
-        src0 += 1, src1 += 1, len += 1;
-        if (*src0 != *src1)
-            return len;
-        src0 += 1, src1 += 1, len += 1;
-        if (*src0 != *src1)
-            return len;
-        src0 += 1, src1 += 1, len += 1;
-        if (*src0 != *src1)
-            return len;
-        src0 += 1, src1 += 1, len += 1;
-        if (*src0 != *src1)
-            return len;
-        src0 += 1, src1 += 1, len += 1;
-    } while (len < 256);
-
-    return 256;
-}
+#include "compare256_p.h"
+
+// Set optimal COMPARE256 function variant
+#if OPTIMAL_CMP == 8
+#  define COMPARE256 compare256_8
+#elif defined(HAVE_BUILTIN_CTZLL)
+#  define COMPARE256 compare256_64
+#elif defined(HAVE_BUILTIN_CTZ)
+#  define COMPARE256 compare256_32
+#else
+#  define COMPARE256 compare256_16
+#endif
 
 Z_INTERNAL uint32_t compare256_c(const uint8_t *src0, const uint8_t *src1) {
-    return compare256_c_static(src0, src1);
+    return COMPARE256(src0, src1);
 }
 
+// Generate longest_match_c
 #define LONGEST_MATCH longest_match_c
-#define COMPARE256 compare256_c_static
-
 #include "match_tpl.h"
 
+// Generate longest_match_slow_c
 #define LONGEST_MATCH_SLOW
 #define LONGEST_MATCH longest_match_slow_c
-#define COMPARE256 compare256_c_static
-
-#include "match_tpl.h"
-
-#if OPTIMAL_CMP >= 32
-
-/* 16-bit unaligned integer comparison */
-static inline uint32_t compare256_16_static(const uint8_t *src0, const uint8_t *src1) {
-    uint32_t len = 0;
-
-    do {
-        if (zng_memcmp_2(src0, src1) != 0)
-            return len + (*src0 == *src1);
-        src0 += 2, src1 += 2, len += 2;
-
-        if (zng_memcmp_2(src0, src1) != 0)
-            return len + (*src0 == *src1);
-        src0 += 2, src1 += 2, len += 2;
-
-        if (zng_memcmp_2(src0, src1) != 0)
-            return len + (*src0 == *src1);
-        src0 += 2, src1 += 2, len += 2;
-
-        if (zng_memcmp_2(src0, src1) != 0)
-            return len + (*src0 == *src1);
-        src0 += 2, src1 += 2, len += 2;
-    } while (len < 256);
-
-    return 256;
-}
-
-Z_INTERNAL uint32_t compare256_16(const uint8_t *src0, const uint8_t *src1) {
-    return compare256_16_static(src0, src1);
-}
-
-#define LONGEST_MATCH longest_match_16
-#define COMPARE256 compare256_16_static
-
 #include "match_tpl.h"
-
-#define LONGEST_MATCH_SLOW
-#define LONGEST_MATCH longest_match_slow_16
-#define COMPARE256 compare256_16_static
-
-#include "match_tpl.h"
-
-#ifdef HAVE_BUILTIN_CTZ
-/* 32-bit unaligned integer comparison */
-static inline uint32_t compare256_32_static(const uint8_t *src0, const uint8_t *src1) {
-    uint32_t len = 0;
-
-    do {
-        uint32_t sv, mv, diff;
-
-        sv = zng_memread_4(src0);
-        mv = zng_memread_4(src1);
-
-        diff = sv ^ mv;
-        if (diff) {
-#if BYTE_ORDER == LITTLE_ENDIAN
-            uint32_t match_byte = __builtin_ctz(diff) / 8;
-#else
-            uint32_t match_byte = __builtin_clz(diff) / 8;
-#endif
-            return len + match_byte;
-        }
-
-        src0 += 4, src1 += 4, len += 4;
-    } while (len < 256);
-
-    return 256;
-}
-
-Z_INTERNAL uint32_t compare256_32(const uint8_t *src0, const uint8_t *src1) {
-    return compare256_32_static(src0, src1);
-}
-
-#define LONGEST_MATCH longest_match_32
-#define COMPARE256 compare256_32_static
-
-#include "match_tpl.h"
-
-#define LONGEST_MATCH_SLOW
-#define LONGEST_MATCH longest_match_slow_32
-#define COMPARE256 compare256_32_static
-
-#include "match_tpl.h"
-
-#endif
-
-#if defined(HAVE_BUILTIN_CTZLL) && OPTIMAL_CMP >= 64
-/* 64-bit integer comparison */
-static inline uint32_t compare256_64_static(const uint8_t *src0, const uint8_t *src1) {
-    uint32_t len = 0;
-
-    do {
-        uint64_t sv, mv, diff;
-
-        sv = zng_memread_8(src0);
-        mv = zng_memread_8(src1);
-
-        diff = sv ^ mv;
-        if (diff) {
-#if BYTE_ORDER == LITTLE_ENDIAN
-            uint64_t match_byte = __builtin_ctzll(diff) / 8;
-#else
-            uint64_t match_byte = __builtin_clzll(diff) / 8;
-#endif
-            return len + (uint32_t)match_byte;
-        }
-
-        src0 += 8, src1 += 8, len += 8;
-    } while (len < 256);
-
-    return 256;
-}
-
-Z_INTERNAL uint32_t compare256_64(const uint8_t *src0, const uint8_t *src1) {
-    return compare256_64_static(src0, src1);
-}
-
-#define LONGEST_MATCH longest_match_64
-#define COMPARE256 compare256_64_static
-
-#include "match_tpl.h"
-
-#define LONGEST_MATCH_SLOW
-#define LONGEST_MATCH longest_match_slow_64
-#define COMPARE256 compare256_64_static
-
-#include "match_tpl.h"
-
-#endif
-
-#endif
diff --git a/arch/generic/compare256_p.h b/arch/generic/compare256_p.h
new file mode 100644
index 00000000..ac934841
--- /dev/null
+++ b/arch/generic/compare256_p.h
@@ -0,0 +1,123 @@
+/* compare256_p.h -- 256 byte memory comparison with match length return
+ * Copyright (C) 2020 Nathan Moinvaziri
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zmemory.h"
+#include "deflate.h"
+#include "fallback_builtins.h"
+
+/* 8-bit integer comparison */
+static inline uint32_t compare256_8(const uint8_t *src0, const uint8_t *src1) {
+    uint32_t len = 0;
+
+    do {
+        if (*src0 != *src1)
+            return len;
+        src0 += 1, src1 += 1, len += 1;
+        if (*src0 != *src1)
+            return len;
+        src0 += 1, src1 += 1, len += 1;
+        if (*src0 != *src1)
+            return len;
+        src0 += 1, src1 += 1, len += 1;
+        if (*src0 != *src1)
+            return len;
+        src0 += 1, src1 += 1, len += 1;
+        if (*src0 != *src1)
+            return len;
+        src0 += 1, src1 += 1, len += 1;
+        if (*src0 != *src1)
+            return len;
+        src0 += 1, src1 += 1, len += 1;
+        if (*src0 != *src1)
+            return len;
+        src0 += 1, src1 += 1, len += 1;
+        if (*src0 != *src1)
+            return len;
+        src0 += 1, src1 += 1, len += 1;
+    } while (len < 256);
+
+    return 256;
+}
+
+/* 16-bit integer comparison */
+static inline uint32_t compare256_16(const uint8_t *src0, const uint8_t *src1) {
+    uint32_t len = 0;
+
+    do {
+        if (zng_memcmp_2(src0, src1) != 0)
+            return len + (*src0 == *src1);
+        src0 += 2, src1 += 2, len += 2;
+
+        if (zng_memcmp_2(src0, src1) != 0)
+            return len + (*src0 == *src1);
+        src0 += 2, src1 += 2, len += 2;
+
+        if (zng_memcmp_2(src0, src1) != 0)
+            return len + (*src0 == *src1);
+        src0 += 2, src1 += 2, len += 2;
+
+        if (zng_memcmp_2(src0, src1) != 0)
+            return len + (*src0 == *src1);
+        src0 += 2, src1 += 2, len += 2;
+    } while (len < 256);
+
+    return 256;
+}
+
+#ifdef HAVE_BUILTIN_CTZ
+/* 32-bit integer comparison */
+static inline uint32_t compare256_32(const uint8_t *src0, const uint8_t *src1) {
+    uint32_t len = 0;
+
+    do {
+        uint32_t sv, mv, diff;
+
+        sv = zng_memread_4(src0);
+        mv = zng_memread_4(src1);
+
+        diff = sv ^ mv;
+        if (diff) {
+#  if BYTE_ORDER == LITTLE_ENDIAN
+            uint32_t match_byte = __builtin_ctz(diff) / 8;
+#  else
+            uint32_t match_byte = __builtin_clz(diff) / 8;
+#  endif
+            return len + match_byte;
+        }
+
+        src0 += 4, src1 += 4, len += 4;
+    } while (len < 256);
+
+    return 256;
+}
+#endif
+
+#ifdef HAVE_BUILTIN_CTZLL
+/* 64-bit integer comparison */
+static inline uint32_t compare256_64(const uint8_t *src0, const uint8_t *src1) {
+    uint32_t len = 0;
+
+    do {
+        uint64_t sv, mv, diff;
+
+        sv = zng_memread_8(src0);
+        mv = zng_memread_8(src1);
+
+        diff = sv ^ mv;
+        if (diff) {
+#  if BYTE_ORDER == LITTLE_ENDIAN
+            uint64_t match_byte = __builtin_ctzll(diff) / 8;
+#  else
+            uint64_t match_byte = __builtin_clzll(diff) / 8;
+#  endif
+            return len + (uint32_t)match_byte;
+        }
+
+        src0 += 8, src1 += 8, len += 8;
+    } while (len < 256);
+
+    return 256;
+}
+#endif
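How the ctz variants in the new header find the first mismatching byte: XOR-ing
the two loaded words leaves nonzero bits only where the inputs differ, so on
little-endian the count of trailing zero bits, divided by 8, is the index of
the first differing byte. A worked example (values are illustrative):

    uint32_t sv   = 0x11223344;  /* bytes in memory (LE): 44 33 22 11 */
    uint32_t mv   = 0x11FF3344;  /* differs from sv at byte index 2 */
    uint32_t diff = sv ^ mv;     /* 0x00DD0000, lowest set bit is bit 16 */
    /* __builtin_ctz(diff) / 8 == 16 / 8 == 2, so len + 2 is returned */

On big-endian targets the first differing byte sits at the most significant
end of the word, hence the __builtin_clz variant.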
diff --git a/arch/generic/generic_functions.h b/arch/generic/generic_functions.h
index 9fa31a88..b0366bae 100644
--- a/arch/generic/generic_functions.h
+++ b/arch/generic/generic_functions.h
@@ -28,15 +28,6 @@ void inflate_fast_c(PREFIX3(stream) *strm, uint32_t start);
 uint32_t PREFIX(crc32_braid)(uint32_t crc, const uint8_t *buf, size_t len);
 
 uint32_t compare256_c(const uint8_t *src0, const uint8_t *src1);
-#if OPTIMAL_CMP >= 32
-    uint32_t compare256_16(const uint8_t *src0, const uint8_t *src1);
-#  ifdef HAVE_BUILTIN_CTZ
-    uint32_t compare256_32(const uint8_t *src0, const uint8_t *src1);
-#  endif
-#  if defined(HAVE_BUILTIN_CTZLL) && OPTIMAL_CMP >= 64
-    uint32_t compare256_64(const uint8_t *src0, const uint8_t *src1);
-#  endif
-#endif
 
 typedef void (*slide_hash_func)(deflate_state *s);
 
@@ -44,41 +35,6 @@ void slide_hash_c(deflate_state *s);
 
 uint32_t longest_match_c(deflate_state *const s, Pos cur_match);
 uint32_t longest_match_slow_c(deflate_state *const s, Pos cur_match);
-#if OPTIMAL_CMP >= 32
-    uint32_t longest_match_16(deflate_state *const s, Pos cur_match);
-    uint32_t longest_match_slow_16(deflate_state *const s, Pos cur_match);
-#  ifdef HAVE_BUILTIN_CTZ
-    uint32_t longest_match_32(deflate_state *const s, Pos cur_match);
-    uint32_t longest_match_slow_32(deflate_state *const s, Pos cur_match);
-#  endif
-#  if defined(HAVE_BUILTIN_CTZLL) && OPTIMAL_CMP >= 64
-    uint32_t longest_match_64(deflate_state *const s, Pos cur_match);
-    uint32_t longest_match_slow_64(deflate_state *const s, Pos cur_match);
-#  endif
-#endif
-
-
-// Select generic implementation for longest_match, longest_match_slow, longest_match_slow functions.
-#if OPTIMAL_CMP >= 32
-#  if defined(HAVE_BUILTIN_CTZLL) && OPTIMAL_CMP >= 64
-#    define longest_match_generic longest_match_64
-#    define longest_match_slow_generic longest_match_slow_64
-#    define compare256_generic compare256_64
-#  elif defined(HAVE_BUILTIN_CTZ)
-#    define longest_match_generic longest_match_32
-#    define longest_match_slow_generic longest_match_slow_32
-#    define compare256_generic compare256_32
-#  else
-#    define longest_match_generic longest_match_16
-#    define longest_match_slow_generic longest_match_slow_16
-#    define compare256_generic compare256_16
-#  endif
-#else
-#  define longest_match_generic longest_match_c
-#  define longest_match_slow_generic longest_match_slow_c
-#  define compare256_generic compare256_c
-#endif
-
 #ifdef DISABLE_RUNTIME_CPU_DETECTION
 // Generic code
 
@@ -93,9 +49,9 @@ uint32_t longest_match_slow_c(deflate_state *const s, Pos cur_match);
 #  define native_crc32_fold_reset crc32_fold_reset_c
 #  define native_inflate_fast inflate_fast_c
 #  define native_slide_hash slide_hash_c
-#  define native_longest_match longest_match_generic
-#  define native_longest_match_slow longest_match_slow_generic
-#  define native_compare256 compare256_generic
+#  define native_longest_match longest_match_c
+#  define native_longest_match_slow longest_match_slow_c
+#  define native_compare256 compare256_c
 #endif
 
 #endif
diff --git a/compare256_rle.h b/compare256_rle.h
index 0c80d962..5edfd734 100644
--- a/compare256_rle.h
+++ b/compare256_rle.h
@@ -6,12 +6,11 @@
 #include "zbuild.h"
 #include "zmemory.h"
 #include "fallback_builtins.h"
-#include "zendian.h"
 
 typedef uint32_t (*compare256_rle_func)(const uint8_t* src0, const uint8_t* src1);
 
-/* ALIGNED, byte comparison */
-static inline uint32_t compare256_rle_c(const uint8_t *src0, const uint8_t *src1) {
+/* 8-bit integer comparison */
+static inline uint32_t compare256_rle_8(const uint8_t *src0, const uint8_t *src1) {
     uint32_t len = 0;
 
     do {
@@ -44,8 +43,7 @@ static inline uint32_t compare256_rle_c(const uint8_t *src0, const uint8_t *src1
     return 256;
 }
 
-#if OPTIMAL_CMP >= 32
-/* 16-bit unaligned integer comparison */
+/* 16-bit integer comparison */
 static inline uint32_t compare256_rle_16(const uint8_t *src0, const uint8_t *src1) {
     uint32_t len = 0;
     uint16_t src0_cmp;
@@ -71,7 +69,7 @@ static inline uint32_t compare256_rle_16(const uint8_t *src0, const uint8_t *src
 }
 
 #ifdef HAVE_BUILTIN_CTZ
-/* 32-bit unaligned integer comparison */
+/* 32-bit integer comparison */
 static inline uint32_t compare256_rle_32(const uint8_t *src0, const uint8_t *src1) {
     uint32_t sv, len = 0;
     uint16_t src0_cmp;
@@ -99,11 +97,10 @@ static inline uint32_t compare256_rle_32(const uint8_t *src0, const uint8_t *src
 
     return 256;
 }
-
 #endif
 
-#if defined(HAVE_BUILTIN_CTZLL) && OPTIMAL_CMP >= 64
-/* 64-bit unaligned integer comparison */
+#ifdef HAVE_BUILTIN_CTZLL
+/* 64-bit integer comparison */
 static inline uint32_t compare256_rle_64(const uint8_t *src0, const uint8_t *src1) {
     uint32_t src0_cmp32, len = 0;
     uint16_t src0_cmp;
@@ -133,8 +130,4 @@ static inline uint32_t compare256_rle_64(const uint8_t *src0, const uint8_t *src
 
     return 256;
 }
-
 #endif
-
-#endif
-
diff --git a/deflate_rle.c b/deflate_rle.c
index 8c554457..9e398104 100644
--- a/deflate_rle.c
+++ b/deflate_rle.c
@@ -5,21 +5,19 @@
  */
 
 #include "zbuild.h"
-#include "compare256_rle.h"
 #include "deflate.h"
 #include "deflate_p.h"
 #include "functable.h"
+#include "compare256_rle.h"
 
-#if OPTIMAL_CMP >= 32
-#  if defined(HAVE_BUILTIN_CTZLL) && OPTIMAL_CMP >= 64
-#    define compare256_rle compare256_rle_64
-#  elif defined(HAVE_BUILTIN_CTZ)
-#    define compare256_rle compare256_rle_32
-#  else
-#    define compare256_rle compare256_rle_16
-#  endif
+#if OPTIMAL_CMP == 8
+#  define compare256_rle compare256_rle_8
+#elif defined(HAVE_BUILTIN_CTZLL)
+#  define compare256_rle compare256_rle_64
+#elif defined(HAVE_BUILTIN_CTZ)
+#  define compare256_rle compare256_rle_32
 #else
-#  define compare256_rle compare256_rle_c
+#  define compare256_rle compare256_rle_16
 #endif
 
 /* ===========================================================================
diff --git a/functable.c b/functable.c
index 9c114568..3cee95bf 100644
--- a/functable.c
+++ b/functable.c
@@ -61,9 +61,9 @@ static void init_functable(void) {
     ft.crc32_fold_reset = &crc32_fold_reset_c;
     ft.inflate_fast = &inflate_fast_c;
     ft.slide_hash = &slide_hash_c;
-    ft.longest_match = &longest_match_generic;
-    ft.longest_match_slow = &longest_match_slow_generic;
-    ft.compare256 = &compare256_generic;
+    ft.longest_match = &longest_match_c;
+    ft.longest_match_slow = &longest_match_slow_c;
+    ft.compare256 = &compare256_c;
 
     // Select arch-optimized functions
diff --git a/match_tpl.h b/match_tpl.h
index 5d00bd01..47e9aed9 100644
--- a/match_tpl.h
+++ b/match_tpl.h
@@ -22,6 +22,9 @@
  * IN assertions: cur_match is the head of the hash chain for the current
  *                string (strstart) and its distance is <= MAX_DIST, and prev_length >=1
  * OUT assertion: the match length is not greater than s->lookahead
+ *
+ * The LONGEST_MATCH_SLOW variant spends more time to attempt to find longer
+ * matches once a match has already been found.
  */
 Z_INTERNAL uint32_t LONGEST_MATCH(deflate_state *const s, Pos cur_match) {
     unsigned int strstart = s->strstart;
@@ -40,15 +43,8 @@ Z_INTERNAL uint32_t LONGEST_MATCH(deflate_state *const s, Pos cur_match) {
     uint32_t chain_length, nice_match, best_len, offset;
     uint32_t lookahead = s->lookahead;
     Pos match_offset = 0;
-#if OPTIMAL_CMP >= 64
     uint64_t scan_start;
     uint64_t scan_end;
-#elif OPTIMAL_CMP >= 32
-    uint32_t scan_start;
-    uint32_t scan_end;
-#else
-    uint8_t scan_end[8];
-#endif
 
 #define GOTO_NEXT_CHAIN \
     if (--chain_length && (cur_match = prev[cur_match & wmask]) > limit) \
@@ -64,26 +60,14 @@ Z_INTERNAL uint32_t LONGEST_MATCH(deflate_state *const s, Pos cur_match) {
      * to find the next best match length.
      */
     offset = best_len-1;
-#if OPTIMAL_CMP >= 32
     if (best_len >= sizeof(uint32_t)) {
         offset -= 2;
-#if OPTIMAL_CMP >= 64
         if (best_len >= sizeof(uint64_t))
             offset -= 4;
-#endif
     }
-#endif
 
-#if OPTIMAL_CMP >= 64
     scan_start = zng_memread_8(scan);
     scan_end = zng_memread_8(scan+offset);
-#elif OPTIMAL_CMP >= 32
-    scan_start = zng_memread_4(scan);
-    scan_end = zng_memread_4(scan+offset);
-#else
-    scan_end[0] = *(scan+offset);
-    scan_end[1] = *(scan+offset+1);
-#endif
     mbase_end = (mbase_start+offset);
 
     /* Do not waste too much time if we already have a good match */
@@ -143,7 +127,6 @@ Z_INTERNAL uint32_t LONGEST_MATCH(deflate_state *const s, Pos cur_match) {
      * that depend on those values. However the length of the match is limited to the
      * lookahead, so the output of deflate is not affected by the uninitialized values.
      */
-#if OPTIMAL_CMP >= 32
     if (best_len < sizeof(uint32_t)) {
         for (;;) {
             if (zng_memcmp_2(mbase_end+cur_match, &scan_end) == 0 &&
@@ -151,7 +134,6 @@ Z_INTERNAL uint32_t LONGEST_MATCH(deflate_state *const s, Pos cur_match) {
                 break;
             GOTO_NEXT_CHAIN;
         }
-#  if OPTIMAL_CMP >= 64
     } else if (best_len >= sizeof(uint64_t)) {
         for (;;) {
             if (zng_memcmp_8(mbase_end+cur_match, &scan_end) == 0 &&
@@ -159,7 +141,6 @@ Z_INTERNAL uint32_t LONGEST_MATCH(deflate_state *const s, Pos cur_match) {
                 break;
             GOTO_NEXT_CHAIN;
         }
-#  endif
     } else {
         for (;;) {
             if (zng_memcmp_4(mbase_end+cur_match, &scan_end) == 0 &&
@@ -168,14 +149,6 @@ Z_INTERNAL uint32_t LONGEST_MATCH(deflate_state *const s, Pos cur_match) {
             GOTO_NEXT_CHAIN;
         }
     }
-#else
-    for (;;) {
-        if (mbase_end[cur_match] == scan_end[0] && mbase_end[cur_match+1] == scan_end[1] &&
-            mbase_start[cur_match] == scan[0] && mbase_start[cur_match+1] == scan[1])
-            break;
-        GOTO_NEXT_CHAIN;
-    }
-#endif
 
     uint32_t len = COMPARE256(scan+2, mbase_start+cur_match+2) + 2;
     Assert(scan+len <= window+(unsigned)(s->window_size-1), "wild scan");
@@ -191,24 +164,13 @@ Z_INTERNAL uint32_t LONGEST_MATCH(deflate_state *const s, Pos cur_match) {
             return best_len;
 
         offset = best_len-1;
-#if OPTIMAL_CMP >= 32
         if (best_len >= sizeof(uint32_t)) {
             offset -= 2;
-#if OPTIMAL_CMP >= 64
             if (best_len >= sizeof(uint64_t))
                 offset -= 4;
-#endif
         }
-#endif
 
-#if OPTIMAL_CMP >= 64
         scan_end = zng_memread_8(scan+offset);
-#elif OPTIMAL_CMP >= 32
-        scan_end = zng_memread_4(scan+offset);
-#else
-        scan_end[0] = *(scan+offset);
-        scan_end[1] = *(scan+offset+1);
-#endif
 
 #ifdef LONGEST_MATCH_SLOW
         /* Look for a better string offset */
@@ -286,4 +248,3 @@ break_matching:
 
 #undef LONGEST_MATCH_SLOW
 #undef LONGEST_MATCH
-#undef COMPARE256
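The template pattern shown in the compare256_c.c hunk above: a backend defines
COMPARE256 once per translation unit and then instantiates the matcher twice,
redefining LONGEST_MATCH between includes (match_tpl.h still undefines
LONGEST_MATCH and LONGEST_MATCH_SLOW at the end, but no longer COMPARE256).
A minimal sketch of a hypothetical arch backend (the *_mysimd names are
illustrative):

    #define COMPARE256 compare256_mysimd_static

    // Generate longest_match_mysimd
    #define LONGEST_MATCH longest_match_mysimd
    #include "match_tpl.h"

    // Generate longest_match_slow_mysimd
    #define LONGEST_MATCH_SLOW
    #define LONGEST_MATCH longest_match_slow_mysimd
    #include "match_tpl.h"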
diff --git a/test/benchmarks/benchmark_compare256.cc b/test/benchmarks/benchmark_compare256.cc
index 22c9b4f1..c27bff13 100644
--- a/test/benchmarks/benchmark_compare256.cc
+++ b/test/benchmarks/benchmark_compare256.cc
@@ -12,6 +12,7 @@ extern "C" {
 #  include "zutil_p.h"
 #  include "arch_functions.h"
 #  include "../test_cpu_features.h"
+#  include "arch/generic/compare256_p.h"
 }
 
 #define MAX_COMPARE_SIZE (256)
@@ -60,21 +61,19 @@ public:
     } \
     BENCHMARK_REGISTER_F(compare256, name)->Range(1, MAX_COMPARE_SIZE);
 
-BENCHMARK_COMPARE256(c, compare256_c, 1);
-
 #ifdef DISABLE_RUNTIME_CPU_DETECTION
 BENCHMARK_COMPARE256(native, native_compare256, 1);
 #else
 
-#if BYTE_ORDER == LITTLE_ENDIAN && OPTIMAL_CMP >= 32
+BENCHMARK_COMPARE256(8, compare256_8, 1);
 BENCHMARK_COMPARE256(16, compare256_16, 1);
-#  if defined(HAVE_BUILTIN_CTZ)
+#if defined(HAVE_BUILTIN_CTZ)
 BENCHMARK_COMPARE256(32, compare256_32, 1);
-#  endif
-#  if defined(HAVE_BUILTIN_CTZLL) && OPTIMAL_CMP >= 64
+#endif
+#if defined(HAVE_BUILTIN_CTZLL)
 BENCHMARK_COMPARE256(64, compare256_64, 1);
-#  endif
 #endif
+
 #if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
 BENCHMARK_COMPARE256(sse2, compare256_sse2, test_cpu_features.x86.has_sse2);
 #endif
diff --git a/test/benchmarks/benchmark_compare256_rle.cc b/test/benchmarks/benchmark_compare256_rle.cc
index 82441629..33e39034 100644
--- a/test/benchmarks/benchmark_compare256_rle.cc
+++ b/test/benchmarks/benchmark_compare256_rle.cc
@@ -59,14 +59,11 @@ public:
     } \
     BENCHMARK_REGISTER_F(compare256_rle, name)->Range(1, MAX_COMPARE_SIZE);
 
-BENCHMARK_COMPARE256_RLE(c, compare256_rle_c, 1);
-
-#if BYTE_ORDER == LITTLE_ENDIAN && OPTIMAL_CMP >= 32
+BENCHMARK_COMPARE256_RLE(8, compare256_rle_8, 1);
 BENCHMARK_COMPARE256_RLE(16, compare256_rle_16, 1);
-#  if defined(HAVE_BUILTIN_CTZ)
+#if defined(HAVE_BUILTIN_CTZ)
 BENCHMARK_COMPARE256_RLE(32, compare256_rle_32, 1);
-#  endif
-#  if defined(HAVE_BUILTIN_CTZLL) && OPTIMAL_CMP >= 64
+#endif
+#if defined(HAVE_BUILTIN_CTZLL)
 BENCHMARK_COMPARE256_RLE(64, compare256_rle_64, 1);
-#  endif
 #endif
diff --git a/test/test_compare256.cc b/test/test_compare256.cc
index da25a75c..035e63c9 100644
--- a/test/test_compare256.cc
+++ b/test/test_compare256.cc
@@ -12,6 +12,7 @@ extern "C" {
 #  include "zutil.h"
 #  include "arch_functions.h"
 #  include "test_cpu_features.h"
+#  include "arch/generic/compare256_p.h"
 }
 
 #include <gtest/gtest.h>
@@ -59,20 +60,17 @@ static inline void compare256_match_check(compare256_func compare256) {
         compare256_match_check(func); \
     }
 
-TEST_COMPARE256(c, compare256_c, 1)
-
 #ifdef DISABLE_RUNTIME_CPU_DETECTION
 TEST_COMPARE256(native, native_compare256, 1)
 #else
 
-#if BYTE_ORDER == LITTLE_ENDIAN && OPTIMAL_CMP >= 32
+TEST_COMPARE256(8, compare256_8, 1)
 TEST_COMPARE256(16, compare256_16, 1)
-#  if defined(HAVE_BUILTIN_CTZ)
+#if defined(HAVE_BUILTIN_CTZ)
 TEST_COMPARE256(32, compare256_32, 1)
-#  endif
-#  if defined(HAVE_BUILTIN_CTZLL) && OPTIMAL_CMP >= 64
+#endif
+#if defined(HAVE_BUILTIN_CTZLL)
 TEST_COMPARE256(64, compare256_64, 1)
-#  endif
 #endif
 
 #if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
diff --git a/test/test_compare256_rle.cc b/test/test_compare256_rle.cc
index 65e80a56..dc531195 100644
--- a/test/test_compare256_rle.cc
+++ b/test/test_compare256_rle.cc
@@ -50,14 +50,11 @@ static inline void compare256_rle_match_check(compare256_rle_func compare256_rle
         compare256_rle_match_check(func); \
     }
 
-TEST_COMPARE256_RLE(c, compare256_rle_c, 1)
-
-#if BYTE_ORDER == LITTLE_ENDIAN && OPTIMAL_CMP >= 32
+TEST_COMPARE256_RLE(8, compare256_rle_8, 1)
 TEST_COMPARE256_RLE(16, compare256_rle_16, 1)
-#  if defined(HAVE_BUILTIN_CTZ)
+#if defined(HAVE_BUILTIN_CTZ)
 TEST_COMPARE256_RLE(32, compare256_rle_32, 1)
-#  endif
-#  if defined(HAVE_BUILTIN_CTZLL) && OPTIMAL_CMP >= 64
+#endif
+#if defined(HAVE_BUILTIN_CTZLL)
 TEST_COMPARE256_RLE(64, compare256_rle_64, 1)
-#  endif
 #endif
diff --git a/zbuild.h b/zbuild.h
index 623ff7f7..157ab6ff 100644
--- a/zbuild.h
+++ b/zbuild.h
@@ -243,36 +243,43 @@
 #  define Tracecv(c, x)
 #endif
 
-#if defined(__x86_64__) || defined(_M_X64) || defined(__amd64__) || defined(_M_AMD64)
-#  define OPTIMAL_CMP 64
-#elif defined(__i386__) || defined(__i486__) || defined(__i586__) || \
-      defined(__i686__) || defined(_X86_) || defined(_M_IX86)
-#  define OPTIMAL_CMP 32
-#elif defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
-#  if defined(__ARM_FEATURE_UNALIGNED) || defined(_WIN32)
+/* OPTIMAL_CMP values determine the comparison width:
+ * 64: Best for 64-bit architectures with unaligned access
+ * 32: Best for 32-bit architectures with unaligned access
+ * 16: Safe default for unknown architectures
+ *  8: Safe fallback for architectures without unaligned access
+ * Note: The unaligned access mentioned is cpu-support, this allows compiler or
+ * separate unaligned intrinsics to utilize safe unaligned access, without
+ * utilizing unaligned C pointers that are known to have undefined behavior.
+ */
+#if !defined(OPTIMAL_CMP)
+#  if defined(__x86_64__) || defined(_M_X64) || defined(__amd64__) || defined(_M_AMD64)
 #    define OPTIMAL_CMP 64
-#  else
-#    define OPTIMAL_CMP 8
-#  endif
-#elif defined(__arm__) || defined(_M_ARM)
-#  if defined(__ARM_FEATURE_UNALIGNED) || defined(_WIN32)
+#  elif defined(__i386__) || defined(__i486__) || defined(__i586__) || \
+        defined(__i686__) || defined(_X86_) || defined(_M_IX86)
+#    define OPTIMAL_CMP 32
+#  elif defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#    if defined(__ARM_FEATURE_UNALIGNED) || defined(_WIN32)
+#      define OPTIMAL_CMP 64
+#    else
+#      define OPTIMAL_CMP 8
+#    endif
+#  elif defined(__arm__) || defined(_M_ARM)
+#    if defined(__ARM_FEATURE_UNALIGNED) || defined(_WIN32)
+#      define OPTIMAL_CMP 32
+#    else
+#      define OPTIMAL_CMP 8
+#    endif
+#  elif defined(__powerpc64__) || defined(__ppc64__)
+#    define OPTIMAL_CMP 64
+#  elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__)
 #    define OPTIMAL_CMP 32
-#  else
-#    define OPTIMAL_CMP 8
 #  endif
-#elif defined(__powerpc64__) || defined(__ppc64__)
-#  define OPTIMAL_CMP 64
-#elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__)
-#  define OPTIMAL_CMP 32
-#endif
-#if defined(NO_UNALIGNED)
-#  undef OPTIMAL_CMP
 #endif
 
 #if !defined(OPTIMAL_CMP)
-#  define OPTIMAL_CMP 8
+#  define OPTIMAL_CMP 16
 #endif
-
 #if defined(__has_feature)
 #  if __has_feature(address_sanitizer)
 #    define Z_ADDRESS_SANITIZER 1
diff --git a/zmemory.h b/zmemory.h
index 99ffd9e6..fc850a72 100644
--- a/zmemory.h
+++ b/zmemory.h
@@ -73,7 +73,7 @@ static inline void zng_memwrite_8(void *ptr, uint64_t val) {
    calls to unaligned comparisons when unaligned access is supported. Use memcmp only
    when unaligned support is not available to avoid an extra call to memcpy. */
 static inline int32_t zng_memcmp_2(const void *src0, const void *src1) {
-#if defined(HAVE_MAY_ALIAS) || OPTIMAL_CMP >= 16
+#if defined(HAVE_MAY_ALIAS)
     return zng_memread_2(src0) != zng_memread_2(src1);
 #else
     return memcmp(src0, src1, 2);
@@ -81,7 +81,7 @@ static inline int32_t zng_memcmp_2(const void *src0, const void *src1) {
 }
 
 static inline int32_t zng_memcmp_4(const void *src0, const void *src1) {
-#if defined(HAVE_MAY_ALIAS) || OPTIMAL_CMP >= 32
+#if defined(HAVE_MAY_ALIAS)
     return zng_memread_4(src0) != zng_memread_4(src1);
 #else
     return memcmp(src0, src1, 4);
@@ -89,7 +89,7 @@ static inline int32_t zng_memcmp_4(const void *src0, const void *src1) {
 }
 
 static inline int32_t zng_memcmp_8(const void *src0, const void *src1) {
-#if defined(HAVE_MAY_ALIAS) || OPTIMAL_CMP >= 64
+#if defined(HAVE_MAY_ALIAS)
     return zng_memread_8(src0) != zng_memread_8(src1);
 #else
     return memcmp(src0, src1, 8);