${CMAKE_CURRENT_BINARY_DIR}/zlib${SUFFIX}.h
)
set(ZLIB_PRIVATE_HDRS
+ arch/generic/chunk_permute_table.h
+ arch/generic/compare256_p.h
+ arch/generic/generic_functions.h
adler32_p.h
chunkset_tpl.h
compare256_rle.h
chunkset_c.lo: $(SRCDIR)/chunkset_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/chunkset_tpl.h $(SRCTOP)/inffast_tpl.h
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_c.c
-compare256_c.o: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/zmemory.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h
+compare256_c.o: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCDIR)/compare256_p.h $(SRCTOP)/zmemory.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_c.c
-compare256_c.lo: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/zmemory.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h
+compare256_c.lo: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCDIR)/compare256_p.h $(SRCTOP)/zmemory.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_c.c
crc32_braid_c.o: $(SRCDIR)/crc32_braid_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/crc32_braid_p.h $(SRCTOP)/crc32_braid_tbl.h
*/
#include "zbuild.h"
-#include "zmemory.h"
-#include "deflate.h"
-#include "fallback_builtins.h"
-
-/* ALIGNED, byte comparison */
-static inline uint32_t compare256_c_static(const uint8_t *src0, const uint8_t *src1) {
- uint32_t len = 0;
-
- do {
- if (*src0 != *src1)
- return len;
- src0 += 1, src1 += 1, len += 1;
- if (*src0 != *src1)
- return len;
- src0 += 1, src1 += 1, len += 1;
- if (*src0 != *src1)
- return len;
- src0 += 1, src1 += 1, len += 1;
- if (*src0 != *src1)
- return len;
- src0 += 1, src1 += 1, len += 1;
- if (*src0 != *src1)
- return len;
- src0 += 1, src1 += 1, len += 1;
- if (*src0 != *src1)
- return len;
- src0 += 1, src1 += 1, len += 1;
- if (*src0 != *src1)
- return len;
- src0 += 1, src1 += 1, len += 1;
- if (*src0 != *src1)
- return len;
- src0 += 1, src1 += 1, len += 1;
- } while (len < 256);
-
- return 256;
-}
+#include "compare256_p.h"
+
+// Select the optimal COMPARE256 function variant for this build
+#if OPTIMAL_CMP == 8
+# define COMPARE256 compare256_8
+#elif defined(HAVE_BUILTIN_CTZLL)
+# define COMPARE256 compare256_64
+#elif defined(HAVE_BUILTIN_CTZ)
+# define COMPARE256 compare256_32
+#else
+# define COMPARE256 compare256_16
+#endif
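+// Illustrative resolution: on x86-64, OPTIMAL_CMP is 64 and __builtin_ctzll
+// is available, so COMPARE256 resolves to compare256_64; a strict-alignment
+// target that sets OPTIMAL_CMP to 8 gets compare256_8.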
Z_INTERNAL uint32_t compare256_c(const uint8_t *src0, const uint8_t *src1) {
- return compare256_c_static(src0, src1);
+ return COMPARE256(src0, src1);
}
+// Generate longest_match_c
#define LONGEST_MATCH longest_match_c
-#define COMPARE256 compare256_c_static
-
#include "match_tpl.h"
+// Generate longest_match_slow_c
#define LONGEST_MATCH_SLOW
#define LONGEST_MATCH longest_match_slow_c
-#define COMPARE256 compare256_c_static
-
-#include "match_tpl.h"
-
-#if OPTIMAL_CMP >= 32
-
-/* 16-bit unaligned integer comparison */
-static inline uint32_t compare256_16_static(const uint8_t *src0, const uint8_t *src1) {
- uint32_t len = 0;
-
- do {
- if (zng_memcmp_2(src0, src1) != 0)
- return len + (*src0 == *src1);
- src0 += 2, src1 += 2, len += 2;
-
- if (zng_memcmp_2(src0, src1) != 0)
- return len + (*src0 == *src1);
- src0 += 2, src1 += 2, len += 2;
-
- if (zng_memcmp_2(src0, src1) != 0)
- return len + (*src0 == *src1);
- src0 += 2, src1 += 2, len += 2;
-
- if (zng_memcmp_2(src0, src1) != 0)
- return len + (*src0 == *src1);
- src0 += 2, src1 += 2, len += 2;
- } while (len < 256);
-
- return 256;
-}
-
-Z_INTERNAL uint32_t compare256_16(const uint8_t *src0, const uint8_t *src1) {
- return compare256_16_static(src0, src1);
-}
-
-#define LONGEST_MATCH longest_match_16
-#define COMPARE256 compare256_16_static
-
#include "match_tpl.h"
-
-#define LONGEST_MATCH_SLOW
-#define LONGEST_MATCH longest_match_slow_16
-#define COMPARE256 compare256_16_static
-
-#include "match_tpl.h"
-
-#ifdef HAVE_BUILTIN_CTZ
-/* 32-bit unaligned integer comparison */
-static inline uint32_t compare256_32_static(const uint8_t *src0, const uint8_t *src1) {
- uint32_t len = 0;
-
- do {
- uint32_t sv, mv, diff;
-
- sv = zng_memread_4(src0);
- mv = zng_memread_4(src1);
-
- diff = sv ^ mv;
- if (diff) {
-#if BYTE_ORDER == LITTLE_ENDIAN
- uint32_t match_byte = __builtin_ctz(diff) / 8;
-#else
- uint32_t match_byte = __builtin_clz(diff) / 8;
-#endif
- return len + match_byte;
- }
-
- src0 += 4, src1 += 4, len += 4;
- } while (len < 256);
-
- return 256;
-}
-
-Z_INTERNAL uint32_t compare256_32(const uint8_t *src0, const uint8_t *src1) {
- return compare256_32_static(src0, src1);
-}
-
-#define LONGEST_MATCH longest_match_32
-#define COMPARE256 compare256_32_static
-
-#include "match_tpl.h"
-
-#define LONGEST_MATCH_SLOW
-#define LONGEST_MATCH longest_match_slow_32
-#define COMPARE256 compare256_32_static
-
-#include "match_tpl.h"
-
-#endif
-
-#if defined(HAVE_BUILTIN_CTZLL) && OPTIMAL_CMP >= 64
-/* 64-bit integer comparison */
-static inline uint32_t compare256_64_static(const uint8_t *src0, const uint8_t *src1) {
- uint32_t len = 0;
-
- do {
- uint64_t sv, mv, diff;
-
- sv = zng_memread_8(src0);
- mv = zng_memread_8(src1);
-
- diff = sv ^ mv;
- if (diff) {
-#if BYTE_ORDER == LITTLE_ENDIAN
- uint64_t match_byte = __builtin_ctzll(diff) / 8;
-#else
- uint64_t match_byte = __builtin_clzll(diff) / 8;
-#endif
- return len + (uint32_t)match_byte;
- }
-
- src0 += 8, src1 += 8, len += 8;
- } while (len < 256);
-
- return 256;
-}
-
-Z_INTERNAL uint32_t compare256_64(const uint8_t *src0, const uint8_t *src1) {
- return compare256_64_static(src0, src1);
-}
-
-#define LONGEST_MATCH longest_match_64
-#define COMPARE256 compare256_64_static
-
-#include "match_tpl.h"
-
-#define LONGEST_MATCH_SLOW
-#define LONGEST_MATCH longest_match_slow_64
-#define COMPARE256 compare256_64_static
-
-#include "match_tpl.h"
-
-#endif
-
-#endif
--- /dev/null
+/* compare256_p.h -- 256 byte memory comparison with match length return
+ * Copyright (C) 2020 Nathan Moinvaziri
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zmemory.h"
+#include "deflate.h"
+#include "fallback_builtins.h"
+
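+/* Each compare256_* variant below implements the same contract: return the
+ * length of the common prefix of src0 and src1, capped at 256 (buffers that
+ * first differ at index 5 yield 5). They differ only in comparison width. */
+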
+/* 8-bit integer comparison */
+static inline uint32_t compare256_8(const uint8_t *src0, const uint8_t *src1) {
+ uint32_t len = 0;
+
+ do {
+ if (*src0 != *src1)
+ return len;
+ src0 += 1, src1 += 1, len += 1;
+ if (*src0 != *src1)
+ return len;
+ src0 += 1, src1 += 1, len += 1;
+ if (*src0 != *src1)
+ return len;
+ src0 += 1, src1 += 1, len += 1;
+ if (*src0 != *src1)
+ return len;
+ src0 += 1, src1 += 1, len += 1;
+ if (*src0 != *src1)
+ return len;
+ src0 += 1, src1 += 1, len += 1;
+ if (*src0 != *src1)
+ return len;
+ src0 += 1, src1 += 1, len += 1;
+ if (*src0 != *src1)
+ return len;
+ src0 += 1, src1 += 1, len += 1;
+ if (*src0 != *src1)
+ return len;
+ src0 += 1, src1 += 1, len += 1;
+ } while (len < 256);
+
+ return 256;
+}
+
+/* 16-bit integer comparison */
+static inline uint32_t compare256_16(const uint8_t *src0, const uint8_t *src1) {
+ uint32_t len = 0;
+
+ do {
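+        /* On a 16-bit mismatch the first byte may still match: the boolean
+         * (*src0 == *src1) credits that byte to the returned length. */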
+ if (zng_memcmp_2(src0, src1) != 0)
+ return len + (*src0 == *src1);
+ src0 += 2, src1 += 2, len += 2;
+
+ if (zng_memcmp_2(src0, src1) != 0)
+ return len + (*src0 == *src1);
+ src0 += 2, src1 += 2, len += 2;
+
+ if (zng_memcmp_2(src0, src1) != 0)
+ return len + (*src0 == *src1);
+ src0 += 2, src1 += 2, len += 2;
+
+ if (zng_memcmp_2(src0, src1) != 0)
+ return len + (*src0 == *src1);
+ src0 += 2, src1 += 2, len += 2;
+ } while (len < 256);
+
+ return 256;
+}
+
+#ifdef HAVE_BUILTIN_CTZ
+/* 32-bit integer comparison */
+static inline uint32_t compare256_32(const uint8_t *src0, const uint8_t *src1) {
+ uint32_t len = 0;
+
+ do {
+ uint32_t sv, mv, diff;
+
+ sv = zng_memread_4(src0);
+ mv = zng_memread_4(src1);
+
+ diff = sv ^ mv;
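+        /* XOR leaves set bits only where the inputs differ; the first set
+         * bit (from the LSB on little-endian, the MSB on big-endian),
+         * divided by 8, gives the index of the first differing byte. */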
+ if (diff) {
+# if BYTE_ORDER == LITTLE_ENDIAN
+ uint32_t match_byte = __builtin_ctz(diff) / 8;
+# else
+ uint32_t match_byte = __builtin_clz(diff) / 8;
+# endif
+ return len + match_byte;
+ }
+
+ src0 += 4, src1 += 4, len += 4;
+ } while (len < 256);
+
+ return 256;
+}
+#endif
+
+#ifdef HAVE_BUILTIN_CTZLL
+/* 64-bit integer comparison */
+static inline uint32_t compare256_64(const uint8_t *src0, const uint8_t *src1) {
+ uint32_t len = 0;
+
+ do {
+ uint64_t sv, mv, diff;
+
+ sv = zng_memread_8(src0);
+ mv = zng_memread_8(src1);
+
+ diff = sv ^ mv;
+ if (diff) {
+# if BYTE_ORDER == LITTLE_ENDIAN
+ uint64_t match_byte = __builtin_ctzll(diff) / 8;
+# else
+ uint64_t match_byte = __builtin_clzll(diff) / 8;
+# endif
+ return len + (uint32_t)match_byte;
+ }
+
+ src0 += 8, src1 += 8, len += 8;
+ } while (len < 256);
+
+ return 256;
+}
+#endif
uint32_t PREFIX(crc32_braid)(uint32_t crc, const uint8_t *buf, size_t len);
uint32_t compare256_c(const uint8_t *src0, const uint8_t *src1);
-#if OPTIMAL_CMP >= 32
- uint32_t compare256_16(const uint8_t *src0, const uint8_t *src1);
-# ifdef HAVE_BUILTIN_CTZ
- uint32_t compare256_32(const uint8_t *src0, const uint8_t *src1);
-# endif
-# if defined(HAVE_BUILTIN_CTZLL) && OPTIMAL_CMP >= 64
- uint32_t compare256_64(const uint8_t *src0, const uint8_t *src1);
-# endif
-#endif
typedef void (*slide_hash_func)(deflate_state *s);
uint32_t longest_match_c(deflate_state *const s, Pos cur_match);
uint32_t longest_match_slow_c(deflate_state *const s, Pos cur_match);
-#if OPTIMAL_CMP >= 32
- uint32_t longest_match_16(deflate_state *const s, Pos cur_match);
- uint32_t longest_match_slow_16(deflate_state *const s, Pos cur_match);
-# ifdef HAVE_BUILTIN_CTZ
- uint32_t longest_match_32(deflate_state *const s, Pos cur_match);
- uint32_t longest_match_slow_32(deflate_state *const s, Pos cur_match);
-# endif
-# if defined(HAVE_BUILTIN_CTZLL) && OPTIMAL_CMP >= 64
- uint32_t longest_match_64(deflate_state *const s, Pos cur_match);
- uint32_t longest_match_slow_64(deflate_state *const s, Pos cur_match);
-# endif
-#endif
-
-
-// Select generic implementation for longest_match, longest_match_slow, longest_match_slow functions.
-#if OPTIMAL_CMP >= 32
-# if defined(HAVE_BUILTIN_CTZLL) && OPTIMAL_CMP >= 64
-# define longest_match_generic longest_match_64
-# define longest_match_slow_generic longest_match_slow_64
-# define compare256_generic compare256_64
-# elif defined(HAVE_BUILTIN_CTZ)
-# define longest_match_generic longest_match_32
-# define longest_match_slow_generic longest_match_slow_32
-# define compare256_generic compare256_32
-# else
-# define longest_match_generic longest_match_16
-# define longest_match_slow_generic longest_match_slow_16
-# define compare256_generic compare256_16
-# endif
-#else
-# define longest_match_generic longest_match_c
-# define longest_match_slow_generic longest_match_slow_c
-# define compare256_generic compare256_c
-#endif
-
#ifdef DISABLE_RUNTIME_CPU_DETECTION
// Generic code
# define native_crc32_fold_reset crc32_fold_reset_c
# define native_inflate_fast inflate_fast_c
# define native_slide_hash slide_hash_c
-# define native_longest_match longest_match_generic
-# define native_longest_match_slow longest_match_slow_generic
-# define native_compare256 compare256_generic
+# define native_longest_match longest_match_c
+# define native_longest_match_slow longest_match_slow_c
+# define native_compare256 compare256_c
#endif
#endif
#include "zbuild.h"
#include "zmemory.h"
#include "fallback_builtins.h"
-#include "zendian.h"
typedef uint32_t (*compare256_rle_func)(const uint8_t* src0, const uint8_t* src1);
-/* ALIGNED, byte comparison */
-static inline uint32_t compare256_rle_c(const uint8_t *src0, const uint8_t *src1) {
+/* 8-bit integer comparison */
+static inline uint32_t compare256_rle_8(const uint8_t *src0, const uint8_t *src1) {
uint32_t len = 0;
do {
return 256;
}
-#if OPTIMAL_CMP >= 32
-/* 16-bit unaligned integer comparison */
+/* 16-bit integer comparison */
static inline uint32_t compare256_rle_16(const uint8_t *src0, const uint8_t *src1) {
uint32_t len = 0;
uint16_t src0_cmp;
}
#ifdef HAVE_BUILTIN_CTZ
-/* 32-bit unaligned integer comparison */
+/* 32-bit integer comparison */
static inline uint32_t compare256_rle_32(const uint8_t *src0, const uint8_t *src1) {
uint32_t sv, len = 0;
uint16_t src0_cmp;
return 256;
}
-
#endif
-#if defined(HAVE_BUILTIN_CTZLL) && OPTIMAL_CMP >= 64
-/* 64-bit unaligned integer comparison */
+#ifdef HAVE_BUILTIN_CTZLL
+/* 64-bit integer comparison */
static inline uint32_t compare256_rle_64(const uint8_t *src0, const uint8_t *src1) {
uint32_t src0_cmp32, len = 0;
uint16_t src0_cmp;
return 256;
}
-
#endif
-
-#endif
-
*/
#include "zbuild.h"
-#include "compare256_rle.h"
#include "deflate.h"
#include "deflate_p.h"
#include "functable.h"
+#include "compare256_rle.h"
-#if OPTIMAL_CMP >= 32
-# if defined(HAVE_BUILTIN_CTZLL) && OPTIMAL_CMP >= 64
-# define compare256_rle compare256_rle_64
-# elif defined(HAVE_BUILTIN_CTZ)
-# define compare256_rle compare256_rle_32
-# else
-# define compare256_rle compare256_rle_16
-# endif
+#if OPTIMAL_CMP == 8
+# define compare256_rle compare256_rle_8
+#elif defined(HAVE_BUILTIN_CTZLL)
+# define compare256_rle compare256_rle_64
+#elif defined(HAVE_BUILTIN_CTZ)
+# define compare256_rle compare256_rle_32
#else
-# define compare256_rle compare256_rle_c
+# define compare256_rle compare256_rle_16
#endif
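+// This mirrors the COMPARE256 selection in compare256_c.c above.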
/* ===========================================================================
ft.crc32_fold_reset = &crc32_fold_reset_c;
ft.inflate_fast = &inflate_fast_c;
ft.slide_hash = &slide_hash_c;
- ft.longest_match = &longest_match_generic;
- ft.longest_match_slow = &longest_match_slow_generic;
- ft.compare256 = &compare256_generic;
+ ft.longest_match = &longest_match_c;
+ ft.longest_match_slow = &longest_match_slow_c;
+ ft.compare256 = &compare256_c;
// Select arch-optimized functions
* IN assertions: cur_match is the head of the hash chain for the current
* string (strstart) and its distance is <= MAX_DIST, and prev_length >=1
* OUT assertion: the match length is not greater than s->lookahead
+ *
+ * The LONGEST_MATCH_SLOW variant spends additional time attempting to find
+ * a longer match once a match has already been found.
*/
Z_INTERNAL uint32_t LONGEST_MATCH(deflate_state *const s, Pos cur_match) {
unsigned int strstart = s->strstart;
uint32_t chain_length, nice_match, best_len, offset;
uint32_t lookahead = s->lookahead;
Pos match_offset = 0;
-#if OPTIMAL_CMP >= 64
uint64_t scan_start;
uint64_t scan_end;
-#elif OPTIMAL_CMP >= 32
- uint32_t scan_start;
- uint32_t scan_end;
-#else
- uint8_t scan_end[8];
-#endif
#define GOTO_NEXT_CHAIN \
if (--chain_length && (cur_match = prev[cur_match & wmask]) > limit) \
* to find the next best match length.
*/
offset = best_len-1;
-#if OPTIMAL_CMP >= 32
if (best_len >= sizeof(uint32_t)) {
offset -= 2;
-#if OPTIMAL_CMP >= 64
if (best_len >= sizeof(uint64_t))
offset -= 4;
-#endif
}
-#endif
-#if OPTIMAL_CMP >= 64
scan_start = zng_memread_8(scan);
scan_end = zng_memread_8(scan+offset);
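+    /* Example: with best_len == 10, offset is 10-1-2-4 == 3, so scan_end
+     * holds bytes [3..10] of scan; any candidate that beats best_len must
+     * match those 8 bytes, making scan_end a cheap rejection filter. */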
-#elif OPTIMAL_CMP >= 32
- scan_start = zng_memread_4(scan);
- scan_end = zng_memread_4(scan+offset);
-#else
- scan_end[0] = *(scan+offset);
- scan_end[1] = *(scan+offset+1);
-#endif
mbase_end = (mbase_start+offset);
/* Do not waste too much time if we already have a good match */
* that depend on those values. However the length of the match is limited to the
* lookahead, so the output of deflate is not affected by the uninitialized values.
*/
-#if OPTIMAL_CMP >= 32
if (best_len < sizeof(uint32_t)) {
for (;;) {
if (zng_memcmp_2(mbase_end+cur_match, &scan_end) == 0 &&
break;
GOTO_NEXT_CHAIN;
}
-# if OPTIMAL_CMP >= 64
} else if (best_len >= sizeof(uint64_t)) {
for (;;) {
if (zng_memcmp_8(mbase_end+cur_match, &scan_end) == 0 &&
break;
GOTO_NEXT_CHAIN;
}
-# endif
} else {
for (;;) {
if (zng_memcmp_4(mbase_end+cur_match, &scan_end) == 0 &&
GOTO_NEXT_CHAIN;
}
}
-#else
- for (;;) {
- if (mbase_end[cur_match] == scan_end[0] && mbase_end[cur_match+1] == scan_end[1] &&
- mbase_start[cur_match] == scan[0] && mbase_start[cur_match+1] == scan[1])
- break;
- GOTO_NEXT_CHAIN;
- }
-#endif
uint32_t len = COMPARE256(scan+2, mbase_start+cur_match+2) + 2;
Assert(scan+len <= window+(unsigned)(s->window_size-1), "wild scan");
return best_len;
offset = best_len-1;
-#if OPTIMAL_CMP >= 32
if (best_len >= sizeof(uint32_t)) {
offset -= 2;
-#if OPTIMAL_CMP >= 64
if (best_len >= sizeof(uint64_t))
offset -= 4;
-#endif
}
-#endif
-#if OPTIMAL_CMP >= 64
scan_end = zng_memread_8(scan+offset);
-#elif OPTIMAL_CMP >= 32
- scan_end = zng_memread_4(scan+offset);
-#else
- scan_end[0] = *(scan+offset);
- scan_end[1] = *(scan+offset+1);
-#endif
#ifdef LONGEST_MATCH_SLOW
/* Look for a better string offset */
#undef LONGEST_MATCH_SLOW
#undef LONGEST_MATCH
-#undef COMPARE256
# include "zutil_p.h"
# include "arch_functions.h"
# include "../test_cpu_features.h"
+# include "arch/generic/compare256_p.h"
}
#define MAX_COMPARE_SIZE (256)
} \
BENCHMARK_REGISTER_F(compare256, name)->Range(1, MAX_COMPARE_SIZE);
-BENCHMARK_COMPARE256(c, compare256_c, 1);
-
#ifdef DISABLE_RUNTIME_CPU_DETECTION
BENCHMARK_COMPARE256(native, native_compare256, 1);
#else
-#if BYTE_ORDER == LITTLE_ENDIAN && OPTIMAL_CMP >= 32
+BENCHMARK_COMPARE256(8, compare256_8, 1);
BENCHMARK_COMPARE256(16, compare256_16, 1);
-# if defined(HAVE_BUILTIN_CTZ)
+#if defined(HAVE_BUILTIN_CTZ)
BENCHMARK_COMPARE256(32, compare256_32, 1);
-# endif
-# if defined(HAVE_BUILTIN_CTZLL) && OPTIMAL_CMP >= 64
+#endif
+#if defined(HAVE_BUILTIN_CTZLL)
BENCHMARK_COMPARE256(64, compare256_64, 1);
-# endif
#endif
+
#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
BENCHMARK_COMPARE256(sse2, compare256_sse2, test_cpu_features.x86.has_sse2);
#endif
} \
BENCHMARK_REGISTER_F(compare256_rle, name)->Range(1, MAX_COMPARE_SIZE);
-BENCHMARK_COMPARE256_RLE(c, compare256_rle_c, 1);
-
-#if BYTE_ORDER == LITTLE_ENDIAN && OPTIMAL_CMP >= 32
+BENCHMARK_COMPARE256_RLE(8, compare256_rle_8, 1);
BENCHMARK_COMPARE256_RLE(16, compare256_rle_16, 1);
-# if defined(HAVE_BUILTIN_CTZ)
+#if defined(HAVE_BUILTIN_CTZ)
BENCHMARK_COMPARE256_RLE(32, compare256_rle_32, 1);
-# endif
-# if defined(HAVE_BUILTIN_CTZLL) && OPTIMAL_CMP >= 64
+#endif
+#if defined(HAVE_BUILTIN_CTZLL)
BENCHMARK_COMPARE256_RLE(64, compare256_rle_64, 1);
-# endif
#endif
# include "zutil.h"
# include "arch_functions.h"
# include "test_cpu_features.h"
+# include "arch/generic/compare256_p.h"
}
#include <gtest/gtest.h>
compare256_match_check(func); \
}
-TEST_COMPARE256(c, compare256_c, 1)
-
#ifdef DISABLE_RUNTIME_CPU_DETECTION
TEST_COMPARE256(native, native_compare256, 1)
#else
-#if BYTE_ORDER == LITTLE_ENDIAN && OPTIMAL_CMP >= 32
+TEST_COMPARE256(8, compare256_8, 1)
TEST_COMPARE256(16, compare256_16, 1)
-# if defined(HAVE_BUILTIN_CTZ)
+#if defined(HAVE_BUILTIN_CTZ)
TEST_COMPARE256(32, compare256_32, 1)
-# endif
-# if defined(HAVE_BUILTIN_CTZLL) && OPTIMAL_CMP >= 64
+#endif
+#if defined(HAVE_BUILTIN_CTZLL)
TEST_COMPARE256(64, compare256_64, 1)
-# endif
#endif
#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
compare256_rle_match_check(func); \
}
-TEST_COMPARE256_RLE(c, compare256_rle_c, 1)
-
-#if BYTE_ORDER == LITTLE_ENDIAN && OPTIMAL_CMP >= 32
+TEST_COMPARE256_RLE(8, compare256_rle_8, 1)
TEST_COMPARE256_RLE(16, compare256_rle_16, 1)
-# if defined(HAVE_BUILTIN_CTZ)
+#if defined(HAVE_BUILTIN_CTZ)
TEST_COMPARE256_RLE(32, compare256_rle_32, 1)
-# endif
-# if defined(HAVE_BUILTIN_CTZLL) && OPTIMAL_CMP >= 64
+#endif
+#if defined(HAVE_BUILTIN_CTZLL)
TEST_COMPARE256_RLE(64, compare256_rle_64, 1)
-# endif
#endif
# define Tracecv(c, x)
#endif
-#if defined(__x86_64__) || defined(_M_X64) || defined(__amd64__) || defined(_M_AMD64)
-# define OPTIMAL_CMP 64
-#elif defined(__i386__) || defined(__i486__) || defined(__i586__) || \
- defined(__i686__) || defined(_X86_) || defined(_M_IX86)
-# define OPTIMAL_CMP 32
-#elif defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
-# if defined(__ARM_FEATURE_UNALIGNED) || defined(_WIN32)
+/* OPTIMAL_CMP values determine the comparison width:
+ * 64: Best for 64-bit architectures with unaligned access
+ * 32: Best for 32-bit architectures with unaligned access
+ * 16: Safe default for unknown architectures
+ * 8: Safe fallback for architectures without unaligned access
+ * Note: "Unaligned access" here refers to CPU support. It allows the
+ * compiler or dedicated unaligned intrinsics to perform safe unaligned
+ * reads without dereferencing unaligned C pointers, which is undefined
+ * behavior.
+ */
+#if !defined(OPTIMAL_CMP)
+# if defined(__x86_64__) || defined(_M_X64) || defined(__amd64__) || defined(_M_AMD64)
# define OPTIMAL_CMP 64
-# else
-# define OPTIMAL_CMP 8
-# endif
-#elif defined(__arm__) || defined(_M_ARM)
-# if defined(__ARM_FEATURE_UNALIGNED) || defined(_WIN32)
+# elif defined(__i386__) || defined(__i486__) || defined(__i586__) || \
+ defined(__i686__) || defined(_X86_) || defined(_M_IX86)
+# define OPTIMAL_CMP 32
+# elif defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+# if defined(__ARM_FEATURE_UNALIGNED) || defined(_WIN32)
+# define OPTIMAL_CMP 64
+# else
+# define OPTIMAL_CMP 8
+# endif
+# elif defined(__arm__) || defined(_M_ARM)
+# if defined(__ARM_FEATURE_UNALIGNED) || defined(_WIN32)
+# define OPTIMAL_CMP 32
+# else
+# define OPTIMAL_CMP 8
+# endif
+# elif defined(__powerpc64__) || defined(__ppc64__)
+# define OPTIMAL_CMP 64
+# elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__)
# define OPTIMAL_CMP 32
-# else
-# define OPTIMAL_CMP 8
# endif
-#elif defined(__powerpc64__) || defined(__ppc64__)
-# define OPTIMAL_CMP 64
-#elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__)
-# define OPTIMAL_CMP 32
-#endif
-#if defined(NO_UNALIGNED)
-# undef OPTIMAL_CMP
#endif
#if !defined(OPTIMAL_CMP)
-# define OPTIMAL_CMP 8
+# define OPTIMAL_CMP 16
#endif
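+/* Illustrative results of the selection above: x86-64 -> 64, i686 -> 32,
+ * AArch64 -> 64 with unaligned support (otherwise 8), 32-bit ARM -> 32 with
+ * unaligned support (otherwise 8), ppc64 -> 64, 32-bit PowerPC -> 32, and
+ * any unrecognized target -> 16. */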
-
#if defined(__has_feature)
# if __has_feature(address_sanitizer)
# define Z_ADDRESS_SANITIZER 1
calls to unaligned comparisons when unaligned access is supported. Use memcmp only when
unaligned support is not available to avoid an extra call to memcpy. */
static inline int32_t zng_memcmp_2(const void *src0, const void *src1) {
-#if defined(HAVE_MAY_ALIAS) || OPTIMAL_CMP >= 16
+#if defined(HAVE_MAY_ALIAS)
return zng_memread_2(src0) != zng_memread_2(src1);
#else
return memcmp(src0, src1, 2);
}
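+/* Illustrative: with unaligned loads available, compilers typically lower a
+ * fixed-size memcmp like memcmp(a, b, 2) to a single load-and-compare, so
+ * both branches compile similarly; the HAVE_MAY_ALIAS path simply avoids the
+ * memcpy-based read inside zng_memread_2. */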
static inline int32_t zng_memcmp_4(const void *src0, const void *src1) {
-#if defined(HAVE_MAY_ALIAS) || OPTIMAL_CMP >= 32
+#if defined(HAVE_MAY_ALIAS)
return zng_memread_4(src0) != zng_memread_4(src1);
#else
return memcmp(src0, src1, 4);
}
static inline int32_t zng_memcmp_8(const void *src0, const void *src1) {
-#if defined(HAVE_MAY_ALIAS) || OPTIMAL_CMP >= 64
+#if defined(HAVE_MAY_ALIAS)
return zng_memread_8(src0) != zng_memread_8(src1);
#else
return memcmp(src0, src1, 8);