From: Nathan Moinvaziri
Date: Sat, 8 Jan 2022 22:42:09 +0000 (-0800)
Subject: Convert compare258 to compare256 and move the 2-byte check into deflate_quick. Prevents...
X-Git-Tag: 2.1.0-beta1~444
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=66506ace8d1b1cbd9fd5d2cfbae1c4f0b54ca9b9;p=thirdparty%2Fzlib-ng.git

Convert compare258 to compare256 and move the 2-byte check into deflate_quick.
This prevents having multiple compare258 functions that each duplicate the 2-byte check.
---

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 911bcb3fe..9e7e35359 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -719,8 +719,8 @@ if(WITH_OPTIM)
         add_feature_info(AVX2_SLIDEHASH 1 "Support AVX2 optimized slide_hash, using \"${AVX2FLAG}\"")
         list(APPEND AVX2_SRCS ${ARCHDIR}/chunkset_avx.c)
         add_feature_info(AVX_CHUNKSET 1 "Support AVX optimized chunkset, using \"${AVX2FLAG}\"")
-        list(APPEND AVX2_SRCS ${ARCHDIR}/compare258_avx2.c)
-        add_feature_info(AVX2_COMPARE258 1 "Support AVX2 optimized compare258, using \"${AVX2FLAG}\"")
+        list(APPEND AVX2_SRCS ${ARCHDIR}/compare256_avx2.c)
+        add_feature_info(AVX2_COMPARE256 1 "Support AVX2 optimized compare256, using \"${AVX2FLAG}\"")
         list(APPEND AVX2_SRCS ${ARCHDIR}/adler32_avx2.c)
         add_feature_info(AVX2_ADLER32 1 "Support AVX2-accelerated adler32, using \"${AVX2FLAG}\"")
         list(APPEND ZLIB_ARCH_SRCS ${AVX2_SRCS})
@@ -782,8 +782,8 @@ if(WITH_OPTIM)
     endif()
     if(HAVE_SSE42CMPSTR_INTRIN)
         add_definitions(-DX86_SSE42_CMP_STR)
-        set(SSE42_SRCS ${ARCHDIR}/compare258_sse42.c)
-        add_feature_info(SSE42_COMPARE258 1 "Support SSE4.2 optimized compare258, using \"${SSE42FLAG}\"")
+        set(SSE42_SRCS ${ARCHDIR}/compare256_sse42.c)
+        add_feature_info(SSE42_COMPARE256 1 "Support SSE4.2 optimized compare256, using \"${SSE42FLAG}\"")
         list(APPEND ZLIB_ARCH_SRCS ${SSE42_SRCS})
         set_property(SOURCE ${SSE42_SRCS} PROPERTY COMPILE_FLAGS "${SSE42FLAG} ${NOLTOFLAG}")
     endif()
@@ -946,7 +946,7 @@ set(ZLIB_PRIVATE_HDRS
 set(ZLIB_SRCS
     adler32.c
     chunkset.c
-    compare258.c
+    compare256.c
     compress.c
     crc32.c
     crc32_comb.c
diff --git a/Makefile.in b/Makefile.in
index 3b037302c..9f64dabab 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -76,7 +76,7 @@ pkgconfigdir = ${libdir}/pkgconfig
 OBJZ = \
         adler32.o \
         chunkset.o \
-        compare258.o \
+        compare256.o \
         compress.o \
         crc32.o \
         crc32_comb.o \
@@ -112,7 +112,7 @@ OBJC = $(OBJZ) $(OBJG)
 PIC_OBJZ = \
         adler32.lo \
         chunkset.lo \
-        compare258.lo \
+        compare256.lo \
         compress.lo \
         crc32.lo \
         crc32_comb.lo \
diff --git a/README.md b/README.md
index 79d6648bd..599fdd957 100644
--- a/README.md
+++ b/README.md
@@ -25,7 +25,7 @@ Features
   * CRC32-B implementation using PCLMULQDQ, VPCLMULQDQ, & ACLE
   * Hash table implementation using CRC32-C intrinsics on x86 and ARM
   * Slide hash implementations using SSE2, AVX2, Neon, VMX & VSX
-  * Compare256/258 implementations using SSE4.2 & AVX2
+  * Compare256 implementations using SSE4.2 & AVX2
   * Inflate chunk copying using SSE2, AVX2, Neon & VSX
   * CRC32 implementation using IBM Z vector instructions
   * Support for hardware-accelerated deflate using IBM Z DFLTCC
diff --git a/arch/x86/Makefile.in b/arch/x86/Makefile.in
index d481c0ee7..f0d7c38f9 100644
--- a/arch/x86/Makefile.in
+++ b/arch/x86/Makefile.in
@@ -32,8 +32,8 @@ all: \
         adler32_ssse3.o adler32_ssse3.lo \
         chunkset_avx.o chunkset_avx.lo \
         chunkset_sse2.o chunkset_sse2.lo \
-        compare258_avx2.o compare258_avx2.lo \
-        compare258_sse42.o compare258_sse42.lo \
+        compare256_avx2.o compare256_avx2.lo \
+        compare256_sse42.o compare256_sse42.lo \
         insert_string_sse42.o insert_string_sse42.lo \
         crc32_fold_pclmulqdq.o crc32_fold_pclmulqdq.lo \
         crc32_fold_vpclmulqdq.o crc32_fold_vpclmulqdq.lo \
@@ -58,17 +58,17 @@ chunkset_sse2.o:
 chunkset_sse2.lo:
         $(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse2.c
 
-compare258_avx2.o:
-        $(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare258_avx2.c
+compare256_avx2.o:
+        $(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx2.c
 
-compare258_avx2.lo:
-        $(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare258_avx2.c
+compare256_avx2.lo:
+        $(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx2.c
 
-compare258_sse42.o:
-        $(CC) $(CFLAGS) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare258_sse42.c
+compare256_sse42.o:
+        $(CC) $(CFLAGS) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse42.c
 
-compare258_sse42.lo:
-        $(CC) $(SFLAGS) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare258_sse42.c
+compare256_sse42.lo:
+        $(CC) $(SFLAGS) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse42.c
 
 insert_string_sse42.o:
         $(CC) $(CFLAGS) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse42.c
diff --git a/arch/x86/compare258_avx2.c b/arch/x86/compare256_avx2.c
similarity index 81%
rename from arch/x86/compare258_avx2.c
rename to arch/x86/compare256_avx2.c
index 9aefde16a..253976b5b 100644
--- a/arch/x86/compare258_avx2.c
+++ b/arch/x86/compare256_avx2.c
@@ -1,4 +1,4 @@
-/* compare258_avx2.c -- AVX2 version of compare258
+/* compare256_avx2.c -- AVX2 version of compare256
  * Copyright Mika T. Lindqvist
  * For conditions of distribution and use, see copyright notice in zlib.h
  */
@@ -47,15 +47,8 @@ static inline uint32_t compare256_unaligned_avx2_static(const unsigned char *src
     return 256;
 }
 
-static inline uint32_t compare258_unaligned_avx2_static(const unsigned char *src0, const unsigned char *src1) {
-    if (*(uint16_t *)src0 != *(uint16_t *)src1)
-        return (*src0 == *src1);
-
-    return compare256_unaligned_avx2_static(src0+2, src1+2) + 2;
-}
-
-Z_INTERNAL uint32_t compare258_unaligned_avx2(const unsigned char *src0, const unsigned char *src1) {
-    return compare258_unaligned_avx2_static(src0, src1);
+Z_INTERNAL uint32_t compare256_unaligned_avx2(const unsigned char *src0, const unsigned char *src1) {
+    return compare256_unaligned_avx2_static(src0, src1);
 }
 
 #define LONGEST_MATCH longest_match_unaligned_avx2
diff --git a/arch/x86/compare258_sse42.c b/arch/x86/compare256_sse42.c
similarity index 82%
rename from arch/x86/compare258_sse42.c
rename to arch/x86/compare256_sse42.c
index a5d568ff4..3e5f15056 100644
--- a/arch/x86/compare258_sse42.c
+++ b/arch/x86/compare256_sse42.c
@@ -1,4 +1,4 @@
-/* compare258_sse42.c -- SSE4.2 version of compare258
+/* compare256_sse42.c -- SSE4.2 version of compare256
  *
  * Copyright (C) 2013 Intel Corporation. All rights reserved.
  * Authors:
@@ -54,15 +54,8 @@ static inline uint32_t compare256_unaligned_sse4_static(const unsigned char *src
     return 256;
 }
 
-static inline uint32_t compare258_unaligned_sse4_static(const unsigned char *src0, const unsigned char *src1) {
-    if (*(uint16_t *)src0 != *(uint16_t *)src1)
-        return (*src0 == *src1);
-
-    return compare256_unaligned_sse4_static(src0+2, src1+2) + 2;
-}
-
-Z_INTERNAL uint32_t compare258_unaligned_sse4(const unsigned char *src0, const unsigned char *src1) {
-    return compare258_unaligned_sse4_static(src0, src1);
+Z_INTERNAL uint32_t compare256_unaligned_sse4(const unsigned char *src0, const unsigned char *src1) {
+    return compare256_unaligned_sse4_static(src0, src1);
 }
 
 #define LONGEST_MATCH longest_match_unaligned_sse4
diff --git a/compare258.c b/compare256.c
similarity index 73%
rename from compare258.c
rename to compare256.c
index 8260f332b..20d967e8b 100644
--- a/compare258.c
+++ b/compare256.c
@@ -1,4 +1,4 @@
-/* compare258.c -- aligned and unaligned versions of compare258
+/* compare256.c -- 256 byte memory comparison with match length return
  * Copyright (C) 2020 Nathan Moinvaziri
  * For conditions of distribution and use, see copyright notice in zlib.h
  */
@@ -42,19 +42,8 @@ static inline uint32_t compare256_c_static(const unsigned char *src0, const unsi
     return 256;
 }
 
-static inline uint32_t compare258_c_static(const unsigned char *src0, const unsigned char *src1) {
-    if (*src0 != *src1)
-        return 0;
-    src0 += 1, src1 += 1;
-    if (*src0 != *src1)
-        return 1;
-    src0 += 1, src1 += 1;
-
-    return compare256_c_static(src0, src1) + 2;
-}
-
-Z_INTERNAL uint32_t compare258_c(const unsigned char *src0, const unsigned char *src1) {
-    return compare258_c_static(src0, src1);
+Z_INTERNAL uint32_t compare256_c(const unsigned char *src0, const unsigned char *src1) {
+    return compare256_c_static(src0, src1);
 }
 
 #define LONGEST_MATCH longest_match_c
@@ -91,15 +80,8 @@ static inline uint32_t compare256_unaligned_16_static(const unsigned char *src0,
     return 256;
 }
 
-static inline uint32_t compare258_unaligned_16_static(const unsigned char *src0, const unsigned char *src1) {
-    if (*(uint16_t *)src0 != *(uint16_t *)src1)
-        return (*src0 == *src1);
-
-    return compare256_unaligned_16_static(src0+2, src1+2) + 2;
-}
-
-Z_INTERNAL uint32_t compare258_unaligned_16(const unsigned char *src0, const unsigned char *src1) {
-    return compare258_unaligned_16_static(src0, src1);
+Z_INTERNAL uint32_t compare256_unaligned_16(const unsigned char *src0, const unsigned char *src1) {
+    return compare256_unaligned_16_static(src0, src1);
 }
 
 #define LONGEST_MATCH longest_match_unaligned_16
@@ -136,15 +118,8 @@ static inline uint32_t compare256_unaligned_32_static(const unsigned char *src0,
     return 256;
 }
 
-static inline uint32_t compare258_unaligned_32_static(const unsigned char *src0, const unsigned char *src1) {
-    if (*(uint16_t *)src0 != *(uint16_t *)src1)
-        return (*src0 == *src1);
-
-    return compare256_unaligned_32_static(src0+2, src1+2) + 2;
-}
-
-Z_INTERNAL uint32_t compare258_unaligned_32(const unsigned char *src0, const unsigned char *src1) {
-    return compare258_unaligned_32_static(src0, src1);
+Z_INTERNAL uint32_t compare256_unaligned_32(const unsigned char *src0, const unsigned char *src1) {
+    return compare256_unaligned_32_static(src0, src1);
 }
 
 #define LONGEST_MATCH longest_match_unaligned_32
@@ -183,15 +158,8 @@ static inline uint32_t compare256_unaligned_64_static(const unsigned char *src0,
     return 256;
 }
 
-static inline uint32_t compare258_unaligned_64_static(const unsigned char *src0, const unsigned char *src1) {
-    if (*(uint16_t *)src0 != *(uint16_t *)src1)
-        return (*src0 == *src1);
-
-    return compare256_unaligned_64_static(src0+2, src1+2) + 2;
-}
-
-Z_INTERNAL uint32_t compare258_unaligned_64(const unsigned char *src0, const unsigned char *src1) {
-    return compare258_unaligned_64_static(src0, src1);
+Z_INTERNAL uint32_t compare256_unaligned_64(const unsigned char *src0, const unsigned char *src1) {
+    return compare256_unaligned_64_static(src0, src1);
 }
 
 #define LONGEST_MATCH longest_match_unaligned_64
diff --git a/configure b/configure
index f5fe81cbc..215d0f286 100755
--- a/configure
+++ b/configure
@@ -1438,8 +1438,8 @@ case "${ARCH}" in
             if test ${HAVE_AVX2_INTRIN} -eq 1; then
                 CFLAGS="${CFLAGS} -DX86_AVX2 -DX86_AVX2_ADLER32 -DX86_AVX_CHUNKSET"
                 SFLAGS="${SFLAGS} -DX86_AVX2 -DX86_AVX2_ADLER32 -DX86_AVX_CHUNKSET"
-                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} slide_hash_avx2.o chunkset_avx.o compare258_avx2.o adler32_avx2.o"
-                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} slide_hash_avx2.lo chunkset_avx.lo compare258_avx2.lo adler32_avx2.lo"
+                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} slide_hash_avx2.o chunkset_avx.o compare256_avx2.o adler32_avx2.o"
+                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} slide_hash_avx2.lo chunkset_avx.lo compare256_avx2.lo adler32_avx2.lo"
             fi
 
             check_avx512_intrinsics
@@ -1495,8 +1495,8 @@ case "${ARCH}" in
                 CFLAGS="${CFLAGS} -DX86_SSE42_CMP_STR"
                 SFLAGS="${SFLAGS} -DX86_SSE42_CMP_STR"
 
-                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} compare258_sse42.o"
-                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} compare258_sse42.lo"
+                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} compare256_sse42.o"
+                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} compare256_sse42.lo"
             fi
 
             check_sse2_intrinsics
diff --git a/deflate_quick.c b/deflate_quick.c
index 8b9839716..a214a7fe1 100644
--- a/deflate_quick.c
+++ b/deflate_quick.c
@@ -89,18 +89,23 @@ Z_INTERNAL block_state deflate_quick(deflate_state *s, int flush) {
 
             dist = (int64_t)s->strstart - hash_head;
             if (dist <= MAX_DIST(s) && dist > 0) {
-                match_len = functable.compare258(s->window + s->strstart, s->window + hash_head);
+                const uint8_t *str_start = s->window + s->strstart;
+                const uint8_t *match_start = s->window + hash_head;
 
-                if (match_len >= WANT_MIN_MATCH) {
-                    if (UNLIKELY(match_len > s->lookahead))
-                        match_len = s->lookahead;
+                if (*(uint16_t *)str_start == *(uint16_t *)match_start) {
+                    match_len = functable.compare256(str_start+2, match_start+2) + 2;
 
-                    check_match(s, s->strstart, hash_head, match_len);
+                    if (match_len >= WANT_MIN_MATCH) {
+                        if (UNLIKELY(match_len > s->lookahead))
+                            match_len = s->lookahead;
 
-                    zng_tr_emit_dist(s, static_ltree, static_dtree, match_len - STD_MIN_MATCH, (uint32_t)dist);
-                    s->lookahead -= match_len;
-                    s->strstart += match_len;
-                    continue;
+                        check_match(s, s->strstart, hash_head, match_len);
+
+                        zng_tr_emit_dist(s, static_ltree, static_dtree, match_len - STD_MIN_MATCH, (uint32_t)dist);
+                        s->lookahead -= match_len;
+                        s->strstart += match_len;
+                        continue;
+                    }
                 }
             }
         }
diff --git a/functable.c b/functable.c
index 1c113c8db..3cfa13ade 100644
--- a/functable.c
+++ b/functable.c
@@ -144,19 +144,19 @@ extern uint32_t crc32_power8(uint32_t, const unsigned char *, uint64_t);
 extern uint32_t s390_crc32_vx(uint32_t, const unsigned char *, uint64_t);
 #endif
 
-/* compare258 */
-extern uint32_t compare258_c(const unsigned char *src0, const unsigned char *src1);
+/* compare256 */
+extern uint32_t compare256_c(const unsigned char *src0, const unsigned char *src1);
 #ifdef UNALIGNED_OK
-extern uint32_t compare258_unaligned_16(const unsigned char *src0, const unsigned char *src1);
-extern uint32_t compare258_unaligned_32(const unsigned char *src0, const unsigned char *src1);
+extern uint32_t compare256_unaligned_16(const unsigned char *src0, const unsigned char *src1);
+extern uint32_t compare256_unaligned_32(const unsigned char *src0, const unsigned char *src1);
 #ifdef UNALIGNED64_OK
-extern uint32_t compare258_unaligned_64(const unsigned char *src0, const unsigned char *src1);
+extern uint32_t compare256_unaligned_64(const unsigned char *src0, const unsigned char *src1);
 #endif
 #ifdef X86_SSE42_CMP_STR
-extern uint32_t compare258_unaligned_sse4(const unsigned char *src0, const unsigned char *src1);
+extern uint32_t compare256_unaligned_sse4(const unsigned char *src0, const unsigned char *src1);
 #endif
 #if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
-extern uint32_t compare258_unaligned_avx2(const unsigned char *src0, const unsigned char *src1);
+extern uint32_t compare256_unaligned_avx2(const unsigned char *src0, const unsigned char *src1);
 #endif
 #endif
 
@@ -544,29 +544,29 @@ Z_INTERNAL uint32_t crc32_stub(uint32_t crc, const unsigned char *buf, uint64_t
     return functable.crc32(crc, buf, len);
 }
 
-Z_INTERNAL uint32_t compare258_stub(const unsigned char *src0, const unsigned char *src1) {
+Z_INTERNAL uint32_t compare256_stub(const unsigned char *src0, const unsigned char *src1) {
 #ifdef UNALIGNED_OK
 # if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
-    functable.compare258 = &compare258_unaligned_64;
+    functable.compare256 = &compare256_unaligned_64;
 # elif defined(HAVE_BUILTIN_CTZ)
-    functable.compare258 = &compare258_unaligned_32;
+    functable.compare256 = &compare256_unaligned_32;
 # else
-    functable.compare258 = &compare258_unaligned_16;
+    functable.compare256 = &compare256_unaligned_16;
 # endif
 # ifdef X86_SSE42_CMP_STR
     if (x86_cpu_has_sse42)
-        functable.compare258 = &compare258_unaligned_sse4;
+        functable.compare256 = &compare256_unaligned_sse4;
 # endif
 # if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
     if (x86_cpu_has_avx2)
-        functable.compare258 = &compare258_unaligned_avx2;
+        functable.compare256 = &compare256_unaligned_avx2;
 # endif
 #else
-    functable.compare258 = &compare258_c;
+    functable.compare256 = &compare256_c;
 #endif
 
-    return functable.compare258(src0, src1);
+    return functable.compare256(src0, src1);
 }
 
 Z_INTERNAL uint32_t longest_match_stub(deflate_state *const s, Pos cur_match) {
@@ -630,7 +630,7 @@ Z_INTERNAL Z_TLS struct functable_s functable = {
     crc32_fold_copy_stub,
    crc32_fold_final_stub,
     slide_hash_stub,
-    compare258_stub,
+    compare256_stub,
     longest_match_stub,
     longest_match_slow_stub,
     chunksize_stub,
diff --git a/functable.h b/functable.h
index 36039fcb2..f029ed4d1 100644
--- a/functable.h
+++ b/functable.h
@@ -19,7 +19,7 @@ struct functable_s {
     void (* crc32_fold_copy) (crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
     uint32_t (* crc32_fold_final) (crc32_fold *crc);
     void (* slide_hash) (deflate_state *s);
-    uint32_t (* compare258) (const unsigned char *src0, const unsigned char *src1);
+    uint32_t (* compare256) (const unsigned char *src0, const unsigned char *src1);
     uint32_t (* longest_match) (deflate_state *const s, Pos cur_match);
     uint32_t (* longest_match_slow) (deflate_state *const s, Pos cur_match);
     uint32_t (* chunksize) (void);
diff --git a/win32/Makefile.a64 b/win32/Makefile.a64
index 580b5bfb1..82ca6c2dc 100644
--- a/win32/Makefile.a64
+++ b/win32/Makefile.a64
@@ -46,7 +46,7 @@ OBJS = \
         adler32.obj \
         armfeature.obj \
         chunkset.obj \
-        compare258.obj \
+        compare256.obj \
         compress.obj \
         crc32.obj \
         crc32_comb.obj \
diff --git a/win32/Makefile.arm b/win32/Makefile.arm
index 12b08dc67..ace50794e 100644
--- a/win32/Makefile.arm
+++ b/win32/Makefile.arm
@@ -49,7 +49,7 @@ OBJS = \
         adler32.obj \
         armfeature.obj \
         chunkset.obj \
-        compare258.obj \
+        compare256.obj \
         compress.obj \
         crc32.obj \
         crc32_comb.obj \
diff --git a/win32/Makefile.msc b/win32/Makefile.msc
index ff6c6963c..cfdfcca74 100644
--- a/win32/Makefile.msc
+++ b/win32/Makefile.msc
@@ -53,9 +53,9 @@ OBJS = \
         chunkset.obj \
         chunkset_avx.obj \
         chunkset_sse2.obj \
-        compare258.obj \
-        compare258_avx2.obj \
-        compare258_sse42.obj \
+        compare256.obj \
+        compare256_avx2.obj \
+        compare256_sse42.obj \
         compress.obj \
         crc32.obj \
         crc32_comb.obj \
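
Each removed compare258_* helper was only a 2-byte prefix check wrapped around the corresponding compare256_* core; deflate_quick now performs that check itself and calls functable.compare256 on the bytes after the first two. The sketch below illustrates that before/after shape with a simplified scalar stand-in for compare256; the *_sketch and *_shape names are illustrative only and are not part of zlib-ng.

/* Standalone sketch of the restructuring in this commit.
 * compare256_sketch is a plain scalar loop, not the optimized zlib-ng variants. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Count matching leading bytes, capped at 256 (like compare256_c_static). */
static uint32_t compare256_sketch(const unsigned char *src0, const unsigned char *src1) {
    uint32_t len = 0;
    while (len < 256 && src0[len] == src1[len])
        len++;
    return len;
}

/* Old shape: every compare258 wrapper repeated this 2-byte prefix check. */
static uint32_t compare258_old_shape(const unsigned char *src0, const unsigned char *src1) {
    uint16_t a, b;
    memcpy(&a, src0, 2);   /* the originals used unaligned loads; memcpy keeps the sketch portable */
    memcpy(&b, src1, 2);
    if (a != b)
        return (uint32_t)(src0[0] == src1[0]);
    return compare256_sketch(src0 + 2, src1 + 2) + 2;
}

/* New shape: the caller does the 2-byte check once, then uses plain compare256.
 * In deflate_quick a failed prefix check simply skips the match; returning 0
 * here stands in for that. */
static uint32_t match_len_new_shape(const unsigned char *str_start, const unsigned char *match_start) {
    uint16_t a, b;
    memcpy(&a, str_start, 2);
    memcpy(&b, match_start, 2);
    if (a != b)
        return 0;
    return compare256_sketch(str_start + 2, match_start + 2) + 2;
}

int main(void) {
    unsigned char buf0[260], buf1[260];
    memset(buf0, 'A', sizeof(buf0));
    memset(buf1, 'A', sizeof(buf1));
    buf1[10] = 'B';   /* force a mismatch at offset 10 */

    /* Both shapes report the same match length (10 here), which is why the
     * 2-byte check could be hoisted out of the per-arch compare functions. */
    printf("old shape: %u\n", (unsigned)compare258_old_shape(buf0, buf1));
    printf("new shape: %u\n", (unsigned)match_len_new_shape(buf0, buf1));
    return 0;
}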