From: Nathan Moinvaziri
Date: Thu, 18 Dec 2025 00:35:18 +0000 (-0800)
Subject: Add missing adler32_copy_ssse3 implementation
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=0dfd7c0acbd463fc2d083756afb97a5e3d84e9ec;p=thirdparty%2Fzlib-ng.git

Add missing adler32_copy_ssse3 implementation
---

diff --git a/arch/x86/adler32_ssse3.c b/arch/x86/adler32_ssse3.c
index 7dab9b497..7c1dc84c9 100644
--- a/arch/x86/adler32_ssse3.c
+++ b/arch/x86/adler32_ssse3.c
@@ -14,7 +14,7 @@
 
 #include <immintrin.h>
 
-Z_INTERNAL uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len) {
+static inline uint32_t adler32_impl(uint32_t adler, const uint8_t *buf, size_t len) {
     uint32_t sum2;
 
     /* split Adler-32 into component sums */
@@ -153,4 +153,14 @@ unaligned_jmp:
     return adler32_copy_len_16(adler, NULL, buf, len, sum2, 0);
 }
 
+Z_INTERNAL uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len) {
+    return adler32_impl(adler, buf, len);
+}
+
+/* SSSE3 unaligned stores have a huge penalty, so we use memcpy. */
+Z_INTERNAL uint32_t adler32_copy_ssse3(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+    adler = adler32_impl(adler, src, len);
+    memcpy(dst, src, len);
+    return adler;
+}
 #endif
diff --git a/arch/x86/x86_functions.h b/arch/x86/x86_functions.h
index 0f9aa1824..1c197e849 100644
--- a/arch/x86/x86_functions.h
+++ b/arch/x86/x86_functions.h
@@ -33,6 +33,7 @@ uint8_t* chunkmemset_safe_sse2(uint8_t *out, uint8_t *from, unsigned len, unsign
 
 #ifdef X86_SSSE3
 uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len);
+uint32_t adler32_copy_ssse3(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
 uint8_t* chunkmemset_safe_ssse3(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
 void inflate_fast_ssse3(PREFIX3(stream) *strm, uint32_t start);
 #endif
@@ -110,6 +111,8 @@ uint32_t crc32_copy_vpclmulqdq(uint32_t crc, uint8_t *dst, const uint8_t *src, s
 # if defined(X86_SSSE3) && defined(__SSSE3__)
 #  undef native_adler32
 #  define native_adler32 adler32_ssse3
+#  undef native_adler32_copy
+#  define native_adler32_copy adler32_copy_ssse3
 #  undef native_chunkmemset_safe
 #  define native_chunkmemset_safe chunkmemset_safe_ssse3
 #  undef native_inflate_fast
diff --git a/functable.c b/functable.c
index 108575191..758a33553 100644
--- a/functable.c
+++ b/functable.c
@@ -135,6 +135,7 @@ static int init_functable(void) {
 #ifdef X86_SSSE3
     if (cf.x86.has_ssse3) {
         ft.adler32 = &adler32_ssse3;
+        ft.adler32_copy = &adler32_copy_ssse3;
         ft.chunkmemset_safe = &chunkmemset_safe_ssse3;
         ft.inflate_fast = &inflate_fast_ssse3;
     }
diff --git a/test/benchmarks/benchmark_adler32_copy.cc b/test/benchmarks/benchmark_adler32_copy.cc
index 05b1f0fac..c506fccd8 100644
--- a/test/benchmarks/benchmark_adler32_copy.cc
+++ b/test/benchmarks/benchmark_adler32_copy.cc
@@ -108,7 +108,9 @@ BENCHMARK_ADLER32_BASELINE_COPY(power8, adler32_power8, test_cpu_features.power.
 //BENCHMARK_ADLER32_COPY(rvv, adler32_rvv, test_cpu_features.riscv.has_rvv);
 BENCHMARK_ADLER32_BASELINE_COPY(rvv, adler32_rvv, test_cpu_features.riscv.has_rvv);
 #endif
-
+#ifdef X86_SSSE3
+BENCHMARK_ADLER32_COPY(ssse3, adler32_copy_ssse3, test_cpu_features.x86.has_ssse3);
+#endif
 #ifdef X86_SSE42
 BENCHMARK_ADLER32_BASELINE_COPY(sse42_baseline, adler32_ssse3, test_cpu_features.x86.has_ssse3);
 BENCHMARK_ADLER32_COPY(sse42, adler32_copy_sse42, test_cpu_features.x86.has_sse42);
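
Below is an illustrative caller sketch, not part of the patch above. It assumes in-tree code that can see the declaration the patch adds to arch/x86/x86_functions.h and a CPU with SSSE3 support; normal zlib-ng code reaches this entry point through the functable / native_adler32_copy dispatch rather than by calling the SSSE3 symbol directly, and the helper name checksum_and_copy is hypothetical.

    #include <stdint.h>
    #include <stddef.h>

    /* Declaration added by the patch in arch/x86/x86_functions.h (Z_INTERNAL, in-tree only). */
    uint32_t adler32_copy_ssse3(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);

    /* Copy len bytes from src to dst and return the Adler-32 of those bytes in one call. */
    static uint32_t checksum_and_copy(uint8_t *dst, const uint8_t *src, size_t len) {
        uint32_t adler = 1;   /* Adler-32 checksums start from 1 (RFC 1950) */
        return adler32_copy_ssse3(adler, dst, src, len);
    }

Internally, per the comment in the patch, the checksum is computed with the existing SSSE3 kernel and the copy is done with a separate memcpy, because SSSE3 unaligned stores carry a large penalty.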