if(WITH_SSE42)
check_sse42_intrinsics()
if(HAVE_SSE42CRC_INLINE_ASM OR HAVE_SSE42CRC_INTRIN)
- add_definitions(-DX86_SSE42_CRC_HASH)
- set(SSE42_SRCS ${ARCHDIR}/insert_string_sse42.c)
+ add_definitions(-DX86_SSE42_CRC_HASH -DX86_SSE42_ADLER32)
+ set(SSE42_SRCS ${ARCHDIR}/adler32_sse42.c ${ARCHDIR}/insert_string_sse42.c)
add_feature_info(SSE42_CRC 1 "Support SSE4.2 optimized CRC hash generation, using \"${SSE42FLAG}\"")
list(APPEND ZLIB_ARCH_SRCS ${SSE42_SRCS})
set_property(SOURCE ${SSE42_SRCS} PROPERTY COMPILE_FLAGS "${SSE42FLAG} ${NOLTOFLAG}")
typedef struct adler32_fold_s {
uint8_t adler[64]; // First half of component sums
uint8_t sum2[64]; // Second half of component sums
- uint32_t nsums; // The number of scalar sums performed
+ uint8_t leftover[16]; // A buffer for sub 16 sized carry over, sized for full loads and alignment
+ uint32_t nsums; // The number of scalar sums leftover
+ uint32_t bytes_leftover; // The number of leftover bytes from the previous sum
} adler32_fold;
Z_INTERNAL void adler32_fold_reset_c(adler32_fold *adler, uint32_t init_adler);
return adler | (sum2 << 16);
}
+static inline uint32_t adler32_copy_len_16(uint32_t adler, const unsigned char *buf, uint8_t *dst, size_t len, uint32_t sum2) {
+ while (len--) {
+ *dst = *buf++;
+ adler += *dst++;
+ sum2 += adler;
+ }
+ adler %= BASE;
+ sum2 %= BASE; /* only added so many BASE's */
+ /* return recombined sums */
+ return adler | (sum2 << 16);
+}
+
static inline uint32_t adler32_len_16(uint32_t adler, const unsigned char *buf, size_t len, uint32_t sum2) {
while (len) {
--len;
adler32_avx2.o adler32_avx2.lo \
adler32_avx512.o adler32_avx512.lo \
adler32_avx512_vnni.o adler32_avx512_vnni.lo \
+ adler32_sse42.o adler32_sse42.lo \
adler32_ssse3.o adler32_ssse3.lo \
chunkset_avx.o chunkset_avx.lo \
chunkset_sse2.o chunkset_sse2.lo \
$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx2.c
adler32_avx2.lo: $(SRCDIR)/adler32_avx2.c
- $(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx2.c
+ $(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx2.c
adler32_avx512.o: $(SRCDIR)/adler32_avx512.c
$(CC) $(CFLAGS) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512.c
adler32_avx512.lo: $(SRCDIR)/adler32_avx512.c
- $(CC) $(SFLAGS) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512.c
+ $(CC) $(SFLAGS) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512.c
adler32_avx512_vnni.o: $(SRCDIR)/adler32_avx512_vnni.c
$(CC) $(CFLAGS) $(AVX512VNNIFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512_vnni.c
adler32_avx512_vnni.lo: $(SRCDIR)/adler32_avx512_vnni.c
- $(CC) $(SFLAGS) $(AVX512VNNIFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512_vnni.c
+ $(CC) $(SFLAGS) $(AVX512VNNIFLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512_vnni.c
adler32_ssse3.o: $(SRCDIR)/adler32_ssse3.c
$(CC) $(CFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c
adler32_ssse3.lo: $(SRCDIR)/adler32_ssse3.c
- $(CC) $(SFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c
+ $(CC) $(SFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c
+
+adler32_sse42.o: $(SRCDIR)/adler32_sse42.c
+ $(CC) $(CFLAGS) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_sse42.c
+
+adler32_sse42.lo: $(SRCDIR)/adler32_sse42.c
+ $(CC) $(SFLAGS) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_sse42.c
mostlyclean: clean
clean:
--- /dev/null
+/* adler32_sse42.c -- compute the Adler-32 checksum of a data stream
+ * Copyright (C) 1995-2011 Mark Adler
+ * Authors:
+ * Adam Stylinski <kungfujesus06@gmail.com>
+ * Brian Bockelman <bockelman@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "../../zbuild.h"
+#include "../../adler32_p.h"
+#include "../../adler32_fold.h"
+#include "adler32_ssse3_p.h"
+#include <immintrin.h>
+
+#ifdef X86_SSE42_ADLER32
+
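+/* The SSE4.2 fold keeps the running checksum packed in nsums (s1 | s2 << 16),
+ * so reset and final reduce to a simple store and load */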
+Z_INTERNAL void adler32_fold_reset_sse42(adler32_fold *adler, uint32_t init_adler) {
+ adler->nsums = init_adler;
+}
+
+Z_INTERNAL uint32_t adler32_fold_final_sse42(adler32_fold *adler) {
+ return adler->nsums;
+}
+
+#include "adler32_sse42_tpl.h"
+#undef ADLER32_SSE42_TPL_H_
+#define COPY
+#include "adler32_sse42_tpl.h"
+#undef COPY
+
+#endif
--- /dev/null
+/* adler32_sse42_tpl.h -- adler32 sse4.2 vectorized function templates
+ * Copyright (C) 2022 Adam Stylinski
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef ADLER32_SSE42_TPL_H_
+#define ADLER32_SSE42_TPL_H_
+
+#include "../../zbuild.h"
+#include <immintrin.h>
+#include "../../adler32_fold.h"
+#include "../../adler32_p.h"
+#include "adler32_ssse3_p.h"
+
+#ifdef COPY
+Z_INTERNAL void adler32_fold_copy_sse42(adler32_fold *adler, uint8_t *dst, const uint8_t *src, size_t len) {
+#else
+Z_INTERNAL void adler32_fold_sse42(adler32_fold *adler, const uint8_t *src, size_t len) {
+#endif
+
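+ /* split Adler-32 into component sums */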
+ uint32_t adler0, adler1;
+ adler1 = (adler->nsums >> 16) & 0xffff;
+ adler0 = adler->nsums & 0xffff;
+
+ if (len < 16) {
+rem_peel:
+#ifdef COPY
+ adler->nsums = adler32_copy_len_16(adler0, src, dst, len, adler1);
+#else
+ adler->nsums = adler32_len_16(adler0, src, len, adler1);
+#endif
+ return;
+ }
+
+ __m128i vbuf, vbuf_0;
+ __m128i vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0,
+ v_sad_sum2, vsum2, vsum2_0;
+ __m128i zero = _mm_setzero_si128();
+ const __m128i dot2v = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
+ const __m128i dot2v_0 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
+ const __m128i dot3v = _mm_set1_epi16(1);
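+ /* dot2v and dot2v_0 hold the positional weights 32..17 and 16..1; combined with
+ * _mm_maddubs_epi16 and _mm_madd_epi16 they produce the weighted byte sums that
+ * feed the s2 term */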
+ size_t k;
+
+ while (len >= 16) {
+
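+ /* Process at most NMAX bytes per pass; NMAX is the largest count for which the
+ * deferred sums cannot overflow 32 bits before the modulo by BASE below */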
+ k = MIN(len, NMAX);
+ k -= k % 16;
+ len -= k;
+
+ vs1 = _mm_cvtsi32_si128(adler0);
+ vs2 = _mm_cvtsi32_si128(adler1);
+
+ vs3 = _mm_setzero_si128();
+ vs2_0 = _mm_setzero_si128();
+ vs1_0 = vs1;
+
+ while (k >= 32) {
+ /*
+ vs1 = adler + sum(c[i])
+ vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
+ */
+ vbuf = _mm_loadu_si128((__m128i*)src);
+ vbuf_0 = _mm_loadu_si128((__m128i*)(src + 16));
+ src += 32;
+ k -= 32;
+
+ v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
+ v_sad_sum2 = _mm_sad_epu8(vbuf_0, zero);
+#ifdef COPY
+ _mm_storeu_si128((__m128i*)dst, vbuf);
+ _mm_storeu_si128((__m128i*)(dst + 16), vbuf_0);
+ dst += 32;
+#endif
+ v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v);
+ v_short_sum2_0 = _mm_maddubs_epi16(vbuf_0, dot2v_0);
+
+ vs1 = _mm_add_epi32(v_sad_sum1, vs1);
+ vs3 = _mm_add_epi32(vs1_0, vs3);
+
+ vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
+ vsum2_0 = _mm_madd_epi16(v_short_sum2_0, dot3v);
+ vs1 = _mm_add_epi32(v_sad_sum2, vs1);
+ vs2 = _mm_add_epi32(vsum2, vs2);
+ vs2_0 = _mm_add_epi32(vsum2_0, vs2_0);
+ vs1_0 = vs1;
+ }
+
+ vs2 = _mm_add_epi32(vs2_0, vs2);
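+ /* vs3 accumulated one copy of the previous vs1 per 32-byte iteration; shifting
+ * left by 5 multiplies it by 32, folding the "32 * prior s1" term into s2 */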
+ vs3 = _mm_slli_epi32(vs3, 5);
+ vs2 = _mm_add_epi32(vs3, vs2);
+ vs3 = _mm_setzero_si128();
+
+ while (k >= 16) {
+ /*
+ vs1 = adler + sum(c[i])
+ vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
+ */
+ vbuf = _mm_loadu_si128((__m128i*)src);
+ src += 16;
+ k -= 16;
+
+ v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
+ v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v_0);
+
+ vs1 = _mm_add_epi32(v_sad_sum1, vs1);
+ vs3 = _mm_add_epi32(vs1_0, vs3);
+ vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
+ vs2 = _mm_add_epi32(vsum2, vs2);
+ vs1_0 = vs1;
+
+#ifdef COPY
+ _mm_storeu_si128((__m128i*)dst, vbuf);
+ dst += 16;
+#endif
+ }
+
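+ /* The 16-byte loop contributes 16 * prior s1 per iteration, hence the shift by 4 */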
+ vs3 = _mm_slli_epi32(vs3, 4);
+ vs2 = _mm_add_epi32(vs2, vs3);
+
+ adler0 = partial_hsum(vs1) % BASE;
+ adler1 = hsum(vs2) % BASE;
+ }
+
+ /* If this is true, there are fewer than 16 bytes remaining */
+ if (len) {
+ goto rem_peel;
+ }
+
+ adler->nsums = adler0 | (adler1 << 16);
+}
+
+#endif
#include "../../zbuild.h"
#include "../../adler32_p.h"
+#include "adler32_ssse3_p.h"
#ifdef X86_SSSE3_ADLER32
#include <immintrin.h>
-static inline uint32_t partial_hsum(__m128i x) {
- __m128i second_int = _mm_bsrli_si128(x, 8);
- __m128i sum = _mm_add_epi32(x, second_int);
- return _mm_cvtsi128_si32(sum);
-}
-
-static inline uint32_t hsum(__m128i x) {
- __m128i sum1 = _mm_unpackhi_epi64(x, x);
- __m128i sum2 = _mm_add_epi32(x, sum1);
- __m128i sum3 = _mm_shuffle_epi32(sum2, 0x01);
- __m128i sum4 = _mm_add_epi32(sum2, sum3);
- return _mm_cvtsi128_si32(sum4);
-}
-
Z_INTERNAL uint32_t adler32_ssse3(uint32_t adler, const unsigned char *buf, size_t len) {
uint32_t sum2;
--- /dev/null
+/* adler32_ssse3_p.h -- adler32 ssse3 utility functions
+ * Copyright (C) 2022 Adam Stylinski
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef ADLER32_SSSE3_P_H_
+#define ADLER32_SSSE3_P_H_
+
+#ifdef X86_SSSE3_ADLER32
+
+#include <immintrin.h>
+#include <stdint.h>
+
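+/* Sums only the 32-bit lanes 0 and 2; this is sufficient after _mm_sad_epu8, which
+ * leaves its byte sums in the low dword of each 64-bit half */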
+static inline uint32_t partial_hsum(__m128i x) {
+ __m128i second_int = _mm_bsrli_si128(x, 8);
+ __m128i sum = _mm_add_epi32(x, second_int);
+ return _mm_cvtsi128_si32(sum);
+}
+
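+/* Full horizontal sum of all four 32-bit lanes */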
+static inline uint32_t hsum(__m128i x) {
+ __m128i sum1 = _mm_unpackhi_epi64(x, x);
+ __m128i sum2 = _mm_add_epi32(x, sum1);
+ __m128i sum3 = _mm_shuffle_epi32(sum2, 0x01);
+ __m128i sum4 = _mm_add_epi32(sum2, sum3);
+ return _mm_cvtsi128_si32(sum4);
+}
+#endif
+
+#endif
--- /dev/null
+/* adler32_ssse3_tpl.h -- adler32 ssse3 vectorized function templates
+ * Copyright (C) 2022 Adam Stylinski
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef ADLER32_SSSE3_TPL_H_
+#define ADLER32_SSSE3_TPL_H_
+
+#include "../../zbuild.h"
+#include <immintrin.h>
+#include "../../adler32_fold.h"
+#include "../../adler32_p.h"
+#include "adler32_ssse3_p.h"
+
+#ifdef COPY
+Z_INTERNAL void adler32_fold_copy_ssse3(adler32_fold *adler, uint8_t *dst, const uint8_t *src, size_t len) {
+#else
+Z_INTERNAL void adler32_fold_ssse3(adler32_fold *adler, const uint8_t *src, size_t len) {
+#endif
+ uint32_t adler0, adler1;
+
+ /* split Adler-32 into component sums */
+ adler1 = (adler->nsums >> 16) & 0xffff;
+ adler0 = adler->nsums & 0xffff;
+
+ /* in case user likes doing a byte at a time, keep it fast */
+ if (UNLIKELY(len == 1)) {
+#ifdef COPY
+ *(dst++) = *src;
+#endif
+ adler->nsums = adler32_len_1(adler0, src, adler1);
+ return;
+ }
+
+ /* initial Adler-32 value (deferred check for len == 1 speed) */
+ if (UNLIKELY(src == NULL)) {
+ adler->nsums = 1L;
+ return;
+ }
+
+ /* in case short lengths are provided, keep it somewhat fast */
+ if (UNLIKELY(len < 16)) {
+ goto sub16;
+ }
+
+ const __m128i dot2v = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
+ const __m128i dot2v_0 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
+ const __m128i dot3v = _mm_set1_epi16(1);
+ const __m128i zero = _mm_setzero_si128();
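+ /* As in the SSE4.2 template above, dot2v/dot2v_0 carry the positional weights
+ * 32..17 and 16..1 used to build the s2 term */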
+
+ __m128i vbuf, vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0,
+ vbuf_0, v_sad_sum2, vsum2, vsum2_0;
+
+ /* If our buffer is unaligned (likely), decide whether there is enough data left
+ * to make the scalar, aligning additions worthwhile, or whether it is better to
+ * just eat the cost of a single unaligned load. The test is simple: if fewer than
+ * 16 bytes would remain after aligning (len < 16 + align_offset), take the
+ * unaligned path */
+ size_t max_iters = NMAX;
+ size_t rem = (uintptr_t)src & 15;
+ size_t align_offset = 16 - rem;
+ size_t k = 0;
+ if (rem) {
+ if (len < 16 + align_offset) {
+ /* Eat the cost of this one unaligned load so that we don't skip the
+ * vectorization entirely. One unaligned 16 byte load is better than
+ * doing 16 + up to 15 scalar sums */
+ vbuf = _mm_loadu_si128((__m128i*)src);
+ len -= 16;
+ src += 16;
+#ifdef COPY
+ _mm_storeu_si128((__m128i*)dst, vbuf);
+ dst += 16;
+#endif
+ vs1 = _mm_cvtsi32_si128(adler0);
+ vs2 = _mm_cvtsi32_si128(adler1);
+ vs3 = _mm_setzero_si128();
+ vs1_0 = vs1;
+ goto unaligned_jmp;
+ }
+
+#ifdef COPY
+ memcpy(dst, src, align_offset);
+ dst += align_offset;
+#endif
+ for (size_t i = 0; i < align_offset; ++i) {
+ adler0 += *(src++);
+ adler1 += adler0;
+ }
+
+ /* lop off the max number of sums based on the scalar sums done
+ * above */
+ len -= align_offset;
+ max_iters -= align_offset;
+ }
+
+
+ while (len >= 16) {
+ vs1 = _mm_cvtsi32_si128(adler0);
+ vs2 = _mm_cvtsi32_si128(adler1);
+ vs3 = _mm_setzero_si128();
+ vs2_0 = _mm_setzero_si128();
+ vs1_0 = vs1;
+
+ k = (len < max_iters ? len : max_iters);
+ k -= k % 16;
+ len -= k;
+
+ while (k >= 32) {
+ /*
+ vs1 = adler + sum(c[i])
+ vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
+ */
+ vbuf = _mm_load_si128((__m128i*)src);
+ vbuf_0 = _mm_load_si128((__m128i*)(src + 16));
+ src += 32;
+ k -= 32;
+
+ v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
+ v_sad_sum2 = _mm_sad_epu8(vbuf_0, zero);
+#ifdef COPY
+ _mm_storeu_si128((__m128i*)dst, vbuf);
+ _mm_storeu_si128((__m128i*)(dst + 16), vbuf_0);
+ dst += 32;
+#endif
+ vs1 = _mm_add_epi32(v_sad_sum1, vs1);
+ vs3 = _mm_add_epi32(vs1_0, vs3);
+
+ vs1 = _mm_add_epi32(v_sad_sum2, vs1);
+ v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v);
+ vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
+ v_short_sum2_0 = _mm_maddubs_epi16(vbuf_0, dot2v_0);
+ vs2 = _mm_add_epi32(vsum2, vs2);
+ vsum2_0 = _mm_madd_epi16(v_short_sum2_0, dot3v);
+ vs2_0 = _mm_add_epi32(vsum2_0, vs2_0);
+ vs1_0 = vs1;
+ }
+
+ vs2 = _mm_add_epi32(vs2_0, vs2);
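+ /* vs3 holds one copy of the previous vs1 per 32-byte iteration; the shift by 5
+ * multiplies it by 32 to fold the "32 * prior s1" term into s2 */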
+ vs3 = _mm_slli_epi32(vs3, 5);
+ vs2 = _mm_add_epi32(vs3, vs2);
+ vs3 = _mm_setzero_si128();
+
+ while (k >= 16) {
+ /*
+ vs1 = adler + sum(c[i])
+ vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
+ */
+ vbuf = _mm_load_si128((__m128i*)src);
+ src += 16;
+ k -= 16;
+
+unaligned_jmp:
+ v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
+#ifdef COPY
+ _mm_storeu_si128((__m128i*)dst, vbuf);
+ dst += 16;
+#endif
+ vs1 = _mm_add_epi32(v_sad_sum1, vs1);
+ vs3 = _mm_add_epi32(vs1_0, vs3);
+ v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v_0);
+ vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
+ vs2 = _mm_add_epi32(vsum2, vs2);
+ vs1_0 = vs1;
+ }
+
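+ /* Each 16-byte iteration contributes 16 * prior s1, hence the shift by 4 */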
+ vs3 = _mm_slli_epi32(vs3, 4);
+ vs2 = _mm_add_epi32(vs2, vs3);
+
+ /* We don't actually need to do a full horizontal sum, since psadbw is actually doing
+ * a partial reduction sum implicitly and only summing to integers in vector positions
+ * 0 and 2. This saves us some contention on the shuffle port(s) */
+ adler0 = partial_hsum(vs1) % BASE;
+ adler1 = hsum(vs2) % BASE;
+ max_iters = NMAX;
+ }
+
+sub16:
+ /* Process the tail (len < 16) */
+#ifdef COPY
+ adler->nsums = adler32_copy_len_16(adler0, src, dst, len, adler1);
+#else
+ adler->nsums = adler32_len_16(adler0, src, len, adler1);
+#endif
+}
+
+#endif
check_sse42_intrinsics
if test ${HAVE_SSE42CRC_INTRIN} -eq 1 || test ${HAVE_SSE42CRC_INLINE_ASM} -eq 1; then
- CFLAGS="${CFLAGS} -DX86_SSE42_CRC_HASH"
- SFLAGS="${SFLAGS} -DX86_SSE42_CRC_HASH"
+ CFLAGS="${CFLAGS} -DX86_SSE42_CRC_HASH -DX86_SSE42_ADLER32"
+ SFLAGS="${SFLAGS} -DX86_SSE42_CRC_HASH -DX86_SSE42_ADLER32"
if test ${HAVE_SSE42CRC_INTRIN} -eq 1; then
CFLAGS="${CFLAGS} -DX86_SSE42_CRC_INTRIN"
SFLAGS="${SFLAGS} -DX86_SSE42_CRC_INTRIN"
fi
- ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} insert_string_sse42.o"
- ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} insert_string_sse42.lo"
+ ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_sse42.o insert_string_sse42.o"
+ ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_sse42.lo insert_string_sse42.lo"
fi
check_sse2_intrinsics
#define CPU_FEATURES_H_
#include "crc32_fold.h"
+#include "adler32_fold.h"
#if defined(X86_FEATURES)
# include "arch/x86/x86_features.h"
#ifdef X86_SSSE3_ADLER32
extern uint32_t adler32_ssse3(uint32_t adler, const unsigned char *buf, size_t len);
#endif
+#ifdef X86_SSE42_ADLER32
+extern void adler32_fold_reset_sse42(adler32_fold *adler, uint32_t init_adler);
+extern void adler32_fold_copy_sse42(adler32_fold *adler, uint8_t *dst, const uint8_t *src, size_t len);
+extern void adler32_fold_sse42(adler32_fold *adler, const uint8_t *src, size_t len);
+extern uint32_t adler32_fold_final_sse42(adler32_fold *adler);
+#endif
#ifdef X86_AVX2_ADLER32
extern uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_t len);
#endif
strm->adler = functable.crc32_fold_reset(&s->crc_fold);
else
#endif
+ {
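+ /* Initialize both strm->adler and the fold state */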
strm->adler = ADLER32_INITIAL_VALUE;
+ functable.adler32_fold_reset(&s->adler_fold, ADLER32_INITIAL_VALUE);
+ }
s->last_flush = -2;
zng_tr_init(s);
if (s->strstart != 0)
put_uint32_msb(s, strm->adler);
strm->adler = ADLER32_INITIAL_VALUE;
+ functable.adler32_fold_reset(&s->adler_fold, ADLER32_INITIAL_VALUE);
s->status = BUSY_STATE;
/* Compression must start with an empty pending buffer */
put_uint32(s, (uint32_t)strm->total_in);
} else
#endif
- if (s->wrap == 1)
- put_uint32_msb(s, strm->adler);
+ {
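+ /* Pull the folded Adler-32 out of the fold state before writing the trailer */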
+ strm->adler = functable.adler32_fold_final(&s->adler_fold);
+ if (s->wrap == 1)
+ put_uint32_msb(s, strm->adler);
+ }
PREFIX(flush_pending)(strm);
/* If avail_out is zero, the application will call deflate again
* to flush the rest.
functable.crc32_fold_copy(&strm->state->crc_fold, buf, strm->next_in, len);
#endif
} else {
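+ /* For zlib-wrapped (wrap == 1) streams, fuse the Adler-32 update with the window copy */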
- memcpy(buf, strm->next_in, len);
if (strm->state->wrap == 1)
- strm->adler = functable.adler32(strm->adler, buf, len);
+ functable.adler32_fold_copy(&strm->state->adler_fold, buf, strm->next_in, len);
+ else
+ memcpy(buf, strm->next_in, len);
}
strm->next_in += len;
strm->total_in += len;
subject to change. Applications should only use zlib.h.
*/
+#include "adler32_fold.h"
#include "zutil.h"
#include "zendian.h"
#include "crc32_fold.h"
int nice_match; /* Stop searching when current match exceeds this */
+ struct adler32_fold_s ALIGNED_(64) adler_fold;
struct crc32_fold_s ALIGNED_(16) crc_fold;
/* used by trees.c: */
Z_INTERNAL void adler32_fold_reset_stub(adler32_fold *adler, uint32_t init_adler) {
functable.adler32_fold_reset = &adler32_fold_reset_c;
+#if (defined X86_SSE42_ADLER32) && !defined(X86_AVX2_ADLER32) && !defined(X86_AVX512_ADLER32) && !defined(X86_AVX512VNNI_ADLER32)
+ if (x86_cpu_has_sse42)
+ functable.adler32_fold_reset = &adler32_fold_reset_sse42;
+#endif
functable.adler32_fold_reset(adler, init_adler);
}
Z_INTERNAL void adler32_fold_copy_stub(adler32_fold *adler, uint8_t *dst, const uint8_t *src, size_t len) {
functable.adler32_fold_copy = &adler32_fold_copy_c;
+#if (defined X86_SSE42_ADLER32) && !defined(X86_AVX2_ADLER32) && !defined(X86_AVX512_ADLER32) && !defined(X86_AVX512VNNI_ADLER32)
+ if (x86_cpu_has_sse42)
+ functable.adler32_fold_copy = &adler32_fold_copy_sse42;
+#endif
functable.adler32_fold_copy(adler, dst, src, len);
}
Z_INTERNAL void adler32_fold_stub(adler32_fold *adler, const uint8_t *src, size_t len) {
functable.adler32_fold = &adler32_fold_c;
+#if (defined X86_SSE42_ADLER32) && !defined(X86_AVX2_ADLER32) && !defined(X86_AVX512_ADLER32) && !defined(X86_AVX512VNNI_ADLER32)
+ if (x86_cpu_has_sse42)
+ functable.adler32_fold = &adler32_fold_sse42;
+#endif
functable.adler32_fold(adler, src, len);
}
Z_INTERNAL uint32_t adler32_fold_final_stub(adler32_fold *adler) {
functable.adler32_fold_final = &adler32_fold_final_c;
+#if (defined X86_SSE42_ADLER32) && !defined(X86_AVX2_ADLER32) && !defined(X86_AVX512_ADLER32) && !defined(X86_AVX512VNNI_ADLER32)
+ if (x86_cpu_has_sse42)
+ functable.adler32_fold_final = &adler32_fold_final_sse42;
+#endif
return functable.adler32_fold_final(adler);
}
static inline void inf_chksum_cpy(PREFIX3(stream) *strm, uint8_t *dst,
const uint8_t *src, uint32_t copy) {
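+ /* Nothing to copy or checksum; skip zero-length folds */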
+ if (!copy) return;
struct inflate_state *state = (struct inflate_state*)strm->state;
#ifdef GUNZIP
if (state->flags) {
-DX86_FEATURES \
-DX86_PCLMULQDQ_CRC \
-DX86_SSE2 \
+ -DX86_SSE42_ADLER32 \
-DX86_SSE42_CRC_INTRIN \
-DX86_SSE42_CRC_HASH \
+ -DX86_SSSE3_ADLER32 \
-DX86_AVX2 \
+ -DX86_AVX2_ADLER32 \
-DX86_AVX_CHUNKSET \
-DX86_SSE2_CHUNKSET \
#
OBJS = \
adler32.obj \
+ adler32_avx2.obj \
+ adler32_avx512.obj \
+ adler32_avx512_vnni.obj \
+ adler32_sse42.obj \
+ adler32_ssse3.obj \
adler32_fold.obj \
chunkset.obj \
chunkset_avx.obj \
SRCDIR = $(TOP)
# Keep the dependences in sync with top-level Makefile.in
adler32.obj: $(SRCDIR)/adler32.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/functable.h $(SRCDIR)/adler32_p.h
+adler32_avx2.obj: $(SRCDIR)/arch/x86/adler32_avx2.c $(SRCDIR)/zbuild.h $(SRCDIR)/cpu_features.h $(SRCDIR)/adler32_p.h $(SRCDIR)/fallback_builtins.h
+adler32_avx512.obj: $(SRCDIR)/arch/x86/adler32_avx512.c $(SRCDIR)/zbuild.h $(SRCDIR)/cpu_features.h $(SRCDIR)/adler32_p.h $(SRCDIR)/arch/x86/adler32_avx512_p.h
+adler32_avx512_vnni.obj: $(SRCDIR)/arch/x86/adler32_avx512_vnni.c $(SRCDIR)/zbuild.h $(SRCDIR)/cpu_features.h $(SRCDIR)/adler32_p.h $(SRCDIR)/arch/x86/adler32_avx512_p.h
+adler32_sse42.obj: $(SRCDIR)/arch/x86/adler32_sse42.c $(SRCDIR)/zbuild.h $(SRCDIR)/cpu_features.h $(SRCDIR)/adler32_p.h $(SRCDIR)/adler32_fold.h \
+ $(SRCDIR)/arch/x86/adler32_ssse3_p.h $(SRCDIR)/arch/x86/adler32_sse42_tpl.h
+adler32_ssse3.obj: $(SRCDIR)/arch/x86/adler32_ssse3.c $(SRCDIR)/zbuild.h $(SRCDIR)/cpu_features.h $(SRCDIR)/adler32_p.h $(SRCDIR)/adler32_fold.h \
+ $(SRCDIR)/arch/x86/adler32_ssse3_p.h
adler32_fold.obj: $(SRCDIR)/adler32_fold.c $(SRCDIR)/zbuild.h $(SRCDIR)/adler32_fold.h $(SRCDIR)/functable.h
functable.obj: $(SRCDIR)/functable.c $(SRCDIR)/zbuild.h $(SRCDIR)/functable.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/zendian.h $(SRCDIR)/arch/x86/x86_features.h
gzlib.obj: $(SRCDIR)/gzlib.c $(SRCDIR)/zbuild.h $(SRCDIR)/gzguts.h $(SRCDIR)/zutil_p.h