option(WITH_AVX512VNNI "Build with AVX512 VNNI extensions" ON)
option(WITH_SSE2 "Build with SSE2" ON)
option(WITH_SSSE3 "Build with SSSE3" ON)
- option(WITH_SSE4 "Build with SSE4" ON)
+ option(WITH_SSE41 "Build with SSE41" ON)
+ option(WITH_SSE42 "Build with SSE42" ON)
option(WITH_PCLMULQDQ "Build with PCLMULQDQ" ON)
endif()
WITH_DFLTCC_INFLATE
WITH_CRC32_VX
WITH_AVX2 WITH_SSE2
- WITH_SSSE3 WITH_SSE4
+ WITH_SSSE3 WITH_SSE41
+ WITH_SSE42
WITH_PCLMULQDQ
WITH_ALTIVEC
WITH_POWER8
set(WITH_AVX512VNNI OFF)
endif()
endif()
- if(WITH_SSE4)
- check_sse4_intrinsics()
+ if(WITH_SSE41)
+ check_sse41_intrinsics()
+ if(HAVE_SSE41_INTRIN)
+ add_definitions(-DX86_SSE41 -DX86_SSE41_ADLER32)
+ set(SSE41_SRCS ${ARCHDIR}/adler32_sse41.c)
+ add_feature_info(SSE41_ADLER32 1 "Support SSE41-accelerated adler32, using \"${SSE41FLAG}\"")
+ list(APPEND ZLIB_ARCH_SRCS ${SSE41_SRCS})
+ set_property(SOURCE ${SSE41_SRCS} PROPERTY COMPILE_FLAGS "${SSE41FLAG} ${NOLTOFLAG}")
+ else()
+ set(WITH_SSE41 OFF)
+ endif()
+ endif()
+ if(WITH_SSE42)
+ check_sse42_intrinsics()
if(HAVE_SSE42CRC_INLINE_ASM OR HAVE_SSE42CRC_INTRIN)
add_definitions(-DX86_SSE42_CRC_HASH)
set(SSE42_SRCS ${ARCHDIR}/insert_string_sse.c)
- add_feature_info(SSE42_CRC 1 "Support SSE4.2 optimized CRC hash generation, using \"${SSE4FLAG}\"")
+ add_feature_info(SSE42_CRC 1 "Support SSE4.2 optimized CRC hash generation, using \"${SSE42FLAG}\"")
list(APPEND ZLIB_ARCH_SRCS ${SSE42_SRCS})
- set_property(SOURCE ${SSE42_SRCS} PROPERTY COMPILE_FLAGS "${SSE4FLAG} ${NOLTOFLAG}")
+ set_property(SOURCE ${SSE42_SRCS} PROPERTY COMPILE_FLAGS "${SSE42FLAG} ${NOLTOFLAG}")
if(HAVE_SSE42CRC_INTRIN)
add_definitions(-DX86_SSE42_CRC_INTRIN)
endif()
if(HAVE_SSE42CMPSTR_INTRIN)
add_definitions(-DX86_SSE42_CMP_STR)
set(SSE42_SRCS ${ARCHDIR}/compare258_sse.c)
- add_feature_info(SSE42_COMPARE258 1 "Support SSE4.2 optimized compare258, using \"${SSE4FLAG}\"")
+ add_feature_info(SSE42_COMPARE258 1 "Support SSE4.2 optimized compare258, using \"${SSE42FLAG}\"")
list(APPEND ZLIB_ARCH_SRCS ${SSE42_SRCS})
- set_property(SOURCE ${SSE42_SRCS} PROPERTY COMPILE_FLAGS "${SSE4FLAG} ${NOLTOFLAG}")
+ set_property(SOURCE ${SSE42_SRCS} PROPERTY COMPILE_FLAGS "${SSE42FLAG} ${NOLTOFLAG}")
endif()
if(NOT HAVE_SSE42CRC_INLINE_ASM AND NOT HAVE_SSE42CRC_INTRIN AND NOT HAVE_SSE42CMPSTR_INTRIN)
- set(WITH_SSE4 OFF)
+ set(WITH_SSE42 OFF)
set(WITH_SSSE3 OFF)
endif()
endif()
- if(WITH_PCLMULQDQ AND WITH_SSSE3 AND WITH_SSE4)
+ if(WITH_PCLMULQDQ AND WITH_SSSE3 AND WITH_SSE42)
check_pclmulqdq_intrinsics()
if(HAVE_PCLMULQDQ_INTRIN AND HAVE_SSSE3_INTRIN)
add_definitions(-DX86_PCLMULQDQ_CRC)
set(PCLMULQDQ_SRCS ${ARCHDIR}/crc32_fold_pclmulqdq.c)
- add_feature_info(PCLMUL_CRC 1 "Support CRC hash generation using PCLMULQDQ, using \"${SSSE3FLAG} ${SSE4FLAG} ${PCLMULFLAG}\"")
+ add_feature_info(PCLMUL_CRC 1 "Support CRC hash generation using PCLMULQDQ, using \"${SSSE3FLAG} ${SSE42FLAG} ${PCLMULFLAG}\"")
list(APPEND ZLIB_ARCH_SRCS ${PCLMULQDQ_SRCS})
- set_property(SOURCE ${PCLMULQDQ_SRCS} PROPERTY COMPILE_FLAGS "${SSSE3FLAG} ${SSE4FLAG} ${PCLMULFLAG} ${NOLTOFLAG}")
+ set_property(SOURCE ${PCLMULQDQ_SRCS} PROPERTY COMPILE_FLAGS "${SSSE3FLAG} ${SSE42FLAG} ${PCLMULFLAG} ${NOLTOFLAG}")
else()
set(WITH_PCLMULQDQ OFF)
endif()
add_feature_info(WITH_AVX512VNNI WITH_AVX512VNNI "Build with AVX512 VNNI")
add_feature_info(WITH_SSE2 WITH_SSE2 "Build with SSE2")
add_feature_info(WITH_SSSE3 WITH_SSSE3 "Build with SSSE3")
- add_feature_info(WITH_SSE4 WITH_SSE4 "Build with SSE4")
+ add_feature_info(WITH_SSE41 WITH_SSE41 "Build with SSE41")
+ add_feature_info(WITH_SSE42 WITH_SSE42 "Build with SSE42")
add_feature_info(WITH_PCLMULQDQ WITH_PCLMULQDQ "Build with PCLMULQDQ")
endif()
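With the option split above, the two instruction-set levels become independent cache variables, so either can be disabled on its own at configure time; for example, configuring with cmake -DWITH_SSE41=OFF . turns off only the SSE4.1 Adler-32 path while the SSE4.2 CRC paths remain eligible.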
| WITH_AVX512 | | Build with AVX512 intrinsics | ON |
| WITH_AVX512VNNI | | Build with AVX512VNNI intrinsics | ON |
| WITH_SSE2 | | Build with SSE2 intrinsics | ON |
-| WITH_SSE4 | | Build with SSE4 intrinsics | ON |
+| WITH_SSE41 | | Build with SSE41 intrinsics | ON |
+| WITH_SSE42 | | Build with SSE42 intrinsics | ON |
| WITH_PCLMULQDQ | | Build with PCLMULQDQ intrinsics | ON |
| WITH_ACLE | --without-acle | Build with ACLE intrinsics | ON |
| WITH_NEON | --without-neon | Build with NEON intrinsics | ON |
AVX2FLAG=-mavx2
SSE2FLAG=-msse2
SSSE3FLAG=-mssse3
-SSE4FLAG=-msse4
+SSE41FLAG=-msse4.1
+SSE42FLAG=-msse4.2
PCLMULFLAG=-mpclmul
NOLTOFLAG=
adler32_avx.o adler32_avx.lo \
adler32_avx512.o adler32_avx512.lo \
adler32_avx512_vnni.o adler32_avx512_vnni.lo \
+ adler32_sse41.o adler32_sse41.lo \
adler32_ssse3.o adler32_ssse3.lo \
chunkset_avx.o chunkset_avx.lo \
chunkset_sse.o chunkset_sse.lo \
$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare258_avx.c
compare258_sse.o:
- $(CC) $(CFLAGS) $(SSE4FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare258_sse.c
+ $(CC) $(CFLAGS) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare258_sse.c
compare258_sse.lo:
- $(CC) $(SFLAGS) $(SSE4FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare258_sse.c
+ $(CC) $(SFLAGS) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare258_sse.c
insert_string_sse.o:
- $(CC) $(CFLAGS) $(SSE4FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse.c
+ $(CC) $(CFLAGS) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse.c
insert_string_sse.lo:
- $(CC) $(SFLAGS) $(SSE4FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse.c
+ $(CC) $(SFLAGS) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse.c
crc32_fold_pclmulqdq.o:
- $(CC) $(CFLAGS) $(PCLMULFLAG) $(SSE4FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_pclmulqdq.c
+ $(CC) $(CFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_pclmulqdq.c
crc32_fold_pclmulqdq.lo:
- $(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE4FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_pclmulqdq.c
+ $(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_pclmulqdq.c
slide_hash_avx.o:
$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_avx.c
adler32_avx512_vnni.lo: $(SRCDIR)/adler32_avx512_vnni.c
$(CC) $(SFLAGS) $(AVX512VNNIFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512_vnni.c
+adler32_sse41.o: $(SRCDIR)/adler32_sse41.c
+ $(CC) $(CFLAGS) $(SSE41FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_sse41.c
+
+adler32_sse41.lo: $(SRCDIR)/adler32_sse41.c
+	$(CC) $(SFLAGS) $(SSE41FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_sse41.c
+
adler32_ssse3.o: $(SRCDIR)/adler32_ssse3.c
$(CC) $(CFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c
--- /dev/null
+/* adler32_sse41.c -- compute the Adler-32 checksum of a data stream
+ * Copyright (C) 1995-2011 Mark Adler
+ * Authors:
+ * Adam Stylinski <kungfujesus06@gmail.com>
+ * Brian Bockelman <bockelman@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "../../zbuild.h"
+#include "../../zutil.h"
+
+#include "../../adler32_p.h"
+
+#ifdef X86_SSE41_ADLER32
+
+#include <immintrin.h>
+
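+/* _mm_sad_epu8 deposits its two partial sums in 32-bit lanes 0 and 2, and
+ * vs1 is seeded via _mm_cvtsi32_si128 (lane 0 only), so vs1 never holds data
+ * outside those lanes: one add of the upper half replaces a full reduction. */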
+static inline uint32_t partial_hsum(__m128i x) {
+ __m128i second_int = _mm_bsrli_si128(x, 8);
+ __m128i sum = _mm_add_epi32(x, second_int);
+ return _mm_cvtsi128_si32(sum);
+}
+
+static inline uint32_t hsum(__m128i x) {
+ __m128i sum1 = _mm_unpackhi_epi64(x, x);
+ __m128i sum2 = _mm_add_epi32(x, sum1);
+ __m128i sum3 = _mm_shuffle_epi32(sum2, 0x01);
+ __m128i sum4 = _mm_add_epi32(sum2, sum3);
+ return _mm_cvtsi128_si32(sum4);
+}
+
+Z_INTERNAL uint32_t adler32_sse41(uint32_t adler, const unsigned char *buf, size_t len) {
+ uint32_t sum2;
+
+ /* split Adler-32 into component sums */
+ sum2 = (adler >> 16) & 0xffff;
+ adler &= 0xffff;
+
+ /* in case user likes doing a byte at a time, keep it fast */
+ if (UNLIKELY(len == 1))
+ return adler32_len_1(adler, buf, sum2);
+
+ /* initial Adler-32 value (deferred check for len == 1 speed) */
+ if (UNLIKELY(buf == NULL))
+ return 1L;
+
+ /* in case short lengths are provided, keep it somewhat fast */
+ if (UNLIKELY(len < 16))
+ return adler32_len_16(adler, buf, len, sum2);
+
+ const __m128i dot2v = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
+ const __m128i dot3v = _mm_set1_epi16(1);
+ const __m128i zero = _mm_setzero_si128();
+
+ __m128i vs1 = _mm_cvtsi32_si128(adler);
+ __m128i vs2 = _mm_cvtsi32_si128(sum2);
+
+ while (len >= 16) {
+ __m128i vs1_0 = vs1;
+ __m128i vs3 = _mm_setzero_si128();
+
+ int k = (len < NMAX ? (int)len : NMAX);
+ k -= k % 16;
+ len -= k;
+
+ /* Aligned version of the loop */
+ if (((uintptr_t)buf & 15) == 0) {
+ while (k >= 16) {
+ /*
+ vs1 = adler + sum(c[i])
+ vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
+ */
+ __m128i vbuf = _mm_load_si128((__m128i*)buf);
+ buf += 16;
+ k -= 16;
+
+ __m128i v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
+ vs1 = _mm_add_epi32(v_sad_sum1, vs1);
+ vs3 = _mm_add_epi32(vs1_0, vs3);
+ __m128i v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v);
+ __m128i vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
+ vs2 = _mm_add_epi32(vsum2, vs2);
+ vs1_0 = vs1;
+ }
+ } else {
+ while (k >= 16) {
+ __m128i vbuf = _mm_loadu_si128((__m128i*)buf);
+ buf += 16;
+ k -= 16;
+
+ __m128i v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
+ vs1 = _mm_add_epi32(v_sad_sum1, vs1);
+ vs3 = _mm_add_epi32(vs1_0, vs3);
+ __m128i v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v);
+ __m128i vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
+ vs2 = _mm_add_epi32(vsum2, vs2);
+ vs1_0 = vs1;
+ }
+ }
+
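+        /* vs3 accumulated one copy of vs1 per 16-byte block, so shifting it
+         * left by 4 multiplies by 16, producing the deferred 16*vs1 terms of
+         * the recurrence (see the comment at the top of the loop). */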
+ vs3 = _mm_slli_epi32(vs3, 4);
+ vs2 = _mm_add_epi32(vs2, vs3);
+
+ /* We don't actually need to do a full horizontal sum, since psadbw is actually doing
+ * a partial reduction sum implicitly and only summing to integers in vector positions
+ * 0 and 2. This saves us some contention on the shuffle port(s) */
+ adler = partial_hsum(vs1) % BASE;
+ sum2 = hsum(vs2) % BASE;
+
+ vs1 = _mm_cvtsi32_si128(adler);
+ vs2 = _mm_cvtsi32_si128(sum2);
+ }
+
+ /* Process tail (len < 16). */
+ return adler32_len_16(adler, buf, len, sum2);
+}
+
+#endif
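For reference, the kernel above computes the same value as the textbook scalar Adler-32 recurrence: s1 sums the bytes and s2 sums the running s1 values, both modulo 65521 (BASE, the largest prime below 2^16). The vector code simply defers the modulo until the 32-bit accumulators approach overflow (the NMAX bound). A minimal scalar sketch, independent of zlib-ng's internal helpers:

/* Textbook scalar Adler-32, for comparison only -- not zlib-ng's tuned
 * scalar path. Reducing modulo BASE on every byte is simple but slow;
 * the SIMD kernel instead batches up to NMAX bytes between reductions. */
#include <stddef.h>
#include <stdint.h>

#define ADLER_BASE 65521u   /* largest prime smaller than 65536 */

static uint32_t adler32_scalar(uint32_t adler, const unsigned char *buf, size_t len) {
    uint32_t s1 = adler & 0xffff;          /* running byte sum */
    uint32_t s2 = (adler >> 16) & 0xffff;  /* running sum of s1 values */
    while (len--) {
        s1 = (s1 + *buf++) % ADLER_BASE;
        s2 = (s2 + s1) % ADLER_BASE;
    }
    return (s2 << 16) | s1;
}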
CFLAGS="${CFLAGS} -DX86_SSE41_ADLER32"
SFLAGS="${SFLAGS} -DX86_SSE41_ADLER32"
ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_sse41.o"
- ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_sse41.o"
+ ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_sse41.lo"
fi
check_sse42_intrinsics
#ifdef PPC_VMX_ADLER32
extern uint32_t adler32_vmx(uint32_t adler, const unsigned char *buf, size_t len);
#endif
+#ifdef X86_SSE41_ADLER32
+extern uint32_t adler32_sse41(uint32_t adler, const unsigned char *buf, size_t len);
+#endif
#ifdef X86_SSSE3_ADLER32
extern uint32_t adler32_ssse3(uint32_t adler, const unsigned char *buf, size_t len);
#endif
if (x86_cpu_has_ssse3)
functable.adler32 = &adler32_ssse3;
#endif
+#ifdef X86_SSE41_ADLER32
+ if (x86_cpu_has_sse41)
+ functable.adler32 = &adler32_sse41;
+#endif
#ifdef X86_AVX2_ADLER32
if (x86_cpu_has_avx2)
functable.adler32 = &adler32_avx2;
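The new functable hook only fires when runtime detection sets x86_cpu_has_sse41. As a rough illustration of where such a flag comes from (zlib-ng's actual detection lives in its x86 feature-check module, which is not part of this diff): CPUID leaf 1 reports SSE4.1 in ECX bit 19 and SSE4.2 in ECX bit 20.

/* Hypothetical stand-alone SSE4.1 check for GCC/Clang targets; a sketch
 * only, not zlib-ng's real x86_cpu_has_sse41 initialization. */
#include <cpuid.h>

static int cpu_has_sse41(void) {
    unsigned int eax, ebx, ecx, edx;
    if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
        return 0;                 /* CPUID leaf 1 unsupported */
    return (ecx >> 19) & 1;       /* ECX bit 19 = SSE4.1 (bit 20 = SSE4.2) */
}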