+++ /dev/null
-/* adler32_sse41.c -- compute the Adler-32 checksum of a data stream
- * Copyright (C) 1995-2011 Mark Adler
- * Authors:
- * Adam Stylinski <kungfujesus06@gmail.com>
- * Brian Bockelman <bockelman@gmail.com>
- * For conditions of distribution and use, see copyright notice in zlib.h
- */
-
-#include "../../zbuild.h"
-#include "../../adler32_p.h"
-
-#ifdef X86_SSE41_ADLER32
-
-#include <immintrin.h>
-
-static inline uint32_t partial_hsum(__m128i x) {
- __m128i second_int = _mm_bsrli_si128(x, 8);
- __m128i sum = _mm_add_epi32(x, second_int);
- return _mm_cvtsi128_si32(sum);
-}
-
-static inline uint32_t hsum(__m128i x) {
- __m128i sum1 = _mm_unpackhi_epi64(x, x);
- __m128i sum2 = _mm_add_epi32(x, sum1);
- __m128i sum3 = _mm_shuffle_epi32(sum2, 0x01);
- __m128i sum4 = _mm_add_epi32(sum2, sum3);
- return _mm_cvtsi128_si32(sum4);
-}
-
-Z_INTERNAL uint32_t adler32_sse41(uint32_t adler, const unsigned char *buf, size_t len) {
- uint32_t sum2;
-
- /* split Adler-32 into component sums */
- sum2 = (adler >> 16) & 0xffff;
- adler &= 0xffff;
-
- /* in case user likes doing a byte at a time, keep it fast */
- if (UNLIKELY(len == 1))
- return adler32_len_1(adler, buf, sum2);
-
- /* initial Adler-32 value (deferred check for len == 1 speed) */
- if (UNLIKELY(buf == NULL))
- return 1L;
-
- /* in case short lengths are provided, keep it somewhat fast */
- if (UNLIKELY(len < 16))
- return adler32_len_16(adler, buf, len, sum2);
-
- const __m128i dot2v = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
- const __m128i dot2v_0 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
- const __m128i dot3v = _mm_set1_epi16(1);
- const __m128i zero = _mm_setzero_si128();
-
- __m128i vbuf, vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0,
- vbuf_0, v_sad_sum2, vsum2, vsum2_0;
-
- /* If our buffer is unaligned (likely), make the determination whether
- * or not there's enough of a buffer to consume to make the scalar, aligning
- * additions worthwhile or if it's worth it to just eat the cost of an unaligned
- * load. This is a pretty simple test, just test if 16 - the remainder + len is
- * < 16 */
- int max_iters = NMAX;
- int rem = (uintptr_t)buf & 15;
- int align_offset = 16 - rem;
- int k = 0;
- if (rem) {
- if (len < 16 + align_offset) {
- /* Let's eat the cost of this one unaligned load so that
- * we don't completely skip over the vectorization. Doing
- * 16 bytes at a time unaligned is is better than 16 + <= 15
- * sums */
- vbuf = _mm_loadu_si128((__m128i*)buf);
- len -= 16;
- buf += 16;
- vs1 = _mm_cvtsi32_si128(adler);
- vs2 = _mm_cvtsi32_si128(sum2);
- vs3 = _mm_setzero_si128();
- vs1_0 = vs1;
- goto unaligned_jmp;
- }
-
- for (int i = 0; i < align_offset; ++i) {
- adler += *(buf++);
- sum2 += adler;
- }
-
- /* lop off the max number of sums based on the scalar sums done
- * above */
- len -= align_offset;
- max_iters -= align_offset;
- }
-
-
- while (len >= 16) {
- vs1 = _mm_cvtsi32_si128(adler);
- vs2 = _mm_cvtsi32_si128(sum2);
- vs3 = _mm_setzero_si128();
- vs2_0 = _mm_setzero_si128();
- vs1_0 = vs1;
-
- k = (len < max_iters ? (int)len : max_iters);
- k -= k % 16;
- len -= k;
-
- while (k >= 32) {
- /*
- vs1 = adler + sum(c[i])
- vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
- */
- vbuf = _mm_load_si128((__m128i*)buf);
- vbuf_0 = _mm_load_si128((__m128i*)(buf + 16));
- buf += 32;
- k -= 32;
-
- v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
- v_sad_sum2 = _mm_sad_epu8(vbuf_0, zero);
- vs1 = _mm_add_epi32(v_sad_sum1, vs1);
- vs3 = _mm_add_epi32(vs1_0, vs3);
-
- vs1 = _mm_add_epi32(v_sad_sum2, vs1);
- v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v);
- vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
- v_short_sum2_0 = _mm_maddubs_epi16(vbuf_0, dot2v_0);
- vs2 = _mm_add_epi32(vsum2, vs2);
- vsum2_0 = _mm_madd_epi16(v_short_sum2_0, dot3v);
- vs2_0 = _mm_add_epi32(vsum2_0, vs2_0);
- vs1_0 = vs1;
- }
-
- vs2 = _mm_add_epi32(vs2_0, vs2);
- vs3 = _mm_slli_epi32(vs3, 5);
- vs2 = _mm_add_epi32(vs3, vs2);
- vs3 = _mm_setzero_si128();
-
- while (k >= 16) {
- /*
- vs1 = adler + sum(c[i])
- vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
- */
- vbuf = _mm_load_si128((__m128i*)buf);
- buf += 16;
- k -= 16;
-
-unaligned_jmp:
- v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
- vs1 = _mm_add_epi32(v_sad_sum1, vs1);
- vs3 = _mm_add_epi32(vs1_0, vs3);
- v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v_0);
- vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
- vs2 = _mm_add_epi32(vsum2, vs2);
- vs1_0 = vs1;
- }
-
- vs3 = _mm_slli_epi32(vs3, 4);
- vs2 = _mm_add_epi32(vs2, vs3);
-
- /* We don't actually need to do a full horizontal sum, since psadbw is actually doing
- * a partial reduction sum implicitly and only summing to integers in vector positions
- * 0 and 2. This saves us some contention on the shuffle port(s) */
- adler = partial_hsum(vs1) % BASE;
- sum2 = hsum(vs2) % BASE;
- max_iters = NMAX;
- }
-
- /* Process tail (len < 16). */
- return adler32_len_16(adler, buf, len, sum2);
-}
-
-#endif
-/* adler32.c -- compute the Adler-32 checksum of a data stream
+/* adler32_ssse3.c -- compute the Adler-32 checksum of a data stream
* Copyright (C) 1995-2011 Mark Adler
* Authors:
+ * Adam Stylinski <kungfujesus06@gmail.com>
* Brian Bockelman <bockelman@gmail.com>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include <immintrin.h>
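+/* Used for the vs1 accumulator: psadbw leaves its sums in 32-bit lanes 0 and 2
+ * (lanes 1 and 3 stay zero), so only those two lanes need to be added. */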
+static inline uint32_t partial_hsum(__m128i x) {
+ __m128i second_int = _mm_bsrli_si128(x, 8);
+ __m128i sum = _mm_add_epi32(x, second_int);
+ return _mm_cvtsi128_si32(sum);
+}
+
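+/* Full horizontal sum of all four 32-bit lanes; needed for vs2, whose partial
+ * sums can land in any lane. */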
+static inline uint32_t hsum(__m128i x) {
+ __m128i sum1 = _mm_unpackhi_epi64(x, x);
+ __m128i sum2 = _mm_add_epi32(x, sum1);
+ __m128i sum3 = _mm_shuffle_epi32(sum2, 0x01);
+ __m128i sum4 = _mm_add_epi32(sum2, sum3);
+ return _mm_cvtsi128_si32(sum4);
+}
+
Z_INTERNAL uint32_t adler32_ssse3(uint32_t adler, const unsigned char *buf, size_t len) {
uint32_t sum2;
if (UNLIKELY(len < 16))
return adler32_len_16(adler, buf, len, sum2);
- uint32_t ALIGNED_(16) s1[4], s2[4];
-
- s1[0] = s1[1] = s1[2] = 0; s1[3] = adler;
- s2[0] = s2[1] = s2[2] = 0; s2[3] = sum2;
-
- char ALIGNED_(16) dot1[16] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
- __m128i dot1v = _mm_load_si128((__m128i*)dot1);
- char ALIGNED_(16) dot2[16] = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
- __m128i dot2v = _mm_load_si128((__m128i*)dot2);
- short ALIGNED_(16) dot3[8] = {1, 1, 1, 1, 1, 1, 1, 1};
- __m128i dot3v = _mm_load_si128((__m128i*)dot3);
-
- // We will need to multiply by
- //char ALIGNED_(16) shift[4] = {0, 0, 0, 4}; //{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4};
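+ /* Weight vectors: dot2v holds 32..17 for the first half of a 32-byte block,
+ * dot2v_0 holds 16..1 for the second half (and for the 16-bytes-at-a-time
+ * loop); dot3v (all ones) lets madd_epi16 fold the 16-bit products into
+ * 32-bit lanes. */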
+ const __m128i dot2v = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
+ const __m128i dot2v_0 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
+ const __m128i dot3v = _mm_set1_epi16(1);
+ const __m128i zero = _mm_setzero_si128();
+
+ __m128i vbuf, vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0,
+ vbuf_0, v_sad_sum2, vsum2, vsum2_0;
+
+ /* If our buffer is unaligned (likely), decide whether there is enough data
+ * left to make the scalar, aligning additions worthwhile, or whether it is
+ * better to just eat the cost of an unaligned load. The test is simply
+ * whether len < 16 + align_offset, i.e. whether fewer than 16 bytes would
+ * remain once the buffer has been aligned. */
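+ /* For example (addresses purely illustrative): with buf % 16 == 9, rem = 9
+ * and align_offset = 7, so the aligning path is only taken when len >= 23;
+ * a len of 20 would instead use the single unaligned load below. */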
+ int max_iters = NMAX;
+ int rem = (uintptr_t)buf & 15;
+ int align_offset = 16 - rem;
+ int k = 0;
+ if (rem) {
+ if (len < 16 + align_offset) {
+ /* Let's eat the cost of this one unaligned load so that we don't
+ * completely skip over the vectorization: a single unaligned 16-byte
+ * load is better than summing the entire remainder one byte at a
+ * time. */
+ vbuf = _mm_loadu_si128((__m128i*)buf);
+ len -= 16;
+ buf += 16;
+ vs1 = _mm_cvtsi32_si128(adler);
+ vs2 = _mm_cvtsi32_si128(sum2);
+ vs3 = _mm_setzero_si128();
+ vs1_0 = vs1;
+ goto unaligned_jmp;
+ }
+
+ for (int i = 0; i < align_offset; ++i) {
+ adler += *(buf++);
+ sum2 += adler;
+ }
+
+ /* Shrink this round's byte budget by the scalar sums done above, so the
+ * running sums still cannot overflow before the next modulo. */
+ len -= align_offset;
+ max_iters -= align_offset;
+ }
- char ALIGNED_(16) shift[16] = {4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
- __m128i shiftv = _mm_load_si128((__m128i*)shift);
while (len >= 16) {
- __m128i vs1 = _mm_load_si128((__m128i*)s1);
- __m128i vs2 = _mm_load_si128((__m128i*)s2);
- __m128i vs1_0 = vs1;
-
- int k = (len < NMAX ? (int)len : NMAX);
- k -= k % 16;
- len -= k;
-
- while (k >= 16) {
- /*
- vs1 = adler + sum(c[i])
- vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
-
- NOTE: 256-bit equivalents are:
- _mm256_maddubs_epi16 <- operates on 32 bytes to 16 shorts
- _mm256_madd_epi16 <- Sums 16 shorts to 8 int32_t.
- We could rewrite the below to use 256-bit instructions instead of 128-bit.
- */
- __m128i vbuf = _mm_loadu_si128((__m128i*)buf);
- buf += 16;
- k -= 16;
-
- __m128i v_short_sum1 = _mm_maddubs_epi16(vbuf, dot1v); // multiply-add, resulting in 8 shorts.
- __m128i vsum1 = _mm_madd_epi16(v_short_sum1, dot3v); // sum 8 shorts to 4 int32_t;
- __m128i v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v);
- vs1 = _mm_add_epi32(vsum1, vs1);
- __m128i vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
- vs1_0 = _mm_sll_epi32(vs1_0, shiftv);
- vsum2 = _mm_add_epi32(vsum2, vs2);
- vs2 = _mm_add_epi32(vsum2, vs1_0);
- vs1_0 = vs1;
- }
-
- // At this point, we have partial sums stored in vs1 and vs2. There are AVX512 instructions that
- // would allow us to sum these quickly (VP4DPWSSD). For now, just unpack and move on.
-
- uint32_t ALIGNED_(16) s1_unpack[4];
- uint32_t ALIGNED_(16) s2_unpack[4];
-
- _mm_store_si128((__m128i*)s1_unpack, vs1);
- _mm_store_si128((__m128i*)s2_unpack, vs2);
-
- adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE);
- adler %= BASE;
- s1[3] = adler;
-
- sum2 = (s2_unpack[0] % BASE) + (s2_unpack[1] % BASE) + (s2_unpack[2] % BASE) + (s2_unpack[3] % BASE);
- sum2 %= BASE;
- s2[3] = sum2;
+ vs1 = _mm_cvtsi32_si128(adler);
+ vs2 = _mm_cvtsi32_si128(sum2);
+ vs3 = _mm_setzero_si128();
+ vs2_0 = _mm_setzero_si128();
+ vs1_0 = vs1;
+
+ k = (len < max_iters ? (int)len : max_iters);
+ k -= k % 16;
+ len -= k;
+
+ while (k >= 32) {
+ /*
+ For each 32-byte block:
+ vs1 = adler + sum(c[i])
+ vs2 = sum2 + 32 vs1 + sum( (32-i+1) c[i] )
+ The 32 vs1 term (using vs1 from before the block) is deferred: vs3
+ accumulates those values and is folded into vs2 after this loop.
+ */
+ vbuf = _mm_load_si128((__m128i*)buf);
+ vbuf_0 = _mm_load_si128((__m128i*)(buf + 16));
+ buf += 32;
+ k -= 32;
+
+ v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
+ v_sad_sum2 = _mm_sad_epu8(vbuf_0, zero);
+ vs1 = _mm_add_epi32(v_sad_sum1, vs1);
+ vs3 = _mm_add_epi32(vs1_0, vs3);
+
+ vs1 = _mm_add_epi32(v_sad_sum2, vs1);
+ v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v);
+ vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
+ v_short_sum2_0 = _mm_maddubs_epi16(vbuf_0, dot2v_0);
+ vs2 = _mm_add_epi32(vsum2, vs2);
+ vsum2_0 = _mm_madd_epi16(v_short_sum2_0, dot3v);
+ vs2_0 = _mm_add_epi32(vsum2_0, vs2_0);
+ vs1_0 = vs1;
+ }
+
+ vs2 = _mm_add_epi32(vs2_0, vs2);
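+ /* vs3 collected one pre-block copy of vs1 per 32-byte block; each block
+ * owes vs2 32x that value, hence the shift left by 5. */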
+ vs3 = _mm_slli_epi32(vs3, 5);
+ vs2 = _mm_add_epi32(vs3, vs2);
+ vs3 = _mm_setzero_si128();
+
+ while (k >= 16) {
+ /*
+ vs1 = adler + sum(c[i])
+ vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
+ */
+ vbuf = _mm_load_si128((__m128i*)buf);
+ buf += 16;
+ k -= 16;
+
+unaligned_jmp:
+ v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
+ vs1 = _mm_add_epi32(v_sad_sum1, vs1);
+ vs3 = _mm_add_epi32(vs1_0, vs3);
+ v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v_0);
+ vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
+ vs2 = _mm_add_epi32(vsum2, vs2);
+ vs1_0 = vs1;
+ }
+
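+ /* Same idea for the 16-byte loop: each block owes 16x its pre-block vs1,
+ * hence the shift left by 4. */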
+ vs3 = _mm_slli_epi32(vs3, 4);
+ vs2 = _mm_add_epi32(vs2, vs3);
+
+ /* We don't need a full horizontal sum for vs1: psadbw already does a partial
+ * reduction implicitly, leaving 32-bit sums only in vector positions 0 and 2.
+ * Summing just those two lanes saves some contention on the shuffle port(s). */
+ adler = partial_hsum(vs1) % BASE;
+ sum2 = hsum(vs2) % BASE;
+ max_iters = NMAX;
}
/* Process tail (len < 16). */