From: Adam Stylinski
Date: Wed, 16 Feb 2022 14:42:40 +0000 (-0500)
Subject: Unlocked more ILP in SSE variant of adler checksum
X-Git-Tag: 2.1.0-beta1~364
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=f7d284c68b08560a6903fe88c7bc63fedfbbc421;p=thirdparty%2Fzlib-ng.git

Unlocked more ILP in SSE variant of adler checksum

This helps uarchs such as Sandy Bridge more than Yorkfield, but there
were some measurable gains on a Core 2 Quad Q9650 as well. We can sum
into two separate vs2 variables and add them back together at the end,
allowing for some overlapping multiply-adds. This was only about a
9-12% gain on the Q9650, but it nearly doubled performance on Cascade
Lake and is likely to yield appreciable gains on everything in between
those two.
---

diff --git a/arch/x86/adler32_sse41.c b/arch/x86/adler32_sse41.c
index 718be0f7..602f8ec1 100644
--- a/arch/x86/adler32_sse41.c
+++ b/arch/x86/adler32_sse41.c
@@ -46,11 +46,13 @@ Z_INTERNAL uint32_t adler32_sse41(uint32_t adler, const unsigned char *buf, size
     if (UNLIKELY(len < 16))
         return adler32_len_16(adler, buf, len, sum2);
 
-    const __m128i dot2v = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
+    const __m128i dot2v = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
+    const __m128i dot2v_0 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
     const __m128i dot3v = _mm_set1_epi16(1);
     const __m128i zero = _mm_setzero_si128();
 
-    __m128i vbuf, vs1_0, vs3, vs1, vs2, v_sad_sum1, v_short_sum2, vsum2;
+    __m128i vbuf, vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0,
+            vbuf_0, v_sad_sum2, vsum2, vsum2_0;
 
     /* If our buffer is unaligned (likely), make the determination whether
      * or not there's enough of a buffer to consume to make the scalar, aligning
@@ -93,12 +95,43 @@ Z_INTERNAL uint32_t adler32_sse41(uint32_t adler, const unsigned char *buf, size
         vs1 = _mm_cvtsi32_si128(adler);
         vs2 = _mm_cvtsi32_si128(sum2);
         vs3 = _mm_setzero_si128();
+        vs2_0 = _mm_setzero_si128();
         vs1_0 = vs1;
 
         k = (len < max_iters ? (int)len : max_iters);
         k -= k % 16;
         len -= k;
 
+        while (k >= 32) {
+            /*
+               vs1 = adler + sum(c[i])
+               vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
+            */
+            vbuf = _mm_load_si128((__m128i*)buf);
+            vbuf_0 = _mm_load_si128((__m128i*)(buf + 16));
+            buf += 32;
+            k -= 32;
+
+            v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
+            v_sad_sum2 = _mm_sad_epu8(vbuf_0, zero);
+            vs1 = _mm_add_epi32(v_sad_sum1, vs1);
+            vs3 = _mm_add_epi32(vs1_0, vs3);
+
+            vs1 = _mm_add_epi32(v_sad_sum2, vs1);
+            v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v);
+            vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
+            v_short_sum2_0 = _mm_maddubs_epi16(vbuf_0, dot2v_0);
+            vs2 = _mm_add_epi32(vsum2, vs2);
+            vsum2_0 = _mm_madd_epi16(v_short_sum2_0, dot3v);
+            vs2_0 = _mm_add_epi32(vsum2_0, vs2_0);
+            vs1_0 = vs1;
+        }
+
+        vs2 = _mm_add_epi32(vs2_0, vs2);
+        vs3 = _mm_slli_epi32(vs3, 5);
+        vs2 = _mm_add_epi32(vs3, vs2);
+        vs3 = _mm_setzero_si128();
+
         while (k >= 16) {
             /*
                vs1 = adler + sum(c[i])
@@ -112,7 +145,7 @@ unaligned_jmp:
             v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
             vs1 = _mm_add_epi32(v_sad_sum1, vs1);
             vs3 = _mm_add_epi32(vs1_0, vs3);
-            v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v);
+            v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v_0);
             vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
             vs2 = _mm_add_epi32(vsum2, vs2);
             vs1_0 = vs1;
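
Editor's note (not part of the commit): the 32-byte loop added above relies on the fact
that the weighted byte sum for a 32-byte block splits exactly into two independent halves:
weights 32..17 for the first 16 bytes (dot2v) and 16..1 for the second 16 bytes (dot2v_0),
while the running s1 term is scaled by 32 at the end (the _mm_slli_epi32 by 5). Below is a
minimal scalar sketch of that identity, not the SIMD implementation; the names step_single,
step_split, and the check in main() are illustrative only and do not exist in zlib-ng.

#include <stdint.h>
#include <stdio.h>

#define BASE 65521u  /* Adler-32 modulus: largest prime below 65536 */

/* One 32-byte block step of Adler-32, written with a single weighted accumulator.
 * (zlib-ng defers the modulo reduction much longer; it is applied per block here
 * only to keep the sketch short.) */
static void step_single(uint32_t *s1, uint32_t *s2, const unsigned char *c) {
    uint32_t sum1 = 0, sum2 = 0;
    for (int i = 0; i < 32; ++i) {
        sum1 += c[i];
        sum2 += (uint32_t)(32 - i) * c[i];   /* weights 32..1 */
    }
    *s2 = (*s2 + 32 * *s1 + sum2) % BASE;
    *s1 = (*s1 + sum1) % BASE;
}

/* The same step with the weighted sum split into two independent accumulators,
 * mirroring vs2 (weights 32..17, dot2v) and vs2_0 (weights 16..1, dot2v_0). */
static void step_split(uint32_t *s1, uint32_t *s2, const unsigned char *c) {
    uint32_t sum1 = 0, sum2_hi = 0, sum2_lo = 0;
    for (int i = 0; i < 16; ++i) {
        sum1 += c[i] + c[16 + i];
        sum2_hi += (uint32_t)(32 - i) * c[i];        /* first half, dot2v   */
        sum2_lo += (uint32_t)(16 - i) * c[16 + i];   /* second half, dot2v_0 */
    }
    *s2 = (*s2 + 32 * *s1 + sum2_hi + sum2_lo) % BASE;
    *s1 = (*s1 + sum1) % BASE;
}

int main(void) {
    unsigned char buf[32];
    for (int i = 0; i < 32; ++i)
        buf[i] = (unsigned char)(i * 7 + 3);

    uint32_t a1 = 1, b1 = 0, a2 = 1, b2 = 0;
    step_single(&a1, &b1, buf);
    step_split(&a2, &b2, buf);

    /* Both formulations produce the same (s1, s2) pair. */
    printf("single: s1=%u s2=%u\n", (unsigned)a1, (unsigned)b1);
    printf("split:  s1=%u s2=%u\n", (unsigned)a2, (unsigned)b2);
    return 0;
}

Because sum2_hi and sum2_lo (vs2 and vs2_0 in the patch) have no data dependence on each
other, the two _mm_maddubs_epi16/_mm_madd_epi16 chains can execute in overlapping fashion,
which is where the extra instruction-level parallelism described in the commit message
comes from.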