From: Nathan Moinvaziri
Date: Mon, 25 May 2020 13:32:55 +0000 (-0700)
Subject: Added new lines in Adler32 AVX2 and SSSE3 to improve readability.
X-Git-Tag: 1.9.9-b1~223
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=38bdf9b1d890243f30e1f8d085d45c5a44cbaa99;p=thirdparty%2Fzlib-ng.git

Added new lines in Adler32 AVX2 and SSSE3 to improve readability.
---

diff --git a/arch/x86/adler32_avx.c b/arch/x86/adler32_avx.c
index 8907cdd3b..14b7ad46b 100644
--- a/arch/x86/adler32_avx.c
+++ b/arch/x86/adler32_avx.c
@@ -37,24 +37,35 @@ uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_t len) {
         return adler32_len_16(adler, buf, len, sum2);
 
     uint32_t ALIGNED_(32) s1[8], s2[8];
+
     memset(s1, 0, sizeof(s1)); s1[7] = adler; // TODO: would a masked load be faster?
     memset(s2, 0, sizeof(s2)); s2[7] = sum2;
-    char ALIGNED_(32) dot1[32] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+
+    char ALIGNED_(32) dot1[32] = \
+        {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
     __m256i dot1v = _mm256_load_si256((__m256i*)dot1);
-    char ALIGNED_(32) dot2[32] = {32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
+    char ALIGNED_(32) dot2[32] = \
+        {32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
+         16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
     __m256i dot2v = _mm256_load_si256((__m256i*)dot2);
-    short ALIGNED_(32) dot3[16] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+    short ALIGNED_(32) dot3[16] = \
+        {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
     __m256i dot3v = _mm256_load_si256((__m256i*)dot3);
+
     // We will need to multiply by
     char ALIGNED_(32) shift[16] = {5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
     __m128i shiftv = _mm_load_si128((__m128i*)shift);
+
     while (len >= 32) {
         __m256i vs1 = _mm256_load_si256((__m256i*)s1);
         __m256i vs2 = _mm256_load_si256((__m256i*)s2);
         __m256i vs1_0 = vs1;
+
         int k = (len < NMAX ? (int)len : NMAX);
         k -= k % 32;
         len -= k;
+
         while (k >= 32) {
             /*
                vs1 = adler + sum(c[i])
@@ -63,8 +74,9 @@ uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_t len) {
             __m256i vbuf = _mm256_loadu_si256((__m256i*)buf);
             buf += 32;
             k -= 32;
+
             __m256i v_short_sum1 = _mm256_maddubs_epi16(vbuf, dot1v); // multiply-add, resulting in 8 shorts.
-            __m256i vsum1 = _mm256_madd_epi16(v_short_sum1, dot3v); // sum 8 shorts to 4 int32_t;
+            __m256i vsum1 = _mm256_madd_epi16(v_short_sum1, dot3v);   // sum 8 shorts to 4 int32_t;
             __m256i v_short_sum2 = _mm256_maddubs_epi16(vbuf, dot2v);
             vs1 = _mm256_add_epi32(vsum1, vs1);
             __m256i vsum2 = _mm256_madd_epi16(v_short_sum2, dot3v);
@@ -73,16 +85,22 @@ uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_t len) {
             vs2 = _mm256_add_epi32(vsum2, vs1_0);
             vs1_0 = vs1;
         }
+
         // At this point, we have partial sums stored in vs1 and vs2. There are AVX512 instructions that
         // would allow us to sum these quickly (VP4DPWSSD). For now, just unpack and move on.
         uint32_t ALIGNED_(32) s1_unpack[8];
         uint32_t ALIGNED_(32) s2_unpack[8];
+
         _mm256_store_si256((__m256i*)s1_unpack, vs1);
         _mm256_store_si256((__m256i*)s2_unpack, vs2);
-        adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE) + (s1_unpack[4] % BASE) + (s1_unpack[5] % BASE) + (s1_unpack[6] % BASE) + (s1_unpack[7] % BASE);
+
+        adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE) +
+                (s1_unpack[4] % BASE) + (s1_unpack[5] % BASE) + (s1_unpack[6] % BASE) + (s1_unpack[7] % BASE);
         MOD(adler);
         s1[7] = adler;
-        sum2 = (s2_unpack[0] % BASE) + (s2_unpack[1] % BASE) + (s2_unpack[2] % BASE) + (s2_unpack[3] % BASE) + (s2_unpack[4] % BASE) + (s2_unpack[5] % BASE) + (s2_unpack[6] % BASE) + (s2_unpack[7] % BASE);
+
+        sum2 = (s2_unpack[0] % BASE) + (s2_unpack[1] % BASE) + (s2_unpack[2] % BASE) + (s2_unpack[3] % BASE) +
+               (s2_unpack[4] % BASE) + (s2_unpack[5] % BASE) + (s2_unpack[6] % BASE) + (s2_unpack[7] % BASE);
         MOD(sum2);
         s2[7] = sum2;
     }
diff --git a/arch/x86/adler32_ssse3.c b/arch/x86/adler32_ssse3.c
index 5c7a466eb..123a737bb 100644
--- a/arch/x86/adler32_ssse3.c
+++ b/arch/x86/adler32_ssse3.c
@@ -37,25 +37,32 @@ uint32_t adler32_ssse3(uint32_t adler, const unsigned char *buf, size_t len) {
         return adler32_len_16(adler, buf, len, sum2);
 
     uint32_t ALIGNED_(16) s1[4], s2[4];
+
     s1[0] = s1[1] = s1[2] = 0; s1[3] = adler;
     s2[0] = s2[1] = s2[2] = 0; s2[3] = sum2;
+
     char ALIGNED_(16) dot1[16] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
     __m128i dot1v = _mm_load_si128((__m128i*)dot1);
     char ALIGNED_(16) dot2[16] = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
     __m128i dot2v = _mm_load_si128((__m128i*)dot2);
     short ALIGNED_(16) dot3[8] = {1, 1, 1, 1, 1, 1, 1, 1};
     __m128i dot3v = _mm_load_si128((__m128i*)dot3);
+
     // We will need to multiply by
     //char ALIGNED_(16) shift[4] = {0, 0, 0, 4}; //{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4};
+
     char ALIGNED_(16) shift[16] = {4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
     __m128i shiftv = _mm_load_si128((__m128i*)shift);
+
     while (len >= 16) {
         __m128i vs1 = _mm_load_si128((__m128i*)s1);
         __m128i vs2 = _mm_load_si128((__m128i*)s2);
         __m128i vs1_0 = vs1;
+
         int k = (len < NMAX ? (int)len : NMAX);
         k -= k % 16;
         len -= k;
+
         while (k >= 16) {
             /*
                vs1 = adler + sum(c[i])
@@ -69,6 +76,7 @@ uint32_t adler32_ssse3(uint32_t adler, const unsigned char *buf, size_t len) {
             __m128i vbuf = _mm_loadu_si128((__m128i*)buf);
             buf += 16;
             k -= 16;
+
             __m128i v_short_sum1 = _mm_maddubs_epi16(vbuf, dot1v); // multiply-add, resulting in 8 shorts.
             __m128i vsum1 = _mm_madd_epi16(v_short_sum1, dot3v); // sum 8 shorts to 4 int32_t;
             __m128i v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v);
@@ -79,15 +87,20 @@ uint32_t adler32_ssse3(uint32_t adler, const unsigned char *buf, size_t len) {
             vs2 = _mm_add_epi32(vsum2, vs1_0);
             vs1_0 = vs1;
         }
+
         // At this point, we have partial sums stored in vs1 and vs2. There are AVX512 instructions that
         // would allow us to sum these quickly (VP4DPWSSD). For now, just unpack and move on.
+
         uint32_t ALIGNED_(16) s1_unpack[4];
         uint32_t ALIGNED_(16) s2_unpack[4];
+
         _mm_store_si128((__m128i*)s1_unpack, vs1);
         _mm_store_si128((__m128i*)s2_unpack, vs2);
+
         adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE);
         MOD(adler);
         s1[3] = adler;
+
         sum2 = (s2_unpack[0] % BASE) + (s2_unpack[1] % BASE) + (s2_unpack[2] % BASE) + (s2_unpack[3] % BASE);
         MOD(sum2);
         s2[3] = sum2;
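
Note (not part of the patch): the kernels reformatted above vectorize the standard Adler-32 recurrence; per 16- or 32-byte block, the maddubs/madd pairs with dot1v and the descending weights in dot2v produce the byte sum (s1) and the position-weighted sum folded into s2. A minimal scalar sketch of that recurrence follows, assuming zlib's BASE = 65521; the function name here is illustrative, not a zlib-ng symbol, and it reduces modulo BASE per byte rather than deferring the reduction per NMAX bytes as the SIMD code does.

#include <stddef.h>
#include <stdint.h>

#define BASE 65521U /* largest prime smaller than 65536 */

/* Scalar model of what each SIMD block computes:
 *   s1' = s1 + sum(c[i])
 *   s2' = s2 + n*s1 + sum((n - i) * c[i]),  i = 0..n-1, n = block size
 */
static uint32_t adler32_scalar(uint32_t adler, const unsigned char *buf, size_t len) {
    uint32_t s1 = adler & 0xffff;
    uint32_t s2 = (adler >> 16) & 0xffff;
    for (size_t i = 0; i < len; i++) {
        s1 = (s1 + buf[i]) % BASE;  /* running byte sum */
        s2 = (s2 + s1) % BASE;      /* running sum of the running sums */
    }
    return (s2 << 16) | s1;
}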