From: Nathan Moin Vaziri Date: Fri, 3 Apr 2026 00:16:27 +0000 (-0700) Subject: Add ACCUM_ROUND macro to crc32_chorba_sse2 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=HEAD;p=thirdparty%2Fzlib-ng.git Add ACCUM_ROUND macro to crc32_chorba_sse2 --- diff --git a/arch/x86/crc32_chorba_sse2.c b/arch/x86/crc32_chorba_sse2.c index 271adbda..93ec5d53 100644 --- a/arch/x86/crc32_chorba_sse2.c +++ b/arch/x86/crc32_chorba_sse2.c @@ -8,19 +8,36 @@ #include "arch/x86/x86_intrins.h" #include "arch_functions.h" +#define LSHIFT_QWORD(x) _mm_unpacklo_epi64(_mm_setzero_si128(), (x)) +#define RSHIFT_QWORD(x) _mm_unpackhi_epi64((x), _mm_setzero_si128()) +#define ALIGNR_QWORD(a, b) _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(b), 1)) + #define READ_NEXT(in, off, a, b) \ do { \ a = _mm_load_si128((__m128i*)(in + off / sizeof(uint64_t))); \ b = _mm_load_si128((__m128i*)(in + off / sizeof(uint64_t) + 2)); \ - } while (0); + } while (0) #define NEXT_ROUND(invec, a, b, c, d) \ do { \ a = _mm_xor_si128(_mm_slli_epi64(invec, 17), _mm_slli_epi64(invec, 55)); \ b = _mm_xor_si128(_mm_xor_si128(_mm_srli_epi64(invec, 47), _mm_srli_epi64(invec, 9)), _mm_slli_epi64(invec, 19)); \ c = _mm_xor_si128(_mm_srli_epi64(invec, 45), _mm_slli_epi64(invec, 44)); \ - d = _mm_srli_epi64(invec, 20); \ - } while (0); + d = _mm_srli_epi64(invec, 20); \ + } while (0) + +#define ACCUM_ROUND() \ + do { \ + __m128i a4_ = _mm_unpacklo_epi64(next56, ab4); \ + __m128i b2c2 = ALIGNR_QWORD(ab2, cd2); \ + __m128i b4c4 = ALIGNR_QWORD(ab4, cd4); \ + __m128i d2_ = RSHIFT_QWORD(cd2); \ + next12 = _mm_xor_si128(_mm_xor_si128(a4_, ab3), cd1); \ + next12 = _mm_xor_si128(next12, b2c2); \ + next34 = _mm_xor_si128(b4c4, cd3); \ + next34 = _mm_xor_si128(next34, d2_); \ + next56 = RSHIFT_QWORD(cd4); \ + } while (0) Z_INTERNAL uint32_t chorba_small_nondestructive_sse2(uint32_t crc, const uint8_t *buf, size_t len) { /* The calling function ensured that this is aligned correctly */ @@ -28,10 +45,7 @@ Z_INTERNAL uint32_t chorba_small_nondestructive_sse2(uint32_t crc, const uint8_t ALIGNED_(16) uint64_t final[9] = {0}; uint64_t next1 = ~crc; crc = 0; - uint64_t next2 = 0; - uint64_t next3 = 0; - uint64_t next4 = 0; - uint64_t next5 = 0; + uint64_t next2, next3, next4, next5; __m128i next12 = _mm_cvtsi64_si128(next1); __m128i next34 = _mm_setzero_si128(); @@ -55,40 +69,32 @@ Z_INTERNAL uint32_t chorba_small_nondestructive_sse2(uint32_t crc, const uint8_t chorba34 = _mm_xor_si128(chorba34, next34); chorba56 = _mm_xor_si128(chorba56, next56); chorba78 = _mm_xor_si128(chorba78, chorba12); - __m128i chorba45 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(chorba34), _mm_castsi128_pd(chorba56), 1)); - __m128i chorba23 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(chorba12), - _mm_castsi128_pd(chorba34), 1)); - + __m128i chorba45 = ALIGNR_QWORD(chorba34, chorba56); + __m128i chorba23 = ALIGNR_QWORD(chorba12, chorba34); + i += 8 * 8; /* 0-3 */ READ_NEXT(input, i, in1in2, in3in4); - __m128i chorba34xor = _mm_xor_si128(chorba34, _mm_unpacklo_epi64(_mm_setzero_si128(), chorba12)); + __m128i chorba34xor = _mm_xor_si128(chorba34, LSHIFT_QWORD(chorba12)); in1in2 = _mm_xor_si128(in1in2, chorba34xor); NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4); in3in4 = _mm_xor_si128(in3in4, ab1); /* _hopefully_ we don't get a huge domain switching penalty for this. This seems to be the best sequence */ - __m128i chorba56xor = _mm_xor_si128(chorba56, _mm_unpacklo_epi64(_mm_setzero_si128(), ab2)); + __m128i chorba56xor = _mm_xor_si128(chorba56, LSHIFT_QWORD(ab2)); in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba56xor, chorba23)); in3in4 = _mm_xor_si128(in3in4, chorba12); NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4); - __m128i b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1)); - __m128i a4_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab4); - a4_ = _mm_xor_si128(b2c2, a4_); - next12 = _mm_xor_si128(ab3, a4_); - next12 = _mm_xor_si128(next12, cd1); - - __m128i d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128()); - __m128i b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1)); - - next34 = _mm_xor_si128(cd3, _mm_xor_si128(b4c4, d2_)); - next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128()); + /* chorba56 already consumed next56, clear it so ACCUM_ROUND + does not xor the stale value into next12 */ + next56 = _mm_setzero_si128(); + ACCUM_ROUND(); i += 32; @@ -105,30 +111,19 @@ Z_INTERNAL uint32_t chorba_small_nondestructive_sse2(uint32_t crc, const uint8_t in3in4 = _mm_xor_si128(in3in4, next34); in3in4 = _mm_xor_si128(in3in4, ab1); in3in4 = _mm_xor_si128(in3in4, chorba56); - __m128i chorba67 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(chorba56), _mm_castsi128_pd(chorba78), 1)); - in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba67, _mm_unpacklo_epi64(_mm_setzero_si128(), ab2))); + __m128i chorba67 = ALIGNR_QWORD(chorba56, chorba78); + in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba67, LSHIFT_QWORD(ab2))); NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4); - b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1)); - a4_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab4); - a4_ = _mm_xor_si128(b2c2, a4_); - next12 = _mm_xor_si128(ab3, cd1); - - next12 = _mm_xor_si128(next12, a4_); - next12 = _mm_xor_si128(next12, next56); - b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1)); - next34 = _mm_xor_si128(b4c4, cd3); - d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128()); - next34 = _mm_xor_si128(next34, d2_); - next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128()); + ACCUM_ROUND(); i += 32; /* 8-11 */ READ_NEXT(input, i, in1in2, in3in4); - __m128i chorba80 = _mm_unpackhi_epi64(chorba78, _mm_setzero_si128()); + __m128i chorba80 = RSHIFT_QWORD(chorba78); __m128i next12_chorba12 = _mm_xor_si128(next12, chorba12); in1in2 = _mm_xor_si128(in1in2, chorba80); in1in2 = _mm_xor_si128(in1in2, chorba78); @@ -138,22 +133,13 @@ Z_INTERNAL uint32_t chorba_small_nondestructive_sse2(uint32_t crc, const uint8_t in3in4 = _mm_xor_si128(next34, in3in4); in3in4 = _mm_xor_si128(in3in4, ab1); - __m128i a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2); + __m128i a2_ = LSHIFT_QWORD(ab2); in3in4 = _mm_xor_si128(in3in4, chorba34); in3in4 = _mm_xor_si128(in3in4, a2_); NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4); - a4_ = _mm_unpacklo_epi64(next56, ab4); - next12 = _mm_xor_si128(a4_, ab3); - next12 = _mm_xor_si128(next12, cd1); - b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1)); - b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1)); - d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128()); - next12 = _mm_xor_si128(next12, b2c2); - next34 = _mm_xor_si128(b4c4, cd3); - next34 = _mm_xor_si128(next34, d2_); - next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128()); + ACCUM_ROUND(); i += 32; @@ -163,7 +149,7 @@ Z_INTERNAL uint32_t chorba_small_nondestructive_sse2(uint32_t crc, const uint8_t in1in2 = _mm_xor_si128(in1in2, next12); __m128i chorb56xorchorb12 = _mm_xor_si128(chorba56, chorba12); in1in2 = _mm_xor_si128(in1in2, chorb56xorchorb12); - __m128i chorb1_ = _mm_unpacklo_epi64(_mm_setzero_si128(), chorba12); + __m128i chorb1_ = LSHIFT_QWORD(chorba12); in1in2 = _mm_xor_si128(in1in2, chorb1_); NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4); @@ -171,33 +157,25 @@ Z_INTERNAL uint32_t chorba_small_nondestructive_sse2(uint32_t crc, const uint8_t in3in4 = _mm_xor_si128(next34, in3in4); in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(ab1, chorba78)); in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba34, chorba12)); - in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba23, _mm_unpacklo_epi64(_mm_setzero_si128(), ab2))); + in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba23, LSHIFT_QWORD(ab2))); NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4); - a4_ = _mm_unpacklo_epi64(next56, ab4); - next12 = _mm_xor_si128(_mm_xor_si128(a4_, ab3), cd1); - b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1)); - b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1)); - d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128()); - next12 = _mm_xor_si128(next12, b2c2); - next34 = _mm_xor_si128(b4c4, cd3); - next34 = _mm_xor_si128(next34, d2_); - next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128()); + ACCUM_ROUND(); i += 32; /* 16-19 */ READ_NEXT(input, i, in1in2, in3in4); - __m128i chorba1_ = _mm_unpacklo_epi64(_mm_setzero_si128(), chorba12); + __m128i chorba1_ = LSHIFT_QWORD(chorba12); in1in2 = _mm_xor_si128(_mm_xor_si128(next12, in1in2), _mm_xor_si128(chorba56, chorba45)); in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(chorba12, chorba34)); in1in2 = _mm_xor_si128(chorba1_, in1in2); NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4); - a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2); + a2_ = LSHIFT_QWORD(ab2); in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(ab1, chorba78)); in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba56, chorba34)); in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba23, chorba67)); @@ -206,15 +184,7 @@ Z_INTERNAL uint32_t chorba_small_nondestructive_sse2(uint32_t crc, const uint8_t NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4); - a4_ = _mm_unpacklo_epi64(next56, ab4); - next12 = _mm_xor_si128(_mm_xor_si128(a4_, ab3), cd1); - b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1)); - b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1)); - d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128()); - next12 = _mm_xor_si128(next12, b2c2); - next34 = _mm_xor_si128(b4c4, cd3); - next34 = _mm_xor_si128(next34, d2_); - next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128()); + ACCUM_ROUND(); i += 32; @@ -228,7 +198,7 @@ Z_INTERNAL uint32_t chorba_small_nondestructive_sse2(uint32_t crc, const uint8_t NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4); - a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2); + a2_ = LSHIFT_QWORD(ab2); in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(next34, ab1)); in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba78, chorba67)); in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba45, chorba34)); @@ -237,15 +207,7 @@ Z_INTERNAL uint32_t chorba_small_nondestructive_sse2(uint32_t crc, const uint8_t NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4); - a4_ = _mm_unpacklo_epi64(next56, ab4); - next12 = _mm_xor_si128(_mm_xor_si128(a4_, ab3), cd1); - b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1)); - b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1)); - d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128()); - next12 = _mm_xor_si128(next12, b2c2); - next34 = _mm_xor_si128(b4c4, cd3); - next34 = _mm_xor_si128(next34, d2_); - next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128()); + ACCUM_ROUND(); i += 32; @@ -259,7 +221,7 @@ Z_INTERNAL uint32_t chorba_small_nondestructive_sse2(uint32_t crc, const uint8_t NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4); - a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2); + a2_ = LSHIFT_QWORD(ab2); in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(next34, ab1)); in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba78, chorba56)); in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba45, chorba34)); @@ -267,15 +229,7 @@ Z_INTERNAL uint32_t chorba_small_nondestructive_sse2(uint32_t crc, const uint8_t NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4); - a4_ = _mm_unpacklo_epi64(next56, ab4); - next12 = _mm_xor_si128(_mm_xor_si128(a4_, ab3), cd1); - b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1)); - b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1)); - d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128()); - next12 = _mm_xor_si128(next12, b2c2); - next34 = _mm_xor_si128(b4c4, cd3); - next34 = _mm_xor_si128(next34, d2_); - next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128()); + ACCUM_ROUND(); i += 32; @@ -287,22 +241,14 @@ Z_INTERNAL uint32_t chorba_small_nondestructive_sse2(uint32_t crc, const uint8_t NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4); - a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2); + a2_ = LSHIFT_QWORD(ab2); in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(next34, ab1)); in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba78, chorba80)); in3in4 = _mm_xor_si128(a2_, in3in4); NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4); - a4_ = _mm_unpacklo_epi64(next56, ab4); - next12 = _mm_xor_si128(_mm_xor_si128(a4_, ab3), cd1); - b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1)); - b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1)); - d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128()); - next12 = _mm_xor_si128(next12, b2c2); - next34 = _mm_xor_si128(b4c4, cd3); - next34 = _mm_xor_si128(next34, d2_); - next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128()); + ACCUM_ROUND(); } for (; (i + 40 + 32) < len; i += 32) { @@ -314,24 +260,14 @@ Z_INTERNAL uint32_t chorba_small_nondestructive_sse2(uint32_t crc, const uint8_t NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4); - __m128i a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2); + __m128i a2_ = LSHIFT_QWORD(ab2); __m128i ab1_next34 = _mm_xor_si128(next34, ab1); in3in4 = _mm_xor_si128(in3in4, ab1_next34); in3in4 = _mm_xor_si128(a2_, in3in4); NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4); - __m128i b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1)); - __m128i a4_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab4); - a4_ = _mm_xor_si128(b2c2, a4_); - next12 = _mm_xor_si128(ab3, a4_); - next12 = _mm_xor_si128(next12, cd1); - - __m128i d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128()); - __m128i b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1)); - next12 = _mm_xor_si128(next12, next56); - next34 = _mm_xor_si128(cd3, _mm_xor_si128(b4c4, d2_)); - next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128()); + ACCUM_ROUND(); } next1 = _mm_cvtsi128_si64(next12);