From: Adam Stylinski Date: Sat, 29 Apr 2023 15:33:05 +0000 (-0400) Subject: Use ternary logic to xor 3 operands for "fold16" X-Git-Tag: 2.1.1-beta2~16 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=9087c75f8d9a35c7d7801440c23deb555e77714e;p=thirdparty%2Fzlib-ng.git Use ternary logic to xor 3 operands for "fold16" This strategy is borrowed from ISA-L in this commit: https://github.com/intel/isa-l/commit/c2bec3ea65ce35b01311d1cc4b314f6b4986b9c8 We can also use it in the "fold final" routine but we'd have to take some extra care to only use it on AVX512 capable systems. --- diff --git a/arch/x86/crc32_fold_vpclmulqdq_tpl.h b/arch/x86/crc32_fold_vpclmulqdq_tpl.h index 67f08e128..3ea5c3305 100644 --- a/arch/x86/crc32_fold_vpclmulqdq_tpl.h +++ b/arch/x86/crc32_fold_vpclmulqdq_tpl.h @@ -40,8 +40,7 @@ static size_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1, zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc3, 3); z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01); zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10); - zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0); - zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_t0); + zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_t0, 0x96); #ifdef COPY _mm512_storeu_si512((__m512i *)dst, zmm_t0); @@ -70,15 +69,10 @@ static size_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1, zmm_crc2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x10); zmm_crc3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x10); - zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0); - zmm_crc1 = _mm512_xor_si512(z1, zmm_crc1); - zmm_crc2 = _mm512_xor_si512(z2, zmm_crc2); - zmm_crc3 = _mm512_xor_si512(z3, zmm_crc3); - - zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_t0); - zmm_crc1 = _mm512_xor_si512(zmm_crc1, zmm_t1); - zmm_crc2 = _mm512_xor_si512(zmm_crc2, zmm_t2); - zmm_crc3 = _mm512_xor_si512(zmm_crc3, zmm_t3); + zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_t0, 0x96); + zmm_crc1 = _mm512_ternarylogic_epi32(zmm_crc1, z1, zmm_t1, 0x96); + zmm_crc2 = _mm512_ternarylogic_epi32(zmm_crc2, z2, zmm_t2, 0x96); + zmm_crc3 = _mm512_ternarylogic_epi32(zmm_crc3, z3, zmm_t3, 0x96); #ifdef COPY _mm512_storeu_si512((__m512i *)dst, zmm_t0); @@ -93,18 +87,15 @@ static size_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1, // zmm_crc[0,1,2,3] -> zmm_crc0 z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01); zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10); - zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0); - zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc1); + zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_crc1, 0x96); z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01); zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10); - zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0); - zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc2); + zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_crc2, 0x96); z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01); zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10); - zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0); - zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc3); + zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_crc3, 0x96); // zmm_crc0 -> xmm_crc[0, 1, 2, 3] *xmm_crc0 = _mm512_extracti32x4_epi32(zmm_crc0, 0);