From: krk
Date: Thu, 29 Jan 2026 19:55:09 +0000 (+0000)
Subject: Interleave load/compute stages in AVX2 base64 encoder
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=b2499af52920502bedfc6fd0fada6a8a3182d0d3;p=thirdparty%2Fopenssl.git

Interleave load/compute stages in AVX2 base64 encoder

Reviewed-by: Dmitry Belyavskiy
Reviewed-by: Paul Dale
MergeDate: Fri Feb 13 14:31:45 2026
(Merged from https://github.com/openssl/openssl/pull/29858)
---

diff --git a/crypto/evp/enc_b64_avx2.c b/crypto/evp/enc_b64_avx2.c
index 141e7fcc87e..1e2b00e6473 100644
--- a/crypto/evp/enc_b64_avx2.c
+++ b/crypto/evp/enc_b64_avx2.c
@@ -494,39 +494,47 @@ int encode_base64_avx2(EVP_ENCODE_CTX *ctx, unsigned char *dst,
     /* Process 96 bytes at a time */
    for (; i + 100 <= srclen; i += 96) {
        _mm_prefetch((const char *)(input + i + 192), _MM_HINT_T0);
-        /* We shave off 4 bytes from the beginning and the end */
+        /*
+         * Interleaved: for each vector, the load, shuffle, bit-split and
+         * lookup are issued before the next vector starts, giving the OoO
+         * engine independent work chains across execution ports.
+         */
         const __m128i lo0 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 0));
         const __m128i hi0 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 1));
+        __m256i in0 = _mm256_shuffle_epi8(_mm256_set_m128i(hi0, lo0), shuf);
+        const __m256i t0_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x0fc0fc00));
+        const __m256i t1_0 = _mm256_mulhi_epu16(t0_0, _mm256_set1_epi32(0x04000040));
+        const __m256i t2_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x003f03f0));
+        const __m256i t3_0 = _mm256_mullo_epi16(t2_0, _mm256_set1_epi32(0x01000010));
+        const __m256i input0 = _mm256_or_si256(t1_0, t3_0);
+
         const __m128i lo1 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 2));
         const __m128i hi1 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 3));
+        __m256i in1 = _mm256_shuffle_epi8(_mm256_set_m128i(hi1, lo1), shuf);
+        const __m256i t0_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x0fc0fc00));
+        const __m256i t1_1 = _mm256_mulhi_epu16(t0_1, _mm256_set1_epi32(0x04000040));
+        const __m256i t2_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x003f03f0));
+        const __m256i t3_1 = _mm256_mullo_epi16(t2_1, _mm256_set1_epi32(0x01000010));
+        const __m256i input1 = _mm256_or_si256(t1_1, t3_1);
+
         const __m128i lo2 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 4));
         const __m128i hi2 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 5));
+        __m256i in2 = _mm256_shuffle_epi8(_mm256_set_m128i(hi2, lo2), shuf);
+        const __m256i t0_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x0fc0fc00));
+        const __m256i t1_2 = _mm256_mulhi_epu16(t0_2, _mm256_set1_epi32(0x04000040));
+        const __m256i t2_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x003f03f0));
+        const __m256i t3_2 = _mm256_mullo_epi16(t2_2, _mm256_set1_epi32(0x01000010));
+        const __m256i input2 = _mm256_or_si256(t1_2, t3_2);
+
         const __m128i lo3 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 6));
         const __m128i hi3 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 7));
-        __m256i in0 = _mm256_shuffle_epi8(_mm256_set_m128i(hi0, lo0), shuf);
-        __m256i in1 = _mm256_shuffle_epi8(_mm256_set_m128i(hi1, lo1), shuf);
-        __m256i in2 = _mm256_shuffle_epi8(_mm256_set_m128i(hi2, lo2), shuf);
         __m256i in3 = _mm256_shuffle_epi8(_mm256_set_m128i(hi3, lo3), shuf);
-        const __m256i t0_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x0fc0fc00));
-        const __m256i t0_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x0fc0fc00));
-        const __m256i t0_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x0fc0fc00));
         const __m256i t0_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x0fc0fc00));
-        const __m256i t1_0 = _mm256_mulhi_epu16(t0_0, _mm256_set1_epi32(0x04000040));
-        const __m256i t1_1 = _mm256_mulhi_epu16(t0_1, _mm256_set1_epi32(0x04000040));
-        const __m256i t1_2 = _mm256_mulhi_epu16(t0_2, _mm256_set1_epi32(0x04000040));
         const __m256i t1_3 = _mm256_mulhi_epu16(t0_3, _mm256_set1_epi32(0x04000040));
-        const __m256i t2_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x003f03f0));
-        const __m256i t2_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x003f03f0));
-        const __m256i t2_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x003f03f0));
         const __m256i t2_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x003f03f0));
-        const __m256i t3_0 = _mm256_mullo_epi16(t2_0, _mm256_set1_epi32(0x01000010));
-        const __m256i t3_1 = _mm256_mullo_epi16(t2_1, _mm256_set1_epi32(0x01000010));
-        const __m256i t3_2 = _mm256_mullo_epi16(t2_2, _mm256_set1_epi32(0x01000010));
         const __m256i t3_3 = _mm256_mullo_epi16(t2_3, _mm256_set1_epi32(0x01000010));
-        const __m256i input0 = _mm256_or_si256(t1_0, t3_0);
-        const __m256i input1 = _mm256_or_si256(t1_1, t3_1);
-        const __m256i input2 = _mm256_or_si256(t1_2, t3_2);
         const __m256i input3 = _mm256_or_si256(t1_3, t3_3);
+
         __m256i vec0;
         __m256i vec1;
         __m256i vec2;
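
For reference, the restructuring pattern in isolation. This is a minimal
standalone sketch, not the committed code: the per-vector stages are factored
into a helper where the patch inlines them by hand, and the names
b64_split_24 and encode_blocks are hypothetical. Only the shuffle/mask/
multiply constants, the 24-byte per-vector stride and the i + 100 loop guard
are taken from the hunk above. Build with -mavx2.

#include <immintrin.h>
#include <stddef.h>

/*
 * One independent work chain: load two 16-byte halves (12 payload bytes
 * each), gather every 3-byte group into a 32-bit lane via the shuffle,
 * then split the four 6-bit fields into separate bytes with the
 * mask/multiply trick. Nothing here depends on any other vector's chain.
 */
static __m256i b64_split_24(const unsigned char *p, __m256i shuf)
{
    const __m128i lo = _mm_loadu_si128((const __m128i *)p);
    const __m128i hi = _mm_loadu_si128((const __m128i *)(p + 12));
    const __m256i in = _mm256_shuffle_epi8(_mm256_set_m128i(hi, lo), shuf);
    const __m256i t0 = _mm256_and_si256(in, _mm256_set1_epi32(0x0fc0fc00));
    const __m256i t1 = _mm256_mulhi_epu16(t0, _mm256_set1_epi32(0x04000040));
    const __m256i t2 = _mm256_and_si256(in, _mm256_set1_epi32(0x003f03f0));
    const __m256i t3 = _mm256_mullo_epi16(t2, _mm256_set1_epi32(0x01000010));
    return _mm256_or_si256(t1, t3);
}

/*
 * Four chains issued back to back, one per 24 input bytes, in the same
 * order as the patch. Each call's loads feed only its own computation,
 * so the out-of-order core can overlap all four. The guard is i + 100
 * rather than i + 96 because the second load of the last chain starts
 * at offset i + 84 and reads 16 bytes, 4 past the 96 consumed per round.
 */
void encode_blocks(const unsigned char *input, size_t srclen, __m256i shuf)
{
    size_t i = 0;

    for (; i + 100 <= srclen; i += 96) {
        __m256i v0 = b64_split_24(input + i + 0, shuf);
        __m256i v1 = b64_split_24(input + i + 24, shuf);
        __m256i v2 = b64_split_24(input + i + 48, shuf);
        __m256i v3 = b64_split_24(input + i + 72, shuf);

        /* ASCII translation and stores of v0..v3 elided. */
        (void)v0; (void)v1; (void)v2; (void)v3;
    }
}

Whether a compiler would recover this schedule from the grouped form on its
own varies; writing the interleaving out in the source, as the patch does,
fixes the four independent chains regardless of the optimizer.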