Interleave load/compute stages in AVX2 base64 encoder

author krk <keremkat@gmail.com>

Thu, 29 Jan 2026 19:55:09 +0000 (19:55 +0000)

committer Tomas Mraz <tomas@openssl.org>

Fri, 13 Feb 2026 14:31:35 +0000 (15:31 +0100)
author krk <keremkat@gmail.com>
Thu, 29 Jan 2026 19:55:09 +0000 (19:55 +0000)
committer Tomas Mraz <tomas@openssl.org>
Fri, 13 Feb 2026 14:31:35 +0000 (15:31 +0100)
diff --git a/crypto/evp/enc_b64_avx2.c b/crypto/evp/enc_b64_avx2.c

index 141e7fcc87e5f761b27b8082089e7ad596bb345b..1e2b00e6473cfed55e1137cdf8b881ab0d4580da 100644 (file)
--- a/crypto/evp/enc_b64_avx2.c
+++ b/crypto/evp/enc_b64_avx2.c
@@ -494,39 +494,47 @@ int encode_base64_avx2(EVP_ENCODE_CTX *ctx, unsigned char *dst,
      /* Process 96 bytes at a time */
      for (; i + 100 <= srclen; i += 96) {
          _mm_prefetch((const char *)(input + i + 192), _MM_HINT_T0);
-        /* We shave off 4 bytes from the beginning and the end */
+        /*
+         * Interleaved for each vector: load, shuffle, bit-split, lookup
+         * before starting the next, giving the OoO engine independent work chains
+         * across execution ports.
+         */
          const __m128i lo0 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 0));
          const __m128i hi0 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 1));
+        __m256i in0 = _mm256_shuffle_epi8(_mm256_set_m128i(hi0, lo0), shuf);
+        const __m256i t0_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x0fc0fc00));
+        const __m256i t1_0 = _mm256_mulhi_epu16(t0_0, _mm256_set1_epi32(0x04000040));
+        const __m256i t2_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x003f03f0));
+        const __m256i t3_0 = _mm256_mullo_epi16(t2_0, _mm256_set1_epi32(0x01000010));
+        const __m256i input0 = _mm256_or_si256(t1_0, t3_0);
+
          const __m128i lo1 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 2));
          const __m128i hi1 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 3));
+        __m256i in1 = _mm256_shuffle_epi8(_mm256_set_m128i(hi1, lo1), shuf);
+        const __m256i t0_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x0fc0fc00));
+        const __m256i t1_1 = _mm256_mulhi_epu16(t0_1, _mm256_set1_epi32(0x04000040));
+        const __m256i t2_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x003f03f0));
+        const __m256i t3_1 = _mm256_mullo_epi16(t2_1, _mm256_set1_epi32(0x01000010));
+        const __m256i input1 = _mm256_or_si256(t1_1, t3_1);
+
          const __m128i lo2 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 4));
          const __m128i hi2 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 5));
+        __m256i in2 = _mm256_shuffle_epi8(_mm256_set_m128i(hi2, lo2), shuf);
+        const __m256i t0_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x0fc0fc00));
+        const __m256i t1_2 = _mm256_mulhi_epu16(t0_2, _mm256_set1_epi32(0x04000040));
+        const __m256i t2_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x003f03f0));
+        const __m256i t3_2 = _mm256_mullo_epi16(t2_2, _mm256_set1_epi32(0x01000010));
+        const __m256i input2 = _mm256_or_si256(t1_2, t3_2);
+
          const __m128i lo3 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 6));
          const __m128i hi3 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 7));
-        __m256i in0 = _mm256_shuffle_epi8(_mm256_set_m128i(hi0, lo0), shuf);
-        __m256i in1 = _mm256_shuffle_epi8(_mm256_set_m128i(hi1, lo1), shuf);
-        __m256i in2 = _mm256_shuffle_epi8(_mm256_set_m128i(hi2, lo2), shuf);
          __m256i in3 = _mm256_shuffle_epi8(_mm256_set_m128i(hi3, lo3), shuf);
-        const __m256i t0_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x0fc0fc00));
-        const __m256i t0_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x0fc0fc00));
-        const __m256i t0_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x0fc0fc00));
          const __m256i t0_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x0fc0fc00));
-        const __m256i t1_0 = _mm256_mulhi_epu16(t0_0, _mm256_set1_epi32(0x04000040));
-        const __m256i t1_1 = _mm256_mulhi_epu16(t0_1, _mm256_set1_epi32(0x04000040));
-        const __m256i t1_2 = _mm256_mulhi_epu16(t0_2, _mm256_set1_epi32(0x04000040));
          const __m256i t1_3 = _mm256_mulhi_epu16(t0_3, _mm256_set1_epi32(0x04000040));
-        const __m256i t2_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x003f03f0));
-        const __m256i t2_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x003f03f0));
-        const __m256i t2_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x003f03f0));
          const __m256i t2_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x003f03f0));
-        const __m256i t3_0 = _mm256_mullo_epi16(t2_0, _mm256_set1_epi32(0x01000010));
-        const __m256i t3_1 = _mm256_mullo_epi16(t2_1, _mm256_set1_epi32(0x01000010));
-        const __m256i t3_2 = _mm256_mullo_epi16(t2_2, _mm256_set1_epi32(0x01000010));
          const __m256i t3_3 = _mm256_mullo_epi16(t2_3, _mm256_set1_epi32(0x01000010));
-        const __m256i input0 = _mm256_or_si256(t1_0, t3_0);
-        const __m256i input1 = _mm256_or_si256(t1_1, t3_1);
-        const __m256i input2 = _mm256_or_si256(t1_2, t3_2);
          const __m256i input3 = _mm256_or_si256(t1_3, t3_3);
+
          __m256i vec0;
          __m256i vec1;
          __m256i vec2;
author	krk <keremkat@gmail.com>
	Thu, 29 Jan 2026 19:55:09 +0000 (19:55 +0000)
committer	Tomas Mraz <tomas@openssl.org>
	Fri, 13 Feb 2026 14:31:35 +0000 (15:31 +0100)