From: Nick Nuon Date: Thu, 6 Nov 2025 00:32:25 +0000 (-0500) Subject: Added AVX2 encoding + scalar improvements X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=3a69b1902892883d81c41747b2230c5168511026;p=thirdparty%2Fopenssl.git Added AVX2 encoding + scalar improvements Reviewed-by: Dmitry Belyavskiy Reviewed-by: Paul Dale (Merged from https://github.com/openssl/openssl/pull/29178) --- diff --git a/crypto/evp/bio_b64.c b/crypto/evp/bio_b64.c index de95a57057e..ff5c4e41ed2 100644 --- a/crypto/evp/bio_b64.c +++ b/crypto/evp/bio_b64.c @@ -41,6 +41,8 @@ typedef struct b64_struct { EVP_ENCODE_CTX *base64; unsigned char buf[EVP_ENCODE_LENGTH(B64_BLOCK_SIZE) + 10]; unsigned char tmp[B64_BLOCK_SIZE]; + unsigned char *encoded_buf; + size_t encoded_buf_len; } BIO_B64_CTX; static const BIO_METHOD methods_b64 = { @@ -72,6 +74,8 @@ static int b64_new(BIO *bi) ctx->cont = 1; ctx->start = 1; + ctx->encoded_buf = NULL; + ctx->encoded_buf_len = 0; ctx->base64 = EVP_ENCODE_CTX_new(); if (ctx->base64 == NULL) { OPENSSL_free(ctx); @@ -95,6 +99,9 @@ static int b64_free(BIO *a) if (ctx == NULL) return 0; + OPENSSL_free(ctx->encoded_buf); + ctx->encoded_buf = NULL; + ctx->encoded_buf_len = 0; EVP_ENCODE_CTX_free(ctx->base64); OPENSSL_free(ctx); BIO_set_data(a, NULL); @@ -379,95 +386,36 @@ static int b64_write(BIO *b, const char *in, int inl) if (in == NULL || inl <= 0) return 0; - while (inl > 0) { - n = inl > B64_BLOCK_SIZE ? B64_BLOCK_SIZE : inl; + int encoded_length = EVP_ENCODE_LENGTH(inl); - if ((BIO_get_flags(b) & BIO_FLAGS_BASE64_NO_NL) != 0) { - if (ctx->tmp_len > 0) { - if (!ossl_assert(ctx->tmp_len <= 3)) { - ERR_raise(ERR_LIB_BIO, ERR_R_INTERNAL_ERROR); - return ret == 0 ? 
-1 : ret; - } - n = 3 - ctx->tmp_len; - /* - * There's a theoretical possibility for this - */ - if (n > inl) - n = inl; - memcpy(&(ctx->tmp[ctx->tmp_len]), in, n); - ctx->tmp_len += n; - ret += n; - if (ctx->tmp_len < 3) - break; - ctx->buf_len = EVP_EncodeBlock(ctx->buf, ctx->tmp, ctx->tmp_len); - if (!ossl_assert(ctx->buf_len <= (int)sizeof(ctx->buf))) { - ERR_raise(ERR_LIB_BIO, ERR_R_INTERNAL_ERROR); - return ret == 0 ? -1 : ret; - } - if (!ossl_assert(ctx->buf_len >= ctx->buf_off)) { - ERR_raise(ERR_LIB_BIO, ERR_R_INTERNAL_ERROR); - return ret == 0 ? -1 : ret; - } - /* - * Since we're now done using the temporary buffer, the - * length should be 0'd - */ - ctx->tmp_len = 0; - } else { - if (n < 3) { - memcpy(ctx->tmp, in, n); - ctx->tmp_len = n; - ret += n; - break; - } - n -= n % 3; - ctx->buf_len = EVP_EncodeBlock(ctx->buf, (unsigned char *)in, n); - if (!ossl_assert(ctx->buf_len <= (int)sizeof(ctx->buf))) { - ERR_raise(ERR_LIB_BIO, ERR_R_INTERNAL_ERROR); - return ret == 0 ? -1 : ret; - } - if (!ossl_assert(ctx->buf_len >= ctx->buf_off)) { - ERR_raise(ERR_LIB_BIO, ERR_R_INTERNAL_ERROR); - return ret == 0 ? -1 : ret; - } - ret += n; - } - } else { - if (!EVP_EncodeUpdate(ctx->base64, ctx->buf, &ctx->buf_len, - (unsigned char *)in, n)) - return ret == 0 ? -1 : ret; - if (!ossl_assert(ctx->buf_len <= (int)sizeof(ctx->buf))) { - ERR_raise(ERR_LIB_BIO, ERR_R_INTERNAL_ERROR); - return ret == 0 ? -1 : ret; - } - if (!ossl_assert(ctx->buf_len >= ctx->buf_off)) { - ERR_raise(ERR_LIB_BIO, ERR_R_INTERNAL_ERROR); - return ret == 0 ? 
-1 : ret; - } - ret += n; + if (ctx->encoded_buf == NULL || (size_t)encoded_length > ctx->encoded_buf_len) { + OPENSSL_free(ctx->encoded_buf); + ctx->encoded_buf = OPENSSL_malloc(encoded_length); + if (ctx->encoded_buf == NULL) { + ERR_raise(ERR_LIB_BIO, ERR_R_MALLOC_FAILURE); + return -1; } - inl -= n; - in += n; + ctx->encoded_buf_len = encoded_length; + } - ctx->buf_off = 0; - n = ctx->buf_len; - while (n > 0) { - i = BIO_write(next, &(ctx->buf[ctx->buf_off]), n); - if (i <= 0) { - BIO_copy_next_retry(b); - return ret == 0 ? i : ret; - } - n -= i; - ctx->buf_off += i; - if (!ossl_assert(ctx->buf_off <= (int)sizeof(ctx->buf))) { - ERR_raise(ERR_LIB_BIO, ERR_R_INTERNAL_ERROR); - return ret == 0 ? -1 : ret; - } - if (!ossl_assert(ctx->buf_len >= ctx->buf_off)) { - ERR_raise(ERR_LIB_BIO, ERR_R_INTERNAL_ERROR); - return ret == 0 ? -1 : ret; - } - } + unsigned char *encoded = ctx->encoded_buf; + + if (encoded == NULL) { + ERR_raise(ERR_LIB_BIO, ERR_R_MALLOC_FAILURE); + return -1; + } + int n_bytes_enc = 0; + if (!EVP_EncodeUpdate(ctx->base64, encoded, &n_bytes_enc, + (unsigned char *)in, inl)) { + if (ret == 0) + return -1; + return ret; + } + ret += inl; + i = BIO_write(next, encoded, n_bytes_enc); + if (i <= 0) { + BIO_copy_next_retry(b); + return ret == 0 ? 
i : ret; ctx->buf_len = 0; ctx->buf_off = 0; } diff --git a/crypto/evp/build.info b/crypto/evp/build.info index 45945afcabe..5897acd943d 100644 --- a/crypto/evp/build.info +++ b/crypto/evp/build.info @@ -8,6 +8,8 @@ $COMMON=digest.c evp_enc.c evp_lib.c evp_fetch.c evp_utils.c \ SOURCE[../../libcrypto]=$COMMON\ encode.c evp_key.c evp_cnf.c \ + enc_b64_scalar.c \ + enc_b64_avx2.c \ e_des.c e_bf.c e_idea.c e_des3.c \ e_rc4.c e_aes.c names.c e_aria.c e_sm4.c \ e_xcbc_d.c e_rc2.c e_cast.c e_rc5.c m_null.c \ diff --git a/crypto/evp/enc_b64_avx2.c b/crypto/evp/enc_b64_avx2.c new file mode 100644 index 00000000000..b3f46e66aec --- /dev/null +++ b/crypto/evp/enc_b64_avx2.c @@ -0,0 +1,661 @@ +#include +#include "enc_b64_scalar.h" +#include "enc_b64_avx2.h" +#include "internal/cryptlib.h" +#include "crypto/evp.h" +#include "evp_local.h" + +#if defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64) +#define STRINGIFY_IMPLEMENTATION_(a) #a +#define STRINGIFY(a) STRINGIFY_IMPLEMENTATION_(a) + +#ifdef __clang__ +/* + * clang does not have GCC push pop + * warning: clang attribute push can't be used within a namespace in clang up + * til 8.0 so OPENSSL_TARGET_REGION and OPENSSL_UNTARGET_REGION must be + * outside* of a namespace. + */ +#define OPENSSL_TARGET_REGION(T) \ + _Pragma(STRINGIFY(clang attribute push(__attribute__((target(T))), \ + apply_to = function))) +#define OPENSSL_UNTARGET_REGION _Pragma("clang attribute pop") +#elif defined(__GNUC__) +#define OPENSSL_TARGET_REGION(T) \ + _Pragma("GCC push_options") _Pragma(STRINGIFY(GCC target(T))) +#define OPENSSL_UNTARGET_REGION _Pragma("GCC pop_options") +#endif /* clang then gcc */ + +/* Default target region macros don't do anything. 
*/ +#ifndef OPENSSL_TARGET_REGION +#define OPENSSL_TARGET_REGION(T) +#define OPENSSL_UNTARGET_REGION +#endif + +#define OPENSSL_TARGET_AVX2 \ + OPENSSL_TARGET_REGION("avx2") +#define OPENSSL_UNTARGET_AVX2 OPENSSL_UNTARGET_REGION + +/* + * Ensure this whole block is compiled with AVX2 enabled on GCC. + * Clang/MSVC will just ignore these pragmas. + */ + +#include +#include +#include +#include + +OPENSSL_TARGET_AVX2 +static __m256i lookup_pshufb_std(__m256i input) +{ + __m256i result = _mm256_subs_epu8(input, _mm256_set1_epi8(51)); + const __m256i less = _mm256_cmpgt_epi8(_mm256_set1_epi8(26), input); + + result = _mm256_or_si256(result, _mm256_and_si256(less, _mm256_set1_epi8(13))); + __m256i shift_LUT = _mm256_setr_epi8('a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, + '0' - 52, '0' - 52, + '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, + '/' - 63, 'A', 0, 0, + 'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, + '0' - 52, '0' - 52, + '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, + '/' - 63, 'A', 0, 0); + + result = _mm256_shuffle_epi8(shift_LUT, result); + return _mm256_add_epi8(result, input); +} +OPENSSL_UNTARGET_AVX2 + +OPENSSL_TARGET_AVX2 +static inline __m256i lookup_pshufb_srp(__m256i input) +{ + const __m256i zero = _mm256_setzero_si256(); + const __m256i hi = _mm256_set1_epi8((char)0x80); + __m256i invalid = _mm256_or_si256(_mm256_cmpgt_epi8(zero, input), + _mm256_cmpgt_epi8(input, + _mm256_set1_epi8(63))); + __m256i idx = _mm256_setzero_si256(); + + idx = _mm256_sub_epi8(idx, _mm256_cmpgt_epi8(input, _mm256_set1_epi8(9))); + idx = _mm256_sub_epi8(idx, _mm256_cmpgt_epi8(input, _mm256_set1_epi8(35))); + idx = _mm256_blendv_epi8(idx, _mm256_set1_epi8(3), + _mm256_cmpeq_epi8(input, _mm256_set1_epi8(62))); + idx = _mm256_blendv_epi8(idx, _mm256_set1_epi8(4), + _mm256_cmpeq_epi8(input, _mm256_set1_epi8(63))); + + /* Zero-out invalid lanes via PSHUFB's high-bit mechanism */ + idx = _mm256_or_si256(idx, _mm256_and_si256(invalid, hi)); + + const __m256i 
shift_LUT = _mm256_setr_epi8('0' - 0, 'A' - 10, 'a' - 36, '.' - 62, '/' - 63, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, + '0' - 0, 'A' - 10, 'a' - 36, '.' - 62, '/' - 63, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0); + + __m256i shift = _mm256_shuffle_epi8(shift_LUT, idx); + __m256i ascii = _mm256_add_epi8(shift, input); + return ascii; +} +OPENSSL_UNTARGET_AVX2 + +OPENSSL_TARGET_AVX2 +static inline __m256i shift_right_zeros(__m256i v, int n) +{ + switch (n) { + case 0: + return v; + case 1: + return _mm256_srli_si256(v, 1); + case 2: + return _mm256_srli_si256(v, 2); + case 3: + return _mm256_srli_si256(v, 3); + case 4: + return _mm256_srli_si256(v, 4); + case 5: + return _mm256_srli_si256(v, 5); + case 6: + return _mm256_srli_si256(v, 6); + case 7: + return _mm256_srli_si256(v, 7); + case 8: + return _mm256_srli_si256(v, 8); + case 9: + return _mm256_srli_si256(v, 9); + case 10: + return _mm256_srli_si256(v, 10); + case 11: + return _mm256_srli_si256(v, 11); + case 12: + return _mm256_srli_si256(v, 12); + case 13: + return _mm256_srli_si256(v, 13); + case 14: + return _mm256_srli_si256(v, 14); + case 15: + return _mm256_srli_si256(v, 15); + default: + return _mm256_setzero_si256(); + } +} +OPENSSL_UNTARGET_AVX2 + +OPENSSL_TARGET_AVX2 +static inline __m256i shift_left_zeros(__m256i v, int n) +{ + switch (n) { + case 0: + return v; + case 1: + return _mm256_slli_si256(v, 1); + case 2: + return _mm256_slli_si256(v, 2); + case 3: + return _mm256_slli_si256(v, 3); + case 4: + return _mm256_slli_si256(v, 4); + case 5: + return _mm256_slli_si256(v, 5); + case 6: + return _mm256_slli_si256(v, 6); + case 7: + return _mm256_slli_si256(v, 7); + case 8: + return _mm256_slli_si256(v, 8); + case 9: + return _mm256_slli_si256(v, 9); + case 10: + return _mm256_slli_si256(v, 10); + case 11: + return _mm256_slli_si256(v, 11); + case 12: + return _mm256_slli_si256(v, 12); + case 13: + return _mm256_slli_si256(v, 13); + case 14: + return _mm256_slli_si256(v, 14); + case 15: + return 
_mm256_slli_si256(v, 15); + case 16: + return _mm256_setzero_si256(); + default: + return _mm256_setzero_si256(); + } +} +OPENSSL_UNTARGET_AVX2 + +static const uint8_t shuffle_masks[16][16] = { + { 0x80, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 }, + { 0, 0x80, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 }, + { 0, 1, 0x80, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 }, + { 0, 1, 2, 0x80, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 }, + { 0, 1, 2, 3, 0x80, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 }, + { 0, 1, 2, 3, 4, 0x80, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 }, + { 0, 1, 2, 3, 4, 5, 0x80, 6, 7, 8, 9, 10, 11, 12, 13, 14 }, + { 0, 1, 2, 3, 4, 5, 6, 0x80, 7, 8, 9, 10, 11, 12, 13, 14 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 0x80, 8, 9, 10, 11, 12, 13, 14 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 0x80, 9, 10, 11, 12, 13, 14 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0x80, 10, 11, 12, 13, 14 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0x80, 11, 12, 13, 14 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0x80, 12, 13, 14 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0x80, 13, 14 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0x80, 14 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0x80 } +}; + +/** + * Insert a line feed character in the 64-byte input at index K in [0,32). 
+ */ +OPENSSL_TARGET_AVX2 +static inline __m256i insert_line_feed32(__m256i input, int K) +{ + __m256i line_feed_vector = _mm256_set1_epi8('\n'); + __m128i identity = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + + if (K >= 16) { + __m128i maskhi = _mm_loadu_si128((__m128i *)shuffle_masks[K - 16]); + __m256i mask = _mm256_set_m128i(maskhi, identity); + __m256i lf_pos = _mm256_cmpeq_epi8(mask, _mm256_set1_epi8((char)0x80)); + __m256i shuffled = _mm256_shuffle_epi8(input, mask); + __m256i result = _mm256_blendv_epi8(shuffled, line_feed_vector, lf_pos); + + return result; + } + /* Shift input right by 1 byte */ + __m256i shift = _mm256_alignr_epi8(input, _mm256_permute2x128_si256(input, input, 0x21), + 15); + input = _mm256_blend_epi32(input, shift, 0xF0); + __m128i masklo = _mm_loadu_si128((__m128i *)shuffle_masks[K]); + __m256i mask = _mm256_set_m128i(identity, masklo); + __m256i lf_pos = _mm256_cmpeq_epi8(mask, _mm256_set1_epi8((char)0x80)); + __m256i shuffled = _mm256_shuffle_epi8(input, mask); + __m256i result = _mm256_blendv_epi8(shuffled, line_feed_vector, lf_pos); + return result; +} +OPENSSL_UNTARGET_AVX2 + +OPENSSL_TARGET_AVX2 +static inline size_t ins_nl_gt32(__m256i v, uint8_t *out, int stride, + int *wrap_cnt) +{ + const int until_nl = stride - *wrap_cnt; + + if (until_nl > 32) { + _mm256_storeu_si256((__m256i *)out, v); + + *wrap_cnt += 32; + return 32; + } + + if (until_nl == 32) { + _mm256_storeu_si256((__m256i *)out, v); + + out[32] = '\n'; + *wrap_cnt = 0; + return 33; + } + + const uint8_t last = (uint8_t)_mm256_extract_epi8(v, 31); + const __m256i with_lf = insert_line_feed32(v, until_nl); + _mm256_storeu_si256((__m256i *)out, with_lf); + out[32] = last; + + *wrap_cnt = 32 - until_nl; + return 33; +} +OPENSSL_UNTARGET_AVX2 + +OPENSSL_TARGET_AVX2 +static inline size_t insert_nl_gt16(const __m256i v0, + uint8_t *output, + int wrap_max, int *wrap_cnt) +{ + uint8_t *out = output; + int wrap_rem = wrap_max - *wrap_cnt; + 
_mm256_storeu_si256((__m256i *)(output), v0); + + if (wrap_rem > 32) { + *wrap_cnt += 32; + return 32; + } + + __m256i all_ff_mask = _mm256_set1_epi8((char)0xFF); + + __m256i mask_second_lane = _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + (char)0xFF, (char)0xFF, (char)0xFF, (char)0xFF, + (char)0xFF, (char)0xFF, (char)0xFF, (char)0xFF, + (char)0xFF, (char)0xFF, (char)0xFF, (char)0xFF, + (char)0xFF, (char)0xFF, (char)0xFF, (char)0xFF); + + __m256i blended_0L = v0; + int surplus_0 = wrap_rem < 16 ? 1 : 0; + if (surplus_0 == 1) { + __m256i shifted_0_L = shift_left_zeros(shift_right_zeros(v0, wrap_rem), + wrap_rem + surplus_0); + __m256i mask_shifted_0_L = shift_left_zeros(all_ff_mask, wrap_rem + surplus_0); + __m256i mask = _mm256_or_si256(mask_shifted_0_L, mask_second_lane); + __m256i shifted_1_L = shift_left_zeros(v0, 1); + __m256i shifted = _mm256_blendv_epi8(shifted_0_L, shifted_1_L, mask); + + blended_0L = _mm256_blendv_epi8(v0, shifted, mask); + _mm256_storeu_si256((__m256i *)(output), blended_0L); + wrap_rem += wrap_max; + } + + int surplus_1 = (wrap_rem >= 16 && wrap_rem < 32) ? 
1 : 0; + int last_of_1L = _mm256_extract_epi8(v0, 31); + + if (surplus_1 == 1) { + uint16_t sec_last_of_1L = _mm256_extract_epi8(v0, 30); + int wrap_rem_1 = wrap_rem - 16; + __m256i shifted_1_L = shift_left_zeros(shift_right_zeros(v0, wrap_rem_1), + wrap_rem_1 + surplus_0 + surplus_1); + __m256i mask_shifted_1_L = shift_left_zeros(all_ff_mask, wrap_rem_1 + surplus_0 + surplus_1); + __m256i mask = _mm256_and_si256(mask_second_lane, mask_shifted_1_L); + __m256i blended_1L = _mm256_blendv_epi8(blended_0L, shifted_1_L, mask); + _mm256_storeu_si256((__m256i *)(output), blended_1L); + + output[wrap_rem + surplus_0] = '\n'; + output[31 + surplus_0] = (uint8_t)sec_last_of_1L; + output[31 + surplus_0 + surplus_1] = last_of_1L; + } + + if (surplus_0 == 1) { + output[wrap_rem - wrap_max] = '\n'; + output[16] = _mm256_extract_epi8(v0, 15); + output[31 + surplus_0 + surplus_1] = last_of_1L; + } + + *wrap_cnt = wrap_rem > 32 ? 32 - (wrap_rem - wrap_max) : 32 - wrap_rem; + + int nl_at_end = 0; + if (*wrap_cnt == wrap_max || *wrap_cnt == 0) { + *wrap_cnt = 0; + output[32 + surplus_0 + surplus_1] = '\n'; + nl_at_end = 1; + } + + out += 32 + surplus_0 + surplus_1 + nl_at_end; + size_t written = (size_t)(out - output); + + return written; +} +OPENSSL_UNTARGET_AVX2 + +OPENSSL_TARGET_AVX2 +static inline size_t insert_nl_2nd_vec_stride_12(const __m256i v0, + uint8_t *output, + int dummy_stride, + int *wrap_cnt) +{ + __m256i shuffling_mask = _mm256_setr_epi8(0, 1, 2, 3, (char)0xFF, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + (char)0xFF, + (char)0xFF, (char)0xFF, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, (char)0xFF, + 12); + __m256i shuffled = _mm256_shuffle_epi8(v0, shuffling_mask); + + _mm256_storeu_si256((__m256i *)(output + 0), shuffled); + + int16_t rem_1_L_ext = _mm256_extract_epi16(v0, 7); + int8_t rem_2_L_ext_P1 = _mm256_extract_epi8(v0, 29); + int16_t rem_2_L_ext_P2 = _mm256_extract_epi16(v0, 15); + + uint8_t *out = output; + out[4] = '\n'; + memcpy(out + 15, &rem_1_L_ext, 
sizeof(rem_1_L_ext)); + out[16 + 1] = '\n'; + memcpy(out + 15 + 17, &rem_2_L_ext_P1, sizeof(rem_2_L_ext_P1)); + out[16 + 14] = '\n'; + memcpy(out + 15 + 17 + 1, &rem_2_L_ext_P2, sizeof(rem_2_L_ext_P2)); + + out += 32 + 3; + *wrap_cnt = 4; + + size_t written = (out - output); + return written; +} +OPENSSL_UNTARGET_AVX2 + +OPENSSL_TARGET_AVX2 +static inline __m256i insert_newlines_by_mask(__m256i data, __m256i mask) +{ + __m256i newline = _mm256_set1_epi8('\n'); + + return _mm256_or_si256(_mm256_and_si256(mask, newline), + _mm256_andnot_si256(mask, data)); +} +OPENSSL_UNTARGET_AVX2 + +OPENSSL_TARGET_AVX2 +static inline size_t insert_nl_str4(const __m256i v0, uint8_t *output) +{ + __m256i shuffling_mask = _mm256_setr_epi8(0, 1, 2, 3, (char)0xFF, 4, 5, 6, + 7, (char)0xFF, 8, 9, 10, 11, (char)0xFF, 12, + (char)0xFF, (char)0xFF, (char)0xFF, (char)0xFF, 0, 1, 2, 3, + (char)0xFF, 4, 5, 6, 7, (char)0xFF, 8, 9); + __m256i mask_5_bytes = _mm256_setr_epi8(0, 0, 0, 0, (char)0xFF, 0, 0, 0, 0, (char)0xFF, + 0, 0, 0, 0, (char)0xFF, 0, 0, 0, 0, (char)0xFF, + 0, 0, 0, 0, (char)0xFF, 0, 0, 0, 0, (char)0xFF, + 0, 0); + __m256i shuffled_4_bytes = _mm256_shuffle_epi8(v0, shuffling_mask); + __m256i v0_w_nl = insert_newlines_by_mask(shuffled_4_bytes, mask_5_bytes); + + _mm256_storeu_si256((__m256i *)(output + 0), v0_w_nl); + + /* Handle cross-lane remainder logic */ + /* Without macros, _mm256_srli_si256 complains that the last arg must be an 8-bit immediate */ +#define B_LANE 16 /* Bytes per lane */ +#define N_RET_1_L 3 /* bytes "shifted out" of lane 0 */ +#define N_RET_2_L (N_RET_1_L + 4) /* bytes "shifted out" of lane 1 */ + + /* Bytes that were shifted out of lane 0 */ + __m256i rem_1_L = _mm256_srli_si256(v0, B_LANE - N_RET_1_L); + + /* Bytes that were shifted out of lane 1 */ + __m256i rem_2_L_P1 = _mm256_srli_si256(_mm256_slli_si256(_mm256_srli_si256(v0, B_LANE - N_RET_2_L), + B_LANE - N_RET_1_L), + B_LANE - 2); + + /* isolate the bytes that were shifted out of lane 1 */ + __m256i 
rem_2_L_P2 = _mm256_slli_si256( + _mm256_srli_si256(v0, + B_LANE - N_RET_2_L + N_RET_1_L), + N_RET_1_L); + + __m256i rem_2_L = _mm256_or_si256(rem_2_L_P1, rem_2_L_P2); + + int32_t rem_1_L_ext = _mm256_extract_epi32(rem_1_L, 0); + int64_t rem_2_L_ext = _mm256_extract_epi64(rem_2_L, 2); + + uint8_t *out = output + 16; + memcpy(out, &rem_1_L_ext, sizeof(rem_1_L_ext)); + out += 3; + *out++ = '\n'; + + out = output + 32; + memcpy(out, &rem_2_L_ext, sizeof(rem_2_L_ext)); + out += 2; + *out++ = '\n'; + out += 4; + *out++ = '\n'; + + size_t written = (out - output); + return written; +} +OPENSSL_UNTARGET_AVX2 + +OPENSSL_TARGET_AVX2 +static inline size_t insert_nl_str8(const __m256i v0, uint8_t *output) +{ + __m256i shuffling_mask = _mm256_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, (char)0xFF, + 8, 9, 10, 11, 12, 13, 14, + (char)0xFF, (char)0xFF, 0, 1, 2, 3, 4, 5, 6, + 7, (char)0xFF, 8, 9, 10, 11, 12); + __m256i shuffled_4_bytes = _mm256_shuffle_epi8(v0, shuffling_mask); + _mm256_storeu_si256((__m256i *)(output), shuffled_4_bytes); + int8_t rem_1_L = _mm256_extract_epi8(v0, 15); + int8_t rem_2_L_P1 = _mm256_extract_epi8(v0, 29); + int16_t rem_2_L_P2 = _mm256_extract_epi16(v0, 15); + uint8_t *out = output; + + memcpy(out + 16, &rem_1_L, sizeof(rem_1_L)); + memcpy(out + 32, &rem_2_L_P1, sizeof(rem_2_L_P1)); + memcpy(out + 32 + 1, &rem_2_L_P2, sizeof(rem_2_L_P2)); + + output[8] = '\n'; + output[17] = '\n'; + output[26] = '\n'; + output[35] = '\n'; + + out += 32 + 4; + + size_t written = (out - output); + return written; +} +OPENSSL_UNTARGET_AVX2 + +OPENSSL_TARGET_AVX2 +int encode_base64_avx2(EVP_ENCODE_CTX *ctx, unsigned char *dst, + const unsigned char *src, int srclen, int ctx_length, + int *final_wrap_cnt) +{ + const uint8_t *input = (const uint8_t *)src; + uint8_t *out = (uint8_t *)dst; + int i = 0; + int stride = (ctx == NULL) ? 
0 : ctx_length / 3 * 4; + int wrap_cnt = 0; + const int use_srp = (ctx != NULL + && (ctx->flags & EVP_ENCODE_CTX_USE_SRP_ALPHABET) != 0); + const __m256i shuf = _mm256_set_epi8(10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1, + 10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1); + int base = 0; + + /* Process 96 bytes at a time */ + for (; i + 100 <= srclen; i += 96) { + /* We shave off 4 bytes from the beginning and the end */ + const __m128i lo0 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 0)); + const __m128i hi0 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 1)); + const __m128i lo1 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 2)); + const __m128i hi1 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 3)); + const __m128i lo2 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 4)); + const __m128i hi2 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 5)); + const __m128i lo3 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 6)); + const __m128i hi3 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 7)); + __m256i in0 = _mm256_shuffle_epi8(_mm256_set_m128i(hi0, lo0), shuf); + __m256i in1 = _mm256_shuffle_epi8(_mm256_set_m128i(hi1, lo1), shuf); + __m256i in2 = _mm256_shuffle_epi8(_mm256_set_m128i(hi2, lo2), shuf); + __m256i in3 = _mm256_shuffle_epi8(_mm256_set_m128i(hi3, lo3), shuf); + const __m256i t0_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x0fc0fc00)); + const __m256i t0_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x0fc0fc00)); + const __m256i t0_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x0fc0fc00)); + const __m256i t0_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x0fc0fc00)); + const __m256i t1_0 = _mm256_mulhi_epu16(t0_0, _mm256_set1_epi32(0x04000040)); + const __m256i t1_1 = _mm256_mulhi_epu16(t0_1, _mm256_set1_epi32(0x04000040)); + const __m256i t1_2 = _mm256_mulhi_epu16(t0_2, _mm256_set1_epi32(0x04000040)); + const __m256i t1_3 = _mm256_mulhi_epu16(t0_3, _mm256_set1_epi32(0x04000040)); + 
const __m256i t2_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x003f03f0)); + const __m256i t2_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x003f03f0)); + const __m256i t2_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x003f03f0)); + const __m256i t2_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x003f03f0)); + const __m256i t3_0 = _mm256_mullo_epi16(t2_0, _mm256_set1_epi32(0x01000010)); + const __m256i t3_1 = _mm256_mullo_epi16(t2_1, _mm256_set1_epi32(0x01000010)); + const __m256i t3_2 = _mm256_mullo_epi16(t2_2, _mm256_set1_epi32(0x01000010)); + const __m256i t3_3 = _mm256_mullo_epi16(t2_3, _mm256_set1_epi32(0x01000010)); + const __m256i input0 = _mm256_or_si256(t1_0, t3_0); + const __m256i input1 = _mm256_or_si256(t1_1, t3_1); + const __m256i input2 = _mm256_or_si256(t1_2, t3_2); + const __m256i input3 = _mm256_or_si256(t1_3, t3_3); + __m256i vec0; + __m256i vec1; + __m256i vec2; + __m256i vec3; + + if (use_srp) { + vec0 = lookup_pshufb_srp(input0); + vec1 = lookup_pshufb_srp(input1); + vec2 = lookup_pshufb_srp(input2); + vec3 = lookup_pshufb_srp(input3); + + } else { + vec0 = lookup_pshufb_std(input0); + vec1 = lookup_pshufb_std(input1); + vec2 = lookup_pshufb_std(input2); + vec3 = lookup_pshufb_std(input3); + } + + if (stride == 0) { + _mm256_storeu_si256((__m256i *)out, vec0); + + out += 32; + _mm256_storeu_si256((__m256i *)out, vec1); + + out += 32; + _mm256_storeu_si256((__m256i *)out, vec2); + + out += 32; + _mm256_storeu_si256((__m256i *)out, vec3); + + out += 32; + } else if (stride == 64) { + _mm256_storeu_si256((__m256i *)out, vec0); + + out += 32; + _mm256_storeu_si256((__m256i *)out, vec1); + + out += 32; + *(out++) = '\n'; + + _mm256_storeu_si256((__m256i *)out, vec2); + out += 32; + + _mm256_storeu_si256((__m256i *)out, vec3); + out += 32; + + *(out++) = '\n'; + } else if (stride == 4) { + int out_idx = 0; + + out_idx += (int)insert_nl_str4(vec0, out + out_idx); + out_idx += (int)insert_nl_str4(vec1, out + out_idx); + out_idx += (int)insert_nl_str4(vec2, 
out + out_idx); + out_idx += (int)insert_nl_str4(vec3, out + out_idx); + + out += out_idx; + } else if (stride == 8) { + + out += insert_nl_str8(vec0, out); + out += insert_nl_str8(vec1, out); + out += insert_nl_str8(vec2, out); + out += insert_nl_str8(vec3, out); + + } else if (stride == 12) { + switch (base) { + case 0: + + out += insert_nl_gt16(vec0, out, stride, &wrap_cnt); + out += insert_nl_2nd_vec_stride_12(vec1, out, stride, &wrap_cnt); + out += insert_nl_gt16(vec2, out, stride, &wrap_cnt); + out += insert_nl_gt16(vec3, out, stride, &wrap_cnt); + break; + case 1: + out += insert_nl_2nd_vec_stride_12(vec0, out, stride, &wrap_cnt); + out += insert_nl_gt16(vec1, out, stride, &wrap_cnt); + out += insert_nl_gt16(vec2, out, stride, &wrap_cnt); + out += insert_nl_2nd_vec_stride_12(vec3, out, stride, &wrap_cnt); + break; + default: /* base == 2 */ + out += insert_nl_gt16(vec0, out, stride, &wrap_cnt); + out += insert_nl_gt16(vec1, out, stride, &wrap_cnt); + out += insert_nl_2nd_vec_stride_12(vec2, out, stride, &wrap_cnt); + out += insert_nl_gt16(vec3, out, stride, &wrap_cnt); + break; + } + + if (++base == 3) + base = 0; + } else if (stride >= 32) { + out += ins_nl_gt32(vec0, out, stride, &wrap_cnt); + out += ins_nl_gt32(vec1, out, stride, &wrap_cnt); + out += ins_nl_gt32(vec2, out, stride, &wrap_cnt); + out += ins_nl_gt32(vec3, out, stride, &wrap_cnt); + } else if (stride >= 16) { + out += insert_nl_gt16(vec0, out, stride, &wrap_cnt); + out += insert_nl_gt16(vec1, out, stride, &wrap_cnt); + out += insert_nl_gt16(vec2, out, stride, &wrap_cnt); + out += insert_nl_gt16(vec3, out, stride, &wrap_cnt); + } + } + + if (stride == 0) { + for (; i + 28 <= srclen; i += 24) { + /* lo = [xxxx|DDDC|CCBB|BAAA] */ + /* hi = [xxxx|HHHG|GGFF|FEEE] */ + const __m128i lo = _mm_loadu_si128((const __m128i *)(input + i)); + const __m128i hi = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3)); + /* + * bytes from groups A, B and C are needed in separate 32-bit lanes + * in = 
[0HHH|0GGG|0FFF|0EEE[0DDD|0CCC|0BBB|0AAA] + */ + __m256i in = _mm256_shuffle_epi8(_mm256_set_m128i(hi, lo), shuf); + const __m256i t0 = _mm256_and_si256(in, _mm256_set1_epi32(0x0fc0fc00)); + const __m256i t1 = _mm256_mulhi_epu16(t0, _mm256_set1_epi32(0x04000040)); + const __m256i t2 = _mm256_and_si256(in, _mm256_set1_epi32(0x003f03f0)); + const __m256i t3 = _mm256_mullo_epi16(t2, _mm256_set1_epi32(0x01000010)); + const __m256i indices = _mm256_or_si256(t1, t3); + _mm256_storeu_si256((__m256i *)out, (use_srp ? lookup_pshufb_srp : lookup_pshufb_std)(indices)); + + out += 32; + } + } + *final_wrap_cnt = wrap_cnt; + + if (stride >= 32 && wrap_cnt == stride) { + wrap_cnt = 0; + *out++ = '\n'; + } + + return (int)(out - (uint8_t *)dst) + +evp_encodeblock_int(ctx, out, src + i, srclen - i, final_wrap_cnt); +} +OPENSSL_UNTARGET_AVX2 +#endif diff --git a/crypto/evp/enc_b64_avx2.h b/crypto/evp/enc_b64_avx2.h new file mode 100644 index 00000000000..9c871ac13c5 --- /dev/null +++ b/crypto/evp/enc_b64_avx2.h @@ -0,0 +1,12 @@ +#ifndef OSSL_CRYPTO_EVP_B64_AVX2_H +#define OSSL_CRYPTO_EVP_B64_AVX2_H + +#include + +#if defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64) +int encode_base64_avx2(EVP_ENCODE_CTX *ctx, + unsigned char *out, const unsigned char *src, int srclen, + int newlines, int *wrap_cnt); +#endif + +#endif diff --git a/crypto/evp/enc_b64_scalar.c b/crypto/evp/enc_b64_scalar.c new file mode 100644 index 00000000000..d7b10f648bd --- /dev/null +++ b/crypto/evp/enc_b64_scalar.c @@ -0,0 +1,280 @@ +#include +#include "internal/cryptlib.h" +#include "crypto/evp.h" +#include "evp_local.h" +#include "enc_b64_scalar.h" + +static const unsigned char base64_srp_bin2ascii_0[256] = { + '0', '0', '0', '0', '1', '1', '1', '1', '2', '2', '2', '2', '3', '3', '3', '3', + '4', '4', '4', '4', '5', '5', '5', '5', '6', '6', '6', '6', '7', '7', '7', '7', + '8', '8', '8', '8', '9', '9', '9', '9', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', + 'C', 'C', 'C', 'C', 'D', 
'D', 'D', 'D', 'E', 'E', 'E', 'E', 'F', 'F', 'F', 'F', + 'G', 'G', 'G', 'G', 'H', 'H', 'H', 'H', 'I', 'I', 'I', 'I', 'J', 'J', 'J', 'J', + 'K', 'K', 'K', 'K', 'L', 'L', 'L', 'L', 'M', 'M', 'M', 'M', 'N', 'N', 'N', 'N', + 'O', 'O', 'O', 'O', 'P', 'P', 'P', 'P', 'Q', 'Q', 'Q', 'Q', 'R', 'R', 'R', 'R', + 'S', 'S', 'S', 'S', 'T', 'T', 'T', 'T', 'U', 'U', 'U', 'U', 'V', 'V', 'V', 'V', + 'W', 'W', 'W', 'W', 'X', 'X', 'X', 'X', 'Y', 'Y', 'Y', 'Y', 'Z', 'Z', 'Z', 'Z', + 'a', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'c', 'c', 'c', 'c', 'd', 'd', 'd', 'd', + 'e', 'e', 'e', 'e', 'f', 'f', 'f', 'f', 'g', 'g', 'g', 'g', 'h', 'h', 'h', 'h', + 'i', 'i', 'i', 'i', 'j', 'j', 'j', 'j', 'k', 'k', 'k', 'k', 'l', 'l', 'l', 'l', + 'm', 'm', 'm', 'm', 'n', 'n', 'n', 'n', 'o', 'o', 'o', 'o', 'p', 'p', 'p', 'p', + 'q', 'q', 'q', 'q', 'r', 'r', 'r', 'r', 's', 's', 's', 's', 't', 't', 't', 't', + 'u', 'u', 'u', 'u', 'v', 'v', 'v', 'v', 'w', 'w', 'w', 'w', 'x', 'x', 'x', 'x', + 'y', 'y', 'y', 'y', 'z', 'z', 'z', 'z', '.', '.', '.', '.', '/', '/', '/', '/' +}; + +static const unsigned char base64_srp_bin2ascii_1[256] = { + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F', + 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', + 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', + 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '.', '/', + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F', + 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', + 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', + 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '.', '/', + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F', + 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', + 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 
'k', 'l', + 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '.', '/', + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F', + 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', + 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', + 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '.', '/' +}; + +static const unsigned char base64_srp_bin2ascii_2[256] = { + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F', + 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', + 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', + 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '.', '/', + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F', + 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', + 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', + 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '.', '/', + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F', + 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', + 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', + 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '.', '/', + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F', + 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', + 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', + 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '.', '/' +}; + +static const unsigned char base64_std_bin2ascii_0[256] = { + 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'C', 'C', 'C', 'C', 'D', 'D', 'D', + 'D', 'E', 'E', 'E', 'E', 'F', 'F', 'F', 'F', 'G', 'G', 
'G', 'G', 'H', 'H', + 'H', 'H', 'I', 'I', 'I', 'I', 'J', 'J', 'J', 'J', 'K', 'K', 'K', 'K', 'L', + 'L', 'L', 'L', 'M', 'M', 'M', 'M', 'N', 'N', 'N', 'N', 'O', 'O', 'O', 'O', + 'P', 'P', 'P', 'P', 'Q', 'Q', 'Q', 'Q', 'R', 'R', 'R', 'R', 'S', 'S', 'S', + 'S', 'T', 'T', 'T', 'T', 'U', 'U', 'U', 'U', 'V', 'V', 'V', 'V', 'W', 'W', + 'W', 'W', 'X', 'X', 'X', 'X', 'Y', 'Y', 'Y', 'Y', 'Z', 'Z', 'Z', 'Z', 'a', + 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'c', 'c', 'c', 'c', 'd', 'd', 'd', 'd', + 'e', 'e', 'e', 'e', 'f', 'f', 'f', 'f', 'g', 'g', 'g', 'g', 'h', 'h', 'h', + 'h', 'i', 'i', 'i', 'i', 'j', 'j', 'j', 'j', 'k', 'k', 'k', 'k', 'l', 'l', + 'l', 'l', 'm', 'm', 'm', 'm', 'n', 'n', 'n', 'n', 'o', 'o', 'o', 'o', 'p', + 'p', 'p', 'p', 'q', 'q', 'q', 'q', 'r', 'r', 'r', 'r', 's', 's', 's', 's', + 't', 't', 't', 't', 'u', 'u', 'u', 'u', 'v', 'v', 'v', 'v', 'w', 'w', 'w', + 'w', 'x', 'x', 'x', 'x', 'y', 'y', 'y', 'y', 'z', 'z', 'z', 'z', '0', '0', + '0', '0', '1', '1', '1', '1', '2', '2', '2', '2', '3', '3', '3', '3', '4', + '4', '4', '4', '5', '5', '5', '5', '6', '6', '6', '6', '7', '7', '7', '7', + '8', '8', '8', '8', '9', '9', '9', '9', '+', '+', '+', '+', '/', '/', '/', + '/' +}; + +static const unsigned char base64_std_bin2ascii_1[256] = { + 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', + 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', + 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', + 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', + '8', '9', '+', '/', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', + 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', + 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', + 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', + '4', '5', '6', '7', '8', '9', '+', '/', 'A', 'B', 'C', 'D', 'E', 'F', 'G', + 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 
'S', 'T', 'U', 'V', + 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', + 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/', 'A', 'B', 'C', + 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', + 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', + 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', + 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', + '/' +}; + +static const unsigned char base64_std_bin2ascii_2[256] = { + 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', + 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', + 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', + 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', + '8', '9', '+', '/', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', + 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', + 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', + 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', + '4', '5', '6', '7', '8', '9', '+', '/', 'A', 'B', 'C', 'D', 'E', 'F', 'G', + 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', + 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', + 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/', 'A', 'B', 'C', + 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', + 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', + 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', + 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', + '/' +}; + +int evp_encodeblock_int(EVP_ENCODE_CTX *ctx, 
unsigned char *t, + const unsigned char *f, int dlen, int *wrap_cnt) +{ + int i = 0; + int ret = 0; + uint8_t t1, t2, t3; + const unsigned char *e0, *e1, *e2; + int srp = (ctx != NULL + && (ctx->flags & EVP_ENCODE_CTX_USE_SRP_ALPHABET) != 0); + int wrap_cnt_by_input = *wrap_cnt / 4 * 3; + const int ctx_length = (ctx != NULL) ? ctx->length : 0; + + if (srp) { + e0 = base64_srp_bin2ascii_0; + e1 = base64_srp_bin2ascii_1; + e2 = base64_srp_bin2ascii_2; + } else { + e0 = base64_std_bin2ascii_0; + e1 = base64_std_bin2ascii_1; + e2 = base64_std_bin2ascii_2; + } + + if (ctx_length == 1) { + while (i < dlen && ret <= INT_MAX && ctx != NULL) { + t1 = f[i]; + *(t++) = e0[t1]; + *(t++) = e1[(t1 & 0x03) << 4]; + *(t++) = '='; + *(t++) = '='; + *(t++) = '\n'; + + ret += 5; + i++; + } + + *t = '\0'; + ret--; + + return ret; + } else if (ctx_length % 3 != 0) { + i = 0; + int wrap_cnt_nm3 = 0; + while (i + 2 < dlen && ret <= INT_MAX) { + if (ctx != NULL) { + if ((wrap_cnt_nm3 < ctx->length + && (wrap_cnt_nm3 + 3 + wrap_cnt_by_input) > ctx->length) + && ((ctx->flags & EVP_ENCODE_CTX_NO_NEWLINES) == 0)) { + + switch (ctx->length % 3) { + case 0: + break; + case 1: + t1 = f[i]; + *(t++) = e0[t1]; + *(t++) = e1[(t1 & 0x03) << 4]; + *(t++) = '='; + *(t++) = '='; + + ret += 4; + i++; + break; + case 2: + t1 = f[i]; + t2 = f[i + 1]; + *(t++) = e0[t1]; + *(t++) = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)]; + *(t++) = e2[(t2 & 0x0F) << 2]; + *(t++) = '='; + i += 2; + ret += 4; + break; + } + *(t++) = '\n'; + ret++; + wrap_cnt_nm3 = 0; + } + } + + if (ctx_length >= 4 && i + 2 < dlen) { + t1 = f[i]; + t2 = f[i + 1]; + t3 = f[i + 2]; + *(t++) = e0[t1]; + *(t++) = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)]; + *(t++) = e1[((t2 & 0x0F) << 2) | ((t3 >> 6) & 0x03)]; + *(t++) = e2[t3]; + ret += 4; + wrap_cnt_nm3 += 3; + i += 3; + } + } + } else { + for (i = 0; i + 2 < dlen && ret <= INT_MAX; i += 3) { + + t1 = f[i]; + t2 = f[i + 1]; + t3 = f[i + 2]; + *(t++) = e0[t1]; + *(t++) = e1[((t1 & 0x03) 
<< 4) | ((t2 >> 4) & 0x0F)]; + *(t++) = e1[((t2 & 0x0F) << 2) | ((t3 >> 6) & 0x03)]; + *(t++) = e2[t3]; + ret += 4; + + if (ctx != NULL) { + if ((i + 3 + wrap_cnt_by_input) % ctx->length == 0 && ((ctx->flags & EVP_ENCODE_CTX_NO_NEWLINES) == 0) && ctx->length % 3 == 0) { + *(t++) = '\n'; + ret++; + } + } + } + } + + switch (dlen - i) { + case 0: + break; + case 1: + t1 = f[i]; + *(t++) = e0[t1]; + *(t++) = e1[(t1 & 0x03) << 4]; + *(t++) = '='; + *(t++) = '='; + + ret += 4; + + if (ctx != NULL) { + if ((i + 1 + wrap_cnt_by_input) % ctx->length == 0 && ((ctx->flags & EVP_ENCODE_CTX_NO_NEWLINES) == 0) && ctx->length % 3 == 0) { + *(t++) = '\n'; + ret++; + } + } + + break; + case 2: + t1 = f[i]; + t2 = f[i + 1]; + *(t++) = e0[t1]; + *(t++) = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)]; + *(t++) = e2[(t2 & 0x0F) << 2]; + *(t++) = '='; + ret += 4; + + if (ctx != NULL) { + if ((i + 2 + wrap_cnt_by_input) % ctx->length == 0 && ((ctx->flags & EVP_ENCODE_CTX_NO_NEWLINES) == 0) && ctx->length % 3 == 0) { + *(t++) = '\n'; + ret++; + } + } + break; + } + + *t = '\0'; + + return ret; +} diff --git a/crypto/evp/enc_b64_scalar.h b/crypto/evp/enc_b64_scalar.h new file mode 100644 index 00000000000..91d416f7586 --- /dev/null +++ b/crypto/evp/enc_b64_scalar.h @@ -0,0 +1,8 @@ +#ifndef OSSL_CRYPTO_EVP_B64_SCALAR_H +#define OSSL_CRYPTO_EVP_B64_SCALAR_H +#include <openssl/evp.h> + +int evp_encodeblock_int(EVP_ENCODE_CTX *ctx, unsigned char *t, + const unsigned char *f, int dlen, int *wrap_cnt); + +#endif diff --git a/crypto/evp/encode.c b/crypto/evp/encode.c index 53575e7b60a..bbd36c0820b 100644 --- a/crypto/evp/encode.c +++ b/crypto/evp/encode.c @@ -14,25 +14,20 @@ #include "crypto/evp.h" #include "evp_local.h" +#if defined(OPENSSL_CPUID_OBJ) && !defined(OPENSSL_NO_ASM) && (defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64)) + +#define HAS_IA32CAP_IS_64 +#endif + +#include "enc_b64_avx2.h" +#include "enc_b64_scalar.h" + static unsigned char conv_ascii2bin(unsigned char a, 
const unsigned char *table); -static int evp_encodeblock_int(EVP_ENCODE_CTX *ctx, unsigned char *t, - const unsigned char *f, int dlen); +int evp_encodeblock_int(EVP_ENCODE_CTX *ctx, unsigned char *t, + const unsigned char *f, int dlen, int *wrap_cnt); static int evp_decodeblock_int(EVP_ENCODE_CTX *ctx, unsigned char *t, const unsigned char *f, int n, int eof); - -#ifndef CHARSET_EBCDIC -#define conv_bin2ascii(a, table) ((table)[(a) & 0x3f]) -#else -/* - * We assume that PEM encoded files are EBCDIC files (i.e., printable text - * files). Convert them here while decoding. When encoding, output is EBCDIC - * (text) format again. (No need for conversion in the conv_bin2ascii macro, - * as the underlying textstring data_bin2ascii[] is already EBCDIC) - */ -#define conv_bin2ascii(a, table) ((table)[(a) & 0x3f]) -#endif - /*- * 64 char lines * pad input with 0 @@ -45,11 +40,6 @@ static int evp_decodeblock_int(EVP_ENCODE_CTX *ctx, unsigned char *t, #define CHUNKS_PER_LINE (64 / 4) #define CHAR_PER_LINE (64 + 1) -static const unsigned char data_bin2ascii[65] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; - -/* SRP uses a different base64 alphabet */ -static const unsigned char srpdata_bin2ascii[65] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz./"; - /*- * 0xF0 is a EOLN * 0xF1 is ignore but next needs to be 0xF0 (for \r\n processing). 
@@ -400,28 +390,52 @@ int EVP_EncodeUpdate(EVP_ENCODE_CTX *ctx, unsigned char *out, int *outl, memcpy(&(ctx->enc_data[ctx->num]), in, i); in += i; inl -= i; - j = evp_encodeblock_int(ctx, out, ctx->enc_data, ctx->length); + int wrap_cnt = 0; + j = evp_encodeblock_int(ctx, out, ctx->enc_data, ctx->length, + &wrap_cnt); ctx->num = 0; out += j; total = j; - if ((ctx->flags & EVP_ENCODE_CTX_NO_NEWLINES) == 0) { - *(out++) = '\n'; - total++; - } *out = '\0'; } - while (inl >= ctx->length && total <= INT_MAX) { - j = evp_encodeblock_int(ctx, out, in, ctx->length); - in += ctx->length; - inl -= ctx->length; - out += j; - total += j; - if ((ctx->flags & EVP_ENCODE_CTX_NO_NEWLINES) == 0) { - *(out++) = '\n'; - total++; + int wrap_cnt = 0; + if (ctx->length % 3 != 0) { + j = evp_encodeblock_int(ctx, out, in, inl - (inl % ctx->length), + &wrap_cnt); + } else { +#if defined(__AVX2__) + const int newlines = !(ctx->flags & EVP_ENCODE_CTX_NO_NEWLINES) ? ctx->length : 0; + + j = encode_base64_avx2(ctx, + (unsigned char *)out, + (const unsigned char *)in, + inl - (inl % ctx->length), newlines, &wrap_cnt); +#elif defined(HAS_IA32CAP_IS_64) + if ((OPENSSL_ia32cap_P[2] & (1u << 5)) != 0) { + const int newlines = !(ctx->flags & EVP_ENCODE_CTX_NO_NEWLINES) ? ctx->length : 0; + + j = encode_base64_avx2(ctx, + (unsigned char *)out, + (const unsigned char *)in, + inl - (inl % ctx->length), newlines, &wrap_cnt); + } else { + j = evp_encodeblock_int(ctx, out, in, inl - (inl % ctx->length), + &wrap_cnt); } - *out = '\0'; +#else + j = evp_encodeblock_int(ctx, out, in, inl - (inl % ctx->length), + &wrap_cnt); +#endif + } + in += inl - (inl % ctx->length); + inl -= inl - (inl % ctx->length); + out += j; + total += j; + if ((ctx->flags & EVP_ENCODE_CTX_NO_NEWLINES) == 0 && ctx->length % 3 != 0) { + *(out++) = '\n'; + total++; } + *out = '\0'; if (total > INT_MAX) { /* Too much output data! 
*/ *outl = 0; @@ -438,9 +452,11 @@ int EVP_EncodeUpdate(EVP_ENCODE_CTX *ctx, unsigned char *out, int *outl, void EVP_EncodeFinal(EVP_ENCODE_CTX *ctx, unsigned char *out, int *outl) { unsigned int ret = 0; + int wrap_cnt = 0; if (ctx->num != 0) { - ret = evp_encodeblock_int(ctx, out, ctx->enc_data, ctx->num); + ret = evp_encodeblock_int(ctx, out, ctx->enc_data, ctx->num, + &wrap_cnt); if ((ctx->flags & EVP_ENCODE_CTX_NO_NEWLINES) == 0) out[ret++] = '\n'; out[ret] = '\0'; @@ -449,46 +465,20 @@ void EVP_EncodeFinal(EVP_ENCODE_CTX *ctx, unsigned char *out, int *outl) *outl = ret; } -static int evp_encodeblock_int(EVP_ENCODE_CTX *ctx, unsigned char *t, - const unsigned char *f, int dlen) +int EVP_EncodeBlock(unsigned char *t, const unsigned char *f, int dlen) { - int i, ret = 0; - unsigned long l; - const unsigned char *table; + int wrap_cnt = 0; - if (ctx != NULL && (ctx->flags & EVP_ENCODE_CTX_USE_SRP_ALPHABET) != 0) - table = srpdata_bin2ascii; +#if defined(__AVX2__) + return encode_base64_avx2(NULL, t, f, dlen, 0, &wrap_cnt); +#elif defined(HAS_IA32CAP_IS_64) + if ((OPENSSL_ia32cap_P[2] & (1u << 5)) != 0) + return encode_base64_avx2(NULL, t, f, dlen, 0, &wrap_cnt); else - table = data_bin2ascii; - - for (i = dlen; i > 0; i -= 3) { - if (i >= 3) { - l = (((unsigned long)f[0]) << 16L) | (((unsigned long)f[1]) << 8L) | f[2]; - *(t++) = conv_bin2ascii(l >> 18L, table); - *(t++) = conv_bin2ascii(l >> 12L, table); - *(t++) = conv_bin2ascii(l >> 6L, table); - *(t++) = conv_bin2ascii(l, table); - } else { - l = ((unsigned long)f[0]) << 16L; - if (i == 2) - l |= ((unsigned long)f[1] << 8L); - - *(t++) = conv_bin2ascii(l >> 18L, table); - *(t++) = conv_bin2ascii(l >> 12L, table); - *(t++) = (i == 1) ? 
'=' : conv_bin2ascii(l >> 6L, table); - *(t++) = '='; - } - ret += 4; - f += 3; - } - - *t = '\0'; - return ret; -} - -int EVP_EncodeBlock(unsigned char *t, const unsigned char *f, int dlen) -{ - return evp_encodeblock_int(NULL, t, f, dlen); + return evp_encodeblock_int(NULL, t, f, dlen, &wrap_cnt); +#else + return evp_encodeblock_int(NULL, t, f, dlen, &wrap_cnt); +#endif } void EVP_DecodeInit(EVP_ENCODE_CTX *ctx) diff --git a/test/build.info b/test/build.info index 6890544f214..c400ad0ef35 100644 --- a/test/build.info +++ b/test/build.info @@ -55,7 +55,7 @@ IF[{- !$disabled{tests} -}] ssl_test_ctx_test ssl_test x509aux cipherlist_test asynciotest \ bio_callback_test bio_memleak_test bio_core_test bio_dgram_test param_build_test \ bioprinttest sslapitest ssl_handshake_rtt_test dtlstest sslcorrupttest \ - bio_base64_test bio_enc_test pkey_meth_test pkey_meth_kdf_test evp_kdf_test uitest \ + bio_base64_test test_base64_simdutf bio_enc_test pkey_meth_test pkey_meth_kdf_test evp_kdf_test uitest \ cipherbytes_test threadstest_fips threadpool_test \ asn1_encode_test asn1_decode_test asn1_string_table_test asn1_stable_parse_test \ x509_time_test x509_dup_cert_test x509_check_cert_pkey_test \ @@ -640,6 +640,10 @@ IF[{- !$disabled{tests} -}] INCLUDE[bio_base64_test]=../include ../apps/include DEPEND[bio_base64_test]=../libcrypto libtestutil.a + SOURCE[test_base64_simdutf] = test_base64_simdutf.c + INCLUDE[test_base64_simdutf] = ../include ../apps/include ../crypto/include ../crypto/evp/ + DEPEND[test_base64_simdutf] = ../libcrypto libtestutil.a + SOURCE[bio_enc_test]=bio_enc_test.c INCLUDE[bio_enc_test]=../include ../apps/include DEPEND[bio_enc_test]=../libcrypto libtestutil.a diff --git a/test/recipes/90-test_base64_simdutf.t b/test/recipes/90-test_base64_simdutf.t new file mode 100644 index 00000000000..7b3790be227 --- /dev/null +++ b/test/recipes/90-test_base64_simdutf.t @@ -0,0 +1,11 @@ +#! /usr/bin/env perl +# Copyright 2025 The OpenSSL Project Authors. 
All Rights Reserved. +# +# Licensed under the Apache License 2.0 (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +use OpenSSL::Test::Simple; + +simple_test("b64_simdutf", "test_base64_simdutf", "b64_simdutf"); \ No newline at end of file diff --git a/test/test_base64_simdutf.c b/test/test_base64_simdutf.c new file mode 100644 index 00000000000..a39155dc44f --- /dev/null +++ b/test/test_base64_simdutf.c @@ -0,0 +1,255 @@ +/* + * Copyright 2025 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the Apache License 2.0 (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#include <string.h> +#include "testutil.h" +#include <openssl/evp.h> +#include "internal/cryptlib.h" +#include "crypto/evp.h" +#include "evp_local.h" + +#define MAX_INPUT_LEN 3000 + +static void fuzz_fill_encode_ctx(EVP_ENCODE_CTX *ctx, int max_fill) +{ + static int seeded = 0; + + if (!seeded) { + srand((unsigned)time(NULL)); + seeded = 1; + } + + int num = rand() % (max_fill + 1); + ctx->num = num; + + for (int i = 0; i < num; i++) + ctx->enc_data[i] = (unsigned char)(rand() & 0xFF); + ctx->length = (rand() % 80) + 1; + ctx->line_num = rand() % (ctx->length + 1); +} +static inline uint32_t next_u32(uint32_t *state) +{ + *state = (*state * 1664525u) + 1013904223u; + return *state; +} + +static const unsigned char data_bin2ascii[65] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; +/* SRP uses a different base64 alphabet */ +static const unsigned char srpdata_bin2ascii[65] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz./"; + +#ifndef CHARSET_EBCDIC +#define conv_bin2ascii(a, table) ((table)[(a) & 0x3f]) +#else +/* + * We assume that PEM 
encoded files are EBCDIC files (i.e., printable text + * files). Convert them here while decoding. When encoding, output is EBCDIC + * (text) format again. (No need for conversion in the conv_bin2ascii macro, + * as the underlying textstring data_bin2ascii[] is already EBCDIC) + */ +#define conv_bin2ascii(a, table) ((table)[(a) & 0x3f]) +#endif + +static int evp_encodeblock_int_old(EVP_ENCODE_CTX *ctx, unsigned char *t, + const unsigned char *f, int dlen) +{ + int i, ret = 0; + unsigned long l; + const unsigned char *table; + + if (ctx != NULL && (ctx->flags & EVP_ENCODE_CTX_USE_SRP_ALPHABET) != 0) + table = srpdata_bin2ascii; + else + table = data_bin2ascii; + + for (i = dlen; i > 0; i -= 3) { + if (i >= 3) { + l = (((unsigned long)f[0]) << 16L) | (((unsigned long)f[1]) << 8L) | f[2]; + *(t++) = conv_bin2ascii(l >> 18L, table); + *(t++) = conv_bin2ascii(l >> 12L, table); + *(t++) = conv_bin2ascii(l >> 6L, table); + *(t++) = conv_bin2ascii(l, table); + } else { + l = ((unsigned long)f[0]) << 16L; + if (i == 2) + l |= ((unsigned long)f[1] << 8L); + + *(t++) = conv_bin2ascii(l >> 18L, table); + *(t++) = conv_bin2ascii(l >> 12L, table); + *(t++) = (i == 1) ? 
'=' : conv_bin2ascii(l >> 6L, table); + *(t++) = '='; + } + ret += 4; + f += 3; + } + + *t = '\0'; + return ret; +} +static int evp_encodeupdate_old(EVP_ENCODE_CTX *ctx, unsigned char *out, int *outl, + const unsigned char *in, int inl) +{ + int i, j; + int total = 0; + + *outl = 0; + if (inl <= 0) + return 0; + OPENSSL_assert(ctx->length <= (int)sizeof(ctx->enc_data)); + if (ctx->length - ctx->num > inl) { + memcpy(&(ctx->enc_data[ctx->num]), in, inl); + ctx->num += inl; + return 1; + } + if (ctx->num != 0) { + i = ctx->length - ctx->num; + memcpy(&(ctx->enc_data[ctx->num]), in, i); + in += i; + inl -= i; + j = evp_encodeblock_int_old(ctx, out, ctx->enc_data, ctx->length); + ctx->num = 0; + out += j; + total = j; + if ((ctx->flags & EVP_ENCODE_CTX_NO_NEWLINES) == 0) { + *(out++) = '\n'; + total++; + } + *out = '\0'; + } + while (inl >= ctx->length && total <= INT_MAX) { + j = evp_encodeblock_int_old(ctx, out, in, ctx->length); + in += ctx->length; + inl -= ctx->length; + out += j; + total += j; + if ((ctx->flags & EVP_ENCODE_CTX_NO_NEWLINES) == 0) { + *(out++) = '\n'; + total++; + } + *out = '\0'; + } + if (total > INT_MAX) { + /* Too much output data! 
*/ + *outl = 0; + return 0; + } + if (inl != 0) + memcpy(&(ctx->enc_data[0]), in, inl); + ctx->num = inl; + *outl = total; + + return 1; +} + +static void evp_encodefinal_old(EVP_ENCODE_CTX *ctx, unsigned char *out, int *outl) +{ + unsigned int ret = 0; + + if (ctx->num != 0) { + ret = evp_encodeblock_int_old(ctx, out, ctx->enc_data, ctx->num); + if ((ctx->flags & EVP_ENCODE_CTX_NO_NEWLINES) == 0) + out[ret++] = '\n'; + out[ret] = '\0'; + ctx->num = 0; + } + *outl = ret; +} +static int test_encode_line_lengths_reinforced(void) +{ + const int trials = 50; + uint32_t seed = 12345; + /* Generous output buffers (Update + Final + newlines), plus a guard byte */ + unsigned char out_simd[9000 * 2 + 1] = { 0 }; + unsigned char out_ref[9000 * 2 + 1] = { 0 }; + + for (int t = 0; t < trials; t++) { + uint32_t r = next_u32(&seed); + int inl = r % MAX_INPUT_LEN; + /* Fresh random input */ + unsigned char input[MAX_INPUT_LEN]; + + for (int i = 0; i < inl; i++) + input[i] = (unsigned char)(r % 256); + + for (int partial_ctx_fill = 0; partial_ctx_fill <= 80; + partial_ctx_fill += 1) { + for (int ctx_len = 1; ctx_len <= 80; ctx_len += 1) { + printf("Trial %d, input length %d, ctx length %d, partial ctx fill %d\n", + t + 1, inl, ctx_len, partial_ctx_fill); + EVP_ENCODE_CTX *ctx_simd = EVP_ENCODE_CTX_new(); + EVP_ENCODE_CTX *ctx_ref = EVP_ENCODE_CTX_new(); + + fuzz_fill_encode_ctx(ctx_simd, partial_ctx_fill); + + memset(out_simd, 0xCC, sizeof(out_simd)); /* poison to catch short writes */ + memset(out_ref, 0xDD, sizeof(out_ref)); + + int outlen_simd = 0, outlen_ref = 0; /* bytes produced by Update */ + int finlen_simd = 0, finlen_ref = 0; /* bytes produced by Final */ + + if (!ctx_simd || !ctx_ref) { + EVP_ENCODE_CTX_free(ctx_simd); + EVP_ENCODE_CTX_free(ctx_ref); + TEST_error("Out of memory for contexts"); + return 0; + } + + EVP_EncodeInit(ctx_simd); + EVP_EncodeInit(ctx_ref); + ctx_simd->length = ctx_len; + ctx_ref->length = ctx_len; + + for (int i = 0; i < 2; i++) { + if (i % 2 
== 0) { + /* Turn SRP alphabet OFF */ + ctx_simd->flags &= ~EVP_ENCODE_CTX_USE_SRP_ALPHABET; + ctx_ref->flags &= ~EVP_ENCODE_CTX_USE_SRP_ALPHABET; + } else { + /* Turn SRP alphabet ON */ + ctx_simd->flags |= EVP_ENCODE_CTX_USE_SRP_ALPHABET; + ctx_ref->flags |= EVP_ENCODE_CTX_USE_SRP_ALPHABET; + } + + int ret_simd = EVP_EncodeUpdate(ctx_simd, out_simd, &outlen_simd, + input, (int)inl); + int ret_ref = evp_encodeupdate_old(ctx_ref, out_ref, &outlen_ref, + input, (int)inl); + + if (!TEST_int_eq(ret_simd, ret_ref) + || !TEST_mem_eq(out_ref, outlen_ref, out_simd, outlen_simd) + || !TEST_int_eq(outlen_simd, outlen_ref)) + return 0; + + EVP_EncodeFinal(ctx_simd, out_simd + outlen_simd, + &finlen_simd); + evp_encodefinal_old(ctx_ref, out_ref + outlen_ref, + &finlen_ref); + + int total_ref = outlen_ref + finlen_ref; + int total_simd = outlen_simd + finlen_simd; + + if (!TEST_int_eq(finlen_simd, finlen_ref) + || !TEST_mem_eq(out_ref, total_ref, out_simd, total_simd)) + return 0; + } + + EVP_ENCODE_CTX_free(ctx_simd); + EVP_ENCODE_CTX_free(ctx_ref); + } + } + } + + return 1; +} + +int setup_tests(void) +{ + ADD_TEST(test_encode_line_lengths_reinforced); + + return 1; +}