From c4e8d444632140ecb47f31df133c0657f07f9be0 Mon Sep 17 00:00:00 2001 From: Robbie Harwood Date: Thu, 6 Apr 2017 12:15:39 -0400 Subject: [PATCH] Modernize UTF-8/UCS-2 conversion code Remove unused entry points as we only need to convert between little-endian UCS-2 byte buffers and UTF-8. Rename and simplify the remaining two function contracts. Avoid pointer alignment and endianness issues by operating on byte buffers and using store_16_le() and load_16_le(). Avoid two-pass operation using k5buf. [ghudson@mit.edu: simplified code using k5buf; simplified function names and contracts; rewrote commit message] --- src/include/k5-utf8.h | 59 +-- src/lib/crypto/krb/s2k_rc4.c | 2 +- src/lib/krb5/krb/pac.c | 3 +- src/lib/krb5/krb/pac_sign.c | 5 +- src/util/support/libkrb5support-fixed.exports | 5 +- src/util/support/utf8_conv.c | 411 +++--------------- 6 files changed, 72 insertions(+), 413 deletions(-) diff --git a/src/include/k5-utf8.h b/src/include/k5-utf8.h index 22f433c8e9..4b7415e66b 100644 --- a/src/include/k5-utf8.h +++ b/src/include/k5-utf8.h @@ -81,49 +81,22 @@ size_t krb5int_ucs2_to_utf8(krb5_ucs2 c, char *buf); int krb5int_utf8_to_ucs4(const char *p, krb5_ucs4 *out); size_t krb5int_ucs4_to_utf8(krb5_ucs4 c, char *buf); -int -krb5int_ucs2s_to_utf8s(const krb5_ucs2 *ucs2s, - char **utf8s, - size_t *utf8slen); - -int -krb5int_ucs2cs_to_utf8s(const krb5_ucs2 *ucs2s, - size_t ucs2slen, - char **utf8s, - size_t *utf8slen); - -int -krb5int_ucs2les_to_utf8s(const unsigned char *ucs2les, - char **utf8s, - size_t *utf8slen); - -int -krb5int_ucs2lecs_to_utf8s(const unsigned char *ucs2les, - size_t ucs2leslen, - char **utf8s, - size_t *utf8slen); - -int -krb5int_utf8s_to_ucs2s(const char *utf8s, - krb5_ucs2 **ucs2s, - size_t *ucs2chars); - -int -krb5int_utf8cs_to_ucs2s(const char *utf8s, - size_t utf8slen, - krb5_ucs2 **ucs2s, - size_t *ucs2chars); - -int -krb5int_utf8s_to_ucs2les(const char *utf8s, - unsigned char **ucs2les, - size_t *ucs2leslen); - -int 
-krb5int_utf8cs_to_ucs2les(const char *utf8s, - size_t utf8slen, - unsigned char **ucs2les, - size_t *ucs2leslen); +/* + * Convert a little-endian UCS-2 string to an allocated null-terminated UTF-8 + * string. nbytes is the length of ucs2bytes in bytes, and must be an even + * number. Return EINVAL on invalid input, ENOMEM on out of memory, or 0 on + * success. + */ +int k5_ucs2le_to_utf8(const uint8_t *ucs2bytes, size_t nbytes, + char **utf8_out); + +/* + * Convert a UTF-8 string to an allocated little-endian UCS-2 string. The + * resulting length is in bytes and will always be even. Return EINVAL on + * invalid input, ENOMEM on out of memory, or 0 on success. + */ +int k5_utf8_to_ucs2le(const char *utf8, uint8_t **ucs2_out, + size_t *nbytes_out); /* returns the number of bytes in the UTF-8 string */ size_t krb5int_utf8_bytes(const char *); diff --git a/src/lib/crypto/krb/s2k_rc4.c b/src/lib/crypto/krb/s2k_rc4.c index 7286637a90..fb41b269d2 100644 --- a/src/lib/crypto/krb/s2k_rc4.c +++ b/src/lib/crypto/krb/s2k_rc4.c @@ -24,7 +24,7 @@ krb5int_arcfour_string_to_key(const struct krb5_keytypes *ktp, utf8 = k5memdup0(string->data, string->length, &err); if (utf8 == NULL) return err; - err = krb5int_utf8s_to_ucs2les(utf8, &copystr, &copystrlen); + err = k5_utf8_to_ucs2le(utf8, &copystr, &copystrlen); free(utf8); if (err) return err; diff --git a/src/lib/krb5/krb/pac.c b/src/lib/krb5/krb/pac.c index 9098927b5a..6616dd54bf 100644 --- a/src/lib/krb5/krb/pac.c +++ b/src/lib/krb5/krb/pac.c @@ -436,8 +436,7 @@ k5_pac_validate_client(krb5_context context, pac_princname_length % 2) return ERANGE; - ret = krb5int_ucs2lecs_to_utf8s(p, (size_t)pac_princname_length / 2, - &pac_princname, NULL); + ret = k5_ucs2le_to_utf8(p, pac_princname_length, &pac_princname); if (ret != 0) return ret; diff --git a/src/lib/krb5/krb/pac_sign.c b/src/lib/krb5/krb/pac_sign.c index d40df45f99..c6eee767db 100644 --- a/src/lib/krb5/krb/pac_sign.c +++ b/src/lib/krb5/krb/pac_sign.c @@ -54,9 +54,8 @@ 
k5_insert_client_info(krb5_context context, if (ret != 0) goto cleanup; - ret = krb5int_utf8s_to_ucs2les(princ_name_utf8, - &princ_name_ucs2, - &princ_name_ucs2_len); + ret = k5_utf8_to_ucs2le(princ_name_utf8, &princ_name_ucs2, + &princ_name_ucs2_len); if (ret != 0) goto cleanup; diff --git a/src/util/support/libkrb5support-fixed.exports b/src/util/support/libkrb5support-fixed.exports index d5d4177b72..750dc243f7 100644 --- a/src/util/support/libkrb5support-fixed.exports +++ b/src/util/support/libkrb5support-fixed.exports @@ -52,6 +52,8 @@ k5_path_isabs k5_path_join k5_path_split k5_strerror_r +k5_utf8_to_ucs2le +k5_ucs2le_to_utf8 krb5int_key_register krb5int_key_delete krb5int_getspecific @@ -77,9 +79,6 @@ krb5int_mutex_free krb5int_mutex_lock krb5int_mutex_unlock krb5int_gmt_mktime -krb5int_utf8cs_to_ucs2les -krb5int_utf8s_to_ucs2les -krb5int_ucs2lecs_to_utf8s krb5int_ucs4_to_utf8 krb5int_utf8_to_ucs4 krb5int_utf8_lentab diff --git a/src/util/support/utf8_conv.c b/src/util/support/utf8_conv.c index 80ca90b139..5f279c39b5 100644 --- a/src/util/support/utf8_conv.c +++ b/src/util/support/utf8_conv.c @@ -61,397 +61,86 @@ #include "k5-platform.h" #include "k5-utf8.h" +#include "k5-buf.h" #include "supp-int.h" static unsigned char mask[] = { 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 }; -static ssize_t -k5_utf8s_to_ucs2s(krb5_ucs2 *ucs2str, - const char *utf8str, - size_t count, - int little_endian) +int +k5_utf8_to_ucs2le(const char *utf8, uint8_t **ucs2_out, size_t *nbytes_out) { - size_t ucs2len = 0; - size_t utflen, i; + struct k5buf buf; krb5_ucs2 ch; + size_t chlen, i; + void *p; - /* If input ptr is NULL or empty... */ - if (utf8str == NULL || *utf8str == '\0') { - if (ucs2str != NULL) - *ucs2str = 0; - - return 0; - } + *ucs2_out = NULL; + *nbytes_out = 0; - /* Examine next UTF-8 character. 
*/ - while (ucs2len < count && *utf8str != '\0') { - /* Get UTF-8 sequence length from 1st byte */ - utflen = KRB5_UTF8_CHARLEN2(utf8str, utflen); + k5_buf_init_dynamic(&buf); - if (utflen == 0 || utflen > KRB5_MAX_UTF8_LEN) - return -1; + /* Examine next UTF-8 character. */ + while (*utf8 != '\0') { + /* Get UTF-8 sequence length from first byte. */ + chlen = KRB5_UTF8_CHARLEN2(utf8, chlen); + if (chlen == 0 || chlen > KRB5_MAX_UTF8_LEN) + goto invalid; /* First byte minus length tag */ - ch = (krb5_ucs2)(utf8str[0] & mask[utflen]); + ch = (krb5_ucs2)(utf8[0] & mask[chlen]); - for (i = 1; i < utflen; i++) { - /* Subsequent bytes must start with 10 */ - if ((utf8str[i] & 0xc0) != 0x80) - return -1; + for (i = 1; i < chlen; i++) { + /* Subsequent bytes must start with 10. */ + if ((utf8[i] & 0xc0) != 0x80) + goto invalid; - ch <<= 6; /* 6 bits of data in each subsequent byte */ - ch |= (krb5_ucs2)(utf8str[i] & 0x3f); + /* 6 bits of data in each subsequent byte */ + ch <<= 6; + ch |= (krb5_ucs2)(utf8[i] & 0x3f); } - if (ucs2str != NULL) { -#ifdef K5_BE -#ifndef SWAP16 -#define SWAP16(X) ((((X) << 8) | ((X) >> 8)) & 0xFFFF) -#endif - if (little_endian) - ucs2str[ucs2len] = SWAP16(ch); - else -#endif - ucs2str[ucs2len] = ch; - } + p = k5_buf_get_space(&buf, 2); + if (p == NULL) + return ENOMEM; + store_16_le(ch, p); - utf8str += utflen; /* Move to next UTF-8 character */ - ucs2len++; /* Count number of wide chars stored/required */ - } - - if (ucs2str != NULL && ucs2len < count) { - /* Add null terminator if there's room in the buffer. 
*/ - ucs2str[ucs2len] = 0; - } - - return ucs2len; -} - -int -krb5int_utf8s_to_ucs2s(const char *utf8s, - krb5_ucs2 **ucs2s, - size_t *ucs2chars) -{ - ssize_t len; - size_t chars; - - chars = krb5int_utf8_chars(utf8s); - *ucs2s = (krb5_ucs2 *)malloc((chars + 1) * sizeof(krb5_ucs2)); - if (*ucs2s == NULL) { - return ENOMEM; - } - - len = k5_utf8s_to_ucs2s(*ucs2s, utf8s, chars + 1, 0); - if (len < 0) { - free(*ucs2s); - *ucs2s = NULL; - return EINVAL; - } - - if (ucs2chars != NULL) { - *ucs2chars = chars; + /* Move to next UTF-8 character. */ + utf8 += chlen; } + *ucs2_out = buf.data; + *nbytes_out = buf.len; return 0; -} - -int -krb5int_utf8cs_to_ucs2s(const char *utf8s, - size_t utf8slen, - krb5_ucs2 **ucs2s, - size_t *ucs2chars) -{ - ssize_t len; - size_t chars; - - chars = krb5int_utf8c_chars(utf8s, utf8slen); - *ucs2s = (krb5_ucs2 *)malloc((chars + 1) * sizeof(krb5_ucs2)); - if (*ucs2s == NULL) { - return ENOMEM; - } - - len = k5_utf8s_to_ucs2s(*ucs2s, utf8s, chars, 0); - if (len < 0) { - free(*ucs2s); - *ucs2s = NULL; - return EINVAL; - } - (*ucs2s)[chars] = 0; - - if (ucs2chars != NULL) { - *ucs2chars = chars; - } - return 0; +invalid: + k5_buf_free(&buf); + return EINVAL; } int -krb5int_utf8s_to_ucs2les(const char *utf8s, - unsigned char **ucs2les, - size_t *ucs2leslen) +k5_ucs2le_to_utf8(const uint8_t *ucs2bytes, size_t nbytes, char **utf8_out) { - ssize_t len; - size_t chars; - - chars = krb5int_utf8_chars(utf8s); - - *ucs2les = (unsigned char *)malloc((chars + 1) * sizeof(krb5_ucs2)); - if (*ucs2les == NULL) { - return ENOMEM; - } - - len = k5_utf8s_to_ucs2s((krb5_ucs2 *)*ucs2les, utf8s, chars + 1, 1); - if (len < 0) { - free(*ucs2les); - *ucs2les = NULL; - return EINVAL; - } - - if (ucs2leslen != NULL) { - *ucs2leslen = chars * sizeof(krb5_ucs2); - } - - return 0; -} - -int -krb5int_utf8cs_to_ucs2les(const char *utf8s, - size_t utf8slen, - unsigned char **ucs2les, - size_t *ucs2leslen) -{ - ssize_t len; - size_t chars; - krb5_ucs2 *ucs2s; - - *ucs2les = 
NULL; - - chars = krb5int_utf8c_chars(utf8s, utf8slen); - ucs2s = malloc((chars + 1) * sizeof(krb5_ucs2)); - if (ucs2s == NULL) - return ENOMEM; - - len = k5_utf8s_to_ucs2s(ucs2s, utf8s, chars, 1); - if (len < 0) { - free(ucs2s); - return EINVAL; - } - ucs2s[chars] = 0; - - *ucs2les = (unsigned char *)ucs2s; - if (ucs2leslen != NULL) { - *ucs2leslen = chars * sizeof(krb5_ucs2); - } - - return 0; -} - -/*----------------------------------------------------------------------------- - Convert a wide char string to a UTF-8 string. - No more than 'count' bytes will be written to the output buffer. - Return the # of bytes written to the output buffer, excl null terminator. - - ucs2len is -1 if the UCS-2 string is NUL terminated, otherwise it is the - length of the UCS-2 string in characters -*/ -static ssize_t -k5_ucs2s_to_utf8s(char *utf8str, const krb5_ucs2 *ucs2str, - size_t count, ssize_t ucs2len, int little_endian) -{ - int len = 0; - int n; - char *p = utf8str; - krb5_ucs2 empty = 0, ch; - - if (ucs2str == NULL) /* Treat input ptr NULL as an empty string */ - ucs2str = &empty; - - if (utf8str == NULL) /* Just compute size of output, excl null */ - { - while (ucs2len == -1 ? *ucs2str : --ucs2len >= 0) { - /* Get UTF-8 size of next wide char */ - ch = *ucs2str++; -#ifdef K5_BE - if (little_endian) - ch = SWAP16(ch); -#endif - - n = krb5int_ucs2_to_utf8(ch, NULL); - if (n < 1 || n > INT_MAX - len) - return -1; - len += n; - } - - return len; - } - - /* Do the actual conversion. */ - - n = 1; /* In case of empty ucs2str */ - while (ucs2len == -1 ? *ucs2str != 0 : --ucs2len >= 0) { - ch = *ucs2str++; -#ifdef K5_BE - if (little_endian) - ch = SWAP16(ch); -#endif - - n = krb5int_ucs2_to_utf8(ch, p); - - if (n < 1) - break; - - p += n; - count -= n; /* Space left in output buffer */ - } - - /* If not enough room for last character, pad remainder with null - so that return value = original count, indicating buffer full. 
*/ - if (n == 0) { - while (count--) - *p++ = 0; - } - /* Add a null terminator if there's room. */ - else if (count) - *p = 0; - - if (n == -1) /* Conversion encountered invalid wide char. */ - return -1; - - /* Return the number of bytes written to output buffer, excl null. */ - return (p - utf8str); -} - -int -krb5int_ucs2s_to_utf8s(const krb5_ucs2 *ucs2s, - char **utf8s, - size_t *utf8slen) -{ - ssize_t len; - - len = k5_ucs2s_to_utf8s(NULL, ucs2s, 0, -1, 0); - if (len < 0) { - return EINVAL; - } - - *utf8s = (char *)malloc((size_t)len + 1); - if (*utf8s == NULL) { - return ENOMEM; - } - - len = k5_ucs2s_to_utf8s(*utf8s, ucs2s, (size_t)len + 1, -1, 0); - if (len < 0) { - free(*utf8s); - *utf8s = NULL; - return EINVAL; - } - - if (utf8slen != NULL) { - *utf8slen = len; - } - - return 0; -} - -int -krb5int_ucs2les_to_utf8s(const unsigned char *ucs2les, - char **utf8s, - size_t *utf8slen) -{ - ssize_t len; - - len = k5_ucs2s_to_utf8s(NULL, (krb5_ucs2 *)ucs2les, 0, -1, 1); - if (len < 0) - return EINVAL; - - *utf8s = (char *)malloc((size_t)len + 1); - if (*utf8s == NULL) { - return ENOMEM; - } - - len = k5_ucs2s_to_utf8s(*utf8s, (krb5_ucs2 *)ucs2les, (size_t)len + 1, -1, 1); - if (len < 0) { - free(*utf8s); - *utf8s = NULL; - return EINVAL; - } - - if (utf8slen != NULL) { - *utf8slen = len; - } - - return 0; -} - -int -krb5int_ucs2cs_to_utf8s(const krb5_ucs2 *ucs2s, - size_t ucs2slen, - char **utf8s, - size_t *utf8slen) -{ - ssize_t len; - - if (ucs2slen > SSIZE_MAX) - return ERANGE; - - len = k5_ucs2s_to_utf8s(NULL, (krb5_ucs2 *)ucs2s, 0, - (ssize_t)ucs2slen, 0); - if (len < 0) - return EINVAL; - - *utf8s = (char *)malloc((size_t)len + 1); - if (*utf8s == NULL) { - return ENOMEM; - } - - len = k5_ucs2s_to_utf8s(*utf8s, (krb5_ucs2 *)ucs2s, (size_t)len, - (ssize_t)ucs2slen, 0); - if (len < 0) { - free(*utf8s); - *utf8s = NULL; - return EINVAL; - } - (*utf8s)[len] = '\0'; - - if (utf8slen != NULL) { - *utf8slen = len; - } - - return 0; -} - -int 
-krb5int_ucs2lecs_to_utf8s(const unsigned char *ucs2les, - size_t ucs2leslen, - char **utf8s, - size_t *utf8slen) -{ - ssize_t len; + struct k5buf buf; + krb5_ucs2 ch; + size_t chlen, i; + void *p; - if (ucs2leslen > SSIZE_MAX) - return ERANGE; + *utf8_out = NULL; - len = k5_ucs2s_to_utf8s(NULL, (krb5_ucs2 *)ucs2les, 0, - (ssize_t)ucs2leslen, 1); - if (len < 0) + if (nbytes % 2 != 0) return EINVAL; - *utf8s = (char *)malloc((size_t)len + 1); - if (*utf8s == NULL) { - return ENOMEM; - } - - len = k5_ucs2s_to_utf8s(*utf8s, (krb5_ucs2 *)ucs2les, (size_t)len, - (ssize_t)ucs2leslen, 1); - if (len < 0) { - free(*utf8s); - *utf8s = NULL; - return EINVAL; - } - (*utf8s)[len] = '\0'; + k5_buf_init_dynamic(&buf); - if (utf8slen != NULL) { - *utf8slen = len; + for (i = 0; i < nbytes; i += 2) { + ch = load_16_le(&ucs2bytes[i]); + chlen = krb5int_ucs2_to_utf8(ch, NULL); + p = k5_buf_get_space(&buf, chlen); + if (p == NULL) + return ENOMEM; + (void)krb5int_ucs2_to_utf8(ch, p); } + *utf8_out = buf.data; return 0; } -- 2.47.2