From: Greg Hudson Date: Tue, 18 Apr 2017 18:01:06 +0000 (-0400) Subject: Replace UCS-2 conversions with UTF-16 X-Git-Tag: krb5-1.16-beta1~78 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=89ce6420832858950271858e7c6e1a2eefebc683;p=thirdparty%2Fkrb5.git Replace UCS-2 conversions with UTF-16 Where we convert between UTF-8 and UCS-2 (RC4 string-to-key and PAC client info), use UTF-16 instead of UCS-2. Add a test program for the conversion functions. ticket: 8577 (new) --- diff --git a/.gitignore b/.gitignore index 815c67d2c0..862a87a715 100644 --- a/.gitignore +++ b/.gitignore @@ -523,6 +523,7 @@ local.properties /src/util/support/t_path_win /src/util/support/t_unal /src/util/support/t_utf8 +/src/util/support/t_utf16 /src/util/verto/rename.h diff --git a/src/include/k5-utf8.h b/src/include/k5-utf8.h index 4b7415e66b..e2f20d4502 100644 --- a/src/include/k5-utf8.h +++ b/src/include/k5-utf8.h @@ -73,8 +73,6 @@ typedef uint16_t krb5_ucs2; typedef uint32_t krb5_ucs4; -#define KRB5_MAX_UTF8_LEN (sizeof(krb5_ucs2) * 3/2) - int krb5int_utf8_to_ucs2(const char *p, krb5_ucs2 *out); size_t krb5int_ucs2_to_utf8(krb5_ucs2 c, char *buf); @@ -82,21 +80,21 @@ int krb5int_utf8_to_ucs4(const char *p, krb5_ucs4 *out); size_t krb5int_ucs4_to_utf8(krb5_ucs4 c, char *buf); /* - * Convert a little-endian UCS-2 string to an allocated null-terminated UTF-8 + * Convert a little-endian UTF-16 string to an allocated null-terminated UTF-8 * string. nbytes is the length of ucs2bytes in bytes, and must be an even * number. Return EINVAL on invalid input, ENOMEM on out of memory, or 0 on * success. */ -int k5_ucs2le_to_utf8(const uint8_t *ucs2bytes, size_t nbytes, - char **utf8_out); +int k5_utf16le_to_utf8(const uint8_t *utf16bytes, size_t nbytes, + char **utf8_out); /* - * Convert a UTF-8 string to an allocated little-endian UCS-2 string. The + * Convert a UTF-8 string to an allocated little-endian UTF-16 string. The * resulting length is in bytes and will always be even. 
Return EINVAL on * invalid input, ENOMEM on out of memory, or 0 on success. */ -int k5_utf8_to_ucs2le(const char *utf8, uint8_t **ucs2_out, - size_t *nbytes_out); +int k5_utf8_to_utf16le(const char *utf8, uint8_t **utf16_out, + size_t *nbytes_out); /* returns the number of bytes in the UTF-8 string */ size_t krb5int_utf8_bytes(const char *); diff --git a/src/lib/crypto/krb/s2k_rc4.c b/src/lib/crypto/krb/s2k_rc4.c index fb41b269d2..081a91217c 100644 --- a/src/lib/crypto/krb/s2k_rc4.c +++ b/src/lib/crypto/krb/s2k_rc4.c @@ -24,7 +24,7 @@ krb5int_arcfour_string_to_key(const struct krb5_keytypes *ktp, utf8 = k5memdup0(string->data, string->length, &err); if (utf8 == NULL) return err; - err = k5_utf8_to_ucs2le(utf8, &copystr, &copystrlen); + err = k5_utf8_to_utf16le(utf8, &copystr, &copystrlen); free(utf8); if (err) return err; diff --git a/src/lib/krb5/krb/pac.c b/src/lib/krb5/krb/pac.c index 485a0f7c51..d1662b98f3 100644 --- a/src/lib/krb5/krb/pac.c +++ b/src/lib/krb5/krb/pac.c @@ -436,7 +436,7 @@ k5_pac_validate_client(krb5_context context, pac_princname_length % 2) return ERANGE; - ret = k5_ucs2le_to_utf8(p, pac_princname_length, &pac_princname); + ret = k5_utf16le_to_utf8(p, pac_princname_length, &pac_princname); if (ret != 0) return ret; diff --git a/src/lib/krb5/krb/pac_sign.c b/src/lib/krb5/krb/pac_sign.c index c6eee767db..c94899c96a 100644 --- a/src/lib/krb5/krb/pac_sign.c +++ b/src/lib/krb5/krb/pac_sign.c @@ -38,8 +38,8 @@ k5_insert_client_info(krb5_context context, krb5_error_code ret; krb5_data client_info; char *princ_name_utf8 = NULL; - unsigned char *princ_name_ucs2 = NULL, *p; - size_t princ_name_ucs2_len = 0; + unsigned char *princ_name_utf16 = NULL, *p; + size_t princ_name_utf16_len = 0; uint64_t nt_authtime; /* If we already have a CLIENT_INFO buffer, then just validate it */ @@ -54,12 +54,12 @@ k5_insert_client_info(krb5_context context, if (ret != 0) goto cleanup; - ret = k5_utf8_to_ucs2le(princ_name_utf8, &princ_name_ucs2, - &princ_name_ucs2_len); + ret = 
k5_utf8_to_utf16le(princ_name_utf8, &princ_name_utf16, + &princ_name_utf16_len); if (ret != 0) goto cleanup; - client_info.length = PAC_CLIENT_INFO_LENGTH + princ_name_ucs2_len; + client_info.length = PAC_CLIENT_INFO_LENGTH + princ_name_utf16_len; client_info.data = NULL; ret = k5_pac_add_buffer(context, pac, KRB5_PAC_CLIENT_INFO, @@ -74,16 +74,16 @@ k5_insert_client_info(krb5_context context, store_64_le(nt_authtime, p); p += 8; - /* copy in number of UCS-2 characters in principal name */ - store_16_le(princ_name_ucs2_len, p); + /* copy in number of UTF-16 bytes in principal name */ + store_16_le(princ_name_utf16_len, p); p += 2; /* copy in principal name */ - memcpy(p, princ_name_ucs2, princ_name_ucs2_len); + memcpy(p, princ_name_utf16, princ_name_utf16_len); cleanup: - if (princ_name_ucs2 != NULL) - free(princ_name_ucs2); + if (princ_name_utf16 != NULL) + free(princ_name_utf16); krb5_free_unparsed_name(context, princ_name_utf8); return ret; diff --git a/src/util/support/Makefile.in b/src/util/support/Makefile.in index 6239e41761..0bf0b7a872 100644 --- a/src/util/support/Makefile.in +++ b/src/util/support/Makefile.in @@ -143,6 +143,7 @@ SRCS=\ $(srcdir)/bcmp.c \ $(srcdir)/strerror_r.c \ $(srcdir)/t_utf8.c \ + $(srcdir)/t_utf16.c \ $(srcdir)/getopt.c \ $(srcdir)/getopt_long.c @@ -220,7 +221,12 @@ t_unal: t_unal.o t_utf8: t_utf8.o utf8.o $(CC_LINK) -o t_utf8 t_utf8.o utf8.o -TEST_PROGS= t_k5buf t_path t_path_win t_base64 t_json t_unal t_utf8 +T_UTF16_OBJS= t_utf16.o utf8_conv.o utf8.o k5buf.o $(PRINTF_ST_OBJ) + +t_utf16: $(T_UTF16_OBJS) + $(CC_LINK) -o $@ $(T_UTF16_OBJS) + +TEST_PROGS= t_k5buf t_path t_path_win t_base64 t_json t_unal t_utf8 t_utf16 check-unix: $(TEST_PROGS) ./t_k5buf @@ -230,11 +236,13 @@ check-unix: $(TEST_PROGS) ./t_json ./t_unal ./t_utf8 + ./t_utf16 clean: $(RM) t_k5buf.o t_k5buf t_unal.o t_unal path_win.o path_win $(RM) t_path_win.o t_path_win t_path.o t_path t_base64.o t_base64 $(RM) t_json.o t_json libkrb5support.exports t_utf8.o t_utf8 + 
$(RM) t_utf16.o t_utf16 @lib_frag@ @libobj_frag@ diff --git a/src/util/support/deps b/src/util/support/deps index a95d2ad908..34d8a884b3 100644 --- a/src/util/support/deps +++ b/src/util/support/deps @@ -34,8 +34,9 @@ utf8.so utf8.po $(OUTPRE)utf8.$(OBJEXT): $(BUILDTOP)/include/autoconf.h \ $(top_srcdir)/include/k5-utf8.h supp-int.h utf8.c utf8_conv.so utf8_conv.po $(OUTPRE)utf8_conv.$(OBJEXT): \ $(BUILDTOP)/include/autoconf.h $(top_srcdir)/include/k5-buf.h \ - $(top_srcdir)/include/k5-platform.h $(top_srcdir)/include/k5-thread.h \ - $(top_srcdir)/include/k5-utf8.h supp-int.h utf8_conv.c + $(top_srcdir)/include/k5-input.h $(top_srcdir)/include/k5-platform.h \ + $(top_srcdir)/include/k5-thread.h $(top_srcdir)/include/k5-utf8.h \ + supp-int.h utf8_conv.c gettimeofday.so gettimeofday.po $(OUTPRE)gettimeofday.$(OBJEXT): \ $(BUILDTOP)/include/autoconf.h $(top_srcdir)/include/k5-platform.h \ $(top_srcdir)/include/k5-thread.h gettimeofday.c @@ -84,6 +85,9 @@ strerror_r.so strerror_r.po $(OUTPRE)strerror_r.$(OBJEXT): \ t_utf8.so t_utf8.po $(OUTPRE)t_utf8.$(OBJEXT): $(BUILDTOP)/include/autoconf.h \ $(top_srcdir)/include/k5-platform.h $(top_srcdir)/include/k5-thread.h \ $(top_srcdir)/include/k5-utf8.h t_utf8.c +t_utf16.so t_utf16.po $(OUTPRE)t_utf16.$(OBJEXT): $(BUILDTOP)/include/autoconf.h \ + $(top_srcdir)/include/k5-platform.h $(top_srcdir)/include/k5-thread.h \ + $(top_srcdir)/include/k5-utf8.h t_utf16.c getopt.so getopt.po $(OUTPRE)getopt.$(OBJEXT): $(BUILDTOP)/include/autoconf.h \ $(top_srcdir)/include/k5-platform.h $(top_srcdir)/include/k5-thread.h \ getopt.c diff --git a/src/util/support/libkrb5support-fixed.exports b/src/util/support/libkrb5support-fixed.exports index 750dc243f7..fd74a1897e 100644 --- a/src/util/support/libkrb5support-fixed.exports +++ b/src/util/support/libkrb5support-fixed.exports @@ -52,8 +52,8 @@ k5_path_isabs k5_path_join k5_path_split k5_strerror_r -k5_utf8_to_ucs2le -k5_ucs2le_to_utf8 +k5_utf8_to_utf16le +k5_utf16le_to_utf8 
krb5int_key_register krb5int_key_delete krb5int_getspecific diff --git a/src/util/support/t_utf16.c b/src/util/support/t_utf16.c new file mode 100644 index 0000000000..bc3390a415 --- /dev/null +++ b/src/util/support/t_utf16.c @@ -0,0 +1,117 @@ +/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +/* util/support/t_utf16.c - test UTF-16 conversion functions */ +/* + * Copyright (C) 2017 by the Massachusetts Institute of Technology. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* + * This program tests conversions between UTF-8 and little-endian UTF-16, with + * an eye mainly towards covering UTF-16 edge cases and UTF-8 decoding results + * which we detect as invalid in utf8_conv.c. t_utf8.c covers more UTF-8 edge + * cases. + */ + +#include +#include + +#include "k5-platform.h" +#include "k5-utf8.h" + +struct test { + const char *utf8; + const char *utf16; + size_t utf16len; +} tests[] = { + { "", "", 0 }, + { "abcd", "a\0b\0c\0d\0", 8 }, + /* From RFC 2781 (tests code point 0x12345 and some ASCII) */ + { "\xF0\x92\x8D\x85=Ra", "\x08\xD8\x45\xDF=\0R\0a\0", 10 }, + /* Lowest and highest Supplementary Plane code points */ + { "\xF0\x90\x80\x80 \xF4\x8F\xBF\xBF", + "\x00\xD8\x00\xDC \0\xFF\xDB\xFF\xDF", 10 }, + /* Basic Multilingual Plane code points near and above surrogate range */ + { "\xED\x9F\xBF", "\xFF\xD7", 2 }, + { "\xEE\x80\x80 \xEE\xBF\xBF", "\x00\xE0 \0\xFF\xEF", 6 }, + /* Invalid UTF-8: decodes to value in surrogate pair range */ + { "\xED\xA0\x80", NULL, 0 }, /* 0xD800 */ + { "\xED\xAF\xBF", NULL, 0 }, /* 0xDBFF */ + { "\xED\xB0\x80", NULL, 0 }, /* 0xDC00 */ + { "\xED\xBF\xBF", NULL, 0 }, /* 0xDFFF */ + /* Invalid UTF-8: decodes to value above Unicode range */ + { "\xF4\x90\x80\x80", NULL, 0 }, + { "\xF4\xBF\xBF\xBF", NULL, 0 }, + { "\xF5\x80\x80\x80", NULL, 0 }, /* thrown out early due to first byte */ + /* Invalid UTF-16: odd numbers of UTF-16 bytes */ + { NULL, "\x00", 1 }, + { NULL, "\x01\x00\x02", 3 }, + /* Invalid UTF-16: high surrogate without a following low surrogate */ + { NULL, "\x00\xD8\x00\x00", 4 }, + { NULL, "\x00\xD8\xFF\xDB", 4 }, + { NULL, "\xFF\xDB", 2 }, + /* Invalid UTF-16: low surrogate without a preceding high surrogate */ + { NULL, "\x61\x00\x00\xDC", 4 }, + { NULL, "\xFF\xDF\xFF\xDB", 4 }, +}; + +int +main(int argc, char **argv) +{ + int ret; + struct test *t; + size_t i, utf16len; + uint8_t *utf16; + char *utf8; + + for (i = 0; i < sizeof(tests) / sizeof(*tests); i++) { + t = &tests[i]; + if 
(t->utf8 != NULL) { + ret = k5_utf8_to_utf16le(t->utf8, &utf16, &utf16len); + if (t->utf16 == NULL) { + assert(ret == EINVAL); + } else { + assert(ret == 0); + assert(t->utf16len == utf16len); + assert(memcmp(t->utf16, utf16, utf16len) == 0); + free(utf16); + } + } + + if (t->utf16 != NULL) { + ret = k5_utf16le_to_utf8((uint8_t *)t->utf16, t->utf16len, &utf8); + if (t->utf8 == NULL) { + assert(ret == EINVAL); + } else { + assert(ret == 0); + assert(strcmp(t->utf8, utf8) == 0); + free(utf8); + } + } + } + return 0; +} diff --git a/src/util/support/utf8.c b/src/util/support/utf8.c index e42c0c7dc8..34e2b6adb0 100644 --- a/src/util/support/utf8.c +++ b/src/util/support/utf8.c @@ -205,7 +205,7 @@ int krb5int_utf8_to_ucs2(const char *p, krb5_ucs2 *out) return 0; } -/* conv UCS-2 to UTF-8, not used */ +/* conv UCS-4 to UTF-8 */ size_t krb5int_ucs4_to_utf8(krb5_ucs4 c, char *buf) { size_t len = 0; diff --git a/src/util/support/utf8_conv.c b/src/util/support/utf8_conv.c index 5f279c39b5..5cfc2c512b 100644 --- a/src/util/support/utf8_conv.c +++ b/src/util/support/utf8_conv.c @@ -1,7 +1,7 @@ /* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil -*- */ /* util/support/utf8_conv.c */ /* - * Copyright 2008 by the Massachusetts Institute of Technology. + * Copyright 2008, 2017 by the Massachusetts Institute of Technology. * All Rights Reserved. * * Export of this software from the United States of America may @@ -47,34 +47,56 @@ * THE PERPETRATOR TO CRIMINAL AND CIVIL LIABILITY. */ -/* This work is part of OpenLDAP Software . */ +/* This work is based on OpenLDAP Software . */ /* - * UTF-8 Conversion Routines - * - * These routines convert between Wide Character and UTF-8, - * or between MultiByte and UTF-8 encodings. - * - * Both single character and string versions of the functions are provided. - * All functions return -1 if the character or string cannot be converted. + * These routines convert between UTF-16 and UTF-8. 
UTF-16 encodes a Unicode + * character in either two or four bytes. Characters in the Basic Multilingual + * Plane (hex 0..D7FF and E000..FFFF) are encoded as-is in two bytes. + * Characters in the Supplementary Planes (10000..10FFFF) are split into a high + * surrogate and a low surrogate, each containing ten bits of the character + * value, and encoded in four bytes. */ #include "k5-platform.h" #include "k5-utf8.h" #include "k5-buf.h" +#include "k5-input.h" #include "supp-int.h" static unsigned char mask[] = { 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 }; +/* A high surrogate is ten bits masked with 0xD800. */ +#define IS_HIGH_SURROGATE(c) ((c) >= 0xD800 && (c) <= 0xDBFF) + +/* A low surrogate is ten bits masked with 0xDC00. */ +#define IS_LOW_SURROGATE(c) ((c) >= 0xDC00 && (c) <= 0xDFFF) + +/* A valid Unicode code point is in the range 0..10FFFF and is not a surrogate + * value. */ +#define IS_SURROGATE(c) ((c) >= 0xD800 && (c) <= 0xDFFF) +#define IS_VALID_UNICODE(c) ((c) <= 0x10FFFF && !IS_SURROGATE(c)) + +/* A Basic Multilingual Plane character is in the range 0..FFFF and is not a + * surrogate value. */ +#define IS_BMP(c) ((c) <= 0xFFFF && !IS_SURROGATE(c)) + +/* Characters in the Supplementary Planes have a base value subtracted from + * their code points to form a 20-bit value; ten bits go in each surrogate. 
*/ +#define BASE 0x10000 +#define HIGH_SURROGATE(c) (0xD800 | (((c) - BASE) >> 10)) +#define LOW_SURROGATE(c) (0xDC00 | (((c) - BASE) & 0x3FF)) +#define COMPOSE(c1, c2) (BASE + ((((c1) & 0x3FF) << 10) | ((c2) & 0x3FF))) + int -k5_utf8_to_ucs2le(const char *utf8, uint8_t **ucs2_out, size_t *nbytes_out) +k5_utf8_to_utf16le(const char *utf8, uint8_t **utf16_out, size_t *nbytes_out) { struct k5buf buf; - krb5_ucs2 ch; + krb5_ucs4 ch; size_t chlen, i; - void *p; + uint8_t *p; - *ucs2_out = NULL; + *utf16_out = NULL; *nbytes_out = 0; k5_buf_init_dynamic(&buf); @@ -83,11 +105,11 @@ k5_utf8_to_ucs2le(const char *utf8, uint8_t **ucs2_out, size_t *nbytes_out) while (*utf8 != '\0') { /* Get UTF-8 sequence length from first byte. */ chlen = KRB5_UTF8_CHARLEN2(utf8, chlen); - if (chlen == 0 || chlen > KRB5_MAX_UTF8_LEN) + if (chlen == 0) goto invalid; /* First byte minus length tag */ - ch = (krb5_ucs2)(utf8[0] & mask[chlen]); + ch = (krb5_ucs4)(utf8[0] & mask[chlen]); for (i = 1; i < chlen; i++) { /* Subsequent bytes must start with 10. */ @@ -96,19 +118,30 @@ k5_utf8_to_ucs2le(const char *utf8, uint8_t **ucs2_out, size_t *nbytes_out) /* 6 bits of data in each subsequent byte */ ch <<= 6; - ch |= (krb5_ucs2)(utf8[i] & 0x3f); + ch |= (krb5_ucs4)(utf8[i] & 0x3f); } + if (!IS_VALID_UNICODE(ch)) + goto invalid; - p = k5_buf_get_space(&buf, 2); + /* Characters in the basic multilingual plane are encoded using two + * bytes; other characters are encoded using four bytes. */ + p = k5_buf_get_space(&buf, IS_BMP(ch) ? 2 : 4); if (p == NULL) return ENOMEM; - store_16_le(ch, p); + if (IS_BMP(ch)) { + store_16_le(ch, p); + } else { + /* 0x10000 is subtracted from ch; then the high ten bits plus + * 0xD800 and the low ten bits plus 0xDC00 are the surrogates. */ + store_16_le(HIGH_SURROGATE(ch), p); + store_16_le(LOW_SURROGATE(ch), p + 2); + } /* Move to next UTF-8 character. 
*/ utf8 += chlen; } - *ucs2_out = buf.data; + *utf16_out = buf.data; *nbytes_out = buf.len; return 0; @@ -118,11 +151,13 @@ invalid: } int -k5_ucs2le_to_utf8(const uint8_t *ucs2bytes, size_t nbytes, char **utf8_out) +k5_utf16le_to_utf8(const uint8_t *utf16bytes, size_t nbytes, char **utf8_out) { struct k5buf buf; - krb5_ucs2 ch; - size_t chlen, i; + struct k5input in; + uint16_t ch1, ch2; + krb5_ucs4 ch; + size_t chlen; void *p; *utf8_out = NULL; @@ -131,16 +166,37 @@ k5_ucs2le_to_utf8(const uint8_t *ucs2bytes, size_t nbytes, char **utf8_out) return EINVAL; k5_buf_init_dynamic(&buf); + k5_input_init(&in, utf16bytes, nbytes); + while (!in.status && in.len > 0) { + /* Get the next character or high surrogate. A low surrogate without a + * preceding high surrogate is invalid. */ + ch1 = k5_input_get_uint16_le(&in); + if (IS_LOW_SURROGATE(ch1)) + goto invalid; + if (IS_HIGH_SURROGATE(ch1)) { + /* Get the low surrogate and combine the pair. */ + ch2 = k5_input_get_uint16_le(&in); + if (!IS_LOW_SURROGATE(ch2)) + goto invalid; + ch = COMPOSE(ch1, ch2); + } else { + ch = ch1; + } - for (i = 0; i < nbytes; i += 2) { - ch = load_16_le(&ucs2bytes[i]); - chlen = krb5int_ucs2_to_utf8(ch, NULL); + chlen = krb5int_ucs4_to_utf8(ch, NULL); p = k5_buf_get_space(&buf, chlen); if (p == NULL) return ENOMEM; - (void)krb5int_ucs2_to_utf8(ch, p); + (void)krb5int_ucs4_to_utf8(ch, p); } + if (in.status) + goto invalid; + *utf8_out = buf.data; return 0; + +invalid: + k5_buf_free(&buf); + return EINVAL; }