/src/util/support/t_path_win
/src/util/support/t_unal
/src/util/support/t_utf8
+/src/util/support/t_utf16
/src/util/verto/rename.h
typedef uint16_t krb5_ucs2;
typedef uint32_t krb5_ucs4;
-#define KRB5_MAX_UTF8_LEN (sizeof(krb5_ucs2) * 3/2)
-
int krb5int_utf8_to_ucs2(const char *p, krb5_ucs2 *out);
size_t krb5int_ucs2_to_utf8(krb5_ucs2 c, char *buf);
size_t krb5int_ucs4_to_utf8(krb5_ucs4 c, char *buf);
/*
- * Convert a little-endian UCS-2 string to an allocated null-terminated UTF-8
+ * Convert a little-endian UTF-16 string to an allocated null-terminated UTF-8
* string. nbytes is the length of utf16bytes in bytes, and must be an even
* number. Return EINVAL on invalid input, ENOMEM on out of memory, or 0 on
* success.
*/
-int k5_ucs2le_to_utf8(const uint8_t *ucs2bytes, size_t nbytes,
- char **utf8_out);
+int k5_utf16le_to_utf8(const uint8_t *utf16bytes, size_t nbytes,
+ char **utf8_out);
/*
- * Convert a UTF-8 string to an allocated little-endian UCS-2 string. The
+ * Convert a UTF-8 string to an allocated little-endian UTF-16 string. The
* resulting length is in bytes and will always be even. Return EINVAL on
* invalid input, ENOMEM on out of memory, or 0 on success.
*/
-int k5_utf8_to_ucs2le(const char *utf8, uint8_t **ucs2_out,
- size_t *nbytes_out);
+int k5_utf8_to_utf16le(const char *utf8, uint8_t **utf16_out,
+ size_t *nbytes_out);
/* returns the number of bytes in the UTF-8 string */
size_t krb5int_utf8_bytes(const char *);
utf8 = k5memdup0(string->data, string->length, &err);
if (utf8 == NULL)
return err;
- err = k5_utf8_to_ucs2le(utf8, &copystr, &copystrlen);
+ err = k5_utf8_to_utf16le(utf8, &copystr, &copystrlen);
free(utf8);
if (err)
return err;
pac_princname_length % 2)
return ERANGE;
- ret = k5_ucs2le_to_utf8(p, pac_princname_length, &pac_princname);
+ ret = k5_utf16le_to_utf8(p, pac_princname_length, &pac_princname);
if (ret != 0)
return ret;
krb5_error_code ret;
krb5_data client_info;
char *princ_name_utf8 = NULL;
- unsigned char *princ_name_ucs2 = NULL, *p;
- size_t princ_name_ucs2_len = 0;
+ unsigned char *princ_name_utf16 = NULL, *p;
+ size_t princ_name_utf16_len = 0;
uint64_t nt_authtime;
/* If we already have a CLIENT_INFO buffer, then just validate it */
if (ret != 0)
goto cleanup;
- ret = k5_utf8_to_ucs2le(princ_name_utf8, &princ_name_ucs2,
- &princ_name_ucs2_len);
+ ret = k5_utf8_to_utf16le(princ_name_utf8, &princ_name_utf16,
+ &princ_name_utf16_len);
if (ret != 0)
goto cleanup;
- client_info.length = PAC_CLIENT_INFO_LENGTH + princ_name_ucs2_len;
+ client_info.length = PAC_CLIENT_INFO_LENGTH + princ_name_utf16_len;
client_info.data = NULL;
ret = k5_pac_add_buffer(context, pac, KRB5_PAC_CLIENT_INFO,
store_64_le(nt_authtime, p);
p += 8;
- /* copy in number of UCS-2 characters in principal name */
- store_16_le(princ_name_ucs2_len, p);
+ /* copy in number of UTF-16 bytes in principal name */
+ store_16_le(princ_name_utf16_len, p);
p += 2;
/* copy in principal name */
- memcpy(p, princ_name_ucs2, princ_name_ucs2_len);
+ memcpy(p, princ_name_utf16, princ_name_utf16_len);
cleanup:
- if (princ_name_ucs2 != NULL)
- free(princ_name_ucs2);
+ if (princ_name_utf16 != NULL)
+ free(princ_name_utf16);
krb5_free_unparsed_name(context, princ_name_utf8);
return ret;
$(srcdir)/bcmp.c \
$(srcdir)/strerror_r.c \
$(srcdir)/t_utf8.c \
+ $(srcdir)/t_utf16.c \
$(srcdir)/getopt.c \
$(srcdir)/getopt_long.c
t_utf8: t_utf8.o utf8.o
$(CC_LINK) -o t_utf8 t_utf8.o utf8.o
-TEST_PROGS= t_k5buf t_path t_path_win t_base64 t_json t_unal t_utf8
+T_UTF16_OBJS= t_utf16.o utf8_conv.o utf8.o k5buf.o $(PRINTF_ST_OBJ)
+
+t_utf16: $(T_UTF16_OBJS)
+ $(CC_LINK) -o $@ $(T_UTF16_OBJS)
+
+TEST_PROGS= t_k5buf t_path t_path_win t_base64 t_json t_unal t_utf8 t_utf16
check-unix: $(TEST_PROGS)
./t_k5buf
./t_json
./t_unal
./t_utf8
+ ./t_utf16
clean:
$(RM) t_k5buf.o t_k5buf t_unal.o t_unal path_win.o path_win
$(RM) t_path_win.o t_path_win t_path.o t_path t_base64.o t_base64
$(RM) t_json.o t_json libkrb5support.exports t_utf8.o t_utf8
+ $(RM) t_utf16.o t_utf16
@lib_frag@
@libobj_frag@
$(top_srcdir)/include/k5-utf8.h supp-int.h utf8.c
utf8_conv.so utf8_conv.po $(OUTPRE)utf8_conv.$(OBJEXT): \
$(BUILDTOP)/include/autoconf.h $(top_srcdir)/include/k5-buf.h \
- $(top_srcdir)/include/k5-platform.h $(top_srcdir)/include/k5-thread.h \
- $(top_srcdir)/include/k5-utf8.h supp-int.h utf8_conv.c
+ $(top_srcdir)/include/k5-input.h $(top_srcdir)/include/k5-platform.h \
+ $(top_srcdir)/include/k5-thread.h $(top_srcdir)/include/k5-utf8.h \
+ supp-int.h utf8_conv.c
gettimeofday.so gettimeofday.po $(OUTPRE)gettimeofday.$(OBJEXT): \
$(BUILDTOP)/include/autoconf.h $(top_srcdir)/include/k5-platform.h \
$(top_srcdir)/include/k5-thread.h gettimeofday.c
t_utf8.so t_utf8.po $(OUTPRE)t_utf8.$(OBJEXT): $(BUILDTOP)/include/autoconf.h \
$(top_srcdir)/include/k5-platform.h $(top_srcdir)/include/k5-thread.h \
$(top_srcdir)/include/k5-utf8.h t_utf8.c
+t_utf16.so t_utf16.po $(OUTPRE)t_utf16.$(OBJEXT): $(BUILDTOP)/include/autoconf.h \
+ $(top_srcdir)/include/k5-platform.h $(top_srcdir)/include/k5-thread.h \
+ $(top_srcdir)/include/k5-utf8.h t_utf16.c
getopt.so getopt.po $(OUTPRE)getopt.$(OBJEXT): $(BUILDTOP)/include/autoconf.h \
$(top_srcdir)/include/k5-platform.h $(top_srcdir)/include/k5-thread.h \
getopt.c
k5_path_join
k5_path_split
k5_strerror_r
-k5_utf8_to_ucs2le
-k5_ucs2le_to_utf8
+k5_utf8_to_utf16le
+k5_utf16le_to_utf8
krb5int_key_register
krb5int_key_delete
krb5int_getspecific
--- /dev/null
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+/* util/support/t_utf16.c - test UTF-16 conversion functions */
+/*
+ * Copyright (C) 2017 by the Massachusetts Institute of Technology.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * This program tests conversions between UTF-8 and little-endian UTF-16, with
+ * an eye mainly towards covering UTF-16 edge cases and UTF-8 decoding results
+ * which we detect as invalid in utf8_conv.c. t_utf8.c covers more UTF-8 edge
+ * cases.
+ */
+
+#include <stdio.h>
+#include <string.h>
+
+#include "k5-platform.h"
+#include "k5-utf8.h"
+
+struct test {
+ const char *utf8;
+ const char *utf16;
+ size_t utf16len;
+} tests[] = {
+ { "", "", 0 },
+ { "abcd", "a\0b\0c\0d\0", 8 },
+ /* From RFC 2781 (tests code point 0x12345 and some ASCII) */
+ { "\xF0\x92\x8D\x85=Ra", "\x08\xD8\x45\xDF=\0R\0a\0", 10 },
+ /* Lowest and highest Supplementary Plane code points */
+ { "\xF0\x90\x80\x80 \xF4\x8F\xBF\xBF",
+ "\x00\xD8\x00\xDC \0\xFF\xDB\xFF\xDF", 10 },
+ /* Basic Multilingual Plane code points near and above surrogate range */
+ { "\xED\x9F\xBF", "\xFF\xD7", 2 },
+ { "\xEE\x80\x80 \xEE\xBF\xBF", "\x00\xE0 \0\xFF\xEF", 6 },
+ /* Invalid UTF-8: decodes to value in surrogate pair range */
+ { "\xED\xA0\x80", NULL, 0 }, /* 0xD800 */
+ { "\xED\xAF\xBF", NULL, 0 }, /* 0xDBFF */
+ { "\xED\xB0\x80", NULL, 0 }, /* 0xDC00 */
+ { "\xED\xBF\xBF", NULL, 0 }, /* 0xDFFF */
+ /* Invalid UTF-8: decodes to value above Unicode range */
+ { "\xF4\x90\x80\x80", NULL, 0 },
+ { "\xF4\xBF\xBF\xBF", NULL, 0 },
+ { "\xF5\x80\x80\x80", NULL, 0 }, /* thrown out early due to first byte */
+ /* Invalid UTF-16: odd numbers of UTF-16 bytes */
+ { NULL, "\x00", 1 },
+ { NULL, "\x01\x00\x02", 3 },
+ /* Invalid UTF-16: high surrogate without a following low surrogate */
+ { NULL, "\x00\xD8\x00\x00", 4 },
+ { NULL, "\x00\xD8\xFF\xDB", 4 },
+ { NULL, "\xFF\xDB", 2 },
+ /* Invalid UTF-16: low surrogate without a preceding high surrogate */
+ { NULL, "\x61\x00\x00\xDC", 4 },
+ { NULL, "\xFF\xDF\xFF\xDB", 4 },
+};
+
+int
+main(int argc, char **argv)
+{
+ int ret;
+ struct test *t;
+ size_t i, utf16len;
+ uint8_t *utf16;
+ char *utf8;
+
+ for (i = 0; i < sizeof(tests) / sizeof(*tests); i++) {
+ t = &tests[i];
+ if (t->utf8 != NULL) {
+ ret = k5_utf8_to_utf16le(t->utf8, &utf16, &utf16len);
+ if (t->utf16 == NULL) {
+ assert(ret == EINVAL);
+ } else {
+ assert(ret == 0);
+ assert(t->utf16len == utf16len);
+ assert(memcmp(t->utf16, utf16, utf16len) == 0);
+ free(utf16);
+ }
+ }
+
+ if (t->utf16 != NULL) {
+ ret = k5_utf16le_to_utf8((uint8_t *)t->utf16, t->utf16len, &utf8);
+ if (t->utf8 == NULL) {
+ assert(ret == EINVAL);
+ } else {
+ assert(ret == 0);
+ assert(strcmp(t->utf8, utf8) == 0);
+ free(utf8);
+ }
+ }
+ }
+ return 0;
+}
return 0;
}
-/* conv UCS-2 to UTF-8, not used */
+/* conv UCS-4 to UTF-8 */
size_t krb5int_ucs4_to_utf8(krb5_ucs4 c, char *buf)
{
size_t len = 0;
/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil -*- */
/* util/support/utf8_conv.c */
/*
- * Copyright 2008 by the Massachusetts Institute of Technology.
+ * Copyright 2008, 2017 by the Massachusetts Institute of Technology.
* All Rights Reserved.
*
* Export of this software from the United States of America may
* THE PERPETRATOR TO CRIMINAL AND CIVIL LIABILITY.
*/
-/* This work is part of OpenLDAP Software <http://www.openldap.org/>. */
+/* This work is based on OpenLDAP Software <http://www.openldap.org/>. */
/*
- * UTF-8 Conversion Routines
- *
- * These routines convert between Wide Character and UTF-8,
- * or between MultiByte and UTF-8 encodings.
- *
- * Both single character and string versions of the functions are provided.
- * All functions return -1 if the character or string cannot be converted.
+ * These routines convert between UTF-16 and UTF-8. UTF-16 encodes a Unicode
+ * character in either two or four bytes. Characters in the Basic Multilingual
+ * Plane (hex 0..D7FF and E000..FFFF) are encoded as-is in two bytes.
+ * Characters in the Supplementary Planes (10000..10FFFF) are split into a high
+ * surrogate and a low surrogate, each containing ten bits of the character
+ * value, and encoded in four bytes.
*/
#include "k5-platform.h"
#include "k5-utf8.h"
#include "k5-buf.h"
+#include "k5-input.h"
#include "supp-int.h"
static unsigned char mask[] = { 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
+/* A high surrogate is ten bits ORed with 0xD800. */
+#define IS_HIGH_SURROGATE(c) ((c) >= 0xD800 && (c) <= 0xDBFF)
+
+/* A low surrogate is ten bits ORed with 0xDC00. */
+#define IS_LOW_SURROGATE(c) ((c) >= 0xDC00 && (c) <= 0xDFFF)
+
+/* A valid Unicode code point is in the range 0..10FFFF and is not a surrogate
+ * value. */
+#define IS_SURROGATE(c) ((c) >= 0xD800 && (c) <= 0xDFFF)
+#define IS_VALID_UNICODE(c) ((c) <= 0x10FFFF && !IS_SURROGATE(c))
+
+/* A Basic Multilingual Plane character is in the range 0..FFFF and is not a
+ * surrogate value. */
+#define IS_BMP(c) ((c) <= 0xFFFF && !IS_SURROGATE(c))
+
+/* Characters in the Supplementary Planes have a base value subtracted from
+ * their code points to form a 20-bit value; ten bits go in each surrogate. */
+#define BASE 0x10000
+#define HIGH_SURROGATE(c) (0xD800 | (((c) - BASE) >> 10))
+#define LOW_SURROGATE(c) (0xDC00 | (((c) - BASE) & 0x3FF))
+#define COMPOSE(c1, c2) (BASE + ((((c1) & 0x3FF) << 10) | ((c2) & 0x3FF)))
+
int
-k5_utf8_to_ucs2le(const char *utf8, uint8_t **ucs2_out, size_t *nbytes_out)
+k5_utf8_to_utf16le(const char *utf8, uint8_t **utf16_out, size_t *nbytes_out)
{
struct k5buf buf;
- krb5_ucs2 ch;
+ krb5_ucs4 ch;
size_t chlen, i;
- void *p;
+ uint8_t *p;
- *ucs2_out = NULL;
+ *utf16_out = NULL;
*nbytes_out = 0;
k5_buf_init_dynamic(&buf);
while (*utf8 != '\0') {
/* Get UTF-8 sequence length from first byte. */
chlen = KRB5_UTF8_CHARLEN2(utf8, chlen);
- if (chlen == 0 || chlen > KRB5_MAX_UTF8_LEN)
+ if (chlen == 0)
goto invalid;
/* First byte minus length tag */
- ch = (krb5_ucs2)(utf8[0] & mask[chlen]);
+ ch = (krb5_ucs4)(utf8[0] & mask[chlen]);
for (i = 1; i < chlen; i++) {
/* Subsequent bytes must start with 10. */
/* 6 bits of data in each subsequent byte */
ch <<= 6;
- ch |= (krb5_ucs2)(utf8[i] & 0x3f);
+ ch |= (krb5_ucs4)(utf8[i] & 0x3f);
}
+ if (!IS_VALID_UNICODE(ch))
+ goto invalid;
- p = k5_buf_get_space(&buf, 2);
+ /* Characters in the basic multilingual plane are encoded using two
+ * bytes; other characters are encoded using four bytes. */
+ p = k5_buf_get_space(&buf, IS_BMP(ch) ? 2 : 4);
if (p == NULL)
return ENOMEM;
- store_16_le(ch, p);
+ if (IS_BMP(ch)) {
+ store_16_le(ch, p);
+ } else {
+ /* 0x10000 is subtracted from ch; then the high ten bits plus
+ * 0xD800 and the low ten bits plus 0xDC00 are the surrogates. */
+ store_16_le(HIGH_SURROGATE(ch), p);
+ store_16_le(LOW_SURROGATE(ch), p + 2);
+ }
/* Move to next UTF-8 character. */
utf8 += chlen;
}
- *ucs2_out = buf.data;
+ *utf16_out = buf.data;
*nbytes_out = buf.len;
return 0;
}
int
-k5_ucs2le_to_utf8(const uint8_t *ucs2bytes, size_t nbytes, char **utf8_out)
+k5_utf16le_to_utf8(const uint8_t *utf16bytes, size_t nbytes, char **utf8_out)
{
struct k5buf buf;
- krb5_ucs2 ch;
- size_t chlen, i;
+ struct k5input in;
+ uint16_t ch1, ch2;
+ krb5_ucs4 ch;
+ size_t chlen;
void *p;
*utf8_out = NULL;
return EINVAL;
k5_buf_init_dynamic(&buf);
+ k5_input_init(&in, utf16bytes, nbytes);
+ while (!in.status && in.len > 0) {
+ /* Get the next character or high surrogate. A low surrogate without a
+ * preceding high surrogate is invalid. */
+ ch1 = k5_input_get_uint16_le(&in);
+ if (IS_LOW_SURROGATE(ch1))
+ goto invalid;
+ if (IS_HIGH_SURROGATE(ch1)) {
+ /* Get the low surrogate and combine the pair. */
+ ch2 = k5_input_get_uint16_le(&in);
+ if (!IS_LOW_SURROGATE(ch2))
+ goto invalid;
+ ch = COMPOSE(ch1, ch2);
+ } else {
+ ch = ch1;
+ }
- for (i = 0; i < nbytes; i += 2) {
- ch = load_16_le(&ucs2bytes[i]);
- chlen = krb5int_ucs2_to_utf8(ch, NULL);
+ chlen = krb5int_ucs4_to_utf8(ch, NULL);
p = k5_buf_get_space(&buf, chlen);
if (p == NULL)
return ENOMEM;
- (void)krb5int_ucs2_to_utf8(ch, p);
+ (void)krb5int_ucs4_to_utf8(ch, p);
}
+ if (in.status)
+ goto invalid;
+
*utf8_out = buf.data;
return 0;
+
+invalid:
+ k5_buf_free(&buf);
+ return EINVAL;
}