Replace UCS-2 conversions with UTF-16

author Greg Hudson <ghudson@mit.edu>

Tue, 18 Apr 2017 18:01:06 +0000 (14:01 -0400)

committer Greg Hudson <ghudson@mit.edu>

Fri, 21 Apr 2017 17:08:09 +0000 (13:08 -0400)
author Greg Hudson <ghudson@mit.edu>
Tue, 18 Apr 2017 18:01:06 +0000 (14:01 -0400)
committer Greg Hudson <ghudson@mit.edu>
Fri, 21 Apr 2017 17:08:09 +0000 (13:08 -0400)
diff --git a/.gitignore b/.gitignore

index 815c67d2c0c9ff39bb0cc29bc36ceca2f2720e32..862a87a71533a0acbef984feedfecc5ef17a2ce3 100644 (file)
--- a/.gitignore
+++ b/.gitignore
@@ -523,6 +523,7 @@ local.properties
  /src/util/support/t_path_win
  /src/util/support/t_unal
  /src/util/support/t_utf8
+/src/util/support/t_utf16
  
  /src/util/verto/rename.h
  
diff --git a/src/include/k5-utf8.h b/src/include/k5-utf8.h

index 4b7415e66b13c6c2716b178e7889dd502c2a481f..e2f20d45028f0447d6360422554f943065ee1847 100644 (file)
--- a/src/include/k5-utf8.h
+++ b/src/include/k5-utf8.h
@@ -73,8 +73,6 @@
  typedef uint16_t krb5_ucs2;
  typedef uint32_t krb5_ucs4;
  
-#define KRB5_MAX_UTF8_LEN   (sizeof(krb5_ucs2) * 3/2)
-
  int krb5int_utf8_to_ucs2(const char *p, krb5_ucs2 *out);
  size_t krb5int_ucs2_to_utf8(krb5_ucs2 c, char *buf);
  
@@ -82,21 +80,21 @@ int krb5int_utf8_to_ucs4(const char *p, krb5_ucs4 *out);
  size_t krb5int_ucs4_to_utf8(krb5_ucs4 c, char *buf);
  
  /*
- * Convert a little-endian UCS-2 string to an allocated null-terminated UTF-8
+ * Convert a little-endian UTF-16 string to an allocated null-terminated UTF-8
- * string.  nbytes is the length of ucs2bytes in bytes, and must be an even
+ * string.  nbytes is the length of utf16bytes in bytes, and must be an even
   * number.  Return EINVAL on invalid input, ENOMEM on out of memory, or 0 on
   * success.
   */
-int k5_ucs2le_to_utf8(const uint8_t *ucs2bytes, size_t nbytes,
-                      char **utf8_out);
+int k5_utf16le_to_utf8(const uint8_t *utf16bytes, size_t nbytes,
+                       char **utf8_out);
  
  /*
- * Convert a UTF-8 string to an allocated little-endian UCS-2 string.  The
+ * Convert a UTF-8 string to an allocated little-endian UTF-16 string.  The
   * resulting length is in bytes and will always be even.  Return EINVAL on
   * invalid input, ENOMEM on out of memory, or 0 on success.
   */
-int k5_utf8_to_ucs2le(const char *utf8, uint8_t **ucs2_out,
-                      size_t *nbytes_out);
+int k5_utf8_to_utf16le(const char *utf8, uint8_t **utf16_out,
+                       size_t *nbytes_out);
  
  /* returns the number of bytes in the UTF-8 string */
  size_t krb5int_utf8_bytes(const char *);
diff --git a/src/lib/crypto/krb/s2k_rc4.c b/src/lib/crypto/krb/s2k_rc4.c

index fb41b269d205a44e79699d9f511b9bb1766c4030..081a91217c6922d0b7bd027cef01ade2ab4ad030 100644 (file)
--- a/src/lib/crypto/krb/s2k_rc4.c
+++ b/src/lib/crypto/krb/s2k_rc4.c
@@ -24,7 +24,7 @@ krb5int_arcfour_string_to_key(const struct krb5_keytypes *ktp,
      utf8 = k5memdup0(string->data, string->length, &err);
      if (utf8 == NULL)
          return err;
-    err = k5_utf8_to_ucs2le(utf8, &copystr, &copystrlen);
+    err = k5_utf8_to_utf16le(utf8, &copystr, &copystrlen);
      free(utf8);
      if (err)
          return err;
diff --git a/src/lib/krb5/krb/pac.c b/src/lib/krb5/krb/pac.c

index 485a0f7c518b58c711794302581f31c1dca8c7b8..d1662b98f3de500f5366f18c19f523a5e1f3db6b 100644 (file)
--- a/src/lib/krb5/krb/pac.c
+++ b/src/lib/krb5/krb/pac.c
@@ -436,7 +436,7 @@ k5_pac_validate_client(krb5_context context,
          pac_princname_length % 2)
          return ERANGE;
  
-    ret = k5_ucs2le_to_utf8(p, pac_princname_length, &pac_princname);
+    ret = k5_utf16le_to_utf8(p, pac_princname_length, &pac_princname);
      if (ret != 0)
          return ret;
  
diff --git a/src/lib/krb5/krb/pac_sign.c b/src/lib/krb5/krb/pac_sign.c

index c6eee767db19ab7dc695573ac9fc7e7b98a3b6b3..c94899c96a79442074ff9e062a2602599d934d34 100644 (file)
--- a/src/lib/krb5/krb/pac_sign.c
+++ b/src/lib/krb5/krb/pac_sign.c
@@ -38,8 +38,8 @@ k5_insert_client_info(krb5_context context,
      krb5_error_code ret;
      krb5_data client_info;
      char *princ_name_utf8 = NULL;
-    unsigned char *princ_name_ucs2 = NULL, *p;
-    size_t princ_name_ucs2_len = 0;
+    unsigned char *princ_name_utf16 = NULL, *p;
+    size_t princ_name_utf16_len = 0;
      uint64_t nt_authtime;
  
      /* If we already have a CLIENT_INFO buffer, then just validate it */
@@ -54,12 +54,12 @@ k5_insert_client_info(krb5_context context,
      if (ret != 0)
          goto cleanup;
  
-    ret = k5_utf8_to_ucs2le(princ_name_utf8, &princ_name_ucs2,
-                            &princ_name_ucs2_len);
+    ret = k5_utf8_to_utf16le(princ_name_utf8, &princ_name_utf16,
+                             &princ_name_utf16_len);
      if (ret != 0)
          goto cleanup;
  
-    client_info.length = PAC_CLIENT_INFO_LENGTH + princ_name_ucs2_len;
+    client_info.length = PAC_CLIENT_INFO_LENGTH + princ_name_utf16_len;
      client_info.data = NULL;
  
      ret = k5_pac_add_buffer(context, pac, KRB5_PAC_CLIENT_INFO,
@@ -74,16 +74,16 @@ k5_insert_client_info(krb5_context context,
      store_64_le(nt_authtime, p);
      p += 8;
  
-    /* copy in number of UCS-2 characters in principal name */
-    store_16_le(princ_name_ucs2_len, p);
+    /* copy in number of UTF-16 bytes in principal name */
+    store_16_le(princ_name_utf16_len, p);
      p += 2;
  
      /* copy in principal name */
-    memcpy(p, princ_name_ucs2, princ_name_ucs2_len);
+    memcpy(p, princ_name_utf16, princ_name_utf16_len);
  
  cleanup:
-    if (princ_name_ucs2 != NULL)
-        free(princ_name_ucs2);
+    if (princ_name_utf16 != NULL)
+        free(princ_name_utf16);
      krb5_free_unparsed_name(context, princ_name_utf8);
  
      return ret;
diff --git a/src/util/support/Makefile.in b/src/util/support/Makefile.in

index 6239e41761ee6821fcef79bc143531277c9c82a1..0bf0b7a8727760f5527270d376f5af6ff1f1c55f 100644 (file)
--- a/src/util/support/Makefile.in
+++ b/src/util/support/Makefile.in
@@ -143,6 +143,7 @@ SRCS=\
         $(srcdir)/bcmp.c \
         $(srcdir)/strerror_r.c \
         $(srcdir)/t_utf8.c \
+       $(srcdir)/t_utf16.c \
         $(srcdir)/getopt.c \
         $(srcdir)/getopt_long.c
  
@@ -220,7 +221,12 @@ t_unal: t_unal.o
  t_utf8: t_utf8.o utf8.o
         $(CC_LINK) -o t_utf8 t_utf8.o utf8.o
  
-TEST_PROGS= t_k5buf t_path t_path_win t_base64 t_json t_unal t_utf8
+T_UTF16_OBJS= t_utf16.o utf8_conv.o utf8.o k5buf.o $(PRINTF_ST_OBJ)
+
+t_utf16: $(T_UTF16_OBJS)
+       $(CC_LINK) -o $@ $(T_UTF16_OBJS)
+
+TEST_PROGS= t_k5buf t_path t_path_win t_base64 t_json t_unal t_utf8 t_utf16
  
  check-unix: $(TEST_PROGS)
         ./t_k5buf
@@ -230,11 +236,13 @@ check-unix: $(TEST_PROGS)
         ./t_json
         ./t_unal
         ./t_utf8
+       ./t_utf16
  
  clean:
         $(RM) t_k5buf.o t_k5buf t_unal.o t_unal path_win.o path_win
         $(RM) t_path_win.o t_path_win t_path.o t_path t_base64.o t_base64
         $(RM) t_json.o t_json libkrb5support.exports t_utf8.o t_utf8
+       $(RM) t_utf16.o t_utf16
  
  @lib_frag@
  @libobj_frag@
diff --git a/src/util/support/deps b/src/util/support/deps

index a95d2ad90830a2352031df4a3766c9d5c1411cd9..34d8a884b3302af17d6cd0217a81000cde4c2d81 100644 (file)
--- a/src/util/support/deps
+++ b/src/util/support/deps
@@ -34,8 +34,9 @@ utf8.so utf8.po $(OUTPRE)utf8.$(OBJEXT): $(BUILDTOP)/include/autoconf.h \
    $(top_srcdir)/include/k5-utf8.h supp-int.h utf8.c
  utf8_conv.so utf8_conv.po $(OUTPRE)utf8_conv.$(OBJEXT): \
    $(BUILDTOP)/include/autoconf.h $(top_srcdir)/include/k5-buf.h \
-  $(top_srcdir)/include/k5-platform.h $(top_srcdir)/include/k5-thread.h \
-  $(top_srcdir)/include/k5-utf8.h supp-int.h utf8_conv.c
+  $(top_srcdir)/include/k5-input.h $(top_srcdir)/include/k5-platform.h \
+  $(top_srcdir)/include/k5-thread.h $(top_srcdir)/include/k5-utf8.h \
+  supp-int.h utf8_conv.c
  gettimeofday.so gettimeofday.po $(OUTPRE)gettimeofday.$(OBJEXT): \
    $(BUILDTOP)/include/autoconf.h $(top_srcdir)/include/k5-platform.h \
    $(top_srcdir)/include/k5-thread.h gettimeofday.c
@@ -84,6 +85,9 @@ strerror_r.so strerror_r.po $(OUTPRE)strerror_r.$(OBJEXT): \
  t_utf8.so t_utf8.po $(OUTPRE)t_utf8.$(OBJEXT): $(BUILDTOP)/include/autoconf.h \
    $(top_srcdir)/include/k5-platform.h $(top_srcdir)/include/k5-thread.h \
    $(top_srcdir)/include/k5-utf8.h t_utf8.c
+t_utf16.so t_utf16.po $(OUTPRE)t_utf16.$(OBJEXT): $(BUILDTOP)/include/autoconf.h \
+  $(top_srcdir)/include/k5-platform.h $(top_srcdir)/include/k5-thread.h \
+  $(top_srcdir)/include/k5-utf8.h t_utf16.c
  getopt.so getopt.po $(OUTPRE)getopt.$(OBJEXT): $(BUILDTOP)/include/autoconf.h \
    $(top_srcdir)/include/k5-platform.h $(top_srcdir)/include/k5-thread.h \
    getopt.c
diff --git a/src/util/support/libkrb5support-fixed.exports b/src/util/support/libkrb5support-fixed.exports

index 750dc243f796e801266bc5ed123fd60f71d7f551..fd74a1897ebbccc0a314b14fb456e4859e10be35 100644 (file)
--- a/src/util/support/libkrb5support-fixed.exports
+++ b/src/util/support/libkrb5support-fixed.exports
@@ -52,8 +52,8 @@ k5_path_isabs
  k5_path_join
  k5_path_split
  k5_strerror_r
-k5_utf8_to_ucs2le
-k5_ucs2le_to_utf8
+k5_utf8_to_utf16le
+k5_utf16le_to_utf8
  krb5int_key_register
  krb5int_key_delete
  krb5int_getspecific
diff --git a/src/util/support/t_utf16.c b/src/util/support/t_utf16.c

new file mode 100644 (file)

index 0000000..bc3390a
--- /dev/null
+++ b/src/util/support/t_utf16.c
@@ -0,0 +1,117 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+/* util/support/t_utf16.c - test UTF-16 conversion functions */
+/*
+ * Copyright (C) 2017 by the Massachusetts Institute of Technology.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in
+ *   the documentation and/or other materials provided with the
+ *   distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * This program tests conversions between UTF-8 and little-endian UTF-16, with
+ * an eye mainly towards covering UTF-16 edge cases and UTF-8 decoding results
+ * which we detect as invalid in utf8_conv.c.  t_utf8.c covers more UTF-8 edge
+ * cases.
+ */
+
+#include <stdio.h>
+#include <string.h>
+
+#include "k5-platform.h"
+#include "k5-utf8.h"
+
+struct test {
+    const char *utf8;
+    const char *utf16;
+    size_t utf16len;
+} tests[] = {
+    { "", "", 0 },
+    { "abcd", "a\0b\0c\0d\0", 8 },
+    /* From RFC 2781 (tests code point 0x12345 and some ASCII) */
+    { "\xF0\x92\x8D\x85=Ra", "\x08\xD8\x45\xDF=\0R\0a\0", 10 },
+    /* Lowest and highest Supplementary Plane code points */
+    { "\xF0\x90\x80\x80 \xF4\x8F\xBF\xBF",
+      "\x00\xD8\x00\xDC \0\xFF\xDB\xFF\xDF", 10 },
+    /* Basic Multilingual Plane code points near and above surrogate range */
+    { "\xED\x9F\xBF", "\xFF\xD7", 2 },
+    { "\xEE\x80\x80 \xEE\xBF\xBF", "\x00\xE0 \0\xFF\xEF", 6 },
+    /* Invalid UTF-8: decodes to value in surrogate pair range */
+    { "\xED\xA0\x80", NULL, 0 }, /* 0xD800 */
+    { "\xED\xAF\xBF", NULL, 0 }, /* 0xDBFF */
+    { "\xED\xB0\x80", NULL, 0 }, /* 0xDC00 */
+    { "\xED\xBF\xBF", NULL, 0 }, /* 0xDFFF */
+    /* Invalid UTF-8: decodes to value above Unicode range */
+    { "\xF4\x90\x80\x80", NULL, 0 },
+    { "\xF4\xBF\xBF\xBF", NULL, 0 },
+    { "\xF5\x80\x80\x80", NULL, 0 }, /* thrown out early due to first byte */
+    /* Invalid UTF-16: odd numbers of UTF-16 bytes */
+    { NULL, "\x00", 1 },
+    { NULL, "\x01\x00\x02", 3 },
+    /* Invalid UTF-16: high surrogate without a following low surrogate */
+    { NULL, "\x00\xD8\x00\x00", 4 },
+    { NULL, "\x00\xD8\xFF\xDB", 4 },
+    { NULL, "\xFF\xDB", 2 },
+    /* Invalid UTF-16: low surrogate without a preceding high surrogate */
+    { NULL, "\x61\x00\x00\xDC", 4 },
+    { NULL, "\xFF\xDF\xFF\xDB", 4 },
+};
+
+int
+main(int argc, char **argv)
+{
+    int ret;
+    struct test *t;
+    size_t i, utf16len;
+    uint8_t *utf16;
+    char *utf8;
+
+    for (i = 0; i < sizeof(tests) / sizeof(*tests); i++) {
+        t = &tests[i];
+        if (t->utf8 != NULL) {
+            ret = k5_utf8_to_utf16le(t->utf8, &utf16, &utf16len);
+            if (t->utf16 == NULL) {
+                assert(ret == EINVAL);
+            } else {
+                assert(ret == 0);
+                assert(t->utf16len == utf16len);
+                assert(memcmp(t->utf16, utf16, utf16len) == 0);
+                free(utf16);
+            }
+        }
+
+        if (t->utf16 != NULL) {
+            ret = k5_utf16le_to_utf8((uint8_t *)t->utf16, t->utf16len, &utf8);
+            if (t->utf8 == NULL) {
+                assert(ret == EINVAL);
+            } else {
+                assert(ret == 0);
+                assert(strcmp(t->utf8, utf8) == 0);
+                free(utf8);
+            }
+        }
+    }
+    return 0;
+}
diff --git a/src/util/support/utf8.c b/src/util/support/utf8.c

index e42c0c7dc82bfa016f17347d606665d5873297a7..34e2b6adb059d4a097bffac5240bf915c6cb5951 100644 (file)
--- a/src/util/support/utf8.c
+++ b/src/util/support/utf8.c
@@ -205,7 +205,7 @@ int krb5int_utf8_to_ucs2(const char *p, krb5_ucs2 *out)
      return 0;
  }
  
-/* conv UCS-2 to UTF-8, not used */
+/* conv UCS-4 to UTF-8 */
  size_t krb5int_ucs4_to_utf8(krb5_ucs4 c, char *buf)
  {
      size_t len = 0;
diff --git a/src/util/support/utf8_conv.c b/src/util/support/utf8_conv.c

index 5f279c39b5ad0824cc9e6f188b3ac57ad152f8db..5cfc2c512b8625fc5290ba6d8d6494f990507633 100644 (file)
--- a/src/util/support/utf8_conv.c
+++ b/src/util/support/utf8_conv.c
@@ -1,7 +1,7 @@
  /* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil -*- */
  /* util/support/utf8_conv.c */
  /*
- * Copyright 2008 by the Massachusetts Institute of Technology.
+ * Copyright 2008, 2017 by the Massachusetts Institute of Technology.
   * All Rights Reserved.
   *
   * Export of this software from the United States of America may
@@ -47,34 +47,56 @@
   * THE PERPETRATOR TO CRIMINAL AND CIVIL LIABILITY.
   */
  
-/* This work is part of OpenLDAP Software <http://www.openldap.org/>. */
+/* This work is based on OpenLDAP Software <http://www.openldap.org/>. */
  
  /*
- * UTF-8 Conversion Routines
- *
- * These routines convert between Wide Character and UTF-8,
- * or between MultiByte and UTF-8 encodings.
- *
- * Both single character and string versions of the functions are provided.
- * All functions return -1 if the character or string cannot be converted.
+ * These routines convert between UTF-16 and UTF-8.  UTF-16 encodes a Unicode
+ * character in either two or four bytes.  Characters in the Basic Multilingual
+ * Plane (hex 0..D7FF and E000..FFFF) are encoded as-is in two bytes.
+ * Characters in the Supplementary Planes (10000..10FFFF) are split into a high
+ * surrogate and a low surrogate, each containing ten bits of the character
+ * value, and encoded in four bytes.
   */
  
  #include "k5-platform.h"
  #include "k5-utf8.h"
  #include "k5-buf.h"
+#include "k5-input.h"
  #include "supp-int.h"
  
  static unsigned char mask[] = { 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
  
+/* A high surrogate is 0xD800 ORed with ten bits of the character value. */
+#define IS_HIGH_SURROGATE(c) ((c) >= 0xD800 && (c) <= 0xDBFF)
+
+/* A low surrogate is 0xDC00 ORed with ten bits of the character value. */
+#define IS_LOW_SURROGATE(c) ((c) >= 0xDC00 && (c) <= 0xDFFF)
+
+/* A valid Unicode code point is in the range 0..10FFFF and is not a surrogate
+ * value. */
+#define IS_SURROGATE(c) ((c) >= 0xD800 && (c) <= 0xDFFF)
+#define IS_VALID_UNICODE(c) ((c) <= 0x10FFFF && !IS_SURROGATE(c))
+
+/* A Basic Multilingual Plane character is in the range 0..FFFF and is not a
+ * surrogate value. */
+#define IS_BMP(c) ((c) <= 0xFFFF && !IS_SURROGATE(c))
+
+/* Characters in the Supplementary Planes have a base value subtracted from
+ * their code points to form a 20-bit value; ten bits go in each surrogate. */
+#define BASE 0x10000
+#define HIGH_SURROGATE(c) (0xD800 | (((c) - BASE) >> 10))
+#define LOW_SURROGATE(c) (0xDC00 | (((c) - BASE) & 0x3FF))
+#define COMPOSE(c1, c2) (BASE + ((((c1) & 0x3FF) << 10) | ((c2) & 0x3FF)))
+
  int
-k5_utf8_to_ucs2le(const char *utf8, uint8_t **ucs2_out, size_t *nbytes_out)
+k5_utf8_to_utf16le(const char *utf8, uint8_t **utf16_out, size_t *nbytes_out)
  {
      struct k5buf buf;
-    krb5_ucs2 ch;
+    krb5_ucs4 ch;
      size_t chlen, i;
-    void *p;
+    uint8_t *p;
  
-    *ucs2_out = NULL;
+    *utf16_out = NULL;
      *nbytes_out = 0;
  
      k5_buf_init_dynamic(&buf);
@@ -83,11 +105,11 @@ k5_utf8_to_ucs2le(const char *utf8, uint8_t **ucs2_out, size_t *nbytes_out)
      while (*utf8 != '\0') {
          /* Get UTF-8 sequence length from first byte. */
          chlen = KRB5_UTF8_CHARLEN2(utf8, chlen);
-        if (chlen == 0 || chlen > KRB5_MAX_UTF8_LEN)
+        if (chlen == 0)
              goto invalid;
  
          /* First byte minus length tag */
-        ch = (krb5_ucs2)(utf8[0] & mask[chlen]);
+        ch = (krb5_ucs4)(utf8[0] & mask[chlen]);
  
          for (i = 1; i < chlen; i++) {
              /* Subsequent bytes must start with 10. */
@@ -96,19 +118,30 @@ k5_utf8_to_ucs2le(const char *utf8, uint8_t **ucs2_out, size_t *nbytes_out)
  
              /* 6 bits of data in each subsequent byte */
              ch <<= 6;
-            ch |= (krb5_ucs2)(utf8[i] & 0x3f);
+            ch |= (krb5_ucs4)(utf8[i] & 0x3f);
          }
+        if (!IS_VALID_UNICODE(ch))
+            goto invalid;
  
-        p = k5_buf_get_space(&buf, 2);
+        /* Characters in the basic multilingual plane are encoded using two
+         * bytes; other characters are encoded using four bytes. */
+        p = k5_buf_get_space(&buf, IS_BMP(ch) ? 2 : 4);
          if (p == NULL)
              return ENOMEM;
-        store_16_le(ch, p);
+        if (IS_BMP(ch)) {
+            store_16_le(ch, p);
+        } else {
+            /* 0x10000 is subtracted from ch; then the high ten bits plus
+             * 0xD800 and the low ten bits plus 0xDC00 are the surrogates. */
+            store_16_le(HIGH_SURROGATE(ch), p);
+            store_16_le(LOW_SURROGATE(ch), p + 2);
+        }
  
          /* Move to next UTF-8 character. */
          utf8 += chlen;
      }
  
-    *ucs2_out = buf.data;
+    *utf16_out = buf.data;
      *nbytes_out = buf.len;
      return 0;
  
@@ -118,11 +151,13 @@ invalid:
  }
  
  int
-k5_ucs2le_to_utf8(const uint8_t *ucs2bytes, size_t nbytes, char **utf8_out)
+k5_utf16le_to_utf8(const uint8_t *utf16bytes, size_t nbytes, char **utf8_out)
  {
      struct k5buf buf;
-    krb5_ucs2 ch;
-    size_t chlen, i;
+    struct k5input in;
+    uint16_t ch1, ch2;
+    krb5_ucs4 ch;
+    size_t chlen;
      void *p;
  
      *utf8_out = NULL;
@@ -131,16 +166,37 @@ k5_ucs2le_to_utf8(const uint8_t *ucs2bytes, size_t nbytes, char **utf8_out)
          return EINVAL;
  
      k5_buf_init_dynamic(&buf);
+    k5_input_init(&in, utf16bytes, nbytes);
+    while (!in.status && in.len > 0) {
+        /* Get the next character or high surrogate.  A low surrogate without a
+         * preceding high surrogate is invalid. */
+        ch1 = k5_input_get_uint16_le(&in);
+        if (IS_LOW_SURROGATE(ch1))
+            goto invalid;
+        if (IS_HIGH_SURROGATE(ch1)) {
+            /* Get the low surrogate and combine the pair. */
+            ch2 = k5_input_get_uint16_le(&in);
+            if (!IS_LOW_SURROGATE(ch2))
+                goto invalid;
+            ch = COMPOSE(ch1, ch2);
+        } else {
+            ch = ch1;
+        }
  
-    for (i = 0; i < nbytes; i += 2) {
-        ch = load_16_le(&ucs2bytes[i]);
-        chlen = krb5int_ucs2_to_utf8(ch, NULL);
+        chlen = krb5int_ucs4_to_utf8(ch, NULL);
          p = k5_buf_get_space(&buf, chlen);
          if (p == NULL)
              return ENOMEM;
-        (void)krb5int_ucs2_to_utf8(ch, p);
+        (void)krb5int_ucs4_to_utf8(ch, p);
      }
  
+    if (in.status)
+        goto invalid;
+
      *utf8_out = buf.data;
      return 0;
+
+invalid:
+    k5_buf_free(&buf);
+    return EINVAL;
  }
author	Greg Hudson <ghudson@mit.edu>
	Tue, 18 Apr 2017 18:01:06 +0000 (14:01 -0400)
committer	Greg Hudson <ghudson@mit.edu>
	Fri, 21 Apr 2017 17:08:09 +0000 (13:08 -0400)
.gitignore		patch \| blob \| blame \| history
src/include/k5-utf8.h		patch \| blob \| blame \| history
src/lib/crypto/krb/s2k_rc4.c		patch \| blob \| blame \| history
src/lib/krb5/krb/pac.c		patch \| blob \| blame \| history
src/lib/krb5/krb/pac_sign.c		patch \| blob \| blame \| history
src/util/support/Makefile.in		patch \| blob \| blame \| history
src/util/support/deps		patch \| blob \| blame \| history
src/util/support/libkrb5support-fixed.exports		patch \| blob \| blame \| history
src/util/support/t_utf16.c	[new file with mode: 0644]	patch \| blob
src/util/support/utf8.c		patch \| blob \| blame \| history
src/util/support/utf8_conv.c		patch \| blob \| blame \| history