gcc/d/dmd/utf.c

   1
   2 /* Compiler implementation of the D programming language
   3  * Copyright (C) 2003-2018 by The D Language Foundation, All Rights Reserved
   4  * written by Walter Bright
   5  * http://www.digitalmars.com
   6  * Distributed under the Boost Software License, Version 1.0.
   7  * http://www.boost.org/LICENSE_1_0.txt
   8  * https://github.com/D-Programming-Language/dmd/blob/master/src/utf.c
   9  */
  10
  11 /// Description of UTF-8 in [1].  Unicode non-characters and private-use
  12 /// code points described in [2],[4].
  13 ///
  14 /// References:
  15 /// [1] http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
  16 /// [2] http://en.wikipedia.org/wiki/Unicode
  17 /// [3] http://unicode.org/faq/utf_bom.html
  18 /// [4] http://www.unicode.org/versions/Unicode6.1.0/ch03.pdf
  19
  20 #include "utf.h"
  21
  22 /* The following encodings are valid, except for the 5 and 6 byte
  23  * combinations:
  24  *      0xxxxxxx
  25  *      110xxxxx 10xxxxxx
  26  *      1110xxxx 10xxxxxx 10xxxxxx
  27  *      11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  28  *      111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
  29  *      1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
  30  */
  31 const unsigned UTF8_STRIDE[256] =
  32 {
  33     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  34     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  35     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  36     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  37     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  38     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  39     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  40     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  41     0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
  42     0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
  43     0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
  44     0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
  45     2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  46     2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  47     3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
  48     4,4,4,4,4,4,4,4,5,5,5,5,6,6,0xFF,0xFF,
  49 };
  50
  51 // UTF-8 decoding errors
  52 char const UTF8_DECODE_OUTSIDE_CODE_SPACE[] = "Outside Unicode code space";
  53 char const UTF8_DECODE_TRUNCATED_SEQUENCE[] = "Truncated UTF-8 sequence";
  54 char const UTF8_DECODE_OVERLONG[]           = "Overlong UTF-8 sequence";
  55 char const UTF8_DECODE_INVALID_TRAILER[]    = "Invalid trailing code unit";
  56 char const UTF8_DECODE_INVALID_CODE_POINT[] = "Invalid code point decoded";
  57
  58 // UTF-16 decoding errors
  59 char const UTF16_DECODE_TRUNCATED_SEQUENCE[]= "Truncated UTF-16 sequence";
  60 char const UTF16_DECODE_INVALID_SURROGATE[] = "Invalid low surrogate";
  61 char const UTF16_DECODE_UNPAIRED_SURROGATE[]= "Unpaired surrogate";
  62 char const UTF16_DECODE_INVALID_CODE_POINT[]= "Invalid code point decoded";
  63
  64 /// The Unicode code space is the range of code points [0x000000,0x10FFFF]
  65 /// except the UTF-16 surrogate pairs in the range [0xD800,0xDFFF]
  66 /// and non-characters (which end in 0xFFFE or 0xFFFF).
  67 bool utf_isValidDchar(dchar_t c)
  68 {
  69     // TODO: Whether non-char code points should be rejected is pending review
  70     // largest character code point
  71     if (c > 0x10FFFF)
  72         return false;
  73     // surrogate pairs
  74     if (0xD800 <= c && c <= 0xDFFF)
  75         return false;
  76     // non-characters
  77     if ((c & 0xFFFFFE) == 0x00FFFE)
  78         return false;
  79     return true;
  80 }
  81
  82 /*******************************
  83  * Return !=0 if unicode alpha.
  84  * Use table from C99 Appendix D.
  85  */
  86
  87 bool isUniAlpha(dchar_t c)
  88 {
  89     size_t high = ALPHA_TABLE_LENGTH - 1;
  90     // Shortcut search if c is out of range
  91     size_t low
  92         = (c < ALPHA_TABLE[0][0] || ALPHA_TABLE[high][1] < c) ? high + 1 : 0;
  93     // Binary search
  94     while (low <= high)
  95     {
  96         size_t mid = (low + high) >> 1;
  97         if (c < ALPHA_TABLE[mid][0])
  98             high = mid - 1;
  99         else if (ALPHA_TABLE[mid][1] < c)
 100             low = mid + 1;
 101         else
 102         {
 103             assert(ALPHA_TABLE[mid][0] <= c && c <= ALPHA_TABLE[mid][1]);
 104             return true;
 105         }
 106     }
 107     return false;
 108 }
 109
 110 /**
 111  * Returns the code length of c in code units.
 112  */
 113
 114 int utf_codeLengthChar(dchar_t c)
 115 {
 116   if (c <= 0x7F)
 117       return 1;
 118   if (c <= 0x7FF)
 119       return 2;
 120   if (c <= 0xFFFF)
 121       return 3;
 122   if (c <= 0x10FFFF)
 123       return 4;
 124   assert(false);
 125 }
 126
 127 int utf_codeLengthWchar(dchar_t c)
 128 {
 129     return c <= 0xFFFF ? 1 : 2;
 130 }
 131
 132 /**
 133  * Returns the code length of c in code units for the encoding.
 134  * sz is the encoding: 1 = utf8, 2 = utf16, 4 = utf32.
 135  */
 136
 137 int utf_codeLength(int sz, dchar_t c)
 138 {
 139     if (sz == 1)
 140         return utf_codeLengthChar(c);
 141     if (sz == 2)
 142         return utf_codeLengthWchar(c);
 143     assert(sz == 4);
 144     return 1;
 145 }
 146
 147 void utf_encodeChar(utf8_t *s, dchar_t c)
 148 {
 149     assert(s != NULL);
 150     assert(utf_isValidDchar(c));
 151     if (c <= 0x7F)
 152     {
 153         s[0] = static_cast<utf8_t>(c);
 154     }
 155     else if (c <= 0x07FF)
 156     {
 157         s[0] = static_cast<utf8_t>(0xC0 | (c >> 6));
 158         s[1] = static_cast<utf8_t>(0x80 | (c & 0x3F));
 159     }
 160     else if (c <= 0xFFFF)
 161     {
 162         s[0] = static_cast<utf8_t>(0xE0 | (c >> 12));
 163         s[1] = static_cast<utf8_t>(0x80 | ((c >> 6) & 0x3F));
 164         s[2] = static_cast<utf8_t>(0x80 | (c & 0x3F));
 165     }
 166     else if (c <= 0x10FFFF)
 167     {
 168         s[0] = static_cast<utf8_t>(0xF0 | (c >> 18));
 169         s[1] = static_cast<utf8_t>(0x80 | ((c >> 12) & 0x3F));
 170         s[2] = static_cast<utf8_t>(0x80 | ((c >> 6) & 0x3F));
 171         s[3] = static_cast<utf8_t>(0x80 | (c & 0x3F));
 172     }
 173     else
 174         assert(0);
 175 }
 176
 177 void utf_encodeWchar(utf16_t *s, dchar_t c)
 178 {
 179     assert(s != NULL);
 180     assert(utf_isValidDchar(c));
 181     if (c <= 0xFFFF)
 182     {
 183         s[0] = static_cast<utf16_t>(c);
 184     }
 185     else
 186     {
 187         s[0] = static_cast<utf16_t>((((c - 0x010000) >> 10) & 0x03FF) + 0xD800);
 188         s[1] = static_cast<utf16_t>(((c - 0x010000) & 0x03FF) + 0xDC00);
 189     }
 190 }
 191
 192 void utf_encode(int sz, void *s, dchar_t c)
 193 {
 194     if (sz == 1)
 195         utf_encodeChar((utf8_t *)s, c);
 196     else if (sz == 2)
 197         utf_encodeWchar((utf16_t *)s, c);
 198     else
 199     {
 200         assert(sz == 4);
 201         *((utf32_t *)s) = c;
 202     }
 203 }
 204
 205 /********************************************
 206  * Decode a UTF-8 sequence as a single UTF-32 code point.
 207  * Returns:
 208  *      NULL    success
 209  *      !=NULL  error message string
 210  */
 211
 212 const char *utf_decodeChar(utf8_t const *s, size_t len, size_t *pidx, dchar_t *presult)
 213 {
 214     assert(s != NULL);
 215     assert(pidx != NULL);
 216     assert(presult != NULL);
 217     size_t i = (*pidx)++;
 218     assert(i < len);
 219     utf8_t u = s[i];
 220     // Pre-stage results for ASCII and error cases
 221     *presult = u;
 222
 223     //printf("utf_decodeChar(s = %02x, %02x, %02x len = %d)\n", u, s[1], s[2], len);
 224
 225     // Get expected sequence length
 226     size_t n = UTF8_STRIDE[u];
 227     switch (n)
 228     {
 229     case 1:                             // ASCII
 230         return UTF8_DECODE_OK;
 231     case 2: case 3: case 4:             // multi-byte UTF-8
 232         break;
 233     default:                            // 5- or 6-byte sequence
 234         return UTF8_DECODE_OUTSIDE_CODE_SPACE;
 235     }
 236     if (len < i + n)                    // source too short
 237         return UTF8_DECODE_TRUNCATED_SEQUENCE;
 238
 239     // Pick off 7 - n low bits from first code unit
 240     utf32_t c = u & ((1 << (7 - n)) - 1);
 241     /* The following combinations are overlong, and illegal:
 242      *      1100000x (10xxxxxx)
 243      *      11100000 100xxxxx (10xxxxxx)
 244      *      11110000 1000xxxx (10xxxxxx 10xxxxxx)
 245      *      11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx)
 246      *      11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx)
 247      */
 248     utf8_t u2 = s[++i];
 249     // overlong combination
 250     if ((u & 0xFE) == 0xC0 ||
 251         (u == 0xE0 && (u2 & 0xE0) == 0x80) ||
 252         (u == 0xF0 && (u2 & 0xF0) == 0x80) ||
 253         (u == 0xF8 && (u2 & 0xF8) == 0x80) ||
 254         (u == 0xFC && (u2 & 0xFC) == 0x80))
 255         return UTF8_DECODE_OVERLONG;
 256     // Decode remaining bits
 257     for (n += i - 1; i != n; ++i)
 258     {
 259         u = s[i];
 260         if ((u & 0xC0) != 0x80)         // trailing bytes are 10xxxxxx
 261             return UTF8_DECODE_INVALID_TRAILER;
 262         c = (c << 6) | (u & 0x3F);
 263     }
 264     if (!utf_isValidDchar(c))
 265         return UTF8_DECODE_INVALID_CODE_POINT;
 266     *pidx = i;
 267     *presult = c;
 268     return UTF8_DECODE_OK;
 269 }
 270
 271 /********************************************
 272  * Decode a UTF-16 sequence as a single UTF-32 code point.
 273  * Returns:
 274  *      NULL    success
 275  *      !=NULL  error message string
 276  */
 277
 278 const char *utf_decodeWchar(utf16_t const *s, size_t len, size_t *pidx, dchar_t *presult)
 279 {
 280     assert(s != NULL);
 281     assert(pidx != NULL);
 282     assert(presult != NULL);
 283     size_t i = (*pidx)++;
 284     assert(i < len);
 285     // Pre-stage results for ASCII and error cases
 286     utf32_t u = *presult = s[i];
 287
 288     if (u < 0x80)                       // ASCII
 289         return UTF16_DECODE_OK;
 290     if (0xD800 <= u && u <= 0xDBFF)     // Surrogate pair
 291     {   if (len <= i + 1)
 292             return UTF16_DECODE_TRUNCATED_SEQUENCE;
 293         utf16_t u2 = s[i + 1];
 294         if (u2 < 0xDC00 || 0xDFFF < u)
 295             return UTF16_DECODE_INVALID_SURROGATE;
 296         u = ((u - 0xD7C0) << 10) + (u2 - 0xDC00);
 297         ++*pidx;
 298     }
 299     else if (0xDC00 <= u && u <= 0xDFFF)
 300         return UTF16_DECODE_UNPAIRED_SURROGATE;
 301     if (!utf_isValidDchar(u))
 302         return UTF16_DECODE_INVALID_CODE_POINT;
 303     *presult = u;
 304     return UTF16_DECODE_OK;
 305 }