/* Compiler implementation of the D programming language
 * Copyright (C) 2003-2018 by The D Language Foundation, All Rights Reserved
 * written by Walter Bright
 * http://www.digitalmars.com
 * Distributed under the Boost Software License, Version 1.0.
 * http://www.boost.org/LICENSE_1_0.txt
 * https://github.com/D-Programming-Language/dmd/blob/master/src/utf.c
 */
/// Description of UTF-8 in [1]. Unicode non-characters and private-use
/// code points described in [2],[4].
///
/// References:
/// [1] http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
/// [2] http://en.wikipedia.org/wiki/Unicode
/// [3] http://unicode.org/faq/utf_bom.html
/// [4] http://www.unicode.org/versions/Unicode6.1.0/ch03.pdf
/* The following encodings are valid, except for the 5 and 6 byte
 * combinations:
 *      0xxxxxxx
 *      110xxxxx 10xxxxxx
 *      1110xxxx 10xxxxxx 10xxxxxx
 *      11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 *      111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 *      1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 */
/// Expected length, in code units, of a UTF-8 sequence, indexed by its
/// leading code unit. 0xFF marks a code unit that cannot start a sequence
/// (continuation bytes 0x80-0xBF, and 0xFE/0xFF). The entries 5 and 6 are
/// kept so that callers can tell the obsolete 5- and 6-byte forms apart from
/// plain garbage.
const unsigned UTF8_STRIDE[256] =
{
    // 0x00-0x7F: single code unit (ASCII)
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    // 0x80-0xBF: continuation bytes, never a valid lead
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    // 0xC0-0xDF: two code units
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    // 0xE0-0xEF: three code units
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
    // 0xF0-0xF7: four; 0xF8-0xFB: five; 0xFC-0xFD: six; 0xFE-0xFF: invalid
    4,4,4,4,4,4,4,4,5,5,5,5,6,6,0xFF,0xFF,
};
// UTF-8 decoding errors.
// utf_decodeChar() returns one of these messages to report a failure.
char const UTF8_DECODE_OUTSIDE_CODE_SPACE[] = "Outside Unicode code space";
char const UTF8_DECODE_TRUNCATED_SEQUENCE[] = "Truncated UTF-8 sequence";
char const UTF8_DECODE_OVERLONG[]           = "Overlong UTF-8 sequence";
char const UTF8_DECODE_INVALID_TRAILER[]    = "Invalid trailing code unit";
char const UTF8_DECODE_INVALID_CODE_POINT[] = "Invalid code point decoded";
// UTF-16 decoding errors.
// utf_decodeWchar() returns one of these messages to report a failure.
char const UTF16_DECODE_TRUNCATED_SEQUENCE[] = "Truncated UTF-16 sequence";
char const UTF16_DECODE_INVALID_SURROGATE[]  = "Invalid low surrogate";
char const UTF16_DECODE_UNPAIRED_SURROGATE[] = "Unpaired surrogate";
char const UTF16_DECODE_INVALID_CODE_POINT[] = "Invalid code point decoded";
64 /// The Unicode code space is the range of code points [0x000000,0x10FFFF]
65 /// except the UTF-16 surrogate pairs in the range [0xD800,0xDFFF]
66 /// and non-characters (which end in 0xFFFE or 0xFFFF).
67 bool utf_isValidDchar(dchar_t c
)
69 // TODO: Whether non-char code points should be rejected is pending review
70 // largest character code point
74 if (0xD800 <= c
&& c
<= 0xDFFF)
77 if ((c
& 0xFFFFFE) == 0x00FFFE)
82 /*******************************
83 * Return !=0 if unicode alpha.
84 * Use table from C99 Appendix D.
87 bool isUniAlpha(dchar_t c
)
89 size_t high
= ALPHA_TABLE_LENGTH
- 1;
90 // Shortcut search if c is out of range
92 = (c
< ALPHA_TABLE
[0][0] || ALPHA_TABLE
[high
][1] < c
) ? high
+ 1 : 0;
96 size_t mid
= (low
+ high
) >> 1;
97 if (c
< ALPHA_TABLE
[mid
][0])
99 else if (ALPHA_TABLE
[mid
][1] < c
)
103 assert(ALPHA_TABLE
[mid
][0] <= c
&& c
<= ALPHA_TABLE
[mid
][1]);
111 * Returns the code length of c in code units.
114 int utf_codeLengthChar(dchar_t c
)
127 int utf_codeLengthWchar(dchar_t c
)
129 return c
<= 0xFFFF ? 1 : 2;
133 * Returns the code length of c in code units for the encoding.
134 * sz is the encoding: 1 = utf8, 2 = utf16, 4 = utf32.
137 int utf_codeLength(int sz
, dchar_t c
)
140 return utf_codeLengthChar(c
);
142 return utf_codeLengthWchar(c
);
147 void utf_encodeChar(utf8_t
*s
, dchar_t c
)
150 assert(utf_isValidDchar(c
));
153 s
[0] = static_cast<utf8_t
>(c
);
155 else if (c
<= 0x07FF)
157 s
[0] = static_cast<utf8_t
>(0xC0 | (c
>> 6));
158 s
[1] = static_cast<utf8_t
>(0x80 | (c
& 0x3F));
160 else if (c
<= 0xFFFF)
162 s
[0] = static_cast<utf8_t
>(0xE0 | (c
>> 12));
163 s
[1] = static_cast<utf8_t
>(0x80 | ((c
>> 6) & 0x3F));
164 s
[2] = static_cast<utf8_t
>(0x80 | (c
& 0x3F));
166 else if (c
<= 0x10FFFF)
168 s
[0] = static_cast<utf8_t
>(0xF0 | (c
>> 18));
169 s
[1] = static_cast<utf8_t
>(0x80 | ((c
>> 12) & 0x3F));
170 s
[2] = static_cast<utf8_t
>(0x80 | ((c
>> 6) & 0x3F));
171 s
[3] = static_cast<utf8_t
>(0x80 | (c
& 0x3F));
177 void utf_encodeWchar(utf16_t
*s
, dchar_t c
)
180 assert(utf_isValidDchar(c
));
183 s
[0] = static_cast<utf16_t
>(c
);
187 s
[0] = static_cast<utf16_t
>((((c
- 0x010000) >> 10) & 0x03FF) + 0xD800);
188 s
[1] = static_cast<utf16_t
>(((c
- 0x010000) & 0x03FF) + 0xDC00);
192 void utf_encode(int sz
, void *s
, dchar_t c
)
195 utf_encodeChar((utf8_t
*)s
, c
);
197 utf_encodeWchar((utf16_t
*)s
, c
);
205 /********************************************
206 * Decode a UTF-8 sequence as a single UTF-32 code point.
209 * !=NULL error message string
212 const char *utf_decodeChar(utf8_t
const *s
, size_t len
, size_t *pidx
, dchar_t
*presult
)
215 assert(pidx
!= NULL
);
216 assert(presult
!= NULL
);
217 size_t i
= (*pidx
)++;
220 // Pre-stage results for ASCII and error cases
223 //printf("utf_decodeChar(s = %02x, %02x, %02x len = %d)\n", u, s[1], s[2], len);
225 // Get expected sequence length
226 size_t n
= UTF8_STRIDE
[u
];
230 return UTF8_DECODE_OK
;
231 case 2: case 3: case 4: // multi-byte UTF-8
233 default: // 5- or 6-byte sequence
234 return UTF8_DECODE_OUTSIDE_CODE_SPACE
;
236 if (len
< i
+ n
) // source too short
237 return UTF8_DECODE_TRUNCATED_SEQUENCE
;
239 // Pick off 7 - n low bits from first code unit
240 utf32_t c
= u
& ((1 << (7 - n
)) - 1);
241 /* The following combinations are overlong, and illegal:
242 * 1100000x (10xxxxxx)
243 * 11100000 100xxxxx (10xxxxxx)
244 * 11110000 1000xxxx (10xxxxxx 10xxxxxx)
245 * 11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx)
246 * 11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx)
249 // overlong combination
250 if ((u
& 0xFE) == 0xC0 ||
251 (u
== 0xE0 && (u2
& 0xE0) == 0x80) ||
252 (u
== 0xF0 && (u2
& 0xF0) == 0x80) ||
253 (u
== 0xF8 && (u2
& 0xF8) == 0x80) ||
254 (u
== 0xFC && (u2
& 0xFC) == 0x80))
255 return UTF8_DECODE_OVERLONG
;
256 // Decode remaining bits
257 for (n
+= i
- 1; i
!= n
; ++i
)
260 if ((u
& 0xC0) != 0x80) // trailing bytes are 10xxxxxx
261 return UTF8_DECODE_INVALID_TRAILER
;
262 c
= (c
<< 6) | (u
& 0x3F);
264 if (!utf_isValidDchar(c
))
265 return UTF8_DECODE_INVALID_CODE_POINT
;
268 return UTF8_DECODE_OK
;
271 /********************************************
272 * Decode a UTF-16 sequence as a single UTF-32 code point.
275 * !=NULL error message string
278 const char *utf_decodeWchar(utf16_t
const *s
, size_t len
, size_t *pidx
, dchar_t
*presult
)
281 assert(pidx
!= NULL
);
282 assert(presult
!= NULL
);
283 size_t i
= (*pidx
)++;
285 // Pre-stage results for ASCII and error cases
286 utf32_t u
= *presult
= s
[i
];
288 if (u
< 0x80) // ASCII
289 return UTF16_DECODE_OK
;
290 if (0xD800 <= u
&& u
<= 0xDBFF) // Surrogate pair
292 return UTF16_DECODE_TRUNCATED_SEQUENCE
;
293 utf16_t u2
= s
[i
+ 1];
294 if (u2
< 0xDC00 || 0xDFFF < u
)
295 return UTF16_DECODE_INVALID_SURROGATE
;
296 u
= ((u
- 0xD7C0) << 10) + (u2
- 0xDC00);
299 else if (0xDC00 <= u
&& u
<= 0xDFFF)
300 return UTF16_DECODE_UNPAIRED_SURROGATE
;
301 if (!utf_isValidDchar(u
))
302 return UTF16_DECODE_INVALID_CODE_POINT
;
304 return UTF16_DECODE_OK
;