]> git.ipfire.org Git - thirdparty/gcc.git/blob - gcc/d/dmd/utf.c
Merge dmd upstream 6243fa6d2
[thirdparty/gcc.git] / gcc / d / dmd / utf.c
1
2 /* Compiler implementation of the D programming language
3 * Copyright (C) 2003-2018 by The D Language Foundation, All Rights Reserved
4 * written by Walter Bright
5 * http://www.digitalmars.com
6 * Distributed under the Boost Software License, Version 1.0.
7 * http://www.boost.org/LICENSE_1_0.txt
8 * https://github.com/D-Programming-Language/dmd/blob/master/src/utf.c
9 */
10
11 /// Description of UTF-8 in [1]. Unicode non-characters and private-use
12 /// code points described in [2],[4].
13 ///
14 /// References:
15 /// [1] http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
16 /// [2] http://en.wikipedia.org/wiki/Unicode
17 /// [3] http://unicode.org/faq/utf_bom.html
18 /// [4] http://www.unicode.org/versions/Unicode6.1.0/ch03.pdf
19
20 #include "utf.h"
21
/* Sequence length, in code units, implied by each possible first byte.
 *
 * The lead-byte patterns below are the ones the table recognises; all of
 * them are valid encodings except for the 5 and 6 byte combinations:
 *      0xxxxxxx
 *      110xxxxx 10xxxxxx
 *      1110xxxx 10xxxxxx 10xxxxxx
 *      11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 *      111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 *      1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 *
 * An entry of 0xFF marks a byte that matches none of the lead patterns
 * (the 10xxxxxx trailing bytes, plus 0xFE and 0xFF).
 */
const unsigned UTF8_STRIDE[256] =
{
    /* 0x00-0x0F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x10-0x1F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20-0x2F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x30-0x3F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x40-0x4F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50-0x5F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x60-0x6F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70-0x7F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0x8F */ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    /* 0x90-0x9F */ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    /* 0xA0-0xAF */ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    /* 0xB0-0xBF */ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    /* 0xC0-0xCF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xD0-0xDF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xFF */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0xFF, 0xFF,
};
50
// Messages returned by utf_decodeChar when a UTF-8 sequence is rejected
const char UTF8_DECODE_OUTSIDE_CODE_SPACE[] = "Outside Unicode code space";
const char UTF8_DECODE_TRUNCATED_SEQUENCE[] = "Truncated UTF-8 sequence";
const char UTF8_DECODE_OVERLONG[] = "Overlong UTF-8 sequence";
const char UTF8_DECODE_INVALID_TRAILER[] = "Invalid trailing code unit";
const char UTF8_DECODE_INVALID_CODE_POINT[] = "Invalid code point decoded";

// Messages returned by utf_decodeWchar when a UTF-16 sequence is rejected
const char UTF16_DECODE_TRUNCATED_SEQUENCE[] = "Truncated UTF-16 sequence";
const char UTF16_DECODE_INVALID_SURROGATE[] = "Invalid low surrogate";
const char UTF16_DECODE_UNPAIRED_SURROGATE[] = "Unpaired surrogate";
const char UTF16_DECODE_INVALID_CODE_POINT[] = "Invalid code point decoded";
63
64 /// The Unicode code space is the range of code points [0x000000,0x10FFFF]
65 /// except the UTF-16 surrogate pairs in the range [0xD800,0xDFFF]
66 /// and non-characters (which end in 0xFFFE or 0xFFFF).
67 bool utf_isValidDchar(dchar_t c)
68 {
69 // TODO: Whether non-char code points should be rejected is pending review
70 // largest character code point
71 if (c > 0x10FFFF)
72 return false;
73 // surrogate pairs
74 if (0xD800 <= c && c <= 0xDFFF)
75 return false;
76 // non-characters
77 if ((c & 0xFFFFFE) == 0x00FFFE)
78 return false;
79 return true;
80 }
81
82 /*******************************
83 * Return !=0 if unicode alpha.
84 * Use table from C99 Appendix D.
85 */
86
87 bool isUniAlpha(dchar_t c)
88 {
89 size_t high = ALPHA_TABLE_LENGTH - 1;
90 // Shortcut search if c is out of range
91 size_t low
92 = (c < ALPHA_TABLE[0][0] || ALPHA_TABLE[high][1] < c) ? high + 1 : 0;
93 // Binary search
94 while (low <= high)
95 {
96 size_t mid = (low + high) >> 1;
97 if (c < ALPHA_TABLE[mid][0])
98 high = mid - 1;
99 else if (ALPHA_TABLE[mid][1] < c)
100 low = mid + 1;
101 else
102 {
103 assert(ALPHA_TABLE[mid][0] <= c && c <= ALPHA_TABLE[mid][1]);
104 return true;
105 }
106 }
107 return false;
108 }
109
110 /**
111 * Returns the code length of c in code units.
112 */
113
114 int utf_codeLengthChar(dchar_t c)
115 {
116 if (c <= 0x7F)
117 return 1;
118 if (c <= 0x7FF)
119 return 2;
120 if (c <= 0xFFFF)
121 return 3;
122 if (c <= 0x10FFFF)
123 return 4;
124 assert(false);
125 }
126
127 int utf_codeLengthWchar(dchar_t c)
128 {
129 return c <= 0xFFFF ? 1 : 2;
130 }
131
132 /**
133 * Returns the code length of c in code units for the encoding.
134 * sz is the encoding: 1 = utf8, 2 = utf16, 4 = utf32.
135 */
136
137 int utf_codeLength(int sz, dchar_t c)
138 {
139 if (sz == 1)
140 return utf_codeLengthChar(c);
141 if (sz == 2)
142 return utf_codeLengthWchar(c);
143 assert(sz == 4);
144 return 1;
145 }
146
147 void utf_encodeChar(utf8_t *s, dchar_t c)
148 {
149 assert(s != NULL);
150 assert(utf_isValidDchar(c));
151 if (c <= 0x7F)
152 {
153 s[0] = static_cast<utf8_t>(c);
154 }
155 else if (c <= 0x07FF)
156 {
157 s[0] = static_cast<utf8_t>(0xC0 | (c >> 6));
158 s[1] = static_cast<utf8_t>(0x80 | (c & 0x3F));
159 }
160 else if (c <= 0xFFFF)
161 {
162 s[0] = static_cast<utf8_t>(0xE0 | (c >> 12));
163 s[1] = static_cast<utf8_t>(0x80 | ((c >> 6) & 0x3F));
164 s[2] = static_cast<utf8_t>(0x80 | (c & 0x3F));
165 }
166 else if (c <= 0x10FFFF)
167 {
168 s[0] = static_cast<utf8_t>(0xF0 | (c >> 18));
169 s[1] = static_cast<utf8_t>(0x80 | ((c >> 12) & 0x3F));
170 s[2] = static_cast<utf8_t>(0x80 | ((c >> 6) & 0x3F));
171 s[3] = static_cast<utf8_t>(0x80 | (c & 0x3F));
172 }
173 else
174 assert(0);
175 }
176
177 void utf_encodeWchar(utf16_t *s, dchar_t c)
178 {
179 assert(s != NULL);
180 assert(utf_isValidDchar(c));
181 if (c <= 0xFFFF)
182 {
183 s[0] = static_cast<utf16_t>(c);
184 }
185 else
186 {
187 s[0] = static_cast<utf16_t>((((c - 0x010000) >> 10) & 0x03FF) + 0xD800);
188 s[1] = static_cast<utf16_t>(((c - 0x010000) & 0x03FF) + 0xDC00);
189 }
190 }
191
192 void utf_encode(int sz, void *s, dchar_t c)
193 {
194 if (sz == 1)
195 utf_encodeChar((utf8_t *)s, c);
196 else if (sz == 2)
197 utf_encodeWchar((utf16_t *)s, c);
198 else
199 {
200 assert(sz == 4);
201 *((utf32_t *)s) = c;
202 }
203 }
204
205 /********************************************
206 * Decode a UTF-8 sequence as a single UTF-32 code point.
207 * Returns:
208 * NULL success
209 * !=NULL error message string
210 */
211
212 const char *utf_decodeChar(utf8_t const *s, size_t len, size_t *pidx, dchar_t *presult)
213 {
214 assert(s != NULL);
215 assert(pidx != NULL);
216 assert(presult != NULL);
217 size_t i = (*pidx)++;
218 assert(i < len);
219 utf8_t u = s[i];
220 // Pre-stage results for ASCII and error cases
221 *presult = u;
222
223 //printf("utf_decodeChar(s = %02x, %02x, %02x len = %d)\n", u, s[1], s[2], len);
224
225 // Get expected sequence length
226 size_t n = UTF8_STRIDE[u];
227 switch (n)
228 {
229 case 1: // ASCII
230 return UTF8_DECODE_OK;
231 case 2: case 3: case 4: // multi-byte UTF-8
232 break;
233 default: // 5- or 6-byte sequence
234 return UTF8_DECODE_OUTSIDE_CODE_SPACE;
235 }
236 if (len < i + n) // source too short
237 return UTF8_DECODE_TRUNCATED_SEQUENCE;
238
239 // Pick off 7 - n low bits from first code unit
240 utf32_t c = u & ((1 << (7 - n)) - 1);
241 /* The following combinations are overlong, and illegal:
242 * 1100000x (10xxxxxx)
243 * 11100000 100xxxxx (10xxxxxx)
244 * 11110000 1000xxxx (10xxxxxx 10xxxxxx)
245 * 11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx)
246 * 11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx)
247 */
248 utf8_t u2 = s[++i];
249 // overlong combination
250 if ((u & 0xFE) == 0xC0 ||
251 (u == 0xE0 && (u2 & 0xE0) == 0x80) ||
252 (u == 0xF0 && (u2 & 0xF0) == 0x80) ||
253 (u == 0xF8 && (u2 & 0xF8) == 0x80) ||
254 (u == 0xFC && (u2 & 0xFC) == 0x80))
255 return UTF8_DECODE_OVERLONG;
256 // Decode remaining bits
257 for (n += i - 1; i != n; ++i)
258 {
259 u = s[i];
260 if ((u & 0xC0) != 0x80) // trailing bytes are 10xxxxxx
261 return UTF8_DECODE_INVALID_TRAILER;
262 c = (c << 6) | (u & 0x3F);
263 }
264 if (!utf_isValidDchar(c))
265 return UTF8_DECODE_INVALID_CODE_POINT;
266 *pidx = i;
267 *presult = c;
268 return UTF8_DECODE_OK;
269 }
270
271 /********************************************
272 * Decode a UTF-16 sequence as a single UTF-32 code point.
273 * Returns:
274 * NULL success
275 * !=NULL error message string
276 */
277
278 const char *utf_decodeWchar(utf16_t const *s, size_t len, size_t *pidx, dchar_t *presult)
279 {
280 assert(s != NULL);
281 assert(pidx != NULL);
282 assert(presult != NULL);
283 size_t i = (*pidx)++;
284 assert(i < len);
285 // Pre-stage results for ASCII and error cases
286 utf32_t u = *presult = s[i];
287
288 if (u < 0x80) // ASCII
289 return UTF16_DECODE_OK;
290 if (0xD800 <= u && u <= 0xDBFF) // Surrogate pair
291 { if (len <= i + 1)
292 return UTF16_DECODE_TRUNCATED_SEQUENCE;
293 utf16_t u2 = s[i + 1];
294 if (u2 < 0xDC00 || 0xDFFF < u)
295 return UTF16_DECODE_INVALID_SURROGATE;
296 u = ((u - 0xD7C0) << 10) + (u2 - 0xDC00);
297 ++*pidx;
298 }
299 else if (0xDC00 <= u && u <= 0xDFFF)
300 return UTF16_DECODE_UNPAIRED_SURROGATE;
301 if (!utf_isValidDchar(u))
302 return UTF16_DECODE_INVALID_CODE_POINT;
303 *presult = u;
304 return UTF16_DECODE_OK;
305 }