]> git.ipfire.org Git - thirdparty/gcc.git/blob - gcc/d/dmd/utf.d
d: Import dmd b8384668f, druntime e6caaab9, phobos 5ab9ad256 (v2.098.0-beta.1)
[thirdparty/gcc.git] / gcc / d / dmd / utf.d
1 /**
2 * Functions related to UTF encoding.
3 *
4 * Copyright: Copyright (C) 1999-2021 by The D Language Foundation, All Rights Reserved
5 * Authors: $(LINK2 http://www.digitalmars.com, Walter Bright)
6 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
7 * Source: $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/utf.d, _utf.d)
8 * Documentation: https://dlang.org/phobos/dmd_utf.html
9 * Coverage: https://codecov.io/gh/dlang/dmd/src/master/src/dmd/utf.d
10 */
11
12 module dmd.utf;
13
14 nothrow pure @nogc:
15
/**
 * Tests whether `c` belongs to the Unicode code space.
 *
 * The code space is the range of code points [0x000000, 0x10FFFF] with the
 * UTF-16 surrogate range [0xD800, 0xDFFF] removed.
 *
 * Params:
 *      c = code point to test
 * Returns:
 *      `true` if `c` is inside the code space, `false` otherwise
 */
bool utf_isValidDchar(dchar c)
{
    // TODO: Whether non-char code points should be rejected is pending review.
    // 0xFFFE and 0xFFFF are valid for internal use, like Phobos std.utf.isValidDChar
    // See also https://issues.dlang.org/show_bug.cgi?id=1357
    const bool isSurrogate = 0xD800 <= c && c <= 0xDFFF;
    return !isSurrogate && c <= 0x10FFFF;
}
29
/*******************************
 * Tests whether `c` is a Unicode alphabetic character.
 * Uses the table from C99 Appendix D.
 *
 * Params:
 *      c = code point to classify
 * Returns:
 *      `true` if `c` falls inside one of the table's inclusive ranges,
 *      `false` otherwise
 */
bool isUniAlpha(dchar c)
{
    // Inclusive [first, last] ranges, sorted in ascending order and
    // non-overlapping, which is what the binary search below relies on.
    static immutable wchar[2][] ALPHA_TABLE =
    [
        [0x00AA, 0x00AA],
        [0x00B5, 0x00B5],
        [0x00B7, 0x00B7],
        [0x00BA, 0x00BA],
        [0x00C0, 0x00D6],
        [0x00D8, 0x00F6],
        [0x00F8, 0x01F5],
        [0x01FA, 0x0217],
        [0x0250, 0x02A8],
        [0x02B0, 0x02B8],
        [0x02BB, 0x02BB],
        [0x02BD, 0x02C1],
        [0x02D0, 0x02D1],
        [0x02E0, 0x02E4],
        [0x037A, 0x037A],
        [0x0386, 0x0386],
        [0x0388, 0x038A],
        [0x038C, 0x038C],
        [0x038E, 0x03A1],
        [0x03A3, 0x03CE],
        [0x03D0, 0x03D6],
        [0x03DA, 0x03DA],
        [0x03DC, 0x03DC],
        [0x03DE, 0x03DE],
        [0x03E0, 0x03E0],
        [0x03E2, 0x03F3],
        [0x0401, 0x040C],
        [0x040E, 0x044F],
        [0x0451, 0x045C],
        [0x045E, 0x0481],
        [0x0490, 0x04C4],
        [0x04C7, 0x04C8],
        [0x04CB, 0x04CC],
        [0x04D0, 0x04EB],
        [0x04EE, 0x04F5],
        [0x04F8, 0x04F9],
        [0x0531, 0x0556],
        [0x0559, 0x0559],
        [0x0561, 0x0587],
        [0x05B0, 0x05B9],
        [0x05BB, 0x05BD],
        [0x05BF, 0x05BF],
        [0x05C1, 0x05C2],
        [0x05D0, 0x05EA],
        [0x05F0, 0x05F2],
        [0x0621, 0x063A],
        [0x0640, 0x0652],
        [0x0660, 0x0669],
        [0x0670, 0x06B7],
        [0x06BA, 0x06BE],
        [0x06C0, 0x06CE],
        [0x06D0, 0x06DC],
        [0x06E5, 0x06E8],
        [0x06EA, 0x06ED],
        [0x06F0, 0x06F9],
        [0x0901, 0x0903],
        [0x0905, 0x0939],
        [0x093D, 0x094D],
        [0x0950, 0x0952],
        [0x0958, 0x0963],
        [0x0966, 0x096F],
        [0x0981, 0x0983],
        [0x0985, 0x098C],
        [0x098F, 0x0990],
        [0x0993, 0x09A8],
        [0x09AA, 0x09B0],
        [0x09B2, 0x09B2],
        [0x09B6, 0x09B9],
        [0x09BE, 0x09C4],
        [0x09C7, 0x09C8],
        [0x09CB, 0x09CD],
        [0x09DC, 0x09DD],
        [0x09DF, 0x09E3],
        [0x09E6, 0x09F1],
        [0x0A02, 0x0A02],
        [0x0A05, 0x0A0A],
        [0x0A0F, 0x0A10],
        [0x0A13, 0x0A28],
        [0x0A2A, 0x0A30],
        [0x0A32, 0x0A33],
        [0x0A35, 0x0A36],
        [0x0A38, 0x0A39],
        [0x0A3E, 0x0A42],
        [0x0A47, 0x0A48],
        [0x0A4B, 0x0A4D],
        [0x0A59, 0x0A5C],
        [0x0A5E, 0x0A5E],
        [0x0A66, 0x0A6F],
        [0x0A74, 0x0A74],
        [0x0A81, 0x0A83],
        [0x0A85, 0x0A8B],
        [0x0A8D, 0x0A8D],
        [0x0A8F, 0x0A91],
        [0x0A93, 0x0AA8],
        [0x0AAA, 0x0AB0],
        [0x0AB2, 0x0AB3],
        [0x0AB5, 0x0AB9],
        [0x0ABD, 0x0AC5],
        [0x0AC7, 0x0AC9],
        [0x0ACB, 0x0ACD],
        [0x0AD0, 0x0AD0],
        [0x0AE0, 0x0AE0],
        [0x0AE6, 0x0AEF],
        [0x0B01, 0x0B03],
        [0x0B05, 0x0B0C],
        [0x0B0F, 0x0B10],
        [0x0B13, 0x0B28],
        [0x0B2A, 0x0B30],
        [0x0B32, 0x0B33],
        [0x0B36, 0x0B39],
        [0x0B3D, 0x0B43],
        [0x0B47, 0x0B48],
        [0x0B4B, 0x0B4D],
        [0x0B5C, 0x0B5D],
        [0x0B5F, 0x0B61],
        [0x0B66, 0x0B6F],
        [0x0B82, 0x0B83],
        [0x0B85, 0x0B8A],
        [0x0B8E, 0x0B90],
        [0x0B92, 0x0B95],
        [0x0B99, 0x0B9A],
        [0x0B9C, 0x0B9C],
        [0x0B9E, 0x0B9F],
        [0x0BA3, 0x0BA4],
        [0x0BA8, 0x0BAA],
        [0x0BAE, 0x0BB5],
        [0x0BB7, 0x0BB9],
        [0x0BBE, 0x0BC2],
        [0x0BC6, 0x0BC8],
        [0x0BCA, 0x0BCD],
        [0x0BE7, 0x0BEF],
        [0x0C01, 0x0C03],
        [0x0C05, 0x0C0C],
        [0x0C0E, 0x0C10],
        [0x0C12, 0x0C28],
        [0x0C2A, 0x0C33],
        [0x0C35, 0x0C39],
        [0x0C3E, 0x0C44],
        [0x0C46, 0x0C48],
        [0x0C4A, 0x0C4D],
        [0x0C60, 0x0C61],
        [0x0C66, 0x0C6F],
        [0x0C82, 0x0C83],
        [0x0C85, 0x0C8C],
        [0x0C8E, 0x0C90],
        [0x0C92, 0x0CA8],
        [0x0CAA, 0x0CB3],
        [0x0CB5, 0x0CB9],
        [0x0CBE, 0x0CC4],
        [0x0CC6, 0x0CC8],
        [0x0CCA, 0x0CCD],
        [0x0CDE, 0x0CDE],
        [0x0CE0, 0x0CE1],
        [0x0CE6, 0x0CEF],
        [0x0D02, 0x0D03],
        [0x0D05, 0x0D0C],
        [0x0D0E, 0x0D10],
        [0x0D12, 0x0D28],
        [0x0D2A, 0x0D39],
        [0x0D3E, 0x0D43],
        [0x0D46, 0x0D48],
        [0x0D4A, 0x0D4D],
        [0x0D60, 0x0D61],
        [0x0D66, 0x0D6F],
        [0x0E01, 0x0E3A],
        [0x0E40, 0x0E5B],
        [0x0E81, 0x0E82],
        [0x0E84, 0x0E84],
        [0x0E87, 0x0E88],
        [0x0E8A, 0x0E8A],
        [0x0E8D, 0x0E8D],
        [0x0E94, 0x0E97],
        [0x0E99, 0x0E9F],
        [0x0EA1, 0x0EA3],
        [0x0EA5, 0x0EA5],
        [0x0EA7, 0x0EA7],
        [0x0EAA, 0x0EAB],
        [0x0EAD, 0x0EAE],
        [0x0EB0, 0x0EB9],
        [0x0EBB, 0x0EBD],
        [0x0EC0, 0x0EC4],
        [0x0EC6, 0x0EC6],
        [0x0EC8, 0x0ECD],
        [0x0ED0, 0x0ED9],
        [0x0EDC, 0x0EDD],
        [0x0F00, 0x0F00],
        [0x0F18, 0x0F19],
        [0x0F20, 0x0F33],
        [0x0F35, 0x0F35],
        [0x0F37, 0x0F37],
        [0x0F39, 0x0F39],
        [0x0F3E, 0x0F47],
        [0x0F49, 0x0F69],
        [0x0F71, 0x0F84],
        [0x0F86, 0x0F8B],
        [0x0F90, 0x0F95],
        [0x0F97, 0x0F97],
        [0x0F99, 0x0FAD],
        [0x0FB1, 0x0FB7],
        [0x0FB9, 0x0FB9],
        [0x10A0, 0x10C5],
        [0x10D0, 0x10F6],
        [0x1E00, 0x1E9B],
        [0x1EA0, 0x1EF9],
        [0x1F00, 0x1F15],
        [0x1F18, 0x1F1D],
        [0x1F20, 0x1F45],
        [0x1F48, 0x1F4D],
        [0x1F50, 0x1F57],
        [0x1F59, 0x1F59],
        [0x1F5B, 0x1F5B],
        [0x1F5D, 0x1F5D],
        [0x1F5F, 0x1F7D],
        [0x1F80, 0x1FB4],
        [0x1FB6, 0x1FBC],
        [0x1FBE, 0x1FBE],
        [0x1FC2, 0x1FC4],
        [0x1FC6, 0x1FCC],
        [0x1FD0, 0x1FD3],
        [0x1FD6, 0x1FDB],
        [0x1FE0, 0x1FEC],
        [0x1FF2, 0x1FF4],
        [0x1FF6, 0x1FFC],
        [0x203F, 0x2040],
        [0x207F, 0x207F],
        [0x2102, 0x2102],
        [0x2107, 0x2107],
        [0x210A, 0x2113],
        [0x2115, 0x2115],
        [0x2118, 0x211D],
        [0x2124, 0x2124],
        [0x2126, 0x2126],
        [0x2128, 0x2128],
        [0x212A, 0x2131],
        [0x2133, 0x2138],
        [0x2160, 0x2182],
        [0x3005, 0x3007],
        [0x3021, 0x3029],
        [0x3041, 0x3093],
        [0x309B, 0x309C],
        [0x30A1, 0x30F6],
        [0x30FB, 0x30FC],
        [0x3105, 0x312C],
        [0x4E00, 0x9FA5],
        [0xAC00, 0xD7A3]
    ];

    // Shortcut: anything below the first range or above the last one
    // cannot match, so skip the search entirely.
    if (c < ALPHA_TABLE[0][0] || c > ALPHA_TABLE[$ - 1][1])
        return false;

    // Binary search over the half-open index window [lo, hi).
    size_t lo = 0;
    size_t hi = ALPHA_TABLE.length;
    while (lo < hi)
    {
        const size_t mid = lo + (hi - lo) / 2;
        if (c < ALPHA_TABLE[mid][0])
            hi = mid;           // c lies below this range
        else if (c > ALPHA_TABLE[mid][1])
            lo = mid + 1;       // c lies above this range
        else
            return true;        // first <= c <= last
    }
    return false;
}
304
/**
 * Returns the code length of c in UTF-8 code units.
 *
 * Params:
 *      c = code point; values above 0x10FFFF trip `assert(false)`
 * Returns:
 *      1, 2, 3 or 4
 */
int utf_codeLengthChar(dchar c)
{
    if (c > 0x10FFFF)
        assert(false); // beyond the last Unicode code point
    if (c > 0xFFFF)
        return 4;
    if (c > 0x7FF)
        return 3;
    return c > 0x7F ? 2 : 1;
}
320
/**
 * Returns the code length of c in UTF-16 code units.
 *
 * Params:
 *      c = code point
 * Returns:
 *      1 when `c` is at most 0xFFFF, otherwise 2 (a surrogate pair)
 */
int utf_codeLengthWchar(dchar c)
{
    if (c > 0xFFFF)
        return 2;
    return 1;
}
325
/**
 * Returns the code length of c in code units for the encoding.
 *
 * Params:
 *      sz = the encoding: 1 = utf8, 2 = utf16, 4 = utf32
 *      c = code point to measure
 * Returns:
 *      number of code units of size `sz` needed to hold `c`
 */
int utf_codeLength(int sz, dchar c)
{
    switch (sz)
    {
    case 1:
        return utf_codeLengthChar(c);
    case 2:
        return utf_codeLengthWchar(c);
    default:
        // Only UTF-32 remains; every code point is a single code unit.
        assert(sz == 4);
        return 1;
    }
}
339
/**
 * Encodes `c` as UTF-8 into the buffer at `s`.
 *
 * Writes between one and four code units starting at `s[0]`; no terminator
 * is appended.
 *
 * Params:
 *      s = destination buffer, must not be null
 *      c = code point to encode, must satisfy `utf_isValidDchar`
 */
void utf_encodeChar(char* s, dchar c)
{
    assert(s !is null);
    assert(utf_isValidDchar(c));

    // 0xxxxxxx
    if (c <= 0x7F)
    {
        s[0] = cast(char)c;
        return;
    }

    // Every multi-byte form ends with the low six bits of c.
    const char last = cast(char)(0x80 | (c & 0x3F));

    // 110xxxxx 10xxxxxx
    if (c <= 0x07FF)
    {
        s[0] = cast(char)(0xC0 | (c >> 6));
        s[1] = last;
        return;
    }

    // Bits 6..11 of c, shared by the three and four byte forms.
    const char beforeLast = cast(char)(0x80 | ((c >> 6) & 0x3F));

    // 1110xxxx 10xxxxxx 10xxxxxx
    if (c <= 0xFFFF)
    {
        s[0] = cast(char)(0xE0 | (c >> 12));
        s[1] = beforeLast;
        s[2] = last;
        return;
    }

    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    if (c <= 0x10FFFF)
    {
        s[0] = cast(char)(0xF0 | (c >> 18));
        s[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
        s[2] = beforeLast;
        s[3] = last;
        return;
    }

    assert(0);
}
369
/**
 * Encodes `c` as UTF-16 into the buffer at `s`.
 *
 * Writes one code unit when `c` is at most 0xFFFF, otherwise a
 * high/low surrogate pair in `s[0]` and `s[1]`.
 *
 * Params:
 *      s = destination buffer, must not be null
 *      c = code point to encode, must satisfy `utf_isValidDchar`
 */
void utf_encodeWchar(wchar* s, dchar c)
{
    assert(s !is null);
    assert(utf_isValidDchar(c));

    if (c > 0xFFFF)
    {
        // Offset into the supplementary planes; its upper ten bits go into
        // the high surrogate and its lower ten bits into the low surrogate.
        const uint offset = c - 0x010000;
        s[0] = cast(wchar)(((offset >> 10) & 0x03FF) + 0xD800);
        s[1] = cast(wchar)((offset & 0x03FF) + 0xDC00);
        return;
    }

    s[0] = cast(wchar)c;
}
384
/**
 * Encodes `c` into the buffer at `s` using the encoding selected by `sz`.
 *
 * Params:
 *      sz = the encoding: 1 = utf8, 2 = utf16, 4 = utf32
 *      s = destination buffer, reinterpreted as `char*`, `wchar*` or
 *          `dchar*` according to `sz`
 *      c = code point to encode
 */
void utf_encode(int sz, void* s, dchar c)
{
    switch (sz)
    {
    case 1:
        utf_encodeChar(cast(char*)s, c);
        break;
    case 2:
        utf_encodeWchar(cast(wchar*)s, c);
        break;
    default:
        // UTF-32: the code point is stored as is.
        assert(sz == 4);
        *(cast(dchar*)s) = c;
        break;
    }
}
397
/********************************************
 * Decode a UTF-8 sequence as a single UTF-32 code point.
 *
 * On success `ridx` is moved past the whole sequence. On failure `ridx` has
 * been advanced by exactly one code unit and `rresult` holds the first code
 * unit of the offending sequence.
 *
 * Params:
 *      s = UTF-8 sequence
 *      ridx = starting index in s[], updated to reflect number of code units decoded
 *      rresult = set to character decoded
 * Returns:
 *      null on success, otherwise error message string
 */
string utf_decodeChar(const(char)[] s, ref size_t ridx, out dchar rresult)
{
    // UTF-8 decoding errors
    static immutable string UTF8_DECODE_OK = null; // no error
    static immutable string UTF8_DECODE_OUTSIDE_CODE_SPACE = "Outside Unicode code space";
    static immutable string UTF8_DECODE_TRUNCATED_SEQUENCE = "Truncated UTF-8 sequence";
    static immutable string UTF8_DECODE_OVERLONG = "Overlong UTF-8 sequence";
    static immutable string UTF8_DECODE_INVALID_TRAILER = "Invalid trailing code unit";
    static immutable string UTF8_DECODE_INVALID_CODE_POINT = "Invalid code point decoded";

    /* Sequence length implied by the first code unit. The following
     * encodings are valid, except for the 5 and 6 byte combinations:
     *  0xxxxxxx
     *  110xxxxx 10xxxxxx
     *  1110xxxx 10xxxxxx 10xxxxxx
     *  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     *  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     *  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     * 0xFF marks code units that cannot start a sequence at all
     * (continuation bytes 0x80..0xBF, and 0xFE / 0xFF).
     */
    static immutable ubyte[256] UTF8_STRIDE =
    [
        // 0x00 .. 0x7F: single code unit
        1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,

        // 0x80 .. 0xBF: continuation bytes
        0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
        0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
        0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
        0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,

        // 0xC0 .. 0xDF
        2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,
        2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,

        // 0xE0 .. 0xEF
        3,3,3,3, 3,3,3,3, 3,3,3,3, 3,3,3,3,

        // 0xF0 .. 0xFF
        4,4,4,4, 4,4,4,4, 5,5,5,5, 6,6,0xFF,0xFF
    ];

    assert(s !is null);

    // Consume the first code unit up front; every early return below
    // therefore leaves ridx one past the starting index.
    const size_t start = ridx;
    ridx = start + 1;

    const char lead = s[start];
    // Pre-stage results for ASCII and error cases
    rresult = lead;

    // Get expected sequence length
    const size_t len = UTF8_STRIDE[lead];
    if (len == 1)
        return UTF8_DECODE_OK; // ASCII
    if (len < 2 || len > 4)
        return UTF8_DECODE_OUTSIDE_CODE_SPACE; // 5- or 6-byte sequence, or not a lead byte

    const size_t end = start + len;
    if (s.length < end) // source too short
        return UTF8_DECODE_TRUNCATED_SEQUENCE;

    /* The following combinations are overlong, and illegal:
     *  1100000x (10xxxxxx)
     *  11100000 100xxxxx (10xxxxxx)
     *  11110000 1000xxxx (10xxxxxx 10xxxxxx)
     * The 5 and 6 byte overlong forms need no test here: those lead bytes
     * were already rejected by the length check above.
     */
    const char second = s[start + 1];
    const bool overlong = (lead & 0xFE) == 0xC0
        || (lead == 0xE0 && (second & 0xE0) == 0x80)
        || (lead == 0xF0 && (second & 0xF0) == 0x80);
    if (overlong)
        return UTF8_DECODE_OVERLONG;

    // Pick off 7 - len low bits from first code unit
    dchar c = lead & ((1 << (7 - len)) - 1);

    // Fold in six payload bits from each trailing code unit
    foreach (k; start + 1 .. end)
    {
        const char unit = s[k];
        if ((unit & 0xC0) != 0x80) // trailing bytes are 10xxxxxx
            return UTF8_DECODE_INVALID_TRAILER;
        c = (c << 6) | (unit & 0x3F);
    }

    if (!utf_isValidDchar(c))
        return UTF8_DECODE_INVALID_CODE_POINT;

    ridx = end;
    rresult = c;
    return UTF8_DECODE_OK;
}
519
/********************************************
 * Decode a UTF-16 sequence as a single UTF-32 code point.
 *
 * On success `ridx` is moved past the one or two code units consumed. On
 * failure `ridx` has been advanced by exactly one code unit and `rresult`
 * holds the first code unit read.
 *
 * Params:
 *      s = UTF-16 sequence
 *      ridx = starting index in s[], updated to reflect number of code units decoded
 *      rresult = set to character decoded
 * Returns:
 *      null on success, otherwise error message string
 */
string utf_decodeWchar(const(wchar)[] s, ref size_t ridx, out dchar rresult)
{
    // UTF-16 decoding errors
    static immutable string UTF16_DECODE_OK = null; // no error
    static immutable string UTF16_DECODE_TRUNCATED_SEQUENCE = "Truncated UTF-16 sequence";
    static immutable string UTF16_DECODE_INVALID_SURROGATE = "Invalid low surrogate";
    static immutable string UTF16_DECODE_UNPAIRED_SURROGATE = "Unpaired surrogate";
    static immutable string UTF16_DECODE_INVALID_CODE_POINT = "Invalid code point decoded";

    assert(s !is null);
    size_t i = ridx++;

    // Pre-stage results for single wchar and error cases
    dchar u = rresult = s[i];
    if (u < 0xD800) // Single wchar codepoint
        return UTF16_DECODE_OK;
    if (0xD800 <= u && u <= 0xDBFF) // Surrogate pair
    {
        if (s.length <= i + 1)
            return UTF16_DECODE_TRUNCATED_SEQUENCE;
        const wchar u2 = s[i + 1];
        // The second code unit must be a low surrogate, [0xDC00, 0xDFFF].
        // Both bounds are tested against u2: testing the upper bound against
        // the high surrogate u can never fail, which would let 0xE000..0xFFFF
        // through as a bogus low surrogate.
        if (u2 < 0xDC00 || 0xDFFF < u2)
            return UTF16_DECODE_INVALID_SURROGATE;
        // 0xD7C0 == 0xD800 - (0x10000 >> 10): removes the high surrogate bias
        // and adds the 0x10000 supplementary-plane offset in one step.
        u = ((u - 0xD7C0) << 10) + (u2 - 0xDC00);
        ++ridx;
    }
    else if (0xDC00 <= u && u <= 0xDFFF) // Low surrogate with no preceding high surrogate
        return UTF16_DECODE_UNPAIRED_SURROGATE;
    if (!utf_isValidDchar(u))
        return UTF16_DECODE_INVALID_CODE_POINT;
    rresult = u;
    return UTF16_DECODE_OK;
}