]>
Commit | Line | Data |
---|---|---|
5fee5ec3 IB |
1 | /** |
2 | * Functions related to UTF encoding. | |
3 | * | |
4 | * Copyright: Copyright (C) 1999-2021 by The D Language Foundation, All Rights Reserved | |
5 | * Authors: $(LINK2 http://www.digitalmars.com, Walter Bright) | |
6 | * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) | |
7 | * Source: $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/utf.d, _utf.d) | |
8 | * Documentation: https://dlang.org/phobos/dmd_utf.html | |
9 | * Coverage: https://codecov.io/gh/dlang/dmd/src/master/src/dmd/utf.d | |
10 | */ | |
11 | ||
12 | module dmd.utf; | |
13 | ||
14 | nothrow pure @nogc: | |
15 | ||
/**
 * Tests whether `c` lies inside the Unicode code space.
 *
 * The code space is the range of code points [0x000000, 0x10FFFF] with the
 * UTF-16 surrogate range [0xD800, 0xDFFF] removed.
 *
 * Params:
 *      c = code point to test
 * Returns:
 *      `true` if `c` is at most 0x10FFFF and is not a surrogate
 */
bool utf_isValidDchar(dchar c)
{
    // TODO: Whether non-char code points should be rejected is pending review.
    // 0xFFFE and 0xFFFF are valid for internal use, like Phobos std.utf.isValidDChar
    // See also https://issues.dlang.org/show_bug.cgi?id=1357
    const bool belowSurrogates = c < 0xD800; // almost all characters in a typical document
    const bool aboveSurrogates = 0xDFFF < c && c <= 0x10FFFF;
    return belowSurrogates || aboveSurrogates;
}
29 | ||
/*******************************
 * Tests whether `c` falls inside one of the code point ranges of the
 * alphabetic-character table taken from C99 Appendix D.
 *
 * Params:
 *      c = code point to classify
 * Returns:
 *      `true` if some `[first, last]` entry of the table contains `c`
 */
bool isUniAlpha(dchar c)
{
    // Inclusive [first, last] ranges, sorted ascending and non-overlapping,
    // which is what makes the binary search below valid.
    static immutable wchar[2][] ALPHA_TABLE =
    [
        [0x00AA, 0x00AA],
        [0x00B5, 0x00B5],
        [0x00B7, 0x00B7],
        [0x00BA, 0x00BA],
        [0x00C0, 0x00D6],
        [0x00D8, 0x00F6],
        [0x00F8, 0x01F5],
        [0x01FA, 0x0217],
        [0x0250, 0x02A8],
        [0x02B0, 0x02B8],
        [0x02BB, 0x02BB],
        [0x02BD, 0x02C1],
        [0x02D0, 0x02D1],
        [0x02E0, 0x02E4],
        [0x037A, 0x037A],
        [0x0386, 0x0386],
        [0x0388, 0x038A],
        [0x038C, 0x038C],
        [0x038E, 0x03A1],
        [0x03A3, 0x03CE],
        [0x03D0, 0x03D6],
        [0x03DA, 0x03DA],
        [0x03DC, 0x03DC],
        [0x03DE, 0x03DE],
        [0x03E0, 0x03E0],
        [0x03E2, 0x03F3],
        [0x0401, 0x040C],
        [0x040E, 0x044F],
        [0x0451, 0x045C],
        [0x045E, 0x0481],
        [0x0490, 0x04C4],
        [0x04C7, 0x04C8],
        [0x04CB, 0x04CC],
        [0x04D0, 0x04EB],
        [0x04EE, 0x04F5],
        [0x04F8, 0x04F9],
        [0x0531, 0x0556],
        [0x0559, 0x0559],
        [0x0561, 0x0587],
        [0x05B0, 0x05B9],
        [0x05BB, 0x05BD],
        [0x05BF, 0x05BF],
        [0x05C1, 0x05C2],
        [0x05D0, 0x05EA],
        [0x05F0, 0x05F2],
        [0x0621, 0x063A],
        [0x0640, 0x0652],
        [0x0660, 0x0669],
        [0x0670, 0x06B7],
        [0x06BA, 0x06BE],
        [0x06C0, 0x06CE],
        [0x06D0, 0x06DC],
        [0x06E5, 0x06E8],
        [0x06EA, 0x06ED],
        [0x06F0, 0x06F9],
        [0x0901, 0x0903],
        [0x0905, 0x0939],
        [0x093D, 0x094D],
        [0x0950, 0x0952],
        [0x0958, 0x0963],
        [0x0966, 0x096F],
        [0x0981, 0x0983],
        [0x0985, 0x098C],
        [0x098F, 0x0990],
        [0x0993, 0x09A8],
        [0x09AA, 0x09B0],
        [0x09B2, 0x09B2],
        [0x09B6, 0x09B9],
        [0x09BE, 0x09C4],
        [0x09C7, 0x09C8],
        [0x09CB, 0x09CD],
        [0x09DC, 0x09DD],
        [0x09DF, 0x09E3],
        [0x09E6, 0x09F1],
        [0x0A02, 0x0A02],
        [0x0A05, 0x0A0A],
        [0x0A0F, 0x0A10],
        [0x0A13, 0x0A28],
        [0x0A2A, 0x0A30],
        [0x0A32, 0x0A33],
        [0x0A35, 0x0A36],
        [0x0A38, 0x0A39],
        [0x0A3E, 0x0A42],
        [0x0A47, 0x0A48],
        [0x0A4B, 0x0A4D],
        [0x0A59, 0x0A5C],
        [0x0A5E, 0x0A5E],
        [0x0A66, 0x0A6F],
        [0x0A74, 0x0A74],
        [0x0A81, 0x0A83],
        [0x0A85, 0x0A8B],
        [0x0A8D, 0x0A8D],
        [0x0A8F, 0x0A91],
        [0x0A93, 0x0AA8],
        [0x0AAA, 0x0AB0],
        [0x0AB2, 0x0AB3],
        [0x0AB5, 0x0AB9],
        [0x0ABD, 0x0AC5],
        [0x0AC7, 0x0AC9],
        [0x0ACB, 0x0ACD],
        [0x0AD0, 0x0AD0],
        [0x0AE0, 0x0AE0],
        [0x0AE6, 0x0AEF],
        [0x0B01, 0x0B03],
        [0x0B05, 0x0B0C],
        [0x0B0F, 0x0B10],
        [0x0B13, 0x0B28],
        [0x0B2A, 0x0B30],
        [0x0B32, 0x0B33],
        [0x0B36, 0x0B39],
        [0x0B3D, 0x0B43],
        [0x0B47, 0x0B48],
        [0x0B4B, 0x0B4D],
        [0x0B5C, 0x0B5D],
        [0x0B5F, 0x0B61],
        [0x0B66, 0x0B6F],
        [0x0B82, 0x0B83],
        [0x0B85, 0x0B8A],
        [0x0B8E, 0x0B90],
        [0x0B92, 0x0B95],
        [0x0B99, 0x0B9A],
        [0x0B9C, 0x0B9C],
        [0x0B9E, 0x0B9F],
        [0x0BA3, 0x0BA4],
        [0x0BA8, 0x0BAA],
        [0x0BAE, 0x0BB5],
        [0x0BB7, 0x0BB9],
        [0x0BBE, 0x0BC2],
        [0x0BC6, 0x0BC8],
        [0x0BCA, 0x0BCD],
        [0x0BE7, 0x0BEF],
        [0x0C01, 0x0C03],
        [0x0C05, 0x0C0C],
        [0x0C0E, 0x0C10],
        [0x0C12, 0x0C28],
        [0x0C2A, 0x0C33],
        [0x0C35, 0x0C39],
        [0x0C3E, 0x0C44],
        [0x0C46, 0x0C48],
        [0x0C4A, 0x0C4D],
        [0x0C60, 0x0C61],
        [0x0C66, 0x0C6F],
        [0x0C82, 0x0C83],
        [0x0C85, 0x0C8C],
        [0x0C8E, 0x0C90],
        [0x0C92, 0x0CA8],
        [0x0CAA, 0x0CB3],
        [0x0CB5, 0x0CB9],
        [0x0CBE, 0x0CC4],
        [0x0CC6, 0x0CC8],
        [0x0CCA, 0x0CCD],
        [0x0CDE, 0x0CDE],
        [0x0CE0, 0x0CE1],
        [0x0CE6, 0x0CEF],
        [0x0D02, 0x0D03],
        [0x0D05, 0x0D0C],
        [0x0D0E, 0x0D10],
        [0x0D12, 0x0D28],
        [0x0D2A, 0x0D39],
        [0x0D3E, 0x0D43],
        [0x0D46, 0x0D48],
        [0x0D4A, 0x0D4D],
        [0x0D60, 0x0D61],
        [0x0D66, 0x0D6F],
        [0x0E01, 0x0E3A],
        [0x0E40, 0x0E5B],
        [0x0E81, 0x0E82],
        [0x0E84, 0x0E84],
        [0x0E87, 0x0E88],
        [0x0E8A, 0x0E8A],
        [0x0E8D, 0x0E8D],
        [0x0E94, 0x0E97],
        [0x0E99, 0x0E9F],
        [0x0EA1, 0x0EA3],
        [0x0EA5, 0x0EA5],
        [0x0EA7, 0x0EA7],
        [0x0EAA, 0x0EAB],
        [0x0EAD, 0x0EAE],
        [0x0EB0, 0x0EB9],
        [0x0EBB, 0x0EBD],
        [0x0EC0, 0x0EC4],
        [0x0EC6, 0x0EC6],
        [0x0EC8, 0x0ECD],
        [0x0ED0, 0x0ED9],
        [0x0EDC, 0x0EDD],
        [0x0F00, 0x0F00],
        [0x0F18, 0x0F19],
        [0x0F20, 0x0F33],
        [0x0F35, 0x0F35],
        [0x0F37, 0x0F37],
        [0x0F39, 0x0F39],
        [0x0F3E, 0x0F47],
        [0x0F49, 0x0F69],
        [0x0F71, 0x0F84],
        [0x0F86, 0x0F8B],
        [0x0F90, 0x0F95],
        [0x0F97, 0x0F97],
        [0x0F99, 0x0FAD],
        [0x0FB1, 0x0FB7],
        [0x0FB9, 0x0FB9],
        [0x10A0, 0x10C5],
        [0x10D0, 0x10F6],
        [0x1E00, 0x1E9B],
        [0x1EA0, 0x1EF9],
        [0x1F00, 0x1F15],
        [0x1F18, 0x1F1D],
        [0x1F20, 0x1F45],
        [0x1F48, 0x1F4D],
        [0x1F50, 0x1F57],
        [0x1F59, 0x1F59],
        [0x1F5B, 0x1F5B],
        [0x1F5D, 0x1F5D],
        [0x1F5F, 0x1F7D],
        [0x1F80, 0x1FB4],
        [0x1FB6, 0x1FBC],
        [0x1FBE, 0x1FBE],
        [0x1FC2, 0x1FC4],
        [0x1FC6, 0x1FCC],
        [0x1FD0, 0x1FD3],
        [0x1FD6, 0x1FDB],
        [0x1FE0, 0x1FEC],
        [0x1FF2, 0x1FF4],
        [0x1FF6, 0x1FFC],
        [0x203F, 0x2040],
        [0x207F, 0x207F],
        [0x2102, 0x2102],
        [0x2107, 0x2107],
        [0x210A, 0x2113],
        [0x2115, 0x2115],
        [0x2118, 0x211D],
        [0x2124, 0x2124],
        [0x2126, 0x2126],
        [0x2128, 0x2128],
        [0x212A, 0x2131],
        [0x2133, 0x2138],
        [0x2160, 0x2182],
        [0x3005, 0x3007],
        [0x3021, 0x3029],
        [0x3041, 0x3093],
        [0x309B, 0x309C],
        [0x30A1, 0x30F6],
        [0x30FB, 0x30FC],
        [0x3105, 0x312C],
        [0x4E00, 0x9FA5],
        [0xAC00, 0xD7A3]
    ];

    // Anything outside the overall span of the table cannot match.
    if (c < ALPHA_TABLE[0][0] || c > ALPHA_TABLE[$ - 1][1])
        return false;

    // Binary search over the half-open index window [lo, hi).
    size_t lo = 0;
    size_t hi = ALPHA_TABLE.length;
    while (lo < hi)
    {
        const size_t probe = lo + (hi - lo) / 2;
        const wchar first = ALPHA_TABLE[probe][0];
        const wchar last = ALPHA_TABLE[probe][1];
        if (c < first)
            hi = probe;         // continue in the lower half
        else if (c > last)
            lo = probe + 1;     // continue in the upper half
        else
            return true;        // first <= c <= last
    }
    return false;
}
304 | ||
/**
 * Computes how many UTF-8 code units are needed to encode `c`.
 *
 * Params:
 *      c = code point; values above 0x10FFFF trip `assert(false)`
 * Returns:
 *      1, 2, 3 or 4
 */
int utf_codeLengthChar(dchar c)
{
    // Nothing above the last Unicode code point has a UTF-8 length.
    if (c > 0x10FFFF)
        assert(false);

    return (c <= 0x7F) ? 1
         : (c <= 0x7FF) ? 2
         : (c <= 0xFFFF) ? 3
         : 4;
}
320 | ||
/**
 * Computes how many UTF-16 code units are needed to encode `c`.
 *
 * Params:
 *      c = code point
 * Returns:
 *      1 if `c` is at most 0xFFFF, otherwise 2 (a surrogate pair)
 */
int utf_codeLengthWchar(dchar c)
{
    if (c > 0xFFFF)
        return 2;
    return 1;
}
325 | ||
/**
 * Computes how many code units of the selected encoding are needed for `c`.
 *
 * Params:
 *      sz = code unit size selecting the encoding: 1 = UTF-8, 2 = UTF-16, 4 = UTF-32
 *      c = code point
 * Returns:
 *      the UTF-8 or UTF-16 code length for `sz` 1 or 2; always 1 for UTF-32
 */
int utf_codeLength(int sz, dchar c)
{
    switch (sz)
    {
    case 1:
        return utf_codeLengthChar(c);
    case 2:
        return utf_codeLengthWchar(c);
    default:
        // The only remaining supported encoding is UTF-32.
        assert(sz == 4);
        return 1;
    }
}
339 | ||
/**
 * Encodes `c` as UTF-8, writing 1 to 4 code units starting at `s[0]`.
 * No terminator is written.
 *
 * Params:
 *      s = destination; must be non-null (the caller provides the space)
 *      c = code point to encode; must satisfy `utf_isValidDchar`
 */
void utf_encodeChar(char* s, dchar c)
{
    assert(s !is null);
    assert(utf_isValidDchar(c));

    if (c <= 0x7F)
    {
        // 0xxxxxxx
        s[0] = cast(char)c;
        return;
    }
    if (c <= 0x07FF)
    {
        // 110xxxxx 10xxxxxx
        s[0] = cast(char)((c >> 6) | 0xC0);
        s[1] = cast(char)((c & 0x3F) | 0x80);
        return;
    }
    if (c <= 0xFFFF)
    {
        // 1110xxxx 10xxxxxx 10xxxxxx
        s[0] = cast(char)((c >> 12) | 0xE0);
        s[1] = cast(char)(((c >> 6) & 0x3F) | 0x80);
        s[2] = cast(char)((c & 0x3F) | 0x80);
        return;
    }
    if (c <= 0x10FFFF)
    {
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        s[0] = cast(char)((c >> 18) | 0xF0);
        s[1] = cast(char)(((c >> 12) & 0x3F) | 0x80);
        s[2] = cast(char)(((c >> 6) & 0x3F) | 0x80);
        s[3] = cast(char)((c & 0x3F) | 0x80);
        return;
    }
    // Beyond the Unicode code space: nothing sensible can be written.
    assert(0);
}
369 | ||
/**
 * Encodes `c` as UTF-16, writing one code unit for values up to 0xFFFF and
 * a high/low surrogate pair otherwise.
 *
 * Params:
 *      s = destination; must be non-null (the caller provides the space)
 *      c = code point to encode; must satisfy `utf_isValidDchar`
 */
void utf_encodeWchar(wchar* s, dchar c)
{
    assert(s !is null);
    assert(utf_isValidDchar(c));

    if (c > 0xFFFF)
    {
        // Split the offset above 0x10000 into two 10-bit halves.
        const uint offset = c - 0x010000;
        s[0] = cast(wchar)(((offset >> 10) & 0x03FF) + 0xD800);
        s[1] = cast(wchar)((offset & 0x03FF) + 0xDC00);
        return;
    }
    s[0] = cast(wchar)c;
}
384 | ||
/**
 * Encodes `c` into the buffer `s` using the encoding selected by `sz`.
 *
 * Params:
 *      sz = code unit size selecting the encoding: 1 = UTF-8, 2 = UTF-16, 4 = UTF-32
 *      s = destination, reinterpreted as `char*`, `wchar*` or `dchar*` according to `sz`
 *      c = code point to encode
 */
void utf_encode(int sz, void* s, dchar c)
{
    switch (sz)
    {
    case 1:
        utf_encodeChar(cast(char*)s, c);
        break;
    case 2:
        utf_encodeWchar(cast(wchar*)s, c);
        break;
    default:
        // UTF-32 stores the code point as-is.
        assert(sz == 4);
        *(cast(dchar*)s) = c;
        break;
    }
}
397 | ||
/********************************************
 * Decode a UTF-8 sequence as a single UTF-32 code point.
 *
 * On success `ridx` is advanced past the whole sequence and `rresult` holds
 * the decoded code point. On failure `ridx` has been advanced by exactly one
 * code unit and `rresult` holds the first code unit of the bad sequence.
 *
 * Params:
 *      s = UTF-8 sequence; must be non-null and `ridx` must index into it
 *      ridx = starting index in s[], updated to reflect number of code units decoded
 *      rresult = set to character decoded
 * Returns:
 *      null on success, otherwise error message string
 */
string utf_decodeChar(const(char)[] s, ref size_t ridx, out dchar rresult)
{
    // UTF-8 decoding errors
    static immutable string UTF8_DECODE_OK = null; // no error
    static immutable string UTF8_DECODE_OUTSIDE_CODE_SPACE = "Outside Unicode code space";
    static immutable string UTF8_DECODE_TRUNCATED_SEQUENCE = "Truncated UTF-8 sequence";
    static immutable string UTF8_DECODE_OVERLONG = "Overlong UTF-8 sequence";
    static immutable string UTF8_DECODE_INVALID_TRAILER = "Invalid trailing code unit";
    static immutable string UTF8_DECODE_INVALID_CODE_POINT = "Invalid code point decoded";

    /* Sequence length indexed by the first code unit. The following
     * encodings are valid, except for the 5 and 6 byte combinations:
     *  0xxxxxxx
     *  110xxxxx 10xxxxxx
     *  1110xxxx 10xxxxxx 10xxxxxx
     *  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     *  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     *  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     * 0xFF marks code units that can never start a sequence: continuation
     * bytes (0x80..0xBF) and 0xFE/0xFF.
     */
    static immutable ubyte[256] UTF8_STRIDE =
    [
        // 0x00..0x7F: ASCII
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,

        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,

        // 0x80..0xBF: continuation bytes
        0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
        0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
        0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
        0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
        0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
        0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
        0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
        0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,

        // 0xC0..0xDF: 2-byte leads
        2,2,2,2, 2,2,2,2,
        2,2,2,2, 2,2,2,2,
        2,2,2,2, 2,2,2,2,
        2,2,2,2, 2,2,2,2,

        // 0xE0..0xEF: 3-byte leads
        3,3,3,3, 3,3,3,3,
        3,3,3,3, 3,3,3,3,

        // 0xF0..0xF7: 4-byte leads; 0xF8..0xFD: 5/6-byte leads; 0xFE, 0xFF
        4,4,4,4, 4,4,4,4,
        5,5,5,5, 6,6,0xFF,0xFF
    ];

    assert(s !is null);
    // Consume the first code unit up front so that every error return
    // still advances the caller by one code unit.
    const size_t start = ridx++;

    const char u = s[start];
    // Pre-stage results for ASCII and error cases
    rresult = u;
    // Get expected sequence length
    const size_t n = UTF8_STRIDE[u];
    switch (n)
    {
    case 1:
        // ASCII
        return UTF8_DECODE_OK;
    case 2:
    case 3:
    case 4:
        // multi-byte UTF-8
        break;
    default:
        // 5- or 6-byte lead, stray continuation byte, or 0xFE/0xFF
        return UTF8_DECODE_OUTSIDE_CODE_SPACE;
    }

    const size_t end = start + n; // one past the last code unit of the sequence
    if (s.length < end) // source too short
        return UTF8_DECODE_TRUNCATED_SEQUENCE;

    // Pick off 7 - n low bits from first code unit
    dchar c = u & ((1 << (7 - n)) - 1);

    /* The following combinations are overlong, and illegal:
     *  1100000x (10xxxxxx)
     *  11100000 100xxxxx (10xxxxxx)
     *  11110000 1000xxxx (10xxxxxx 10xxxxxx)
     * The 5- and 6-byte overlong forms need no check here: those lead
     * bytes were already rejected by the switch above.
     */
    const char u2 = s[start + 1];
    if ((u & 0xFE) == 0xC0 ||
        (u == 0xE0 && (u2 & 0xE0) == 0x80) ||
        (u == 0xF0 && (u2 & 0xF0) == 0x80))
        return UTF8_DECODE_OVERLONG;

    // Decode remaining bits, 6 per trailing code unit
    foreach (k; start + 1 .. end)
    {
        const u3 = s[k];
        if ((u3 & 0xC0) != 0x80) // trailing bytes are 10xxxxxx
            return UTF8_DECODE_INVALID_TRAILER;
        c = (c << 6) | (u3 & 0x3F);
    }

    // Rejects encoded surrogates and values above 0x10FFFF
    if (!utf_isValidDchar(c))
        return UTF8_DECODE_INVALID_CODE_POINT;

    ridx = end;
    rresult = c;
    return UTF8_DECODE_OK;
}
519 | ||
/********************************************
 * Decode a UTF-16 sequence as a single UTF-32 code point.
 *
 * On success `ridx` is advanced past the one or two code units consumed and
 * `rresult` holds the decoded code point. On failure `ridx` has been advanced
 * by exactly one code unit and `rresult` holds the first code unit.
 *
 * Params:
 *      s = UTF-16 sequence; must be non-null and `ridx` must index into it
 *      ridx = starting index in s[], updated to reflect number of code units decoded
 *      rresult = set to character decoded
 * Returns:
 *      null on success, otherwise error message string
 */
string utf_decodeWchar(const(wchar)[] s, ref size_t ridx, out dchar rresult)
{
    // UTF-16 decoding errors
    static immutable string UTF16_DECODE_OK = null; // no error
    static immutable string UTF16_DECODE_TRUNCATED_SEQUENCE = "Truncated UTF-16 sequence";
    static immutable string UTF16_DECODE_INVALID_SURROGATE = "Invalid low surrogate";
    static immutable string UTF16_DECODE_UNPAIRED_SURROGATE = "Unpaired surrogate";
    static immutable string UTF16_DECODE_INVALID_CODE_POINT = "Invalid code point decoded";

    assert(s !is null);
    // Consume the first code unit up front so that every error return
    // still advances the caller by one code unit.
    const size_t i = ridx++;

    // Pre-stage results for single wchar and error cases
    dchar u = rresult = s[i];
    if (u < 0xD800) // Single wchar codepoint
        return UTF16_DECODE_OK;
    if (0xD800 <= u && u <= 0xDBFF) // High surrogate: a low surrogate must follow
    {
        if (s.length <= i + 1)
            return UTF16_DECODE_TRUNCATED_SEQUENCE;
        const wchar u2 = s[i + 1];
        // The second unit must lie in the low surrogate range
        // [0xDC00, 0xDFFF]. Both bounds are checked against u2; testing the
        // upper bound against u would accept any u2 above 0xDFFF.
        if (u2 < 0xDC00 || 0xDFFF < u2)
            return UTF16_DECODE_INVALID_SURROGATE;
        // 0xD7C0 == 0xD800 - (0x10000 >> 10): folds the 0x10000 offset
        // into the high surrogate before shifting.
        u = ((u - 0xD7C0) << 10) + (u2 - 0xDC00);
        ++ridx;
    }
    else if (0xDC00 <= u && u <= 0xDFFF) // Low surrogate without a preceding high one
        return UTF16_DECODE_UNPAIRED_SURROGATE;
    if (!utf_isValidDchar(u))
        return UTF16_DECODE_INVALID_CODE_POINT;
    rresult = u;
    return UTF16_DECODE_OK;
}