2 * Functions related to UTF encoding.
4 * Copyright: Copyright (C) 1999-2021 by The D Language Foundation, All Rights Reserved
5 * Authors: $(LINK2 http://www.digitalmars.com, Walter Bright)
6 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
7 * Source: $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/utf.d, _utf.d)
8 * Documentation: https://dlang.org/phobos/dmd_utf.html
9 * Coverage: https://codecov.io/gh/dlang/dmd/src/master/src/dmd/utf.d
/// Reports whether `c` lies inside the Unicode code space, i.e. the code
/// points [0x000000,0x10FFFF] with the UTF-16 surrogate range
/// [0xD800,0xDFFF] carved out.
bool utf_isValidDchar(dchar c)
{
    // TODO: Whether non-char code points should be rejected is pending review.
    // 0xFFFE and 0xFFFF are valid for internal use, like Phobos std.utf.isValidDChar
    // See also https://issues.dlang.org/show_bug.cgi?id=1357

    // Tested first: this range covers almost all characters in a typical document.
    const bool belowSurrogates = c < 0xD800;
    const bool aboveSurrogates = 0xDFFF < c && c <= 0x10FFFF;
    return belowSurrogates || aboveSurrogates;
}
30 /*******************************
31 * Return !=0 if unicode alpha.
32 * Use table from C99 Appendix D.
34 bool isUniAlpha(dchar c)
36 static immutable wchar[2][] ALPHA_TABLE =
285 size_t high = ALPHA_TABLE.length - 1;
286 // Shortcut search if c is out of range
287 size_t low = (c < ALPHA_TABLE[0][0] || ALPHA_TABLE[high][1] < c) ? high + 1 : 0;
291 size_t mid = (low + high) >> 1;
292 if (c < ALPHA_TABLE[mid][0])
294 else if (ALPHA_TABLE[mid][1] < c)
298 assert(ALPHA_TABLE[mid][0] <= c && c <= ALPHA_TABLE[mid][1]);
306 * Returns the code length of c in code units.
308 int utf_codeLengthChar(dchar c)
/// Returns the number of UTF-16 code units needed for `c`: one for code
/// points up to 0xFFFF, two (a surrogate pair) for anything above.
int utf_codeLengthWchar(dchar c)
{
    if (c > 0xFFFF)
        return 2;
    return 1;
}
/**
 * Returns the code length of c in code units for the encoding.
 * Params:
 *      sz = the encoding: 1 = utf8, 2 = utf16, 4 = utf32
 *      c = code point to measure
 * Returns:
 *      number of code units `c` occupies in the selected encoding
 */
int utf_codeLength(int sz, dchar c)
{
    switch (sz)
    {
    case 1:
        return utf_codeLengthChar(c);
    case 2:
        return utf_codeLengthWchar(c);
    default:
        // Only UTF-32 remains, where every code point is a single unit.
        assert(sz == 4);
        return 1;
    }
}
/**
 * Encodes `c` as UTF-8 and stores the resulting 1 to 4 code units at `s`.
 * Params:
 *      s = destination; must be non-null, up to s[3] is written
 *      c = code point to encode; must satisfy utf_isValidDchar
 */
void utf_encodeChar(char* s, dchar c)
{
    assert(s !is null);
    assert(utf_isValidDchar(c));

    if (c <= 0x7F)
    {
        // 0xxxxxxx
        s[0] = cast(char)c;
        return;
    }

    if (c <= 0x07FF)
    {
        // 110xxxxx 10xxxxxx
        s[0] = cast(char)(0xC0 | (c >> 6));
        s[1] = cast(char)(0x80 | (c & 0x3F));
        return;
    }

    if (c <= 0xFFFF)
    {
        // 1110xxxx 10xxxxxx 10xxxxxx
        s[0] = cast(char)(0xE0 | (c >> 12));
        s[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        s[2] = cast(char)(0x80 | (c & 0x3F));
        return;
    }

    if (c <= 0x10FFFF)
    {
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        s[0] = cast(char)(0xF0 | (c >> 18));
        s[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
        s[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        s[3] = cast(char)(0x80 | (c & 0x3F));
        return;
    }

    // Anything above 0x10FFFF is outside the code space.
    assert(0);
}
/**
 * Encodes `c` as UTF-16 and stores the resulting 1 or 2 code units at `s`.
 * Params:
 *      s = destination; must be non-null, up to s[1] is written
 *      c = code point to encode; must satisfy utf_isValidDchar
 */
void utf_encodeWchar(wchar* s, dchar c)
{
    assert(s !is null);
    assert(utf_isValidDchar(c));

    if (c > 0xFFFF)
    {
        // Supplementary plane: split the 20-bit offset into a surrogate pair.
        const uint offset = c - 0x010000;
        s[0] = cast(wchar)(((offset >> 10) & 0x03FF) + 0xD800); // high surrogate
        s[1] = cast(wchar)((offset & 0x03FF) + 0xDC00);         // low surrogate
        return;
    }

    // Basic Multilingual Plane: the code point is its own code unit.
    s[0] = cast(wchar)c;
}
/**
 * Encodes `c` into the buffer `s` using the encoding selected by `sz`.
 * Params:
 *      sz = the encoding: 1 = utf8, 2 = utf16, 4 = utf32
 *      s = destination buffer, reinterpreted as char*, wchar* or dchar* according to `sz`
 *      c = code point to encode
 */
void utf_encode(int sz, void* s, dchar c)
{
    switch (sz)
    {
    case 1:
        utf_encodeChar(cast(char*)s, c);
        break;
    case 2:
        utf_encodeWchar(cast(wchar*)s, c);
        break;
    default:
        // UTF-32: the code point is stored as-is.
        assert(sz == 4);
        *(cast(dchar*)s) = c;
        break;
    }
}
398 /********************************************
399 * Decode a UTF-8 sequence as a single UTF-32 code point.
402 * ridx = starting index in s[], updated to reflect number of code units decoded
403 * rresult = set to character decoded
405 * null on success, otherwise error message string
407 string utf_decodeChar(const(char)[] s, ref size_t ridx, out dchar rresult)
409 // UTF-8 decoding errors
410 static immutable string UTF8_DECODE_OK = null; // no error
411 static immutable string UTF8_DECODE_OUTSIDE_CODE_SPACE = "Outside Unicode code space";
412 static immutable string UTF8_DECODE_TRUNCATED_SEQUENCE = "Truncated UTF-8 sequence";
413 static immutable string UTF8_DECODE_OVERLONG = "Overlong UTF-8 sequence";
414 static immutable string UTF8_DECODE_INVALID_TRAILER = "Invalid trailing code unit";
415 static immutable string UTF8_DECODE_INVALID_CODE_POINT = "Invalid code point decoded";
417 /* The following encodings are valid, except for the 5 and 6 byte
421 * 1110xxxx 10xxxxxx 10xxxxxx
422 * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
423 * 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
424 * 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
426 static immutable ubyte[256] UTF8_STRIDE =
446 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
447 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
448 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
449 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
450 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
451 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
452 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
453 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
464 5,5,5,5, 6,6,0xFF,0xFF
471 // Pre-stage results for ASCII and error cases
473 //printf("utf_decodeChar(s = %02x, %02x, %02x len = %d)\n", u, s[1], s[2], len);
474 // Get expected sequence length
475 const size_t n = UTF8_STRIDE[u];
480 return UTF8_DECODE_OK;
487 // 5- or 6-byte sequence
488 return UTF8_DECODE_OUTSIDE_CODE_SPACE;
490 if (s.length < i + n) // source too short
491 return UTF8_DECODE_TRUNCATED_SEQUENCE;
492 // Pick off 7 - n low bits from first code unit
493 dchar c = u & ((1 << (7 - n)) - 1);
494 /* The following combinations are overlong, and illegal:
495 * 1100000x (10xxxxxx)
496 * 11100000 100xxxxx (10xxxxxx)
497 * 11110000 1000xxxx (10xxxxxx 10xxxxxx)
498 * 11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx)
499 * 11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx)
501 const char u2 = s[++i];
502 // overlong combination
503 if ((u & 0xFE) == 0xC0 || (u == 0xE0 && (u2 & 0xE0) == 0x80) || (u == 0xF0 && (u2 & 0xF0) == 0x80) || (u == 0xF8 && (u2 & 0xF8) == 0x80) || (u == 0xFC && (u2 & 0xFC) == 0x80))
504 return UTF8_DECODE_OVERLONG;
505 // Decode remaining bits
506 for (const m = n + i - 1; i != m; ++i)
509 if ((u3 & 0xC0) != 0x80) // trailing bytes are 10xxxxxx
510 return UTF8_DECODE_INVALID_TRAILER;
511 c = (c << 6) | (u3 & 0x3F);
513 if (!utf_isValidDchar(c))
514 return UTF8_DECODE_INVALID_CODE_POINT;
517 return UTF8_DECODE_OK;
/********************************************
 * Decode a UTF-16 sequence as a single UTF-32 code point.
 * Params:
 *      s = UTF-16 sequence
 *      ridx = starting index in s[], updated to reflect number of code units decoded
 *      rresult = set to character decoded
 * Returns:
 *      null on success, otherwise error message string.
 *      On error, rresult holds the first code unit read and ridx has been
 *      advanced past that unit only.
 */
string utf_decodeWchar(const(wchar)[] s, ref size_t ridx, out dchar rresult)
{
    // UTF-16 decoding errors
    static immutable string UTF16_DECODE_OK = null; // no error
    static immutable string UTF16_DECODE_TRUNCATED_SEQUENCE = "Truncated UTF-16 sequence";
    static immutable string UTF16_DECODE_INVALID_SURROGATE = "Invalid low surrogate";
    static immutable string UTF16_DECODE_UNPAIRED_SURROGATE = "Unpaired surrogate";
    static immutable string UTF16_DECODE_INVALID_CODE_POINT = "Invalid code point decoded";

    assert(s !is null);
    size_t i = ridx++;

    // Pre-stage results for single wchar and error cases
    dchar u = rresult = s[i];
    if (u < 0xD800) // Single wchar codepoint
        return UTF16_DECODE_OK;
    if (0xD800 <= u && u <= 0xDBFF) // Surrogate pair
    {
        if (s.length <= i + 1)
            return UTF16_DECODE_TRUNCATED_SEQUENCE;
        const wchar u2 = s[i + 1];
        // The second unit must be a low surrogate, [0xDC00,0xDFFF]. The upper
        // bound has to be tested on u2: testing the high surrogate u (which is
        // <= 0xDBFF here) can never fail and would let e.g. D800 E000 through
        // as a bogus pair.
        if (u2 < 0xDC00 || 0xDFFF < u2)
            return UTF16_DECODE_INVALID_SURROGATE;
        // 0xD7C0 == 0xD800 - (0x10000 >> 10): folds the 0x10000 offset into the shift.
        u = ((u - 0xD7C0) << 10) + (u2 - 0xDC00);
        ++ridx;
    }
    else if (0xDC00 <= u && u <= 0xDFFF)
        return UTF16_DECODE_UNPAIRED_SURROGATE;
    if (!utf_isValidDchar(u))
        return UTF16_DECODE_INVALID_CODE_POINT;
    rresult = u;
    return UTF16_DECODE_OK;
}