1 // Written in the D programming language.
4 Classes and functions for handling and transcoding between various encodings.
6 For cases where the _encoding is known at compile-time, functions are provided
7 for arbitrary _encoding and decoding of characters, arbitrary transcoding
8 between strings of different type, as well as validation and sanitization.
10 Encodings currently supported are UTF-8, UTF-16, UTF-32, ASCII, ISO-8859-1
11 (also known as LATIN-1), ISO-8859-2 (LATIN-2), WINDOWS-1250 and WINDOWS-1252.
13 $(SCRIPT inhibitQuickIndex = 1;)
15 $(TR $(TH Category) $(TH Functions))
16 $(TR $(TD Decode) $(TD
22 $(TR $(TD Conversion) $(TD
27 $(TR $(TD Classification) $(TD
30 $(LREF isValidCodePoint)
31 $(LREF isValidCodeUnit)
39 $(TR $(TD Length & Index) $(TD
46 $(TR $(TD Encoding schemes) $(TD
48 $(LREF EncodingScheme)
49 $(LREF EncodingSchemeASCII)
50 $(LREF EncodingSchemeLatin1)
51 $(LREF EncodingSchemeLatin2)
52 $(LREF EncodingSchemeUtf16Native)
53 $(LREF EncodingSchemeUtf32Native)
54 $(LREF EncodingSchemeUtf8)
55 $(LREF EncodingSchemeWindows1250)
56 $(LREF EncodingSchemeWindows1252)
58 $(TR $(TD Representation) $(TD
65 $(LREF Windows1250Char)
66 $(LREF Windows1250String)
67 $(LREF Windows1252Char)
68 $(LREF Windows1252String)
70 $(TR $(TD Exceptions) $(TD
71 $(LREF INVALID_SEQUENCE)
72 $(LREF EncodingException)
76 For cases where the _encoding is not known at compile-time, but is
77 known at run-time, the abstract class $(LREF EncodingScheme)
78 and its subclasses are provided. To construct a run-time encoder/decoder,
81 ----------------------------------------------------
82 auto e = EncodingScheme.create("utf-8");
83 ----------------------------------------------------
85 This library supplies $(LREF EncodingScheme) subclasses for ASCII,
86 ISO-8859-1 (also known as LATIN-1), ISO-8859-2 (LATIN-2), WINDOWS-1250,
87 WINDOWS-1252, UTF-8, and (on little-endian architectures) UTF-16LE and
88 UTF-32LE; or (on big-endian architectures) UTF-16BE and UTF-32BE.
90 This library provides a mechanism whereby other modules may add $(LREF
91 EncodingScheme) subclasses for any other _encoding.
93 Copyright: Copyright Janice Caron 2008 - 2009.
94 License: $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
96 Source: $(PHOBOSSRC std/_encoding.d)
99 Copyright Janice Caron 2008 - 2009.
100 Distributed under the Boost Software License, Version 1.0.
101 (See accompanying file LICENSE_1_0.txt or copy at
102 http://www.boost.org/LICENSE_1_0.txt)
106 import std.range.primitives;
112 static ubyte[][] validStrings =
115 cast(ubyte[])"hello",
117 // First possible sequence of a certain length
118 [ 0x00 ], // U+00000000 one byte
119 [ 0xC2, 0x80 ], // U+00000080 two bytes
120 [ 0xE0, 0xA0, 0x80 ], // U+00000800 three bytes
121 [ 0xF0, 0x90, 0x80, 0x80 ], // U+00010000 four bytes
123 // Last possible sequence of a certain length
124 [ 0x7F ], // U+0000007F one byte
125 [ 0xDF, 0xBF ], // U+000007FF two bytes
126 [ 0xEF, 0xBF, 0xBF ], // U+0000FFFF three bytes
128 // Other boundary conditions
129 [ 0xED, 0x9F, 0xBF ],
130 // U+0000D7FF Last character before surrogates
131 [ 0xEE, 0x80, 0x80 ],
132 // U+0000E000 First character after surrogates
133 [ 0xEF, 0xBF, 0xBD ],
134 // U+0000FFFD Unicode replacement character
135 [ 0xF4, 0x8F, 0xBF, 0xBF ],
136 // U+0010FFFF Very last character
138 // Non-character code points
139 /* NOTE: These are legal in UTF, and may be converted from
140 one UTF to another, however they do not represent Unicode
141 characters. These code points have been reserved by
142 Unicode as non-character code points. They are permissible
143 for data exchange within an application, but they are
144 not permitted to be used as characters. Since this module
145 deals with UTF, and not with Unicode per se, we choose to
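147 [ 0xDF, 0xBE ], // U+000007FE (these bytes are not U+0000FFFE, which would be EF BF BE)
148 [ 0xDF, 0xBF ], // U+000007FF (these bytes are not U+0000FFFF, which would be EF BF BF)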
151 static ubyte[][] invalidStrings =
153 // First possible sequence of a certain length, but greater
155 [ 0xF8, 0x88, 0x80, 0x80, 0x80 ], // U+00200000 five bytes
156 [ 0xFC, 0x84, 0x80, 0x80, 0x80, 0x80 ], // U+04000000 six bytes
158 // Last possible sequence of a certain length, but greater than U+10FFFF
159 [ 0xF7, 0xBF, 0xBF, 0xBF ], // U+001FFFFF four bytes
160 [ 0xFB, 0xBF, 0xBF, 0xBF, 0xBF ], // U+03FFFFFF five bytes
161 [ 0xFD, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF ], // U+7FFFFFFF six bytes
163 // Other boundary conditions
164 [ 0xF4, 0x90, 0x80, 0x80 ], // U+00110000
169 // Unexpected continuation bytes
172 [ 0x20, 0x80, 0x20 ],
173 [ 0x20, 0xBF, 0x20 ],
174 [ 0x80, 0x9F, 0xA0 ],
176 // Lonely start bytes
179 [ 0x20, 0xC0, 0x20 ],
180 [ 0x20, 0xCF, 0x20 ],
183 [ 0x20, 0xD0, 0x20 ],
184 [ 0x20, 0xDF, 0x20 ],
187 [ 0x20, 0xE0, 0x20 ],
188 [ 0x20, 0xEF, 0x20 ],
194 [ 0xF5 ], // If this were legal it would start a character > U+10FFFF
195 [ 0xF6 ], // If this were legal it would start a character > U+10FFFF
196 [ 0xF7 ], // If this were legal it would start a character > U+10FFFF
198 [ 0xEF, 0xBF ], // Three byte sequence with third byte missing
199 [ 0xF7, 0xBF, 0xBF ], // Four byte sequence with fourth byte missing
200 [ 0xEF, 0xBF, 0xF7, 0xBF, 0xBF ], // Concatenation of the above
211 [ 0x20, 0xF8, 0x20 ],
212 [ 0x20, 0xF9, 0x20 ],
213 [ 0x20, 0xFA, 0x20 ],
214 [ 0x20, 0xFB, 0x20 ],
215 [ 0x20, 0xFC, 0x20 ],
216 [ 0x20, 0xFD, 0x20 ],
217 [ 0x20, 0xFE, 0x20 ],
218 [ 0x20, 0xFF, 0x20 ],
220 // Overlong sequences, all representing U+002F
221 /* With a safe UTF-8 decoder, all of the following five overlong
222 representations of the ASCII character slash ("/") should be
223 rejected like a malformed UTF-8 sequence */
225 [ 0xE0, 0x80, 0xAF ],
226 [ 0xF0, 0x80, 0x80, 0xAF ],
227 [ 0xF8, 0x80, 0x80, 0x80, 0xAF ],
228 [ 0xFC, 0x80, 0x80, 0x80, 0x80, 0xAF ],
230 // Maximum overlong sequences
231 /* Below you see the highest Unicode value that still results in
232 an overlong sequence if represented with the given number of bytes.
233 This is a boundary test for safe UTF-8 decoders. All five
234 characters should be rejected like malformed UTF-8 sequences. */
235 [ 0xC1, 0xBF ], // U+0000007F
236 [ 0xE0, 0x9F, 0xBF ], // U+000007FF
237 [ 0xF0, 0x8F, 0xBF, 0xBF ], // U+0000FFFF
238 [ 0xF8, 0x87, 0xBF, 0xBF, 0xBF ], // U+001FFFFF
239 [ 0xFC, 0x83, 0xBF, 0xBF, 0xBF, 0xBF ], // U+03FFFFFF
241 // Overlong representation of the NUL character
242 /* The following five sequences should also be rejected like malformed
243 UTF-8 sequences and should not be treated like the ASCII NUL
246 [ 0xE0, 0x80, 0x80 ],
247 [ 0xF0, 0x80, 0x80, 0x80 ],
248 [ 0xF8, 0x80, 0x80, 0x80, 0x80 ],
249 [ 0xFC, 0x80, 0x80, 0x80, 0x80, 0x80 ],
251 // Illegal code positions
252 /* The following UTF-8 sequences should be rejected like malformed
253 sequences, because they never represent valid ISO 10646 characters
254 and a UTF-8 decoder that accepts them might introduce security
255 problems comparable to overlong UTF-8 sequences. */
256 [ 0xED, 0xA0, 0x80 ], // U+D800
257 [ 0xED, 0xAD, 0xBF ], // U+DB7F
258 [ 0xED, 0xAE, 0x80 ], // U+DB80
259 [ 0xED, 0xAF, 0xBF ], // U+DBFF
260 [ 0xED, 0xB0, 0x80 ], // U+DC00
261 [ 0xED, 0xBE, 0x80 ], // U+DF80
262 [ 0xED, 0xBF, 0xBF ], // U+DFFF
265 static string[] sanitizedStrings =
268 "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD"," \uFFFD ",
269 " \uFFFD ","\uFFFD\uFFFD\uFFFD","\uFFFD","\uFFFD"," \uFFFD "," \uFFFD ",
270 "\uFFFD","\uFFFD"," \uFFFD "," \uFFFD ","\uFFFD","\uFFFD"," \uFFFD ",
271 " \uFFFD ","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
272 "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD\uFFFD","\uFFFD","\uFFFD",
273 "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD"," \uFFFD ",
274 " \uFFFD "," \uFFFD "," \uFFFD "," \uFFFD "," \uFFFD "," \uFFFD ",
275 " \uFFFD ","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
276 "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
277 "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
280 // Make sure everything that should be valid, is
281 foreach (a;validStrings)
283 string s = cast(string) a;
284 assert(isValid(s),"Failed to validate: "~makeReadable(s));
287 // Make sure everything that shouldn't be valid, isn't
288 foreach (a;invalidStrings)
290 string s = cast(string) a;
291 assert(!isValid(s),"Incorrectly validated: "~makeReadable(s));
294 // Make sure we can sanitize everything bad
295 assert(invalidStrings.length == sanitizedStrings.length);
296 for (int i=0; i<invalidStrings.length; ++i)
298 string s = cast(string) invalidStrings[i];
299 string t = sanitize(s);
301 assert(t == sanitizedStrings[i]);
302 ubyte[] u = cast(ubyte[]) t;
306 // Make sure all transcodings work in both directions, using both forward
307 // and reverse iteration
308 foreach (a; validStrings)
310 string s = cast(string) a;
345 transcodeReverse(s,ws);
347 transcodeReverse(ws,s2);
350 transcodeReverse(s,ds);
352 transcodeReverse(ds,s2);
355 transcodeReverse(ws,s);
357 transcodeReverse(s,ws2);
360 transcodeReverse(ws,ds);
362 transcodeReverse(ds,ws2);
365 transcodeReverse(ds,s);
367 transcodeReverse(s,ds2);
370 transcodeReverse(ds,ws);
372 transcodeReverse(ws,ds2);
376 // Make sure the non-UTF encodings work too
378 auto s = "\u20AC100";
381 assert(t == cast(Windows1252Char[])[0x80, '1', '0', '0']);
387 assert(cast(string) v == "?100");
390 assert(cast(string) w == "?100");
391 s = "\u017Dlu\u0165ou\u010Dk\u00FD k\u016F\u0148";
394 assert(x == cast(Latin2Char[])[0xae, 'l', 'u', 0xbb, 'o', 'u', 0xe8, 'k', 0xfd, ' ', 'k', 0xf9, 0xf2]);
397 assert(y == cast(Windows1250Char[])[0x8e, 'l', 'u', 0x9d, 'o', 'u', 0xe8, 'k', 0xfd, ' ', 'k', 0xf9, 0xf2]);
400 // Make sure we can count properly
402 assert(encodedLength!(char)('A') == 1);
403 assert(encodedLength!(char)('\u00E3') == 2);
404 assert(encodedLength!(char)('\u2028') == 3);
405 assert(encodedLength!(char)('\U0010FFF0') == 4);
406 assert(encodedLength!(wchar)('A') == 1);
407 assert(encodedLength!(wchar)('\U0010FFF0') == 2);
410 // Make sure we can write into mutable arrays
413 auto n = encode(cast(dchar)'\u00E3',buffer);
415 assert(buffer[0] == 0xC3);
416 assert(buffer[1] == 0xA3);
420 //=============================================================================
422 /** Special value returned by $(D safeDecode) when the input cannot be decoded (for example a truncated or malformed sequence). */
423 enum dchar INVALID_SEQUENCE = cast(dchar) 0xFFFFFFFF;
425 template EncoderFunctions()
427 // Various forms of read
429 template ReadFromString()
431 @property bool canRead() { return s.length != 0; } // true while s still holds at least one element
432 E peek() @safe pure @nogc nothrow { return s[0]; } // returns the first element of s and leaves s unchanged
433 E read() @safe pure @nogc nothrow { E t = s[0]; s = s[1..$]; return t; } // returns the first element of s and re-slices s to drop it
436 template ReverseReadFromString()
438 @property bool canRead() { return s.length != 0; } // true while s still holds at least one element
439 E peek() @safe pure @nogc nothrow { return s[$-1]; } // returns the last element of s and leaves s unchanged
440 E read() @safe pure @nogc nothrow { E t = s[$-1]; s = s[0..$-1]; return t; } // returns the last element of s and re-slices s to drop it (back-to-front reading)
443 // Various forms of Write
445 template WriteToString()
448 void write(E c) @safe pure nothrow { s ~= c; } // appends the code unit c to s
451 template WriteToArray()
453 void write(E c) @safe pure @nogc nothrow { array[0] = c; array = array[1..$]; } // stores c in the first slot, then re-slices array to begin just after it; no explicit length check is made here
456 template WriteToDelegate()
458 void write(E c) { dg(c); } // passes the code unit c to the delegate dg
461 // Functions we will export
463 template EncodeViaWrite()
465 mixin encodeViaWrite;
466 void encode(dchar c) { encodeViaWrite(c); } // thin wrapper: forwards c to the mixed-in encodeViaWrite
469 template SkipViaRead()
472 void skip() @safe pure @nogc nothrow { skipViaRead(); } // thin wrapper: forwards to skipViaRead
475 template DecodeViaRead()
478 dchar decode() @safe pure @nogc nothrow { return decodeViaRead(); } // thin wrapper: returns whatever decodeViaRead yields
481 template SafeDecodeViaRead()
483 mixin safeDecodeViaRead;
484 dchar safeDecode() @safe pure @nogc nothrow { return safeDecodeViaRead(); } // thin wrapper: returns whatever the mixed-in safeDecodeViaRead yields
487 template DecodeReverseViaRead()
489 mixin decodeReverseViaRead;
490 dchar decodeReverse() @safe pure @nogc nothrow { return decodeReverseViaRead(); } // thin wrapper: returns whatever the mixed-in decodeReverseViaRead yields
493 // Encoding to different destinations
495 template EncodeToString()
498 mixin EncodeViaWrite;
501 template EncodeToArray()
504 mixin EncodeViaWrite;
507 template EncodeToDelegate()
509 mixin WriteToDelegate;
510 mixin EncodeViaWrite;
513 // Decoding functions
515 template SkipFromString()
517 mixin ReadFromString;
521 template DecodeFromString()
523 mixin ReadFromString;
527 template SafeDecodeFromString()
529 mixin ReadFromString;
530 mixin SafeDecodeViaRead;
533 template DecodeReverseFromString()
535 mixin ReverseReadFromString;
536 mixin DecodeReverseViaRead;
539 //=========================================================================
541 // Below are the functions we will ultimately expose to the user
543 E[] encode(dchar c) @safe pure nothrow
545 mixin EncodeToString e;
550 void encode(dchar c, ref E[] array) @safe pure nothrow
552 mixin EncodeToArray e;
556 void encode(dchar c, void delegate(E) dg)
558 mixin EncodeToDelegate e;
562 void skip(ref const(E)[] s) @safe pure nothrow
564 mixin SkipFromString e;
568 dchar decode(S)(ref S s)
570 mixin DecodeFromString e;
574 dchar safeDecode(S)(ref S s)
576 mixin SafeDecodeFromString e;
577 return e.safeDecode();
580 dchar decodeReverse(ref const(E)[] s) @safe pure nothrow
582 mixin DecodeReverseFromString e;
583 return e.decodeReverse();
587 //=========================================================================
603 int opApply(scope int delegate(ref dchar) dg)
606 while (s.length != 0)
610 if (result != 0) break;
615 int opApply(scope int delegate(ref size_t, ref dchar) dg)
619 while (s.length != 0)
621 immutable len = s.length;
623 size_t j = i; // We don't want the delegate corrupting i
625 if (result != 0) break;
631 int opApplyReverse(scope int delegate(ref dchar) dg)
634 while (s.length != 0)
636 dchar c = decodeReverse(s);
638 if (result != 0) break;
643 int opApplyReverse(scope int delegate(ref size_t, ref dchar) dg)
646 while (s.length != 0)
648 dchar c = decodeReverse(s);
651 if (result != 0) break;
664 assert(isValidCodePoint(d));
671 int opApply(scope int delegate(ref E) dg)
677 if (result != 0) break;
682 int opApplyReverse(scope int delegate(ref E) dg)
685 foreach_reverse (E c;s)
688 if (result != 0) break;
694 //=============================================================================
696 template EncoderInstance(E)
698 static assert(false,"Cannot instantiate EncoderInstance for type "
702 private template GenericEncoder()
704 bool canEncode(dchar c) @safe pure @nogc nothrow
706 if (c < m_charMapStart || (c > m_charMapEnd && c < 0x100)) return true;
707 if (c >= 0xFFFD) return false;
710 while (idx < bstMap.length)
712 if (bstMap[idx][0] == c) return true;
713 idx = bstMap[idx][0] > c ? 2 * idx + 1 : 2 * idx + 2; // next BST index
719 bool isValidCodeUnit(E c) @safe pure @nogc nothrow
721 if (c < m_charMapStart || c > m_charMapEnd) return true;
722 return charMap[c-m_charMapStart] != 0xFFFD;
725 size_t encodedLength(dchar c) @safe pure @nogc nothrow
728 assert(canEncode(c));
735 void encodeViaWrite()(dchar c)
737 if (c < m_charMapStart || (c > m_charMapEnd && c < 0x100)) {}
738 else if (c >= 0xFFFD) { c = '?'; }
742 while (idx < bstMap.length)
744 if (bstMap[idx][0] == c)
746 write(cast(E) bstMap[idx][1]);
749 idx = bstMap[idx][0] > c ? 2 * idx + 1 : 2 * idx + 2; // next BST index
761 dchar decodeViaRead()()
764 return (c >= m_charMapStart && c <= m_charMapEnd) ? charMap[c-m_charMapStart] : c;
767 dchar safeDecodeViaRead()()
769 immutable E c = read();
770 immutable d = (c >= m_charMapStart && c <= m_charMapEnd) ? charMap[c-m_charMapStart] : c;
771 return d == 0xFFFD ? INVALID_SEQUENCE : d;
774 dchar decodeReverseViaRead()()
777 return (c >= m_charMapStart && c <= m_charMapEnd) ? charMap[c-m_charMapStart] : c;
780 @property EString replacementSequence() @safe pure @nogc nothrow
782 return cast(EString)("?");
785 mixin EncoderFunctions;
788 //=============================================================================
790 //=============================================================================
792 /** Defines various character sets. */
793 enum AsciiChar : ubyte { init }
795 alias AsciiString = immutable(AsciiChar)[];
797 template EncoderInstance(CharType : AsciiChar)
800 alias EString = AsciiString;
802 @property string encodingName() @safe pure nothrow @nogc
807 bool canEncode(dchar c) @safe pure nothrow @nogc
812 bool isValidCodeUnit(AsciiChar c) @safe pure nothrow @nogc
817 size_t encodedLength(dchar c) @safe pure nothrow @nogc
820 assert(canEncode(c));
827 void encodeX(Range)(dchar c, Range r)
829 if (!canEncode(c)) c = '?';
830 r.write(cast(AsciiChar) c);
833 void encodeViaWrite()(dchar c)
835 if (!canEncode(c)) c = '?';
836 write(cast(AsciiChar) c);
844 dchar decodeViaRead()()
849 dchar safeDecodeViaRead()()
851 immutable c = read();
852 return canEncode(c) ? c : INVALID_SEQUENCE;
855 dchar decodeReverseViaRead()()
860 @property EString replacementSequence() @safe pure nothrow @nogc
862 return cast(EString)("?");
865 mixin EncoderFunctions;
868 //=============================================================================
870 //=============================================================================
872 /** Defines a Latin1-encoded character. */
873 enum Latin1Char : ubyte { init }
875 Defines a Latin1-encoded string (as an array of $(D
876 immutable(Latin1Char))).
878 alias Latin1String = immutable(Latin1Char)[];
880 template EncoderInstance(CharType : Latin1Char)
882 alias E = Latin1Char;
883 alias EString = Latin1String;
885 @property string encodingName() @safe pure nothrow @nogc
890 bool canEncode(dchar c) @safe pure nothrow @nogc
895 bool isValidCodeUnit(Latin1Char c) @safe pure nothrow @nogc
900 size_t encodedLength(dchar c) @safe pure nothrow @nogc
903 assert(canEncode(c));
910 void encodeViaWrite()(dchar c)
912 if (!canEncode(c)) c = '?';
913 write(cast(Latin1Char) c);
921 dchar decodeViaRead()()
926 dchar safeDecodeViaRead()()
931 dchar decodeReverseViaRead()()
936 @property EString replacementSequence() @safe pure nothrow @nogc
938 return cast(EString)("?");
941 mixin EncoderFunctions;
944 //=============================================================================
946 //=============================================================================
948 /// Defines a Latin2-encoded character.
949 enum Latin2Char : ubyte { init }
952 * Defines a Latin2-encoded string (as an array of $(D
953 * immutable(Latin2Char))).
955 alias Latin2String = immutable(Latin2Char)[];
957 private template EncoderInstance(CharType : Latin2Char)
959 import std.typecons : Tuple, tuple;
961 alias E = Latin2Char;
962 alias EString = Latin2String;
964 @property string encodingName() @safe pure nothrow @nogc
969 private static immutable dchar m_charMapStart = 0xa1; // lowest code unit that the mixed-in GenericEncoder translates through charMap; lower code units decode to themselves
970 private static immutable dchar m_charMapEnd = 0xff; // highest code unit that is translated through charMap
972 private immutable wstring charMap = // decoding table: entry i is the code point for code unit (m_charMapStart + i); 8 entries per row
973 "\u0104\u02D8\u0141\u00A4\u013D\u015A\u00A7\u00A8"~
974 "\u0160\u015E\u0164\u0179\u00AD\u017D\u017B\u00B0"~
975 "\u0105\u02DB\u0142\u00B4\u013E\u015B\u02C7\u00B8"~
976 "\u0161\u015F\u0165\u017A\u02DD\u017E\u017C\u0154"~
977 "\u00C1\u00C2\u0102\u00C4\u0139\u0106\u00C7\u010C"~
978 "\u00C9\u0118\u00CB\u011A\u00CD\u00CE\u010E\u0110"~
979 "\u0143\u0147\u00D3\u00D4\u0150\u00D6\u00D7\u0158"~
980 "\u016E\u00DA\u0170\u00DC\u00DD\u0162\u00DF\u0155"~
981 "\u00E1\u00E2\u0103\u00E4\u013A\u0107\u00E7\u010D"~
982 "\u00E9\u0119\u00EB\u011B\u00ED\u00EE\u010F\u0111"~
983 "\u0144\u0148\u00F3\u00F4\u0151\u00F6\u00F7\u0159"~
984 "\u016F\u00FA\u0171\u00FC\u00FD\u0163\u02D9";
986 private immutable Tuple!(wchar, char)[] bstMap = [
987 tuple('\u0148','\xF2'), tuple('\u00F3','\xF3'), tuple('\u0165','\xBB'),
988 tuple('\u00D3','\xD3'), tuple('\u010F','\xEF'), tuple('\u015B','\xB6'),
989 tuple('\u017C','\xBF'), tuple('\u00C1','\xC1'), tuple('\u00E1','\xE1'),
990 tuple('\u0103','\xE3'), tuple('\u013A','\xE5'), tuple('\u0155','\xE0'),
991 tuple('\u0161','\xB9'), tuple('\u0171','\xFB'), tuple('\u02D8','\xA2'),
992 tuple('\u00AD','\xAD'), tuple('\u00C9','\xC9'), tuple('\u00DA','\xDA'),
993 tuple('\u00E9','\xE9'), tuple('\u00FA','\xFA'), tuple('\u0107','\xE6'),
994 tuple('\u0119','\xEA'), tuple('\u0142','\xB3'), tuple('\u0151','\xF5'),
995 tuple('\u0159','\xF8'), tuple('\u015F','\xBA'), tuple('\u0163','\xFE'),
996 tuple('\u016F','\xF9'), tuple('\u017A','\xBC'), tuple('\u017E','\xBE'),
997 tuple('\u02DB','\xB2'), tuple('\u00A7','\xA7'), tuple('\u00B4','\xB4'),
998 tuple('\u00C4','\xC4'), tuple('\u00CD','\xCD'), tuple('\u00D6','\xD6'),
999 tuple('\u00DD','\xDD'), tuple('\u00E4','\xE4'), tuple('\u00ED','\xED'),
1000 tuple('\u00F6','\xF6'), tuple('\u00FD','\xFD'), tuple('\u0105','\xB1'),
1001 tuple('\u010D','\xE8'), tuple('\u0111','\xF0'), tuple('\u011B','\xEC'),
1002 tuple('\u013E','\xB5'), tuple('\u0144','\xF1'), tuple('\u0150','\xD5'),
1003 tuple('\u0154','\xC0'), tuple('\u0158','\xD8'), tuple('\u015A','\xA6'),
1004 tuple('\u015E','\xAA'), tuple('\u0160','\xA9'), tuple('\u0162','\xDE'),
1005 tuple('\u0164','\xAB'), tuple('\u016E','\xD9'), tuple('\u0170','\xDB'),
1006 tuple('\u0179','\xAC'), tuple('\u017B','\xAF'), tuple('\u017D','\xAE'),
1007 tuple('\u02C7','\xB7'), tuple('\u02D9','\xFF'), tuple('\u02DD','\xBD'),
1008 tuple('\u00A4','\xA4'), tuple('\u00A8','\xA8'), tuple('\u00B0','\xB0'),
1009 tuple('\u00B8','\xB8'), tuple('\u00C2','\xC2'), tuple('\u00C7','\xC7'),
1010 tuple('\u00CB','\xCB'), tuple('\u00CE','\xCE'), tuple('\u00D4','\xD4'),
1011 tuple('\u00D7','\xD7'), tuple('\u00DC','\xDC'), tuple('\u00DF','\xDF'),
1012 tuple('\u00E2','\xE2'), tuple('\u00E7','\xE7'), tuple('\u00EB','\xEB'),
1013 tuple('\u00EE','\xEE'), tuple('\u00F4','\xF4'), tuple('\u00F7','\xF7'),
1014 tuple('\u00FC','\xFC'), tuple('\u0102','\xC3'), tuple('\u0104','\xA1'),
1015 tuple('\u0106','\xC6'), tuple('\u010C','\xC8'), tuple('\u010E','\xCF'),
1016 tuple('\u0110','\xD0'), tuple('\u0118','\xCA'), tuple('\u011A','\xCC'),
1017 tuple('\u0139','\xC5'), tuple('\u013D','\xA5'), tuple('\u0141','\xA3'),
1018 tuple('\u0143','\xD1'), tuple('\u0147','\xD2')
1021 mixin GenericEncoder!();
1024 //=============================================================================
1026 //=============================================================================
1028 /// Defines a Windows1250-encoded character.
1029 enum Windows1250Char : ubyte { init }
1032 * Defines a Windows1250-encoded string (as an array of $(D
1033 * immutable(Windows1250Char))).
1035 alias Windows1250String = immutable(Windows1250Char)[];
1037 private template EncoderInstance(CharType : Windows1250Char)
1039 import std.typecons : Tuple, tuple;
1041 alias E = Windows1250Char;
1042 alias EString = Windows1250String;
1044 @property string encodingName() @safe pure nothrow @nogc
1046 return "windows-1250";
1049 private static immutable dchar m_charMapStart = 0x80; // lowest code unit that the mixed-in GenericEncoder translates through charMap; lower code units decode to themselves
1050 private static immutable dchar m_charMapEnd = 0xff; // highest code unit that is translated through charMap
1052 private immutable wstring charMap = // decoding table: entry i is the code point for code unit (m_charMapStart + i); \uFFFD entries are the ones GenericEncoder treats as invalid code units
1053 "\u20AC\uFFFD\u201A\uFFFD\u201E\u2026\u2020\u2021"~
1054 "\uFFFD\u2030\u0160\u2039\u015A\u0164\u017D\u0179"~
1055 "\uFFFD\u2018\u2019\u201C\u201D\u2022\u2013\u2014"~
1056 "\uFFFD\u2122\u0161\u203A\u015B\u0165\u017E\u017A"~
1057 "\u00A0\u02C7\u02D8\u0141\u00A4\u0104\u00A6\u00A7"~
1058 "\u00A8\u00A9\u015E\u00AB\u00AC\u00AD\u00AE\u017B"~
1059 "\u00B0\u00B1\u02DB\u0142\u00B4\u00B5\u00B6\u00B7"~
1060 "\u00B8\u0105\u015F\u00BB\u013D\u02DD\u013E\u017C"~
1061 "\u0154\u00C1\u00C2\u0102\u00C4\u0139\u0106\u00C7"~
1062 "\u010C\u00C9\u0118\u00CB\u011A\u00CD\u00CE\u010E"~
1063 "\u0110\u0143\u0147\u00D3\u00D4\u0150\u00D6\u00D7"~
1064 "\u0158\u016E\u00DA\u0170\u00DC\u00DD\u0162\u00DF"~
1065 "\u0155\u00E1\u00E2\u0103\u00E4\u013A\u0107\u00E7"~
1066 "\u010D\u00E9\u0119\u00EB\u011B\u00ED\u00EE\u010F"~
1067 "\u0111\u0144\u0148\u00F3\u00F4\u0151\u00F6\u00F7"~
1068 "\u0159\u016F\u00FA\u0171\u00FC\u00FD\u0163\u02D9";
1070 private immutable Tuple!(wchar, char)[] bstMap = [
1071 tuple('\u011A','\xCC'), tuple('\u00DC','\xDC'), tuple('\u0179','\x8F'),
1072 tuple('\u00B7','\xB7'), tuple('\u00FC','\xFC'), tuple('\u0158','\xD8'),
1073 tuple('\u201C','\x93'), tuple('\u00AC','\xAC'), tuple('\u00CB','\xCB'),
1074 tuple('\u00EB','\xEB'), tuple('\u010C','\xC8'), tuple('\u0143','\xD1'),
1075 tuple('\u0162','\xDE'), tuple('\u02D9','\xFF'), tuple('\u2039','\x8B'),
1076 tuple('\u00A7','\xA7'), tuple('\u00B1','\xB1'), tuple('\u00C2','\xC2'),
1077 tuple('\u00D4','\xD4'), tuple('\u00E2','\xE2'), tuple('\u00F4','\xF4'),
1078 tuple('\u0104','\xA5'), tuple('\u0110','\xD0'), tuple('\u013D','\xBC'),
1079 tuple('\u0150','\xD5'), tuple('\u015E','\xAA'), tuple('\u016E','\xD9'),
1080 tuple('\u017D','\x8E'), tuple('\u2014','\x97'), tuple('\u2021','\x87'),
1081 tuple('\u20AC','\x80'), tuple('\u00A4','\xA4'), tuple('\u00A9','\xA9'),
1082 tuple('\u00AE','\xAE'), tuple('\u00B5','\xB5'), tuple('\u00BB','\xBB'),
1083 tuple('\u00C7','\xC7'), tuple('\u00CE','\xCE'), tuple('\u00D7','\xD7'),
1084 tuple('\u00DF','\xDF'), tuple('\u00E7','\xE7'), tuple('\u00EE','\xEE'),
1085 tuple('\u00F7','\xF7'), tuple('\u0102','\xC3'), tuple('\u0106','\xC6'),
1086 tuple('\u010E','\xCF'), tuple('\u0118','\xCA'), tuple('\u0139','\xC5'),
1087 tuple('\u0141','\xA3'), tuple('\u0147','\xD2'), tuple('\u0154','\xC0'),
1088 tuple('\u015A','\x8C'), tuple('\u0160','\x8A'), tuple('\u0164','\x8D'),
1089 tuple('\u0170','\xDB'), tuple('\u017B','\xAF'), tuple('\u02C7','\xA1'),
1090 tuple('\u02DD','\xBD'), tuple('\u2019','\x92'), tuple('\u201E','\x84'),
1091 tuple('\u2026','\x85'), tuple('\u203A','\x9B'), tuple('\u2122','\x99'),
1092 tuple('\u00A0','\xA0'), tuple('\u00A6','\xA6'), tuple('\u00A8','\xA8'),
1093 tuple('\u00AB','\xAB'), tuple('\u00AD','\xAD'), tuple('\u00B0','\xB0'),
1094 tuple('\u00B4','\xB4'), tuple('\u00B6','\xB6'), tuple('\u00B8','\xB8'),
1095 tuple('\u00C1','\xC1'), tuple('\u00C4','\xC4'), tuple('\u00C9','\xC9'),
1096 tuple('\u00CD','\xCD'), tuple('\u00D3','\xD3'), tuple('\u00D6','\xD6'),
1097 tuple('\u00DA','\xDA'), tuple('\u00DD','\xDD'), tuple('\u00E1','\xE1'),
1098 tuple('\u00E4','\xE4'), tuple('\u00E9','\xE9'), tuple('\u00ED','\xED'),
1099 tuple('\u00F3','\xF3'), tuple('\u00F6','\xF6'), tuple('\u00FA','\xFA'),
1100 tuple('\u00FD','\xFD'), tuple('\u0103','\xE3'), tuple('\u0105','\xB9'),
1101 tuple('\u0107','\xE6'), tuple('\u010D','\xE8'), tuple('\u010F','\xEF'),
1102 tuple('\u0111','\xF0'), tuple('\u0119','\xEA'), tuple('\u011B','\xEC'),
1103 tuple('\u013A','\xE5'), tuple('\u013E','\xBE'), tuple('\u0142','\xB3'),
1104 tuple('\u0144','\xF1'), tuple('\u0148','\xF2'), tuple('\u0151','\xF5'),
1105 tuple('\u0155','\xE0'), tuple('\u0159','\xF8'), tuple('\u015B','\x9C'),
1106 tuple('\u015F','\xBA'), tuple('\u0161','\x9A'), tuple('\u0163','\xFE'),
1107 tuple('\u0165','\x9D'), tuple('\u016F','\xF9'), tuple('\u0171','\xFB'),
1108 tuple('\u017A','\x9F'), tuple('\u017C','\xBF'), tuple('\u017E','\x9E'),
1109 tuple('\u02D8','\xA2'), tuple('\u02DB','\xB2'), tuple('\u2013','\x96'),
1110 tuple('\u2018','\x91'), tuple('\u201A','\x82'), tuple('\u201D','\x94'),
1111 tuple('\u2020','\x86'), tuple('\u2022','\x95'), tuple('\u2030','\x89')
1114 mixin GenericEncoder!();
1117 //=============================================================================
1119 //=============================================================================
1121 /// Defines a Windows1252-encoded character.
1122 enum Windows1252Char : ubyte { init }
1125 * Defines a Windows1252-encoded string (as an array of $(D
1126 * immutable(Windows1252Char))).
1128 alias Windows1252String = immutable(Windows1252Char)[];
1130 template EncoderInstance(CharType : Windows1252Char)
1132 import std.typecons : Tuple, tuple;
1134 alias E = Windows1252Char;
1135 alias EString = Windows1252String;
1137 @property string encodingName() @safe pure nothrow @nogc
1139 return "windows-1252";
1142 private static immutable dchar m_charMapStart = 0x80; // lowest code unit that the mixed-in GenericEncoder translates through charMap; lower code units decode to themselves
1143 private static immutable dchar m_charMapEnd = 0x9f; // highest code unit that is translated through charMap; higher code units decode to themselves
1145 private immutable wstring charMap = // decoding table: entry i is the code point for code unit (m_charMapStart + i); \uFFFD entries are the ones GenericEncoder treats as invalid code units
1146 "\u20AC\uFFFD\u201A\u0192\u201E\u2026\u2020\u2021"~
1147 "\u02C6\u2030\u0160\u2039\u0152\uFFFD\u017D\uFFFD"~
1148 "\uFFFD\u2018\u2019\u201C\u201D\u2022\u2013\u2014"~
1149 "\u02DC\u2122\u0161\u203A\u0153\uFFFD\u017E\u0178";
1151 private immutable Tuple!(wchar, char)[] bstMap = [
1152 tuple('\u201C','\x93'), tuple('\u0192','\x83'), tuple('\u2039','\x8B'),
1153 tuple('\u0161','\x9A'), tuple('\u2014','\x97'), tuple('\u2021','\x87'),
1154 tuple('\u20AC','\x80'), tuple('\u0153','\x9C'), tuple('\u017D','\x8E'),
1155 tuple('\u02DC','\x98'), tuple('\u2019','\x92'), tuple('\u201E','\x84'),
1156 tuple('\u2026','\x85'), tuple('\u203A','\x9B'), tuple('\u2122','\x99'),
1157 tuple('\u0152','\x8C'), tuple('\u0160','\x8A'), tuple('\u0178','\x9F'),
1158 tuple('\u017E','\x9E'), tuple('\u02C6','\x88'), tuple('\u2013','\x96'),
1159 tuple('\u2018','\x91'), tuple('\u201A','\x82'), tuple('\u201D','\x94'),
1160 tuple('\u2020','\x86'), tuple('\u2022','\x95'), tuple('\u2030','\x89')
1163 mixin GenericEncoder!();
1166 //=============================================================================
1168 //=============================================================================
1170 template EncoderInstance(CharType : char)
1173 alias EString = immutable(char)[];
1175 @property string encodingName() @safe pure nothrow @nogc
1180 bool canEncode(dchar c) @safe pure nothrow @nogc
1182 return isValidCodePoint(c);
1185 bool isValidCodeUnit(char c) @safe pure nothrow @nogc
1187 return (c < 0xC0 || (c >= 0xC2 && c < 0xF5));
1190 immutable ubyte[128] tailTable =
1192 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1193 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1194 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1195 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1196 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1197 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1198 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
1199 3,3,3,3,3,3,3,3,4,4,4,4,5,5,6,0,
1202 private int tails(char c) @safe pure nothrow @nogc
1209 return tailTable[c-0x80];
1212 size_t encodedLength(dchar c) @safe pure nothrow @nogc
1215 assert(canEncode(c));
1219 if (c < 0x80) return 1;
1220 if (c < 0x800) return 2;
1221 if (c < 0x10000) return 3;
1225 void encodeViaWrite()(dchar c)
1229 write(cast(char) c);
1233 write(cast(char)((c >> 6) + 0xC0));
1234 write(cast(char)((c & 0x3F) + 0x80));
1236 else if (c < 0x10000)
1238 write(cast(char)((c >> 12) + 0xE0));
1239 write(cast(char)(((c >> 6) & 0x3F) + 0x80));
1240 write(cast(char)((c & 0x3F) + 0x80));
1244 write(cast(char)((c >> 18) + 0xF0));
1245 write(cast(char)(((c >> 12) & 0x3F) + 0x80));
1246 write(cast(char)(((c >> 6) & 0x3F) + 0x80));
1247 write(cast(char)((c & 0x3F) + 0x80));
1251 void skipViaRead()()
1254 if (c < 0xC0) return;
1255 int n = tails(cast(char) c);
1256 for (size_t i=0; i<n; ++i)
1262 dchar decodeViaRead()()
1265 if (c < 0xC0) return c;
1266 int n = tails(cast(char) c);
1267 c &= (1 << (6 - n)) - 1;
1268 for (size_t i=0; i<n; ++i)
1270 c = (c << 6) + (read() & 0x3F);
1275 dchar safeDecodeViaRead()()
1278 if (c < 0x80) return c;
1279 int n = tails(cast(char) c);
1280 if (n == 0) return INVALID_SEQUENCE;
1282 if (!canRead) return INVALID_SEQUENCE;
1286 (c < 0xC2) // fail overlong 2-byte sequences
1287 || (c > 0xF4) // fail lead bytes 0xF5..0xFF (would exceed U+10FFFF or start 5-6-byte forms)
1288 || (c == 0xE0 && ((d & 0xE0) == 0x80)) // fail overlong 3-byte sequences
1289 || (c == 0xED && ((d & 0xE0) == 0xA0)) // fail surrogates
1290 || (c == 0xF0 && ((d & 0xF0) == 0x80)) // fail overlong 4-byte sequences
1291 || (c == 0xF4 && ((d & 0xF0) >= 0x90)) // fail code points > 0x10FFFF
1294 c &= (1 << (6 - n)) - 1;
1295 for (size_t i=0; i<n; ++i)
1297 if (!canRead) return INVALID_SEQUENCE;
1299 if ((d & 0xC0) != 0x80) return INVALID_SEQUENCE;
1300 c = (c << 6) + (read() & 0x3F);
1303 return err ? INVALID_SEQUENCE : c;
1306 dchar decodeReverseViaRead()()
1309 if (c < 0x80) return c;
1312 for (size_t i=0; i<4; ++i)
1316 size_t n = tails(cast(char) d);
1317 immutable mask = n == 0 ? 0x3F : (1 << (6 - n)) - 1;
1318 c += ((d & mask) << shift);
1324 @property EString replacementSequence() @safe pure nothrow @nogc
1329 mixin EncoderFunctions;
1332 //=============================================================================
1334 //=============================================================================
1336 template EncoderInstance(CharType : wchar)
1339 alias EString = immutable(wchar)[];
1341 @property string encodingName() @safe pure nothrow @nogc
1346 bool canEncode(dchar c) @safe pure nothrow @nogc
1348 return isValidCodePoint(c);
1351 bool isValidCodeUnit(wchar c) @safe pure nothrow @nogc
1356 size_t encodedLength(dchar c) @safe pure nothrow @nogc
1359 assert(canEncode(c));
1363 return (c < 0x10000) ? 1 : 2;
1366 void encodeViaWrite()(dchar c)
1370 write(cast(wchar) c);
1374 size_t n = c - 0x10000;
1375 write(cast(wchar)(0xD800 + (n >> 10)));
1376 write(cast(wchar)(0xDC00 + (n & 0x3FF)));
1380 void skipViaRead()()
1382 immutable c = read();
1383 if (c < 0xD800 || c >= 0xE000) return;
1387 dchar decodeViaRead()()
1390 if (c < 0xD800 || c >= 0xE000) return cast(dchar) c;
1394 return 0x10000 + (c << 10) + d;
1397 dchar safeDecodeViaRead()()
1400 if (c < 0xD800 || c >= 0xE000) return cast(dchar) c;
1401 if (c >= 0xDC00) return INVALID_SEQUENCE;
1402 if (!canRead) return INVALID_SEQUENCE;
1404 if (d < 0xDC00 || d >= 0xE000) return INVALID_SEQUENCE;
1408 return 0x10000 + (c << 10) + d;
1411 dchar decodeReverseViaRead()()
1414 if (c < 0xD800 || c >= 0xE000) return cast(dchar) c;
1418 return 0x10000 + (d << 10) + c;
1421 @property EString replacementSequence() @safe pure nothrow @nogc
1426 mixin EncoderFunctions;
1429 //=============================================================================
1431 //=============================================================================
1433 template EncoderInstance(CharType : dchar)
1436 alias EString = immutable(dchar)[];
1438 @property string encodingName() @safe pure nothrow @nogc
1443 bool canEncode(dchar c) @safe pure @nogc nothrow
1445 return isValidCodePoint(c);
1448 bool isValidCodeUnit(dchar c) @safe pure @nogc nothrow
1450 return isValidCodePoint(c);
1453 size_t encodedLength(dchar c) @safe pure @nogc nothrow
1456 assert(canEncode(c));
1463 void encodeViaWrite()(dchar c)
1468 void skipViaRead()()
1473 dchar decodeViaRead()()
1475 return cast(dchar) read();
1478 dchar safeDecodeViaRead()()
1480 immutable c = read();
1481 return isValidCodePoint(c) ? c : INVALID_SEQUENCE;
1484 dchar decodeReverseViaRead()()
1486 return cast(dchar) read();
1489 @property EString replacementSequence() @safe pure nothrow @nogc
1494 mixin EncoderFunctions;
1497 //=============================================================================
1498 // Below are forwarding functions which expose the function to the user
/**
Tests whether a value is a legal Unicode code point.

A value is accepted when it lies below U+110000 and is not one of the
surrogates U+D800 .. U+DFFF. Note that this includes the non-character code
points U+FFFE and U+FFFF, since these are valid code points (even though they
are not valid characters).

This function supersedes $(D std.utf.startsValidDchar()).

Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
WINDOWS-1252

Params:
    c = the code point to be tested

Returns: $(D true) if $(D c) is a valid code point, $(D false) otherwise.
 */
bool isValidCodePoint(dchar c) @safe pure nothrow @nogc
{
    // Everything below the surrogate block is valid without further checks.
    if (c < 0xD800)
        return true;

    // From here on the value must sit above the surrogates and still be
    // inside the Unicode code space.
    immutable aboveSurrogates = c >= 0xE000;
    immutable insideCodeSpace = c < 0x110000;
    return aboveSurrogates && insideCodeSpace;
}
/**
Returns the name of an encoding.

The type of encoding cannot be deduced. Therefore, it is necessary to
explicitly specify the encoding type.

Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
WINDOWS-1252

Returns: the name reported by the encoder instance for code unit type $(D T).
 */
@property string encodingName(T)()
{
    // Forward to the compile-time encoder selected by the code unit type.
    alias Encoder = EncoderInstance!(T);
    return Encoder.encodingName;
}
1538 assert(encodingName!(char) == "UTF-8");
1539 assert(encodingName!(wchar) == "UTF-16");
1540 assert(encodingName!(dchar) == "UTF-32");
1541 assert(encodingName!(AsciiChar) == "ASCII");
1542 assert(encodingName!(Latin1Char) == "ISO-8859-1");
1543 assert(encodingName!(Latin2Char) == "ISO-8859-2");
1544 assert(encodingName!(Windows1250Char) == "windows-1250");
1545 assert(encodingName!(Windows1252Char) == "windows-1252");
/**
Returns true iff it is possible to represent the specified codepoint
in the encoding selected by $(D E).

The type of encoding cannot be deduced. Therefore, it is necessary to
explicitly specify the encoding type.

Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
WINDOWS-1252

Params:
    c = the code point to be tested
 */
bool canEncode(E)(dchar c)
{
    // Forward to the compile-time encoder selected by the code unit type.
    alias Encoder = EncoderInstance!(E);
    return Encoder.canEncode(c);
}
1566 assert( canEncode!(Latin1Char)('A'));
1567 assert( canEncode!(Latin2Char)('A'));
1568 assert(!canEncode!(AsciiChar)('\u00A0'));
1569 assert( canEncode!(Latin1Char)('\u00A0'));
1570 assert( canEncode!(Latin2Char)('\u00A0'));
1571 assert( canEncode!(Windows1250Char)('\u20AC'));
1572 assert(!canEncode!(Windows1250Char)('\u20AD'));
1573 assert(!canEncode!(Windows1250Char)('\uFFFD'));
1574 assert( canEncode!(Windows1252Char)('\u20AC'));
1575 assert(!canEncode!(Windows1252Char)('\u20AD'));
1576 assert(!canEncode!(Windows1252Char)('\uFFFD'));
1577 assert(!canEncode!(char)(cast(dchar) 0x110000));
1580 /// How to check an entire string
1583 import std.algorithm.searching : find;
1584 import std.utf : byDchar;
1586 assert("The quick brown fox"
1588 .find!(x => !canEncode!AsciiChar(x))
/**
Returns true if the code unit is legal. For example, the byte 0x80 would
not be legal in ASCII, because ASCII code units must always be in the range
0x00 to 0x7F.

Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
WINDOWS-1252

Params:
    c = the code unit to be tested
 */
bool isValidCodeUnit(E)(E c)
{
    // The encoding is deduced from the static type of the code unit.
    alias Encoder = EncoderInstance!(E);
    return Encoder.isValidCodeUnit(c);
}
1609 @system pure unittest
1611 assert(!isValidCodeUnit(cast(char) 0xC0));
1612 assert(!isValidCodeUnit(cast(char) 0xFF));
1613 assert( isValidCodeUnit(cast(wchar) 0xD800));
1614 assert(!isValidCodeUnit(cast(dchar) 0xD800));
1615 assert(!isValidCodeUnit(cast(AsciiChar) 0xA0));
1616 assert( isValidCodeUnit(cast(Windows1250Char) 0x80));
1617 assert(!isValidCodeUnit(cast(Windows1250Char) 0x81));
1618 assert( isValidCodeUnit(cast(Windows1252Char) 0x80));
1619 assert(!isValidCodeUnit(cast(Windows1252Char) 0x81));
/**
Returns true if the string is encoded correctly.

This function supersedes std.utf.validate(), however note that this
function returns a bool indicating whether the input was valid or not,
whereas the older function would throw an exception.

Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
WINDOWS-1252

Params:
    s = the string to be tested
 */
bool isValid(E)(const(E)[] s)
{
    // The string is valid exactly when its valid prefix spans all of it.
    immutable validPrefix = validLength(s);
    return validPrefix == s.length;
}
1642 @system pure unittest
1644 assert( isValid("\u20AC100"));
1645 assert(!isValid(cast(char[3])[167, 133, 175]));
/**
Returns the length of the longest possible substring, starting from
the first code unit, which is validly encoded.

Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
WINDOWS-1252

Params:
    s = the string to be tested
 */
size_t validLength(E)(const(E)[] s)
{
    size_t accepted = 0;
    for (;;)
    {
        immutable remaining = s.length;
        if (remaining == 0)
            break;

        // safeDecode consumes code units from the front of s; stop at the
        // first sequence it rejects, without counting that sequence.
        if (EncoderInstance!(E).safeDecode(s) == INVALID_SEQUENCE)
            break;

        accepted += remaining - s.length;
    }
    return accepted;
}
1671 Sanitizes a string by replacing malformed code unit sequences with valid
1672 code unit sequences. The result is guaranteed to be valid for this encoding.
1674 If the input string is already valid, this function returns the original,
1675 otherwise it constructs a new string by replacing all illegal code unit
1676 sequences with the encoding's replacement character. Invalid sequences will
1677 be replaced with the Unicode replacement character (U+FFFD) if the
1678 character repertoire contains it, otherwise invalid sequences will be
1681 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
1685 s = the string to be sanitized
1687 immutable(E)[] sanitize(E)(immutable(E)[] s)
1689 size_t n = validLength(s);
1690 if (n == s.length) return s;
1692 auto repSeq = EncoderInstance!(E).replacementSequence;
1694 // Count how long the string needs to be.
1695 // Overestimating is not a problem
1696 size_t len = s.length;
1697 const(E)[] t = s[n..$];
1698 while (t.length != 0)
1700 immutable c = EncoderInstance!(E).safeDecode(t);
1701 assert(c == INVALID_SEQUENCE);
1702 len += repSeq.length;
1703 t = t[validLength(t)..$];
1707 E[] array = new E[len];
1708 array[0 .. n] = s[0 .. n];
1712 while (t.length != 0)
1714 immutable c = EncoderInstance!(E).safeDecode(t);
1715 assert(c == INVALID_SEQUENCE);
1716 array[offset .. offset+repSeq.length] = repSeq[];
1717 offset += repSeq.length;
1719 array[offset .. offset+n] = t[0 .. n];
1723 return cast(immutable(E)[])array[0 .. offset];
1727 @system pure unittest
1729 assert(sanitize("hello \xF0\x80world") == "hello \xEF\xBF\xBDworld");
/**
Returns the length of the first encoded sequence.

The input to this function MUST be validly encoded.
This is enforced by the function's in-contract.

Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
WINDOWS-1252

Params:
    s = the string to be sliced

Returns: the number of code units occupied by the first sequence of $(D s).
 */
size_t firstSequence(E)(const(E)[] s)
in
{
    assert(s.length != 0);
    // Decode from a copy of the slice so the check does not consume s.
    const(E)[] probe = s;
    assert(safeDecode(probe) != INVALID_SEQUENCE);
}
do
{
    // skip() advances s past one sequence; the shrinkage is its length.
    immutable lengthBefore = s.length;
    EncoderInstance!(E).skip(s);
    immutable lengthAfter = s.length;
    return lengthBefore - lengthAfter;
}
1759 @system pure unittest
1761 assert(firstSequence("\u20AC1000") == "\u20AC".length);
1762 assert(firstSequence("hel") == "h".length);
1766 Returns the length of the last encoded sequence.
1768 The input to this function MUST be validly encoded.
1769 This is enforced by the function's in-contract.
1771 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
1775 s = the string to be sliced
1777 size_t lastSequence(E)(const(E)[] s)
1780 assert(s.length != 0);
1786 EncoderInstance!(E).decodeReverse(s);
1787 return t.length - s.length;
1791 @system pure unittest
1793 assert(lastSequence("1000\u20AC") == "\u20AC".length);
1794 assert(lastSequence("hellö") == "ö".length);
1798 Returns the array index at which the (n+1)th code point begins.
1800 The input to this function MUST be validly encoded.
1801 This is enforced by the function's in-contract.
1804 This function supersedes std.utf.toUTFindex().
1806 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
1810 s = the string to be counted
1811 n = the current code point index
1813 ptrdiff_t index(E)(const(E)[] s,int n)
1822 for (size_t i=0; i<n; ++i) EncoderInstance!(E).skip(s);
1823 return t.length - s.length;
1827 @system pure unittest
1829 assert(index("\u20AC100",1) == 3);
1830 assert(index("hällo",2) == 3);
/**
Decodes a single code point.

This function removes one or more code units from the start of a string,
and returns the decoded code point which those code units represent.

The input to this function MUST be validly encoded.
This is enforced by the function's in-contract.

Supersedes:
This function supersedes std.utf.decode(), however, note that the
function codePoints() supersedes it more conveniently.

Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
WINDOWS-1252

Params:
    s = the string whose first code point is to be decoded
 */
dchar decode(S)(ref S s)
in
{
    assert(s.length != 0);
    // Check on a copy so that the caller's slice is left untouched here.
    auto probe = s;
    assert(safeDecode(probe) != INVALID_SEQUENCE);
}
do
{
    // The encoding is selected by the element type of the string.
    alias Encoder = EncoderInstance!(typeof(s[0]));
    return Encoder.decode(s);
}
1865 Decodes a single code point from the end of a string.
1867 This function removes one or more code units from the end of a string,
1868 and returns the decoded code point which those code units represent.
1870 The input to this function MUST be validly encoded.
1871 This is enforced by the function's in-contract.
1873 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
1877 s = the string whose last code point is to be decoded
1879 dchar decodeReverse(E)(ref const(E)[] s)
1882 assert(s.length != 0);
1887 return EncoderInstance!(E).decodeReverse(s);
/**
Decodes a single code point. The input does not have to be valid.

This function removes one or more code units from the start of a string,
and returns the decoded code point which those code units represent.

This function will accept an invalidly encoded string as input.
If an invalid sequence is found at the start of the string, this
function will remove it, and return the value INVALID_SEQUENCE.

Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
WINDOWS-1252

Params:
    s = the string whose first code point is to be decoded
 */
dchar safeDecode(S)(ref S s)
in
{
    assert(s.length != 0);
}
do
{
    // The encoding is selected by the element type of the string.
    alias Encoder = EncoderInstance!(typeof(s[0]));
    return Encoder.safeDecode(s);
}
/**
Returns the number of code units required to encode a single code point.

The input to this function MUST be a valid code point.
This is enforced by the function's in-contract.

The type of the output cannot be deduced. Therefore, it is necessary to
explicitly specify the encoding as a template parameter.

Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
WINDOWS-1252

Params:
    c = the code point to be encoded
 */
size_t encodedLength(E)(dchar c)
in
{
    assert(isValidCodePoint(c));
}
do
{
    // Forward to the compile-time encoder selected by the code unit type.
    alias Encoder = EncoderInstance!(E);
    return Encoder.encodedLength(c);
}
/**
Encodes a single code point.

This function encodes a single code point into one or more code units.
It returns a string containing those code units.

The input to this function MUST be a valid code point.
This is enforced by the function's in-contract.

The type of the output cannot be deduced. Therefore, it is necessary to
explicitly specify the encoding as a template parameter.

Supersedes:
This function supersedes std.utf.encode(), however, note that the
function codeUnits() supersedes it more conveniently.

Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
WINDOWS-1252

Params:
    c = the code point to be encoded
 */
E[] encode(E)(dchar c)
in
{
    assert(isValidCodePoint(c));
}
do
{
    // Forward to the compile-time encoder selected by the code unit type.
    alias Encoder = EncoderInstance!(E);
    return Encoder.encode(c);
}
/**
Encodes a single code point into an array.

This function encodes a single code point into one or more code units.
The code units are stored in a user-supplied fixed-size array,
which must be passed by reference.

The input to this function MUST be a valid code point.
This is enforced by the function's in-contract.

The type of the output cannot be deduced. Therefore, it is necessary to
explicitly specify the encoding as a template parameter.

Supersedes:
This function supersedes std.utf.encode(), however, note that the
function codeUnits() supersedes it more conveniently.

Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
WINDOWS-1252

Params:
    c = the code point to be encoded
    array = the destination array

Returns:
    the number of code units written to the array
 */
size_t encode(E)(dchar c, E[] array)
in
{
    assert(isValidCodePoint(c));
}
do
{
    // The encoder advances the slice it is handed, so give it a second
    // view of the destination and measure how much that view shrank.
    auto unwritten = array;
    EncoderInstance!(E).encode(c, unwritten);
    immutable written = array.length - unwritten.length;
    return written;
}
2013 Encodes $(D c) in units of type $(D E) and writes the result to the
2014 output range $(D R). Returns the number of $(D E)s written.
2016 size_t encode(E, R)(dchar c, auto ref R range)
2017 if (isNativeOutputRange!(R, E))
2019 static if (is(Unqual!E == char))
2023 put(range, cast(char) c);
2028 put(range, cast(char)(0xC0 | (c >> 6)));
2029 put(range, cast(char)(0x80 | (c & 0x3F)));
2034 put(range, cast(char)(0xE0 | (c >> 12)));
2035 put(range, cast(char)(0x80 | ((c >> 6) & 0x3F)));
2036 put(range, cast(char)(0x80 | (c & 0x3F)));
2041 put(range, cast(char)(0xF0 | (c >> 18)));
2042 put(range, cast(char)(0x80 | ((c >> 12) & 0x3F)));
2043 put(range, cast(char)(0x80 | ((c >> 6) & 0x3F)));
2044 put(range, cast(char)(0x80 | (c & 0x3F)));
2052 else static if (is(Unqual!E == wchar))
2056 range.put(cast(wchar) c);
2059 range.put(cast(wchar) ((((c - 0x10000) >> 10) & 0x3FF) + 0xD800));
2060 range.put(cast(wchar) (((c - 0x10000) & 0x3FF) + 0xDC00));
2063 else static if (is(Unqual!E == dchar))
2077 Appender!(char[]) r;
2078 assert(encode!(char)('T', r) == 1);
2079 assert(encode!(wchar)('T', r) == 1);
2080 assert(encode!(dchar)('T', r) == 1);
/**
Encodes a single code point to a delegate.

This function encodes a single code point into one or more code units.
The code units are passed one at a time to the supplied delegate.

The input to this function MUST be a valid code point.
This is enforced by the function's in-contract.

The type of the output cannot be deduced. Therefore, it is necessary to
explicitly specify the encoding as a template parameter.

Supersedes:
This function supersedes std.utf.encode(), however, note that the
function codeUnits() supersedes it more conveniently.

Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
WINDOWS-1252

Params:
    c = the code point to be encoded
    dg = the delegate to invoke for each code unit
 */
void encode(E)(dchar c, void delegate(E) dg)
in
{
    assert(isValidCodePoint(c));
}
do
{
    // Forward to the compile-time encoder selected by the code unit type.
    alias Encoder = EncoderInstance!(E);
    Encoder.encode(c, dg);
}
2117 Encodes the contents of $(D s) in units of type $(D Tgt), writing the result to an
2120 Returns: The number of $(D Tgt) elements written.
2122 Tgt = Element type of $(D range).
2124 range = Output range.
2126 size_t encode(Tgt, Src, R)(in Src[] s, R range)
2131 result += encode!(Tgt)(c, range);
2137 Returns a foreachable struct which can bidirectionally iterate over all
2138 code points in a string.
2140 The input to this function MUST be validly encoded.
2141 This is enforced by the function's in-contract.
2143 You can foreach either
2144 with or without an index. If an index is specified, it will be initialized
2145 at each iteration with the offset into the string at which the code point
2149 This function supersedes std.utf.decode().
2151 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
2155 s = the string to be decoded
2158 --------------------------------------------------------
2159 string s = "hello world";
2160 foreach (c;codePoints(s))
2162 // do something with c (which will always be a dchar)
2164 --------------------------------------------------------
2166 Note that, currently, foreach (c;codePoints(s)) is superior to foreach (c;s)
2167 in that the latter will fall over on encountering U+FFFF.
2169 CodePoints!(E) codePoints(E)(immutable(E)[] s)
2176 return CodePoints!(E)(s);
2184 foreach (c;codePoints(s))
/**
Returns a foreachable struct which can bidirectionally iterate over all
code units in a code point.

The input to this function MUST be a valid code point.
This is enforced by the function's in-contract.

The type of the output cannot be deduced. Therefore, it is necessary to
explicitly specify the encoding type in the template parameter.

Supersedes:
This function supersedes std.utf.encode().

Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
WINDOWS-1252

Params:
    c = the code point to be encoded
 */
CodeUnits!(E) codeUnits(E)(dchar c)
in
{
    assert(isValidCodePoint(c));
}
do
{
    // Construct the iterable wrapper for this code point.
    return typeof(return)(c);
}
2224 foreach (c;codeUnits!(char)(cast(dchar)'\u20AC'))
2228 assert(a.length == 3);
2229 assert(a[0] == 0xE2);
2230 assert(a[1] == 0x82);
2231 assert(a[2] == 0xAC);
2235 Convert a string from one encoding to another.
2238 This function supersedes std.utf.toUTF8(), std.utf.toUTF16() and
2240 (but note that to!() supersedes it more conveniently).
2242 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
2246 s = Source string. $(B Must) be validly encoded.
2247 This is enforced by the function's in-contract.
2248 r = Destination string
2253 void transcode(Src, Dst)(Src[] s, out Dst[] r)
2260 static if (is(Src == Dst) && is(Src == immutable))
2264 else static if (is(Unqual!Src == AsciiChar))
2266 transcode(cast(const(char)[])s, r);
2270 static if (is(Unqual!Dst == wchar))
2272 immutable minReservePlace = 2;
2274 else static if (is(Unqual!Dst == dchar))
2276 immutable minReservePlace = 1;
2280 immutable minReservePlace = 6;
2283 auto buffer = new Unqual!Dst[s.length];
2284 auto tmpBuffer = buffer;
2286 while (s.length != 0)
2288 if (tmpBuffer.length < minReservePlace)
2290 size_t prevLength = buffer.length;
2291 buffer.length += s.length + minReservePlace;
2292 tmpBuffer = buffer[prevLength - tmpBuffer.length .. $];
2294 EncoderInstance!(Unqual!Dst).encode(decode(s), tmpBuffer);
2297 r = cast(Dst[]) buffer[0 .. buffer.length - tmpBuffer.length];
2302 @system pure unittest
2305 // transcode from UTF-8 to UTF-16
2306 transcode("hello world",ws);
2307 assert(ws == "hello world"w);
2310 // transcode from UTF-16 to ISO-8859-1
2312 assert(ws == "hello world");
2315 @system pure unittest
2320 import std.conv : to;
2322 string asciiCharString = to!string(iota(0, 128, 1));
2324 alias Types = AliasSeq!(string, Latin1String, Latin2String, AsciiString,
2325 Windows1250String, Windows1252String, dstring, wstring);
2332 transcode(asciiCharString, sStr);
2333 transcode(sStr, dStr);
2334 transcode(dStr, str);
2335 assert(asciiCharString == str);
2339 string czechChars = "Příliš žluťoučký kůň úpěl ďábelské ódy.";
2340 alias Types = AliasSeq!(string, dstring, wstring);
2347 transcode(czechChars, sStr);
2348 transcode(sStr, dStr);
2349 transcode(dStr, str);
2350 assert(czechChars == str);
2355 @system unittest // mutable/const input/output
2357 import std.meta : AliasSeq;
2359 foreach (O; AliasSeq!(Latin1Char, const Latin1Char, immutable Latin1Char))
2363 char[] mutableInput = "äbc".dup;
2364 transcode(mutableInput, output);
2365 assert(output == [0xE4, 'b', 'c']);
2367 const char[] constInput = "öbc";
2368 transcode(constInput, output);
2369 assert(output == [0xF6, 'b', 'c']);
2371 immutable char[] immutInput = "übc";
2372 transcode(immutInput, output);
2373 assert(output == [0xFC, 'b', 'c']);
2376 // Make sure that const/mutable input is copied.
2377 foreach (C; AliasSeq!(char, const char))
2379 C[] input = "foo".dup;
2381 transcode(input, output);
2382 assert(input == output);
2383 assert(input !is output);
2386 // But immutable input should not be copied.
2387 string input = "foo";
2389 transcode(input, output);
2390 assert(input is output);
2393 //=============================================================================
/** The base class for exceptions thrown by this module */
class EncodingException : Exception
{
    /**
     * Constructs the exception.
     *
     * The `file` and `line` defaults are evaluated at the call site and
     * forwarded to `Exception`, so the exception reports where it was
     * created. Forwarding only `msg` would make every instance report the
     * location of this constructor instead.
     *
     * Params:
     *     msg  = description of the error
     *     file = source file of the throw site (defaults to the caller's file)
     *     line = source line of the throw site (defaults to the caller's line)
     *     next = optional chained exception
     */
    this(string msg, string file = __FILE__, size_t line = __LINE__,
         Throwable next = null) @safe pure
    {
        super(msg, file, line, next);
    }
}
/**
 * Subclass of EncodingException. Its constructor is private, so instances
 * can only be created from inside this module.
 */
class UnrecognizedEncodingException : EncodingException
{
    private this(string msg) @safe pure
    {
        super(msg);
    }
}
2403 /** Abstract base class of all encoding schemes */
2404 abstract class EncodingScheme
2406 import std.uni : toLower;
2409 * Registers a subclass of EncodingScheme.
2411 * This function allows user-defined subclasses of EncodingScheme to
2412 * be declared in other modules.
2415 * Klass = The subclass of EncodingScheme to register.
2418 * ----------------------------------------------
2419 * class Amiga1251 : EncodingScheme
2421 * shared static this()
2423 * EncodingScheme.register!Amiga1251;
2426 * ----------------------------------------------
2428 static void register(Klass:EncodingScheme)()
2430 scope scheme = new Klass();
2431 foreach (encodingName;scheme.names())
2433 supported[toLower(encodingName)] = () => new Klass();
2437 deprecated("Please pass the EncodingScheme subclass as template argument instead.")
2438 static void register(string className)
2440 auto scheme = cast(EncodingScheme) ClassInfo.find(className).create();
2442 throw new EncodingException("Unable to create class "~className);
2443 foreach (encodingName;scheme.names())
2445 supportedFactories[toLower(encodingName)] = className;
2450 * Obtains a subclass of EncodingScheme which is capable of encoding
2451 * and decoding the named encoding scheme.
2453 * This function is only aware of EncodingSchemes which have been
2454 * registered with the register() function.
2457 * ---------------------------------------------------
2458 * auto scheme = EncodingScheme.create("Amiga-1251");
2459 * ---------------------------------------------------
2461 static EncodingScheme create(string encodingName)
2463 static bool registerDefaultEncodings()
2465 EncodingScheme.register!EncodingSchemeASCII;
2466 EncodingScheme.register!EncodingSchemeLatin1;
2467 EncodingScheme.register!EncodingSchemeLatin2;
2468 EncodingScheme.register!EncodingSchemeWindows1250;
2469 EncodingScheme.register!EncodingSchemeWindows1252;
2470 EncodingScheme.register!EncodingSchemeUtf8;
2471 EncodingScheme.register!EncodingSchemeUtf16Native;
2472 EncodingScheme.register!EncodingSchemeUtf32Native;
2476 static shared bool initialized;
2477 import std.concurrency : initOnce;
2478 initOnce!initialized(registerDefaultEncodings());
2479 encodingName = toLower(encodingName);
2481 if (auto p = encodingName in supported)
2484 auto p = encodingName in supportedFactories;
2486 throw new EncodingException("Unrecognized Encoding: "~encodingName);
2487 string className = *p;
2488 auto scheme = cast(EncodingScheme) ClassInfo.find(className).create();
2489 if (scheme is null) throw new EncodingException("Unable to create class "~className);
2496 * Returns the standard name of the encoding scheme
2498 abstract override string toString();
2501 * Returns an array of all known names for this encoding scheme
2503 abstract string[] names();
2506 * Returns true if the character c can be represented
2507 * in this encoding scheme.
2509 abstract bool canEncode(dchar c);
2512 * Returns the number of ubytes required to encode this code point.
2514 * The input to this function MUST be a valid code point.
2517 * c = the code point to be encoded
2520 * the number of ubytes required.
2522 abstract size_t encodedLength(dchar c);
2525 * Encodes a single code point into a user-supplied, fixed-size buffer.
2527 * This function encodes a single code point into one or more ubytes.
2528 * The supplied buffer must be code unit aligned.
2529 * (For example, UTF-16LE or UTF-16BE must be wchar-aligned,
2530 * UTF-32LE or UTF-32BE must be dchar-aligned, etc.)
2532 * The input to this function MUST be a valid code point.
2535 * c = the code point to be encoded
2536 * buffer = the destination array
2539 * the number of ubytes written.
2541 abstract size_t encode(dchar c, ubyte[] buffer);
2544 * Decodes a single code point.
2546 * This function removes one or more ubytes from the start of an array,
2547 * and returns the decoded code point which those ubytes represent.
2549 * The input to this function MUST be validly encoded.
2552 * s = the array whose first code point is to be decoded
2554 abstract dchar decode(ref const(ubyte)[] s);
2557 * Decodes a single code point. The input does not have to be valid.
2559 * This function removes one or more ubytes from the start of an array,
2560 * and returns the decoded code point which those ubytes represent.
2562 * This function will accept an invalidly encoded array as input.
2563 * If an invalid sequence is found at the start of the string, this
2564 * function will remove it, and return the value INVALID_SEQUENCE.
2567 * s = the array whose first code point is to be decoded
2569 abstract dchar safeDecode(ref const(ubyte)[] s);
2572 * Returns the sequence of ubytes to be used to represent
2573 * any character which cannot be represented in the encoding scheme.
2575 * Normally this will be a representation of some substitution
2576 * character, such as U+FFFD or '?'.
2578 abstract @property immutable(ubyte)[] replacementSequence();
2582 * Returns true if the array is encoded correctly
2585 * s = the array to be tested
2587 bool isValid(const(ubyte)[] s)
2589 while (s.length != 0)
2591 if (safeDecode(s) == INVALID_SEQUENCE)
2598 * Returns the length of the longest possible substring, starting from
2599 * the first element, which is validly encoded.
2602 * s = the array to be tested
2604 size_t validLength()(const(ubyte)[] s)
2606 const(ubyte)[] r = s;
2607 const(ubyte)[] t = s;
2608 while (s.length != 0)
2610 if (safeDecode(s) == INVALID_SEQUENCE) break;
2613 return r.length - t.length;
2617 * Sanitizes an array by replacing malformed ubyte sequences with valid
2618 * ubyte sequences. The result is guaranteed to be valid for this
2621 * If the input array is already valid, this function returns the
2622 * original, otherwise it constructs a new array by replacing all illegal
2623 * sequences with the encoding scheme's replacement sequence.
2626 * s = the string to be sanitized
2628 immutable(ubyte)[] sanitize()(immutable(ubyte)[] s)
2630 auto n = validLength(s);
2631 if (n == s.length) return s;
2633 auto repSeq = replacementSequence;
2635 // Count how long the string needs to be.
2636 // Overestimating is not a problem
2637 auto len = s.length;
2638 const(ubyte)[] t = s[n..$];
2639 while (t.length != 0)
2641 immutable c = safeDecode(t);
2642 assert(c == INVALID_SEQUENCE);
2643 len += repSeq.length;
2644 t = t[validLength(t)..$];
2648 ubyte[] array = new ubyte[len];
2649 array[0 .. n] = s[0 .. n];
2653 while (t.length != 0)
2655 immutable c = safeDecode(t);
2656 assert(c == INVALID_SEQUENCE);
2657 array[offset .. offset+repSeq.length] = repSeq[];
2658 offset += repSeq.length;
2660 array[offset .. offset+n] = t[0 .. n];
2664 return cast(immutable(ubyte)[])array[0 .. offset];
2668 * Returns the length of the first encoded sequence.
2670 * The input to this function MUST be validly encoded.
2671 * This is enforced by the function's in-contract.
2674 * s = the array to be sliced
2676 size_t firstSequence()(const(ubyte)[] s)
2679 assert(s.length != 0);
2680 const(ubyte)[] u = s;
2681 assert(safeDecode(u) != INVALID_SEQUENCE);
2685 const(ubyte)[] t = s;
2687 return t.length - s.length;
2691 * Returns the total number of code points encoded in a ubyte array.
2693 * The input to this function MUST be validly encoded.
2694 * This is enforced by the function's in-contract.
2697 * s = the string to be counted
2699 size_t count()(const(ubyte)[] s)
2707 while (s.length != 0)
2716 * Returns the array index at which the (n+1)th code point begins.
2718 * The input to this function MUST be validly encoded.
2719 * This is enforced by the function's in-contract.
2722 * s = the string to be counted
2723 * n = the current code point index
2725 ptrdiff_t index()(const(ubyte)[] s, size_t n)
2733 const(ubyte)[] t = s;
2734 for (size_t i=0; i<n; ++i) decode(s);
2735 return t.length - s.length;
2738 __gshared EncodingScheme function()[string] supported;
2739 __gshared string[string] supportedFactories;
2743 EncodingScheme to handle ASCII
2745 This scheme recognises the following names:
2758 class EncodingSchemeASCII : EncodingScheme
2760 /* // moved to std.internal.phobosinit
2761 shared static this()
2763 EncodingScheme.register("std.encoding.EncodingSchemeASCII");
2768 override string[] names() @safe pure nothrow
2786 override string toString() @safe pure nothrow @nogc
2791 override bool canEncode(dchar c) @safe pure nothrow @nogc
2793 return std.encoding.canEncode!(AsciiChar)(c);
2796 override size_t encodedLength(dchar c) @safe pure nothrow @nogc
2798 return std.encoding.encodedLength!(AsciiChar)(c);
2801 override size_t encode(dchar c, ubyte[] buffer) @safe pure nothrow @nogc
2803 auto r = cast(AsciiChar[]) buffer;
2804 return std.encoding.encode(c,r);
2807 override dchar decode(ref const(ubyte)[] s) @safe pure nothrow @nogc
2809 auto t = cast(const(AsciiChar)[]) s;
2810 dchar c = std.encoding.decode(t);
2811 s = s[$-t.length..$];
2815 override dchar safeDecode(ref const(ubyte)[] s) @safe pure nothrow @nogc
2817 auto t = cast(const(AsciiChar)[]) s;
2818 dchar c = std.encoding.safeDecode(t);
2819 s = s[$-t.length..$];
2823 override @property immutable(ubyte)[] replacementSequence() @safe pure nothrow @nogc
2825 return cast(immutable(ubyte)[])"?";
2831 EncodingScheme to handle Latin-1
2833 This scheme recognises the following names:
// Run-time scheme for ISO-8859-1. Each operation forwards to the compile-time
// std.encoding templates for Latin1Char, viewing the caller's ubyte buffers
// as Latin1Char arrays.
2844 class EncodingSchemeLatin1 : EncodingScheme
// The static constructor below is commented out; per its own note the
// registration was moved to std.internal.phobosinit.
2846 /* // moved to std.internal.phobosinit
2847 shared static this()
2849 EncodingScheme.register("std.encoding.EncodingSchemeLatin1");
// Names recognised for this scheme.
2854 override string[] names() @safe pure nothrow
// String form of the scheme.
2870 override string toString() @safe pure nothrow @nogc
2872 return "ISO-8859-1";
// Whether c is encodable; delegates to std.encoding.canEncode for Latin1Char.
2875 override bool canEncode(dchar c) @safe pure nothrow @nogc
2877 return std.encoding.canEncode!(Latin1Char)(c);
// Number of Latin1Char code units needed for c.
2880 override size_t encodedLength(dchar c) @safe pure nothrow @nogc
2882 return std.encoding.encodedLength!(Latin1Char)(c);
// Encodes c into buffer and returns whatever std.encoding.encode reports.
2885 override size_t encode(dchar c, ubyte[] buffer) @safe pure nothrow @nogc
2887 auto r = cast(Latin1Char[]) buffer;
2888 return std.encoding.encode(c,r);
// Decodes one code point from the front of s and advances s past it.
2891 override dchar decode(ref const(ubyte)[] s) @safe pure nothrow @nogc
// t is a typed view of s; the decoder shortens t by what it reads.
2893 auto t = cast(const(Latin1Char)[]) s;
2894 dchar c = std.encoding.decode(t);
// Keep only the last t.length bytes of s, i.e. the part not yet decoded.
2895 s = s[$-t.length..$];
// Same shape as decode, but goes through std.encoding.safeDecode
// (presumably the variant that tolerates malformed input -- see its docs).
2899 override dchar safeDecode(ref const(ubyte)[] s) @safe pure nothrow @nogc
2901 auto t = cast(const(Latin1Char)[]) s;
2902 dchar c = std.encoding.safeDecode(t);
2903 s = s[$-t.length..$];
// Replacement byte sequence for this scheme: the single byte '?'.
2907 override @property immutable(ubyte)[] replacementSequence() @safe pure nothrow @nogc
2909 return cast(immutable(ubyte)[])"?";
2915 EncodingScheme to handle Latin-2
2917 This scheme recognises the following names:
// Run-time scheme for ISO-8859-2. Each operation forwards to the compile-time
// std.encoding templates for Latin2Char, viewing the caller's ubyte buffers
// as Latin2Char arrays.
2924 class EncodingSchemeLatin2 : EncodingScheme
// The static constructor below is commented out; per its own note the
// registration was moved to std.internal.phobosinit.
2926 /* // moved to std.internal.phobosinit
2927 shared static this()
2929 EncodingScheme.register("std.encoding.EncodingSchemeLatin2");
// Names recognised for this scheme.
2934 override string[] names() @safe pure nothrow
// String form of the scheme.
2946 override string toString() @safe pure nothrow @nogc
2948 return "ISO-8859-2";
// Whether c is encodable; delegates to std.encoding.canEncode for Latin2Char.
2951 override bool canEncode(dchar c) @safe pure nothrow @nogc
2953 return std.encoding.canEncode!(Latin2Char)(c);
// Number of Latin2Char code units needed for c.
2956 override size_t encodedLength(dchar c) @safe pure nothrow @nogc
2958 return std.encoding.encodedLength!(Latin2Char)(c);
// Encodes c into buffer and returns whatever std.encoding.encode reports.
2961 override size_t encode(dchar c, ubyte[] buffer) @safe pure nothrow @nogc
2963 auto r = cast(Latin2Char[]) buffer;
2964 return std.encoding.encode(c,r);
// Decodes one code point from the front of s and advances s past it.
2967 override dchar decode(ref const(ubyte)[] s) @safe pure nothrow @nogc
// t is a typed view of s; the decoder shortens t by what it reads.
2969 auto t = cast(const(Latin2Char)[]) s;
2970 dchar c = std.encoding.decode(t);
// Keep only the last t.length bytes of s, i.e. the part not yet decoded.
2971 s = s[$-t.length..$];
// Same shape as decode, but goes through std.encoding.safeDecode
// (presumably the variant that tolerates malformed input -- see its docs).
2975 override dchar safeDecode(ref const(ubyte)[] s) @safe pure nothrow @nogc
2977 auto t = cast(const(Latin2Char)[]) s;
2978 dchar c = std.encoding.safeDecode(t);
2979 s = s[$-t.length..$];
// Replacement byte sequence for this scheme: the single byte '?'.
2983 override @property immutable(ubyte)[] replacementSequence() @safe pure nothrow @nogc
2985 return cast(immutable(ubyte)[])"?";
2991 EncodingScheme to handle Windows-1250
2993 This scheme recognises the following names:
// Run-time scheme for windows-1250. Each operation forwards to the
// compile-time std.encoding templates for Windows1250Char, viewing the
// caller's ubyte buffers as Windows1250Char arrays.
2996 class EncodingSchemeWindows1250 : EncodingScheme
// The static constructor below is commented out; per its own note the
// registration was moved to std.internal.phobosinit.
2998 /* // moved to std.internal.phobosinit
2999 shared static this()
3001 EncodingScheme.register("std.encoding.EncodingSchemeWindows1250");
// Names recognised for this scheme.
3006 override string[] names() @safe pure nothrow
// String form of the scheme.
3014 override string toString() @safe pure nothrow @nogc
3016 return "windows-1250";
// Whether c is encodable; delegates to std.encoding.canEncode.
3019 override bool canEncode(dchar c) @safe pure nothrow @nogc
3021 return std.encoding.canEncode!(Windows1250Char)(c);
// Number of Windows1250Char code units needed for c.
3024 override size_t encodedLength(dchar c) @safe pure nothrow @nogc
3026 return std.encoding.encodedLength!(Windows1250Char)(c);
// Encodes c into buffer and returns whatever std.encoding.encode reports.
3029 override size_t encode(dchar c, ubyte[] buffer) @safe pure nothrow @nogc
3031 auto r = cast(Windows1250Char[]) buffer;
3032 return std.encoding.encode(c,r);
// Decodes one code point from the front of s and advances s past it.
3035 override dchar decode(ref const(ubyte)[] s) @safe pure nothrow @nogc
// t is a typed view of s; the decoder shortens t by what it reads.
3037 auto t = cast(const(Windows1250Char)[]) s;
3038 dchar c = std.encoding.decode(t);
// Keep only the last t.length bytes of s, i.e. the part not yet decoded.
3039 s = s[$-t.length..$];
// Same shape as decode, but goes through std.encoding.safeDecode
// (presumably the variant that tolerates malformed input -- see its docs).
3043 override dchar safeDecode(ref const(ubyte)[] s) @safe pure nothrow @nogc
3045 auto t = cast(const(Windows1250Char)[]) s;
3046 dchar c = std.encoding.safeDecode(t);
3047 s = s[$-t.length..$];
// Replacement byte sequence for this scheme: the single byte '?'.
3051 override @property immutable(ubyte)[] replacementSequence() @safe pure nothrow @nogc
3053 return cast(immutable(ubyte)[])"?";
3059 EncodingScheme to handle Windows-1252
3061 This scheme recognises the following names:
// Run-time scheme for windows-1252. Each operation forwards to the
// compile-time std.encoding templates for Windows1252Char, viewing the
// caller's ubyte buffers as Windows1252Char arrays.
3064 class EncodingSchemeWindows1252 : EncodingScheme
// The static constructor below is commented out; per its own note the
// registration was moved to std.internal.phobosinit.
3066 /* // moved to std.internal.phobosinit
3067 shared static this()
3069 EncodingScheme.register("std.encoding.EncodingSchemeWindows1252");
// Names recognised for this scheme.
3074 override string[] names() @safe pure nothrow
// String form of the scheme.
3082 override string toString() @safe pure nothrow @nogc
3084 return "windows-1252";
// Whether c is encodable; delegates to std.encoding.canEncode.
3087 override bool canEncode(dchar c) @safe pure nothrow @nogc
3089 return std.encoding.canEncode!(Windows1252Char)(c);
// Number of Windows1252Char code units needed for c.
3092 override size_t encodedLength(dchar c) @safe pure nothrow @nogc
3094 return std.encoding.encodedLength!(Windows1252Char)(c);
// Encodes c into buffer and returns whatever std.encoding.encode reports.
3097 override size_t encode(dchar c, ubyte[] buffer) @safe pure nothrow @nogc
3099 auto r = cast(Windows1252Char[]) buffer;
3100 return std.encoding.encode(c,r);
// Decodes one code point from the front of s and advances s past it.
3103 override dchar decode(ref const(ubyte)[] s) @safe pure nothrow @nogc
// t is a typed view of s; the decoder shortens t by what it reads.
3105 auto t = cast(const(Windows1252Char)[]) s;
3106 dchar c = std.encoding.decode(t);
// Keep only the last t.length bytes of s, i.e. the part not yet decoded.
3107 s = s[$-t.length..$];
// Same shape as decode, but goes through std.encoding.safeDecode
// (presumably the variant that tolerates malformed input -- see its docs).
3111 override dchar safeDecode(ref const(ubyte)[] s) @safe pure nothrow @nogc
3113 auto t = cast(const(Windows1252Char)[]) s;
3114 dchar c = std.encoding.safeDecode(t);
3115 s = s[$-t.length..$];
// Replacement byte sequence for this scheme: the single byte '?'.
3119 override @property immutable(ubyte)[] replacementSequence() @safe pure nothrow @nogc
3121 return cast(immutable(ubyte)[])"?";
3127 EncodingScheme to handle UTF-8
3129 This scheme recognises the following names:
// Run-time scheme for UTF-8. Each operation forwards to the compile-time
// std.encoding templates for char, viewing the caller's ubyte buffers as
// char arrays (one byte per code unit, so lengths need no scaling).
3132 class EncodingSchemeUtf8 : EncodingScheme
// The static constructor below is commented out; per its own note the
// registration was moved to std.internal.phobosinit.
3134 /* // moved to std.internal.phobosinit
3135 shared static this()
3137 EncodingScheme.register("std.encoding.EncodingSchemeUtf8");
// Names recognised for this scheme.
3142 override string[] names() @safe pure nothrow
// String form of the scheme.
3150 override string toString() @safe pure nothrow @nogc
// Whether c is encodable; delegates to std.encoding.canEncode for char.
3155 override bool canEncode(dchar c) @safe pure nothrow @nogc
3157 return std.encoding.canEncode!(char)(c);
// Number of char code units needed for c.
3160 override size_t encodedLength(dchar c) @safe pure nothrow @nogc
3162 return std.encoding.encodedLength!(char)(c);
// Encodes c into buffer and returns whatever std.encoding.encode reports.
3165 override size_t encode(dchar c, ubyte[] buffer) @safe pure nothrow @nogc
3167 auto r = cast(char[]) buffer;
3168 return std.encoding.encode(c,r);
// Decodes one code point from the front of s and advances s past it.
3171 override dchar decode(ref const(ubyte)[] s) @safe pure nothrow @nogc
// t is a char view of s; the decoder shortens t by what it reads.
3173 auto t = cast(const(char)[]) s;
3174 dchar c = std.encoding.decode(t);
// Keep only the last t.length bytes of s, i.e. the part not yet decoded.
3175 s = s[$-t.length..$];
// Same shape as decode, but goes through std.encoding.safeDecode
// (presumably the variant that tolerates malformed input -- see its docs).
3179 override dchar safeDecode(ref const(ubyte)[] s) @safe pure nothrow @nogc
3181 auto t = cast(const(char)[]) s;
3182 dchar c = std.encoding.safeDecode(t);
3183 s = s[$-t.length..$];
// Replacement byte sequence for this scheme: the bytes of the char-string
// literal "\uFFFD" (U+FFFD REPLACEMENT CHARACTER in UTF-8).
3187 override @property immutable(ubyte)[] replacementSequence() @safe pure nothrow @nogc
3189 return cast(immutable(ubyte)[])"\uFFFD";
3195 EncodingScheme to handle UTF-16 in native byte order
3197 This scheme recognises the following names:
3198 "UTF-16LE" (little-endian architecture only)
3199 "UTF-16BE" (big-endian architecture only)
// Run-time scheme for UTF-16 in the host's byte order. Byte buffers are
// reinterpreted as wchar arrays, so counts coming back from the wchar-based
// std.encoding templates are scaled by wchar.sizeof where bytes are needed.
3201 class EncodingSchemeUtf16Native : EncodingScheme
// The static constructor below is commented out; per its own note the
// registration was moved to std.internal.phobosinit.
3203 /* // moved to std.internal.phobosinit
3204 shared static this()
3206 EncodingScheme.register("std.encoding.EncodingSchemeUtf16Native");
// NAME is chosen at compile time from the target's endianness.
3211 version (LittleEndian) { enum string NAME = "UTF-16LE"; }
3212 version (BigEndian) { enum string NAME = "UTF-16BE"; }
// Names recognised for this scheme.
3214 override string[] names() @safe pure nothrow
// String form of the scheme.
3219 override string toString() @safe pure nothrow @nogc
// Whether c is encodable; delegates to std.encoding.canEncode for wchar.
3224 override bool canEncode(dchar c) @safe pure nothrow @nogc
3226 return std.encoding.canEncode!(wchar)(c);
// Number of wchar code units (not bytes) needed for c.
3229 override size_t encodedLength(dchar c) @safe pure nothrow @nogc
3231 return std.encoding.encodedLength!(wchar)(c);
// Encodes c into buffer; the result of std.encoding.encode is multiplied by
// wchar.sizeof, converting a code-unit count into a byte count.
3234 override size_t encode(dchar c, ubyte[] buffer) @safe pure nothrow @nogc
3236 auto r = cast(wchar[]) buffer;
3237 return wchar.sizeof * std.encoding.encode(c,r);
// Decodes one code point from the front of s and advances s past it.
3240 override dchar decode(ref const(ubyte)[] s) @safe pure nothrow @nogc
// The byte length must be even, i.e. a whole number of wchar code units.
3243 assert((s.length & 1) == 0);
// t is a wchar view of s; the decoder shortens t by what it reads.
3247 auto t = cast(const(wchar)[]) s;
3248 dchar c = std.encoding.decode(t);
// t.length counts code units, so scale to bytes before re-slicing s to the
// part not yet decoded.
3249 s = s[$-t.length * wchar.sizeof..$];
// Same shape as decode, but goes through std.encoding.safeDecode
// (presumably the variant that tolerates malformed input -- see its docs).
3253 override dchar safeDecode(ref const(ubyte)[] s) @safe pure nothrow @nogc
// The byte length must be even, i.e. a whole number of wchar code units.
3256 assert((s.length & 1) == 0);
3260 auto t = cast(const(wchar)[]) s;
3261 dchar c = std.encoding.safeDecode(t);
3262 s = s[$-t.length * wchar.sizeof..$];
// Replacement byte sequence for this scheme: the bytes of the wstring
// literal "\uFFFD"w (U+FFFD as UTF-16).
3266 override @property immutable(ubyte)[] replacementSequence() @safe pure nothrow @nogc
3268 return cast(immutable(ubyte)[])"\uFFFD"w;
// Test: decoding one code point through the run-time UTF-16 scheme consumes
// exactly one two-byte code unit. The sample holds the three code units
// 0x019A, 0x019B, 0x019C laid out in the host's byte order.
3274 version (LittleEndian)
3276 auto efrom = EncodingScheme.create("utf-16le");
3277 ubyte[6] sample = [154,1, 155,1, 156,1];
// Big-endian counterpart of the two lines above.
3281 auto efrom = EncodingScheme.create("utf-16be");
3282 ubyte[6] sample = [1,154, 1,155, 1,156];
// safeDecode takes its argument by ref and advances it, so hand it a slice.
3284 const(ubyte)[] ub = cast(const(ubyte)[])sample;
3285 dchar dc = efrom.safeDecode(ub);
// 6 bytes minus one decoded 2-byte unit leaves 4.
3287 assert(ub.length == 4);
3291 EncodingScheme to handle UTF-32 in native byte order
3293 This scheme recognises the following names:
3294 "UTF-32LE" (little-endian architecture only)
3295 "UTF-32BE" (big-endian architecture only)
// Run-time scheme for UTF-32 in the host's byte order. Byte buffers are
// reinterpreted as dchar arrays, so counts coming back from the dchar-based
// std.encoding templates are scaled by dchar.sizeof where bytes are needed.
3297 class EncodingSchemeUtf32Native : EncodingScheme
// The static constructor below is commented out; per its own note the
// registration was moved to std.internal.phobosinit.
3299 /* // moved to std.internal.phobosinit
3300 shared static this()
3302 EncodingScheme.register("std.encoding.EncodingSchemeUtf32Native");
// NAME is chosen at compile time from the target's endianness.
3307 version (LittleEndian) { enum string NAME = "UTF-32LE"; }
3308 version (BigEndian) { enum string NAME = "UTF-32BE"; }
// Names recognised for this scheme.
3310 override string[] names() @safe pure nothrow
// String form of the scheme.
3315 override string toString() @safe pure nothrow @nogc
// Whether c is encodable; delegates to std.encoding.canEncode for dchar.
3320 override bool canEncode(dchar c) @safe pure nothrow @nogc
3322 return std.encoding.canEncode!(dchar)(c);
// Number of dchar code units (not bytes) needed for c.
3325 override size_t encodedLength(dchar c) @safe pure nothrow @nogc
3327 return std.encoding.encodedLength!(dchar)(c);
// Encodes c into buffer; the result of std.encoding.encode is multiplied by
// dchar.sizeof, converting a code-unit count into a byte count.
3330 override size_t encode(dchar c, ubyte[] buffer) @safe pure nothrow @nogc
3332 auto r = cast(dchar[]) buffer;
3333 return dchar.sizeof * std.encoding.encode(c,r);
// Decodes one code point from the front of s and advances s past it.
3336 override dchar decode(ref const(ubyte)[] s) @safe pure nothrow @nogc
// The byte length must be a multiple of four, i.e. whole dchar code units.
3339 assert((s.length & 3) == 0);
// t is a dchar view of s; the decoder shortens t by what it reads.
3343 auto t = cast(const(dchar)[]) s;
3344 dchar c = std.encoding.decode(t);
// t.length counts code units, so scale to bytes before re-slicing s to the
// part not yet decoded.
3345 s = s[$-t.length * dchar.sizeof..$];
// Same shape as decode, but goes through std.encoding.safeDecode
// (presumably the variant that tolerates malformed input -- see its docs).
3349 override dchar safeDecode(ref const(ubyte)[] s) @safe pure nothrow @nogc
// The byte length must be a multiple of four, i.e. whole dchar code units.
3352 assert((s.length & 3) == 0);
3356 auto t = cast(const(dchar)[]) s;
3357 dchar c = std.encoding.safeDecode(t);
3358 s = s[$-t.length * dchar.sizeof..$];
// Replacement byte sequence for this scheme: the bytes of the dstring
// literal "\uFFFD"d (U+FFFD as UTF-32).
3362 override @property immutable(ubyte)[] replacementSequence() @safe pure nothrow @nogc
3364 return cast(immutable(ubyte)[])"\uFFFD"d;
// Test: decoding one code point through the run-time UTF-32 scheme consumes
// exactly one four-byte code unit. The sample holds the three code units
// 0x0000019A, 0x0000019B, 0x0000019C laid out in the host's byte order.
3370 version (LittleEndian)
3372 auto efrom = EncodingScheme.create("utf-32le");
3373 ubyte[12] sample = [154,1,0,0, 155,1,0,0, 156,1,0,0];
// Big-endian counterpart of the two lines above.
3377 auto efrom = EncodingScheme.create("utf-32be");
3378 ubyte[12] sample = [0,0,1,154, 0,0,1,155, 0,0,1,156];
// safeDecode takes its argument by ref and advances it, so hand it a slice.
3380 const(ubyte)[] ub = cast(const(ubyte)[])sample;
3381 dchar dc = efrom.safeDecode(ub);
// 12 bytes minus one decoded 4-byte unit leaves 8.
3383 assert(ub.length == 8);
3386 //=============================================================================
// Transcodes s from Src to Dst code units into r by walking the input
// back-to-front (presumably a test helper that exercises the reverse
// iteration paths -- confirm against its callers).
3392 void transcodeReverse(Src,Dst)(immutable(Src)[] s, out immutable(Dst)[] r)
// Identical source and destination types are handled as a separate case.
3394 static if (is(Src == Dst))
// AsciiChar input is re-dispatched as char by casting s to string.
3398 else static if (is(Src == AsciiChar))
3400 transcodeReverse!(char,Dst)(cast(string) s,r);
// General case: visit code points last-to-first, and the Dst code units of
// each code point last-to-first as well.
3404 foreach_reverse (d;codePoints(s))
3406 foreach_reverse (c;codeUnits!(Dst)(d))
// Builds a printable rendering of a UTF-8 string. Code units in the range
// [0x20, 0x80) take the first branch (presumably appended unchanged); any
// other unit is written as hexadecimal digits, most significant first.
3414 string makeReadable(string s)
3419 if (c >= 0x20 && c < 0x80)
// c >> 4 exposes the high nibble; toHexDigit masks its argument to four bits.
3426 r ~= toHexDigit(c >> 4);
// Builds a printable rendering of a UTF-16 string. Code units in the range
// [0x20, 0x80) take the first branch (presumably appended unchanged); any
// other unit is written as hexadecimal digits, most significant first.
3434 string makeReadable(wstring s)
3439 if (c >= 0x20 && c < 0x80)
// Nibbles at bits 12, 8 and 4, in that order; toHexDigit masks each shifted
// value to its low four bits.
3446 r ~= toHexDigit(c >> 12);
3447 r ~= toHexDigit(c >> 8);
3448 r ~= toHexDigit(c >> 4);
// Builds a printable rendering of a UTF-32 string. Code points in the range
// [0x20, 0x80) take the first branch (presumably appended unchanged); others
// are written in hexadecimal, using a shorter form below 0x10000 and a
// longer form otherwise.
3456 string makeReadable(dstring s)
3459 foreach (dchar c; s)
3461 if (c >= 0x20 && c < 0x80)
// Below 0x10000: nibbles at bits 12, 8 and 4, most significant first.
3465 else if (c < 0x10000)
3468 r ~= toHexDigit(c >> 12);
3469 r ~= toHexDigit(c >> 8);
3470 r ~= toHexDigit(c >> 4);
// 0x10000 and above: start at bit 20 so the upper nibbles are included.
3476 r ~= toHexDigit(c >> 20);
3477 r ~= toHexDigit(c >> 16);
3478 r ~= toHexDigit(c >> 12);
3479 r ~= toHexDigit(c >> 8);
3480 r ~= toHexDigit(c >> 4);
// Returns the upper-case hexadecimal digit for the low four bits of n.
// Higher bits are ignored, so callers can pass a shifted value without
// masking it first.
3488 char toHexDigit(int n)
3490 return "0123456789ABCDEF"[n & 0xF];
3494 /** Definitions of common Byte Order Marks.
3495 The elements of the $(D enum) can be used as indices into $(D bomTable) to get
3496 matching $(D BOMSeq).
// Members of BOM. The numeric values double as indices into bomTable:
// utf7 has five table entries (indices 3 through 7), which is why the next
// member, utf1, is numbered 8.
3500 none = 0, /// no BOM was found
3501 utf32be = 1, /// [0x00, 0x00, 0xFE, 0xFF]
3502 utf32le = 2, /// [0xFF, 0xFE, 0x00, 0x00]
3503 utf7 = 3, /* [0x2B, 0x2F, 0x76, 0x38]
3504 [0x2B, 0x2F, 0x76, 0x39],
3505 [0x2B, 0x2F, 0x76, 0x2B],
3506 [0x2B, 0x2F, 0x76, 0x2F],
3507 [0x2B, 0x2F, 0x76, 0x38, 0x2D]
3509 utf1 = 8, /// [0xF7, 0x64, 0x4C]
3510 utfebcdic = 9, /// [0xDD, 0x73, 0x66, 0x73]
3511 scsu = 10, /// [0x0E, 0xFE, 0xFF]
3512 bocu1 = 11, /// [0xFB, 0xEE, 0x28]
3513 gb18030 = 12, /// [0x84, 0x31, 0x95, 0x33]
3514 utf8 = 13, /// [0xEF, 0xBB, 0xBF]
3515 utf16be = 14, /// [0xFE, 0xFF]
3516 utf16le = 15 /// [0xFF, 0xFE]
3519 /// The type stored inside $(D bomTable).
// Named tuple: `schema` is the BOM kind, `sequence` the raw bytes of that mark.
3520 alias BOMSeq = Tuple!(BOM, "schema", ubyte[], "sequence");
3522 /** Mapping of a byte sequence to $(B Byte Order Mark (BOM))
// Table of known byte order marks, positioned so that a BOM member's value
// is a valid index (utf7 occupies indices 3 through 7).
// Order matters to getBOM, which tests entries front to back with
// startsWith: an entry that extends another one's bytes is listed earlier
// (utf32le before utf16le, the five-byte utf7 form before its four-byte
// prefix), presumably so that the longer mark is the one reported.
3524 immutable bomTable = [
// Index 0 is the "no BOM" entry and carries no bytes.
3525 BOMSeq(BOM.none, null),
3526 BOMSeq(BOM.utf32be, cast(ubyte[])([0x00, 0x00, 0xFE, 0xFF])),
3527 BOMSeq(BOM.utf32le, cast(ubyte[])([0xFF, 0xFE, 0x00, 0x00])),
3528 BOMSeq(BOM.utf7, cast(ubyte[])([0x2B, 0x2F, 0x76, 0x39])),
3529 BOMSeq(BOM.utf7, cast(ubyte[])([0x2B, 0x2F, 0x76, 0x2B])),
3530 BOMSeq(BOM.utf7, cast(ubyte[])([0x2B, 0x2F, 0x76, 0x2F])),
3531 BOMSeq(BOM.utf7, cast(ubyte[])([0x2B, 0x2F, 0x76, 0x38, 0x2D])),
3532 BOMSeq(BOM.utf7, cast(ubyte[])([0x2B, 0x2F, 0x76, 0x38])),
3533 BOMSeq(BOM.utf1, cast(ubyte[])([0xF7, 0x64, 0x4C])),
3534 BOMSeq(BOM.utfebcdic, cast(ubyte[])([0xDD, 0x73, 0x66, 0x73])),
3535 BOMSeq(BOM.scsu, cast(ubyte[])([0x0E, 0xFE, 0xFF])),
3536 BOMSeq(BOM.bocu1, cast(ubyte[])([0xFB, 0xEE, 0x28])),
3537 BOMSeq(BOM.gb18030, cast(ubyte[])([0x84, 0x31, 0x95, 0x33])),
3538 BOMSeq(BOM.utf8, cast(ubyte[])([0xEF, 0xBB, 0xBF])),
3539 BOMSeq(BOM.utf16be, cast(ubyte[])([0xFE, 0xFF])),
3540 BOMSeq(BOM.utf16le, cast(ubyte[])([0xFF, 0xFE]))
3543 /** Returns a $(D BOMSeq) for a given $(D input).
3544 If no $(D BOM) is present the $(D BOMSeq) for $(D BOM.none) is
3545 returned. The $(D BOM) sequence at the beginning of the range will
3546 not be consumed from the passed range. If you pass a reference type
3547 range make sure that $(D save) creates a deep copy.
3550 input = The sequence to check for the $(D BOM)
3553 the found $(D BOMSeq) corresponding to the passed $(D input).
// Looks for a known byte order mark at the front of input.
// Accepts any forward range whose element type is ubyte once qualifiers
// are stripped.
3555 immutable(BOMSeq) getBOM(Range)(Range input)
3556 if (isForwardRange!Range && is(Unqual!(ElementType!Range) == ubyte))
3558 import std.algorithm.searching : startsWith;
// Index 0 is the BOM.none entry (null sequence), so scanning starts at 1.
3559 foreach (it; bomTable[1 .. $])
// Test a saved copy so the caller's range is not advanced by the comparison.
3561 if (startsWith(input.save, it.sequence))
// Test: a UTF-32 string that begins with U+FEFF, viewed as raw bytes, is
// reported as a UTF-32 BOM whose flavour follows the host's byte order.
3573 import std.format : format;
3575 auto ts = dchar(0x0000FEFF) ~ "Hello World"d;
3577 auto entry = getBOM(cast(ubyte[]) ts);
// Bytes 00 00 FE FF, i.e. most significant byte first.
3580 assert(entry.schema == BOM.utf32be, format("%s", entry.schema));
// Bytes FF FE 00 00, i.e. least significant byte first.
3584 assert(entry.schema == BOM.utf32le, format("%s", entry.schema));
// Test: every table entry, prefixed to some payload text, is recognised
// again (i is presumably the result of getBOM(s) -- confirm).
3590 import std.format : format;
3592 foreach (idx, it; bomTable)
3594 auto s = it[1] ~ cast(ubyte[])"hello world";
3596 assert(i[0] == bomTable[idx][0]);
// Outside the utf7 rows the table index equals the enum value, so the
// schema and the exact byte sequence can both be checked.
3598 if (idx < 4 || idx > 7) // get around the multiple utf7 bom's
3600 assert(i[0] == BOM.init + idx);
3601 assert(i[1] == it[1]);
// Test: getBOM works through the range interface (not only on arrays) and
// leaves the range it was given untouched.
// BOMInputRange wraps a ubyte array behind front/empty/save.
3608 struct BOMInputRange
3612 @property ubyte front()
3614 return this.arr.front;
3617 @property bool empty()
3619 return this.arr.empty;
// Advance by dropping the first element.
3624 this.arr = this.arr[1 .. $];
3627 @property typeof(this) save()
// Make sure the wrapper is seen as a range and not as an array.
3633 static assert( isInputRange!BOMInputRange);
3634 static assert(!isArray!BOMInputRange);
// Padding used to build inputs that do not start with a complete BOM.
3636 ubyte[] dummyEnd = [0,0,0,0];
3638 foreach (idx, it; bomTable[1 .. $])
// Positive case: the exact BOM bytes are detected ...
3641 auto ir = BOMInputRange(it.sequence.dup);
3643 auto b = getBOM(ir);
3644 assert(b.schema == it.schema);
// ... and the range still holds all of them afterwards.
3645 assert(ir.arr == it.sequence);
// Negative case: only the first BOM byte followed by four zero bytes.
3649 auto noBom = it.sequence[0 .. 1].dup ~ dummyEnd;
3650 size_t oldLen = noBom.length;
// Sanity check: the kept prefix (oldLen - 4 == 1 byte) is shorter than the BOM.
3651 assert(oldLen - 4 < it.sequence.length);
3653 auto ir = BOMInputRange(noBom.dup);
3654 auto b = getBOM(ir);
3655 assert(b.schema == BOM.none);
// The source array itself must be unchanged in length.
3656 assert(noBom.length == oldLen);
3661 /** Constant defining a fully decoded BOM */
// U+FEFF; its UTF-32 and UTF-16 byte forms appear in bomTable
// (e.g. [0x00, 0x00, 0xFE, 0xFF] and [0xFE, 0xFF]).
3662 enum dchar utfBOM = 0xfeff;