]> git.ipfire.org Git - thirdparty/gcc.git/blob - libphobos/src/std/encoding.d
Add D front-end, libphobos library, and D2 testsuite.
[thirdparty/gcc.git] / libphobos / src / std / encoding.d
1 // Written in the D programming language.
2
3 /**
4 Classes and functions for handling and transcoding between various encodings.
5
6 For cases where the _encoding is known at compile-time, functions are provided
7 for arbitrary _encoding and decoding of characters, arbitrary transcoding
8 between strings of different type, as well as validation and sanitization.
9
10 Encodings currently supported are UTF-8, UTF-16, UTF-32, ASCII, ISO-8859-1
11 (also known as LATIN-1), ISO-8859-2 (LATIN-2), WINDOWS-1250 and WINDOWS-1252.
12
13 $(SCRIPT inhibitQuickIndex = 1;)
14 $(BOOKTABLE,
15 $(TR $(TH Category) $(TH Functions))
16 $(TR $(TD Decode) $(TD
17 $(LREF codePoints)
18 $(LREF decode)
19 $(LREF decodeReverse)
20 $(LREF safeDecode)
21 ))
22 $(TR $(TD Conversion) $(TD
23 $(LREF codeUnits)
24 $(LREF sanitize)
25 $(LREF transcode)
26 ))
27 $(TR $(TD Classification) $(TD
28 $(LREF canEncode)
29 $(LREF isValid)
30 $(LREF isValidCodePoint)
31 $(LREF isValidCodeUnit)
32 ))
33 $(TR $(TD BOM) $(TD
34 $(LREF BOM)
35 $(LREF BOMSeq)
36 $(LREF getBOM)
37 $(LREF utfBOM)
38 ))
39 $(TR $(TD Length & Index) $(TD
40 $(LREF firstSequence)
41 $(LREF encodedLength)
42 $(LREF index)
43 $(LREF lastSequence)
44 $(LREF validLength)
45 ))
46 $(TR $(TD Encoding schemes) $(TD
47 $(LREF encodingName)
48 $(LREF EncodingScheme)
49 $(LREF EncodingSchemeASCII)
50 $(LREF EncodingSchemeLatin1)
51 $(LREF EncodingSchemeLatin2)
52 $(LREF EncodingSchemeUtf16Native)
53 $(LREF EncodingSchemeUtf32Native)
54 $(LREF EncodingSchemeUtf8)
55 $(LREF EncodingSchemeWindows1250)
56 $(LREF EncodingSchemeWindows1252)
57 ))
58 $(TR $(TD Representation) $(TD
59 $(LREF AsciiChar)
60 $(LREF AsciiString)
61 $(LREF Latin1Char)
62 $(LREF Latin1String)
63 $(LREF Latin2Char)
64 $(LREF Latin2String)
65 $(LREF Windows1250Char)
66 $(LREF Windows1250String)
67 $(LREF Windows1252Char)
68 $(LREF Windows1252String)
69 ))
70 $(TR $(TD Exceptions) $(TD
71 $(LREF INVALID_SEQUENCE)
72 $(LREF EncodingException)
73 ))
74 )
75
76 For cases where the _encoding is not known at compile-time, but is
77 known at run-time, the abstract class $(LREF EncodingScheme)
78 and its subclasses are provided. To construct a run-time encoder/decoder,
79 one does e.g.
80
81 ----------------------------------------------------
82 auto e = EncodingScheme.create("utf-8");
83 ----------------------------------------------------
84
85 This library supplies $(LREF EncodingScheme) subclasses for ASCII,
86 ISO-8859-1 (also known as LATIN-1), ISO-8859-2 (LATIN-2), WINDOWS-1250,
87 WINDOWS-1252, UTF-8, and (on little-endian architectures) UTF-16LE and
88 UTF-32LE; or (on big-endian architectures) UTF-16BE and UTF-32BE.
89
90 This library provides a mechanism whereby other modules may add $(LREF
91 EncodingScheme) subclasses for any other _encoding.
92
93 Copyright: Copyright Janice Caron 2008 - 2009.
94 License: $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
95 Authors: Janice Caron
96 Source: $(PHOBOSSRC std/_encoding.d)
97 */
98 /*
99 Copyright Janice Caron 2008 - 2009.
100 Distributed under the Boost Software License, Version 1.0.
101 (See accompanying file LICENSE_1_0.txt or copy at
102 http://www.boost.org/LICENSE_1_0.txt)
103 */
104 module std.encoding;
105
106 import std.range.primitives;
107 import std.traits;
108 import std.typecons;
109
// Exercises UTF-8 validation and sanitization, transcoding between the UTF
// encodings (iterating both forwards and in reverse), the single-byte
// encodings, and the encodedLength/encode primitives.
@system unittest
{
    static ubyte[][] validStrings =
    [
        // Plain ASCII
        cast(ubyte[])"hello",

        // First possible sequence of a certain length
        [ 0x00 ],                               // U+00000000   one byte
        [ 0xC2, 0x80 ],                         // U+00000080   two bytes
        [ 0xE0, 0xA0, 0x80 ],                   // U+00000800   three bytes
        [ 0xF0, 0x90, 0x80, 0x80 ],             // U+00010000   four bytes

        // Last possible sequence of a certain length
        [ 0x7F ],                               // U+0000007F   one byte
        [ 0xDF, 0xBF ],                         // U+000007FF   two bytes
        [ 0xEF, 0xBF, 0xBF ],                   // U+0000FFFF   three bytes

        // Other boundary conditions
        [ 0xED, 0x9F, 0xBF ],
                        // U+0000D7FF   Last character before surrogates
        [ 0xEE, 0x80, 0x80 ],
                        // U+0000E000   First character after surrogates
        [ 0xEF, 0xBF, 0xBD ],
                        // U+0000FFFD   Unicode replacement character
        [ 0xF4, 0x8F, 0xBF, 0xBF ],
                        // U+0010FFFF   Very last character

        // Non-character code points
        /*  NOTE: These are legal in UTF, and may be converted from
            one UTF to another, however they do not represent Unicode
            characters. These code points have been reserved by
            Unicode as non-character code points. They are permissible
            for data exchange within an application, but they are
            not permitted to be used as characters. Since this module
            deals with UTF, and not with Unicode per se, we choose to
            accept them here. */
        // These must be the three-byte forms: 0xDF 0xBE / 0xDF 0xBF would
        // encode U+07FE / U+07FF, which are ordinary characters.
        [ 0xEF, 0xBF, 0xBE ],                   // U+0000FFFE
        [ 0xEF, 0xBF, 0xBF ],                   // U+0000FFFF
    ];

    static ubyte[][] invalidStrings =
    [
        // First possible sequence of a certain length, but greater
        // than U+10FFFF
        [ 0xF8, 0x88, 0x80, 0x80, 0x80 ],       // U+00200000   five bytes
        [ 0xFC, 0x84, 0x80, 0x80, 0x80, 0x80 ], // U+04000000   six bytes

        // Last possible sequence of a certain length, but greater than U+10FFFF
        [ 0xF7, 0xBF, 0xBF, 0xBF ],             // U+001FFFFF   four bytes
        [ 0xFB, 0xBF, 0xBF, 0xBF, 0xBF ],       // U+03FFFFFF   five bytes
        [ 0xFD, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF ], // U+7FFFFFFF   six bytes

        // Other boundary conditions
        [ 0xF4, 0x90, 0x80, 0x80 ],             // U+00110000
                                                // First code
                                                // point after
                                                // last character

        // Unexpected continuation bytes
        [ 0x80 ],
        [ 0xBF ],
        [ 0x20, 0x80, 0x20 ],
        [ 0x20, 0xBF, 0x20 ],
        [ 0x80, 0x9F, 0xA0 ],

        // Lonely start bytes
        [ 0xC0 ],
        [ 0xCF ],
        [ 0x20, 0xC0, 0x20 ],
        [ 0x20, 0xCF, 0x20 ],
        [ 0xD0 ],
        [ 0xDF ],
        [ 0x20, 0xD0, 0x20 ],
        [ 0x20, 0xDF, 0x20 ],
        [ 0xE0 ],
        [ 0xEF ],
        [ 0x20, 0xE0, 0x20 ],
        [ 0x20, 0xEF, 0x20 ],
        [ 0xF0 ],
        [ 0xF1 ],
        [ 0xF2 ],
        [ 0xF3 ],
        [ 0xF4 ],
        [ 0xF5 ],   // If this were legal it would start a character > U+10FFFF
        [ 0xF6 ],   // If this were legal it would start a character > U+10FFFF
        [ 0xF7 ],   // If this were legal it would start a character > U+10FFFF

        [ 0xEF, 0xBF ],             // Three byte sequence with third byte missing
        [ 0xF7, 0xBF, 0xBF ],       // Four byte sequence with fourth byte missing
        [ 0xEF, 0xBF, 0xF7, 0xBF, 0xBF ],   // Concatenation of the above

        // Impossible bytes
        [ 0xF8 ],
        [ 0xF9 ],
        [ 0xFA ],
        [ 0xFB ],
        [ 0xFC ],
        [ 0xFD ],
        [ 0xFE ],
        [ 0xFF ],
        [ 0x20, 0xF8, 0x20 ],
        [ 0x20, 0xF9, 0x20 ],
        [ 0x20, 0xFA, 0x20 ],
        [ 0x20, 0xFB, 0x20 ],
        [ 0x20, 0xFC, 0x20 ],
        [ 0x20, 0xFD, 0x20 ],
        [ 0x20, 0xFE, 0x20 ],
        [ 0x20, 0xFF, 0x20 ],

        // Overlong sequences, all representing U+002F
        /*  With a safe UTF-8 decoder, all of the following five overlong
            representations of the ASCII character slash ("/") should be
            rejected like a malformed UTF-8 sequence */
        [ 0xC0, 0xAF ],
        [ 0xE0, 0x80, 0xAF ],
        [ 0xF0, 0x80, 0x80, 0xAF ],
        [ 0xF8, 0x80, 0x80, 0x80, 0xAF ],
        [ 0xFC, 0x80, 0x80, 0x80, 0x80, 0xAF ],

        // Maximum overlong sequences
        /*  Below you see the highest Unicode value that is still resulting in
            an overlong sequence if represented with the given number of bytes.
            This is a boundary test for safe UTF-8 decoders. All five
            characters should be rejected like malformed UTF-8 sequences. */
        [ 0xC1, 0xBF ],                         // U+0000007F
        [ 0xE0, 0x9F, 0xBF ],                   // U+000007FF
        [ 0xF0, 0x8F, 0xBF, 0xBF ],             // U+0000FFFF
        [ 0xF8, 0x87, 0xBF, 0xBF, 0xBF ],       // U+001FFFFF
        [ 0xFC, 0x83, 0xBF, 0xBF, 0xBF, 0xBF ], // U+03FFFFFF

        // Overlong representation of the NUL character
        /*  The following five sequences should also be rejected like malformed
            UTF-8 sequences and should not be treated like the ASCII NUL
            character. */
        [ 0xC0, 0x80 ],
        [ 0xE0, 0x80, 0x80 ],
        [ 0xF0, 0x80, 0x80, 0x80 ],
        [ 0xF8, 0x80, 0x80, 0x80, 0x80 ],
        [ 0xFC, 0x80, 0x80, 0x80, 0x80, 0x80 ],

        // Illegal code positions
        /*  The following UTF-8 sequences should be rejected like malformed
            sequences, because they never represent valid ISO 10646 characters
            and a UTF-8 decoder that accepts them might introduce security
            problems comparable to overlong UTF-8 sequences. */
        [ 0xED, 0xA0, 0x80 ],       // U+D800
        [ 0xED, 0xAD, 0xBF ],       // U+DB7F
        [ 0xED, 0xAE, 0x80 ],       // U+DB80
        [ 0xED, 0xAF, 0xBF ],       // U+DBFF
        [ 0xED, 0xB0, 0x80 ],       // U+DC00
        [ 0xED, 0xBE, 0x80 ],       // U+DF80
        [ 0xED, 0xBF, 0xBF ],       // U+DFFF
    ];

    // Expected result of sanitize() for each entry of invalidStrings, in the
    // same order; the two arrays must stay the same length.
    static string[] sanitizedStrings =
    [
        "\uFFFD","\uFFFD",
        "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD"," \uFFFD ",
        " \uFFFD ","\uFFFD\uFFFD\uFFFD","\uFFFD","\uFFFD"," \uFFFD "," \uFFFD ",
        "\uFFFD","\uFFFD"," \uFFFD "," \uFFFD ","\uFFFD","\uFFFD"," \uFFFD ",
        " \uFFFD ","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
        "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD\uFFFD","\uFFFD","\uFFFD",
        "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD"," \uFFFD ",
        " \uFFFD "," \uFFFD "," \uFFFD "," \uFFFD "," \uFFFD "," \uFFFD ",
        " \uFFFD ","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
        "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
        "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
    ];

    // Make sure everything that should be valid, is
    foreach (a;validStrings)
    {
        string s = cast(string) a;
        assert(isValid(s),"Failed to validate: "~makeReadable(s));
    }

    // Make sure everything that shouldn't be valid, isn't
    foreach (a;invalidStrings)
    {
        string s = cast(string) a;
        assert(!isValid(s),"Incorrectly validated: "~makeReadable(s));
    }

    // Make sure we can sanitize everything bad
    assert(invalidStrings.length == sanitizedStrings.length);
    foreach (i, bad; invalidStrings)
    {
        string s = cast(string) bad;
        string t = sanitize(s);
        assert(isValid(t));
        assert(t == sanitizedStrings[i]);
        // Sanitized output is valid by construction, so feed it to the
        // transcoding round-trips below as well.
        ubyte[] u = cast(ubyte[]) t;
        validStrings ~= u;
    }

    // Make sure all transcodings work in both directions, using both forward
    // and reverse iteration
    foreach (a; validStrings)
    {
        string s = cast(string) a;
        string s2;
        wstring ws, ws2;
        dstring ds, ds2;

        transcode(s,ws);
        assert(isValid(ws));
        transcode(ws,s2);
        assert(s == s2);

        transcode(s,ds);
        assert(isValid(ds));
        transcode(ds,s2);
        assert(s == s2);

        transcode(ws,s);
        assert(isValid(s));
        transcode(s,ws2);
        assert(ws == ws2);

        transcode(ws,ds);
        assert(isValid(ds));
        transcode(ds,ws2);
        assert(ws == ws2);

        transcode(ds,s);
        assert(isValid(s));
        transcode(s,ds2);
        assert(ds == ds2);

        transcode(ds,ws);
        assert(isValid(ws));
        transcode(ws,ds2);
        assert(ds == ds2);

        transcodeReverse(s,ws);
        assert(isValid(ws));
        transcodeReverse(ws,s2);
        assert(s == s2);

        transcodeReverse(s,ds);
        assert(isValid(ds));
        transcodeReverse(ds,s2);
        assert(s == s2);

        transcodeReverse(ws,s);
        assert(isValid(s));
        transcodeReverse(s,ws2);
        assert(ws == ws2);

        transcodeReverse(ws,ds);
        assert(isValid(ds));
        transcodeReverse(ds,ws2);
        assert(ws == ws2);

        transcodeReverse(ds,s);
        assert(isValid(s));
        transcodeReverse(s,ds2);
        assert(ds == ds2);

        transcodeReverse(ds,ws);
        assert(isValid(ws));
        transcodeReverse(ws,ds2);
        assert(ds == ds2);
    }

    // Make sure the non-UTF encodings work too
    {
        auto s = "\u20AC100";
        Windows1252String t;
        transcode(s,t);
        assert(t == cast(Windows1252Char[])[0x80, '1', '0', '0']);
        string u;
        transcode(s,u);
        assert(s == u);
        Latin1String v;
        transcode(s,v);
        assert(cast(string) v == "?100");
        AsciiString w;
        transcode(v,w);
        assert(cast(string) w == "?100");
        s = "\u017Dlu\u0165ou\u010Dk\u00FD k\u016F\u0148";
        Latin2String x;
        transcode(s,x);
        assert(x == cast(Latin2Char[])[0xae, 'l', 'u', 0xbb, 'o', 'u', 0xe8, 'k', 0xfd, ' ', 'k', 0xf9, 0xf2]);
        Windows1250String y;
        transcode(s,y);
        assert(y == cast(Windows1250Char[])[0x8e, 'l', 'u', 0x9d, 'o', 'u', 0xe8, 'k', 0xfd, ' ', 'k', 0xf9, 0xf2]);
    }

    // Make sure we can count properly
    {
        assert(encodedLength!(char)('A') == 1);
        assert(encodedLength!(char)('\u00E3') == 2);
        assert(encodedLength!(char)('\u2028') == 3);
        assert(encodedLength!(char)('\U0010FFF0') == 4);
        assert(encodedLength!(wchar)('A') == 1);
        assert(encodedLength!(wchar)('\U0010FFF0') == 2);
    }

    // Make sure we can write into mutable arrays
    {
        char[4] buffer;
        auto n = encode(cast(dchar)'\u00E3',buffer);
        assert(n == 2);
        assert(buffer[0] == 0xC3);
        assert(buffer[1] == 0xA3);
    }
}
419
420 //=============================================================================
421
/**
 * Special value returned by $(D safeDecode) in place of a decoded character
 * when the input could not be decoded.
 *
 * The value 0xFFFFFFFF is larger than U+10FFFF, the last Unicode code point.
 */
enum dchar INVALID_SEQUENCE = cast(dchar) 0xFFFFFFFF;
424
/*
 * Mixin template that builds the user-facing encode/skip/decode functions of
 * an encoder out of a handful of primitives.
 *
 * The scope this is mixed into must supply the code unit type `E` together
 * with the templates `encodeViaWrite`, `skipViaRead`, `decodeViaRead`,
 * `safeDecodeViaRead` and `decodeReverseViaRead`. Those primitives call
 * `read`/`peek`/`canRead` or `write`, which the small mixins below bind to a
 * concrete source or destination.
 */
template EncoderFunctions()
{
    // ---- Readers -----------------------------------------------------------
    // Both readers operate on a slice named `s` that must already exist in
    // the scope they are mixed into.

    // Consumes code units from the front of `s`.
    template ReadFromString()
    {
        @property bool canRead()
        {
            return s.length != 0;
        }

        E peek() @safe pure @nogc nothrow
        {
            return s[0];
        }

        E read() @safe pure @nogc nothrow
        {
            E unit = s[0];
            s = s[1..$];
            return unit;
        }
    }

    // Consumes code units from the back of `s`.
    template ReverseReadFromString()
    {
        @property bool canRead()
        {
            return s.length != 0;
        }

        E peek() @safe pure @nogc nothrow
        {
            return s[$-1];
        }

        E read() @safe pure @nogc nothrow
        {
            E unit = s[$-1];
            s = s[0..$-1];
            return unit;
        }
    }

    // ---- Writers -----------------------------------------------------------

    // Appends each code unit to a buffer `s` that this mixin declares itself.
    template WriteToString()
    {
        E[] s;

        void write(E c) @safe pure nothrow
        {
            s ~= c;
        }
    }

    // Stores each code unit at the front of an existing slice named `array`
    // and then advances that slice past it.
    template WriteToArray()
    {
        void write(E c) @safe pure @nogc nothrow
        {
            array[0] = c;
            array = array[1..$];
        }
    }

    // Hands each code unit to an existing callable named `dg`.
    template WriteToDelegate()
    {
        void write(E c)
        {
            dg(c);
        }
    }

    // ---- Adapters giving each primitive its public name --------------------

    template EncodeViaWrite()
    {
        mixin encodeViaWrite;

        void encode(dchar c)
        {
            encodeViaWrite(c);
        }
    }

    template SkipViaRead()
    {
        mixin skipViaRead;

        void skip() @safe pure @nogc nothrow
        {
            skipViaRead();
        }
    }

    template DecodeViaRead()
    {
        mixin decodeViaRead;

        dchar decode() @safe pure @nogc nothrow
        {
            return decodeViaRead();
        }
    }

    template SafeDecodeViaRead()
    {
        mixin safeDecodeViaRead;

        dchar safeDecode() @safe pure @nogc nothrow
        {
            return safeDecodeViaRead();
        }
    }

    template DecodeReverseViaRead()
    {
        mixin decodeReverseViaRead;

        dchar decodeReverse() @safe pure @nogc nothrow
        {
            return decodeReverseViaRead();
        }
    }

    // ---- Encoders: one writer plus the encode adapter ----------------------

    template EncodeToString()
    {
        mixin WriteToString;
        mixin EncodeViaWrite;
    }

    template EncodeToArray()
    {
        mixin WriteToArray;
        mixin EncodeViaWrite;
    }

    template EncodeToDelegate()
    {
        mixin WriteToDelegate;
        mixin EncodeViaWrite;
    }

    // ---- Decoders: one reader plus the matching adapter --------------------

    template SkipFromString()
    {
        mixin ReadFromString;
        mixin SkipViaRead;
    }

    template DecodeFromString()
    {
        mixin ReadFromString;
        mixin DecodeViaRead;
    }

    template SafeDecodeFromString()
    {
        mixin ReadFromString;
        mixin SafeDecodeViaRead;
    }

    template DecodeReverseFromString()
    {
        mixin ReverseReadFromString;
        mixin DecodeReverseViaRead;
    }

    //=========================================================================

    // The functions ultimately exposed to the user. The parameter names
    // `s`, `array` and `dg` are significant: the reader/writer mixins above
    // refer to them by name.

    // Encodes `c` and returns the code units in a newly built array.
    E[] encode(dchar c) @safe pure nothrow
    {
        mixin EncodeToString sink;
        sink.encode(c);
        return sink.s;
    }

    // Encodes `c` into the front of `array`, advancing `array` past the
    // code units written.
    void encode(dchar c, ref E[] array) @safe pure nothrow
    {
        mixin EncodeToArray sink;
        sink.encode(c);
    }

    // Encodes `c`, passing each code unit to `dg`.
    void encode(dchar c, void delegate(E) dg)
    {
        mixin EncodeToDelegate sink;
        sink.encode(c);
    }

    // Advances `s` past one encoded character without returning it.
    void skip(ref const(E)[] s) @safe pure nothrow
    {
        mixin SkipFromString source;
        source.skip();
    }

    // Decodes one character from the front of `s`, advancing `s`.
    dchar decode(S)(ref S s)
    {
        mixin DecodeFromString source;
        return source.decode();
    }

    // Same as decode, but routed through the encoder's safeDecodeViaRead.
    dchar safeDecode(S)(ref S s)
    {
        mixin SafeDecodeFromString source;
        return source.safeDecode();
    }

    // Decodes one character from the back of `s`, shrinking `s` from the end.
    dchar decodeReverse(ref const(E)[] s) @safe pure nothrow
    {
        mixin DecodeReverseFromString source;
        return source.decodeReverse();
    }
}
586
587 //=========================================================================
588
/*
 * Foreach-iterable view of an encoded string that yields decoded code
 * points, optionally together with a code unit index.
 *
 * Iterating consumes the stored slice `s`: every decoded character is removed
 * from it, so after a complete pass `s` is empty.
 */
struct CodePoints(E)
{
    const(E)[] s;

    // The input must be a valid encoded string (checked by the in-contract).
    this(const(E)[] s)
    in
    {
        assert(isValid(s));
    }
    body
    {
        this.s = s;
    }

    // foreach (dchar c; ...)
    int opApply(scope int delegate(ref dchar) dg)
    {
        while (s.length != 0)
        {
            dchar decoded = decode(s);
            immutable status = dg(decoded);
            if (status != 0) return status;
        }
        return 0;
    }

    // foreach (size_t i, dchar c; ...) where i is the code unit offset at
    // which the character started.
    int opApply(scope int delegate(ref size_t, ref dchar) dg)
    {
        size_t offset = 0;
        while (s.length != 0)
        {
            immutable before = s.length;
            dchar decoded = decode(s);
            // Pass a copy so the delegate cannot corrupt our running offset.
            size_t reported = offset;
            immutable status = dg(reported, decoded);
            if (status != 0) return status;
            offset += before - s.length;
        }
        return 0;
    }

    // foreach_reverse (dchar c; ...)
    int opApplyReverse(scope int delegate(ref dchar) dg)
    {
        while (s.length != 0)
        {
            dchar decoded = decodeReverse(s);
            immutable status = dg(decoded);
            if (status != 0) return status;
        }
        return 0;
    }

    // foreach_reverse (size_t i, dchar c; ...) where i is the number of code
    // units still left in front of the character just decoded.
    int opApplyReverse(scope int delegate(ref size_t, ref dchar) dg)
    {
        while (s.length != 0)
        {
            dchar decoded = decodeReverse(s);
            size_t remaining = s.length;
            immutable status = dg(remaining, decoded);
            if (status != 0) return status;
        }
        return 0;
    }
}
656
/*
 * Foreach-iterable view of the code units that encode a single code point.
 *
 * The code point is encoded once, in the constructor; iteration walks the
 * stored result forwards or backwards.
 */
struct CodeUnits(E)
{
    E[] s;

    // `d` must be a valid code point (checked by the in-contract).
    this(dchar d)
    in
    {
        assert(isValidCodePoint(d));
    }
    body
    {
        this.s = encode!(E)(d);
    }

    // foreach (E c; ...). The delegate receives a copy of each code unit.
    int opApply(scope int delegate(ref E) dg)
    {
        foreach (E unit; s)
        {
            immutable status = dg(unit);
            if (status != 0) return status;
        }
        return 0;
    }

    // foreach_reverse (E c; ...). The delegate receives a copy of each code unit.
    int opApplyReverse(scope int delegate(ref E) dg)
    {
        foreach_reverse (E unit; s)
        {
            immutable status = dg(unit);
            if (status != 0) return status;
        }
        return 0;
    }
}
693
694 //=============================================================================
695
// Fallback used when no specialization of EncoderInstance matches the code
// unit type E: instantiating it fails at compile time with a message naming
// the offending type.
template EncoderInstance(E)
{
    static assert(false,"Cannot instantiate EncoderInstance for type "
        ~ E.stringof);
}
701
/*
 * Shared implementation for table-driven single-byte encodings.
 *
 * The scope this is mixed into must define:
 *   E, EString                   - code unit and string types
 *   m_charMapStart, m_charMapEnd - inclusive range of code units that are
 *                                  remapped through charMap
 *   charMap                      - decoded character for each code unit in
 *                                  that range, indexed by c - m_charMapStart;
 *                                  0xFFFD marks a code unit with no mapping
 *   bstMap                       - (character, code unit) pairs forming a
 *                                  binary search tree stored as an array: the
 *                                  children of node i are at 2*i + 1 (smaller
 *                                  keys) and 2*i + 2 (larger keys)
 *
 * Code units outside the remapped range stand for the code point with the
 * same numeric value.
 */
private template GenericEncoder()
{
    // True if `c` can be represented in this encoding.
    bool canEncode(dchar c) @safe pure @nogc nothrow
    {
        // Below 0x100 and outside the remapped range: encodes as itself.
        if (c < m_charMapStart || (c > m_charMapEnd && c < 0x100)) return true;
        if (c >= 0xFFFD) return false;

        for (auto node = 0; node < bstMap.length; )
        {
            immutable key = bstMap[node][0];
            if (key == c) return true;
            node = key > c ? 2 * node + 1 : 2 * node + 2; // descend
        }

        return false;
    }

    // True unless `c` lies in the remapped range and charMap marks it 0xFFFD.
    bool isValidCodeUnit(E c) @safe pure @nogc nothrow
    {
        immutable remapped = c >= m_charMapStart && c <= m_charMapEnd;
        return !remapped || charMap[c-m_charMapStart] != 0xFFFD;
    }

    // Every encodable character occupies exactly one code unit.
    size_t encodedLength(dchar c) @safe pure @nogc nothrow
    in
    {
        assert(canEncode(c));
    }
    body
    {
        return 1;
    }

    // Writes the code unit for `c`, or '?' when `c` has no representation.
    void encodeViaWrite()(dchar c)
    {
        immutable passThrough =
            c < m_charMapStart || (c > m_charMapEnd && c < 0x100);

        if (!passThrough)
        {
            if (c < 0xFFFD)
            {
                auto node = 0;
                while (node < bstMap.length)
                {
                    if (bstMap[node][0] == c)
                    {
                        write(cast(E) bstMap[node][1]);
                        return;
                    }
                    node = bstMap[node][0] > c ? 2 * node + 1 : 2 * node + 2; // descend
                }
            }
            // Either >= 0xFFFD or not found in the tree.
            c = '?';
        }
        write(cast(E) c);
    }

    // One character is one code unit, so skipping is a single read.
    void skipViaRead()()
    {
        read();
    }

    // Reads one code unit and maps it through charMap when it is in the
    // remapped range.
    dchar decodeViaRead()()
    {
        E c = read();
        immutable remapped = c >= m_charMapStart && c <= m_charMapEnd;
        return remapped ? charMap[c-m_charMapStart] : c;
    }

    // As decodeViaRead, but reports a code unit mapped to 0xFFFD as
    // INVALID_SEQUENCE.
    dchar safeDecodeViaRead()()
    {
        immutable E c = read();
        immutable remapped = c >= m_charMapStart && c <= m_charMapEnd;
        immutable d = remapped ? charMap[c-m_charMapStart] : c;
        return d == 0xFFFD ? INVALID_SEQUENCE : d;
    }

    // Same mapping as decodeViaRead; the direction is decided by which
    // `read` is in scope.
    dchar decodeReverseViaRead()()
    {
        E c = read();
        immutable remapped = c >= m_charMapStart && c <= m_charMapEnd;
        return remapped ? charMap[c-m_charMapStart] : c;
    }

    // The string "?" in this encoding.
    @property EString replacementSequence() @safe pure @nogc nothrow
    {
        return cast(EString)("?");
    }

    mixin EncoderFunctions;
}
787
788 //=============================================================================
789 // ASCII
790 //=============================================================================
791
/** Defines an ASCII-encoded character. */
enum AsciiChar : ubyte { init }
/**
Defines an ASCII-encoded string (as an array of $(D
immutable(AsciiChar))).
*/
alias AsciiString = immutable(AsciiChar)[];
796
/*
 * Encoder for ASCII: one code unit per character, and only values below 0x80
 * are accepted. Characters that cannot be encoded are written as '?'.
 */
template EncoderInstance(CharType : AsciiChar)
{
    alias E = AsciiChar;
    alias EString = AsciiString;

    @property string encodingName() @safe pure nothrow @nogc
    {
        return "ASCII";
    }

    // Only code points up to 0x7F exist in ASCII.
    bool canEncode(dchar c) @safe pure nothrow @nogc
    {
        return c <= 0x7F;
    }

    bool isValidCodeUnit(AsciiChar c) @safe pure nothrow @nogc
    {
        return c <= 0x7F;
    }

    // Every encodable character occupies exactly one code unit.
    size_t encodedLength(dchar c) @safe pure nothrow @nogc
    in
    {
        assert(canEncode(c));
    }
    body
    {
        return 1;
    }

    // Writes `c` (or '?' if it cannot be encoded) through `r.write`.
    void encodeX(Range)(dchar c, Range r)
    {
        r.write(cast(AsciiChar)(canEncode(c) ? c : '?'));
    }

    // Writes `c` (or '?' if it cannot be encoded) through the `write` in scope.
    void encodeViaWrite()(dchar c)
    {
        write(cast(AsciiChar)(canEncode(c) ? c : '?'));
    }

    void skipViaRead()()
    {
        read();
    }

    // No validation: the code unit is returned as the code point.
    dchar decodeViaRead()()
    {
        return read();
    }

    // Returns INVALID_SEQUENCE for a code unit outside the ASCII range.
    dchar safeDecodeViaRead()()
    {
        immutable unit = read();
        if (!canEncode(unit)) return INVALID_SEQUENCE;
        return unit;
    }

    dchar decodeReverseViaRead()()
    {
        return read();
    }

    // The string "?" in this encoding.
    @property EString replacementSequence() @safe pure nothrow @nogc
    {
        return cast(EString)("?");
    }

    mixin EncoderFunctions;
}
867
868 //=============================================================================
869 // ISO-8859-1
870 //=============================================================================
871
/** Defines a Latin1-encoded character. */
enum Latin1Char : ubyte { init }
/**
Defines a Latin1-encoded string (as an array of $(D
immutable(Latin1Char))).
*/
alias Latin1String = immutable(Latin1Char)[];
879
/*
 * Encoder for ISO-8859-1: one code unit per character, where the code unit
 * value is the code point. Every code unit is valid; characters at or above
 * 0x100 cannot be encoded and are written as '?'.
 */
template EncoderInstance(CharType : Latin1Char)
{
    alias E = Latin1Char;
    alias EString = Latin1String;

    @property string encodingName() @safe pure nothrow @nogc
    {
        return "ISO-8859-1";
    }

    // Only code points up to 0xFF exist in this encoding.
    bool canEncode(dchar c) @safe pure nothrow @nogc
    {
        return c <= 0xFF;
    }

    // Every possible code unit is accepted.
    bool isValidCodeUnit(Latin1Char c) @safe pure nothrow @nogc
    {
        return true;
    }

    // Every encodable character occupies exactly one code unit.
    size_t encodedLength(dchar c) @safe pure nothrow @nogc
    in
    {
        assert(canEncode(c));
    }
    body
    {
        return 1;
    }

    // Writes `c` (or '?' if it cannot be encoded) through the `write` in scope.
    void encodeViaWrite()(dchar c)
    {
        write(cast(Latin1Char)(canEncode(c) ? c : '?'));
    }

    void skipViaRead()()
    {
        read();
    }

    // The code unit is returned as the code point.
    dchar decodeViaRead()()
    {
        return read();
    }

    // Identical to decodeViaRead; no code unit is ever rejected.
    dchar safeDecodeViaRead()()
    {
        return read();
    }

    dchar decodeReverseViaRead()()
    {
        return read();
    }

    // The string "?" in this encoding.
    @property EString replacementSequence() @safe pure nothrow @nogc
    {
        return cast(EString)("?");
    }

    mixin EncoderFunctions;
}
943
944 //=============================================================================
945 // ISO-8859-2
946 //=============================================================================
947
/// Defines a Latin2-encoded character.
enum Latin2Char : ubyte { init }

/**
 * Defines a Latin2-encoded string (as an array of $(D
 * immutable(Latin2Char))).
 */
alias Latin2String = immutable(Latin2Char)[];
956
// Encoder for ISO-8859-2. It only supplies the names and tables; the
// encode/decode logic comes from the GenericEncoder mixin at the end.
private template EncoderInstance(CharType : Latin2Char)
{
    import std.typecons : Tuple, tuple;

    alias E = Latin2Char;
    alias EString = Latin2String;

    @property string encodingName() @safe pure nothrow @nogc
    {
        return "ISO-8859-2";
    }

    // Inclusive range of code units that GenericEncoder remaps through charMap.
    private static immutable dchar m_charMapStart = 0xa1;
    private static immutable dchar m_charMapEnd = 0xff;

    // Decoding table: entry [c - m_charMapStart] is the character for code
    // unit c (this is how GenericEncoder indexes it).
    private immutable wstring charMap =
        "\u0104\u02D8\u0141\u00A4\u013D\u015A\u00A7\u00A8"~
        "\u0160\u015E\u0164\u0179\u00AD\u017D\u017B\u00B0"~
        "\u0105\u02DB\u0142\u00B4\u013E\u015B\u02C7\u00B8"~
        "\u0161\u015F\u0165\u017A\u02DD\u017E\u017C\u0154"~
        "\u00C1\u00C2\u0102\u00C4\u0139\u0106\u00C7\u010C"~
        "\u00C9\u0118\u00CB\u011A\u00CD\u00CE\u010E\u0110"~
        "\u0143\u0147\u00D3\u00D4\u0150\u00D6\u00D7\u0158"~
        "\u016E\u00DA\u0170\u00DC\u00DD\u0162\u00DF\u0155"~
        "\u00E1\u00E2\u0103\u00E4\u013A\u0107\u00E7\u010D"~
        "\u00E9\u0119\u00EB\u011B\u00ED\u00EE\u010F\u0111"~
        "\u0144\u0148\u00F3\u00F4\u0151\u00F6\u00F7\u0159"~
        "\u016F\u00FA\u0171\u00FC\u00FD\u0163\u02D9";

    // Encoding table: (character, code unit) pairs. GenericEncoder searches
    // it as a binary search tree stored in an array (children of node i at
    // 2*i + 1 and 2*i + 2), so the order of the entries is significant.
    private immutable Tuple!(wchar, char)[] bstMap = [
        tuple('\u0148','\xF2'), tuple('\u00F3','\xF3'), tuple('\u0165','\xBB'),
        tuple('\u00D3','\xD3'), tuple('\u010F','\xEF'), tuple('\u015B','\xB6'),
        tuple('\u017C','\xBF'), tuple('\u00C1','\xC1'), tuple('\u00E1','\xE1'),
        tuple('\u0103','\xE3'), tuple('\u013A','\xE5'), tuple('\u0155','\xE0'),
        tuple('\u0161','\xB9'), tuple('\u0171','\xFB'), tuple('\u02D8','\xA2'),
        tuple('\u00AD','\xAD'), tuple('\u00C9','\xC9'), tuple('\u00DA','\xDA'),
        tuple('\u00E9','\xE9'), tuple('\u00FA','\xFA'), tuple('\u0107','\xE6'),
        tuple('\u0119','\xEA'), tuple('\u0142','\xB3'), tuple('\u0151','\xF5'),
        tuple('\u0159','\xF8'), tuple('\u015F','\xBA'), tuple('\u0163','\xFE'),
        tuple('\u016F','\xF9'), tuple('\u017A','\xBC'), tuple('\u017E','\xBE'),
        tuple('\u02DB','\xB2'), tuple('\u00A7','\xA7'), tuple('\u00B4','\xB4'),
        tuple('\u00C4','\xC4'), tuple('\u00CD','\xCD'), tuple('\u00D6','\xD6'),
        tuple('\u00DD','\xDD'), tuple('\u00E4','\xE4'), tuple('\u00ED','\xED'),
        tuple('\u00F6','\xF6'), tuple('\u00FD','\xFD'), tuple('\u0105','\xB1'),
        tuple('\u010D','\xE8'), tuple('\u0111','\xF0'), tuple('\u011B','\xEC'),
        tuple('\u013E','\xB5'), tuple('\u0144','\xF1'), tuple('\u0150','\xD5'),
        tuple('\u0154','\xC0'), tuple('\u0158','\xD8'), tuple('\u015A','\xA6'),
        tuple('\u015E','\xAA'), tuple('\u0160','\xA9'), tuple('\u0162','\xDE'),
        tuple('\u0164','\xAB'), tuple('\u016E','\xD9'), tuple('\u0170','\xDB'),
        tuple('\u0179','\xAC'), tuple('\u017B','\xAF'), tuple('\u017D','\xAE'),
        tuple('\u02C7','\xB7'), tuple('\u02D9','\xFF'), tuple('\u02DD','\xBD'),
        tuple('\u00A4','\xA4'), tuple('\u00A8','\xA8'), tuple('\u00B0','\xB0'),
        tuple('\u00B8','\xB8'), tuple('\u00C2','\xC2'), tuple('\u00C7','\xC7'),
        tuple('\u00CB','\xCB'), tuple('\u00CE','\xCE'), tuple('\u00D4','\xD4'),
        tuple('\u00D7','\xD7'), tuple('\u00DC','\xDC'), tuple('\u00DF','\xDF'),
        tuple('\u00E2','\xE2'), tuple('\u00E7','\xE7'), tuple('\u00EB','\xEB'),
        tuple('\u00EE','\xEE'), tuple('\u00F4','\xF4'), tuple('\u00F7','\xF7'),
        tuple('\u00FC','\xFC'), tuple('\u0102','\xC3'), tuple('\u0104','\xA1'),
        tuple('\u0106','\xC6'), tuple('\u010C','\xC8'), tuple('\u010E','\xCF'),
        tuple('\u0110','\xD0'), tuple('\u0118','\xCA'), tuple('\u011A','\xCC'),
        tuple('\u0139','\xC5'), tuple('\u013D','\xA5'), tuple('\u0141','\xA3'),
        tuple('\u0143','\xD1'), tuple('\u0147','\xD2')
    ];

    mixin GenericEncoder!();
}
1023
1024 //=============================================================================
1025 // WINDOWS-1250
1026 //=============================================================================
1027
/// Defines a Windows1250-encoded character.
enum Windows1250Char : ubyte { init }

/**
 * Defines a Windows1250-encoded string (as an array of $(D
 * immutable(Windows1250Char))).
 */
alias Windows1250String = immutable(Windows1250Char)[];
1036
// Encoder for windows-1250. It only supplies the names and tables; the
// encode/decode logic comes from the GenericEncoder mixin at the end.
private template EncoderInstance(CharType : Windows1250Char)
{
    import std.typecons : Tuple, tuple;

    alias E = Windows1250Char;
    alias EString = Windows1250String;

    @property string encodingName() @safe pure nothrow @nogc
    {
        return "windows-1250";
    }

    // Inclusive range of code units that GenericEncoder remaps through charMap.
    private static immutable dchar m_charMapStart = 0x80;
    private static immutable dchar m_charMapEnd = 0xff;

    // Decoding table: entry [c - m_charMapStart] is the character for code
    // unit c (this is how GenericEncoder indexes it). An entry of \uFFFD
    // makes GenericEncoder treat that code unit as invalid.
    private immutable wstring charMap =
        "\u20AC\uFFFD\u201A\uFFFD\u201E\u2026\u2020\u2021"~
        "\uFFFD\u2030\u0160\u2039\u015A\u0164\u017D\u0179"~
        "\uFFFD\u2018\u2019\u201C\u201D\u2022\u2013\u2014"~
        "\uFFFD\u2122\u0161\u203A\u015B\u0165\u017E\u017A"~
        "\u00A0\u02C7\u02D8\u0141\u00A4\u0104\u00A6\u00A7"~
        "\u00A8\u00A9\u015E\u00AB\u00AC\u00AD\u00AE\u017B"~
        "\u00B0\u00B1\u02DB\u0142\u00B4\u00B5\u00B6\u00B7"~
        "\u00B8\u0105\u015F\u00BB\u013D\u02DD\u013E\u017C"~
        "\u0154\u00C1\u00C2\u0102\u00C4\u0139\u0106\u00C7"~
        "\u010C\u00C9\u0118\u00CB\u011A\u00CD\u00CE\u010E"~
        "\u0110\u0143\u0147\u00D3\u00D4\u0150\u00D6\u00D7"~
        "\u0158\u016E\u00DA\u0170\u00DC\u00DD\u0162\u00DF"~
        "\u0155\u00E1\u00E2\u0103\u00E4\u013A\u0107\u00E7"~
        "\u010D\u00E9\u0119\u00EB\u011B\u00ED\u00EE\u010F"~
        "\u0111\u0144\u0148\u00F3\u00F4\u0151\u00F6\u00F7"~
        "\u0159\u016F\u00FA\u0171\u00FC\u00FD\u0163\u02D9";

    // Encoding table: (character, code unit) pairs. GenericEncoder searches
    // it as a binary search tree stored in an array (children of node i at
    // 2*i + 1 and 2*i + 2), so the order of the entries is significant.
    private immutable Tuple!(wchar, char)[] bstMap = [
        tuple('\u011A','\xCC'), tuple('\u00DC','\xDC'), tuple('\u0179','\x8F'),
        tuple('\u00B7','\xB7'), tuple('\u00FC','\xFC'), tuple('\u0158','\xD8'),
        tuple('\u201C','\x93'), tuple('\u00AC','\xAC'), tuple('\u00CB','\xCB'),
        tuple('\u00EB','\xEB'), tuple('\u010C','\xC8'), tuple('\u0143','\xD1'),
        tuple('\u0162','\xDE'), tuple('\u02D9','\xFF'), tuple('\u2039','\x8B'),
        tuple('\u00A7','\xA7'), tuple('\u00B1','\xB1'), tuple('\u00C2','\xC2'),
        tuple('\u00D4','\xD4'), tuple('\u00E2','\xE2'), tuple('\u00F4','\xF4'),
        tuple('\u0104','\xA5'), tuple('\u0110','\xD0'), tuple('\u013D','\xBC'),
        tuple('\u0150','\xD5'), tuple('\u015E','\xAA'), tuple('\u016E','\xD9'),
        tuple('\u017D','\x8E'), tuple('\u2014','\x97'), tuple('\u2021','\x87'),
        tuple('\u20AC','\x80'), tuple('\u00A4','\xA4'), tuple('\u00A9','\xA9'),
        tuple('\u00AE','\xAE'), tuple('\u00B5','\xB5'), tuple('\u00BB','\xBB'),
        tuple('\u00C7','\xC7'), tuple('\u00CE','\xCE'), tuple('\u00D7','\xD7'),
        tuple('\u00DF','\xDF'), tuple('\u00E7','\xE7'), tuple('\u00EE','\xEE'),
        tuple('\u00F7','\xF7'), tuple('\u0102','\xC3'), tuple('\u0106','\xC6'),
        tuple('\u010E','\xCF'), tuple('\u0118','\xCA'), tuple('\u0139','\xC5'),
        tuple('\u0141','\xA3'), tuple('\u0147','\xD2'), tuple('\u0154','\xC0'),
        tuple('\u015A','\x8C'), tuple('\u0160','\x8A'), tuple('\u0164','\x8D'),
        tuple('\u0170','\xDB'), tuple('\u017B','\xAF'), tuple('\u02C7','\xA1'),
        tuple('\u02DD','\xBD'), tuple('\u2019','\x92'), tuple('\u201E','\x84'),
        tuple('\u2026','\x85'), tuple('\u203A','\x9B'), tuple('\u2122','\x99'),
        tuple('\u00A0','\xA0'), tuple('\u00A6','\xA6'), tuple('\u00A8','\xA8'),
        tuple('\u00AB','\xAB'), tuple('\u00AD','\xAD'), tuple('\u00B0','\xB0'),
        tuple('\u00B4','\xB4'), tuple('\u00B6','\xB6'), tuple('\u00B8','\xB8'),
        tuple('\u00C1','\xC1'), tuple('\u00C4','\xC4'), tuple('\u00C9','\xC9'),
        tuple('\u00CD','\xCD'), tuple('\u00D3','\xD3'), tuple('\u00D6','\xD6'),
        tuple('\u00DA','\xDA'), tuple('\u00DD','\xDD'), tuple('\u00E1','\xE1'),
        tuple('\u00E4','\xE4'), tuple('\u00E9','\xE9'), tuple('\u00ED','\xED'),
        tuple('\u00F3','\xF3'), tuple('\u00F6','\xF6'), tuple('\u00FA','\xFA'),
        tuple('\u00FD','\xFD'), tuple('\u0103','\xE3'), tuple('\u0105','\xB9'),
        tuple('\u0107','\xE6'), tuple('\u010D','\xE8'), tuple('\u010F','\xEF'),
        tuple('\u0111','\xF0'), tuple('\u0119','\xEA'), tuple('\u011B','\xEC'),
        tuple('\u013A','\xE5'), tuple('\u013E','\xBE'), tuple('\u0142','\xB3'),
        tuple('\u0144','\xF1'), tuple('\u0148','\xF2'), tuple('\u0151','\xF5'),
        tuple('\u0155','\xE0'), tuple('\u0159','\xF8'), tuple('\u015B','\x9C'),
        tuple('\u015F','\xBA'), tuple('\u0161','\x9A'), tuple('\u0163','\xFE'),
        tuple('\u0165','\x9D'), tuple('\u016F','\xF9'), tuple('\u0171','\xFB'),
        tuple('\u017A','\x9F'), tuple('\u017C','\xBF'), tuple('\u017E','\x9E'),
        tuple('\u02D8','\xA2'), tuple('\u02DB','\xB2'), tuple('\u2013','\x96'),
        tuple('\u2018','\x91'), tuple('\u201A','\x82'), tuple('\u201D','\x94'),
        tuple('\u2020','\x86'), tuple('\u2022','\x95'), tuple('\u2030','\x89')
    ];

    mixin GenericEncoder!();
}
1116
1117 //=============================================================================
1118 // WINDOWS-1252
1119 //=============================================================================
1120
/// A single code unit of WINDOWS-1252 encoded text (stored as a $(D ubyte)).
enum Windows1252Char : ubyte
{
    init
}
1123
/**
 * A WINDOWS-1252 encoded string, represented as an array of
 * $(D immutable(Windows1252Char)).
 */
alias Windows1252String = immutable(Windows1252Char)[];
1129
// Encoder specialization for WINDOWS-1252. The tables below are consumed by
// the GenericEncoder mixin, which is defined outside this section.
template EncoderInstance(CharType : Windows1252Char)
{
    import std.typecons : Tuple, tuple;

    alias E = Windows1252Char;
    alias EString = Windows1252String;

    /// Canonical name of this encoding.
    @property string encodingName() @safe pure nothrow @nogc
    {
        return "windows-1252";
    }

    // First and last code unit covered by charMap.
    private static immutable dchar m_charMapStart = 0x80;
    private static immutable dchar m_charMapEnd = 0x9f;

    // Code point for each code unit 0x80 .. 0x9F, in code unit order.
    // Unassigned code units (0x81, 0x8D, 0x8F, 0x90, 0x9D) hold U+FFFD.
    private immutable wstring charMap =
        "\u20AC\uFFFD\u201A\u0192" ~ "\u201E\u2026\u2020\u2021" ~ // 0x80 .. 0x87
        "\u02C6\u2030\u0160\u2039" ~ "\u0152\uFFFD\u017D\uFFFD" ~ // 0x88 .. 0x8F
        "\uFFFD\u2018\u2019\u201C" ~ "\u201D\u2022\u2013\u2014" ~ // 0x90 .. 0x97
        "\u02DC\u2122\u0161\u203A" ~ "\u0153\uFFFD\u017E\u0178";  // 0x98 .. 0x9F

    // Reverse mapping: (code point, code unit) for every assigned entry of
    // charMap. NOTE(review): the name and ordering suggest an implicit binary
    // search tree in level order - do not reorder without checking
    // GenericEncoder.
    private immutable Tuple!(wchar, char)[] bstMap = [
        tuple('\u201C', '\x93'), tuple('\u0192', '\x83'), tuple('\u2039', '\x8B'),
        tuple('\u0161', '\x9A'), tuple('\u2014', '\x97'), tuple('\u2021', '\x87'),
        tuple('\u20AC', '\x80'), tuple('\u0153', '\x9C'), tuple('\u017D', '\x8E'),
        tuple('\u02DC', '\x98'), tuple('\u2019', '\x92'), tuple('\u201E', '\x84'),
        tuple('\u2026', '\x85'), tuple('\u203A', '\x9B'), tuple('\u2122', '\x99'),
        tuple('\u0152', '\x8C'), tuple('\u0160', '\x8A'), tuple('\u0178', '\x9F'),
        tuple('\u017E', '\x9E'), tuple('\u02C6', '\x88'), tuple('\u2013', '\x96'),
        tuple('\u2018', '\x91'), tuple('\u201A', '\x82'), tuple('\u201D', '\x94'),
        tuple('\u2020', '\x86'), tuple('\u2022', '\x95'), tuple('\u2030', '\x89')
    ];

    mixin GenericEncoder!();
}
1165
1166 //=============================================================================
1167 // UTF-8
1168 //=============================================================================
1169
// Encoder specialization for UTF-8. The primitives read(), peek(), canRead
// and write() used below are not defined in this template; presumably they
// are supplied by the code generated from the EncoderFunctions mixin.
template EncoderInstance(CharType : char)
{
    alias E = char;
    alias EString = immutable(char)[];

    /// Canonical name of this encoding.
    @property string encodingName() @safe pure nothrow @nogc
    {
        return "UTF-8";
    }

    /// UTF-8 can represent exactly the valid code points.
    bool canEncode(dchar c) @safe pure nothrow @nogc
    {
        return isValidCodePoint(c);
    }

    /// Accepts every code unit except 0xC0, 0xC1 and 0xF5 .. 0xFF.
    bool isValidCodeUnit(char c) @safe pure nothrow @nogc
    {
        if (c < 0xC0)
            return true;
        return c >= 0xC2 && c < 0xF5;
    }

    // Indexed by (code unit - 0x80): the number of trailing code units that
    // follow the given code unit when it starts a sequence. Continuation
    // bytes (0x80 .. 0xBF) and 0xFF map to 0.
    immutable ubyte[128] tailTable =
    [
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x80 .. 0x8F
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x90 .. 0x9F
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xA0 .. 0xAF
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xB0 .. 0xBF
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0xC0 .. 0xCF
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0xD0 .. 0xDF
        2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xE0 .. 0xEF
        3,3,3,3,3,3,3,3,4,4,4,4,5,5,6,0, // 0xF0 .. 0xFF
    ];

    // Looks up tailTable for a non-ASCII code unit.
    private int tails(char c) @safe pure nothrow @nogc
    in
    {
        assert(c >= 0x80);
    }
    body
    {
        return tailTable[c-0x80];
    }

    /// Number of UTF-8 code units (1 to 4) needed to encode c.
    size_t encodedLength(dchar c) @safe pure nothrow @nogc
    in
    {
        assert(canEncode(c));
    }
    body
    {
        return c < 0x80 ? 1
            : c < 0x800 ? 2
            : c < 0x10000 ? 3
            : 4;
    }

    // Emits the UTF-8 code units of c through write(), lead byte first.
    void encodeViaWrite()(dchar c)
    {
        if (c < 0x80)
        {
            write(cast(char) c);
            return;
        }
        if (c < 0x800)
        {
            write(cast(char)((c >> 6) + 0xC0));
            write(cast(char)((c & 0x3F) + 0x80));
            return;
        }
        if (c < 0x10000)
        {
            write(cast(char)((c >> 12) + 0xE0));
            write(cast(char)(((c >> 6) & 0x3F) + 0x80));
            write(cast(char)((c & 0x3F) + 0x80));
            return;
        }
        write(cast(char)((c >> 18) + 0xF0));
        write(cast(char)(((c >> 12) & 0x3F) + 0x80));
        write(cast(char)(((c >> 6) & 0x3F) + 0x80));
        write(cast(char)((c & 0x3F) + 0x80));
    }

    // Consumes one encoded sequence without decoding it.
    void skipViaRead()()
    {
        auto lead = read();
        if (lead >= 0xC0)
        {
            int pending = tails(cast(char) lead);
            while (pending-- > 0)
            {
                read();
            }
        }
    }

    // Decodes one sequence; performs no validation of the input.
    dchar decodeViaRead()()
    {
        dchar cp = read();
        if (cp >= 0xC0)
        {
            int tailCount = tails(cast(char) cp);
            // Keep only the payload bits of the lead byte.
            cp &= (1 << (6 - tailCount)) - 1;
            foreach (i; 0 .. tailCount)
            {
                cp = (cp << 6) + (read() & 0x3F);
            }
        }
        return cp;
    }

    // Decodes one sequence, returning INVALID_SEQUENCE for malformed input.
    dchar safeDecodeViaRead()()
    {
        dchar cp = read();
        if (cp < 0x80) return cp;

        int tailCount = tails(cast(char) cp);
        // A continuation byte (or 0xFF) cannot start a sequence.
        if (tailCount == 0) return INVALID_SEQUENCE;

        if (!canRead) return INVALID_SEQUENCE;
        size_t next = peek();
        // Decided from the lead byte and the first trailing byte, but only
        // reported after the trailing bytes have been consumed.
        immutable malformed =
        (
            (cp < 0xC2)                                   // lead 0xC0/0xC1: overlong 2-byte form
            || (cp > 0xF4)                                // lead above 0xF4: beyond U+10FFFF
            || (cp == 0xE0 && ((next & 0xE0) == 0x80))    // overlong 3-byte form
            || (cp == 0xED && ((next & 0xE0) == 0xA0))    // surrogate code points
            || (cp == 0xF0 && ((next & 0xF0) == 0x80))    // overlong 4-byte form
            || (cp == 0xF4 && ((next & 0xF0) >= 0x90))    // code points > 0x10FFFF
        );

        cp &= (1 << (6 - tailCount)) - 1;
        foreach (i; 0 .. tailCount)
        {
            if (!canRead) return INVALID_SEQUENCE;
            next = peek();
            // Every trailing byte must have the form 10xxxxxx.
            if ((next & 0xC0) != 0x80) return INVALID_SEQUENCE;
            cp = (cp << 6) + (read() & 0x3F);
        }

        return malformed ? INVALID_SEQUENCE : cp;
    }

    // Decodes one sequence whose code units are delivered last-to-first:
    // accumulates 6 payload bits per unit until a lead byte is reached.
    dchar decodeReverseViaRead()()
    {
        dchar cp = read();
        if (cp < 0x80) return cp;
        size_t shift = 0;
        cp &= 0x3F;
        foreach (i; 0 .. 4)
        {
            shift += 6;
            auto unit = read();
            size_t tailCount = tails(cast(char) unit);
            // Continuation bytes carry 6 bits; a lead byte carries fewer.
            immutable mask = tailCount == 0 ? 0x3F : (1 << (6 - tailCount)) - 1;
            cp += ((unit & mask) << shift);
            if (tailCount != 0) break;
        }
        return cp;
    }

    /// U+FFFD encoded as UTF-8.
    @property EString replacementSequence() @safe pure nothrow @nogc
    {
        return "\uFFFD";
    }

    mixin EncoderFunctions;
}
1331
1332 //=============================================================================
1333 // UTF-16
1334 //=============================================================================
1335
// Encoder specialization for UTF-16. The primitives read(), peek(), canRead
// and write() used below are not defined in this template; presumably they
// are supplied by the code generated from the EncoderFunctions mixin.
template EncoderInstance(CharType : wchar)
{
    alias E = wchar;
    alias EString = immutable(wchar)[];

    /// Canonical name of this encoding.
    @property string encodingName() @safe pure nothrow @nogc
    {
        return "UTF-16";
    }

    /// UTF-16 can represent exactly the valid code points.
    bool canEncode(dchar c) @safe pure nothrow @nogc
    {
        return isValidCodePoint(c);
    }

    /// Every 16-bit value is accepted as a code unit.
    bool isValidCodeUnit(wchar c) @safe pure nothrow @nogc
    {
        return true;
    }

    /// Number of UTF-16 code units (1 or 2) needed to encode c.
    size_t encodedLength(dchar c) @safe pure nothrow @nogc
    in
    {
        assert(canEncode(c));
    }
    body
    {
        if (c < 0x10000)
            return 1;
        return 2;
    }

    // Emits c through write(): one unit below U+10000, otherwise a
    // high/low surrogate pair.
    void encodeViaWrite()(dchar c)
    {
        if (c < 0x10000)
        {
            write(cast(wchar) c);
            return;
        }
        size_t offset = c - 0x10000;
        write(cast(wchar)(0xD800 + (offset >> 10)));
        write(cast(wchar)(0xDC00 + (offset & 0x3FF)));
    }

    // Consumes one encoded sequence without decoding it.
    void skipViaRead()()
    {
        immutable first = read();
        // A surrogate is followed by its partner.
        if (first >= 0xD800 && first < 0xE000)
        {
            read();
        }
    }

    // Decodes one sequence; performs no validation of the input.
    dchar decodeViaRead()()
    {
        wchar hi = read();
        if (!(hi >= 0xD800 && hi < 0xE000)) return cast(dchar) hi;
        wchar lo = read();
        hi &= 0x3FF;
        lo &= 0x3FF;
        return 0x10000 + (hi << 10) + lo;
    }

    // Decodes one sequence, returning INVALID_SEQUENCE for malformed input.
    dchar safeDecodeViaRead()()
    {
        wchar hi = read();
        if (!(hi >= 0xD800 && hi < 0xE000)) return cast(dchar) hi;
        // A low surrogate may not come first.
        if (hi >= 0xDC00) return INVALID_SEQUENCE;
        if (!canRead) return INVALID_SEQUENCE;
        wchar lo = peek();
        // The second unit must be a low surrogate; otherwise leave it unread.
        if (lo < 0xDC00 || lo >= 0xE000) return INVALID_SEQUENCE;
        lo = read();
        hi &= 0x3FF;
        lo &= 0x3FF;
        return 0x10000 + (hi << 10) + lo;
    }

    // Decodes one sequence whose code units are delivered last-to-first, so
    // the first unit read supplies the low 10 bits.
    dchar decodeReverseViaRead()()
    {
        wchar lo = read();
        if (!(lo >= 0xD800 && lo < 0xE000)) return cast(dchar) lo;
        wchar hi = read();
        lo &= 0x3FF;
        hi &= 0x3FF;
        return 0x10000 + (hi << 10) + lo;
    }

    /// U+FFFD encoded as UTF-16.
    @property EString replacementSequence() @safe pure nothrow @nogc
    {
        return "\uFFFD"w;
    }

    mixin EncoderFunctions;
}
1428
1429 //=============================================================================
1430 // UTF-32
1431 //=============================================================================
1432
// Encoder specialization for UTF-32, where each code unit is a code point.
// read() and write() are not defined in this template; presumably they are
// supplied by the code generated from the EncoderFunctions mixin.
template EncoderInstance(CharType : dchar)
{
    alias E = dchar;
    alias EString = immutable(dchar)[];

    /// Canonical name of this encoding.
    @property string encodingName() @safe pure nothrow @nogc
    {
        return "UTF-32";
    }

    /// UTF-32 can represent exactly the valid code points.
    bool canEncode(dchar c) @safe pure @nogc nothrow
    {
        return isValidCodePoint(c);
    }

    /// A code unit is valid exactly when it is a valid code point.
    bool isValidCodeUnit(dchar c) @safe pure @nogc nothrow
    {
        return isValidCodePoint(c);
    }

    /// Always 1: a code point occupies a single UTF-32 code unit.
    size_t encodedLength(dchar c) @safe pure @nogc nothrow
    in
    {
        assert(canEncode(c));
    }
    body
    {
        return 1;
    }

    // Emits c unchanged through write().
    void encodeViaWrite()(dchar c)
    {
        write(c);
    }

    // Consumes one code unit.
    void skipViaRead()()
    {
        read();
    }

    // Returns the next code unit as a code point, without validation.
    dchar decodeViaRead()()
    {
        return cast(dchar) read();
    }

    // Returns the next code unit, or INVALID_SEQUENCE if it is not a valid
    // code point.
    dchar safeDecodeViaRead()()
    {
        immutable unit = read();
        return !isValidCodePoint(unit) ? INVALID_SEQUENCE : unit;
    }

    // Same as decodeViaRead: a sequence is a single code unit either way.
    dchar decodeReverseViaRead()()
    {
        return cast(dchar) read();
    }

    /// U+FFFD encoded as UTF-32.
    @property EString replacementSequence() @safe pure nothrow @nogc
    {
        return "\uFFFD"d;
    }

    mixin EncoderFunctions;
}
1496
1497 //=============================================================================
1498 // Below are forwarding functions which expose the function to the user
1499
/**
 Returns true if c is a valid code point, i.e. it is below 0x110000 and is
 not in the surrogate range 0xD800 .. 0xDFFF.

 Note that this includes the non-character code points U+FFFE and U+FFFF,
 since these are valid code points (even though they are not valid
 characters).

 Supersedes:
 This function supersedes $(D std.utf.startsValidDchar()).

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
 c = the code point to be tested
 */
bool isValidCodePoint(dchar c) @safe pure nothrow @nogc
{
    immutable isSurrogate = c >= 0xD800 && c < 0xE000;
    return !isSurrogate && c < 0x110000;
}
1520
/**
 Returns the name of an encoding.

 The encoding cannot be deduced, so the code unit type must be given
 explicitly as the template argument.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252
 */
@property string encodingName(T)()
{
    alias Encoder = EncoderInstance!(T);
    return Encoder.encodingName;
}
1534
///
@safe unittest
{
    // Unicode encodings
    assert(encodingName!char == "UTF-8");
    assert(encodingName!wchar == "UTF-16");
    assert(encodingName!dchar == "UTF-32");

    // Single-byte encodings
    assert(encodingName!AsciiChar == "ASCII");
    assert(encodingName!Latin1Char == "ISO-8859-1");
    assert(encodingName!Latin2Char == "ISO-8859-2");
    assert(encodingName!Windows1250Char == "windows-1250");
    assert(encodingName!Windows1252Char == "windows-1252");
}
1547
/**
 Returns true iff the specified code point can be represented in the
 encoding.

 The encoding cannot be deduced, so the code unit type must be given
 explicitly as the template argument.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
 c = the code point to be tested
 */
bool canEncode(E)(dchar c)
{
    alias Encoder = EncoderInstance!(E);
    return Encoder.canEncode(c);
}
1562
///
@safe pure unittest
{
    // 'A' and the no-break space
    assert( canEncode!Latin1Char('A'));
    assert( canEncode!Latin2Char('A'));
    assert(!canEncode!AsciiChar('\u00A0'));
    assert( canEncode!Latin1Char('\u00A0'));
    assert( canEncode!Latin2Char('\u00A0'));

    // The euro sign is in both Windows code pages; U+20AD and U+FFFD are not
    assert( canEncode!Windows1250Char('\u20AC'));
    assert(!canEncode!Windows1250Char('\u20AD'));
    assert(!canEncode!Windows1250Char('\uFFFD'));
    assert( canEncode!Windows1252Char('\u20AC'));
    assert(!canEncode!Windows1252Char('\u20AD'));
    assert(!canEncode!Windows1252Char('\uFFFD'));

    // Beyond the last code point
    assert(!canEncode!char(cast(dchar) 0x110000));
}
1579
/// How to check an entire string
@safe pure unittest
{
    import std.algorithm.searching : find;
    import std.utf : byDchar;

    // Search for the first code point that ASCII cannot represent.
    auto unencodable = "The quick brown fox"
        .byDchar
        .find!(cp => !canEncode!AsciiChar(cp));
    assert(unencodable.empty);
}
1591
/**
 Returns true if the code unit is legal in its encoding. For example, the
 byte 0x80 would not be legal in ASCII, because ASCII code units must always
 be in the range 0x00 to 0x7F.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
 c = the code unit to be tested
 */
bool isValidCodeUnit(E)(E c)
{
    alias Encoder = EncoderInstance!(E);
    return Encoder.isValidCodeUnit(c);
}
1607
///
@system pure unittest
{
    // UTF-8: 0xC0 and 0xFF never occur
    assert(!isValidCodeUnit(cast(char) 0xC0));
    assert(!isValidCodeUnit(cast(char) 0xFF));

    // A surrogate is a legal UTF-16 code unit but not a legal UTF-32 one
    assert( isValidCodeUnit(cast(wchar) 0xD800));
    assert(!isValidCodeUnit(cast(dchar) 0xD800));

    // Single-byte encodings
    assert(!isValidCodeUnit(cast(AsciiChar) 0xA0));
    assert( isValidCodeUnit(cast(Windows1250Char) 0x80));
    assert(!isValidCodeUnit(cast(Windows1250Char) 0x81));
    assert( isValidCodeUnit(cast(Windows1252Char) 0x80));
    assert(!isValidCodeUnit(cast(Windows1252Char) 0x81));
}
1621
/**
 Returns true if the whole string is encoded correctly, i.e. its longest
 valid prefix is the string itself.

 Supersedes:
 This function supersedes std.utf.validate(), however note that this
 function returns a bool indicating whether the input was valid or not,
 whereas the older function would throw an exception.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
 s = the string to be tested
 */
bool isValid(E)(const(E)[] s)
{
    immutable validPrefix = validLength(s);
    return validPrefix == s.length;
}
1640
///
@system pure unittest
{
    // Well-formed UTF-8
    assert( isValid("\u20AC100"));
    // Starts with a continuation byte
    assert(!isValid(cast(char[3])[167, 133, 175]));
}
1647
/**
 Returns the length, in code units, of the longest prefix of the string
 which is validly encoded.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
 s = the string to be tested
 */
size_t validLength(E)(const(E)[] s)
{
    size_t validUnits = 0;
    while (s.length > 0)
    {
        // safeDecode shortens s, so remember its length beforehand.
        immutable lengthBefore = s.length;
        if (EncoderInstance!(E).safeDecode(s) == INVALID_SEQUENCE)
            break;
        validUnits += lengthBefore - s.length;
    }
    return validUnits;
}
1669
/**
 Sanitizes a string by replacing malformed code unit sequences with valid
 code unit sequences. The result is guaranteed to be valid for this encoding.

 If the input string is already valid, this function returns the original.
 Otherwise it constructs a new string in which every illegal code unit
 sequence is replaced with the encoding's replacement sequence: the Unicode
 replacement character (U+FFFD) if the character repertoire contains it,
 otherwise '?'.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
 s = the string to be sanitized
 */
immutable(E)[] sanitize(E)(immutable(E)[] s)
{
    alias Encoder = EncoderInstance!(E);

    size_t goodPrefix = validLength(s);
    if (goodPrefix == s.length) return s;

    auto replacement = Encoder.replacementSequence;

    // Pass 1: compute an upper bound for the output length. Each invalid
    // sequence adds one replacement; the removed units are not subtracted,
    // so this may overestimate, which is harmless.
    size_t capacity = s.length;
    const(E)[] rest = s[goodPrefix..$];
    while (rest.length != 0)
    {
        // rest always starts at an invalid sequence here; consume it.
        immutable decoded = Encoder.safeDecode(rest);
        assert(decoded == INVALID_SEQUENCE);
        capacity += replacement.length;
        rest = rest[validLength(rest)..$];
    }

    // Pass 2: build the output.
    E[] result = new E[capacity];
    result[0 .. goodPrefix] = s[0 .. goodPrefix];
    size_t pos = goodPrefix;

    rest = s[goodPrefix..$];
    while (rest.length != 0)
    {
        immutable decoded = Encoder.safeDecode(rest);
        assert(decoded == INVALID_SEQUENCE);

        // Substitute the invalid sequence ...
        result[pos .. pos+replacement.length] = replacement[];
        pos += replacement.length;

        // ... then copy the valid run that follows it.
        goodPrefix = validLength(rest);
        result[pos .. pos+goodPrefix] = rest[0 .. goodPrefix];
        pos += goodPrefix;
        rest = rest[goodPrefix..$];
    }
    return cast(immutable(E)[])result[0 .. pos];
}
1725
///
@system pure unittest
{
    // The truncated sequence \xF0\x80 becomes U+FFFD (\xEF\xBF\xBD in UTF-8)
    immutable cleaned = sanitize("hello \xF0\x80world");
    assert(cleaned == "hello \xEF\xBF\xBDworld");
}
1731
/**
 Returns the length, in code units, of the first encoded sequence.

 The string MUST be non-empty and MUST start with a validly encoded
 sequence. This is enforced by the function's in-contract.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
 s = the string to be sliced
 */
size_t firstSequence(E)(const(E)[] s)
in
{
    assert(s.length != 0);
    const(E)[] u = s;
    assert(safeDecode(u) != INVALID_SEQUENCE);
}
body
{
    // skip() shortens s by one sequence; the difference is its length.
    immutable lengthBefore = s.length;
    EncoderInstance!(E).skip(s);
    return lengthBefore - s.length;
}
1757
///
@system pure unittest
{
    // Multi-unit first sequence
    assert(firstSequence("\u20AC1000") == "\u20AC".length);
    // Single-unit first sequence
    assert(firstSequence("hel") == "h".length);
}
1764
/**
 Returns the length, in code units, of the last encoded sequence.

 The string MUST be non-empty and validly encoded.
 This is enforced by the function's in-contract.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
 s = the string to be sliced
 */
size_t lastSequence(E)(const(E)[] s)
in
{
    assert(s.length != 0);
    assert(isValid(s));
}
body
{
    // decodeReverse() shortens s by one sequence; the difference is its length.
    immutable lengthBefore = s.length;
    EncoderInstance!(E).decodeReverse(s);
    return lengthBefore - s.length;
}
1789
///
@system pure unittest
{
    // Three-unit last sequence
    assert(lastSequence("1000\u20AC") == "\u20AC".length);
    // Two-unit last sequence
    assert(lastSequence("hellö") == "ö".length);
}
1796
/**
 Returns the array index at which the (n+1)th code point begins.

 The string MUST be validly encoded and n MUST NOT be negative.
 This is enforced by the function's in-contract.

 Supersedes:
 This function supersedes std.utf.toUTFindex().

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
 s = the string to be counted
 n = the current code point index
 */
ptrdiff_t index(E)(const(E)[] s,int n)
in
{
    assert(isValid(s));
    assert(n >= 0);
}
body
{
    immutable lengthBefore = s.length;
    // Skip n sequences; the number of code units consumed is the index.
    size_t remaining = n;
    while (remaining-- != 0)
    {
        EncoderInstance!(E).skip(s);
    }
    return lengthBefore - s.length;
}
1825
///
@system pure unittest
{
    // The euro sign occupies three UTF-8 code units
    assert(index("\u20AC100",1) == 3);
    // 'h' is one code unit, 'ä' is two
    assert(index("hällo",2) == 3);
}
1832
/**
 Decodes a single code point.

 This function removes one or more code units from the start of a string,
 and returns the decoded code point which those code units represent.

 The string MUST be non-empty and MUST start with a validly encoded
 sequence. This is enforced by the function's in-contract.

 Supersedes:
 This function supersedes std.utf.decode(), however, note that the
 function codePoints() supersedes it more conveniently.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
 s = the string whose first code point is to be decoded
 */
dchar decode(S)(ref S s)
in
{
    assert(s.length != 0);
    auto u = s;
    assert(safeDecode(u) != INVALID_SEQUENCE);
}
body
{
    // The encoding is selected by the element type of the string.
    alias Encoder = EncoderInstance!(typeof(s[0]));
    return Encoder.decode(s);
}
1863
/**
 Decodes a single code point from the end of a string.

 This function removes one or more code units from the end of a string,
 and returns the decoded code point which those code units represent.

 The string MUST be non-empty and validly encoded.
 This is enforced by the function's in-contract.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
 s = the string whose last code point is to be decoded
 */
dchar decodeReverse(E)(ref const(E)[] s)
in
{
    assert(s.length != 0);
    assert(isValid(s));
}
body
{
    alias Encoder = EncoderInstance!(E);
    return Encoder.decodeReverse(s);
}
1889
/**
 Decodes a single code point. The input does not have to be valid.

 This function removes one or more code units from the start of a string,
 and returns the decoded code point which those code units represent.

 An invalidly encoded string is accepted as input. If an invalid sequence is
 found at the start of the string, this function will remove it, and return
 the value INVALID_SEQUENCE.

 The string MUST be non-empty. This is enforced by the function's
 in-contract.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
 s = the string whose first code point is to be decoded
 */
dchar safeDecode(S)(ref S s)
in
{
    assert(s.length != 0);
}
body
{
    // The encoding is selected by the element type of the string.
    alias Encoder = EncoderInstance!(typeof(s[0]));
    return Encoder.safeDecode(s);
}
1915
/**
 Returns the number of code units required to encode a single code point.

 The input to this function MUST be a valid code point.
 This is enforced by the function's in-contract.

 The output encoding cannot be deduced, so the code unit type must be given
 explicitly as the template argument.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
 c = the code point to be encoded
 */
size_t encodedLength(E)(dchar c)
in
{
    assert(isValidCodePoint(c));
}
body
{
    alias Encoder = EncoderInstance!(E);
    return Encoder.encodedLength(c);
}
1940
/**
 Encodes a single code point.

 This function encodes a single code point into one or more code units.
 It returns a string containing those code units.

 The input to this function MUST be a valid code point.
 This is enforced by the function's in-contract.

 The output encoding cannot be deduced, so the code unit type must be given
 explicitly as the template argument.

 Supersedes:
 This function supersedes std.utf.encode(), however, note that the
 function codeUnits() supersedes it more conveniently.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
 c = the code point to be encoded
 */
E[] encode(E)(dchar c)
in
{
    assert(isValidCodePoint(c));
}
body
{
    alias Encoder = EncoderInstance!(E);
    return Encoder.encode(c);
}
1972
/**
 Encodes a single code point into an array.

 This function encodes a single code point into one or more code units.
 The code units are stored in a user-supplied array.

 The input to this function MUST be a valid code point.
 This is enforced by the function's in-contract.

 The output encoding cannot be deduced, so the code unit type must be given
 explicitly as the template argument.

 Supersedes:
 This function supersedes std.utf.encode(), however, note that the
 function codeUnits() supersedes it more conveniently.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
 c = the code point to be encoded
 array = the destination array

 Returns:
 the number of code units written to the array
 */
size_t encode(E)(dchar c, E[] array)
in
{
    assert(isValidCodePoint(c));
}
body
{
    // The encoder advances its slice past what it wrote; the amount by
    // which the slice shrank is the number of code units written.
    E[] unwritten = array;
    EncoderInstance!(E).encode(c,unwritten);
    return array.length - unwritten.length;
}
2011
/*
 Encodes $(D c) in units of type $(D E) and writes the result to the
 output range $(D R). Returns the number of $(D E)s written.

 Only char, wchar and dchar (ignoring qualifiers) are supported as $(D E);
 any other type is rejected at compile time.
 */
size_t encode(E, R)(dchar c, auto ref R range)
if (isNativeOutputRange!(R, E))
{
    alias Unit = Unqual!E;

    static if (is(Unit == char))
    {
        // UTF-8: one to four code units depending on the magnitude of c.
        if (c <= 0x7F)
        {
            put(range, cast(char) c);
            return 1;
        }
        else if (c <= 0x7FF)
        {
            put(range, cast(char)(0xC0 | (c >> 6)));
            put(range, cast(char)(0x80 | (c & 0x3F)));
            return 2;
        }
        else if (c <= 0xFFFF)
        {
            put(range, cast(char)(0xE0 | (c >> 12)));
            put(range, cast(char)(0x80 | ((c >> 6) & 0x3F)));
            put(range, cast(char)(0x80 | (c & 0x3F)));
            return 3;
        }
        else if (c <= 0x10FFFF)
        {
            put(range, cast(char)(0xF0 | (c >> 18)));
            put(range, cast(char)(0x80 | ((c >> 12) & 0x3F)));
            put(range, cast(char)(0x80 | ((c >> 6) & 0x3F)));
            put(range, cast(char)(0x80 | (c & 0x3F)));
            return 4;
        }
        // Values above U+10FFFF cannot be encoded.
        assert(0);
    }
    else static if (is(Unit == wchar))
    {
        // UTF-16: a single unit, or a surrogate pair above U+FFFF.
        if (c <= 0xFFFF)
        {
            range.put(cast(wchar) c);
            return 1;
        }
        immutable offset = c - 0x10000;
        range.put(cast(wchar) (((offset >> 10) & 0x3FF) + 0xD800));
        range.put(cast(wchar) ((offset & 0x3FF) + 0xDC00));
        return 2;
    }
    else static if (is(Unit == dchar))
    {
        // UTF-32: the code point is its own code unit.
        range.put(c);
        return 1;
    }
    else
    {
        static assert(0);
    }
}
2073
@safe pure unittest
{
    import std.array;

    // 'T' needs exactly one code unit in each of the UTF encodings.
    Appender!(char[]) sink;
    assert(encode!char('T', sink) == 1);
    assert(encode!wchar('T', sink) == 1);
    assert(encode!dchar('T', sink) == 1);
}
2082
/**
 Encodes a single code point to a delegate.

 This function encodes a single code point into one or more code units.
 The code units are passed one at a time to the supplied delegate.

 The input to this function MUST be a valid code point.
 This is enforced by the function's in-contract.

 The output encoding cannot be deduced, so the code unit type must be given
 explicitly as the template argument.

 Supersedes:
 This function supersedes std.utf.encode(), however, note that the
 function codeUnits() supersedes it more conveniently.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
 c = the code point to be encoded
 dg = the delegate to invoke for each code unit
 */
void encode(E)(dchar c, void delegate(E) dg)
in
{
    assert(isValidCodePoint(c));
}
body
{
    alias Encoder = EncoderInstance!(E);
    Encoder.encode(c,dg);
}
2115
/**
 Encodes the contents of $(D s) in units of type $(D Tgt), writing the result to an
 output range.

 The source is decoded one code point at a time according to the encoding
 of $(D Src), and each code point is then re-encoded as $(D Tgt). Iterating
 over the raw code units instead would treat every code unit as a code
 point, which corrupts multi-unit UTF-8/UTF-16 input and code units of
 single-byte encodings that do not map to themselves.

 The input MUST be validly encoded; this is enforced by the in-contract of
 $(D decode).

 Returns: The number of $(D Tgt) elements written.
 Params:
 Tgt = Element type of $(D range).
 s = Input array.
 range = Output range.
 */
size_t encode(Tgt, Src, R)(in Src[] s, R range)
{
    size_t result;
    // decode() consumes its argument, so work on a mutable slice of s.
    const(Src)[] rest = s;
    while (rest.length != 0)
    {
        result += encode!(Tgt)(decode(rest), range);
    }
    return result;
}
2135
/**
 Returns a foreachable struct which can bidirectionally iterate over all
 code points in a string.

 The input to this function MUST be validly encoded.
 This is enforced by the function's in-contract.

 You can foreach either with or without an index. If an index is specified,
 it will be initialized at each iteration with the offset into the string at
 which the code point begins.

 Supersedes:
 This function supersedes std.utf.decode().

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
 s = the string to be decoded

 Example:
 --------------------------------------------------------
 string s = "hello world";
 foreach (c;codePoints(s))
 {
 // do something with c (which will always be a dchar)
 }
 --------------------------------------------------------

 Note that, currently, foreach (c;codePoints(s)) is superior to foreach (c;s)
 in that the latter will fall over on encountering U+FFFF.
 */
CodePoints!(E) codePoints(E)(immutable(E)[] s)
in
{
    assert(isValid(s));
}
body
{
    alias Result = CodePoints!(E);
    return Result(s);
}
2178
///
@system unittest
{
    string original = "hello";
    string rebuilt;
    // Every code point of an ASCII string fits in a single char.
    foreach (cp; codePoints(original))
    {
        rebuilt ~= cast(char) cp;
    }
    assert(original == rebuilt);
}
2190
/**
 Returns a foreachable struct which can bidirectionally iterate over all
 code units in a code point.

 The input to this function MUST be a valid code point.
 This is enforced by the function's in-contract.

 The output encoding cannot be deduced, so the code unit type must be given
 explicitly as the template argument.

 Supersedes:
 This function supersedes std.utf.encode().

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
 c = the code point to be encoded
 */
CodeUnits!(E) codeUnits(E)(dchar c)
in
{
    assert(isValidCodePoint(c));
}
body
{
    alias Result = CodeUnits!(E);
    return Result(c);
}
2219
///
@system unittest
{
    // Collect the UTF-8 code units of the euro sign.
    char[] units;
    foreach (unit; codeUnits!char(cast(dchar)'\u20AC'))
    {
        units ~= unit;
    }
    assert(units.length == 3);
    assert(units[0] == 0xE2);
    assert(units[1] == 0x82);
    assert(units[2] == 0xAC);
}
2233
/**
 Convert a string from one encoding to another.

 Supersedes:
 This function supersedes std.utf.toUTF8(), std.utf.toUTF16() and
 std.utf.toUTF32()
 (but note that to!() supersedes it more conveniently).

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
 s = Source string. $(B Must) be validly encoded.
 This is enforced by the function's in-contract.
 r = Destination string

 See_Also:
 $(REF to, std,conv)
 */
void transcode(Src, Dst)(Src[] s, out Dst[] r)
in
{
    assert(isValid(s));
}
body
{
    static if (is(Src == Dst) && is(Src == immutable))
    {
        // Same immutable type on both sides: share the data.
        r = s;
    }
    else static if (is(Unqual!Src == AsciiChar))
    {
        // Reinterpret the ASCII source as char data and transcode that.
        transcode(cast(const(char)[])s, r);
    }
    else
    {
        alias DstUnit = Unqual!Dst;

        // Free space required in the output before encoding a code point.
        static if (is(DstUnit == wchar))
        {
            immutable minReservePlace = 2;
        }
        else static if (is(DstUnit == dchar))
        {
            immutable minReservePlace = 1;
        }
        else
        {
            immutable minReservePlace = 6;
        }

        // `unused` is the not-yet-written tail of `output`; the encoder
        // advances it as it writes.
        auto output = new DstUnit[s.length];
        auto unused = output;

        while (s.length != 0)
        {
            if (unused.length < minReservePlace)
            {
                // Grow the output and re-derive the unused tail from the
                // number of code units written so far.
                immutable written = output.length - unused.length;
                output.length += s.length + minReservePlace;
                unused = output[written .. $];
            }
            EncoderInstance!(DstUnit).encode(decode(s), unused);
        }

        r = cast(Dst[]) output[0 .. output.length - unused.length];
    }
}
2300
///
@system pure unittest
{
    import std.string : representation;

    wstring ws;
    // transcode from UTF-8 to UTF-16
    transcode("hello world", ws);
    assert(ws == "hello world"w);

    Latin1String ls;
    // transcode from UTF-16 to ISO-8859-1
    transcode(ws, ls);
    // Verify the Latin-1 output itself (the source string ws was already
    // checked above); for this ASCII-only text the Latin-1 bytes equal
    // the UTF-8 bytes.
    assert(ls == "hello world".representation);
}
2314
@system pure unittest
{
    import std.meta;
    import std.range;

    // Round trip of ASCII-only text through every supported string type.
    {
        import std.conv : to;

        // to!string on the range gives its textual form, which is ASCII.
        string asciiCharString = iota(0, 128, 1).to!string;

        alias Types = AliasSeq!(string, Latin1String, Latin2String, AsciiString,
            Windows1250String, Windows1252String, dstring, wstring);
        foreach (First; Types)
        {
            foreach (Second; Types)
            {
                First viaFirst;
                Second viaSecond;
                string roundTrip;
                transcode(asciiCharString, viaFirst);
                transcode(viaFirst, viaSecond);
                transcode(viaSecond, roundTrip);
                assert(asciiCharString == roundTrip);
            }
        }
    }

    // Round trip of non-ASCII text through the UTF encodings only.
    {
        string czechChars = "Příliš žluťoučký kůň úpěl ďábelské ódy.";
        alias Types = AliasSeq!(string, dstring, wstring);
        foreach (First; Types)
        {
            foreach (Second; Types)
            {
                First viaFirst;
                Second viaSecond;
                string roundTrip;
                transcode(czechChars, viaFirst);
                transcode(viaFirst, viaSecond);
                transcode(viaSecond, roundTrip);
                assert(czechChars == roundTrip);
            }
        }
    }
}
2354
@system unittest // mutable/const input/output
{
    import std.meta : AliasSeq;

    // Every qualifier of the destination element type must be accepted,
    // whatever the qualifier of the source.
    foreach (Elem; AliasSeq!(Latin1Char, const Latin1Char, immutable Latin1Char))
    {
        Elem[] result;

        char[] mutableInput = "äbc".dup;
        transcode(mutableInput, result);
        assert(result == [0xE4, 'b', 'c']);

        const char[] constInput = "öbc";
        transcode(constInput, result);
        assert(result == [0xF6, 'b', 'c']);

        immutable char[] immutInput = "übc";
        transcode(immutInput, result);
        assert(result == [0xFC, 'b', 'c']);
    }

    // Make sure that const/mutable input is copied.
    foreach (Elem; AliasSeq!(char, const char))
    {
        Elem[] source = "foo".dup;
        Elem[] copy;
        transcode(source, copy);
        assert(source == copy);
        assert(source !is copy);
    }

    // But immutable input should not be copied.
    string source = "foo";
    string alias_;
    transcode(source, alias_);
    assert(source is alias_);
}
2392
2393 //=============================================================================
2394
/** The base class for exceptions thrown by this module */
class EncodingException : Exception
{
    // Forwards the message to Exception unchanged.
    this(string msg) @safe pure
    {
        super(msg);
    }
}
2397
// Specialisation of EncodingException. The constructor is private, so
// instances can only be created from inside this module.
class UnrecognizedEncodingException : EncodingException
{
    private this(string msg) @safe pure
    {
        super(msg);
    }
}
2402
/** Abstract base class of all encoding schemes */
abstract class EncodingScheme
{
    import std.uni : toLower;

    /**
     * Registers a subclass of EncodingScheme.
     *
     * This function allows user-defined subclasses of EncodingScheme to
     * be declared in other modules. Every name returned by the subclass's
     * names() is registered in lower case.
     *
     * Params:
     *     Klass = The subclass of EncodingScheme to register.
     *
     * Example:
     * ----------------------------------------------
     * class Amiga1251 : EncodingScheme
     * {
     *     shared static this()
     *     {
     *         EncodingScheme.register!Amiga1251;
     *     }
     * }
     * ----------------------------------------------
     */
    static void register(Klass:EncodingScheme)()
    {
        // A temporary instance is needed only to ask for the names.
        scope scheme = new Klass();
        foreach (encodingName;scheme.names())
        {
            supported[toLower(encodingName)] = () => new Klass();
        }
    }

    // Legacy, name-based registration. The class is looked up and
    // instantiated at run time; an EncodingException is thrown when that
    // is not possible (unknown class name, or not an EncodingScheme).
    deprecated("Please pass the EncodingScheme subclass as template argument instead.")
    static void register(string className)
    {
        auto scheme = instantiate(className);
        foreach (encodingName;scheme.names())
        {
            supportedFactories[toLower(encodingName)] = className;
        }
    }

    /**
     * Obtains a subclass of EncodingScheme which is capable of encoding
     * and decoding the named encoding scheme.
     *
     * This function is only aware of EncodingSchemes which have been
     * registered with the register() function. The schemes defined in
     * this module are registered on the first call. The lookup is
     * case-insensitive.
     *
     * Params:
     *     encodingName = the name of the wanted encoding scheme
     *
     * Throws:
     *     EncodingException if the name is not registered, or if a scheme
     *     registered by class name cannot be instantiated.
     *
     * Example:
     * ---------------------------------------------------
     * auto scheme = EncodingScheme.create("Amiga-1251");
     * ---------------------------------------------------
     */
    static EncodingScheme create(string encodingName)
    {
        static bool registerDefaultEncodings()
        {
            EncodingScheme.register!EncodingSchemeASCII;
            EncodingScheme.register!EncodingSchemeLatin1;
            EncodingScheme.register!EncodingSchemeLatin2;
            EncodingScheme.register!EncodingSchemeWindows1250;
            EncodingScheme.register!EncodingSchemeWindows1252;
            EncodingScheme.register!EncodingSchemeUtf8;
            EncodingScheme.register!EncodingSchemeUtf16Native;
            EncodingScheme.register!EncodingSchemeUtf32Native;
            return true;
        }

        static shared bool initialized;
        import std.concurrency : initOnce;
        initOnce!initialized(registerDefaultEncodings());
        encodingName = toLower(encodingName);

        // Schemes registered by type come with a factory function.
        if (auto factory = encodingName in supported)
            return (*factory)();

        // Otherwise fall back to schemes registered by class name.
        auto className = encodingName in supportedFactories;
        if (className is null)
            throw new EncodingException("Unrecognized Encoding: "~encodingName);
        return instantiate(*className);
    }

    const
    {
        /**
         * Returns the standard name of the encoding scheme
         */
        abstract override string toString();

        /**
         * Returns an array of all known names for this encoding scheme
         */
        abstract string[] names();

        /**
         * Returns true if the character c can be represented
         * in this encoding scheme.
         */
        abstract bool canEncode(dchar c);

        /**
         * Returns the number of ubytes required to encode this code point.
         *
         * The input to this function MUST be a valid code point.
         *
         * Params:
         *     c = the code point to be encoded
         *
         * Returns:
         *     the number of ubytes required.
         */
        abstract size_t encodedLength(dchar c);

        /**
         * Encodes a single code point into a user-supplied, fixed-size buffer.
         *
         * This function encodes a single code point into one or more ubytes.
         * The supplied buffer must be code unit aligned.
         * (For example, UTF-16LE or UTF-16BE must be wchar-aligned,
         * UTF-32LE or UTF-32BE must be dchar-aligned, etc.)
         *
         * The input to this function MUST be a valid code point.
         *
         * Params:
         *     c = the code point to be encoded
         *     buffer = the destination array
         *
         * Returns:
         *     the number of ubytes written.
         */
        abstract size_t encode(dchar c, ubyte[] buffer);

        /**
         * Decodes a single code point.
         *
         * This function removes one or more ubytes from the start of an array,
         * and returns the decoded code point which those ubytes represent.
         *
         * The input to this function MUST be validly encoded.
         *
         * Params:
         *     s = the array whose first code point is to be decoded
         */
        abstract dchar decode(ref const(ubyte)[] s);

        /**
         * Decodes a single code point. The input does not have to be valid.
         *
         * This function removes one or more ubytes from the start of an array,
         * and returns the decoded code point which those ubytes represent.
         *
         * This function will accept an invalidly encoded array as input.
         * If an invalid sequence is found at the start of the string, this
         * function will remove it, and return the value INVALID_SEQUENCE.
         *
         * Params:
         *     s = the array whose first code point is to be decoded
         */
        abstract dchar safeDecode(ref const(ubyte)[] s);

        /**
         * Returns the sequence of ubytes to be used to represent
         * any character which cannot be represented in the encoding scheme.
         *
         * Normally this will be a representation of some substitution
         * character, such as U+FFFD or '?'.
         */
        abstract @property immutable(ubyte)[] replacementSequence();
    }

    /**
     * Returns true if the array is encoded correctly
     *
     * Params:
     *     s = the array to be tested
     */
    bool isValid(const(ubyte)[] s)
    {
        while (s.length != 0)
        {
            if (safeDecode(s) == INVALID_SEQUENCE)
                return false;
        }
        return true;
    }

    /**
     * Returns the length of the longest possible substring, starting from
     * the first element, which is validly encoded.
     *
     * Params:
     *     s = the array to be tested
     */
    size_t validLength()(const(ubyte)[] s)
    {
        const(ubyte)[] r = s;
        // t trails s: it marks the input left after the last good sequence.
        const(ubyte)[] t = s;
        while (s.length != 0)
        {
            if (safeDecode(s) == INVALID_SEQUENCE) break;
            t = s;
        }
        return r.length - t.length;
    }

    /**
     * Sanitizes an array by replacing malformed ubyte sequences with valid
     * ubyte sequences. The result is guaranteed to be valid for this
     * encoding scheme.
     *
     * If the input array is already valid, this function returns the
     * original, otherwise it constructs a new array by replacing all illegal
     * sequences with the encoding scheme's replacement sequence.
     *
     * Params:
     *     s = the string to be sanitized
     */
    immutable(ubyte)[] sanitize()(immutable(ubyte)[] s)
    {
        auto n = validLength(s);
        if (n == s.length) return s;

        auto repSeq = replacementSequence;

        // Count how long the string needs to be.
        // Overestimating is not a problem
        auto len = s.length;
        const(ubyte)[] t = s[n..$];
        while (t.length != 0)
        {
            // t always starts at a malformed sequence here: drop it, then
            // skip the valid run that follows.
            immutable c = safeDecode(t);
            assert(c == INVALID_SEQUENCE);
            len += repSeq.length;
            t = t[validLength(t)..$];
        }

        // Now do the write
        ubyte[] array = new ubyte[len];
        array[0 .. n] = s[0 .. n];
        auto offset = n;

        t = s[n..$];
        while (t.length != 0)
        {
            immutable c = safeDecode(t);
            assert(c == INVALID_SEQUENCE);
            array[offset .. offset+repSeq.length] = repSeq[];
            offset += repSeq.length;
            n = validLength(t);
            array[offset .. offset+n] = t[0 .. n];
            offset += n;
            t = t[n..$];
        }
        return cast(immutable(ubyte)[])array[0 .. offset];
    }

    /**
     * Returns the length of the first encoded sequence.
     *
     * The input to this function MUST be validly encoded.
     * This is enforced by the function's in-contract.
     *
     * Params:
     *     s = the array to be sliced
     */
    size_t firstSequence()(const(ubyte)[] s)
    in
    {
        assert(s.length != 0);
        const(ubyte)[] u = s;
        assert(safeDecode(u) != INVALID_SEQUENCE);
    }
    body
    {
        const(ubyte)[] t = s;
        decode(s);
        return t.length - s.length;
    }

    /**
     * Returns the total number of code points encoded in a ubyte array.
     *
     * The input to this function MUST be validly encoded.
     * This is enforced by the function's in-contract.
     *
     * Params:
     *     s = the string to be counted
     */
    size_t count()(const(ubyte)[] s)
    in
    {
        assert(isValid(s));
    }
    body
    {
        size_t n = 0;
        while (s.length != 0)
        {
            decode(s);
            ++n;
        }
        return n;
    }

    /**
     * Returns the array index at which the (n+1)th code point begins.
     *
     * The input to this function MUST be validly encoded.
     * This is enforced by the function's in-contract.
     *
     * Params:
     *     s = the string to be counted
     *     n = the current code point index
     */
    ptrdiff_t index()(const(ubyte)[] s, size_t n)
    in
    {
        // n is unsigned, so there is nothing to check about its sign.
        assert(isValid(s));
    }
    body
    {
        const(ubyte)[] t = s;
        for (size_t i=0; i<n; ++i) decode(s);
        return t.length - s.length;
    }

    // Creates an EncodingScheme from a fully qualified class name.
    // ClassInfo.find yields null for an unknown name and create() may
    // yield null as well; both cases, as well as a class that is not an
    // EncodingScheme, are reported as an EncodingException.
    private static EncodingScheme instantiate(string className)
    {
        auto info = ClassInfo.find(className);
        auto scheme = info is null ? null : cast(EncodingScheme) info.create();
        if (scheme is null)
            throw new EncodingException("Unable to create class "~className);
        return scheme;
    }

    // Lower-cased encoding name -> factory, filled by register!Klass.
    __gshared EncodingScheme function()[string] supported;
    // Lower-cased encoding name -> class name, filled by register(string).
    __gshared string[string] supportedFactories;
}
2741
/**
 EncodingScheme to handle ASCII

 This scheme recognises the following names:
                 "ANSI_X3.4-1968",
                 "ANSI_X3.4-1986",
                 "ASCII",
                 "IBM367",
                 "ISO646-US",
                 "ISO_646.irv:1991",
                 "US-ASCII",
                 "cp367",
                 "csASCII",
                 "iso-ir-6",
                 "us"
 */
class EncodingSchemeASCII : EncodingScheme
{
    // No `shared static this()` registration here: per the original note
    // it was moved to std.internal.phobosinit.

    // Code unit type handed to the std.encoding primitives.
    private alias Unit = AsciiChar;

    override string[] names() const @safe pure nothrow
    {
        return [
            "ANSI_X3.4-1968",
            "ANSI_X3.4-1986",
            "ASCII",
            "IBM367",
            "ISO646-US",
            "ISO_646.irv:1991",
            "US-ASCII",
            "cp367",
            "csASCII",
            "iso-ir-6",
            "us"
        ];
    }

    override string toString() const @safe pure nothrow @nogc
    {
        return "ASCII";
    }

    override bool canEncode(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.canEncode!Unit(c);
    }

    override size_t encodedLength(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.encodedLength!Unit(c);
    }

    override size_t encode(dchar c, ubyte[] buffer) const @safe pure nothrow @nogc
    {
        auto dest = cast(Unit[]) buffer;
        return std.encoding.encode(c, dest);
    }

    override dchar decode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    {
        auto rest = cast(const(Unit)[]) s;
        immutable dchar point = std.encoding.decode(rest);
        // Keep only as many trailing bytes as the decoder left over.
        s = s[$ - rest.length .. $];
        return point;
    }

    override dchar safeDecode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    {
        auto rest = cast(const(Unit)[]) s;
        immutable dchar point = std.encoding.safeDecode(rest);
        s = s[$ - rest.length .. $];
        return point;
    }

    override @property immutable(ubyte)[] replacementSequence() const @safe pure nothrow @nogc
    {
        return cast(immutable(ubyte)[]) "?";
    }
}
2829
/**
 EncodingScheme to handle Latin-1

 This scheme recognises the following names:
                 "CP819",
                 "IBM819",
                 "ISO-8859-1",
                 "ISO_8859-1",
                 "ISO_8859-1:1987",
                 "csISOLatin1",
                 "iso-ir-100",
                 "l1",
                 "latin1"
 */
class EncodingSchemeLatin1 : EncodingScheme
{
    // No `shared static this()` registration here: per the original note
    // it was moved to std.internal.phobosinit.

    // Code unit type handed to the std.encoding primitives.
    private alias Unit = Latin1Char;

    override string[] names() const @safe pure nothrow
    {
        return [
            "CP819",
            "IBM819",
            "ISO-8859-1",
            "ISO_8859-1",
            "ISO_8859-1:1987",
            "csISOLatin1",
            "iso-ir-100",
            "l1",
            "latin1"
        ];
    }

    override string toString() const @safe pure nothrow @nogc
    {
        return "ISO-8859-1";
    }

    override bool canEncode(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.canEncode!Unit(c);
    }

    override size_t encodedLength(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.encodedLength!Unit(c);
    }

    override size_t encode(dchar c, ubyte[] buffer) const @safe pure nothrow @nogc
    {
        auto dest = cast(Unit[]) buffer;
        return std.encoding.encode(c, dest);
    }

    override dchar decode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    {
        auto rest = cast(const(Unit)[]) s;
        immutable dchar point = std.encoding.decode(rest);
        // Keep only as many trailing bytes as the decoder left over.
        s = s[$ - rest.length .. $];
        return point;
    }

    override dchar safeDecode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    {
        auto rest = cast(const(Unit)[]) s;
        immutable dchar point = std.encoding.safeDecode(rest);
        s = s[$ - rest.length .. $];
        return point;
    }

    override @property immutable(ubyte)[] replacementSequence() const @safe pure nothrow @nogc
    {
        return cast(immutable(ubyte)[]) "?";
    }
}
2913
/**
 EncodingScheme to handle Latin-2

 This scheme recognises the following names:
                 "Latin 2",
                 "ISO-8859-2",
                 "ISO_8859-2",
                 "ISO_8859-2:1999",
                 "windows-28592"
 */
class EncodingSchemeLatin2 : EncodingScheme
{
    // No `shared static this()` registration here: per the original note
    // it was moved to std.internal.phobosinit.

    // Code unit type handed to the std.encoding primitives.
    private alias Unit = Latin2Char;

    override string[] names() const @safe pure nothrow
    {
        return [
            "Latin 2",
            "ISO-8859-2",
            "ISO_8859-2",
            "ISO_8859-2:1999",
            "windows-28592"
        ];
    }

    override string toString() const @safe pure nothrow @nogc
    {
        return "ISO-8859-2";
    }

    override bool canEncode(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.canEncode!Unit(c);
    }

    override size_t encodedLength(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.encodedLength!Unit(c);
    }

    override size_t encode(dchar c, ubyte[] buffer) const @safe pure nothrow @nogc
    {
        auto dest = cast(Unit[]) buffer;
        return std.encoding.encode(c, dest);
    }

    override dchar decode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    {
        auto rest = cast(const(Unit)[]) s;
        immutable dchar point = std.encoding.decode(rest);
        // Keep only as many trailing bytes as the decoder left over.
        s = s[$ - rest.length .. $];
        return point;
    }

    override dchar safeDecode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    {
        auto rest = cast(const(Unit)[]) s;
        immutable dchar point = std.encoding.safeDecode(rest);
        s = s[$ - rest.length .. $];
        return point;
    }

    override @property immutable(ubyte)[] replacementSequence() const @safe pure nothrow @nogc
    {
        return cast(immutable(ubyte)[]) "?";
    }
}
2989
/**
 EncodingScheme to handle Windows-1250

 This scheme recognises the following names:
                 "windows-1250"
 */
class EncodingSchemeWindows1250 : EncodingScheme
{
    // No `shared static this()` registration here: per the original note
    // it was moved to std.internal.phobosinit.

    // Code unit type handed to the std.encoding primitives.
    private alias Unit = Windows1250Char;

    override string[] names() const @safe pure nothrow
    {
        return [ "windows-1250" ];
    }

    override string toString() const @safe pure nothrow @nogc
    {
        return "windows-1250";
    }

    override bool canEncode(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.canEncode!Unit(c);
    }

    override size_t encodedLength(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.encodedLength!Unit(c);
    }

    override size_t encode(dchar c, ubyte[] buffer) const @safe pure nothrow @nogc
    {
        auto dest = cast(Unit[]) buffer;
        return std.encoding.encode(c, dest);
    }

    override dchar decode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    {
        auto rest = cast(const(Unit)[]) s;
        immutable dchar point = std.encoding.decode(rest);
        // Keep only as many trailing bytes as the decoder left over.
        s = s[$ - rest.length .. $];
        return point;
    }

    override dchar safeDecode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    {
        auto rest = cast(const(Unit)[]) s;
        immutable dchar point = std.encoding.safeDecode(rest);
        s = s[$ - rest.length .. $];
        return point;
    }

    override @property immutable(ubyte)[] replacementSequence() const @safe pure nothrow @nogc
    {
        return cast(immutable(ubyte)[]) "?";
    }
}
3057
/**
 EncodingScheme to handle Windows-1252

 This scheme recognises the following names:
                 "windows-1252"
 */
class EncodingSchemeWindows1252 : EncodingScheme
{
    // No `shared static this()` registration here: per the original note
    // it was moved to std.internal.phobosinit.

    // Code unit type handed to the std.encoding primitives.
    private alias Unit = Windows1252Char;

    override string[] names() const @safe pure nothrow
    {
        return [ "windows-1252" ];
    }

    override string toString() const @safe pure nothrow @nogc
    {
        return "windows-1252";
    }

    override bool canEncode(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.canEncode!Unit(c);
    }

    override size_t encodedLength(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.encodedLength!Unit(c);
    }

    override size_t encode(dchar c, ubyte[] buffer) const @safe pure nothrow @nogc
    {
        auto dest = cast(Unit[]) buffer;
        return std.encoding.encode(c, dest);
    }

    override dchar decode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    {
        auto rest = cast(const(Unit)[]) s;
        immutable dchar point = std.encoding.decode(rest);
        // Keep only as many trailing bytes as the decoder left over.
        s = s[$ - rest.length .. $];
        return point;
    }

    override dchar safeDecode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    {
        auto rest = cast(const(Unit)[]) s;
        immutable dchar point = std.encoding.safeDecode(rest);
        s = s[$ - rest.length .. $];
        return point;
    }

    override @property immutable(ubyte)[] replacementSequence() const @safe pure nothrow @nogc
    {
        return cast(immutable(ubyte)[]) "?";
    }
}
3125
/**
 EncodingScheme to handle UTF-8

 This scheme recognises the following names:
                 "UTF-8"
 */
class EncodingSchemeUtf8 : EncodingScheme
{
    // No `shared static this()` registration here: per the original note
    // it was moved to std.internal.phobosinit.

    // Code unit type handed to the std.encoding primitives.
    private alias Unit = char;

    override string[] names() const @safe pure nothrow
    {
        return [ "UTF-8" ];
    }

    override string toString() const @safe pure nothrow @nogc
    {
        return "UTF-8";
    }

    override bool canEncode(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.canEncode!Unit(c);
    }

    override size_t encodedLength(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.encodedLength!Unit(c);
    }

    override size_t encode(dchar c, ubyte[] buffer) const @safe pure nothrow @nogc
    {
        auto dest = cast(Unit[]) buffer;
        return std.encoding.encode(c, dest);
    }

    override dchar decode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    {
        auto rest = cast(const(Unit)[]) s;
        immutable dchar point = std.encoding.decode(rest);
        // Keep only as many trailing bytes as the decoder left over.
        s = s[$ - rest.length .. $];
        return point;
    }

    override dchar safeDecode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    {
        auto rest = cast(const(Unit)[]) s;
        immutable dchar point = std.encoding.safeDecode(rest);
        s = s[$ - rest.length .. $];
        return point;
    }

    override @property immutable(ubyte)[] replacementSequence() const @safe pure nothrow @nogc
    {
        // U+FFFD as a UTF-8 string literal, viewed as bytes.
        return cast(immutable(ubyte)[]) "\uFFFD";
    }
}
3193
/**
 EncodingScheme to handle UTF-16 in native byte order

 This scheme recognises the following names:
                 "UTF-16LE" (little-endian architecture only)
                 "UTF-16BE" (big-endian architecture only)
 */
class EncodingSchemeUtf16Native : EncodingScheme
{
    // No `shared static this()` registration here: per the original note
    // it was moved to std.internal.phobosinit.

    // The single name of this scheme depends on the target's byte order.
    version (LittleEndian) { enum string NAME = "UTF-16LE"; }
    version (BigEndian) { enum string NAME = "UTF-16BE"; }

    // Code unit type handed to the std.encoding primitives.
    private alias Unit = wchar;

    override string[] names() const @safe pure nothrow
    {
        return [ NAME ];
    }

    override string toString() const @safe pure nothrow @nogc
    {
        return NAME;
    }

    override bool canEncode(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.canEncode!Unit(c);
    }

    override size_t encodedLength(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.encodedLength!Unit(c);
    }

    override size_t encode(dchar c, ubyte[] buffer) const @safe pure nothrow @nogc
    {
        auto dest = cast(Unit[]) buffer;
        // The result is scaled from code units to bytes.
        return Unit.sizeof * std.encoding.encode(c, dest);
    }

    override dchar decode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    in
    {
        // The byte count must be a whole number of wchars.
        assert((s.length & 1) == 0);
    }
    body
    {
        auto rest = cast(const(Unit)[]) s;
        immutable dchar point = std.encoding.decode(rest);
        // rest counts code units; convert to bytes when re-slicing s.
        s = s[$ - rest.length * Unit.sizeof .. $];
        return point;
    }

    override dchar safeDecode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    in
    {
        assert((s.length & 1) == 0);
    }
    body
    {
        auto rest = cast(const(Unit)[]) s;
        immutable dchar point = std.encoding.safeDecode(rest);
        s = s[$ - rest.length * Unit.sizeof .. $];
        return point;
    }

    override @property immutable(ubyte)[] replacementSequence() const @safe pure nothrow @nogc
    {
        // U+FFFD as a UTF-16 string literal, viewed as bytes.
        return cast(immutable(ubyte)[]) "\uFFFD"w;
    }
}
@system unittest
{
    // Three UTF-16 code units (0x019A, 0x019B, 0x019C) in native order.
    version (LittleEndian)
    {
        auto scheme = EncodingScheme.create("utf-16le");
        ubyte[6] raw = [0x9A,0x01, 0x9B,0x01, 0x9C,0x01];
    }
    version (BigEndian)
    {
        auto scheme = EncodingScheme.create("utf-16be");
        ubyte[6] raw = [0x01,0x9A, 0x01,0x9B, 0x01,0x9C];
    }
    const(ubyte)[] pending = raw[];
    immutable dchar first = scheme.safeDecode(pending);
    assert(first == 0x19A);
    // Exactly one two-byte code unit has been consumed.
    assert(pending.length == 4);
}
3289
/**
 EncodingScheme to handle UTF-32 in native byte order

 This scheme recognises the following names:
                 "UTF-32LE" (little-endian architecture only)
                 "UTF-32BE" (big-endian architecture only)
 */
class EncodingSchemeUtf32Native : EncodingScheme
{
    // No `shared static this()` registration here: per the original note
    // it was moved to std.internal.phobosinit.

    // The single name of this scheme depends on the target's byte order.
    version (LittleEndian) { enum string NAME = "UTF-32LE"; }
    version (BigEndian) { enum string NAME = "UTF-32BE"; }

    // Code unit type handed to the std.encoding primitives.
    private alias Unit = dchar;

    override string[] names() const @safe pure nothrow
    {
        return [ NAME ];
    }

    override string toString() const @safe pure nothrow @nogc
    {
        return NAME;
    }

    override bool canEncode(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.canEncode!Unit(c);
    }

    override size_t encodedLength(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.encodedLength!Unit(c);
    }

    override size_t encode(dchar c, ubyte[] buffer) const @safe pure nothrow @nogc
    {
        auto dest = cast(Unit[]) buffer;
        // The result is scaled from code units to bytes.
        return Unit.sizeof * std.encoding.encode(c, dest);
    }

    override dchar decode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    in
    {
        // The byte count must be a whole number of dchars.
        assert((s.length & 3) == 0);
    }
    body
    {
        auto rest = cast(const(Unit)[]) s;
        immutable dchar point = std.encoding.decode(rest);
        // rest counts code units; convert to bytes when re-slicing s.
        s = s[$ - rest.length * Unit.sizeof .. $];
        return point;
    }

    override dchar safeDecode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    in
    {
        assert((s.length & 3) == 0);
    }
    body
    {
        auto rest = cast(const(Unit)[]) s;
        immutable dchar point = std.encoding.safeDecode(rest);
        s = s[$ - rest.length * Unit.sizeof .. $];
        return point;
    }

    override @property immutable(ubyte)[] replacementSequence() const @safe pure nothrow @nogc
    {
        // U+FFFD as a UTF-32 string literal, viewed as bytes.
        return cast(immutable(ubyte)[]) "\uFFFD"d;
    }
}
@system unittest
{
    // Three UTF-32 code units (0x019A, 0x019B, 0x019C) in native order.
    version (LittleEndian)
    {
        auto scheme = EncodingScheme.create("utf-32le");
        ubyte[12] raw = [0x9A,0x01,0x00,0x00, 0x9B,0x01,0x00,0x00, 0x9C,0x01,0x00,0x00];
    }
    version (BigEndian)
    {
        auto scheme = EncodingScheme.create("utf-32be");
        ubyte[12] raw = [0x00,0x00,0x01,0x9A, 0x00,0x00,0x01,0x9B, 0x00,0x00,0x01,0x9C];
    }
    const(ubyte)[] pending = raw[];
    immutable dchar first = scheme.safeDecode(pending);
    assert(first == 0x19A);
    // Exactly one four-byte code unit has been consumed.
    assert(pending.length == 8);
}
3385
3386 //=============================================================================
3387
3388
3389 // Helper functions
3390 version (unittest)
3391 {
// Test helper: transcodes s into r by walking the code points of s, and
// the code units of each code point, back to front while prepending to r.
//
// Params:
//     s = source string
//     r = receives the transcoded string
void transcodeReverse(Src,Dst)(immutable(Src)[] s, out immutable(Dst)[] r)
{
    static if (is(Src == Dst))
    {
        // Same encoding: the result is the input itself. It has to be
        // stored in the out parameter, as this function returns void.
        r = s;
    }
    else static if (is(Src == AsciiChar))
    {
        // ASCII input is re-viewed as char data and handled from there.
        transcodeReverse!(char,Dst)(cast(string) s,r);
    }
    else
    {
        foreach_reverse (d;codePoints(s))
        {
            foreach_reverse (c;codeUnits!(Dst)(d))
            {
                r = c ~ r;
            }
        }
    }
}
3413
// Test helper: renders a UTF-8 string as a double-quoted literal in which
// every code unit outside 0x20 .. 0x7F is written as \xHH.
string makeReadable(string s)
{
    auto text = "\"";
    foreach (char unit; s)
    {
        immutable bool verbatim = unit >= 0x20 && unit < 0x80;
        if (verbatim)
        {
            text ~= unit;
            continue;
        }
        text ~= "\\x";
        text ~= toHexDigit(unit >> 4);
        text ~= toHexDigit(unit);
    }
    text ~= "\"";
    return text;
}
3433
// Test helper: renders a UTF-16 string as a double-quoted literal with a
// "w" suffix; code units outside 0x20 .. 0x7F are written as \uHHHH.
string makeReadable(wstring s)
{
    // Shift amounts selecting the four hex digits, most significant first.
    static immutable int[4] shifts = [12, 8, 4, 0];

    auto text = "\"";
    foreach (wchar unit; s)
    {
        immutable bool verbatim = unit >= 0x20 && unit < 0x80;
        if (verbatim)
        {
            text ~= cast(char) unit;
            continue;
        }
        text ~= "\\u";
        foreach (shift; shifts)
            text ~= toHexDigit(unit >> shift);
    }
    text ~= "\"w";
    return text;
}
3455
// Test helper: renders a UTF-32 string as a double-quoted literal with a
// "d" suffix. Code units in 0x20 .. 0x7F are copied, other values below
// 0x10000 are written as \uHHHH and the rest as \U00HHHHHH.
string makeReadable(dstring s)
{
    // Shift amounts selecting hex digits, most significant first.
    static immutable int[4] shortShifts = [12, 8, 4, 0];
    static immutable int[6] longShifts = [20, 16, 12, 8, 4, 0];

    auto text = "\"";
    foreach (dchar unit; s)
    {
        immutable bool verbatim = unit >= 0x20 && unit < 0x80;
        if (verbatim)
        {
            text ~= cast(char) unit;
            continue;
        }

        if (unit < 0x10000)
        {
            text ~= "\\u";
            foreach (shift; shortShifts)
                text ~= toHexDigit(unit >> shift);
        }
        else
        {
            text ~= "\\U00";
            foreach (shift; longShifts)
                text ~= toHexDigit(unit >> shift);
        }
    }
    text ~= "\"d";
    return text;
}
3487
// Test helper: upper-case hexadecimal digit for the low four bits of n;
// all higher bits are ignored.
char toHexDigit(int n)
{
    static immutable string digits = "0123456789ABCDEF";
    immutable nibble = n & 0xF;
    return digits[nibble];
}
3492 }
3493
/** Definitions of common Byte Order Marks.
The elements of the $(D enum) can be used as indices into $(D bomTable) to get
matching $(D BOMSeq).
*/
enum BOM
{
    /// no BOM was found
    none = 0,
    /// [0x00, 0x00, 0xFE, 0xFF]
    utf32be = 1,
    /// [0xFF, 0xFE, 0x00, 0x00]
    utf32le = 2,
    /* [0x2B, 0x2F, 0x76, 0x38],
       [0x2B, 0x2F, 0x76, 0x39],
       [0x2B, 0x2F, 0x76, 0x2B],
       [0x2B, 0x2F, 0x76, 0x2F],
       [0x2B, 0x2F, 0x76, 0x38, 0x2D]
       Five sequences share this value, which is why the next member
       continues at 8.
    */
    utf7 = 3,
    /// [0xF7, 0x64, 0x4C]
    utf1 = 8,
    /// [0xDD, 0x73, 0x66, 0x73]
    utfebcdic = 9,
    /// [0x0E, 0xFE, 0xFF]
    scsu = 10,
    /// [0xFB, 0xEE, 0x28]
    bocu1 = 11,
    /// [0x84, 0x31, 0x95, 0x33]
    gb18030 = 12,
    /// [0xEF, 0xBB, 0xBF]
    utf8 = 13,
    /// [0xFE, 0xFF]
    utf16be = 14,
    /// [0xFF, 0xFE]
    utf16le = 15
}
3518
/// The type stored inside $(D bomTable).
alias BOMSeq = Tuple!(BOM, "schema", ubyte[], "sequence");

/** Mapping of a byte sequence to $(B Byte Order Mark (BOM))
*/
immutable bomTable = [
    BOMSeq(BOM.none,      null),
    BOMSeq(BOM.utf32be,   cast(ubyte[]) [0x00, 0x00, 0xFE, 0xFF]),
    BOMSeq(BOM.utf32le,   cast(ubyte[]) [0xFF, 0xFE, 0x00, 0x00]),
    // The five-byte UTF-7 sequence comes before its four-byte prefix
    // [0x2B, 0x2F, 0x76, 0x38].
    BOMSeq(BOM.utf7,      cast(ubyte[]) [0x2B, 0x2F, 0x76, 0x39]),
    BOMSeq(BOM.utf7,      cast(ubyte[]) [0x2B, 0x2F, 0x76, 0x2B]),
    BOMSeq(BOM.utf7,      cast(ubyte[]) [0x2B, 0x2F, 0x76, 0x2F]),
    BOMSeq(BOM.utf7,      cast(ubyte[]) [0x2B, 0x2F, 0x76, 0x38, 0x2D]),
    BOMSeq(BOM.utf7,      cast(ubyte[]) [0x2B, 0x2F, 0x76, 0x38]),
    BOMSeq(BOM.utf1,      cast(ubyte[]) [0xF7, 0x64, 0x4C]),
    BOMSeq(BOM.utfebcdic, cast(ubyte[]) [0xDD, 0x73, 0x66, 0x73]),
    BOMSeq(BOM.scsu,      cast(ubyte[]) [0x0E, 0xFE, 0xFF]),
    BOMSeq(BOM.bocu1,     cast(ubyte[]) [0xFB, 0xEE, 0x28]),
    BOMSeq(BOM.gb18030,   cast(ubyte[]) [0x84, 0x31, 0x95, 0x33]),
    BOMSeq(BOM.utf8,      cast(ubyte[]) [0xEF, 0xBB, 0xBF]),
    BOMSeq(BOM.utf16be,   cast(ubyte[]) [0xFE, 0xFF]),
    BOMSeq(BOM.utf16le,   cast(ubyte[]) [0xFF, 0xFE])
];
3542
/** Detects which $(D BOM), if any, the given $(D input) starts with.

The entries of $(D bomTable) are tried in order and the first one whose byte
sequence is a prefix of $(D input) is returned. If none matches, the
$(D BOMSeq) for $(D BOM.none) is returned.

Every comparison runs on $(D input.save), so the $(D BOM) sequence at the
beginning of the range is not consumed from the passed range. If you pass a
reference type range make sure that $(D save) creates a deep copy.

Params:
    input = The sequence to check for the $(D BOM)

Returns:
    the found $(D BOMSeq) corresponding to the passed $(D input).
*/
immutable(BOMSeq) getBOM(Range)(Range input)
if (isForwardRange!Range && is(Unqual!(ElementType!Range) == ubyte))
{
    import std.algorithm.searching : startsWith;

    // Slot 0 is the BOM.none fallback, so real candidates start at slot 1.
    for (size_t slot = 1; slot < bomTable.length; ++slot)
    {
        if (startsWith(input.save, bomTable[slot].sequence))
            return bomTable[slot];
    }

    return bomTable[0];
}
3569
///
@system unittest
{
    import std.format : format;

    // Viewing a UTF-32 string as raw bytes exposes its leading U+FEFF in the
    // byte order of the machine running the test.
    auto text = dchar(0x0000FEFF) ~ "Hello World"d;
    auto detected = getBOM(cast(ubyte[]) text);

    version (BigEndian)
        enum expected = BOM.utf32be;
    else
        enum expected = BOM.utf32le;

    assert(detected.schema == expected, format("%s", detected.schema));
}
3587
@system unittest
{
    // Each table entry, followed by some payload bytes, must be detected as
    // exactly that entry.
    foreach (idx, row; bomTable)
    {
        auto data = row.sequence ~ cast(ubyte[])"hello world";
        auto found = getBOM(data);

        assert(found.schema == row.schema);
        // Checked for the UTF-7 rows too: with this payload no earlier table
        // entry is a prefix of a later one, so the exact row comes back.
        assert(found.sequence == row.sequence);

        // BOM.utf7 covers table indices 3 .. 7, so the enum value equals the
        // table index only outside 4 .. 7.
        if (idx < 4 || idx > 7)
        {
            assert(found.schema == BOM.init + idx);
        }
    }
}
3605
@safe pure unittest
{
    // Minimal forward range over bytes that is deliberately not an array, so
    // that getBOM is exercised through the range primitives only.
    struct BOMInputRange
    {
        ubyte[] arr;

        @property ubyte front()
        {
            return this.arr.front;
        }

        @property bool empty()
        {
            return this.arr.empty;
        }

        void popFront()
        {
            this.arr = this.arr[1 .. $];
        }

        @property typeof(this) save()
        {
            return this;
        }
    }

    // getBOM requires a forward range, which is stronger than an input range.
    static assert( isForwardRange!BOMInputRange);
    static assert(!isArray!BOMInputRange);

    ubyte[] dummyEnd = [0,0,0,0];

    foreach (it; bomTable[1 .. $])
    {
        // A range holding exactly the mark is recognised and left intact.
        {
            auto ir = BOMInputRange(it.sequence.dup);

            auto b = getBOM(ir);
            assert(b.schema == it.schema);
            assert(ir.arr == it.sequence);
        }

        // Only the first byte of the mark, padded with zeros, is no BOM.
        {
            auto noBom = it.sequence[0 .. 1].dup ~ dummyEnd;
            // The kept prefix must be shorter than the complete mark.
            assert(noBom.length - dummyEnd.length < it.sequence.length);

            auto ir = BOMInputRange(noBom.dup);
            auto b = getBOM(ir);
            assert(b.schema == BOM.none);
            // Check the range that was actually handed to getBOM.
            assert(ir.arr == noBom);
        }
    }
}
3660
/** The byte order mark as a single, fully decoded code point (U+FEFF). */
enum dchar utfBOM = '\uFEFF';