libphobos/src/std/encoding.d

   1 // Written in the D programming language.
   2
   3 /**
   4 Classes and functions for handling and transcoding between various encodings.
   5
   6 For cases where the _encoding is known at compile-time, functions are provided
   7 for arbitrary _encoding and decoding of characters, arbitrary transcoding
   8 between strings of different type, as well as validation and sanitization.
   9
  10 Encodings currently supported are UTF-8, UTF-16, UTF-32, ASCII, ISO-8859-1
  11 (also known as LATIN-1), ISO-8859-2 (LATIN-2), WINDOWS-1250 and WINDOWS-1252.
  12
  13 $(SCRIPT inhibitQuickIndex = 1;)
  14 $(BOOKTABLE,
  15 $(TR $(TH Category) $(TH Functions))
  16 $(TR $(TD Decode) $(TD
  17     $(LREF codePoints)
  18     $(LREF decode)
  19     $(LREF decodeReverse)
  20     $(LREF safeDecode)
  21 ))
  22 $(TR $(TD Conversion) $(TD
  23     $(LREF codeUnits)
  24     $(LREF sanitize)
  25     $(LREF transcode)
  26 ))
  27 $(TR $(TD Classification) $(TD
  28     $(LREF canEncode)
  29     $(LREF isValid)
  30     $(LREF isValidCodePoint)
  31     $(LREF isValidCodeUnit)
  32 ))
  33 $(TR $(TD BOM) $(TD
  34     $(LREF BOM)
  35     $(LREF BOMSeq)
  36     $(LREF getBOM)
  37     $(LREF utfBOM)
  38 ))
  39 $(TR $(TD Length &amp; Index) $(TD
  40     $(LREF firstSequence)
  41     $(LREF encodedLength)
  42     $(LREF index)
  43     $(LREF lastSequence)
  44     $(LREF validLength)
  45 ))
  46 $(TR $(TD Encoding schemes) $(TD
  47     $(LREF encodingName)
  48     $(LREF EncodingScheme)
  49     $(LREF EncodingSchemeASCII)
  50     $(LREF EncodingSchemeLatin1)
  51     $(LREF EncodingSchemeLatin2)
  52     $(LREF EncodingSchemeUtf16Native)
  53     $(LREF EncodingSchemeUtf32Native)
  54     $(LREF EncodingSchemeUtf8)
  55     $(LREF EncodingSchemeWindows1250)
  56     $(LREF EncodingSchemeWindows1252)
  57 ))
  58 $(TR $(TD Representation) $(TD
  59     $(LREF AsciiChar)
  60     $(LREF AsciiString)
  61     $(LREF Latin1Char)
  62     $(LREF Latin1String)
  63     $(LREF Latin2Char)
  64     $(LREF Latin2String)
  65     $(LREF Windows1250Char)
  66     $(LREF Windows1250String)
  67     $(LREF Windows1252Char)
  68     $(LREF Windows1252String)
  69 ))
  70 $(TR $(TD Exceptions) $(TD
  71     $(LREF INVALID_SEQUENCE)
  72     $(LREF EncodingException)
  73 ))
  74 )
  75
  76 For cases where the _encoding is not known at compile-time, but is
  77 known at run-time, the abstract class $(LREF EncodingScheme)
   78 and its subclasses are provided.  To construct a run-time encoder/decoder,
  79 one does e.g.
  80
  81 ----------------------------------------------------
  82 auto e = EncodingScheme.create("utf-8");
  83 ----------------------------------------------------
  84
  85 This library supplies $(LREF EncodingScheme) subclasses for ASCII,
  86 ISO-8859-1 (also known as LATIN-1), ISO-8859-2 (LATIN-2), WINDOWS-1250,
  87 WINDOWS-1252, UTF-8, and (on little-endian architectures) UTF-16LE and
  88 UTF-32LE; or (on big-endian architectures) UTF-16BE and UTF-32BE.
  89
  90 This library provides a mechanism whereby other modules may add $(LREF
  91 EncodingScheme) subclasses for any other _encoding.
  92
  93 Copyright: Copyright Janice Caron 2008 - 2009.
  94 License:   $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
  95 Authors:   Janice Caron
  96 Source:    $(PHOBOSSRC std/_encoding.d)
  97 */
  98 /*
  99          Copyright Janice Caron 2008 - 2009.
 100 Distributed under the Boost Software License, Version 1.0.
 101    (See accompanying file LICENSE_1_0.txt or copy at
 102          http://www.boost.org/LICENSE_1_0.txt)
 103 */
 104 module std.encoding;
 105
 106 import std.range.primitives;
 107 import std.traits;
 108 import std.typecons;
 109
 110 @system unittest
 111 {
 112     static ubyte[][] validStrings =
 113     [
 114         // Plain ASCII
 115         cast(ubyte[])"hello",
 116
 117         // First possible sequence of a certain length
 118         [ 0x00 ],                       // U+00000000   one byte
 119         [ 0xC2, 0x80 ],                 // U+00000080   two bytes
 120         [ 0xE0, 0xA0, 0x80 ],           // U+00000800   three bytes
 121         [ 0xF0, 0x90, 0x80, 0x80 ],     // U+00010000   three bytes
 122
 123         // Last possible sequence of a certain length
 124         [ 0x7F ],                       // U+0000007F   one byte
 125         [ 0xDF, 0xBF ],                 // U+000007FF   two bytes
 126         [ 0xEF, 0xBF, 0xBF ],           // U+0000FFFF   three bytes
 127
 128         // Other boundary conditions
 129         [ 0xED, 0x9F, 0xBF ],
 130         // U+0000D7FF   Last character before surrogates
 131         [ 0xEE, 0x80, 0x80 ],
 132         // U+0000E000   First character after surrogates
 133         [ 0xEF, 0xBF, 0xBD ],
 134         // U+0000FFFD   Unicode replacement character
 135         [ 0xF4, 0x8F, 0xBF, 0xBF ],
 136         // U+0010FFFF   Very last character
 137
 138         // Non-character code points
 139         /*  NOTE: These are legal in UTF, and may be converted from
 140             one UTF to another, however they do not represent Unicode
 141             characters. These code points have been reserved by
 142             Unicode as non-character code points. They are permissible
 143             for data exchange within an application, but they are are
 144             not permitted to be used as characters. Since this module
 145             deals with UTF, and not with Unicode per se, we choose to
 146             accept them here. */
 147         [ 0xDF, 0xBE ],                 // U+0000FFFE
 148         [ 0xDF, 0xBF ],                 // U+0000FFFF
 149     ];
 150
 151     static ubyte[][] invalidStrings =
 152     [
 153         // First possible sequence of a certain length, but greater
 154         // than U+10FFFF
 155         [ 0xF8, 0x88, 0x80, 0x80, 0x80 ],           // U+00200000   five bytes
 156         [ 0xFC, 0x84, 0x80, 0x80, 0x80, 0x80 ],     // U+04000000   six bytes
 157
 158         // Last possible sequence of a certain length, but greater than U+10FFFF
 159         [ 0xF7, 0xBF, 0xBF, 0xBF ],                 // U+001FFFFF   four bytes
 160         [ 0xFB, 0xBF, 0xBF, 0xBF, 0xBF ],           // U+03FFFFFF   five bytes
 161         [ 0xFD, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF ],     // U+7FFFFFFF   six bytes
 162
 163         // Other boundary conditions
 164         [ 0xF4, 0x90, 0x80, 0x80 ],                 // U+00110000
 165                                                     // First code
 166                                                     // point after
 167                                                     // last character
 168
 169         // Unexpected continuation bytes
 170         [ 0x80 ],
 171         [ 0xBF ],
 172         [ 0x20, 0x80, 0x20 ],
 173         [ 0x20, 0xBF, 0x20 ],
 174         [ 0x80, 0x9F, 0xA0 ],
 175
 176         // Lonely start bytes
 177         [ 0xC0 ],
 178         [ 0xCF ],
 179         [ 0x20, 0xC0, 0x20 ],
 180         [ 0x20, 0xCF, 0x20 ],
 181         [ 0xD0 ],
 182         [ 0xDF ],
 183         [ 0x20, 0xD0, 0x20 ],
 184         [ 0x20, 0xDF, 0x20 ],
 185         [ 0xE0 ],
 186         [ 0xEF ],
 187         [ 0x20, 0xE0, 0x20 ],
 188         [ 0x20, 0xEF, 0x20 ],
 189         [ 0xF0 ],
 190         [ 0xF1 ],
 191         [ 0xF2 ],
 192         [ 0xF3 ],
 193         [ 0xF4 ],
 194         [ 0xF5 ],   // If this were legal it would start a character > U+10FFFF
 195         [ 0xF6 ],   // If this were legal it would start a character > U+10FFFF
 196         [ 0xF7 ],   // If this were legal it would start a character > U+10FFFF
 197
 198         [ 0xEF, 0xBF ],             // Three byte sequence with third byte missing
 199         [ 0xF7, 0xBF, 0xBF ],       // Four byte sequence with fourth byte missing
 200         [ 0xEF, 0xBF, 0xF7, 0xBF, 0xBF ],   // Concatenation of the above
 201
 202         // Impossible bytes
 203         [ 0xF8 ],
 204         [ 0xF9 ],
 205         [ 0xFA ],
 206         [ 0xFB ],
 207         [ 0xFC ],
 208         [ 0xFD ],
 209         [ 0xFE ],
 210         [ 0xFF ],
 211         [ 0x20, 0xF8, 0x20 ],
 212         [ 0x20, 0xF9, 0x20 ],
 213         [ 0x20, 0xFA, 0x20 ],
 214         [ 0x20, 0xFB, 0x20 ],
 215         [ 0x20, 0xFC, 0x20 ],
 216         [ 0x20, 0xFD, 0x20 ],
 217         [ 0x20, 0xFE, 0x20 ],
 218         [ 0x20, 0xFF, 0x20 ],
 219
 220         // Overlong sequences, all representing U+002F
 221         /*  With a safe UTF-8 decoder, all of the following five overlong
 222             representations of the ASCII character slash ("/") should be
 223             rejected like a malformed UTF-8 sequence */
 224         [ 0xC0, 0xAF ],
 225         [ 0xE0, 0x80, 0xAF ],
 226         [ 0xF0, 0x80, 0x80, 0xAF ],
 227         [ 0xF8, 0x80, 0x80, 0x80, 0xAF ],
 228         [ 0xFC, 0x80, 0x80, 0x80, 0x80, 0xAF ],
 229
 230         // Maximum overlong sequences
 231         /*  Below you see the highest Unicode value that is still resulting in
 232             an overlong sequence if represented with the given number of bytes.
 233             This is a boundary test for safe UTF-8 decoders. All five
 234             characters should be rejected like malformed UTF-8 sequences. */
 235         [ 0xC1, 0xBF ],                             // U+0000007F
 236         [ 0xE0, 0x9F, 0xBF ],                       // U+000007FF
 237         [ 0xF0, 0x8F, 0xBF, 0xBF ],                 // U+0000FFFF
 238         [ 0xF8, 0x87, 0xBF, 0xBF, 0xBF ],           // U+001FFFFF
 239         [ 0xFC, 0x83, 0xBF, 0xBF, 0xBF, 0xBF ],     // U+03FFFFFF
 240
 241         // Overlong representation of the NUL character
 242         /*  The following five sequences should also be rejected like malformed
 243             UTF-8 sequences and should not be treated like the ASCII NUL
 244             character. */
 245         [ 0xC0, 0x80 ],
 246         [ 0xE0, 0x80, 0x80 ],
 247         [ 0xF0, 0x80, 0x80, 0x80 ],
 248         [ 0xF8, 0x80, 0x80, 0x80, 0x80 ],
 249         [ 0xFC, 0x80, 0x80, 0x80, 0x80, 0x80 ],
 250
 251         // Illegal code positions
 252         /*  The following UTF-8 sequences should be rejected like malformed
 253             sequences, because they never represent valid ISO 10646 characters
 254             and a UTF-8 decoder that accepts them might introduce security
 255             problems comparable to overlong UTF-8 sequences. */
 256         [ 0xED, 0xA0, 0x80 ],       // U+D800
 257         [ 0xED, 0xAD, 0xBF ],       // U+DB7F
 258         [ 0xED, 0xAE, 0x80 ],       // U+DB80
 259         [ 0xED, 0xAF, 0xBF ],       // U+DBFF
 260         [ 0xED, 0xB0, 0x80 ],       // U+DC00
 261         [ 0xED, 0xBE, 0x80 ],       // U+DF80
 262         [ 0xED, 0xBF, 0xBF ],       // U+DFFF
 263     ];
 264
 265     static string[] sanitizedStrings =
 266     [
 267         "\uFFFD","\uFFFD",
 268         "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD"," \uFFFD ",
 269         " \uFFFD ","\uFFFD\uFFFD\uFFFD","\uFFFD","\uFFFD"," \uFFFD "," \uFFFD ",
 270         "\uFFFD","\uFFFD"," \uFFFD "," \uFFFD ","\uFFFD","\uFFFD"," \uFFFD ",
 271         " \uFFFD ","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
 272         "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD\uFFFD","\uFFFD","\uFFFD",
 273         "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD"," \uFFFD ",
 274         " \uFFFD "," \uFFFD "," \uFFFD "," \uFFFD "," \uFFFD "," \uFFFD ",
 275         " \uFFFD ","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
 276         "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
 277         "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
 278     ];
 279
 280     // Make sure everything that should be valid, is
 281     foreach (a;validStrings)
 282     {
 283         string s = cast(string) a;
 284         assert(isValid(s),"Failed to validate: "~makeReadable(s));
 285     }
 286
 287     // Make sure everything that shouldn't be valid, isn't
 288     foreach (a;invalidStrings)
 289     {
 290         string s = cast(string) a;
 291         assert(!isValid(s),"Incorrectly validated: "~makeReadable(s));
 292     }
 293
 294     // Make sure we can sanitize everything bad
 295     assert(invalidStrings.length == sanitizedStrings.length);
 296     for (int i=0; i<invalidStrings.length; ++i)
 297     {
 298         string s = cast(string) invalidStrings[i];
 299         string t = sanitize(s);
 300         assert(isValid(t));
 301         assert(t == sanitizedStrings[i]);
 302         ubyte[] u = cast(ubyte[]) t;
 303         validStrings ~= u;
 304     }
 305
 306     // Make sure all transcodings work in both directions, using both forward
 307     // and reverse iteration
 308     foreach (a; validStrings)
 309     {
 310         string s = cast(string) a;
 311         string s2;
 312         wstring ws, ws2;
 313         dstring ds, ds2;
 314
 315         transcode(s,ws);
 316         assert(isValid(ws));
 317         transcode(ws,s2);
 318         assert(s == s2);
 319
 320         transcode(s,ds);
 321         assert(isValid(ds));
 322         transcode(ds,s2);
 323         assert(s == s2);
 324
 325         transcode(ws,s);
 326         assert(isValid(s));
 327         transcode(s,ws2);
 328         assert(ws == ws2);
 329
 330         transcode(ws,ds);
 331         assert(isValid(ds));
 332         transcode(ds,ws2);
 333         assert(ws == ws2);
 334
 335         transcode(ds,s);
 336         assert(isValid(s));
 337         transcode(s,ds2);
 338         assert(ds == ds2);
 339
 340         transcode(ds,ws);
 341         assert(isValid(ws));
 342         transcode(ws,ds2);
 343         assert(ds == ds2);
 344
 345         transcodeReverse(s,ws);
 346         assert(isValid(ws));
 347         transcodeReverse(ws,s2);
 348         assert(s == s2);
 349
 350         transcodeReverse(s,ds);
 351         assert(isValid(ds));
 352         transcodeReverse(ds,s2);
 353         assert(s == s2);
 354
 355         transcodeReverse(ws,s);
 356         assert(isValid(s));
 357         transcodeReverse(s,ws2);
 358         assert(ws == ws2);
 359
 360         transcodeReverse(ws,ds);
 361         assert(isValid(ds));
 362         transcodeReverse(ds,ws2);
 363         assert(ws == ws2);
 364
 365         transcodeReverse(ds,s);
 366         assert(isValid(s));
 367         transcodeReverse(s,ds2);
 368         assert(ds == ds2);
 369
 370         transcodeReverse(ds,ws);
 371         assert(isValid(ws));
 372         transcodeReverse(ws,ds2);
 373         assert(ds == ds2);
 374     }
 375
 376     // Make sure the non-UTF encodings work too
 377     {
 378         auto s = "\u20AC100";
 379         Windows1252String t;
 380         transcode(s,t);
 381         assert(t == cast(Windows1252Char[])[0x80, '1', '0', '0']);
 382         string u;
 383         transcode(s,u);
 384         assert(s == u);
 385         Latin1String v;
 386         transcode(s,v);
 387         assert(cast(string) v == "?100");
 388         AsciiString w;
 389         transcode(v,w);
 390         assert(cast(string) w == "?100");
 391         s = "\u017Dlu\u0165ou\u010Dk\u00FD k\u016F\u0148";
 392         Latin2String x;
 393         transcode(s,x);
 394         assert(x == cast(Latin2Char[])[0xae, 'l', 'u', 0xbb, 'o', 'u', 0xe8, 'k', 0xfd, ' ', 'k', 0xf9, 0xf2]);
 395         Windows1250String y;
 396         transcode(s,y);
 397         assert(y == cast(Windows1250Char[])[0x8e, 'l', 'u', 0x9d, 'o', 'u', 0xe8, 'k', 0xfd, ' ', 'k', 0xf9, 0xf2]);
 398     }
 399
 400     // Make sure we can count properly
 401     {
 402         assert(encodedLength!(char)('A') == 1);
 403         assert(encodedLength!(char)('\u00E3') == 2);
 404         assert(encodedLength!(char)('\u2028') == 3);
 405         assert(encodedLength!(char)('\U0010FFF0') == 4);
 406         assert(encodedLength!(wchar)('A') == 1);
 407         assert(encodedLength!(wchar)('\U0010FFF0') == 2);
 408     }
 409
 410     // Make sure we can write into mutable arrays
 411     {
 412         char[4] buffer;
 413         auto n = encode(cast(dchar)'\u00E3',buffer);
 414         assert(n == 2);
 415         assert(buffer[0] == 0xC3);
 416         assert(buffer[1] == 0xA3);
 417     }
 418 }
 419
 420 //=============================================================================
 421
 422 /** Special value returned by $(D safeDecode) */
 423 enum dchar INVALID_SEQUENCE = cast(dchar) 0xFFFFFFFF;
 424
  425 template EncoderFunctions() // builds encode/decode/skip from the *ViaWrite / *ViaRead primitives in scope where it is mixed in
  426 {
  427     // Various forms of read. Each one operates on a slice named `s` that
  428     // must be in scope where the template is mixed in.
  429     template ReadFromString()
  430     {
  431         @property bool canRead() { return s.length != 0; }
  432         E peek() @safe pure @nogc nothrow { return s[0]; } // front unit, not consumed
  433         E read() @safe pure @nogc nothrow { E t = s[0]; s = s[1..$]; return t; } // pops the front unit
  434     }
  435
  436     template ReverseReadFromString()
  437     {
  438         @property bool canRead() { return s.length != 0; }
  439         E peek() @safe pure @nogc nothrow { return s[$-1]; } // back unit, not consumed
  440         E read() @safe pure @nogc nothrow { E t = s[$-1]; s = s[0..$-1]; return t; } // pops the back unit
  441     }
  442
  443     // Various forms of Write. Each one supplies write(E) for one kind of sink.
  444
  445     template WriteToString()
  446     {
  447         E[] s; // output, grown by appending
  448         void write(E c) @safe pure nothrow { s ~= c; }
  449     }
  450
  451     template WriteToArray()
  452     {
  453         void write(E c) @safe pure @nogc nothrow { array[0] = c; array = array[1..$]; } // store, then advance `array`
  454     }
  455
  456     template WriteToDelegate()
  457     {
  458         void write(E c) { dg(c); } // forwards each unit to the delegate `dg`
  459     }
  460
  461     // Functions we will export. Each wrapper mixes in the primitive of the
  462     // matching *ViaWrite / *ViaRead name and re-exposes it under a short name.
  463     template EncodeViaWrite()
  464     {
  465         mixin encodeViaWrite;
  466         void encode(dchar c) { encodeViaWrite(c); }
  467     }
  468
  469     template SkipViaRead()
  470     {
  471         mixin skipViaRead;
  472         void skip() @safe pure @nogc nothrow { skipViaRead(); }
  473     }
  474
  475     template DecodeViaRead()
  476     {
  477         mixin decodeViaRead;
  478         dchar decode() @safe pure @nogc nothrow { return decodeViaRead(); }
  479     }
  480
  481     template SafeDecodeViaRead()
  482     {
  483         mixin safeDecodeViaRead;
  484         dchar safeDecode() @safe pure @nogc nothrow { return safeDecodeViaRead(); }
  485     }
  486
  487     template DecodeReverseViaRead()
  488     {
  489         mixin decodeReverseViaRead;
  490         dchar decodeReverse() @safe pure @nogc nothrow { return decodeReverseViaRead(); }
  491     }
  492
  493     // Encoding to different destinations: pair one writer with EncodeViaWrite
  494
  495     template EncodeToString()
  496     {
  497         mixin WriteToString;
  498         mixin EncodeViaWrite;
  499     }
  500
  501     template EncodeToArray()
  502     {
  503         mixin WriteToArray;
  504         mixin EncodeViaWrite;
  505     }
  506
  507     template EncodeToDelegate()
  508     {
  509         mixin WriteToDelegate;
  510         mixin EncodeViaWrite;
  511     }
  512
  513     // Decoding functions: pair one reader with the matching *ViaRead wrapper
  514
  515     template SkipFromString()
  516     {
  517         mixin ReadFromString;
  518         mixin SkipViaRead;
  519     }
  520
  521     template DecodeFromString()
  522     {
  523         mixin ReadFromString;
  524         mixin DecodeViaRead;
  525     }
  526
  527     template SafeDecodeFromString()
  528     {
  529         mixin ReadFromString;
  530         mixin SafeDecodeViaRead;
  531     }
  532
  533     template DecodeReverseFromString()
  534     {
  535         mixin ReverseReadFromString;
  536         mixin DecodeReverseViaRead;
  537     }
  538
  539     //=========================================================================
  540
  541     // Below are the functions we will ultimately expose to the user
  542     // Encodes c and returns the code units collected by WriteToString.
  543     E[] encode(dchar c) @safe pure nothrow
  544     {
  545         mixin EncodeToString e;
  546         e.encode(c);
  547         return e.s;
  548     }
  549     // Encodes c into the front of array; array is advanced past each unit written.
  550     void encode(dchar c, ref E[] array) @safe pure nothrow
  551     {
  552         mixin EncodeToArray e;
  553         e.encode(c);
  554     }
  555     // Encodes c, handing every resulting code unit to dg.
  556     void encode(dchar c, void delegate(E) dg)
  557     {
  558         mixin EncodeToDelegate e;
  559         e.encode(c);
  560     }
  561     // Advances s by whatever skipViaRead consumes from its front.
  562     void skip(ref const(E)[] s) @safe pure nothrow
  563     {
  564         mixin SkipFromString e;
  565         e.skip();
  566     }
  567     // Decodes from the front of s via decodeViaRead; s is advanced past the units read.
  568     dchar decode(S)(ref S s)
  569     {
  570         mixin DecodeFromString e;
  571         return e.decode();
  572     }
  573     // Same reading scheme as decode, but delegates to safeDecodeViaRead.
  574     dchar safeDecode(S)(ref S s)
  575     {
  576         mixin SafeDecodeFromString e;
  577         return e.safeDecode();
  578     }
  579     // Decodes from the back of s via decodeReverseViaRead; s is shortened from the end.
  580     dchar decodeReverse(ref const(E)[] s) @safe pure nothrow
  581     {
  582         mixin DecodeReverseFromString e;
  583         return e.decodeReverse();
  584     }
  585 }
 586
 587 //=========================================================================
 588
 589 struct CodePoints(E)
 590 {
 591     const(E)[] s;
 592
 593     this(const(E)[] s)
 594     in
 595     {
 596         assert(isValid(s));
 597     }
 598     body
 599     {
 600         this.s = s;
 601     }
 602
 603     int opApply(scope int delegate(ref dchar) dg)
 604     {
 605         int result = 0;
 606         while (s.length != 0)
 607         {
 608             dchar c = decode(s);
 609             result = dg(c);
 610             if (result != 0) break;
 611         }
 612         return result;
 613     }
 614
 615     int opApply(scope int delegate(ref size_t, ref dchar) dg)
 616     {
 617         size_t i = 0;
 618         int result = 0;
 619         while (s.length != 0)
 620         {
 621             immutable len = s.length;
 622             dchar c = decode(s);
 623             size_t j = i; // We don't want the delegate corrupting i
 624             result = dg(j,c);
 625             if (result != 0) break;
 626             i += len - s.length;
 627         }
 628         return result;
 629     }
 630
 631     int opApplyReverse(scope int delegate(ref dchar) dg)
 632     {
 633         int result = 0;
 634         while (s.length != 0)
 635         {
 636             dchar c = decodeReverse(s);
 637             result = dg(c);
 638             if (result != 0) break;
 639         }
 640         return result;
 641     }
 642
 643     int opApplyReverse(scope int delegate(ref size_t, ref dchar) dg)
 644     {
 645         int result = 0;
 646         while (s.length != 0)
 647         {
 648             dchar c = decodeReverse(s);
 649             size_t i = s.length;
 650             result = dg(i,c);
 651             if (result != 0) break;
 652         }
 653         return result;
 654     }
 655 }
 656
 657 struct CodeUnits(E)
 658 {
 659     E[] s;
 660
 661     this(dchar d)
 662     in
 663     {
 664         assert(isValidCodePoint(d));
 665     }
 666     body
 667     {
 668         s = encode!(E)(d);
 669     }
 670
 671     int opApply(scope int delegate(ref E) dg)
 672     {
 673         int result = 0;
 674         foreach (E c;s)
 675         {
 676             result = dg(c);
 677             if (result != 0) break;
 678         }
 679         return result;
 680     }
 681
 682     int opApplyReverse(scope int delegate(ref E) dg)
 683     {
 684         int result = 0;
 685         foreach_reverse (E c;s)
 686         {
 687             result = dg(c);
 688             if (result != 0) break;
 689         }
 690         return result;
 691     }
 692 }
 693
 694 //=============================================================================
 695
 696 template EncoderInstance(E)
 697 {
 698     static assert(false,"Cannot instantiate EncoderInstance for type "
 699         ~ E.stringof);
 700 }
 701
 702 private template GenericEncoder()
 703 {
 704     bool canEncode(dchar c) @safe pure @nogc nothrow
 705     {
 706         if (c < m_charMapStart || (c > m_charMapEnd && c < 0x100)) return true;
 707         if (c >= 0xFFFD) return false;
 708
 709         auto idx = 0;
 710         while (idx < bstMap.length)
 711         {
 712             if (bstMap[idx][0] == c) return true;
 713             idx = bstMap[idx][0] > c ? 2 * idx + 1 : 2 * idx + 2; // next BST index
 714         }
 715
 716         return false;
 717     }
 718
 719     bool isValidCodeUnit(E c) @safe pure @nogc nothrow
 720     {
 721         if (c < m_charMapStart || c > m_charMapEnd) return true;
 722         return charMap[c-m_charMapStart] != 0xFFFD;
 723     }
 724
 725     size_t encodedLength(dchar c) @safe pure @nogc nothrow
 726     in
 727     {
 728         assert(canEncode(c));
 729     }
 730     body
 731     {
 732         return 1;
 733     }
 734
 735     void encodeViaWrite()(dchar c)
 736     {
 737         if (c < m_charMapStart || (c > m_charMapEnd && c < 0x100)) {}
 738         else if (c >= 0xFFFD) { c = '?'; }
 739         else
 740         {
 741             auto idx = 0;
 742             while (idx < bstMap.length)
 743             {
 744                 if (bstMap[idx][0] == c)
 745                 {
 746                     write(cast(E) bstMap[idx][1]);
 747                     return;
 748                 }
 749                 idx = bstMap[idx][0] > c ? 2 * idx + 1 : 2 * idx + 2; // next BST index
 750             }
 751             c = '?';
 752         }
 753         write(cast(E) c);
 754     }
 755
 756     void skipViaRead()()
 757     {
 758         read();
 759     }
 760
 761     dchar decodeViaRead()()
 762     {
 763         E c = read();
 764         return (c >= m_charMapStart && c <= m_charMapEnd) ? charMap[c-m_charMapStart] : c;
 765     }
 766
 767     dchar safeDecodeViaRead()()
 768     {
 769         immutable E c = read();
 770         immutable d = (c >= m_charMapStart && c <= m_charMapEnd) ? charMap[c-m_charMapStart] : c;
 771         return d == 0xFFFD ? INVALID_SEQUENCE : d;
 772     }
 773
 774     dchar decodeReverseViaRead()()
 775     {
 776         E c = read();
 777         return (c >= m_charMapStart && c <= m_charMapEnd) ? charMap[c-m_charMapStart] : c;
 778     }
 779
 780     @property EString replacementSequence() @safe pure @nogc nothrow
 781     {
 782         return cast(EString)("?");
 783     }
 784
 785     mixin EncoderFunctions;
 786 }
 787
 788 //=============================================================================
 789 //          ASCII
 790 //=============================================================================
 791
  792 /** Defines an ASCII-encoded character. */
  793 enum AsciiChar : ubyte { init }
  794 /// Defines an ASCII-encoded string (as an array of $(D immutable(AsciiChar))).
  795 alias AsciiString = immutable(AsciiChar)[];
 796
 797 template EncoderInstance(CharType : AsciiChar)
 798 {
 799     alias E = AsciiChar;
 800     alias EString = AsciiString;
 801
 802     @property string encodingName() @safe pure nothrow @nogc
 803     {
 804         return "ASCII";
 805     }
 806
 807     bool canEncode(dchar c) @safe pure nothrow @nogc
 808     {
 809         return c < 0x80;
 810     }
 811
 812     bool isValidCodeUnit(AsciiChar c) @safe pure nothrow @nogc
 813     {
 814         return c < 0x80;
 815     }
 816
 817     size_t encodedLength(dchar c) @safe pure nothrow @nogc
 818     in
 819     {
 820         assert(canEncode(c));
 821     }
 822     body
 823     {
 824         return 1;
 825     }
 826
 827     void encodeX(Range)(dchar c, Range r)
 828     {
 829         if (!canEncode(c)) c = '?';
 830         r.write(cast(AsciiChar) c);
 831     }
 832
 833     void encodeViaWrite()(dchar c)
 834     {
 835         if (!canEncode(c)) c = '?';
 836         write(cast(AsciiChar) c);
 837     }
 838
 839     void skipViaRead()()
 840     {
 841         read();
 842     }
 843
 844     dchar decodeViaRead()()
 845     {
 846         return read();
 847     }
 848
 849     dchar safeDecodeViaRead()()
 850     {
 851         immutable c = read();
 852         return canEncode(c) ? c : INVALID_SEQUENCE;
 853     }
 854
 855     dchar decodeReverseViaRead()()
 856     {
 857         return read();
 858     }
 859
 860     @property EString replacementSequence() @safe pure nothrow @nogc
 861     {
 862         return cast(EString)("?");
 863     }
 864
 865     mixin EncoderFunctions;
 866 }
 867
 868 //=============================================================================
 869 //          ISO-8859-1
 870 //=============================================================================
 871
  872 /** Defines a Latin1-encoded character. */
  873 enum Latin1Char : ubyte { init }
  874 /**
  875 Defines a Latin1-encoded string (as an array of $(D
  876 immutable(Latin1Char))).
  877  */
  878 alias Latin1String = immutable(Latin1Char)[];
 879
 880 template EncoderInstance(CharType : Latin1Char)
 881 {
 882     alias E = Latin1Char;
 883     alias EString = Latin1String;
 884
 885     @property string encodingName() @safe pure nothrow @nogc
 886     {
 887         return "ISO-8859-1";
 888     }
 889
 890     bool canEncode(dchar c) @safe pure nothrow @nogc
 891     {
 892         return c < 0x100;
 893     }
 894
 895     bool isValidCodeUnit(Latin1Char c) @safe pure nothrow @nogc
 896     {
 897         return true;
 898     }
 899
 900     size_t encodedLength(dchar c) @safe pure nothrow @nogc
 901     in
 902     {
 903         assert(canEncode(c));
 904     }
 905     body
 906     {
 907         return 1;
 908     }
 909
 910     void encodeViaWrite()(dchar c)
 911     {
 912         if (!canEncode(c)) c = '?';
 913         write(cast(Latin1Char) c);
 914     }
 915
 916     void skipViaRead()()
 917     {
 918         read();
 919     }
 920
 921     dchar decodeViaRead()()
 922     {
 923         return read();
 924     }
 925
 926     dchar safeDecodeViaRead()()
 927     {
 928         return read();
 929     }
 930
 931     dchar decodeReverseViaRead()()
 932     {
 933         return read();
 934     }
 935
 936     @property EString replacementSequence() @safe pure nothrow @nogc
 937     {
 938         return cast(EString)("?");
 939     }
 940
 941     mixin EncoderFunctions;
 942 }
 943
 944 //=============================================================================
 945 //          ISO-8859-2
 946 //=============================================================================
 947
  948 /// Defines a Latin2-encoded character.
  949 enum Latin2Char : ubyte { init }
  950
  951 /**
  952  * Defines a Latin2-encoded string (as an array of $(D
  953  * immutable(Latin2Char))).
  954  */
  955 alias Latin2String = immutable(Latin2Char)[];
 956
 957 private template EncoderInstance(CharType : Latin2Char)
 958 {
         // Encoder instance for ISO-8859-2. This template only declares data:
         // the two lookup tables below accompany the GenericEncoder mixin at
         // the end of the template (GenericEncoder is defined elsewhere in
         // this file and is not documented here).
 959     import std.typecons : Tuple, tuple;
 960
         // Code unit type and string type of this encoding.
 961     alias E = Latin2Char;
 962     alias EString = Latin2String;
 963
         // Canonical name of this encoding.
 964     @property string encodingName() @safe pure nothrow @nogc
 965     {
 966         return "ISO-8859-2";
 967     }
 968
         // Inclusive range of code units described by charMap: 0xA1 .. 0xFF is
         // 95 code units, and charMap holds exactly 95 entries.
         // NOTE(review): code units below m_charMapStart are presumably mapped
         // to the identical code point by GenericEncoder -- confirm there.
 969     private static immutable dchar m_charMapStart = 0xa1;
 970     private static immutable dchar m_charMapEnd = 0xff;
 971
         // Decoding table: the Unicode code point of each code unit in the
         // range above, in ascending code unit order (the first entry, U+0104,
         // belongs to 0xA1, matching the pair ('\u0104','\xA1') in bstMap).
 972     private immutable wstring charMap =
 973         "\u0104\u02D8\u0141\u00A4\u013D\u015A\u00A7\u00A8"~
 974         "\u0160\u015E\u0164\u0179\u00AD\u017D\u017B\u00B0"~
 975         "\u0105\u02DB\u0142\u00B4\u013E\u015B\u02C7\u00B8"~
 976         "\u0161\u015F\u0165\u017A\u02DD\u017E\u017C\u0154"~
 977         "\u00C1\u00C2\u0102\u00C4\u0139\u0106\u00C7\u010C"~
 978         "\u00C9\u0118\u00CB\u011A\u00CD\u00CE\u010E\u0110"~
 979         "\u0143\u0147\u00D3\u00D4\u0150\u00D6\u00D7\u0158"~
 980         "\u016E\u00DA\u0170\u00DC\u00DD\u0162\u00DF\u0155"~
 981         "\u00E1\u00E2\u0103\u00E4\u013A\u0107\u00E7\u010D"~
 982         "\u00E9\u0119\u00EB\u011B\u00ED\u00EE\u010F\u0111"~
 983         "\u0144\u0148\u00F3\u00F4\u0151\u00F6\u00F7\u0159"~
 984         "\u016F\u00FA\u0171\u00FC\u00FD\u0163\u02D9";
 985
         // Encoding table: 95 (code point, code unit) pairs, i.e. the inverse
         // of charMap.
         // NOTE(review): the pairs are deliberately not in sorted order; the
         // name suggests a binary search tree flattened into an array for
         // GenericEncoder to search -- confirm there before reordering or
         // inserting entries.
 986     private immutable Tuple!(wchar, char)[] bstMap = [
 987         tuple('\u0148','\xF2'), tuple('\u00F3','\xF3'), tuple('\u0165','\xBB'),
 988         tuple('\u00D3','\xD3'), tuple('\u010F','\xEF'), tuple('\u015B','\xB6'),
 989         tuple('\u017C','\xBF'), tuple('\u00C1','\xC1'), tuple('\u00E1','\xE1'),
 990         tuple('\u0103','\xE3'), tuple('\u013A','\xE5'), tuple('\u0155','\xE0'),
 991         tuple('\u0161','\xB9'), tuple('\u0171','\xFB'), tuple('\u02D8','\xA2'),
 992         tuple('\u00AD','\xAD'), tuple('\u00C9','\xC9'), tuple('\u00DA','\xDA'),
 993         tuple('\u00E9','\xE9'), tuple('\u00FA','\xFA'), tuple('\u0107','\xE6'),
 994         tuple('\u0119','\xEA'), tuple('\u0142','\xB3'), tuple('\u0151','\xF5'),
 995         tuple('\u0159','\xF8'), tuple('\u015F','\xBA'), tuple('\u0163','\xFE'),
 996         tuple('\u016F','\xF9'), tuple('\u017A','\xBC'), tuple('\u017E','\xBE'),
 997         tuple('\u02DB','\xB2'), tuple('\u00A7','\xA7'), tuple('\u00B4','\xB4'),
 998         tuple('\u00C4','\xC4'), tuple('\u00CD','\xCD'), tuple('\u00D6','\xD6'),
 999         tuple('\u00DD','\xDD'), tuple('\u00E4','\xE4'), tuple('\u00ED','\xED'),
1000         tuple('\u00F6','\xF6'), tuple('\u00FD','\xFD'), tuple('\u0105','\xB1'),
1001         tuple('\u010D','\xE8'), tuple('\u0111','\xF0'), tuple('\u011B','\xEC'),
1002         tuple('\u013E','\xB5'), tuple('\u0144','\xF1'), tuple('\u0150','\xD5'),
1003         tuple('\u0154','\xC0'), tuple('\u0158','\xD8'), tuple('\u015A','\xA6'),
1004         tuple('\u015E','\xAA'), tuple('\u0160','\xA9'), tuple('\u0162','\xDE'),
1005         tuple('\u0164','\xAB'), tuple('\u016E','\xD9'), tuple('\u0170','\xDB'),
1006         tuple('\u0179','\xAC'), tuple('\u017B','\xAF'), tuple('\u017D','\xAE'),
1007         tuple('\u02C7','\xB7'), tuple('\u02D9','\xFF'), tuple('\u02DD','\xBD'),
1008         tuple('\u00A4','\xA4'), tuple('\u00A8','\xA8'), tuple('\u00B0','\xB0'),
1009         tuple('\u00B8','\xB8'), tuple('\u00C2','\xC2'), tuple('\u00C7','\xC7'),
1010         tuple('\u00CB','\xCB'), tuple('\u00CE','\xCE'), tuple('\u00D4','\xD4'),
1011         tuple('\u00D7','\xD7'), tuple('\u00DC','\xDC'), tuple('\u00DF','\xDF'),
1012         tuple('\u00E2','\xE2'), tuple('\u00E7','\xE7'), tuple('\u00EB','\xEB'),
1013         tuple('\u00EE','\xEE'), tuple('\u00F4','\xF4'), tuple('\u00F7','\xF7'),
1014         tuple('\u00FC','\xFC'), tuple('\u0102','\xC3'), tuple('\u0104','\xA1'),
1015         tuple('\u0106','\xC6'), tuple('\u010C','\xC8'), tuple('\u010E','\xCF'),
1016         tuple('\u0110','\xD0'), tuple('\u0118','\xCA'), tuple('\u011A','\xCC'),
1017         tuple('\u0139','\xC5'), tuple('\u013D','\xA5'), tuple('\u0141','\xA3'),
1018         tuple('\u0143','\xD1'), tuple('\u0147','\xD2')
1019     ];
1020
1021     mixin GenericEncoder!();
1022 }
1023
1024 //=============================================================================
1025 //          WINDOWS-1250
1026 //=============================================================================
1027
1028 /// Defines a Windows1250-encoded character, stored as a $(D ubyte).
1029 enum Windows1250Char : ubyte { init }
1030
1031 /**
1032  * Defines a Windows1250-encoded string (as an array of $(D
1033  * immutable(Windows1250Char))).
1034  */
1035 alias Windows1250String = immutable(Windows1250Char)[];
1036
1037 private template EncoderInstance(CharType : Windows1250Char)
1038 {
         // Encoder instance for windows-1250. This template only declares
         // data: the two lookup tables below accompany the GenericEncoder
         // mixin at the end of the template (GenericEncoder is defined
         // elsewhere in this file and is not documented here).
1039     import std.typecons : Tuple, tuple;
1040
         // Code unit type and string type of this encoding.
1041     alias E = Windows1250Char;
1042     alias EString = Windows1250String;
1043
         // Canonical name of this encoding.
1044     @property string encodingName() @safe pure nothrow @nogc
1045     {
1046         return "windows-1250";
1047     }
1048
         // Inclusive range of code units described by charMap: 0x80 .. 0xFF is
         // 128 code units, and charMap holds exactly 128 entries.
         // NOTE(review): code units below m_charMapStart are presumably mapped
         // to the identical code point by GenericEncoder -- confirm there.
1049     private static immutable dchar m_charMapStart = 0x80;
1050     private static immutable dchar m_charMapEnd = 0xff;
1051
         // Decoding table: the Unicode code point of each code unit in the
         // range above, in ascending code unit order. The five '\uFFFD'
         // entries (code units 0x81, 0x83, 0x88, 0x90 and 0x98) have no
         // counterpart in bstMap; the unit tests in this file expect 0x81 to
         // be an invalid code unit and U+FFFD to be unencodable.
1052     private immutable wstring charMap =
1053         "\u20AC\uFFFD\u201A\uFFFD\u201E\u2026\u2020\u2021"~
1054         "\uFFFD\u2030\u0160\u2039\u015A\u0164\u017D\u0179"~
1055         "\uFFFD\u2018\u2019\u201C\u201D\u2022\u2013\u2014"~
1056         "\uFFFD\u2122\u0161\u203A\u015B\u0165\u017E\u017A"~
1057         "\u00A0\u02C7\u02D8\u0141\u00A4\u0104\u00A6\u00A7"~
1058         "\u00A8\u00A9\u015E\u00AB\u00AC\u00AD\u00AE\u017B"~
1059         "\u00B0\u00B1\u02DB\u0142\u00B4\u00B5\u00B6\u00B7"~
1060         "\u00B8\u0105\u015F\u00BB\u013D\u02DD\u013E\u017C"~
1061         "\u0154\u00C1\u00C2\u0102\u00C4\u0139\u0106\u00C7"~
1062         "\u010C\u00C9\u0118\u00CB\u011A\u00CD\u00CE\u010E"~
1063         "\u0110\u0143\u0147\u00D3\u00D4\u0150\u00D6\u00D7"~
1064         "\u0158\u016E\u00DA\u0170\u00DC\u00DD\u0162\u00DF"~
1065         "\u0155\u00E1\u00E2\u0103\u00E4\u013A\u0107\u00E7"~
1066         "\u010D\u00E9\u0119\u00EB\u011B\u00ED\u00EE\u010F"~
1067         "\u0111\u0144\u0148\u00F3\u00F4\u0151\u00F6\u00F7"~
1068         "\u0159\u016F\u00FA\u0171\u00FC\u00FD\u0163\u02D9";
1069
         // Encoding table: 123 (code point, code unit) pairs, i.e. the inverse
         // of charMap without its five '\uFFFD' placeholders.
         // NOTE(review): the pairs are deliberately not in sorted order; the
         // name suggests a binary search tree flattened into an array for
         // GenericEncoder to search -- confirm there before reordering or
         // inserting entries.
1070     private immutable Tuple!(wchar, char)[] bstMap = [
1071         tuple('\u011A','\xCC'), tuple('\u00DC','\xDC'), tuple('\u0179','\x8F'),
1072         tuple('\u00B7','\xB7'), tuple('\u00FC','\xFC'), tuple('\u0158','\xD8'),
1073         tuple('\u201C','\x93'), tuple('\u00AC','\xAC'), tuple('\u00CB','\xCB'),
1074         tuple('\u00EB','\xEB'), tuple('\u010C','\xC8'), tuple('\u0143','\xD1'),
1075         tuple('\u0162','\xDE'), tuple('\u02D9','\xFF'), tuple('\u2039','\x8B'),
1076         tuple('\u00A7','\xA7'), tuple('\u00B1','\xB1'), tuple('\u00C2','\xC2'),
1077         tuple('\u00D4','\xD4'), tuple('\u00E2','\xE2'), tuple('\u00F4','\xF4'),
1078         tuple('\u0104','\xA5'), tuple('\u0110','\xD0'), tuple('\u013D','\xBC'),
1079         tuple('\u0150','\xD5'), tuple('\u015E','\xAA'), tuple('\u016E','\xD9'),
1080         tuple('\u017D','\x8E'), tuple('\u2014','\x97'), tuple('\u2021','\x87'),
1081         tuple('\u20AC','\x80'), tuple('\u00A4','\xA4'), tuple('\u00A9','\xA9'),
1082         tuple('\u00AE','\xAE'), tuple('\u00B5','\xB5'), tuple('\u00BB','\xBB'),
1083         tuple('\u00C7','\xC7'), tuple('\u00CE','\xCE'), tuple('\u00D7','\xD7'),
1084         tuple('\u00DF','\xDF'), tuple('\u00E7','\xE7'), tuple('\u00EE','\xEE'),
1085         tuple('\u00F7','\xF7'), tuple('\u0102','\xC3'), tuple('\u0106','\xC6'),
1086         tuple('\u010E','\xCF'), tuple('\u0118','\xCA'), tuple('\u0139','\xC5'),
1087         tuple('\u0141','\xA3'), tuple('\u0147','\xD2'), tuple('\u0154','\xC0'),
1088         tuple('\u015A','\x8C'), tuple('\u0160','\x8A'), tuple('\u0164','\x8D'),
1089         tuple('\u0170','\xDB'), tuple('\u017B','\xAF'), tuple('\u02C7','\xA1'),
1090         tuple('\u02DD','\xBD'), tuple('\u2019','\x92'), tuple('\u201E','\x84'),
1091         tuple('\u2026','\x85'), tuple('\u203A','\x9B'), tuple('\u2122','\x99'),
1092         tuple('\u00A0','\xA0'), tuple('\u00A6','\xA6'), tuple('\u00A8','\xA8'),
1093         tuple('\u00AB','\xAB'), tuple('\u00AD','\xAD'), tuple('\u00B0','\xB0'),
1094         tuple('\u00B4','\xB4'), tuple('\u00B6','\xB6'), tuple('\u00B8','\xB8'),
1095         tuple('\u00C1','\xC1'), tuple('\u00C4','\xC4'), tuple('\u00C9','\xC9'),
1096         tuple('\u00CD','\xCD'), tuple('\u00D3','\xD3'), tuple('\u00D6','\xD6'),
1097         tuple('\u00DA','\xDA'), tuple('\u00DD','\xDD'), tuple('\u00E1','\xE1'),
1098         tuple('\u00E4','\xE4'), tuple('\u00E9','\xE9'), tuple('\u00ED','\xED'),
1099         tuple('\u00F3','\xF3'), tuple('\u00F6','\xF6'), tuple('\u00FA','\xFA'),
1100         tuple('\u00FD','\xFD'), tuple('\u0103','\xE3'), tuple('\u0105','\xB9'),
1101         tuple('\u0107','\xE6'), tuple('\u010D','\xE8'), tuple('\u010F','\xEF'),
1102         tuple('\u0111','\xF0'), tuple('\u0119','\xEA'), tuple('\u011B','\xEC'),
1103         tuple('\u013A','\xE5'), tuple('\u013E','\xBE'), tuple('\u0142','\xB3'),
1104         tuple('\u0144','\xF1'), tuple('\u0148','\xF2'), tuple('\u0151','\xF5'),
1105         tuple('\u0155','\xE0'), tuple('\u0159','\xF8'), tuple('\u015B','\x9C'),
1106         tuple('\u015F','\xBA'), tuple('\u0161','\x9A'), tuple('\u0163','\xFE'),
1107         tuple('\u0165','\x9D'), tuple('\u016F','\xF9'), tuple('\u0171','\xFB'),
1108         tuple('\u017A','\x9F'), tuple('\u017C','\xBF'), tuple('\u017E','\x9E'),
1109         tuple('\u02D8','\xA2'), tuple('\u02DB','\xB2'), tuple('\u2013','\x96'),
1110         tuple('\u2018','\x91'), tuple('\u201A','\x82'), tuple('\u201D','\x94'),
1111         tuple('\u2020','\x86'), tuple('\u2022','\x95'), tuple('\u2030','\x89')
1112     ];
1113
1114     mixin GenericEncoder!();
1115 }
1116
1117 //=============================================================================
1118 //          WINDOWS-1252
1119 //=============================================================================
1120
1121 /// Defines a Windows1252-encoded character, stored as a $(D ubyte).
1122 enum Windows1252Char : ubyte { init }
1123
1124 /**
1125  * Defines a Windows1252-encoded string (as an array of $(D
1126  * immutable(Windows1252Char))).
1127  */
1128 alias Windows1252String = immutable(Windows1252Char)[];
1129
1130 template EncoderInstance(CharType : Windows1252Char)
1131 {
         // Encoder instance for windows-1252. This template only declares
         // data: the two lookup tables below accompany the GenericEncoder
         // mixin at the end of the template (GenericEncoder is defined
         // elsewhere in this file and is not documented here).
1132     import std.typecons : Tuple, tuple;
1133
         // Code unit type and string type of this encoding.
1134     alias E = Windows1252Char;
1135     alias EString = Windows1252String;
1136
         // Canonical name of this encoding.
1137     @property string encodingName() @safe pure nothrow @nogc
1138     {
1139         return "windows-1252";
1140     }
1141
         // Inclusive range of code units described by charMap: 0x80 .. 0x9F is
         // 32 code units, and charMap holds exactly 32 entries.
         // NOTE(review): code units outside this range (below 0x80 and from
         // 0xA0 upwards) are presumably mapped to the identical code point by
         // GenericEncoder -- confirm there.
1142     private static immutable dchar m_charMapStart = 0x80;
1143     private static immutable dchar m_charMapEnd = 0x9f;
1144
         // Decoding table: the Unicode code point of each code unit in the
         // range above, in ascending code unit order. The five '\uFFFD'
         // entries (code units 0x81, 0x8D, 0x8F, 0x90 and 0x9D) have no
         // counterpart in bstMap; the unit tests in this file expect 0x81 to
         // be an invalid code unit and U+FFFD to be unencodable.
1145     private immutable wstring charMap =
1146         "\u20AC\uFFFD\u201A\u0192\u201E\u2026\u2020\u2021"~
1147         "\u02C6\u2030\u0160\u2039\u0152\uFFFD\u017D\uFFFD"~
1148         "\uFFFD\u2018\u2019\u201C\u201D\u2022\u2013\u2014"~
1149         "\u02DC\u2122\u0161\u203A\u0153\uFFFD\u017E\u0178";
1150
         // Encoding table: 27 (code point, code unit) pairs, i.e. the inverse
         // of charMap without its five '\uFFFD' placeholders.
         // NOTE(review): the pairs are deliberately not in sorted order; the
         // name suggests a binary search tree flattened into an array for
         // GenericEncoder to search -- confirm there before reordering or
         // inserting entries.
1151     private immutable Tuple!(wchar, char)[] bstMap = [
1152         tuple('\u201C','\x93'), tuple('\u0192','\x83'), tuple('\u2039','\x8B'),
1153         tuple('\u0161','\x9A'), tuple('\u2014','\x97'), tuple('\u2021','\x87'),
1154         tuple('\u20AC','\x80'), tuple('\u0153','\x9C'), tuple('\u017D','\x8E'),
1155         tuple('\u02DC','\x98'), tuple('\u2019','\x92'), tuple('\u201E','\x84'),
1156         tuple('\u2026','\x85'), tuple('\u203A','\x9B'), tuple('\u2122','\x99'),
1157         tuple('\u0152','\x8C'), tuple('\u0160','\x8A'), tuple('\u0178','\x9F'),
1158         tuple('\u017E','\x9E'), tuple('\u02C6','\x88'), tuple('\u2013','\x96'),
1159         tuple('\u2018','\x91'), tuple('\u201A','\x82'), tuple('\u201D','\x94'),
1160         tuple('\u2020','\x86'), tuple('\u2022','\x95'), tuple('\u2030','\x89')
1161     ];
1162
1163     mixin GenericEncoder!();
1164 }
1165
1166 //=============================================================================
1167 //          UTF-8
1168 //=============================================================================
1169
1170 template EncoderInstance(CharType : char)
1171 {
         // UTF-8 codec. The *ViaRead / *ViaWrite members are written against
         // read(), peek(), canRead and write(), none of which is declared in
         // this template.
         // NOTE(review): those primitives are presumably supplied through the
         // EncoderFunctions mixin at the end of the template -- confirm there.
1172     alias E = char;
1173     alias EString = immutable(char)[];
1174
         // Canonical name of this encoding.
1175     @property string encodingName() @safe pure nothrow @nogc
1176     {
1177         return "UTF-8";
1178     }
1179
         // Every valid code point can be encoded in UTF-8.
1180     bool canEncode(dchar c) @safe pure nothrow @nogc
1181     {
1182         return isValidCodePoint(c);
1183     }
1184
         // Accepts every byte except 0xC0, 0xC1 and 0xF5 .. 0xFF.
1185     bool isValidCodeUnit(char c) @safe pure nothrow @nogc
1186     {
1187         return (c < 0xC0 || (c >= 0xC2 && c < 0xF5));
1188     }
1189
         // Number of code units that follow a given byte, indexed by
         // (byte - 0x80). Row by row: 0x80 .. 0xBF -> 0, 0xC0 .. 0xDF -> 1,
         // 0xE0 .. 0xEF -> 2, 0xF0 .. 0xF7 -> 3, 0xF8 .. 0xFB -> 4,
         // 0xFC .. 0xFD -> 5, 0xFE -> 6 and 0xFF -> 0.
1190     immutable ubyte[128] tailTable =
1191     [
1192         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1193         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1194         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1195         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1196         1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1197         1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1198         2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
1199         3,3,3,3,3,3,3,3,4,4,4,4,5,5,6,0,
1200     ];
1201
         // Looks up tailTable for a byte; the in-contract requires c >= 0x80
         // because the table has no entries for the ASCII range.
1202     private int tails(char c) @safe pure nothrow @nogc
1203     in
1204     {
1205         assert(c >= 0x80);
1206     }
1207     body
1208     {
1209         return tailTable[c-0x80];
1210     }
1211
         // Number of code units (1 .. 4) needed to encode c; the in-contract
         // requires an encodable code point.
1212     size_t encodedLength(dchar c) @safe pure nothrow @nogc
1213     in
1214     {
1215         assert(canEncode(c));
1216     }
1217     body
1218     {
1219         if (c < 0x80) return 1;
1220         if (c < 0x800) return 2;
1221         if (c < 0x10000) return 3;
1222         return 4;
1223     }
1224
         // Emits the UTF-8 form of c through write(), one code unit per call:
         // a lead byte carrying the top bits, then trail bytes of the form
         // 0x80 + 6 payload bits. c itself is not validated here.
1225     void encodeViaWrite()(dchar c)
1226     {
1227         if (c < 0x80)
1228         {
1229             write(cast(char) c);
1230         }
1231         else if (c < 0x800)
1232         {
1233             write(cast(char)((c >> 6) + 0xC0));
1234             write(cast(char)((c & 0x3F) + 0x80));
1235         }
1236         else if (c < 0x10000)
1237         {
1238             write(cast(char)((c >> 12) + 0xE0));
1239             write(cast(char)(((c >> 6) & 0x3F) + 0x80));
1240             write(cast(char)((c & 0x3F) + 0x80));
1241         }
1242         else
1243         {
1244             write(cast(char)((c >> 18) + 0xF0));
1245             write(cast(char)(((c >> 12) & 0x3F) + 0x80));
1246             write(cast(char)(((c >> 6) & 0x3F) + 0x80));
1247             write(cast(char)((c & 0x3F) + 0x80));
1248         }
1249     }
1250
         // Consumes one encoded sequence without decoding it: one read() for
         // the first byte, then tails() further reads when that byte is 0xC0
         // or above. No validation is performed.
1251     void skipViaRead()()
1252     {
1253         auto c = read();
1254         if (c < 0xC0) return;
1255         int n = tails(cast(char) c);
1256         for (size_t i=0; i<n; ++i)
1257         {
1258             read();
1259         }
1260     }
1261
         // Decodes one sequence without any validation. Bytes below 0xC0 are
         // returned unchanged.
1262     dchar decodeViaRead()()
1263     {
1264         dchar c = read();
1265         if (c < 0xC0) return c;
1266         int n = tails(cast(char) c);
             // Keep only the payload bits of the lead byte: the low 5 bits for
             // n == 1, 4 bits for n == 2, 3 bits for n == 3.
1267         c &= (1 << (6 - n)) - 1;
1268         for (size_t i=0; i<n; ++i)
1269         {
                 // Each trail byte contributes its low 6 bits.
1270             c = (c << 6) + (read() & 0x3F);
1271         }
1272         return c;
1273     }
1274
         // Decodes one sequence with validation; returns INVALID_SEQUENCE for
         // malformed input. The first byte is always consumed; trail bytes are
         // consumed only while they are well-formed (10xxxxxx).
1275     dchar safeDecodeViaRead()()
1276     {
1277         dchar c = read();
1278         if (c < 0x80) return c;
1279         int n = tails(cast(char) c);
             // n == 0 here means a trail byte (0x80 .. 0xBF) or 0xFF in lead
             // position.
1280         if (n == 0) return INVALID_SEQUENCE;
1281
1282         if (!canRead) return INVALID_SEQUENCE;
             // The lead byte together with the first trail byte is enough to
             // detect every ill-formed range listed below. The verdict is only
             // recorded here so that the loop can still consume the trail
             // bytes of the offending sequence before it is reported.
1283         size_t d = peek();
1284         immutable err =
1285         (
1286             (c < 0xC2)                              // fail overlong 2-byte sequences
1287         ||  (c > 0xF4)                              // fail overlong 4-6-byte sequences
1288         ||  (c == 0xE0 && ((d & 0xE0) == 0x80))     // fail overlong 3-byte sequences
1289         ||  (c == 0xED && ((d & 0xE0) == 0xA0))     // fail surrogates
1290         ||  (c == 0xF0 && ((d & 0xF0) == 0x80))     // fail overlong 4-byte sequences
1291         ||  (c == 0xF4 && ((d & 0xF0) >= 0x90))     // fail code points > 0x10FFFF
1292         );
1293
1294         c &= (1 << (6 - n)) - 1;
1295         for (size_t i=0; i<n; ++i)
1296         {
1297             if (!canRead) return INVALID_SEQUENCE;
                 // peek() first: a unit that is not a trail byte is left in
                 // the input rather than swallowed with the bad sequence.
1298             d = peek();
1299             if ((d & 0xC0) != 0x80) return INVALID_SEQUENCE;
1300             c = (c << 6) + (read() & 0x3F);
1301         }
1302
1303         return err ? INVALID_SEQUENCE : c;
1304     }
1305
         // Decodes one sequence whose code units arrive last-first.
         // NOTE(review): read() is presumably bound to a reader that takes
         // code units from the end of the string when this member is used --
         // confirm in EncoderFunctions. No validation is performed.
1306     dchar decodeReverseViaRead()()
1307     {
1308         dchar c = read();
1309         if (c < 0x80) return c;
1310         size_t shift = 0;
             // The first unit read is the final trail byte: 6 payload bits.
1311         c &= 0x3F;
1312         for (size_t i=0; i<4; ++i)
1313         {
1314             shift += 6;
1315             auto d = read();
1316             size_t n = tails(cast(char) d);
                 // tails() == 0 marks another trail byte (6 payload bits);
                 // anything else is the lead byte, which contributes its low
                 // 6 - n bits and ends the loop.
1317             immutable mask = n == 0 ? 0x3F : (1 << (6 - n)) - 1;
1318             c += ((d & mask) << shift);
1319             if (n != 0) break;
1320         }
1321         return c;
1322     }
1323
         // Code units substituted for a malformed sequence: U+FFFD in UTF-8.
1324     @property EString replacementSequence() @safe pure nothrow @nogc
1325     {
1326         return "\uFFFD";
1327     }
1328
1329     mixin EncoderFunctions;
1330 }
1331
1332 //=============================================================================
1333 //          UTF-16
1334 //=============================================================================
1335
1336 template EncoderInstance(CharType : wchar)
1337 {
         // UTF-16 codec. Code points below 0x10000 occupy a single code unit;
         // all others are written as a unit in 0xD800 .. 0xDBFF followed by a
         // unit in 0xDC00 .. 0xDFFF (a surrogate pair).
1338     alias E = wchar;
1339     alias EString = immutable(wchar)[];
1340
1341     @property string encodingName() @safe pure nothrow @nogc
1342     {
1343         return "UTF-16";
1344     }
1345
1346     bool canEncode(dchar c) @safe pure nothrow @nogc
1347     {
1348         return isValidCodePoint(c);
1349     }
1350
1351     bool isValidCodeUnit(wchar c) @safe pure nothrow @nogc
1352     {
1353         return true;
1354     }
1355
1356     size_t encodedLength(dchar c) @safe pure nothrow @nogc
1357     in
1358     {
1359         assert(canEncode(c));
1360     }
1361     body
1362     {
1363         return c >= 0x10000 ? 2 : 1;
1364     }
1365
1366     void encodeViaWrite()(dchar c)
1367     {
1368         if (c < 0x10000)
1369         {
1370             write(cast(wchar) c);
1371             return;
1372         }
1373         // Split the offset above 0x10000 into two 10-bit halves.
1374         immutable size_t offset = c - 0x10000;
1375         immutable size_t high = 0xD800 + (offset >> 10);
1376         write(cast(wchar) high);
1377         write(cast(wchar)(0xDC00 + (offset & 0x3FF)));
1378     }
1379
1380     void skipViaRead()()
1381     {
1382         immutable unit = read();
1383         immutable bool isSurrogate = unit >= 0xD800 && unit < 0xE000;
1384         if (isSurrogate) read();
1385     }
1386
1387     dchar decodeViaRead()()
1388     {
1389         wchar lead = read();
1390         immutable bool single = lead < 0xD800 || lead >= 0xE000;
1391         if (single) return cast(dchar) lead;
1392         wchar trail = read();
1393         // Each surrogate contributes its low 10 bits to the code point.
1394         return 0x10000 + ((lead & 0x3FF) << 10) + (trail & 0x3FF);
1395     }
1396
1397     dchar safeDecodeViaRead()()
1398     {
1399         wchar lead = read();
1400         if (lead < 0xD800 || lead >= 0xE000) return cast(dchar) lead;
1401         // A unit from 0xDC00 up cannot start a pair; nor can input end here.
1402         if (lead >= 0xDC00 || !canRead) return INVALID_SEQUENCE;
1403         // Look before consuming so that a bad follower stays in the input.
1404         wchar trail = peek();
1405         immutable bool isLow = trail >= 0xDC00 && trail < 0xE000;
1406         if (!isLow) return INVALID_SEQUENCE;
1407         trail = read();
1408         return 0x10000 + ((lead & 0x3FF) << 10) + (trail & 0x3FF);
1409     }
1410
1411     dchar decodeReverseViaRead()()
1412     {
1413         wchar last = read();
1414         immutable bool single = last < 0xD800 || last >= 0xE000;
1415         if (single) return cast(dchar) last;
1416         // The unit read second supplies the upper 10 bits of the result.
1417         wchar prev = read();
1418         return 0x10000 + ((prev & 0x3FF) << 10) + (last & 0x3FF);
1419     }
1420
1421     @property EString replacementSequence() @safe pure nothrow @nogc
1422     {
1423         return "\uFFFD"w;
1424     }
1425
1426     mixin EncoderFunctions;
1427 }
1428
1429 //=============================================================================
1430 //          UTF-32
1431 //=============================================================================
1432
1433 template EncoderInstance(CharType : dchar)
1434 {
         // UTF-32 codec: every code point is exactly one code unit, so each
         // member below is a single read(), write() or range check.
1435     alias E = dchar;
1436     alias EString = immutable(dchar)[];
1437
1438     @property string encodingName() @safe pure nothrow @nogc
1439     {
1440         return "UTF-32";
1441     }
1442
1443     bool canEncode(dchar c) @safe pure nothrow @nogc
1444     {
1445         return isValidCodePoint(c);
1446     }
1447
1448     bool isValidCodeUnit(dchar c) @safe pure nothrow @nogc
1449     {
1450         return isValidCodePoint(c);
1451     }
1452
1453     size_t encodedLength(dchar c) @safe pure nothrow @nogc
1454     in
1455     {
1456         assert(canEncode(c));
1457     }
1458     body
1459     {
1460         return 1;
1461     }
1462
1463     void encodeViaWrite()(dchar c)
1464     {
1465         write(c);
1466     }
1467
1468     void skipViaRead()()
1469     {
1470         read();
1471     }
1472
1473     dchar decodeViaRead()()
1474     {
1475         return cast(dchar) read();
1476     }
1477
1478     dchar safeDecodeViaRead()()
1479     {
1480         immutable unit = read();
1481         return !isValidCodePoint(unit) ? INVALID_SEQUENCE : unit;
1482     }
1483
1484     dchar decodeReverseViaRead()()
1485     {
1486         return cast(dchar) read();
1487     }
1488
1489     @property EString replacementSequence() @safe pure nothrow @nogc
1490     {
1491         return "\uFFFD"d;
1492     }
1493
1494     mixin EncoderFunctions;
1495 }
1496
1497 //=============================================================================
1498 // Below are forwarding functions which expose the function to the user
1499
1500 /**
1501  Tests whether `c` is a valid Unicode code point.
1502
1503  Every value below 0x110000 is accepted except the range 0xD800 .. 0xDFFF.
1504  The non-character code points U+FFFE and U+FFFF are accepted too, since
1505  they are valid code points (even though they are not valid characters).
1506
1507  Supersedes:
1508  This function supersedes $(D std.utf.startsValidDchar()).
1509
1510  Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
1511  WINDOWS-1252
1512
1513  Params:
1514     c = the code point to be tested
1515  */
1516 bool isValidCodePoint(dchar c) @safe pure nothrow @nogc
1517 {
1518     return !(c >= 0x110000 || (c >= 0xD800 && c < 0xE000));
1519 }
1520
1521 /**
1522  Yields the name of the encoding whose code unit type is `T`.
1523
1524  The encoding type cannot be deduced, so it must be specified explicitly.
1525
1526  Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
1527  WINDOWS-1252
1528  */
1529 @property string encodingName(T)()
1530 {
1531     alias Codec = EncoderInstance!(T);
1532     return Codec.encodingName;
1533 }
1534
1535 ///
1536 @safe unittest
1537 {
1538     assert(encodingName!(char) == "UTF-8");
1539     assert(encodingName!(wchar) == "UTF-16");
1540     assert(encodingName!(dchar) == "UTF-32");
1541     assert(encodingName!(AsciiChar) == "ASCII");
1542     assert(encodingName!(Latin1Char) == "ISO-8859-1");
1543     assert(encodingName!(Latin2Char) == "ISO-8859-2");
1544     assert(encodingName!(Windows1250Char) == "windows-1250");
1545     assert(encodingName!(Windows1252Char) == "windows-1252");
1546 }
1547
1548 /**
1549  Reports whether the code point `c` can be represented in the encoding
1550  whose code unit type is `E`.
1551
1552  The encoding type cannot be deduced, so it must be specified explicitly.
1553
1554  Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
1555  WINDOWS-1252
1556  */
1557 bool canEncode(E)(dchar c)
1558 {
1559     alias Codec = EncoderInstance!(E);
1560     return Codec.canEncode(c);
1561 }
1562
1563 ///
1564 @safe pure unittest
1565 {
1566     assert( canEncode!(Latin1Char)('A'));
1567     assert( canEncode!(Latin2Char)('A'));
1568     assert(!canEncode!(AsciiChar)('\u00A0'));
1569     assert( canEncode!(Latin1Char)('\u00A0'));
1570     assert( canEncode!(Latin2Char)('\u00A0'));
1571     assert( canEncode!(Windows1250Char)('\u20AC'));
1572     assert(!canEncode!(Windows1250Char)('\u20AD'));
1573     assert(!canEncode!(Windows1250Char)('\uFFFD'));
1574     assert( canEncode!(Windows1252Char)('\u20AC'));
1575     assert(!canEncode!(Windows1252Char)('\u20AD'));
1576     assert(!canEncode!(Windows1252Char)('\uFFFD'));
1577     assert(!canEncode!(char)(cast(dchar) 0x110000));
1578 }
1579
1580 /// How to check an entire string
1581 @safe pure unittest
1582 {
1583     import std.algorithm.searching : find;
1584     import std.utf : byDchar;
1585
1586     assert("The quick brown fox"
1587         .byDchar
1588         .find!(x => !canEncode!AsciiChar(x))
1589         .empty);
1590 }
1591
1592 /**
1593  Reports whether `c` is a legal code unit of its encoding. The byte 0x80,
1594  for example, is not legal in ASCII, whose code units lie in 0x00 .. 0x7F.
1595
1596  Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
1597  WINDOWS-1252
1598
1599  Params:
1600     c = the code unit to be tested
1601  */
1602 bool isValidCodeUnit(E)(E c)
1603 {
1604     alias Codec = EncoderInstance!(E);
1605     return Codec.isValidCodeUnit(c);
1606 }
1607
1608 ///
1609 @system pure unittest
1610 {
1611     assert(!isValidCodeUnit(cast(char) 0xC0));
1612     assert(!isValidCodeUnit(cast(char) 0xFF));
1613     assert( isValidCodeUnit(cast(wchar) 0xD800));
1614     assert(!isValidCodeUnit(cast(dchar) 0xD800));
1615     assert(!isValidCodeUnit(cast(AsciiChar) 0xA0));
1616     assert( isValidCodeUnit(cast(Windows1250Char) 0x80));
1617     assert(!isValidCodeUnit(cast(Windows1250Char) 0x81));
1618     assert( isValidCodeUnit(cast(Windows1252Char) 0x80));
1619     assert(!isValidCodeUnit(cast(Windows1252Char) 0x81));
1620 }
1621
1622 /**
1623  Reports whether the whole of `s` is correctly encoded.
1624
1625  Supersedes:
1626  This function supersedes std.utf.validate(); note, however, that it returns
1627  a bool saying whether the input was valid instead of throwing an exception.
1628
1629  Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
1630  WINDOWS-1252
1631
1632  Params:
1633     s = the string to be tested
1634  */
1635 bool isValid(E)(const(E)[] s)
1636 {
1637     immutable validPrefix = validLength(s);
1638     return validPrefix == s.length;
1639 }
1640
1641 ///
1642 @system pure unittest
1643 {
1644     assert( isValid("\u20AC100"));
1645     assert(!isValid(cast(char[3])[167, 133, 175]));
1646 }
1647
1648 /**
1649  Measures the longest validly encoded prefix of `s`, in code units: decoding
1650  starts at the first code unit and stops at the first invalid sequence.
1651
1652  Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
1653  WINDOWS-1252
1654
1655  Params:
1656     s = the string to be tested
1657  */
1658 size_t validLength(E)(const(E)[] s)
1659 {
1660     immutable total = s.length;
1661     while (s.length != 0)
1662     {
1663         immutable remaining = s.length;
1664         if (EncoderInstance!(E).safeDecode(s) == INVALID_SEQUENCE)
1665             return total - remaining;
1666     }
1667     return total;
1668 }
1669
1670 /**
1671  Sanitizes a string by replacing malformed code unit sequences with valid
1672  code unit sequences. The result is guaranteed to be valid for this encoding.
1673
1674  A string that is already valid is returned as is. Otherwise a new string
1675  is built in which every illegal code unit sequence has been replaced with
1676  the encoding's replacement sequence. That sequence is the Unicode
1677  replacement character (U+FFFD) if the character repertoire of the
1678  encoding contains it; otherwise invalid sequences are replaced with
1679  '?'.
1680
1681  Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
1682  WINDOWS-1252
1683
1684  Params:
1685     s = the string to be sanitized
1686  */
1687 immutable(E)[] sanitize(E)(immutable(E)[] s)
1688 {
1689     immutable prefixLen = validLength(s);
1690     if (prefixLen == s.length) return s;
1691
1692     auto filler = EncoderInstance!(E).replacementSequence;
1693
1694     // First pass: find an upper bound for the size of the result by adding
1695     // one filler per malformed sequence to the length of the input.
1696     size_t capacity = s.length;
1697     for (const(E)[] rest = s[prefixLen..$]; rest.length != 0; )
1698     {
1699         // rest starts right after a valid run, so its front is malformed.
1700         immutable bad = EncoderInstance!(E).safeDecode(rest);
1701         assert(bad == INVALID_SEQUENCE);
1702         capacity += filler.length;
1703         rest = rest[validLength(rest)..$];
1704     }
1705
1706     // Second pass: copy the valid runs, writing a filler between them.
1707     E[] output = new E[capacity];
1708     output[0 .. prefixLen] = s[0 .. prefixLen];
1709     size_t pos = prefixLen;
1710
1711     for (const(E)[] rest = s[prefixLen..$]; rest.length != 0; )
1712     {
1713         // Drop the malformed sequence at the front and emit the filler.
1714         immutable bad = EncoderInstance!(E).safeDecode(rest);
1715         assert(bad == INVALID_SEQUENCE);
1716         output[pos .. pos+filler.length] = filler[];
1717         pos += filler.length;
1718         immutable run = validLength(rest);
1719         output[pos .. pos+run] = rest[0 .. run];
1720         pos += run;
1721         rest = rest[run..$];
1722     }
1723     return cast(immutable(E)[])output[0 .. pos];
1724 }
1725
1726 ///
1727 @system pure unittest
1728 {
1729     assert(sanitize("hello \xF0\x80world") == "hello \xEF\xBF\xBDworld");
1730 }
1731
1732 /**
1733  Measures the first encoded sequence of a string, in code units.
1734
1735  The string MUST NOT be empty and its first sequence MUST be validly
1736  encoded. This is enforced by the function's in-contract.
1737
1738  Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
1739  WINDOWS-1252
1740
1741  Params:
1742  s = the string to be sliced
1743  */
1744 size_t firstSequence(E)(const(E)[] s)
1745 in
1746 {
1747     assert(s.length != 0);
1748     const(E)[] probe = s;
1749     assert(safeDecode(probe) != INVALID_SEQUENCE);
1750 }
1751 body
1752 {
1753     const(E)[] rest = s;
1754     EncoderInstance!(E).skip(rest);
1755     return s.length - rest.length;
1756 }
1757
1758 ///
1759 @system pure unittest
1760 {
1761     assert(firstSequence("\u20AC1000") == "\u20AC".length);
1762     assert(firstSequence("hel") == "h".length);
1763 }
1764
1765 /**
1766  Measures the last encoded sequence of a string, in code units.
1767
1768  The string MUST NOT be empty and MUST be validly encoded.
1769  This is enforced by the function's in-contract.
1770
1771  Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
1772  WINDOWS-1252
1773
1774  Params:
1775     s = the string to be sliced
1776  */
1777 size_t lastSequence(E)(const(E)[] s)
1778 in
1779 {
1780     assert(s.length != 0);
1781     assert(isValid(s));
1782 }
1783 body
1784 {
1785     immutable whole = s.length;
1786     EncoderInstance!(E).decodeReverse(s);
1787     return whole - s.length;
1788 }
1789
1790 ///
1791 @system pure unittest
1792 {
1793     assert(lastSequence("1000\u20AC") == "\u20AC".length);
1794     assert(lastSequence("hellö") == "ö".length);
1795 }
1796
1797 /**
1798  Finds the array index at which the (n+1)th code point begins.
1799
1800  The string MUST be validly encoded and `n` MUST NOT be negative.
1801  This is enforced by the function's in-contract.
1802
1803  Supersedes:
1804  This function supersedes std.utf.toUTFindex().
1805
1806  Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
1807  WINDOWS-1252
1808
1809  Params:
1810     s = the string to be counted
1811     n = the current code point index
1812  */
1813 ptrdiff_t index(E)(const(E)[] s,int n)
1814 in
1815 {
1816     assert(isValid(s));
1817     assert(n >= 0);
1818 }
1819 body
1820 {
1821     immutable whole = s.length;
1822     for (size_t left = n; left != 0; --left) EncoderInstance!(E).skip(s);
1823     return whole - s.length;
1824 }
1825
1826 ///
1827 @system pure unittest
1828 {
1829     assert(index("\u20AC100",1) == 3);
1830     assert(index("hällo",2) == 3);
1831 }
1832
1833 /**
1834  Decodes the code point at the front of a string.
1835
1836  The code units that make up that code point are removed from the start of
1837  `s`, and the code point which they represent is returned.
1838
1839  `s` MUST start with a valid sequence; the in-contract enforces this.
1840
1841  Supersedes:
1842  This function supersedes std.utf.decode(), however, note that the
1843  function codePoints() supersedes it more conveniently.
1844
1845  Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
1846  WINDOWS-1252
1847
1848  Params:
1849     s = the string whose first code point is to be decoded
1850  */
1851 dchar decode(S)(ref S s)
1852 in
1853 {
1854     assert(s.length != 0);
1855     auto probe = s;
1856     assert(safeDecode(probe) != INVALID_SEQUENCE);
1857 }
1858 body
1859 {
1860     alias Codec = EncoderInstance!(typeof(s[0]));
1861     return Codec.decode(s);
1862 }
1863
1864 /**
1865  Decodes the code point at the back of a string.
1866
1867  The code units that make up that code point are removed from the end of
1868  `s`, and the code point which they represent is returned.
1869
1870  `s` MUST be validly encoded; the in-contract checks the whole string.
1871
1872  Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
1873  WINDOWS-1252
1874
1875  Params:
1876     s = the string whose last code point is to be decoded
1877  */
1878 dchar decodeReverse(E)(ref const(E)[] s)
1879 in
1880 {
1881     assert(s.length != 0);
1882     assert(isValid(s));
1883 }
1884 body
1885 {
1886     alias Codec = EncoderInstance!(E);
1887     return Codec.decodeReverse(s);
1888 }
1889
1890 /**
1891  Decodes the code point at the front of a string that need not be valid.
1892
1893  One or more code units are removed from the start of `s`, and the code
1894  point which they represent is returned.
1895
1896  When the string starts with an invalid sequence, that sequence is removed
1897  and the value INVALID_SEQUENCE is returned instead.
1898
1899  Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
1900  WINDOWS-1252
1901
1902  Params:
1903     s = the string whose first code point is to be decoded
1904  */
1905 dchar safeDecode(S)(ref S s)
1906 in
1907 {
1908     assert(s.length != 0);
1909 }
1910 body
1911 {
1912     alias Codec = EncoderInstance!(typeof(s[0]));
1913     return Codec.safeDecode(s);
1914 }
1915
1916 /**
1917  Returns the number of code units required to encode a single code point.
1918
1919  The input to this function MUST be a valid code point.
1920  This is enforced by the function's in-contract.
1921
1922  The encoding cannot be deduced from the argument. Therefore, it is necessary
1923  to explicitly specify the code unit type E as a template parameter.
1924
1925  Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
1926  WINDOWS-1252
1927
1928  Params:
1929     c = the code point to be encoded
1930  */
1931 size_t encodedLength(E)(dchar c)
1932 in
1933 {
1934     assert(isValidCodePoint(c));
1935 }
1936 body
1937 {
1938     return EncoderInstance!(E).encodedLength(c); // forwarded to the encoder instantiated for E
1939 }
1941 /**
1942  Encodes a single code point.
1943
1944  This function encodes a single code point into one or more code units.
1945  It returns an array (of element type E) containing those code units.
1946
1947  The input to this function MUST be a valid code point.
1948  This is enforced by the function's in-contract.
1949
1950  The type of the output cannot be deduced. Therefore, it is necessary to
1951  explicitly specify the encoding as a template parameter.
1952
1953  Supersedes:
1954  This function supersedes std.utf.encode(), however, note that the
1955  function codeUnits() supersedes it more conveniently.
1956
1957  Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
1958  WINDOWS-1252
1959
1960  Params:
1961     c = the code point to be encoded
1962  */
1963 E[] encode(E)(dchar c)
1964 in
1965 {
1966     assert(isValidCodePoint(c));
1967 }
1968 body
1969 {
1970     return EncoderInstance!(E).encode(c); // forwarded to the encoder instantiated for E
1971 }
1973 /**
1974  Encodes a single code point into a caller-supplied array.
1975
1976  This function encodes a single code point into one or more code units.
1977  The code units are stored in a user-supplied array. The function works on
1978  a local copy of the slice, so the caller's slice itself is left unchanged.
1979
1980  The input to this function MUST be a valid code point.
1981  This is enforced by the function's in-contract.
1982
1983  The type of the output cannot be deduced. Therefore, it is necessary to
1984  explicitly specify the encoding as a template parameter.
1985
1986  Supersedes:
1987  This function supersedes std.utf.encode(), however, note that the
1988  function codeUnits() supersedes it more conveniently.
1989
1990  Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
1991  WINDOWS-1252
1992
1993  Params:
1994     c     = the code point to be encoded
1995     array = the destination array
1996
1997  Returns:
1998           the number of code units written to the array
1999  */
2000 size_t encode(E)(dchar c, E[] array)
2001 in
2002 {
2003     assert(isValidCodePoint(c));
2004 }
2005 body
2006 {
2007     auto unwritten = array; // local slice handed to the encoder by reference
2008     EncoderInstance!(E).encode(c, unwritten);
2009     return array.length - unwritten.length; // the amount by which the slice shrank
2010 }
2012 /*
2013 Writes the code point $(D c), encoded as units of type $(D E), to the output
2014 range $(D range). Returns the number of $(D E) units that were written.
2015  */
2016 size_t encode(E, R)(dchar c, auto ref R range)
2017 if (isNativeOutputRange!(R, E))
2018 {
2019     static if (is(Unqual!E == char))
2020     {
2021         if (c <= 0x7F)
2022         {
2023             put(range, cast(char) c); // single unit
2024             return 1;
2025         }
2026         else if (c <= 0x7FF)
2027         {
2028             put(range, cast(char)(0xC0 | (c >> 6)));
2029             put(range, cast(char)(0x80 | (c & 0x3F)));
2030             return 2;
2031         }
2032         else if (c <= 0xFFFF)
2033         {
2034             put(range, cast(char)(0xE0 | (c >> 12)));
2035             put(range, cast(char)(0x80 | ((c >> 6) & 0x3F)));
2036             put(range, cast(char)(0x80 | (c & 0x3F)));
2037             return 3;
2038         }
2039         else if (c <= 0x10FFFF)
2040         {
2041             put(range, cast(char)(0xF0 | (c >> 18)));
2042             put(range, cast(char)(0x80 | ((c >> 12) & 0x3F)));
2043             put(range, cast(char)(0x80 | ((c >> 6) & 0x3F)));
2044             put(range, cast(char)(0x80 | (c & 0x3F)));
2045             return 4;
2046         }
2047         else
2048         {
2049             assert(0); // values above 0x10FFFF are rejected
2050         }
2051     }
2052     else static if (is(Unqual!E == wchar))
2053     {
2054         if (c > 0xFFFF)
2055         {
2056             range.put(cast(wchar) ((((c - 0x10000) >> 10) & 0x3FF) + 0xD800));
2057             range.put(cast(wchar) (((c - 0x10000) & 0x3FF) + 0xDC00));
2058             return 2; // two units: the 0xD800-based one, then the 0xDC00-based one
2059         }
2060         range.put(cast(wchar) c);
2061         return 1;
2062     }
2063     else static if (is(Unqual!E == dchar))
2064     {
2065         range.put(c); // a dchar holds the code point directly
2066         return 1;
2067     }
2068     else
2069     {
2070         static assert(0); // only char, wchar and dchar unit types are handled here
2071     }
2072 }
2073
2074 @safe pure unittest // 'T' takes exactly one code unit for each of char, wchar and dchar
2075 {
2076     import std.array;
2077     Appender!(char[]) r; // the same char appender is the sink for all three unit types
2078     assert(encode!(char)('T', r) == 1);
2079     assert(encode!(wchar)('T', r) == 1);
2080     assert(encode!(dchar)('T', r) == 1);
2081 }
2082
2083 /**
2084  Encodes a single code point to a delegate.
2085
2086  This function encodes a single code point into one or more code units.
2087  The code units are passed one at a time to the supplied delegate.
2088
2089  The input to this function MUST be a valid code point.
2090  This is enforced by the function's in-contract.
2091
2092  The type of the output cannot be deduced. Therefore, it is necessary to
2093  explicitly specify the encoding as a template parameter.
2094
2095  Supersedes:
2096  This function supersedes std.utf.encode(), however, note that the
2097  function codeUnits() supersedes it more conveniently.
2098
2099  Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
2100  WINDOWS-1252
2101
2102  Params:
2103     c  = the code point to be encoded
2104     dg = the delegate to invoke for each code unit
2105  */
2106 void encode(E)(dchar c, void delegate(E) dg)
2107 in
2108 {
2109     assert(isValidCodePoint(c));
2110 }
2111 body
2112 {
2113     EncoderInstance!(E).encode(c,dg); // forwarded to the encoder instantiated for E
2114 }
2115
2116 /**
2117 Encodes the contents of $(D s) in units of type $(D Tgt), writing the result to an
2118 output range. $(D s) is decoded one code point at a time before re-encoding.
2119
2120 Returns: The number of $(D Tgt) elements written.
2121 Params:
2122 Tgt = Element type of $(D range).
2123 s = Input array. Must be validly encoded; $(D decode) asserts this in its in-contract.
2124 range = Output range.
2125  */
2126 size_t encode(Tgt, Src, R)(in Src[] s, R range)
2127 {
2128     size_t result;
2129     for (const(Src)[] rest = s; rest.length != 0; ) // decode() consumes from the front of rest
2130     {
2131         result += encode!(Tgt)(decode(rest), range); // one whole code point per iteration
2132     }
2133     return result;
2134 }
2135
2136 /**
2137  Returns a foreachable struct which can bidirectionally iterate over all
2138  code points in a string.
2139
2140  The input to this function MUST be validly encoded.
2141  This is enforced by the function's in-contract.
2142
2143  You can foreach either
2144  with or without an index. If an index is specified, it will be initialized
2145  at each iteration with the offset into the string at which the code point
2146  begins.
2147
2148  Supersedes:
2149  This function supersedes std.utf.decode().
2150
2151  Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
2152  WINDOWS-1252
2153
2154  Params:
2155     s = the string to be decoded
2156
2157  Example:
2158  --------------------------------------------------------
2159  string s = "hello world";
2160  foreach (c;codePoints(s))
2161  {
2162      // do something with c (which will always be a dchar)
2163  }
2164  --------------------------------------------------------
2165
2166  Note that, currently, foreach (c;codePoints(s)) is superior to foreach (c;s)
2167  in that the latter will fall over on encountering U+FFFF.
2168  */
2169 CodePoints!(E) codePoints(E)(immutable(E)[] s)
2170 in
2171 {
2172     assert(isValid(s));
2173 }
2174 body
2175 {
2176     return CodePoints!(E)(s); // the struct is constructed directly from the string
2177 }
2178
2179 ///
2180 @system unittest
2181 {
2182     string s = "hello";
2183     string t;
2184     foreach (c;codePoints(s))
2185     {
2186         t ~= cast(char) c; // narrowing is lossless here: the input is plain ASCII
2187     }
2188     assert(s == t); // iterating the code points reproduces the original text
2189 }
2190
2191 /**
2192  Returns a foreachable struct which can bidirectionally iterate over all
2193  code units in a code point.
2194
2195  The input to this function MUST be a valid code point.
2196  This is enforced by the function's in-contract.
2197
2198  The type of the output cannot be deduced. Therefore, it is necessary to
2199  explicitly specify the encoding type in the template parameter.
2200
2201  Supersedes:
2202  This function supersedes std.utf.encode().
2203
2204  Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
2205  WINDOWS-1252
2206
2207  Params:
2208     c = the code point to be encoded
2209  */
2210 CodeUnits!(E) codeUnits(E)(dchar c)
2211 in
2212 {
2213     assert(isValidCodePoint(c));
2214 }
2215 body
2216 {
2217     return CodeUnits!(E)(c); // the struct is constructed directly from the code point
2218 }
2219
2220 ///
2221 @system unittest
2222 {
2223     char[] a;
2224     foreach (c;codeUnits!(char)(cast(dchar)'\u20AC')) // U+20AC iterated as char code units
2225     {
2226         a ~= c;
2227     }
2228     assert(a.length == 3); // three code units are expected, in this order:
2229     assert(a[0] == 0xE2);
2230     assert(a[1] == 0x82);
2231     assert(a[2] == 0xAC);
2232 }
2233
2234 /**
2235  Convert a string from one encoding to another.
2236
2237  Supersedes:
2238  This function supersedes std.utf.toUTF8(), std.utf.toUTF16() and
2239  std.utf.toUTF32()
2240  (but note that to!() supersedes it more conveniently).
2241
2242  Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
2243  WINDOWS-1252
2244
2245  Params:
2246     s = Source string. $(B Must) be validly encoded.
2247         This is enforced by the function's in-contract.
2248     r = Destination string; an $(D out) parameter that receives the result
2249
2250  See_Also:
2251     $(REF to, std,conv)
2252  */
2253 void transcode(Src, Dst)(Src[] s, out Dst[] r)
2254 in
2255 {
2256     assert(isValid(s));
2257 }
2258 body
2259 {
2260     static if (is(Src == Dst) && is(Src == immutable))
2261     {
2262         r = s; // identical immutable element types: share the data, no copy is made
2263     }
2264     else static if (is(Unqual!Src == AsciiChar))
2265     {
2266         transcode(cast(const(char)[])s, r); // re-dispatch with the input viewed as char
2267     }
2268     else
2269     {
2270         static if (is(Unqual!Dst == wchar))
2271         {
2272             immutable minReservePlace = 2; // free units required before each encode call
2273         }
2274         else static if (is(Unqual!Dst == dchar))
2275         {
2276             immutable minReservePlace = 1;
2277         }
2278         else
2279         {
2280             immutable minReservePlace = 6;
2281         }
2282
2283         auto buffer = new Unqual!Dst[s.length]; // initial size: one unit per source unit
2284         auto tmpBuffer = buffer; // slice passed to the encoder; shrinks as output is written
2285
2286         while (s.length != 0)
2287         {
2288             if (tmpBuffer.length < minReservePlace)
2289             {
2290                 size_t prevLength = buffer.length;
2291                 buffer.length += s.length + minReservePlace; // grow by the remaining input plus slack
2292                 tmpBuffer = buffer[prevLength - tmpBuffer.length .. $]; // same write offset in the resized buffer
2293             }
2294             EncoderInstance!(Unqual!Dst).encode(decode(s), tmpBuffer); // decode(s) consumes from s
2295         }
2296
2297         r = cast(Dst[]) buffer[0 .. buffer.length - tmpBuffer.length]; // keep only the written prefix
2298     }
2299 }
2300
2301 ///
2302 @system pure unittest
2303 {
2304     wstring ws;
2305     // transcode from UTF-8 to UTF-16
2306     transcode("hello world",ws);
2307     assert(ws == "hello world"w);
2308
2309     Latin1String ls;
2310     // transcode from UTF-16 to ISO-8859-1
2311     transcode(ws, ls);
2312     assert(ls == "hello world"); // check the Latin-1 result, not the UTF-16 input again
2313 }
2314
2315 @system pure unittest // round-trip transcoding through every pair of the listed string types
2316 {
2317     import std.meta;
2318     import std.range;
2319     {
2320         import std.conv : to;
2321         // NOTE(review): to!string of an iota range presumably yields its printed form ("[0, 1, ...]"), not chars 0 .. 127 - confirm intent
2322         string asciiCharString = to!string(iota(0, 128, 1));
2323
2324         alias Types = AliasSeq!(string, Latin1String, Latin2String, AsciiString,
2325             Windows1250String, Windows1252String, dstring, wstring);
2326         foreach (S; Types)
2327             foreach (D; Types)
2328             {
2329                 string str;
2330                 S sStr;
2331                 D dStr;
2332                 transcode(asciiCharString, sStr); // string -> S
2333                 transcode(sStr, dStr);            // S -> D
2334                 transcode(dStr, str);             // D -> string
2335                 assert(asciiCharString == str);
2336             }
2337     }
2338     { // same round trip with non-ASCII text, restricted to the three UTF string types
2339         string czechChars = "Příliš žluťoučký kůň úpěl ďábelské ódy.";
2340         alias Types = AliasSeq!(string, dstring, wstring);
2341         foreach (S; Types)
2342             foreach (D; Types)
2343             {
2344                 string str;
2345                 S sStr;
2346                 D dStr;
2347                 transcode(czechChars, sStr);
2348                 transcode(sStr, dStr);
2349                 transcode(dStr, str);
2350                 assert(czechChars == str);
2351             }
2352     }
2353 }
2354
2355 @system unittest // mutable/const input/output
2356 {
2357     import std.meta : AliasSeq;
2358
2359     foreach (O; AliasSeq!(Latin1Char, const Latin1Char, immutable Latin1Char)) // each output constness
2360     {
2361         O[] output;
2362
2363         char[] mutableInput = "äbc".dup;
2364         transcode(mutableInput, output);
2365         assert(output == [0xE4, 'b', 'c']); // 0xE4 is the expected Latin-1 unit for 'ä'
2366
2367         const char[] constInput = "öbc";
2368         transcode(constInput, output);
2369         assert(output == [0xF6, 'b', 'c']); // 0xF6 is the expected Latin-1 unit for 'ö'
2370
2371         immutable char[] immutInput = "übc";
2372         transcode(immutInput, output);
2373         assert(output == [0xFC, 'b', 'c']); // 0xFC is the expected Latin-1 unit for 'ü'
2374     }
2375
2376     // Make sure that const/mutable input is copied.
2377     foreach (C; AliasSeq!(char, const char))
2378     {
2379         C[] input = "foo".dup;
2380         C[] output;
2381         transcode(input, output);
2382         assert(input == output);  // equal contents...
2383         assert(input !is output); // ...but a distinct array
2384     }
2385
2386     // But immutable input should not be copied.
2387     string input = "foo";
2388     string output;
2389     transcode(input, output);
2390     assert(input is output); // same slice: the data is shared
2391 }
2392
2393 //=============================================================================
2394
2395 /** The base class for exceptions thrown by this module. It carries only a message. */
2396 class EncodingException : Exception { this(string msg) @safe pure { super(msg); } }
2397
2398 class UnrecognizedEncodingException : EncodingException // specialised EncodingException; carries only a message
2399 {
2400     private this(string msg) @safe pure { super(msg); } // private: constructible only from inside this module
2401 }
2402
2403 /** Abstract base class of all encoding schemes */
2404 abstract class EncodingScheme
2405 {
2406     import std.uni : toLower;
2407
2408     /**
2409      * Registers a subclass of EncodingScheme.
2410      *
2411      * This function allows user-defined subclasses of EncodingScheme to
2412      * be declared in other modules.
2413      * Each name reported by the subclass's names() is registered in lower case.
2414      * Params:
2415      *     Klass = The subclass of EncodingScheme to register.
2416      *
2417      * Example:
2418      * ----------------------------------------------
2419      * class Amiga1251 : EncodingScheme
2420      * {
2421      *     shared static this()
2422      *     {
2423      *         EncodingScheme.register!Amiga1251;
2424      *     }
2425      * }
2426      * ----------------------------------------------
2427      */
2428     static void register(Klass:EncodingScheme)()
2429     {
2430         scope scheme = new Klass(); // temporary instance, used only to query names()
2431         foreach (encodingName;scheme.names())
2432         {
2433             supported[toLower(encodingName)] = () => new Klass(); // factory keyed by lower-cased name
2434         }
2435     }
2436
2437     deprecated("Please pass the EncodingScheme subclass as template argument instead.")
2438     static void register(string className)
2439     {
2440         auto scheme = cast(EncodingScheme) ClassInfo.find(className).create(); // NOTE(review): find() may return null for an unknown class name - confirm
2441         if (scheme is null)
2442             throw new EncodingException("Unable to create class "~className);
2443         foreach (encodingName;scheme.names())
2444         {
2445             supportedFactories[toLower(encodingName)] = className; // class name stored; instance is created later in create()
2446         }
2447     }
2448
2449     /**
2450      * Obtains a subclass of EncodingScheme which is capable of encoding
2451      * and decoding the named encoding scheme.
2452      *
2453      * The name is matched case-insensitively. Only the built-in schemes and those
2454      * registered with register() are known; other names throw EncodingException.
2455      *
2456      * Example:
2457      * ---------------------------------------------------
2458      * auto scheme = EncodingScheme.create("Amiga-1251");
2459      * ---------------------------------------------------
2460      */
2461     static EncodingScheme create(string encodingName)
2462     {
2463         static bool registerDefaultEncodings()
2464         {
2465             EncodingScheme.register!EncodingSchemeASCII;
2466             EncodingScheme.register!EncodingSchemeLatin1;
2467             EncodingScheme.register!EncodingSchemeLatin2;
2468             EncodingScheme.register!EncodingSchemeWindows1250;
2469             EncodingScheme.register!EncodingSchemeWindows1252;
2470             EncodingScheme.register!EncodingSchemeUtf8;
2471             EncodingScheme.register!EncodingSchemeUtf16Native;
2472             EncodingScheme.register!EncodingSchemeUtf32Native;
2473             return true;
2474         }
2475
2476         static shared bool initialized;
2477         import std.concurrency : initOnce;
2478         initOnce!initialized(registerDefaultEncodings()); // built-in schemes are registered lazily, from here
2479         encodingName = toLower(encodingName);
2480
2481         if (auto p = encodingName in supported)
2482             return (*p)(); // invoke the stored factory
2483
2484         auto p = encodingName in supportedFactories; // fall back to schemes registered by class name
2485         if (p is null)
2486             throw new EncodingException("Unrecognized Encoding: "~encodingName);
2487         string className = *p;
2488         auto scheme = cast(EncodingScheme) ClassInfo.find(className).create();
2489         if (scheme is null) throw new EncodingException("Unable to create class "~className);
2490         return scheme;
2491     }
2492
2493     const
2494     {
2495         /**
2496          * Returns the standard name of the encoding scheme
2497          */
2498         abstract override string toString();
2499
2500         /**
2501          * Returns an array of all known names for this encoding scheme
2502          */
2503         abstract string[] names();
2504
2505         /**
2506          * Returns true if the character c can be represented
2507          * in this encoding scheme.
2508          */
2509         abstract bool canEncode(dchar c);
2510
2511         /**
2512          * Returns the number of ubytes required to encode this code point.
2513          *
2514          * The input to this function MUST be a valid code point.
2515          *
2516          * Params:
2517          *    c = the code point to be encoded
2518          *
2519          * Returns:
2520          *    the number of ubytes required.
2521          */
2522         abstract size_t encodedLength(dchar c);
2523
2524         /**
2525          * Encodes a single code point into a user-supplied, fixed-size buffer.
2526          *
2527          * This function encodes a single code point into one or more ubytes.
2528          * The supplied buffer must be code unit aligned.
2529          * (For example, UTF-16LE or UTF-16BE must be wchar-aligned,
2530          * UTF-32LE or UTF-32BE must be dchar-aligned, etc.)
2531          *
2532          * The input to this function MUST be a valid code point.
2533          *
2534          * Params:
2535          *    c      = the code point to be encoded
2536          *    buffer = the destination array
2537          *
2538          * Returns:
2539          *    the number of ubytes written.
2540          */
2541         abstract size_t encode(dchar c, ubyte[] buffer);
2542
2543         /**
2544          * Decodes a single code point.
2545          *
2546          * This function removes one or more ubytes from the start of an array,
2547          * and returns the decoded code point which those ubytes represent.
2548          *
2549          * The input to this function MUST be validly encoded.
2550          *
2551          * Params:
2552          *    s = the array whose first code point is to be decoded
2553          */
2554         abstract dchar decode(ref const(ubyte)[] s);
2555
2556         /**
2557          * Decodes a single code point. The input does not have to be valid.
2558          *
2559          * This function removes one or more ubytes from the start of an array,
2560          * and returns the decoded code point which those ubytes represent.
2561          *
2562          * This function will accept an invalidly encoded array as input.
2563          * If an invalid sequence is found at the start of the string, this
2564          * function will remove it, and return the value INVALID_SEQUENCE.
2565          *
2566          * Params:
2567          *    s = the array whose first code point is to be decoded
2568          */
2569         abstract dchar safeDecode(ref const(ubyte)[] s);
2570
2571         /**
2572          * Returns the sequence of ubytes to be used to represent
2573          * any character which cannot be represented in the encoding scheme.
2574          *
2575          * Normally this will be a representation of some substitution
2576          * character, such as U+FFFD or '?'.
2577          */
2578         abstract @property immutable(ubyte)[] replacementSequence();
2579     }
2580
2581     /**
2582      * Returns true if the array is encoded correctly
2583      *
2584      * Params:
2585      *    s = the array to be tested
2586      */
2587     bool isValid(const(ubyte)[] s)
2588     {
2589         while (s.length != 0)
2590         {
2591             if (safeDecode(s) == INVALID_SEQUENCE) // safeDecode also advances s
2592                 return false;
2593         }
2594         return true;
2595     }
2596
2597     /**
2598      * Returns the length of the longest possible substring, starting from
2599      * the first element, which is validly encoded.
2600      *
2601      * Params:
2602      *    s = the array to be tested
2603      */
2604     size_t validLength()(const(ubyte)[] s)
2605     {
2606         const(ubyte)[] r = s; // the full input, kept for the final length difference
2607         const(ubyte)[] t = s; // what remains after the last successfully decoded sequence
2608         while (s.length != 0)
2609         {
2610             if (safeDecode(s) == INVALID_SEQUENCE) break;
2611             t = s;
2612         }
2613         return r.length - t.length;
2614     }
2615
2616     /**
2617      * Sanitizes an array by replacing malformed ubyte sequences with valid
2618      * ubyte sequences. The result is guaranteed to be valid for this
2619      * encoding scheme.
2620      *
2621      * If the input array is already valid, this function returns the
2622      * original, otherwise it constructs a new array by replacing all illegal
2623      * sequences with the encoding scheme's replacement sequence.
2624      *
2625      * Params:
2626      *    s = the string to be sanitized
2627      */
2628     immutable(ubyte)[] sanitize()(immutable(ubyte)[] s)
2629     {
2630         auto n = validLength(s);
2631         if (n == s.length) return s; // already valid: hand back the original array
2632
2633         auto repSeq = replacementSequence;
2634
2635         // Count how long the string needs to be.
2636         // Overestimating is not a problem
2637         auto len = s.length;
2638         const(ubyte)[] t = s[n..$];
2639         while (t.length != 0)
2640         {
2641             immutable c = safeDecode(t); // consumes the invalid sequence at the front of t
2642             assert(c == INVALID_SEQUENCE);
2643             len += repSeq.length;
2644             t = t[validLength(t)..$]; // skip the valid run that follows
2645         }
2646
2647         // Now do the write
2648         ubyte[] array = new ubyte[len];
2649         array[0 .. n] = s[0 .. n]; // copy the leading valid part unchanged
2650         auto offset = n;
2651
2652         t = s[n..$];
2653         while (t.length != 0)
2654         {
2655             immutable c = safeDecode(t);
2656             assert(c == INVALID_SEQUENCE);
2657             array[offset .. offset+repSeq.length] = repSeq[]; // substitute the invalid sequence
2658             offset += repSeq.length;
2659             n = validLength(t);
2660             array[offset .. offset+n] = t[0 .. n]; // then copy the next valid run
2661             offset += n;
2662             t = t[n..$];
2663         }
2664         return cast(immutable(ubyte)[])array[0 .. offset]; // trim the over-estimated tail
2665     }
2666
2667     /**
2668      * Returns the length of the first encoded sequence.
2669      *
2670      * The input to this function MUST be validly encoded.
2671      * This is enforced by the function's in-contract.
2672      *
2673      * Params:
2674      *    s = the array to be sliced
2675      */
2676     size_t firstSequence()(const(ubyte)[] s)
2677     in
2678     {
2679         assert(s.length != 0);
2680         const(ubyte)[] u = s; // validate on a copy so s itself is not advanced
2681         assert(safeDecode(u) != INVALID_SEQUENCE);
2682     }
2683     body
2684     {
2685         const(ubyte)[] t = s;
2686         decode(s); // result ignored; only the amount consumed from s matters
2687         return t.length - s.length;
2688     }
2689
2690     /**
2691      * Returns the total number of code points encoded in a ubyte array.
2692      *
2693      * The input to this function MUST be validly encoded.
2694      * This is enforced by the function's in-contract.
2695      *
2696      * Params:
2697      *    s = the string to be counted
2698      */
2699     size_t count()(const(ubyte)[] s)
2700     in
2701     {
2702         assert(isValid(s));
2703     }
2704     body
2705     {
2706         size_t n = 0;
2707         while (s.length != 0)
2708         {
2709             decode(s); // consumes one code point per iteration
2710             ++n;
2711         }
2712         return n;
2713     }
2714
2715     /**
2716      * Returns the array index at which the (n+1)th code point begins.
2717      *
2718      * The input to this function MUST be validly encoded.
2719      * This is enforced by the function's in-contract.
2720      *
2721      * Params:
2722      *    s = the array to be indexed
2723      *    n = the number of code points to skip from the start of s
2724      */
2725     ptrdiff_t index()(const(ubyte)[] s, size_t n)
2726     in
2727     {
2728         assert(isValid(s));
2729         assert(n >= 0); // always true, since n is an unsigned size_t
2730     }
2731     body
2732     {
2733         const(ubyte)[] t = s;
2734         for (size_t i=0; i<n; ++i) decode(s); // NOTE(review): no check here that s still has data when n exceeds the code point count
2735         return t.length - s.length;
2736     }
2737
2738     __gshared EncodingScheme function()[string] supported; // lower-cased name -> factory function
2739     __gshared string[string] supportedFactories; // lower-cased name -> class name (deprecated register overload)
2740 }
2741
2742 /**
2743  EncodingScheme for ASCII, implemented on top of the AsciiChar code unit type.
2744
2745  The names reported by names() for this scheme are:
2746                  "ANSI_X3.4-1968",
2747                  "ANSI_X3.4-1986",
2748                  "ASCII",
2749                  "IBM367",
2750                  "ISO646-US",
2751                  "ISO_646.irv:1991",
2752                  "US-ASCII",
2753                  "cp367",
2754                  "csASCII",
2755                  "iso-ir-6",
2756                  "us"
2757  */
2758 class EncodingSchemeASCII : EncodingScheme
2759 {
2760     /* // moved to std.internal.phobosinit
2761     shared static this()
2762     {
2763         EncodingScheme.register("std.encoding.EncodingSchemeASCII");
2764     }*/
2765
2766     const
2767     {
2768         override string[] names() @safe pure nothrow
2769         {
2770             return
2771             [
2772                 "ANSI_X3.4-1968",
2773                 "ANSI_X3.4-1986",
2774                 "ASCII",
2775                 "IBM367",
2776                 "ISO646-US",
2777                 "ISO_646.irv:1991",
2778                 "US-ASCII",
2779                 "cp367",
2780                 "csASCII",
2781                 "iso-ir-6",
2782                 "us"
2783             ];
2784         }
2785
2786         override string toString() @safe pure nothrow @nogc
2787         {
2788             return "ASCII";
2789         }
2790
2791         override bool canEncode(dchar c) @safe pure nothrow @nogc
2792         {
2793             return std.encoding.canEncode!AsciiChar(c);
2794         }
2795
2796         override size_t encodedLength(dchar c) @safe pure nothrow @nogc
2797         {
2798             return std.encoding.encodedLength!AsciiChar(c);
2799         }
2800
2801         override size_t encode(dchar c, ubyte[] buffer) @safe pure nothrow @nogc
2802         {
2803             auto target = cast(AsciiChar[]) buffer; // same memory, viewed as AsciiChar units
2804             return std.encoding.encode(c, target);
2805         }
2806
2807         override dchar decode(ref const(ubyte)[] s) @safe pure nothrow @nogc
2808         {
2809             auto rest = cast(const(AsciiChar)[]) s;
2810             immutable dchar decoded = std.encoding.decode(rest);
2811             s = s[$ - rest.length .. $]; // drop from s exactly what the decoder consumed
2812             return decoded;
2813         }
2814
2815         override dchar safeDecode(ref const(ubyte)[] s) @safe pure nothrow @nogc
2816         {
2817             auto rest = cast(const(AsciiChar)[]) s;
2818             immutable dchar decoded = std.encoding.safeDecode(rest);
2819             s = s[$ - rest.length .. $]; // drop from s exactly what the decoder consumed
2820             return decoded;
2821         }
2822
2823         override @property immutable(ubyte)[] replacementSequence() @safe pure nothrow @nogc
2824         {
2825             return cast(immutable(ubyte)[])"?";
2826         }
2827     }
2828 }
2829
2830 /**
2831  EncodingScheme for Latin-1, implemented on top of the Latin1Char code unit type.
2832
2833  The names reported by names() for this scheme are:
2834                  "CP819",
2835                  "IBM819",
2836                  "ISO-8859-1",
2837                  "ISO_8859-1",
2838                  "ISO_8859-1:1987",
2839                  "csISOLatin1",
2840                  "iso-ir-100",
2841                  "l1",
2842                  "latin1"
2843  */
2844 class EncodingSchemeLatin1 : EncodingScheme
2845 {
2846     /* // moved to std.internal.phobosinit
2847     shared static this()
2848     {
2849         EncodingScheme.register("std.encoding.EncodingSchemeLatin1");
2850     }*/
2851
2852     const
2853     {
2854         override string[] names() @safe pure nothrow
2855         {
2856             return
2857             [
2858                 "CP819",
2859                 "IBM819",
2860                 "ISO-8859-1",
2861                 "ISO_8859-1",
2862                 "ISO_8859-1:1987",
2863                 "csISOLatin1",
2864                 "iso-ir-100",
2865                 "l1",
2866                 "latin1"
2867             ];
2868         }
2869
2870         override string toString() @safe pure nothrow @nogc
2871         {
2872             return "ISO-8859-1";
2873         }
2874
2875         override bool canEncode(dchar c) @safe pure nothrow @nogc
2876         {
2877             return std.encoding.canEncode!Latin1Char(c);
2878         }
2879
2880         override size_t encodedLength(dchar c) @safe pure nothrow @nogc
2881         {
2882             return std.encoding.encodedLength!Latin1Char(c);
2883         }
2884
2885         override size_t encode(dchar c, ubyte[] buffer) @safe pure nothrow @nogc
2886         {
2887             auto target = cast(Latin1Char[]) buffer; // same memory, viewed as Latin1Char units
2888             return std.encoding.encode(c, target);
2889         }
2890
2891         override dchar decode(ref const(ubyte)[] s) @safe pure nothrow @nogc
2892         {
2893             auto rest = cast(const(Latin1Char)[]) s;
2894             immutable dchar decoded = std.encoding.decode(rest);
2895             s = s[$ - rest.length .. $]; // drop from s exactly what the decoder consumed
2896             return decoded;
2897         }
2898
2899         override dchar safeDecode(ref const(ubyte)[] s) @safe pure nothrow @nogc
2900         {
2901             auto rest = cast(const(Latin1Char)[]) s;
2902             immutable dchar decoded = std.encoding.safeDecode(rest);
2903             s = s[$ - rest.length .. $]; // drop from s exactly what the decoder consumed
2904             return decoded;
2905         }
2906
2907         override @property immutable(ubyte)[] replacementSequence() @safe pure nothrow @nogc
2908         {
2909             return cast(immutable(ubyte)[])"?";
2910         }
2911     }
2912 }
2913
/**
 EncodingScheme to handle Latin-2

 This scheme recognises the following names:
                 "Latin 2",
                 "ISO-8859-2",
                 "ISO_8859-2",
                 "ISO_8859-2:1999",
                 "windows-28592"
 */
class EncodingSchemeLatin2 : EncodingScheme
{
    // The shared static constructor that registered
    // "std.encoding.EncodingSchemeLatin2" was moved to std.internal.phobosinit.

    // Every name this scheme answers to.
    override string[] names() const @safe pure nothrow
    {
        return [
            "Latin 2",
            "ISO-8859-2",
            "ISO_8859-2",
            "ISO_8859-2:1999",
            "windows-28592",
        ];
    }

    // The name reported for this scheme.
    override string toString() const @safe pure nothrow @nogc
    {
        return "ISO-8859-2";
    }

    // Forwards to the compile-time API, instantiated for Latin2Char.
    override bool canEncode(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.canEncode!Latin2Char(c);
    }

    // Forwards to the compile-time API, instantiated for Latin2Char.
    override size_t encodedLength(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.encodedLength!Latin2Char(c);
    }

    // Views the byte buffer as Latin2Char code units and encodes c into it;
    // the result of std.encoding.encode is returned unchanged.
    override size_t encode(dchar c, ubyte[] buffer) const @safe pure nothrow @nogc
    {
        Latin2Char[] target = cast(Latin2Char[]) buffer;
        return std.encoding.encode(c, target);
    }

    // Decodes from a Latin2Char view of s, then re-slices s so that it keeps
    // only as many trailing bytes as the decoder left in that view.
    override dchar decode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    {
        const(Latin2Char)[] rest = cast(const(Latin2Char)[]) s;
        immutable dchar decoded = std.encoding.decode(rest);
        s = s[s.length - rest.length .. $];
        return decoded;
    }

    // Same as decode, but goes through std.encoding.safeDecode.
    override dchar safeDecode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    {
        const(Latin2Char)[] rest = cast(const(Latin2Char)[]) s;
        immutable dchar decoded = std.encoding.safeDecode(rest);
        s = s[s.length - rest.length .. $];
        return decoded;
    }

    // The bytes of "?".
    override @property immutable(ubyte)[] replacementSequence() const @safe pure nothrow @nogc
    {
        return cast(immutable(ubyte)[])"?";
    }
}
2989
/**
 EncodingScheme to handle Windows-1250

 This scheme recognises the following names:
                 "windows-1250"
 */
class EncodingSchemeWindows1250 : EncodingScheme
{
    // The shared static constructor that registered
    // "std.encoding.EncodingSchemeWindows1250" was moved to
    // std.internal.phobosinit.

    // Every name this scheme answers to.
    override string[] names() const @safe pure nothrow
    {
        return [ "windows-1250" ];
    }

    // The name reported for this scheme.
    override string toString() const @safe pure nothrow @nogc
    {
        return "windows-1250";
    }

    // Forwards to the compile-time API, instantiated for Windows1250Char.
    override bool canEncode(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.canEncode!Windows1250Char(c);
    }

    // Forwards to the compile-time API, instantiated for Windows1250Char.
    override size_t encodedLength(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.encodedLength!Windows1250Char(c);
    }

    // Views the byte buffer as Windows1250Char code units and encodes c into
    // it; the result of std.encoding.encode is returned unchanged.
    override size_t encode(dchar c, ubyte[] buffer) const @safe pure nothrow @nogc
    {
        Windows1250Char[] target = cast(Windows1250Char[]) buffer;
        return std.encoding.encode(c, target);
    }

    // Decodes from a Windows1250Char view of s, then re-slices s so that it
    // keeps only as many trailing bytes as the decoder left in that view.
    override dchar decode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    {
        const(Windows1250Char)[] rest = cast(const(Windows1250Char)[]) s;
        immutable dchar decoded = std.encoding.decode(rest);
        s = s[s.length - rest.length .. $];
        return decoded;
    }

    // Same as decode, but goes through std.encoding.safeDecode.
    override dchar safeDecode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    {
        const(Windows1250Char)[] rest = cast(const(Windows1250Char)[]) s;
        immutable dchar decoded = std.encoding.safeDecode(rest);
        s = s[s.length - rest.length .. $];
        return decoded;
    }

    // The bytes of "?".
    override @property immutable(ubyte)[] replacementSequence() const @safe pure nothrow @nogc
    {
        return cast(immutable(ubyte)[])"?";
    }
}
3057
/**
 EncodingScheme to handle Windows-1252

 This scheme recognises the following names:
                 "windows-1252"
 */
class EncodingSchemeWindows1252 : EncodingScheme
{
    // The shared static constructor that registered
    // "std.encoding.EncodingSchemeWindows1252" was moved to
    // std.internal.phobosinit.

    // Every name this scheme answers to.
    override string[] names() const @safe pure nothrow
    {
        return [ "windows-1252" ];
    }

    // The name reported for this scheme.
    override string toString() const @safe pure nothrow @nogc
    {
        return "windows-1252";
    }

    // Forwards to the compile-time API, instantiated for Windows1252Char.
    override bool canEncode(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.canEncode!Windows1252Char(c);
    }

    // Forwards to the compile-time API, instantiated for Windows1252Char.
    override size_t encodedLength(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.encodedLength!Windows1252Char(c);
    }

    // Views the byte buffer as Windows1252Char code units and encodes c into
    // it; the result of std.encoding.encode is returned unchanged.
    override size_t encode(dchar c, ubyte[] buffer) const @safe pure nothrow @nogc
    {
        Windows1252Char[] target = cast(Windows1252Char[]) buffer;
        return std.encoding.encode(c, target);
    }

    // Decodes from a Windows1252Char view of s, then re-slices s so that it
    // keeps only as many trailing bytes as the decoder left in that view.
    override dchar decode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    {
        const(Windows1252Char)[] rest = cast(const(Windows1252Char)[]) s;
        immutable dchar decoded = std.encoding.decode(rest);
        s = s[s.length - rest.length .. $];
        return decoded;
    }

    // Same as decode, but goes through std.encoding.safeDecode.
    override dchar safeDecode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    {
        const(Windows1252Char)[] rest = cast(const(Windows1252Char)[]) s;
        immutable dchar decoded = std.encoding.safeDecode(rest);
        s = s[s.length - rest.length .. $];
        return decoded;
    }

    // The bytes of "?".
    override @property immutable(ubyte)[] replacementSequence() const @safe pure nothrow @nogc
    {
        return cast(immutable(ubyte)[])"?";
    }
}
3125
/**
 EncodingScheme to handle UTF-8

 This scheme recognises the following names:
                 "UTF-8"
 */
class EncodingSchemeUtf8 : EncodingScheme
{
    // The shared static constructor that registered
    // "std.encoding.EncodingSchemeUtf8" was moved to std.internal.phobosinit.

    // Every name this scheme answers to.
    override string[] names() const @safe pure nothrow
    {
        return [ "UTF-8" ];
    }

    // The name reported for this scheme.
    override string toString() const @safe pure nothrow @nogc
    {
        return "UTF-8";
    }

    // Forwards to the compile-time API, instantiated for char.
    override bool canEncode(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.canEncode!char(c);
    }

    // Forwards to the compile-time API, instantiated for char.
    override size_t encodedLength(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.encodedLength!char(c);
    }

    // Views the byte buffer as char code units and encodes c into it; the
    // result of std.encoding.encode is returned unchanged.
    override size_t encode(dchar c, ubyte[] buffer) const @safe pure nothrow @nogc
    {
        char[] target = cast(char[]) buffer;
        return std.encoding.encode(c, target);
    }

    // Decodes from a char view of s, then re-slices s so that it keeps only
    // as many trailing bytes as the decoder left in that view.
    override dchar decode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    {
        const(char)[] rest = cast(const(char)[]) s;
        immutable dchar decoded = std.encoding.decode(rest);
        s = s[s.length - rest.length .. $];
        return decoded;
    }

    // Same as decode, but goes through std.encoding.safeDecode.
    override dchar safeDecode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    {
        const(char)[] rest = cast(const(char)[]) s;
        immutable dchar decoded = std.encoding.safeDecode(rest);
        s = s[s.length - rest.length .. $];
        return decoded;
    }

    // The UTF-8 bytes of U+FFFD.
    override @property immutable(ubyte)[] replacementSequence() const @safe pure nothrow @nogc
    {
        return cast(immutable(ubyte)[])"\uFFFD";
    }
}
3193
/**
 EncodingScheme to handle UTF-16 in native byte order

 This scheme recognises the following names:
                 "UTF-16LE" (little-endian architecture only)
                 "UTF-16BE" (big-endian architecture only)
 */
class EncodingSchemeUtf16Native : EncodingScheme
{
    // The shared static constructor that registered
    // "std.encoding.EncodingSchemeUtf16Native" was moved to
    // std.internal.phobosinit.

    // The single name of this scheme, picked by the target's byte order.
    version (LittleEndian)
    {
        enum string NAME = "UTF-16LE";
    }
    version (BigEndian)
    {
        enum string NAME = "UTF-16BE";
    }

    // Every name this scheme answers to: just NAME.
    override string[] names() const @safe pure nothrow
    {
        return [ NAME ];
    }

    // The name reported for this scheme.
    override string toString() const @safe pure nothrow @nogc
    {
        return NAME;
    }

    // Forwards to the compile-time API, instantiated for wchar.
    override bool canEncode(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.canEncode!wchar(c);
    }

    // Forwards to the compile-time API, instantiated for wchar.
    override size_t encodedLength(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.encodedLength!wchar(c);
    }

    // Views the byte buffer as wchar code units and encodes c into it.  The
    // count returned by std.encoding.encode is scaled by wchar.sizeof.
    override size_t encode(dchar c, ubyte[] buffer) const @safe pure nothrow @nogc
    {
        wchar[] target = cast(wchar[]) buffer;
        immutable size_t written = std.encoding.encode(c, target);
        return wchar.sizeof * written;
    }

    // Decodes from a wchar view of s, then re-slices s so that it keeps only
    // the trailing bytes that correspond to what the decoder left in that view.
    override dchar decode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    in
    {
        // The input must consist of whole 16-bit units.
        assert(s.length % 2 == 0);
    }
    body
    {
        const(wchar)[] rest = cast(const(wchar)[]) s;
        immutable dchar decoded = std.encoding.decode(rest);
        s = s[s.length - rest.length * wchar.sizeof .. $];
        return decoded;
    }

    // Same as decode, but goes through std.encoding.safeDecode.
    override dchar safeDecode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    in
    {
        // The input must consist of whole 16-bit units.
        assert(s.length % 2 == 0);
    }
    body
    {
        const(wchar)[] rest = cast(const(wchar)[]) s;
        immutable dchar decoded = std.encoding.safeDecode(rest);
        s = s[s.length - rest.length * wchar.sizeof .. $];
        return decoded;
    }

    // U+FFFD as a UTF-16 literal, reinterpreted as bytes.
    override @property immutable(ubyte)[] replacementSequence() const @safe pure nothrow @nogc
    {
        return cast(immutable(ubyte)[])"\uFFFD"w;
    }
}
@system unittest
{
    // U+019A, U+019B, U+019C as UTF-16 in the platform's native byte order.
    version (LittleEndian)
    {
        auto scheme = EncodingScheme.create("utf-16le");
        ubyte[6] encoded = [0x9A,0x01, 0x9B,0x01, 0x9C,0x01];
    }
    version (BigEndian)
    {
        auto scheme = EncodingScheme.create("utf-16be");
        ubyte[6] encoded = [0x01,0x9A, 0x01,0x9B, 0x01,0x9C];
    }

    // Decoding one code point must yield U+019A and consume exactly two bytes.
    const(ubyte)[] remaining = encoded[];
    immutable dchar first = scheme.safeDecode(remaining);
    assert(first == 0x19A);
    assert(remaining.length == 4);
}
3289
/**
 EncodingScheme to handle UTF-32 in native byte order

 This scheme recognises the following names:
                 "UTF-32LE" (little-endian architecture only)
                 "UTF-32BE" (big-endian architecture only)
 */
class EncodingSchemeUtf32Native : EncodingScheme
{
    // The shared static constructor that registered
    // "std.encoding.EncodingSchemeUtf32Native" was moved to
    // std.internal.phobosinit.

    // The single name of this scheme, picked by the target's byte order.
    version (LittleEndian)
    {
        enum string NAME = "UTF-32LE";
    }
    version (BigEndian)
    {
        enum string NAME = "UTF-32BE";
    }

    // Every name this scheme answers to: just NAME.
    override string[] names() const @safe pure nothrow
    {
        return [ NAME ];
    }

    // The name reported for this scheme.
    override string toString() const @safe pure nothrow @nogc
    {
        return NAME;
    }

    // Forwards to the compile-time API, instantiated for dchar.
    override bool canEncode(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.canEncode!dchar(c);
    }

    // Forwards to the compile-time API, instantiated for dchar.
    override size_t encodedLength(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.encodedLength!dchar(c);
    }

    // Views the byte buffer as dchar code units and encodes c into it.  The
    // count returned by std.encoding.encode is scaled by dchar.sizeof.
    override size_t encode(dchar c, ubyte[] buffer) const @safe pure nothrow @nogc
    {
        dchar[] target = cast(dchar[]) buffer;
        immutable size_t written = std.encoding.encode(c, target);
        return dchar.sizeof * written;
    }

    // Decodes from a dchar view of s, then re-slices s so that it keeps only
    // the trailing bytes that correspond to what the decoder left in that view.
    override dchar decode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    in
    {
        // The input must consist of whole 32-bit units.
        assert(s.length % 4 == 0);
    }
    body
    {
        const(dchar)[] rest = cast(const(dchar)[]) s;
        immutable dchar decoded = std.encoding.decode(rest);
        s = s[s.length - rest.length * dchar.sizeof .. $];
        return decoded;
    }

    // Same as decode, but goes through std.encoding.safeDecode.
    override dchar safeDecode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    in
    {
        // The input must consist of whole 32-bit units.
        assert(s.length % 4 == 0);
    }
    body
    {
        const(dchar)[] rest = cast(const(dchar)[]) s;
        immutable dchar decoded = std.encoding.safeDecode(rest);
        s = s[s.length - rest.length * dchar.sizeof .. $];
        return decoded;
    }

    // U+FFFD as a UTF-32 literal, reinterpreted as bytes.
    override @property immutable(ubyte)[] replacementSequence() const @safe pure nothrow @nogc
    {
        return cast(immutable(ubyte)[])"\uFFFD"d;
    }
}
@system unittest
{
    // U+019A, U+019B, U+019C as UTF-32 in the platform's native byte order.
    version (LittleEndian)
    {
        auto scheme = EncodingScheme.create("utf-32le");
        ubyte[12] encoded = [0x9A,0x01,0,0, 0x9B,0x01,0,0, 0x9C,0x01,0,0];
    }
    version (BigEndian)
    {
        auto scheme = EncodingScheme.create("utf-32be");
        ubyte[12] encoded = [0,0,0x01,0x9A, 0,0,0x01,0x9B, 0,0,0x01,0x9C];
    }

    // Decoding one code point must yield U+019A and consume exactly four bytes.
    const(ubyte)[] remaining = encoded[];
    immutable dchar first = scheme.safeDecode(remaining);
    assert(first == 0x19A);
    assert(remaining.length == 8);
}
3385
3386 //=============================================================================
3387
3388
3389 // Helper functions
3390 version (unittest)
3391 {
3392     void transcodeReverse(Src,Dst)(immutable(Src)[] s, out immutable(Dst)[] r)
3393     {
3394         static if (is(Src == Dst))
3395         {
3396             return s;
3397         }
3398         else static if (is(Src == AsciiChar))
3399         {
3400             transcodeReverse!(char,Dst)(cast(string) s,r);
3401         }
3402         else
3403         {
3404             foreach_reverse (d;codePoints(s))
3405             {
3406                 foreach_reverse (c;codeUnits!(Dst)(d))
3407                 {
3408                     r = c ~ r;
3409                 }
3410             }
3411         }
3412     }
3413
3414     string makeReadable(string s)
3415     {
3416         string r = "\"";
3417         foreach (char c;s)
3418         {
3419             if (c >= 0x20 && c < 0x80)
3420             {
3421                 r ~= c;
3422             }
3423             else
3424             {
3425                 r ~= "\\x";
3426                 r ~= toHexDigit(c >> 4);
3427                 r ~= toHexDigit(c);
3428             }
3429         }
3430         r ~= "\"";
3431         return r;
3432     }
3433
3434     string makeReadable(wstring s)
3435     {
3436         string r = "\"";
3437         foreach (wchar c;s)
3438         {
3439             if (c >= 0x20 && c < 0x80)
3440             {
3441                 r ~= cast(char) c;
3442             }
3443             else
3444             {
3445                 r ~= "\\u";
3446                 r ~= toHexDigit(c >> 12);
3447                 r ~= toHexDigit(c >> 8);
3448                 r ~= toHexDigit(c >> 4);
3449                 r ~= toHexDigit(c);
3450             }
3451         }
3452         r ~= "\"w";
3453         return r;
3454     }
3455
3456     string makeReadable(dstring s)
3457     {
3458         string r = "\"";
3459         foreach (dchar c; s)
3460         {
3461             if (c >= 0x20 && c < 0x80)
3462             {
3463                 r ~= cast(char) c;
3464             }
3465             else if (c < 0x10000)
3466             {
3467                 r ~= "\\u";
3468                 r ~= toHexDigit(c >> 12);
3469                 r ~= toHexDigit(c >> 8);
3470                 r ~= toHexDigit(c >> 4);
3471                 r ~= toHexDigit(c);
3472             }
3473             else
3474             {
3475                 r ~= "\\U00";
3476                 r ~= toHexDigit(c >> 20);
3477                 r ~= toHexDigit(c >> 16);
3478                 r ~= toHexDigit(c >> 12);
3479                 r ~= toHexDigit(c >> 8);
3480                 r ~= toHexDigit(c >> 4);
3481                 r ~= toHexDigit(c);
3482             }
3483         }
3484         r ~= "\"d";
3485         return r;
3486     }
3487
3488     char toHexDigit(int n)
3489     {
3490         return "0123456789ABCDEF"[n & 0xF];
3491     }
3492 }
3493
/** Definitions of common Byte Order Marks.
The elements of the $(D enum) can be used as indices into $(D bomTable) to get
the matching $(D BOMSeq).
*/
enum BOM
{
    /// no BOM was found
    none      = 0,
    /// [0x00, 0x00, 0xFE, 0xFF]
    utf32be   = 1,
    /// [0xFF, 0xFE, 0x00, 0x00]
    utf32le   = 2,
    /** [0x2B, 0x2F, 0x76, 0x38],
        [0x2B, 0x2F, 0x76, 0x39],
        [0x2B, 0x2F, 0x76, 0x2B],
        [0x2B, 0x2F, 0x76, 0x2F],
        [0x2B, 0x2F, 0x76, 0x38, 0x2D]
    */
    utf7      = 3,
    /// [0xF7, 0x64, 0x4C]
    utf1      = 8,
    /// [0xDD, 0x73, 0x66, 0x73]
    utfebcdic = 9,
    /// [0x0E, 0xFE, 0xFF]
    scsu      = 10,
    /// [0xFB, 0xEE, 0x28]
    bocu1     = 11,
    /// [0x84, 0x31, 0x95, 0x33]
    gb18030   = 12,
    /// [0xEF, 0xBB, 0xBF]
    utf8      = 13,
    /// [0xFE, 0xFF]
    utf16be   = 14,
    /// [0xFF, 0xFE]
    utf16le   = 15
}
3518
/// The type stored inside $(D bomTable): a $(D BOM) value named $(D schema)
/// together with a byte array named $(D sequence).
alias BOMSeq = Tuple!(
    BOM,     "schema",
    ubyte[], "sequence");
3521
/** Mapping of a byte sequence to $(B Byte Order Mark (BOM))
*/
immutable bomTable = [
    // Slot 0 is the entry getBOM hands back when nothing matches.
    BOMSeq(BOM.none,      null),
    // getBOM returns the first entry whose sequence starts the input, so a
    // sequence that extends another one (UTF-32LE over UTF-16LE, the
    // five-byte UTF-7 form over its four-byte prefix) is listed first.
    BOMSeq(BOM.utf32be,   cast(ubyte[]) [0x00, 0x00, 0xFE, 0xFF]),
    BOMSeq(BOM.utf32le,   cast(ubyte[]) [0xFF, 0xFE, 0x00, 0x00]),
    BOMSeq(BOM.utf7,      cast(ubyte[]) [0x2B, 0x2F, 0x76, 0x39]),
    BOMSeq(BOM.utf7,      cast(ubyte[]) [0x2B, 0x2F, 0x76, 0x2B]),
    BOMSeq(BOM.utf7,      cast(ubyte[]) [0x2B, 0x2F, 0x76, 0x2F]),
    BOMSeq(BOM.utf7,      cast(ubyte[]) [0x2B, 0x2F, 0x76, 0x38, 0x2D]),
    BOMSeq(BOM.utf7,      cast(ubyte[]) [0x2B, 0x2F, 0x76, 0x38]),
    BOMSeq(BOM.utf1,      cast(ubyte[]) [0xF7, 0x64, 0x4C]),
    BOMSeq(BOM.utfebcdic, cast(ubyte[]) [0xDD, 0x73, 0x66, 0x73]),
    BOMSeq(BOM.scsu,      cast(ubyte[]) [0x0E, 0xFE, 0xFF]),
    BOMSeq(BOM.bocu1,     cast(ubyte[]) [0xFB, 0xEE, 0x28]),
    BOMSeq(BOM.gb18030,   cast(ubyte[]) [0x84, 0x31, 0x95, 0x33]),
    BOMSeq(BOM.utf8,      cast(ubyte[]) [0xEF, 0xBB, 0xBF]),
    BOMSeq(BOM.utf16be,   cast(ubyte[]) [0xFE, 0xFF]),
    BOMSeq(BOM.utf16le,   cast(ubyte[]) [0xFF, 0xFE])
];
3542
/** Returns a $(D BOMSeq) for a given $(D input).
If no $(D BOM) is present the $(D BOMSeq) for $(D BOM.none) is
returned. The $(D BOM) sequence at the beginning of the range will
not be consumed from the passed range. If you pass a reference type
range make sure that $(D save) creates a deep copy.

Params:
    input = The sequence to check for the $(D BOM)

Returns:
    the found $(D BOMSeq) corresponding to the passed $(D input).
*/
immutable(BOMSeq) getBOM(Range)(Range input)
if (isForwardRange!Range && is(Unqual!(ElementType!Range) == ubyte))
{
    import std.algorithm.searching : startsWith;

    // Slot 0 of bomTable is the BOM.none entry, so candidates start at slot 1.
    // Each probe works on a saved copy of the input; the first hit wins.
    foreach (slot; 1 .. bomTable.length)
    {
        if (startsWith(input.save, bomTable[slot].sequence))
            return bomTable[slot];
    }

    return bomTable[0];
}
3569
///
@system unittest
{
    import std.format : format;

    // A UTF-32 string that starts with U+FEFF, viewed as raw bytes.
    auto ts = dchar(0x0000FEFF) ~ "Hello World"d;
    auto entry = getBOM(cast(ubyte[]) ts);

    // Which UTF-32 BOM the bytes spell depends on the native byte order.
    version (BigEndian)
        enum expected = BOM.utf32be;
    else
        enum expected = BOM.utf32le;

    assert(entry.schema == expected, format("%s", entry.schema));
}
3587
@system unittest
{
    foreach (idx, expected; bomTable)
    {
        // Every table sequence followed by some payload must be detected
        // with the schema stored in the same slot.
        auto data = expected.sequence ~ cast(ubyte[])"hello world";
        auto found = getBOM(data);
        assert(found.schema == bomTable[idx].schema);

        // get around the multiple utf7 bom's: slots 4 to 7 share the utf7
        // schema, so slot number and enum value do not line up for them.
        if (idx >= 4 && idx <= 7)
            continue;

        assert(found.schema == BOM.init + idx);
        assert(found.sequence == expected.sequence);
    }
}
3605
@safe pure unittest
{
    // Minimal non-array range over a byte array, used to drive getBOM.
    struct BOMInputRange
    {
        ubyte[] arr;

        @property ubyte front()
        {
            return arr.front;
        }

        @property bool empty()
        {
            return arr.empty;
        }

        void popFront()
        {
            arr = arr[1 .. $];
        }

        @property typeof(this) save()
        {
            return this;
        }
    }

    static assert( isInputRange!BOMInputRange);
    static assert(!isArray!BOMInputRange);

    ubyte[] padding = [0,0,0,0];

    foreach (expected; bomTable[1 .. $])
    {
        // A range holding exactly the BOM is recognised and left untouched.
        {
            auto probe = BOMInputRange(expected.sequence.dup);

            auto found = getBOM(probe);
            assert(found.schema == expected.schema);
            assert(probe.arr == expected.sequence);
        }

        // Only the first BOM byte followed by zeros is not a BOM.
        {
            auto truncated = expected.sequence[0 .. 1].dup ~ padding;
            immutable size_t lengthBefore = truncated.length;
            assert(lengthBefore - 4 < expected.sequence.length);

            auto probe = BOMInputRange(truncated.dup);
            auto found = getBOM(probe);
            assert(found.schema == BOM.none);
            assert(truncated.length == lengthBefore);
        }
    }
}
3660
/** Constant defining a fully decoded BOM: the code point U+FEFF. */
enum dchar utfBOM = '\uFEFF';