libstdc++-v3/src/c++11/codecvt.cc

   1 // Locale support (codecvt) -*- C++ -*-
   2
   3 // Copyright (C) 2015-2021 Free Software Foundation, Inc.
   4 //
   5 // This file is part of the GNU ISO C++ Library.  This library is free
   6 // software; you can redistribute it and/or modify it under the
   7 // terms of the GNU General Public License as published by the
   8 // Free Software Foundation; either version 3, or (at your option)
   9 // any later version.
  10
  11 // This library is distributed in the hope that it will be useful,
  12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 // GNU General Public License for more details.
  15
  16 // Under Section 7 of GPL version 3, you are granted additional
  17 // permissions described in the GCC Runtime Library Exception, version
  18 // 3.1, as published by the Free Software Foundation.
  19
  20 // You should have received a copy of the GNU General Public License and
  21 // a copy of the GCC Runtime Library Exception along with this program;
  22 // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  23 // <http://www.gnu.org/licenses/>.
  24
  25 #include <codecvt>
  26 #include <cstring>              // std::memcpy, std::memcmp
  27 #include <bits/stl_algobase.h>  // std::min
  28
  29 namespace std _GLIBCXX_VISIBILITY(default)
  30 {
  31 _GLIBCXX_BEGIN_NAMESPACE_VERSION
  32
  // The standard doesn't define bitwise operators for codecvt_mode, so this
  // file provides the few it needs, built on top of to_integer.

  // Convert a codecvt_mode value to its underlying integer type.
  static underlying_type<codecvt_mode>::type
  to_integer(codecvt_mode m)
  {
    using int_type = underlying_type<codecvt_mode>::type;
    return static_cast<int_type>(m);
  }
  37
  38   static codecvt_mode& operator&=(codecvt_mode& m, codecvt_mode n)
  39   { return m = codecvt_mode(to_integer(m) & to_integer(n)); }
  40
  41   static codecvt_mode& operator|=(codecvt_mode& m, codecvt_mode n)
  42   { return m = codecvt_mode(to_integer(m) | to_integer(n)); }
  43
  44   static codecvt_mode operator~(codecvt_mode m)
  45   { return codecvt_mode(~to_integer(m)); }
  46
  47 namespace
  48 {
  // Largest code point that fits in a single UTF-16 code unit.
  constexpr char32_t max_single_utf16_unit = 0xFFFF;

  // Default upper bound (maxcode) used by the conversion helpers below.
  constexpr char32_t max_code_point = 0x10FFFF;

  // Sentinel values returned by the read_*_code_point functions.
  // The functions below rely on maxcode < incomplete_mb_character
  // (which is enforced by the codecvt_utf* classes on construction),
  // so both sentinels always compare greater than maxcode.
  constexpr char32_t incomplete_mb_character = char32_t(-2);
  constexpr char32_t invalid_mb_sequence = char32_t(-1);
  58
  // Lightweight view over the code units in [next, end), used both for
  // reading code units of type Elem and for writing them.
  template<typename Elem, bool Aligned = true>
    struct range
    {
      Elem* next;  // current position
      Elem* end;   // one past the last code unit

      // Store a code unit at the current position and step past it.
      range& operator=(Elem e)
      {
        *next = e;
        ++next;
        return *this;
      }

      // Peek at the code unit at the current position.
      Elem operator*() const { return next[0]; }

      // Peek at the code unit n positions ahead.
      Elem operator[](size_t n) const { return *(next + n); }

      // Step forward by one code unit.
      range& operator++()
      {
        next += 1;
        return *this;
      }

      // Step forward by n code units.
      range& operator+=(size_t n)
      {
        next = next + n;
        return *this;
      }

      // How many code units are left.
      size_t size() const { return end - next; }

      // How many bytes are left.
      size_t nbytes() const
      {
        return reinterpret_cast<const char*>(end)
          - reinterpret_cast<const char*>(next);
      }
    };
 100
 101   // This specialization is used when accessing char16_t values through
 102   // pointers to char, which might not be correctly aligned for char16_t.
 103   template<typename Elem>
 104     struct range<Elem, false>
 105     {
 106       using value_type = typename remove_const<Elem>::type;
 107
 108       using char_pointer = typename
 109         conditional<is_const<Elem>::value, const char*, char*>::type;
 110
 111       char_pointer next;
 112       char_pointer end;
 113
 114       // Write a code unit.
 115       range& operator=(Elem e)
 116       {
 117         memcpy(next, &e, sizeof(Elem));
 118         ++*this;
 119         return *this;
 120       }
 121
 122       // Read the next code unit.
 123       Elem operator*() const
 124       {
 125         value_type e;
 126         memcpy(&e, next, sizeof(Elem));
 127         return e;
 128       }
 129
 130       // Read the Nth code unit.
 131       Elem operator[](size_t n) const
 132       {
 133         value_type e;
 134         memcpy(&e, next + n * sizeof(Elem), sizeof(Elem));
 135         return e;
 136       }
 137
 138       // Move to the next code unit.
 139       range& operator++()
 140       {
 141         next += sizeof(Elem);
 142         return *this;
 143       }
 144
 145       // Move to the Nth code unit.
 146       range& operator+=(size_t n)
 147       {
 148         next += n * sizeof(Elem);
 149         return *this;
 150       }
 151
 152       // The number of code units remaining.
 153       size_t size() const { return nbytes() / sizeof(Elem); }
 154
 155       // The number of bytes remaining.
 156       size_t nbytes() const { return end - next; }
 157     };
 158
 159   // Multibyte sequences can have "header" consisting of Byte Order Mark
 160   const unsigned char utf8_bom[3] = { 0xEF, 0xBB, 0xBF };
 161   const unsigned char utf16_bom[2] = { 0xFE, 0xFF };
 162   const unsigned char utf16le_bom[2] = { 0xFF, 0xFE };
 163
 164   // Write a BOM (space permitting).
 165   template<typename C, bool A, size_t N>
 166     bool
 167     write_bom(range<C, A>& to, const unsigned char (&bom)[N])
 168     {
 169       static_assert( (N / sizeof(C)) != 0, "" );
 170       static_assert( (N % sizeof(C)) == 0, "" );
 171
 172       if (to.nbytes() < N)
 173         return false;
 174       memcpy(to.next, bom, N);
 175       to += (N / sizeof(C));
 176       return true;
 177     }
 178
 179   // Try to read a BOM.
 180   template<typename C, bool A, size_t N>
 181     bool
 182     read_bom(range<C, A>& from, const unsigned char (&bom)[N])
 183     {
 184       static_assert( (N / sizeof(C)) != 0, "" );
 185       static_assert( (N % sizeof(C)) == 0, "" );
 186
 187       if (from.nbytes() >= N && !memcmp(from.next, bom, N))
 188         {
 189           from += (N / sizeof(C));
 190           return true;
 191         }
 192       return false;
 193     }
 194
 195   // If generate_header is set in mode write out UTF-8 BOM.
 196   template<typename C>
 197   bool
 198   write_utf8_bom(range<C>& to, codecvt_mode mode)
 199   {
 200     if (mode & generate_header)
 201       return write_bom(to, utf8_bom);
 202     return true;
 203   }
 204
 205   // If generate_header is set in mode write out the UTF-16 BOM indicated
 206   // by whether little_endian is set in mode.
 207   template<bool Aligned>
 208   bool
 209   write_utf16_bom(range<char16_t, Aligned>& to, codecvt_mode mode)
 210   {
 211     if (mode & generate_header)
 212     {
 213       if (mode & little_endian)
 214         return write_bom(to, utf16le_bom);
 215       else
 216         return write_bom(to, utf16_bom);
 217     }
 218     return true;
 219   }
 220
 221   // If consume_header is set in mode update from.next to after any BOM.
 222   template<typename C>
 223   void
 224   read_utf8_bom(range<const C>& from, codecvt_mode mode)
 225   {
 226     if (mode & consume_header)
 227       read_bom(from, utf8_bom);
 228   }
 229
 230   // If consume_header is not set in mode, no effects.
 231   // Otherwise, if *from.next is a UTF-16 BOM increment from.next and then:
 232   // - if the UTF-16BE BOM was found unset little_endian in mode, or
 233   // - if the UTF-16LE BOM was found set little_endian in mode.
 234   template<bool Aligned>
 235   void
 236   read_utf16_bom(range<const char16_t, Aligned>& from, codecvt_mode& mode)
 237   {
 238     if (mode & consume_header)
 239       {
 240         if (read_bom(from, utf16_bom))
 241           mode &= ~little_endian;
 242         else if (read_bom(from, utf16le_bom))
 243           mode |= little_endian;
 244       }
 245   }
 246
 247   // Read a codepoint from a UTF-8 multibyte sequence.
 248   // Updates from.next if the codepoint is not greater than maxcode.
 249   // Returns invalid_mb_sequence, incomplete_mb_character or the code point.
 250   template<typename C>
 251   char32_t
 252   read_utf8_code_point(range<const C>& from, unsigned long maxcode)
 253   {
 254     const size_t avail = from.size();
 255     if (avail == 0)
 256       return incomplete_mb_character;
 257     unsigned char c1 = from[0];
 258     // https://en.wikipedia.org/wiki/UTF-8#Sample_code
 259     if (c1 < 0x80)
 260     {
 261       ++from;
 262       return c1;
 263     }
 264     else if (c1 < 0xC2) // continuation or overlong 2-byte sequence
 265       return invalid_mb_sequence;
 266     else if (c1 < 0xE0) // 2-byte sequence
 267     {
 268       if (avail < 2)
 269         return incomplete_mb_character;
 270       unsigned char c2 = from[1];
 271       if ((c2 & 0xC0) != 0x80)
 272         return invalid_mb_sequence;
 273       char32_t c = (c1 << 6) + c2 - 0x3080;
 274       if (c <= maxcode)
 275         from += 2;
 276       return c;
 277     }
 278     else if (c1 < 0xF0) // 3-byte sequence
 279     {
 280       if (avail < 3)
 281         return incomplete_mb_character;
 282       unsigned char c2 = from[1];
 283       if ((c2 & 0xC0) != 0x80)
 284         return invalid_mb_sequence;
 285       if (c1 == 0xE0 && c2 < 0xA0) // overlong
 286         return invalid_mb_sequence;
 287       unsigned char c3 = from[2];
 288       if ((c3 & 0xC0) != 0x80)
 289         return invalid_mb_sequence;
 290       char32_t c = (c1 << 12) + (c2 << 6) + c3 - 0xE2080;
 291       if (c <= maxcode)
 292         from += 3;
 293       return c;
 294     }
 295     else if (c1 < 0xF5) // 4-byte sequence
 296     {
 297       if (avail < 4)
 298         return incomplete_mb_character;
 299       unsigned char c2 = from[1];
 300       if ((c2 & 0xC0) != 0x80)
 301         return invalid_mb_sequence;
 302       if (c1 == 0xF0 && c2 < 0x90) // overlong
 303         return invalid_mb_sequence;
 304       if (c1 == 0xF4 && c2 >= 0x90) // > U+10FFFF
 305       return invalid_mb_sequence;
 306       unsigned char c3 = from[2];
 307       if ((c3 & 0xC0) != 0x80)
 308         return invalid_mb_sequence;
 309       unsigned char c4 = from[3];
 310       if ((c4 & 0xC0) != 0x80)
 311         return invalid_mb_sequence;
 312       char32_t c = (c1 << 18) + (c2 << 12) + (c3 << 6) + c4 - 0x3C82080;
 313       if (c <= maxcode)
 314         from += 4;
 315       return c;
 316     }
 317     else // > U+10FFFF
 318       return invalid_mb_sequence;
 319   }
 320
 321   template<typename C>
 322   bool
 323   write_utf8_code_point(range<C>& to, char32_t code_point)
 324   {
 325     if (code_point < 0x80)
 326       {
 327         if (to.size() < 1)
 328           return false;
 329         to = code_point;
 330       }
 331     else if (code_point <= 0x7FF)
 332       {
 333         if (to.size() < 2)
 334           return false;
 335         to = (code_point >> 6) + 0xC0;
 336         to = (code_point & 0x3F) + 0x80;
 337       }
 338     else if (code_point <= 0xFFFF)
 339       {
 340         if (to.size() < 3)
 341           return false;
 342         to = (code_point >> 12) + 0xE0;
 343         to = ((code_point >> 6) & 0x3F) + 0x80;
 344         to = (code_point & 0x3F) + 0x80;
 345       }
 346     else if (code_point <= 0x10FFFF)
 347       {
 348         if (to.size() < 4)
 349           return false;
 350         to = (code_point >> 18) + 0xF0;
 351         to = ((code_point >> 12) & 0x3F) + 0x80;
 352         to = ((code_point >> 6) & 0x3F) + 0x80;
 353         to = (code_point & 0x3F) + 0x80;
 354       }
 355     else
 356       return false;
 357     return true;
 358   }
 359
 360   inline char16_t
 361   adjust_byte_order(char16_t c, codecvt_mode mode)
 362   {
 363 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
 364     return (mode & little_endian) ? __builtin_bswap16(c) : c;
 365 #else
 366     return (mode & little_endian) ? c : __builtin_bswap16(c);
 367 #endif
 368   }
 369
 370   // Return true if c is a high-surrogate (aka leading) code point.
 371   inline bool
 372   is_high_surrogate(char32_t c)
 373   {
 374     return c >= 0xD800 && c <= 0xDBFF;
 375   }
 376
 377   // Return true if c is a low-surrogate (aka trailing) code point.
 378   inline bool
 379   is_low_surrogate(char32_t c)
 380   {
 381     return c >= 0xDC00 && c <= 0xDFFF;
 382   }
 383
 384   inline char32_t
 385   surrogate_pair_to_code_point(char32_t high, char32_t low)
 386   {
 387     return (high << 10) + low - 0x35FDC00;
 388   }
 389
 390   // Read a codepoint from a UTF-16 multibyte sequence.
 391   // The sequence's endianness is indicated by (mode & little_endian).
 392   // Updates from.next if the codepoint is not greater than maxcode.
 393   // Returns invalid_mb_sequence, incomplete_mb_character or the code point.
 394   template<bool Aligned>
 395     char32_t
 396     read_utf16_code_point(range<const char16_t, Aligned>& from,
 397                           unsigned long maxcode, codecvt_mode mode)
 398     {
 399       const size_t avail = from.size();
 400       if (avail == 0)
 401         return incomplete_mb_character;
 402       int inc = 1;
 403       char32_t c = adjust_byte_order(from[0], mode);
 404       if (is_high_surrogate(c))
 405         {
 406           if (avail < 2)
 407             return incomplete_mb_character;
 408           const char16_t c2 = adjust_byte_order(from[1], mode);
 409           if (is_low_surrogate(c2))
 410             {
 411               c = surrogate_pair_to_code_point(c, c2);
 412               inc = 2;
 413             }
 414           else
 415             return invalid_mb_sequence;
 416         }
 417       else if (is_low_surrogate(c))
 418         return invalid_mb_sequence;
 419       if (c <= maxcode)
 420         from += inc;
 421       return c;
 422     }
 423
 424   template<typename C, bool A>
 425   bool
 426   write_utf16_code_point(range<C, A>& to, char32_t codepoint, codecvt_mode mode)
 427   {
 428     static_assert(sizeof(C) >= 2, "a code unit must be at least 16-bit");
 429
 430     if (codepoint <= max_single_utf16_unit)
 431       {
 432         if (to.size() > 0)
 433           {
 434             to = adjust_byte_order(codepoint, mode);
 435             return true;
 436           }
 437       }
 438     else if (to.size() > 1)
 439       {
 440         // Algorithm from http://www.unicode.org/faq/utf_bom.html#utf16-4
 441         const char32_t LEAD_OFFSET = 0xD800 - (0x10000 >> 10);
 442         char16_t lead = LEAD_OFFSET + (codepoint >> 10);
 443         char16_t trail = 0xDC00 + (codepoint & 0x3FF);
 444         to = adjust_byte_order(lead, mode);
 445         to = adjust_byte_order(trail, mode);
 446         return true;
 447       }
 448     return false;
 449   }
 450
 451   // utf8 -> ucs4
 452   template<typename C>
 453   codecvt_base::result
 454   ucs4_in(range<const C>& from, range<char32_t>& to,
 455           unsigned long maxcode = max_code_point, codecvt_mode mode = {})
 456   {
 457     read_utf8_bom(from, mode);
 458     while (from.size() && to.size())
 459       {
 460         const char32_t codepoint = read_utf8_code_point(from, maxcode);
 461         if (codepoint == incomplete_mb_character)
 462           return codecvt_base::partial;
 463         if (codepoint > maxcode)
 464           return codecvt_base::error;
 465         to = codepoint;
 466       }
 467     return from.size() ? codecvt_base::partial : codecvt_base::ok;
 468   }
 469
 470   // ucs4 -> utf8
 471   template<typename C>
 472   codecvt_base::result
 473   ucs4_out(range<const char32_t>& from, range<C>& to,
 474            unsigned long maxcode = max_code_point, codecvt_mode mode = {})
 475   {
 476     if (!write_utf8_bom(to, mode))
 477       return codecvt_base::partial;
 478     while (from.size())
 479       {
 480         const char32_t c = from[0];
 481         if (c > maxcode)
 482           return codecvt_base::error;
 483         if (!write_utf8_code_point(to, c))
 484           return codecvt_base::partial;
 485         ++from;
 486       }
 487     return codecvt_base::ok;
 488   }
 489
 490   // utf16 -> ucs4
 491   codecvt_base::result
 492   ucs4_in(range<const char16_t, false>& from, range<char32_t>& to,
 493           unsigned long maxcode = max_code_point, codecvt_mode mode = {})
 494   {
 495     read_utf16_bom(from, mode);
 496     while (from.size() && to.size())
 497       {
 498         const char32_t codepoint = read_utf16_code_point(from, maxcode, mode);
 499         if (codepoint == incomplete_mb_character)
 500           return codecvt_base::partial;
 501         if (codepoint > maxcode)
 502           return codecvt_base::error;
 503         to = codepoint;
 504       }
 505     return from.size() ? codecvt_base::partial : codecvt_base::ok;
 506   }
 507
 508   // ucs4 -> utf16
 509   codecvt_base::result
 510   ucs4_out(range<const char32_t>& from, range<char16_t, false>& to,
 511            unsigned long maxcode = max_code_point, codecvt_mode mode = {})
 512   {
 513     if (!write_utf16_bom(to, mode))
 514       return codecvt_base::partial;
 515     while (from.size())
 516       {
 517         const char32_t c = from[0];
 518         if (c > maxcode)
 519           return codecvt_base::error;
 520         if (!write_utf16_code_point(to, c, mode))
 521           return codecvt_base::partial;
 522         ++from;
 523       }
 524     return codecvt_base::ok;
 525   }
 526
  // Flag indicating whether to process UTF-16 or UCS2:
  // - allowed:    the 16-bit side is UTF-16, so surrogate pairs are accepted;
  // - disallowed: the 16-bit side is UCS-2, so surrogates are an error.
  enum class surrogates { allowed, disallowed };
 529
 530   // utf8 -> utf16 (or utf8 -> ucs2 if s == surrogates::disallowed)
 531   template<typename C8, typename C16>
 532   codecvt_base::result
 533   utf16_in(range<const C8>& from, range<C16>& to,
 534            unsigned long maxcode = max_code_point, codecvt_mode mode = {},
 535            surrogates s = surrogates::allowed)
 536   {
 537     read_utf8_bom(from, mode);
 538     while (from.size() && to.size())
 539       {
 540         auto orig = from;
 541         const char32_t codepoint = read_utf8_code_point(from, maxcode);
 542         if (codepoint == incomplete_mb_character)
 543           {
 544             if (s == surrogates::allowed)
 545               return codecvt_base::partial;
 546             else
 547               return codecvt_base::error; // No surrogates in UCS2
 548           }
 549         if (codepoint > maxcode)
 550           return codecvt_base::error;
 551         if (!write_utf16_code_point(to, codepoint, mode))
 552           {
 553             from = orig; // rewind to previous position
 554             return codecvt_base::partial;
 555           }
 556       }
 557     return codecvt_base::ok;
 558   }
 559
 560   // utf16 -> utf8 (or ucs2 -> utf8 if s == surrogates::disallowed)
 561   template<typename C16, typename C8>
 562   codecvt_base::result
 563   utf16_out(range<const C16>& from, range<C8>& to,
 564             unsigned long maxcode = max_code_point, codecvt_mode mode = {},
 565             surrogates s = surrogates::allowed)
 566   {
 567     if (!write_utf8_bom(to, mode))
 568       return codecvt_base::partial;
 569     while (from.size())
 570       {
 571         char32_t c = from[0];
 572         int inc = 1;
 573         if (is_high_surrogate(c))
 574           {
 575             if (s == surrogates::disallowed)
 576               return codecvt_base::error; // No surrogates in UCS-2
 577
 578             if (from.size() < 2)
 579               return codecvt_base::ok; // stop converting at this point
 580
 581             const char32_t c2 = from[1];
 582             if (is_low_surrogate(c2))
 583               {
 584                 c = surrogate_pair_to_code_point(c, c2);
 585                 inc = 2;
 586               }
 587             else
 588               return codecvt_base::error;
 589           }
 590         else if (is_low_surrogate(c))
 591           return codecvt_base::error;
 592         if (c > maxcode)
 593           return codecvt_base::error;
 594         if (!write_utf8_code_point(to, c))
 595           return codecvt_base::partial;
 596         from += inc;
 597       }
 598     return codecvt_base::ok;
 599   }
 600
 601   // return pos such that [begin,pos) is valid UTF-16 string no longer than max
 602   template<typename C>
 603   const C*
 604   utf16_span(const C* begin, const C* end, size_t max,
 605              char32_t maxcode = max_code_point, codecvt_mode mode = {})
 606   {
 607     range<const C> from{ begin, end };
 608     read_utf8_bom(from, mode);
 609     size_t count = 0;
 610     while (count+1 < max)
 611       {
 612         char32_t c = read_utf8_code_point(from, maxcode);
 613         if (c > maxcode)
 614           return from.next;
 615         else if (c > max_single_utf16_unit)
 616           ++count;
 617         ++count;
 618       }
 619     if (count+1 == max) // take one more character if it fits in a single unit
 620       read_utf8_code_point(from, std::min(max_single_utf16_unit, maxcode));
 621     return from.next;
 622   }
 623
 624   // utf8 -> ucs2
 625   template<typename C>
 626   codecvt_base::result
 627   ucs2_in(range<const C>& from, range<char16_t>& to,
 628           char32_t maxcode = max_code_point, codecvt_mode mode = {})
 629   {
 630     // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
 631     maxcode = std::min(max_single_utf16_unit, maxcode);
 632     return utf16_in(from, to, maxcode, mode, surrogates::disallowed);
 633   }
 634
 635   // ucs2 -> utf8
 636   template<typename C>
 637   codecvt_base::result
 638   ucs2_out(range<const char16_t>& from, range<C>& to,
 639            char32_t maxcode = max_code_point, codecvt_mode mode = {})
 640   {
 641     // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
 642     maxcode = std::min(max_single_utf16_unit, maxcode);
 643     return utf16_out(from, to, maxcode, mode, surrogates::disallowed);
 644   }
 645
 646   // ucs2 -> utf16
 647   codecvt_base::result
 648   ucs2_out(range<const char16_t>& from, range<char16_t, false>& to,
 649            char32_t maxcode = max_code_point, codecvt_mode mode = {})
 650   {
 651     if (!write_utf16_bom(to, mode))
 652       return codecvt_base::partial;
 653     while (from.size() && to.size())
 654       {
 655         char16_t c = from[0];
 656         if (is_high_surrogate(c))
 657           return codecvt_base::error;
 658         if (c > maxcode)
 659           return codecvt_base::error;
 660         to = adjust_byte_order(c, mode);
 661         ++from;
 662       }
 663     return from.size() == 0 ? codecvt_base::ok : codecvt_base::partial;
 664   }
 665
 666   // utf16 -> ucs2
 667   codecvt_base::result
 668   ucs2_in(range<const char16_t, false>& from, range<char16_t>& to,
 669           char32_t maxcode = max_code_point, codecvt_mode mode = {})
 670   {
 671     read_utf16_bom(from, mode);
 672     // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
 673     maxcode = std::min(max_single_utf16_unit, maxcode);
 674     while (from.size() && to.size())
 675       {
 676         const char32_t c = read_utf16_code_point(from, maxcode, mode);
 677         if (c == incomplete_mb_character)
 678           return codecvt_base::error; // UCS-2 only supports single units.
 679         if (c > maxcode)
 680           return codecvt_base::error;
 681         to = c;
 682       }
 683     return from.size() == 0 ? codecvt_base::ok : codecvt_base::partial;
 684   }
 685
 686   const char16_t*
 687   ucs2_span(range<const char16_t, false>& from, size_t max,
 688             char32_t maxcode, codecvt_mode mode)
 689   {
 690     read_utf16_bom(from, mode);
 691     // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
 692     maxcode = std::min(max_single_utf16_unit, maxcode);
 693     char32_t c = 0;
 694     while (max-- && c <= maxcode)
 695       c = read_utf16_code_point(from, maxcode, mode);
 696     return reinterpret_cast<const char16_t*>(from.next);
 697   }
 698
 699   template<typename C>
 700   const C*
 701   ucs2_span(const C* begin, const C* end, size_t max,
 702             char32_t maxcode, codecvt_mode mode)
 703   {
 704     range<const C> from{ begin, end };
 705     read_utf8_bom(from, mode);
 706     // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
 707     maxcode = std::min(max_single_utf16_unit, maxcode);
 708     char32_t c = 0;
 709     while (max-- && c <= maxcode)
 710       c = read_utf8_code_point(from, maxcode);
 711     return from.next;
 712   }
 713
 714   // return pos such that [begin,pos) is valid UCS-4 string no longer than max
 715   template<typename C>
 716   const C*
 717   ucs4_span(const C* begin, const C* end, size_t max,
 718             char32_t maxcode = max_code_point, codecvt_mode mode = {})
 719   {
 720     range<const C> from{ begin, end };
 721     read_utf8_bom(from, mode);
 722     char32_t c = 0;
 723     while (max-- && c <= maxcode)
 724       c = read_utf8_code_point(from, maxcode);
 725     return from.next;
 726   }
 727
 728   // return pos such that [begin,pos) is valid UCS-4 string no longer than max
 729   const char16_t*
 730   ucs4_span(range<const char16_t, false>& from, size_t max,
 731             char32_t maxcode = max_code_point, codecvt_mode mode = {})
 732   {
 733     read_utf16_bom(from, mode);
 734     char32_t c = 0;
 735     while (max-- && c <= maxcode)
 736       c = read_utf16_code_point(from, maxcode, mode);
 737     return reinterpret_cast<const char16_t*>(from.next);
 738   }
 739 }
 740
// Define members of codecvt<char16_t, char, mbstate_t> specialization.
// Converts between UTF-8 (the external encoding, used by do_in's input and
// do_out's output) and UTF-16 (the internal encoding).

// Definition of the facet's static locale::id member.
locale::id codecvt<char16_t, char, mbstate_t>::id;

// Out-of-line destructor definition; the body is empty.
codecvt<char16_t, char, mbstate_t>::~codecvt() { }
 747
 748 codecvt_base::result
 749 codecvt<char16_t, char, mbstate_t>::
 750 do_out(state_type&,
 751        const intern_type* __from,
 752        const intern_type* __from_end, const intern_type*& __from_next,
 753        extern_type* __to, extern_type* __to_end,
 754        extern_type*& __to_next) const
 755 {
 756   range<const char16_t> from{ __from, __from_end };
 757   range<char> to{ __to, __to_end };
 758   auto res = utf16_out(from, to);
 759   __from_next = from.next;
 760   __to_next = to.next;
 761   return res;
 762 }
 763
 764 codecvt_base::result
 765 codecvt<char16_t, char, mbstate_t>::
 766 do_unshift(state_type&, extern_type* __to, extern_type*,
 767            extern_type*& __to_next) const
 768 {
 769   __to_next = __to;
 770   return noconv; // we don't use mbstate_t for the unicode facets
 771 }
 772
 773 codecvt_base::result
 774 codecvt<char16_t, char, mbstate_t>::
 775 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
 776       const extern_type*& __from_next,
 777       intern_type* __to, intern_type* __to_end,
 778       intern_type*& __to_next) const
 779 {
 780   range<const char> from{ __from, __from_end };
 781   range<char16_t> to{ __to, __to_end };
 782 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
 783   codecvt_mode mode = {};
 784 #else
 785   codecvt_mode mode = little_endian;
 786 #endif
 787   auto res = utf16_in(from, to, max_code_point, mode);
 788   __from_next = from.next;
 789   __to_next = to.next;
 790   return res;
 791 }
 792
 793 int
 794 codecvt<char16_t, char, mbstate_t>::do_encoding() const throw()
 795 { return 0; } // UTF-8 is not a fixed-width encoding
 796
 797 bool
 798 codecvt<char16_t, char, mbstate_t>::do_always_noconv() const throw()
 799 { return false; }
 800
 801 int
 802 codecvt<char16_t, char, mbstate_t>::
 803 do_length(state_type&, const extern_type* __from,
 804           const extern_type* __end, size_t __max) const
 805 {
 806   __end = utf16_span(__from, __end, __max);
 807   return __end - __from;
 808 }
 809
 810 int
 811 codecvt<char16_t, char, mbstate_t>::do_max_length() const throw()
 812 {
 813   // A single character (one or two UTF-16 code units) requires
 814   // up to four UTF-8 code units.
 815   return 4;
 816 }
 817
// Define members of codecvt<char32_t, char, mbstate_t> specialization.
// Converts between UTF-8 (the external encoding, used by do_in's input and
// do_out's output) and UTF-32 (aka UCS-4, the internal encoding).

// Definition of the facet's static locale::id member.
locale::id codecvt<char32_t, char, mbstate_t>::id;

// Out-of-line destructor definition; the body is empty.
codecvt<char32_t, char, mbstate_t>::~codecvt() { }
 824
 825 codecvt_base::result
 826 codecvt<char32_t, char, mbstate_t>::
 827 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
 828        const intern_type*& __from_next,
 829        extern_type* __to, extern_type* __to_end,
 830        extern_type*& __to_next) const
 831 {
 832   range<const char32_t> from{ __from, __from_end };
 833   range<char> to{ __to, __to_end };
 834   auto res = ucs4_out(from, to);
 835   __from_next = from.next;
 836   __to_next = to.next;
 837   return res;
 838 }
 839
 840 codecvt_base::result
 841 codecvt<char32_t, char, mbstate_t>::
 842 do_unshift(state_type&, extern_type* __to, extern_type*,
 843            extern_type*& __to_next) const
 844 {
 845   __to_next = __to;
 846   return noconv;
 847 }
 848
 849 codecvt_base::result
 850 codecvt<char32_t, char, mbstate_t>::
 851 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
 852       const extern_type*& __from_next,
 853       intern_type* __to, intern_type* __to_end,
 854       intern_type*& __to_next) const
 855 {
 856   range<const char> from{ __from, __from_end };
 857   range<char32_t> to{ __to, __to_end };
 858   auto res = ucs4_in(from, to);
 859   __from_next = from.next;
 860   __to_next = to.next;
 861   return res;
 862 }
 863
 864 int
 865 codecvt<char32_t, char, mbstate_t>::do_encoding() const throw()
 866 { return 0; } // UTF-8 is not a fixed-width encoding
 867
 868 bool
 869 codecvt<char32_t, char, mbstate_t>::do_always_noconv() const throw()
 870 { return false; }
 871
 872 int
 873 codecvt<char32_t, char, mbstate_t>::
 874 do_length(state_type&, const extern_type* __from,
 875           const extern_type* __end, size_t __max) const
 876 {
 877   __end = ucs4_span(__from, __end, __max);
 878   return __end - __from;
 879 }
 880
 881 int
 882 codecvt<char32_t, char, mbstate_t>::do_max_length() const throw()
 883 {
 884   // A single character (one UTF-32 code unit) requires
 885   // up to 4 UTF-8 code units.
 886   return 4;
 887 }
 888
 889 #if defined(_GLIBCXX_USE_CHAR8_T)
// Define members of codecvt<char16_t, char8_t, mbstate_t> specialization.
// Converts between UTF-8 (the external encoding, used by do_in's input and
// do_out's output) and UTF-16 (the internal encoding).

// Definition of the facet's static locale::id member.
locale::id codecvt<char16_t, char8_t, mbstate_t>::id;

// Out-of-line destructor definition; the body is empty.
codecvt<char16_t, char8_t, mbstate_t>::~codecvt() { }
 896
 897 codecvt_base::result
 898 codecvt<char16_t, char8_t, mbstate_t>::
 899 do_out(state_type&,
 900        const intern_type* __from,
 901        const intern_type* __from_end, const intern_type*& __from_next,
 902        extern_type* __to, extern_type* __to_end,
 903        extern_type*& __to_next) const
 904 {
 905   range<const char16_t> from{ __from, __from_end };
 906   range<char8_t> to{ __to, __to_end };
 907   auto res = utf16_out(from, to);
 908   __from_next = from.next;
 909   __to_next = to.next;
 910   return res;
 911 }
 912
 913 codecvt_base::result
 914 codecvt<char16_t, char8_t, mbstate_t>::
 915 do_unshift(state_type&, extern_type* __to, extern_type*,
 916            extern_type*& __to_next) const
 917 {
 918   __to_next = __to;
 919   return noconv; // we don't use mbstate_t for the unicode facets
 920 }
 921
 922 codecvt_base::result
 923 codecvt<char16_t, char8_t, mbstate_t>::
 924 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
 925       const extern_type*& __from_next,
 926       intern_type* __to, intern_type* __to_end,
 927       intern_type*& __to_next) const
 928 {
 929   range<const char8_t> from{ __from, __from_end };
 930   range<char16_t> to{ __to, __to_end };
 931 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
 932   codecvt_mode mode = {};
 933 #else
 934   codecvt_mode mode = little_endian;
 935 #endif
 936   auto res = utf16_in(from, to, max_code_point, mode);
 937   __from_next = from.next;
 938   __to_next = to.next;
 939   return res;
 940 }
 941
 942 int
 943 codecvt<char16_t, char8_t, mbstate_t>::do_encoding() const throw()
 944 { return 0; } // UTF-8 is not a fixed-width encoding
 945
 946 bool
 947 codecvt<char16_t, char8_t, mbstate_t>::do_always_noconv() const throw()
 948 { return false; }
 949
 950 int
 951 codecvt<char16_t, char8_t, mbstate_t>::
 952 do_length(state_type&, const extern_type* __from,
 953           const extern_type* __end, size_t __max) const
 954 {
 955   __end = utf16_span(__from, __end, __max);
 956   return __end - __from;
 957 }
 958
 959 int
 960 codecvt<char16_t, char8_t, mbstate_t>::do_max_length() const throw()
 961 {
 962   // A single character (one or two UTF-16 code units) requires
 963   // up to four UTF-8 code units.
 964   return 4;
 965 }
 966
 967 // Define members of codecvt<char32_t, char8_t, mbstate_t> specialization.
 968 // Converts from UTF-8 to UTF-32 (aka UCS-4).
 969
 970 locale::id codecvt<char32_t, char8_t, mbstate_t>::id;
 971
 972 codecvt<char32_t, char8_t, mbstate_t>::~codecvt() { }
 973
 974 codecvt_base::result
 975 codecvt<char32_t, char8_t, mbstate_t>::
 976 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
 977        const intern_type*& __from_next,
 978        extern_type* __to, extern_type* __to_end,
 979        extern_type*& __to_next) const
 980 {
 981   range<const char32_t> from{ __from, __from_end };
 982   range<char8_t> to{ __to, __to_end };
 983   auto res = ucs4_out(from, to);
 984   __from_next = from.next;
 985   __to_next = to.next;
 986   return res;
 987 }
 988
 989 codecvt_base::result
 990 codecvt<char32_t, char8_t, mbstate_t>::
 991 do_unshift(state_type&, extern_type* __to, extern_type*,
 992            extern_type*& __to_next) const
 993 {
 994   __to_next = __to;
 995   return noconv;
 996 }
 997
 998 codecvt_base::result
 999 codecvt<char32_t, char8_t, mbstate_t>::
1000 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1001       const extern_type*& __from_next,
1002       intern_type* __to, intern_type* __to_end,
1003       intern_type*& __to_next) const
1004 {
1005   range<const char8_t> from{ __from, __from_end };
1006   range<char32_t> to{ __to, __to_end };
1007   auto res = ucs4_in(from, to);
1008   __from_next = from.next;
1009   __to_next = to.next;
1010   return res;
1011 }
1012
1013 int
1014 codecvt<char32_t, char8_t, mbstate_t>::do_encoding() const throw()
1015 { return 0; } // UTF-8 is not a fixed-width encoding
1016
1017 bool
1018 codecvt<char32_t, char8_t, mbstate_t>::do_always_noconv() const throw()
1019 { return false; }
1020
1021 int
1022 codecvt<char32_t, char8_t, mbstate_t>::
1023 do_length(state_type&, const extern_type* __from,
1024           const extern_type* __end, size_t __max) const
1025 {
1026   __end = ucs4_span(__from, __end, __max);
1027   return __end - __from;
1028 }
1029
1030 int
1031 codecvt<char32_t, char8_t, mbstate_t>::do_max_length() const throw()
1032 {
1033   // A single character (one UTF-32 code unit) requires
1034   // up to 4 UTF-8 code units.
1035   return 4;
1036 }
1037 #endif // _GLIBCXX_USE_CHAR8_T
1038
1039 // Define members of codecvt_utf8<char16_t> base class implementation.
1040 // Converts from UTF-8 to UCS-2.
1041
1042 __codecvt_utf8_base<char16_t>::~__codecvt_utf8_base() { }
1043
1044 codecvt_base::result
1045 __codecvt_utf8_base<char16_t>::
1046 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1047        const intern_type*& __from_next,
1048        extern_type* __to, extern_type* __to_end,
1049        extern_type*& __to_next) const
1050 {
1051   range<const char16_t> from{ __from, __from_end };
1052   range<char> to{ __to, __to_end };
1053   auto res = ucs2_out(from, to, _M_maxcode, _M_mode);
1054   __from_next = from.next;
1055   __to_next = to.next;
1056   return res;
1057 }
1058
1059 codecvt_base::result
1060 __codecvt_utf8_base<char16_t>::
1061 do_unshift(state_type&, extern_type* __to, extern_type*,
1062            extern_type*& __to_next) const
1063 {
1064   __to_next = __to;
1065   return noconv;
1066 }
1067
1068 codecvt_base::result
1069 __codecvt_utf8_base<char16_t>::
1070 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1071       const extern_type*& __from_next,
1072       intern_type* __to, intern_type* __to_end,
1073       intern_type*& __to_next) const
1074 {
1075   range<const char> from{ __from, __from_end };
1076   range<char16_t> to{ __to, __to_end };
1077   codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header));
1078 #if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
1079   mode = codecvt_mode(mode | little_endian);
1080 #endif
1081   auto res = ucs2_in(from, to, _M_maxcode, mode);
1082   __from_next = from.next;
1083   __to_next = to.next;
1084   return res;
1085 }
1086
1087 int
1088 __codecvt_utf8_base<char16_t>::do_encoding() const throw()
1089 { return 0; } // UTF-8 is not a fixed-width encoding
1090
1091 bool
1092 __codecvt_utf8_base<char16_t>::do_always_noconv() const throw()
1093 { return false; }
1094
1095 int
1096 __codecvt_utf8_base<char16_t>::
1097 do_length(state_type&, const extern_type* __from,
1098           const extern_type* __end, size_t __max) const
1099 {
1100   __end = ucs2_span(__from, __end, __max, _M_maxcode, _M_mode);
1101   return __end - __from;
1102 }
1103
1104 int
1105 __codecvt_utf8_base<char16_t>::do_max_length() const throw()
1106 {
1107   // A single UCS-2 character requires up to three UTF-8 code units.
1108   // (UCS-2 cannot represent characters that use four UTF-8 code units).
1109   int max = 3;
1110   if (_M_mode & consume_header)
1111     max += sizeof(utf8_bom);
1112   return max;
1113 }
1114
1115 // Define members of codecvt_utf8<char32_t> base class implementation.
1116 // Converts from UTF-8 to UTF-32 (aka UCS-4).
1117
1118 __codecvt_utf8_base<char32_t>::~__codecvt_utf8_base() { }
1119
1120 codecvt_base::result
1121 __codecvt_utf8_base<char32_t>::
1122 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1123        const intern_type*& __from_next,
1124        extern_type* __to, extern_type* __to_end,
1125        extern_type*& __to_next) const
1126 {
1127   range<const char32_t> from{ __from, __from_end };
1128   range<char> to{ __to, __to_end };
1129   auto res = ucs4_out(from, to, _M_maxcode, _M_mode);
1130   __from_next = from.next;
1131   __to_next = to.next;
1132   return res;
1133 }
1134
1135 codecvt_base::result
1136 __codecvt_utf8_base<char32_t>::
1137 do_unshift(state_type&, extern_type* __to, extern_type*,
1138            extern_type*& __to_next) const
1139 {
1140   __to_next = __to;
1141   return noconv;
1142 }
1143
1144 codecvt_base::result
1145 __codecvt_utf8_base<char32_t>::
1146 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1147       const extern_type*& __from_next,
1148       intern_type* __to, intern_type* __to_end,
1149       intern_type*& __to_next) const
1150 {
1151   range<const char> from{ __from, __from_end };
1152   range<char32_t> to{ __to, __to_end };
1153   auto res = ucs4_in(from, to, _M_maxcode, _M_mode);
1154   __from_next = from.next;
1155   __to_next = to.next;
1156   return res;
1157 }
1158
1159 int
1160 __codecvt_utf8_base<char32_t>::do_encoding() const throw()
1161 { return 0; } // UTF-8 is not a fixed-width encoding
1162
1163 bool
1164 __codecvt_utf8_base<char32_t>::do_always_noconv() const throw()
1165 { return false; }
1166
1167 int
1168 __codecvt_utf8_base<char32_t>::
1169 do_length(state_type&, const extern_type* __from,
1170           const extern_type* __end, size_t __max) const
1171 {
1172   __end = ucs4_span(__from, __end, __max, _M_maxcode, _M_mode);
1173   return __end - __from;
1174 }
1175
1176 int
1177 __codecvt_utf8_base<char32_t>::do_max_length() const throw()
1178 {
1179   // A single UCS-4 character requires up to four UTF-8 code units.
1180   int max = 4;
1181   if (_M_mode & consume_header)
1182     max += sizeof(utf8_bom);
1183   return max;
1184 }
1185
1186 #ifdef _GLIBCXX_USE_WCHAR_T
1187
// The wchar_t facets reinterpret wchar_t buffers as char16_t or char32_t,
// so check that the element sizes agree.
#if __SIZEOF_WCHAR_T__ == 2
static_assert(sizeof(char16_t) == sizeof(wchar_t), "");
#elif __SIZEOF_WCHAR_T__ == 4
static_assert(sizeof(char32_t) == sizeof(wchar_t), "");
#endif
1193
1194 // Define members of codecvt_utf8<wchar_t> base class implementation.
1195 // Converts from UTF-8 to UCS-2 or UCS-4 depending on sizeof(wchar_t).
1196
1197 __codecvt_utf8_base<wchar_t>::~__codecvt_utf8_base() { }
1198
1199 codecvt_base::result
1200 __codecvt_utf8_base<wchar_t>::
1201 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1202        const intern_type*& __from_next,
1203        extern_type* __to, extern_type* __to_end,
1204        extern_type*& __to_next) const
1205 {
1206   range<char> to{ __to, __to_end };
1207 #if __SIZEOF_WCHAR_T__ == 2
1208   range<const char16_t> from{
1209     reinterpret_cast<const char16_t*>(__from),
1210     reinterpret_cast<const char16_t*>(__from_end)
1211   };
1212   auto res = ucs2_out(from, to, _M_maxcode, _M_mode);
1213 #elif __SIZEOF_WCHAR_T__ == 4
1214   range<const char32_t> from{
1215     reinterpret_cast<const char32_t*>(__from),
1216     reinterpret_cast<const char32_t*>(__from_end)
1217   };
1218   auto res = ucs4_out(from, to, _M_maxcode, _M_mode);
1219 #else
1220   return codecvt_base::error;
1221 #endif
1222   __from_next = reinterpret_cast<const wchar_t*>(from.next);
1223   __to_next = to.next;
1224   return res;
1225 }
1226
1227 codecvt_base::result
1228 __codecvt_utf8_base<wchar_t>::
1229 do_unshift(state_type&, extern_type* __to, extern_type*,
1230            extern_type*& __to_next) const
1231 {
1232   __to_next = __to;
1233   return noconv;
1234 }
1235
1236 codecvt_base::result
1237 __codecvt_utf8_base<wchar_t>::
1238 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1239       const extern_type*& __from_next,
1240       intern_type* __to, intern_type* __to_end,
1241       intern_type*& __to_next) const
1242 {
1243   range<const char> from{ __from, __from_end };
1244 #if __SIZEOF_WCHAR_T__ == 2
1245   range<char16_t> to{
1246     reinterpret_cast<char16_t*>(__to),
1247     reinterpret_cast<char16_t*>(__to_end)
1248   };
1249 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
1250   codecvt_mode mode = {};
1251 #else
1252   codecvt_mode mode = little_endian;
1253 #endif
1254   auto res = ucs2_in(from, to, _M_maxcode, mode);
1255 #elif __SIZEOF_WCHAR_T__ == 4
1256   range<char32_t> to{
1257     reinterpret_cast<char32_t*>(__to),
1258     reinterpret_cast<char32_t*>(__to_end)
1259   };
1260   auto res = ucs4_in(from, to, _M_maxcode, _M_mode);
1261 #else
1262   return codecvt_base::error;
1263 #endif
1264   __from_next = from.next;
1265   __to_next = reinterpret_cast<wchar_t*>(to.next);
1266   return res;
1267 }
1268
1269 int
1270 __codecvt_utf8_base<wchar_t>::do_encoding() const throw()
1271 { return 0; } // UTF-8 is not a fixed-width encoding
1272
1273 bool
1274 __codecvt_utf8_base<wchar_t>::do_always_noconv() const throw()
1275 { return false; }
1276
1277 int
1278 __codecvt_utf8_base<wchar_t>::
1279 do_length(state_type&, const extern_type* __from,
1280           const extern_type* __end, size_t __max) const
1281 {
1282 #if __SIZEOF_WCHAR_T__ == 2
1283   __end = ucs2_span(__from, __end, __max, _M_maxcode, _M_mode);
1284 #elif __SIZEOF_WCHAR_T__ == 4
1285   __end = ucs4_span(__from, __end, __max, _M_maxcode, _M_mode);
1286 #else
1287   __end = __from;
1288 #endif
1289   return __end - __from;
1290 }
1291
1292 int
1293 __codecvt_utf8_base<wchar_t>::do_max_length() const throw()
1294 {
1295 #if __SIZEOF_WCHAR_T__ == 2
1296   int max = 3; // See __codecvt_utf8_base<char16_t>::do_max_length()
1297 #else
1298   int max = 4; // See __codecvt_utf8_base<char32_t>::do_max_length()
1299 #endif
1300   if (_M_mode & consume_header)
1301     max += sizeof(utf8_bom);
1302   return max;
1303 }
1304 #endif
1305
1306 // Define members of codecvt_utf16<char16_t> base class implementation.
1307 // Converts from UTF-16 to UCS-2.
1308
1309 __codecvt_utf16_base<char16_t>::~__codecvt_utf16_base() { }
1310
1311 codecvt_base::result
1312 __codecvt_utf16_base<char16_t>::
1313 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1314        const intern_type*& __from_next,
1315        extern_type* __to, extern_type* __to_end,
1316        extern_type*& __to_next) const
1317 {
1318   range<const char16_t> from{ __from, __from_end };
1319   range<char16_t, false> to{ __to, __to_end };
1320   auto res = ucs2_out(from, to, _M_maxcode, _M_mode);
1321   __from_next = from.next;
1322   __to_next = reinterpret_cast<char*>(to.next);
1323   return res;
1324 }
1325
1326 codecvt_base::result
1327 __codecvt_utf16_base<char16_t>::
1328 do_unshift(state_type&, extern_type* __to, extern_type*,
1329            extern_type*& __to_next) const
1330 {
1331   __to_next = __to;
1332   return noconv;
1333 }
1334
1335 codecvt_base::result
1336 __codecvt_utf16_base<char16_t>::
1337 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1338       const extern_type*& __from_next,
1339       intern_type* __to, intern_type* __to_end,
1340       intern_type*& __to_next) const
1341 {
1342   range<const char16_t, false> from{ __from, __from_end };
1343   range<char16_t> to{ __to, __to_end };
1344   auto res = ucs2_in(from, to, _M_maxcode, _M_mode);
1345   __from_next = reinterpret_cast<const char*>(from.next);
1346   __to_next = to.next;
1347   if (res == codecvt_base::ok && __from_next != __from_end)
1348     res = codecvt_base::error;
1349   return res;
1350 }
1351
1352 int
1353 __codecvt_utf16_base<char16_t>::do_encoding() const throw()
1354 { return 0; } // UTF-16 is not a fixed-width encoding
1355
1356 bool
1357 __codecvt_utf16_base<char16_t>::do_always_noconv() const throw()
1358 { return false; }
1359
1360 int
1361 __codecvt_utf16_base<char16_t>::
1362 do_length(state_type&, const extern_type* __from,
1363           const extern_type* __end, size_t __max) const
1364 {
1365   range<const char16_t, false> from{ __from, __end };
1366   const char16_t* next = ucs2_span(from, __max, _M_maxcode, _M_mode);
1367   return reinterpret_cast<const char*>(next) - __from;
1368 }
1369
1370 int
1371 __codecvt_utf16_base<char16_t>::do_max_length() const throw()
1372 {
1373   // A single UCS-2 character requires one UTF-16 code unit (so two chars).
1374   // (UCS-2 cannot represent characters that use multiple UTF-16 code units).
1375   int max = 2;
1376   if (_M_mode & consume_header)
1377     max += sizeof(utf16_bom);
1378   return max;
1379 }
1380
1381 // Define members of codecvt_utf16<char32_t> base class implementation.
1382 // Converts from UTF-16 to UTF-32 (aka UCS-4).
1383
1384 __codecvt_utf16_base<char32_t>::~__codecvt_utf16_base() { }
1385
1386 codecvt_base::result
1387 __codecvt_utf16_base<char32_t>::
1388 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1389        const intern_type*& __from_next,
1390        extern_type* __to, extern_type* __to_end,
1391        extern_type*& __to_next) const
1392 {
1393   range<const char32_t> from{ __from, __from_end };
1394   range<char16_t, false> to{ __to, __to_end };
1395   auto res = ucs4_out(from, to, _M_maxcode, _M_mode);
1396   __from_next = from.next;
1397   __to_next = reinterpret_cast<char*>(to.next);
1398   return res;
1399 }
1400
1401 codecvt_base::result
1402 __codecvt_utf16_base<char32_t>::
1403 do_unshift(state_type&, extern_type* __to, extern_type*,
1404            extern_type*& __to_next) const
1405 {
1406   __to_next = __to;
1407   return noconv;
1408 }
1409
1410 codecvt_base::result
1411 __codecvt_utf16_base<char32_t>::
1412 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1413       const extern_type*& __from_next,
1414       intern_type* __to, intern_type* __to_end,
1415       intern_type*& __to_next) const
1416 {
1417   range<const char16_t, false> from{ __from, __from_end };
1418   range<char32_t> to{ __to, __to_end };
1419   auto res = ucs4_in(from, to, _M_maxcode, _M_mode);
1420   __from_next = reinterpret_cast<const char*>(from.next);
1421   __to_next = to.next;
1422   if (res == codecvt_base::ok && __from_next != __from_end)
1423     res = codecvt_base::error;
1424   return res;
1425 }
1426
1427 int
1428 __codecvt_utf16_base<char32_t>::do_encoding() const throw()
1429 { return 0; } // UTF-16 is not a fixed-width encoding
1430
1431 bool
1432 __codecvt_utf16_base<char32_t>::do_always_noconv() const throw()
1433 { return false; }
1434
1435 int
1436 __codecvt_utf16_base<char32_t>::
1437 do_length(state_type&, const extern_type* __from,
1438           const extern_type* __end, size_t __max) const
1439 {
1440   range<const char16_t, false> from{ __from, __end };
1441   const char16_t* next = ucs4_span(from, __max, _M_maxcode, _M_mode);
1442   return reinterpret_cast<const char*>(next) - __from;
1443 }
1444
1445 int
1446 __codecvt_utf16_base<char32_t>::do_max_length() const throw()
1447 {
1448   // A single UCS-4 character requires one or two UTF-16 code units
1449   // (so up to four chars).
1450   int max = 4;
1451   if (_M_mode & consume_header)
1452     max += sizeof(utf16_bom);
1453   return max;
1454 }
1455
1456 #ifdef _GLIBCXX_USE_WCHAR_T
1457 // Define members of codecvt_utf16<wchar_t> base class implementation.
// Converts from UTF-16 to UCS-2 or UCS-4 depending on sizeof(wchar_t).
1459
1460 __codecvt_utf16_base<wchar_t>::~__codecvt_utf16_base() { }
1461
1462 codecvt_base::result
1463 __codecvt_utf16_base<wchar_t>::
1464 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1465        const intern_type*& __from_next,
1466        extern_type* __to, extern_type* __to_end,
1467        extern_type*& __to_next) const
1468 {
1469   range<char16_t, false> to{ __to, __to_end };
1470 #if __SIZEOF_WCHAR_T__ == 2
1471   range<const char16_t> from{
1472     reinterpret_cast<const char16_t*>(__from),
1473     reinterpret_cast<const char16_t*>(__from_end),
1474   };
1475   auto res = ucs2_out(from, to, _M_maxcode, _M_mode);
1476 #elif __SIZEOF_WCHAR_T__ == 4
1477   range<const char32_t> from{
1478     reinterpret_cast<const char32_t*>(__from),
1479     reinterpret_cast<const char32_t*>(__from_end),
1480   };
1481   auto res = ucs4_out(from, to, _M_maxcode, _M_mode);
1482 #else
1483   return codecvt_base::error;
1484 #endif
1485   __from_next = reinterpret_cast<const wchar_t*>(from.next);
1486   __to_next = reinterpret_cast<char*>(to.next);
1487   return res;
1488 }
1489
1490 codecvt_base::result
1491 __codecvt_utf16_base<wchar_t>::
1492 do_unshift(state_type&, extern_type* __to, extern_type*,
1493            extern_type*& __to_next) const
1494 {
1495   __to_next = __to;
1496   return noconv;
1497 }
1498
1499 codecvt_base::result
1500 __codecvt_utf16_base<wchar_t>::
1501 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1502       const extern_type*& __from_next,
1503       intern_type* __to, intern_type* __to_end,
1504       intern_type*& __to_next) const
1505 {
1506   range<const char16_t, false> from{ __from, __from_end };
1507 #if __SIZEOF_WCHAR_T__ == 2
1508   range<char16_t> to{
1509     reinterpret_cast<char16_t*>(__to),
1510     reinterpret_cast<char16_t*>(__to_end),
1511   };
1512   auto res = ucs2_in(from, to, _M_maxcode, _M_mode);
1513 #elif __SIZEOF_WCHAR_T__ == 4
1514   range<char32_t> to{
1515     reinterpret_cast<char32_t*>(__to),
1516     reinterpret_cast<char32_t*>(__to_end),
1517   };
1518   auto res = ucs4_in(from, to, _M_maxcode, _M_mode);
1519 #else
1520   return codecvt_base::error;
1521 #endif
1522   __from_next = reinterpret_cast<const char*>(from.next);
1523   __to_next = reinterpret_cast<wchar_t*>(to.next);
1524   if (res == codecvt_base::ok && __from_next != __from_end)
1525     res = codecvt_base::error;
1526   return res;
1527 }
1528
1529 int
1530 __codecvt_utf16_base<wchar_t>::do_encoding() const throw()
1531 { return 0; } // UTF-16 is not a fixed-width encoding
1532
1533 bool
1534 __codecvt_utf16_base<wchar_t>::do_always_noconv() const throw()
1535 { return false; }
1536
1537 int
1538 __codecvt_utf16_base<wchar_t>::
1539 do_length(state_type&, const extern_type* __from,
1540           const extern_type* __end, size_t __max) const
1541 {
1542   range<const char16_t, false> from{ __from, __end };
1543 #if __SIZEOF_WCHAR_T__ == 2
1544   const char16_t* next = ucs2_span(from, __max, _M_maxcode, _M_mode);
1545 #elif __SIZEOF_WCHAR_T__ == 4
1546   const char16_t* next = ucs4_span(from, __max, _M_maxcode, _M_mode);
1547 #endif
1548   return reinterpret_cast<const char*>(next) - __from;
1549 }
1550
1551 int
1552 __codecvt_utf16_base<wchar_t>::do_max_length() const throw()
1553 {
1554 #if __SIZEOF_WCHAR_T__ == 2
1555   int max = 2; // See __codecvt_utf16_base<char16_t>::do_max_length()
1556 #else
1557   int max = 4; // See __codecvt_utf16_base<char32_t>::do_max_length()
1558 #endif
1559   if (_M_mode & consume_header)
1560     max += sizeof(utf16_bom);
1561   return max;
1562 }
1563 #endif
1564
1565 // Define members of codecvt_utf8_utf16<char16_t> base class implementation.
1566 // Converts from UTF-8 to UTF-16.
1567
1568 __codecvt_utf8_utf16_base<char16_t>::~__codecvt_utf8_utf16_base() { }
1569
1570 codecvt_base::result
1571 __codecvt_utf8_utf16_base<char16_t>::
1572 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1573        const intern_type*& __from_next,
1574        extern_type* __to, extern_type* __to_end,
1575        extern_type*& __to_next) const
1576 {
1577   range<const char16_t> from{ __from, __from_end };
1578   range<char> to{ __to, __to_end };
1579   auto res = utf16_out(from, to, _M_maxcode, _M_mode);
1580   __from_next = from.next;
1581   __to_next = to.next;
1582   return res;
1583 }
1584
1585 codecvt_base::result
1586 __codecvt_utf8_utf16_base<char16_t>::
1587 do_unshift(state_type&, extern_type* __to, extern_type*,
1588            extern_type*& __to_next) const
1589 {
1590   __to_next = __to;
1591   return noconv;
1592 }
1593
1594 codecvt_base::result
1595 __codecvt_utf8_utf16_base<char16_t>::
1596 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1597       const extern_type*& __from_next,
1598       intern_type* __to, intern_type* __to_end,
1599       intern_type*& __to_next) const
1600 {
1601   range<const char> from{ __from, __from_end };
1602   range<char16_t> to{ __to, __to_end };
1603   codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header));
1604 #if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
1605   mode = codecvt_mode(mode | little_endian);
1606 #endif
1607   auto res = utf16_in(from, to, _M_maxcode, mode);
1608   __from_next = from.next;
1609   __to_next = to.next;
1610   return res;
1611 }
1612
1613 int
1614 __codecvt_utf8_utf16_base<char16_t>::do_encoding() const throw()
1615 { return 0; } // UTF-8 is not a fixed-width encoding
1616
1617 bool
1618 __codecvt_utf8_utf16_base<char16_t>::do_always_noconv() const throw()
1619 { return false; }
1620
1621 int
1622 __codecvt_utf8_utf16_base<char16_t>::
1623 do_length(state_type&, const extern_type* __from,
1624           const extern_type* __end, size_t __max) const
1625 {
1626   __end = utf16_span(__from, __end, __max, _M_maxcode, _M_mode);
1627   return __end - __from;
1628 }
1629
1630 int
1631 __codecvt_utf8_utf16_base<char16_t>::do_max_length() const throw()
1632 {
1633   // A single character can be 1 or 2 UTF-16 code units,
1634   // requiring up to 4 UTF-8 code units.
1635   int max = 4;
1636   if (_M_mode & consume_header)
1637     max += sizeof(utf8_bom);
1638   return max;
1639 }
1640
1641 // Define members of codecvt_utf8_utf16<char32_t> base class implementation.
1642 // Converts from UTF-8 to UTF-16.
1643
1644 __codecvt_utf8_utf16_base<char32_t>::~__codecvt_utf8_utf16_base() { }
1645
1646 codecvt_base::result
1647 __codecvt_utf8_utf16_base<char32_t>::
1648 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1649        const intern_type*& __from_next,
1650        extern_type* __to, extern_type* __to_end,
1651        extern_type*& __to_next) const
1652 {
1653   range<const char32_t> from{ __from, __from_end };
1654   range<char> to{ __to, __to_end };
1655   auto res = utf16_out(from, to, _M_maxcode, _M_mode);
1656   __from_next = from.next;
1657   __to_next = to.next;
1658   return res;
1659 }
1660
1661 codecvt_base::result
1662 __codecvt_utf8_utf16_base<char32_t>::
1663 do_unshift(state_type&, extern_type* __to, extern_type*,
1664            extern_type*& __to_next) const
1665 {
1666   __to_next = __to;
1667   return noconv;
1668 }
1669
1670 codecvt_base::result
1671 __codecvt_utf8_utf16_base<char32_t>::
1672 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1673       const extern_type*& __from_next,
1674       intern_type* __to, intern_type* __to_end,
1675       intern_type*& __to_next) const
1676 {
1677   range<const char> from{ __from, __from_end };
1678   range<char32_t> to{ __to, __to_end };
1679   codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header));
1680 #if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
1681   mode = codecvt_mode(mode | little_endian);
1682 #endif
1683   auto res = utf16_in(from, to, _M_maxcode, mode);
1684   __from_next = from.next;
1685   __to_next = to.next;
1686   return res;
1687 }
1688
1689 int
1690 __codecvt_utf8_utf16_base<char32_t>::do_encoding() const throw()
1691 { return 0; } // UTF-8 is not a fixed-width encoding
1692
1693 bool
1694 __codecvt_utf8_utf16_base<char32_t>::do_always_noconv() const throw()
1695 { return false; }
1696
1697 int
1698 __codecvt_utf8_utf16_base<char32_t>::
1699 do_length(state_type&, const extern_type* __from,
1700           const extern_type* __end, size_t __max) const
1701 {
1702   __end = utf16_span(__from, __end, __max, _M_maxcode, _M_mode);
1703   return __end - __from;
1704 }
1705
1706 int
1707 __codecvt_utf8_utf16_base<char32_t>::do_max_length() const throw()
1708 {
1709   // A single character can be 1 or 2 UTF-16 code units,
1710   // requiring up to 4 UTF-8 code units.
1711   int max = 4;
1712   if (_M_mode & consume_header)
1713     max += sizeof(utf8_bom);
1714   return max;
1715 }
1716
1717 #ifdef _GLIBCXX_USE_WCHAR_T
1718 // Define members of codecvt_utf8_utf16<wchar_t> base class implementation.
1719 // Converts from UTF-8 to UTF-16.
1720
1721 __codecvt_utf8_utf16_base<wchar_t>::~__codecvt_utf8_utf16_base() { }
1722
1723 codecvt_base::result
1724 __codecvt_utf8_utf16_base<wchar_t>::
1725 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1726        const intern_type*& __from_next,
1727        extern_type* __to, extern_type* __to_end,
1728        extern_type*& __to_next) const
1729 {
1730   range<const wchar_t> from{ __from, __from_end };
1731   range<char> to{ __to, __to_end };
1732   auto res = utf16_out(from, to, _M_maxcode, _M_mode);
1733   __from_next = from.next;
1734   __to_next = to.next;
1735   return res;
1736 }
1737
1738 codecvt_base::result
1739 __codecvt_utf8_utf16_base<wchar_t>::
1740 do_unshift(state_type&, extern_type* __to, extern_type*,
1741            extern_type*& __to_next) const
1742 {
1743   __to_next = __to;
1744   return noconv;
1745 }
1746
1747 codecvt_base::result
1748 __codecvt_utf8_utf16_base<wchar_t>::
1749 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1750       const extern_type*& __from_next,
1751       intern_type* __to, intern_type* __to_end,
1752       intern_type*& __to_next) const
1753 {
1754   range<const char> from{ __from, __from_end };
1755   range<wchar_t> to{ __to, __to_end };
1756   codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header));
1757 #if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
1758   mode = codecvt_mode(mode | little_endian);
1759 #endif
1760   auto res = utf16_in(from, to, _M_maxcode, mode);
1761   __from_next = from.next;
1762   __to_next = to.next;
1763   return res;
1764 }
1765
1766 int
1767 __codecvt_utf8_utf16_base<wchar_t>::do_encoding() const throw()
1768 { return 0; } // UTF-8 is not a fixed-width encoding
1769
1770 bool
1771 __codecvt_utf8_utf16_base<wchar_t>::do_always_noconv() const throw()
1772 { return false; }
1773
1774 int
1775 __codecvt_utf8_utf16_base<wchar_t>::
1776 do_length(state_type&, const extern_type* __from,
1777           const extern_type* __end, size_t __max) const
1778 {
1779   __end = utf16_span(__from, __end, __max, _M_maxcode, _M_mode);
1780   return __end - __from;
1781 }
1782
1783 int
1784 __codecvt_utf8_utf16_base<wchar_t>::do_max_length() const throw()
1785 {
1786   // A single character can be 1 or 2 UTF-16 code units,
1787   // requiring up to 4 UTF-8 code units.
1788   int max = 4;
1789   if (_M_mode & consume_header)
1790     max += sizeof(utf8_bom);
1791   return max;
1792 }
1793 #endif
1794
1795 inline template class __codecvt_abstract_base<char16_t, char, mbstate_t>;
1796 inline template class __codecvt_abstract_base<char32_t, char, mbstate_t>;
1797 template class codecvt_byname<char16_t, char, mbstate_t>;
1798 template class codecvt_byname<char32_t, char, mbstate_t>;
1799
1800 #if defined(_GLIBCXX_USE_CHAR8_T)
1801 inline template class __codecvt_abstract_base<char16_t, char8_t, mbstate_t>;
1802 inline template class __codecvt_abstract_base<char32_t, char8_t, mbstate_t>;
1803 template class codecvt_byname<char16_t, char8_t, mbstate_t>;
1804 template class codecvt_byname<char32_t, char8_t, mbstate_t>;
1805 #endif
1806
1807 _GLIBCXX_END_NAMESPACE_VERSION
1808 }