]> git.ipfire.org Git - thirdparty/gcc.git/blame - libstdc++-v3/src/c++11/codecvt.cc
Update copyright years.
[thirdparty/gcc.git] / libstdc++-v3 / src / c++11 / codecvt.cc
CommitLineData
bb93f35d
JW
1// Locale support (codecvt) -*- C++ -*-
2
83ffe9cd 3// Copyright (C) 2015-2023 Free Software Foundation, Inc.
bb93f35d
JW
4//
5// This file is part of the GNU ISO C++ Library. This library is free
6// software; you can redistribute it and/or modify it under the
7// terms of the GNU General Public License as published by the
8// Free Software Foundation; either version 3, or (at your option)
9// any later version.
10
11// This library is distributed in the hope that it will be useful,
12// but WITHOUT ANY WARRANTY; without even the implied warranty of
13// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14// GNU General Public License for more details.
15
16// Under Section 7 of GPL version 3, you are granted additional
17// permissions described in the GCC Runtime Library Exception, version
18// 3.1, as published by the Free Software Foundation.
19
20// You should have received a copy of the GNU General Public License and
21// a copy of the GCC Runtime Library Exception along with this program;
22// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23// <http://www.gnu.org/licenses/>.
24
28af1fb3 25#include <codecvt>
bb93f35d 26#include <cstring> // std::memcpy, std::memcmp
bcd682e1 27#include <bits/stl_algobase.h> // std::min
bb93f35d 28
bb93f35d
JW
29namespace std _GLIBCXX_VISIBILITY(default)
30{
31_GLIBCXX_BEGIN_NAMESPACE_VERSION
32
bcd682e1
JW
// The standard doesn't define bitwise operators for codecvt_mode, so the
// small set needed by this file is provided here.

// Convert a codecvt_mode to a value of its underlying integer type.
static std::underlying_type<std::codecvt_mode>::type
to_integer(std::codecvt_mode m)
{ return static_cast<std::underlying_type<std::codecvt_mode>::type>(m); }

// Keep only the flags of m that are also set in n.
static std::codecvt_mode& operator&=(std::codecvt_mode& m, std::codecvt_mode n)
{ return m = std::codecvt_mode(to_integer(m) & to_integer(n)); }

// Add the flags of n to m.
static std::codecvt_mode& operator|=(std::codecvt_mode& m, std::codecvt_mode n)
{ return m = std::codecvt_mode(to_integer(m) | to_integer(n)); }

// Return the complement of m, restricted to the defined flag bits.
static std::codecvt_mode operator~(std::codecvt_mode m)
{
  // codecvt_mode has no fixed underlying type, so converting a value with
  // bits set outside the enumerators' range back to the enum is undefined.
  // Masking with the defined flags keeps the result in range and gives the
  // same outcome when it is later combined with a valid mode using &=.
  const auto all_flags = to_integer(std::consume_header)
    | to_integer(std::generate_header)
    | to_integer(std::little_endian);
  return std::codecvt_mode(~to_integer(m) & all_flags);
}
46
bb93f35d
JW
47namespace
48{
// Largest code point that fits in a single UTF-16 code unit.
constexpr char32_t max_single_utf16_unit = 0xFFFF;

// Default upper limit for code points (U+10FFFF).
constexpr char32_t max_code_point = 0x10FFFF;

// Sentinel values returned by the code point readers below.
// Those functions rely on maxcode < incomplete_mb_character
// (which is enforced by the codecvt_utf* classes on construction).
constexpr char32_t incomplete_mb_character = static_cast<char32_t>(-2);
constexpr char32_t invalid_mb_sequence = static_cast<char32_t>(-1);
58
d951e75d
JW
// Cursor over a [next, end) sequence of code units of type Elem, used for
// both reading and writing.  It is an aggregate so that it can be created
// with brace initialization from a pair of pointers.
template<typename Elem, bool Aligned = true>
struct range
{
  Elem* next;  // current position
  Elem* end;   // one past the last code unit

  // Write a code unit at the current position and step past it.
  range& operator=(Elem unit)
  {
    *next = unit;
    ++next;
    return *this;
  }

  // Read the code unit at the current position.
  Elem operator*() const { return next[0]; }

  // Read the code unit `index` positions after the current one.
  Elem operator[](size_t index) const { return *(next + index); }

  // Step over one code unit.
  range& operator++()
  {
    next += 1;
    return *this;
  }

  // Step over `count` code units.
  range& operator+=(size_t count)
  {
    next = next + count;
    return *this;
  }

  // The number of code units remaining.
  size_t size() const { return static_cast<size_t>(end - next); }

  // The number of bytes remaining.
  size_t nbytes() const
  {
    const char* const first = static_cast<const char*>(
        static_cast<const void*>(next));
    const char* const last = static_cast<const char*>(
        static_cast<const void*>(end));
    return last - first;
  }
};

// This specialization is used when accessing char16_t values through
// pointers to char, which might not be correctly aligned for char16_t.
// Code units are therefore copied in and out with memcpy, and the
// positions are kept as byte pointers.
template<typename Elem>
struct range<Elem, false>
{
  using value_type = typename std::remove_const<Elem>::type;

  using char_pointer = typename
    std::conditional<std::is_const<Elem>::value, const char*, char*>::type;

  char_pointer next;  // current position, in bytes
  char_pointer end;   // one past the last byte

  // Write a code unit at the current position and step past it.
  range& operator=(Elem unit)
  {
    std::memcpy(next, &unit, sizeof(Elem));
    next += sizeof(Elem);
    return *this;
  }

  // Read the code unit at the current position.
  Elem operator*() const { return (*this)[0]; }

  // Read the code unit `index` positions after the current one.
  Elem operator[](size_t index) const
  {
    value_type unit;
    std::memcpy(&unit, next + index * sizeof(Elem), sizeof(Elem));
    return unit;
  }

  // Step over one code unit.
  range& operator++()
  {
    next += sizeof(Elem);
    return *this;
  }

  // Step over `count` code units.
  range& operator+=(size_t count)
  {
    next += count * sizeof(Elem);
    return *this;
  }

  // The number of whole code units remaining.
  size_t size() const { return nbytes() / sizeof(Elem); }

  // The number of bytes remaining.
  size_t nbytes() const { return end - next; }
};
158
28af1fb3
JW
// Byte Order Marks that may appear as a "header" at the start of a
// multibyte sequence, stored as raw bytes.
constexpr unsigned char utf8_bom[] = { 0xEF, 0xBB, 0xBF };
constexpr unsigned char utf16_bom[] = { 0xFE, 0xFF };    // big endian
constexpr unsigned char utf16le_bom[] = { 0xFF, 0xFE };  // little endian
28af1fb3 163
d951e75d
JW
164 // Write a BOM (space permitting).
165 template<typename C, bool A, size_t N>
166 bool
167 write_bom(range<C, A>& to, const unsigned char (&bom)[N])
28af1fb3 168 {
d951e75d
JW
169 static_assert( (N / sizeof(C)) != 0, "" );
170 static_assert( (N % sizeof(C)) == 0, "" );
171
172 if (to.nbytes() < N)
28af1fb3
JW
173 return false;
174 memcpy(to.next, bom, N);
d951e75d 175 to += (N / sizeof(C));
28af1fb3
JW
176 return true;
177 }
178
d951e75d
JW
179 // Try to read a BOM.
180 template<typename C, bool A, size_t N>
181 bool
182 read_bom(range<C, A>& from, const unsigned char (&bom)[N])
183 {
184 static_assert( (N / sizeof(C)) != 0, "" );
185 static_assert( (N % sizeof(C)) == 0, "" );
186
187 if (from.nbytes() >= N && !memcmp(from.next, bom, N))
188 {
189 from += (N / sizeof(C));
190 return true;
191 }
192 return false;
193 }
194
28af1fb3 195 // If generate_header is set in mode write out UTF-8 BOM.
c124af93 196 template<typename C>
28af1fb3 197 bool
c124af93 198 write_utf8_bom(range<C>& to, codecvt_mode mode)
28af1fb3
JW
199 {
200 if (mode & generate_header)
201 return write_bom(to, utf8_bom);
202 return true;
203 }
204
205 // If generate_header is set in mode write out the UTF-16 BOM indicated
206 // by whether little_endian is set in mode.
d951e75d 207 template<bool Aligned>
28af1fb3 208 bool
d951e75d 209 write_utf16_bom(range<char16_t, Aligned>& to, codecvt_mode mode)
28af1fb3
JW
210 {
211 if (mode & generate_header)
212 {
d951e75d
JW
213 if (mode & little_endian)
214 return write_bom(to, utf16le_bom);
215 else
216 return write_bom(to, utf16_bom);
28af1fb3
JW
217 }
218 return true;
219 }
220
28af1fb3 221 // If consume_header is set in mode update from.next to after any BOM.
c124af93 222 template<typename C>
28af1fb3 223 void
c124af93 224 read_utf8_bom(range<const C>& from, codecvt_mode mode)
28af1fb3
JW
225 {
226 if (mode & consume_header)
227 read_bom(from, utf8_bom);
228 }
229
bcd682e1
JW
230 // If consume_header is not set in mode, no effects.
231 // Otherwise, if *from.next is a UTF-16 BOM increment from.next and then:
232 // - if the UTF-16BE BOM was found unset little_endian in mode, or
233 // - if the UTF-16LE BOM was found set little_endian in mode.
d951e75d 234 template<bool Aligned>
bcd682e1 235 void
d951e75d 236 read_utf16_bom(range<const char16_t, Aligned>& from, codecvt_mode& mode)
28af1fb3 237 {
d951e75d 238 if (mode & consume_header)
28af1fb3 239 {
d951e75d
JW
240 if (read_bom(from, utf16_bom))
241 mode &= ~little_endian;
242 else if (read_bom(from, utf16le_bom))
243 mode |= little_endian;
28af1fb3 244 }
28af1fb3
JW
245 }
246
247 // Read a codepoint from a UTF-8 multibyte sequence.
248 // Updates from.next if the codepoint is not greater than maxcode.
b6584a72 249 // Returns invalid_mb_sequence, incomplete_mb_character or the code point.
c124af93 250 template<typename C>
bb93f35d 251 char32_t
c124af93 252 read_utf8_code_point(range<const C>& from, unsigned long maxcode)
bb93f35d 253 {
b6584a72 254 const size_t avail = from.size();
bb93f35d 255 if (avail == 0)
b6584a72 256 return incomplete_mb_character;
8f7b7c14 257 char32_t c1 = (unsigned char) from[0];
bb93f35d
JW
258 // https://en.wikipedia.org/wiki/UTF-8#Sample_code
259 if (c1 < 0x80)
260 {
d951e75d 261 ++from;
bb93f35d
JW
262 return c1;
263 }
264 else if (c1 < 0xC2) // continuation or overlong 2-byte sequence
b6584a72 265 return invalid_mb_sequence;
bb93f35d
JW
266 else if (c1 < 0xE0) // 2-byte sequence
267 {
268 if (avail < 2)
b6584a72 269 return incomplete_mb_character;
8f7b7c14 270 char32_t c2 = (unsigned char) from[1];
bb93f35d 271 if ((c2 & 0xC0) != 0x80)
b6584a72 272 return invalid_mb_sequence;
bb93f35d 273 char32_t c = (c1 << 6) + c2 - 0x3080;
28af1fb3 274 if (c <= maxcode)
d951e75d 275 from += 2;
bb93f35d
JW
276 return c;
277 }
278 else if (c1 < 0xF0) // 3-byte sequence
279 {
02dab998 280 if (avail < 2)
b6584a72 281 return incomplete_mb_character;
8f7b7c14 282 char32_t c2 = (unsigned char) from[1];
bb93f35d 283 if ((c2 & 0xC0) != 0x80)
b6584a72 284 return invalid_mb_sequence;
bb93f35d 285 if (c1 == 0xE0 && c2 < 0xA0) // overlong
b6584a72 286 return invalid_mb_sequence;
02dab998
DM
287 if (avail < 3)
288 return incomplete_mb_character;
8f7b7c14 289 char32_t c3 = (unsigned char) from[2];
bb93f35d 290 if ((c3 & 0xC0) != 0x80)
b6584a72 291 return invalid_mb_sequence;
bb93f35d 292 char32_t c = (c1 << 12) + (c2 << 6) + c3 - 0xE2080;
28af1fb3 293 if (c <= maxcode)
d951e75d 294 from += 3;
bb93f35d
JW
295 return c;
296 }
02dab998 297 else if (c1 < 0xF5 && maxcode > 0xFFFF) // 4-byte sequence
bb93f35d 298 {
02dab998 299 if (avail < 2)
b6584a72 300 return incomplete_mb_character;
8f7b7c14 301 char32_t c2 = (unsigned char) from[1];
bb93f35d 302 if ((c2 & 0xC0) != 0x80)
b6584a72 303 return invalid_mb_sequence;
bb93f35d 304 if (c1 == 0xF0 && c2 < 0x90) // overlong
b6584a72 305 return invalid_mb_sequence;
bb93f35d 306 if (c1 == 0xF4 && c2 >= 0x90) // > U+10FFFF
02dab998
DM
307 return invalid_mb_sequence;
308 if (avail < 3)
309 return incomplete_mb_character;
8f7b7c14 310 char32_t c3 = (unsigned char) from[2];
bb93f35d 311 if ((c3 & 0xC0) != 0x80)
b6584a72 312 return invalid_mb_sequence;
02dab998
DM
313 if (avail < 4)
314 return incomplete_mb_character;
8f7b7c14 315 char32_t c4 = (unsigned char) from[3];
bb93f35d 316 if ((c4 & 0xC0) != 0x80)
b6584a72 317 return invalid_mb_sequence;
bb93f35d 318 char32_t c = (c1 << 18) + (c2 << 12) + (c3 << 6) + c4 - 0x3C82080;
28af1fb3 319 if (c <= maxcode)
d951e75d 320 from += 4;
bb93f35d
JW
321 return c;
322 }
323 else // > U+10FFFF
b6584a72 324 return invalid_mb_sequence;
bb93f35d
JW
325 }
326
c124af93 327 template<typename C>
bb93f35d 328 bool
c124af93 329 write_utf8_code_point(range<C>& to, char32_t code_point)
bb93f35d
JW
330 {
331 if (code_point < 0x80)
332 {
333 if (to.size() < 1)
334 return false;
d951e75d 335 to = code_point;
bb93f35d
JW
336 }
337 else if (code_point <= 0x7FF)
338 {
339 if (to.size() < 2)
340 return false;
d951e75d
JW
341 to = (code_point >> 6) + 0xC0;
342 to = (code_point & 0x3F) + 0x80;
bb93f35d
JW
343 }
344 else if (code_point <= 0xFFFF)
345 {
346 if (to.size() < 3)
347 return false;
d951e75d
JW
348 to = (code_point >> 12) + 0xE0;
349 to = ((code_point >> 6) & 0x3F) + 0x80;
350 to = (code_point & 0x3F) + 0x80;
bb93f35d
JW
351 }
352 else if (code_point <= 0x10FFFF)
353 {
354 if (to.size() < 4)
355 return false;
d951e75d
JW
356 to = (code_point >> 18) + 0xF0;
357 to = ((code_point >> 12) & 0x3F) + 0x80;
358 to = ((code_point >> 6) & 0x3F) + 0x80;
359 to = (code_point & 0x3F) + 0x80;
bb93f35d
JW
360 }
361 else
362 return false;
363 return true;
364 }
365
28af1fb3
JW
// Convert a UTF-16 code unit between the host byte order and the byte
// order selected by (mode & little_endian).  The value is byte-swapped
// only when the two differ, so the same function serves for both reading
// and writing.
inline char16_t
adjust_byte_order(char16_t c, std::codecvt_mode mode)
{
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  const bool host_is_little = false;
#else
  const bool host_is_little = true;
#endif
  const bool want_little = (mode & std::little_endian) != 0;
  if (want_little == host_is_little)
    return c;
  return __builtin_bswap16(c);
}
375
b6584a72
JW
// Return true if c is a high-surrogate (aka leading) code point,
// i.e. lies in [0xD800, 0xDBFF].
inline bool
is_high_surrogate(char32_t c)
{
  // All values in that block share the same bits above the low ten.
  return (c & 0xFFFFFC00) == 0xD800;
}
382
// Return true if c is a low-surrogate (aka trailing) code point,
// i.e. lies in [0xDC00, 0xDFFF].
inline bool
is_low_surrogate(char32_t c)
{
  // All values in that block share the same bits above the low ten.
  return (c & 0xFFFFFC00) == 0xDC00;
}
389
// Combine a high and a low surrogate into the code point they encode.
// The caller is expected to have checked both halves beforehand.
inline char32_t
surrogate_pair_to_code_point(char32_t high, char32_t low)
{
  const char32_t upper_bits = (high - 0xD800) << 10;
  const char32_t lower_bits = low - 0xDC00;
  return 0x10000 + upper_bits + lower_bits;
}
395
28af1fb3
JW
396 // Read a codepoint from a UTF-16 multibyte sequence.
397 // The sequence's endianness is indicated by (mode & little_endian).
398 // Updates from.next if the codepoint is not greater than maxcode.
b6584a72 399 // Returns invalid_mb_sequence, incomplete_mb_character or the code point.
d951e75d
JW
400 template<bool Aligned>
401 char32_t
402 read_utf16_code_point(range<const char16_t, Aligned>& from,
403 unsigned long maxcode, codecvt_mode mode)
404 {
405 const size_t avail = from.size();
406 if (avail == 0)
407 return incomplete_mb_character;
408 int inc = 1;
409 char32_t c = adjust_byte_order(from[0], mode);
410 if (is_high_surrogate(c))
411 {
412 if (avail < 2)
413 return incomplete_mb_character;
414 const char16_t c2 = adjust_byte_order(from[1], mode);
415 if (is_low_surrogate(c2))
416 {
417 c = surrogate_pair_to_code_point(c, c2);
418 inc = 2;
419 }
420 else
421 return invalid_mb_sequence;
422 }
423 else if (is_low_surrogate(c))
424 return invalid_mb_sequence;
425 if (c <= maxcode)
426 from += inc;
427 return c;
428 }
28af1fb3 429
d951e75d 430 template<typename C, bool A>
bb93f35d 431 bool
d951e75d 432 write_utf16_code_point(range<C, A>& to, char32_t codepoint, codecvt_mode mode)
bb93f35d 433 {
28af1fb3
JW
434 static_assert(sizeof(C) >= 2, "a code unit must be at least 16-bit");
435
02e12bda 436 if (codepoint <= max_single_utf16_unit)
bb93f35d
JW
437 {
438 if (to.size() > 0)
439 {
d951e75d 440 to = adjust_byte_order(codepoint, mode);
bb93f35d
JW
441 return true;
442 }
443 }
444 else if (to.size() > 1)
445 {
446 // Algorithm from http://www.unicode.org/faq/utf_bom.html#utf16-4
447 const char32_t LEAD_OFFSET = 0xD800 - (0x10000 >> 10);
bb93f35d
JW
448 char16_t lead = LEAD_OFFSET + (codepoint >> 10);
449 char16_t trail = 0xDC00 + (codepoint & 0x3FF);
d951e75d
JW
450 to = adjust_byte_order(lead, mode);
451 to = adjust_byte_order(trail, mode);
bb93f35d
JW
452 return true;
453 }
454 return false;
455 }
456
457 // utf8 -> ucs4
c124af93 458 template<typename C>
bb93f35d 459 codecvt_base::result
c124af93 460 ucs4_in(range<const C>& from, range<char32_t>& to,
28af1fb3 461 unsigned long maxcode = max_code_point, codecvt_mode mode = {})
bb93f35d 462 {
28af1fb3 463 read_utf8_bom(from, mode);
bb93f35d
JW
464 while (from.size() && to.size())
465 {
466 const char32_t codepoint = read_utf8_code_point(from, maxcode);
b6584a72
JW
467 if (codepoint == incomplete_mb_character)
468 return codecvt_base::partial;
28af1fb3 469 if (codepoint > maxcode)
bb93f35d 470 return codecvt_base::error;
d951e75d 471 to = codepoint;
bb93f35d
JW
472 }
473 return from.size() ? codecvt_base::partial : codecvt_base::ok;
474 }
475
476 // ucs4 -> utf8
c124af93 477 template<typename C>
bb93f35d 478 codecvt_base::result
c124af93 479 ucs4_out(range<const char32_t>& from, range<C>& to,
28af1fb3 480 unsigned long maxcode = max_code_point, codecvt_mode mode = {})
bb93f35d 481 {
28af1fb3
JW
482 if (!write_utf8_bom(to, mode))
483 return codecvt_base::partial;
bb93f35d
JW
484 while (from.size())
485 {
d951e75d 486 const char32_t c = from[0];
bb93f35d
JW
487 if (c > maxcode)
488 return codecvt_base::error;
489 if (!write_utf8_code_point(to, c))
490 return codecvt_base::partial;
d951e75d 491 ++from;
bb93f35d
JW
492 }
493 return codecvt_base::ok;
494 }
495
28af1fb3
JW
496 // utf16 -> ucs4
497 codecvt_base::result
d951e75d 498 ucs4_in(range<const char16_t, false>& from, range<char32_t>& to,
28af1fb3
JW
499 unsigned long maxcode = max_code_point, codecvt_mode mode = {})
500 {
bcd682e1 501 read_utf16_bom(from, mode);
28af1fb3
JW
502 while (from.size() && to.size())
503 {
504 const char32_t codepoint = read_utf16_code_point(from, maxcode, mode);
b6584a72
JW
505 if (codepoint == incomplete_mb_character)
506 return codecvt_base::partial;
28af1fb3
JW
507 if (codepoint > maxcode)
508 return codecvt_base::error;
d951e75d 509 to = codepoint;
28af1fb3
JW
510 }
511 return from.size() ? codecvt_base::partial : codecvt_base::ok;
512 }
513
514 // ucs4 -> utf16
515 codecvt_base::result
d951e75d 516 ucs4_out(range<const char32_t>& from, range<char16_t, false>& to,
28af1fb3
JW
517 unsigned long maxcode = max_code_point, codecvt_mode mode = {})
518 {
519 if (!write_utf16_bom(to, mode))
520 return codecvt_base::partial;
521 while (from.size())
522 {
d951e75d 523 const char32_t c = from[0];
28af1fb3
JW
524 if (c > maxcode)
525 return codecvt_base::error;
526 if (!write_utf16_code_point(to, c, mode))
527 return codecvt_base::partial;
d951e75d 528 ++from;
28af1fb3
JW
529 }
530 return codecvt_base::ok;
531 }
532
bcd682e1
JW
// Flag indicating whether to process UTF-16 (surrogate pairs allowed)
// or UCS2 (surrogate pairs disallowed).
enum class surrogates
{
  allowed,
  disallowed
};
535
02dab998
DM
536 // utf8 -> utf16 (or utf8 -> ucs2 if maxcode <= 0xFFFF)
537 template <typename C8, typename C16>
bb93f35d 538 codecvt_base::result
02dab998
DM
539 utf16_in(range<const C8> &from, range<C16> &to,
540 unsigned long maxcode = max_code_point, codecvt_mode mode = {})
bb93f35d 541 {
28af1fb3 542 read_utf8_bom(from, mode);
bb93f35d
JW
543 while (from.size() && to.size())
544 {
d951e75d 545 auto orig = from;
bb93f35d 546 const char32_t codepoint = read_utf8_code_point(from, maxcode);
b6584a72 547 if (codepoint == incomplete_mb_character)
02dab998 548 return codecvt_base::partial;
28af1fb3 549 if (codepoint > maxcode)
bb93f35d 550 return codecvt_base::error;
7f971f18 551 if (!write_utf16_code_point(to, codepoint, mode))
bb93f35d 552 {
d951e75d 553 from = orig; // rewind to previous position
bb93f35d
JW
554 return codecvt_base::partial;
555 }
556 }
02dab998 557 return from.size() ? codecvt_base::partial : codecvt_base::ok;
bb93f35d
JW
558 }
559
bcd682e1 560 // utf16 -> utf8 (or ucs2 -> utf8 if s == surrogates::disallowed)
c124af93 561 template<typename C16, typename C8>
bb93f35d 562 codecvt_base::result
c124af93 563 utf16_out(range<const C16>& from, range<C8>& to,
bcd682e1
JW
564 unsigned long maxcode = max_code_point, codecvt_mode mode = {},
565 surrogates s = surrogates::allowed)
bb93f35d 566 {
28af1fb3
JW
567 if (!write_utf8_bom(to, mode))
568 return codecvt_base::partial;
bb93f35d
JW
569 while (from.size())
570 {
d951e75d 571 char32_t c = from[0];
bb93f35d 572 int inc = 1;
b6584a72 573 if (is_high_surrogate(c))
bb93f35d 574 {
bcd682e1
JW
575 if (s == surrogates::disallowed)
576 return codecvt_base::error; // No surrogates in UCS-2
577
bb93f35d 578 if (from.size() < 2)
02dab998 579 return codecvt_base::partial; // stop converting at this point
bb93f35d 580
d951e75d 581 const char32_t c2 = from[1];
b6584a72 582 if (is_low_surrogate(c2))
bb93f35d 583 {
b6584a72 584 c = surrogate_pair_to_code_point(c, c2);
bb93f35d 585 inc = 2;
bb93f35d
JW
586 }
587 else
588 return codecvt_base::error;
589 }
b6584a72
JW
590 else if (is_low_surrogate(c))
591 return codecvt_base::error;
bb93f35d
JW
592 if (c > maxcode)
593 return codecvt_base::error;
594 if (!write_utf8_code_point(to, c))
595 return codecvt_base::partial;
d951e75d 596 from += inc;
bb93f35d
JW
597 }
598 return codecvt_base::ok;
599 }
600
601 // return pos such that [begin,pos) is valid UTF-16 string no longer than max
c124af93
TH
602 template<typename C>
603 const C*
604 utf16_span(const C* begin, const C* end, size_t max,
28af1fb3 605 char32_t maxcode = max_code_point, codecvt_mode mode = {})
bb93f35d 606 {
c124af93 607 range<const C> from{ begin, end };
28af1fb3 608 read_utf8_bom(from, mode);
bb93f35d
JW
609 size_t count = 0;
610 while (count+1 < max)
611 {
612 char32_t c = read_utf8_code_point(from, maxcode);
b6584a72
JW
613 if (c > maxcode)
614 return from.next;
bb93f35d
JW
615 else if (c > max_single_utf16_unit)
616 ++count;
617 ++count;
618 }
619 if (count+1 == max) // take one more character if it fits in a single unit
bcd682e1 620 read_utf8_code_point(from, std::min(max_single_utf16_unit, maxcode));
28af1fb3 621 return from.next;
bb93f35d
JW
622 }
623
28af1fb3 624 // utf8 -> ucs2
c124af93 625 template<typename C>
28af1fb3 626 codecvt_base::result
c124af93 627 ucs2_in(range<const C>& from, range<char16_t>& to,
28af1fb3 628 char32_t maxcode = max_code_point, codecvt_mode mode = {})
bb93f35d 629 {
bcd682e1
JW
630 // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
631 maxcode = std::min(max_single_utf16_unit, maxcode);
02dab998 632 return utf16_in(from, to, maxcode, mode);
28af1fb3
JW
633 }
634
635 // ucs2 -> utf8
c124af93 636 template<typename C>
28af1fb3 637 codecvt_base::result
c124af93 638 ucs2_out(range<const char16_t>& from, range<C>& to,
28af1fb3
JW
639 char32_t maxcode = max_code_point, codecvt_mode mode = {})
640 {
bcd682e1
JW
641 // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
642 maxcode = std::min(max_single_utf16_unit, maxcode);
643 return utf16_out(from, to, maxcode, mode, surrogates::disallowed);
28af1fb3
JW
644 }
645
646 // ucs2 -> utf16
647 codecvt_base::result
d951e75d 648 ucs2_out(range<const char16_t>& from, range<char16_t, false>& to,
28af1fb3
JW
649 char32_t maxcode = max_code_point, codecvt_mode mode = {})
650 {
651 if (!write_utf16_bom(to, mode))
652 return codecvt_base::partial;
653 while (from.size() && to.size())
bb93f35d 654 {
d951e75d 655 char16_t c = from[0];
b6584a72 656 if (is_high_surrogate(c))
28af1fb3
JW
657 return codecvt_base::error;
658 if (c > maxcode)
659 return codecvt_base::error;
d951e75d
JW
660 to = adjust_byte_order(c, mode);
661 ++from;
28af1fb3
JW
662 }
663 return from.size() == 0 ? codecvt_base::ok : codecvt_base::partial;
664 }
665
666 // utf16 -> ucs2
667 codecvt_base::result
d951e75d 668 ucs2_in(range<const char16_t, false>& from, range<char16_t>& to,
28af1fb3
JW
669 char32_t maxcode = max_code_point, codecvt_mode mode = {})
670 {
bcd682e1
JW
671 read_utf16_bom(from, mode);
672 // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
673 maxcode = std::min(max_single_utf16_unit, maxcode);
28af1fb3
JW
674 while (from.size() && to.size())
675 {
676 const char32_t c = read_utf16_code_point(from, maxcode, mode);
b6584a72 677 if (c == incomplete_mb_character)
bcd682e1 678 return codecvt_base::error; // UCS-2 only supports single units.
b6584a72 679 if (c > maxcode)
28af1fb3 680 return codecvt_base::error;
d951e75d 681 to = c;
bb93f35d 682 }
28af1fb3
JW
683 return from.size() == 0 ? codecvt_base::ok : codecvt_base::partial;
684 }
685
686 const char16_t*
d951e75d 687 ucs2_span(range<const char16_t, false>& from, size_t max,
28af1fb3
JW
688 char32_t maxcode, codecvt_mode mode)
689 {
bcd682e1
JW
690 read_utf16_bom(from, mode);
691 // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
692 maxcode = std::min(max_single_utf16_unit, maxcode);
28af1fb3
JW
693 char32_t c = 0;
694 while (max-- && c <= maxcode)
695 c = read_utf16_code_point(from, maxcode, mode);
d951e75d 696 return reinterpret_cast<const char16_t*>(from.next);
28af1fb3
JW
697 }
698
c124af93
TH
699 template<typename C>
700 const C*
701 ucs2_span(const C* begin, const C* end, size_t max,
28af1fb3
JW
702 char32_t maxcode, codecvt_mode mode)
703 {
c124af93 704 range<const C> from{ begin, end };
28af1fb3 705 read_utf8_bom(from, mode);
bcd682e1
JW
706 // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
707 maxcode = std::min(max_single_utf16_unit, maxcode);
28af1fb3
JW
708 char32_t c = 0;
709 while (max-- && c <= maxcode)
710 c = read_utf8_code_point(from, maxcode);
711 return from.next;
712 }
713
714 // return pos such that [begin,pos) is valid UCS-4 string no longer than max
c124af93
TH
715 template<typename C>
716 const C*
717 ucs4_span(const C* begin, const C* end, size_t max,
28af1fb3
JW
718 char32_t maxcode = max_code_point, codecvt_mode mode = {})
719 {
c124af93 720 range<const C> from{ begin, end };
28af1fb3
JW
721 read_utf8_bom(from, mode);
722 char32_t c = 0;
723 while (max-- && c <= maxcode)
724 c = read_utf8_code_point(from, maxcode);
725 return from.next;
726 }
727
728 // return pos such that [begin,pos) is valid UCS-4 string no longer than max
729 const char16_t*
d951e75d 730 ucs4_span(range<const char16_t, false>& from, size_t max,
28af1fb3
JW
731 char32_t maxcode = max_code_point, codecvt_mode mode = {})
732 {
bcd682e1 733 read_utf16_bom(from, mode);
28af1fb3
JW
734 char32_t c = 0;
735 while (max-- && c <= maxcode)
736 c = read_utf16_code_point(from, maxcode, mode);
d951e75d 737 return reinterpret_cast<const char16_t*>(from.next);
bb93f35d
JW
738 }
739}
740
741// Define members of codecvt<char16_t, char, mbstate_t> specialization.
742// Converts from UTF-8 to UTF-16.
743
744locale::id codecvt<char16_t, char, mbstate_t>::id;
745
746codecvt<char16_t, char, mbstate_t>::~codecvt() { }
747
748codecvt_base::result
749codecvt<char16_t, char, mbstate_t>::
750do_out(state_type&,
751 const intern_type* __from,
752 const intern_type* __from_end, const intern_type*& __from_next,
753 extern_type* __to, extern_type* __to_end,
754 extern_type*& __to_next) const
755{
756 range<const char16_t> from{ __from, __from_end };
757 range<char> to{ __to, __to_end };
758 auto res = utf16_out(from, to);
759 __from_next = from.next;
760 __to_next = to.next;
761 return res;
762}
763
764codecvt_base::result
765codecvt<char16_t, char, mbstate_t>::
766do_unshift(state_type&, extern_type* __to, extern_type*,
767 extern_type*& __to_next) const
768{
769 __to_next = __to;
770 return noconv; // we don't use mbstate_t for the unicode facets
771}
772
773codecvt_base::result
774codecvt<char16_t, char, mbstate_t>::
775do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
776 const extern_type*& __from_next,
777 intern_type* __to, intern_type* __to_end,
778 intern_type*& __to_next) const
779{
780 range<const char> from{ __from, __from_end };
781 range<char16_t> to{ __to, __to_end };
7f971f18
JW
782#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
783 codecvt_mode mode = {};
784#else
785 codecvt_mode mode = little_endian;
786#endif
787 auto res = utf16_in(from, to, max_code_point, mode);
bb93f35d
JW
788 __from_next = from.next;
789 __to_next = to.next;
790 return res;
791}
792
793int
794codecvt<char16_t, char, mbstate_t>::do_encoding() const throw()
516231de 795{ return 0; } // UTF-8 is not a fixed-width encoding
bb93f35d
JW
796
797bool
798codecvt<char16_t, char, mbstate_t>::do_always_noconv() const throw()
799{ return false; }
800
801int
802codecvt<char16_t, char, mbstate_t>::
803do_length(state_type&, const extern_type* __from,
804 const extern_type* __end, size_t __max) const
805{
28af1fb3
JW
806 __end = utf16_span(__from, __end, __max);
807 return __end - __from;
bb93f35d
JW
808}
809
810int
811codecvt<char16_t, char, mbstate_t>::do_max_length() const throw()
812{
516231de
JW
813 // A single character (one or two UTF-16 code units) requires
814 // up to four UTF-8 code units.
815 return 4;
bb93f35d
JW
816}
817
818// Define members of codecvt<char32_t, char, mbstate_t> specialization.
819// Converts from UTF-8 to UTF-32 (aka UCS-4).
820
821locale::id codecvt<char32_t, char, mbstate_t>::id;
822
823codecvt<char32_t, char, mbstate_t>::~codecvt() { }
824
825codecvt_base::result
826codecvt<char32_t, char, mbstate_t>::
827do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
828 const intern_type*& __from_next,
829 extern_type* __to, extern_type* __to_end,
830 extern_type*& __to_next) const
831{
832 range<const char32_t> from{ __from, __from_end };
833 range<char> to{ __to, __to_end };
834 auto res = ucs4_out(from, to);
835 __from_next = from.next;
836 __to_next = to.next;
837 return res;
838}
839
840codecvt_base::result
841codecvt<char32_t, char, mbstate_t>::
842do_unshift(state_type&, extern_type* __to, extern_type*,
843 extern_type*& __to_next) const
844{
845 __to_next = __to;
846 return noconv;
847}
848
849codecvt_base::result
850codecvt<char32_t, char, mbstate_t>::
851do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
852 const extern_type*& __from_next,
853 intern_type* __to, intern_type* __to_end,
854 intern_type*& __to_next) const
855{
856 range<const char> from{ __from, __from_end };
857 range<char32_t> to{ __to, __to_end };
858 auto res = ucs4_in(from, to);
859 __from_next = from.next;
860 __to_next = to.next;
861 return res;
862}
863
864int
865codecvt<char32_t, char, mbstate_t>::do_encoding() const throw()
516231de 866{ return 0; } // UTF-8 is not a fixed-width encoding
bb93f35d
JW
867
868bool
869codecvt<char32_t, char, mbstate_t>::do_always_noconv() const throw()
870{ return false; }
871
872int
873codecvt<char32_t, char, mbstate_t>::
874do_length(state_type&, const extern_type* __from,
875 const extern_type* __end, size_t __max) const
876{
28af1fb3
JW
877 __end = ucs4_span(__from, __end, __max);
878 return __end - __from;
bb93f35d
JW
879}
880
881int
882codecvt<char32_t, char, mbstate_t>::do_max_length() const throw()
516231de
JW
883{
884 // A single character (one UTF-32 code unit) requires
885 // up to 4 UTF-8 code units.
886 return 4;
887}
bb93f35d 888
c124af93
TH
889#if defined(_GLIBCXX_USE_CHAR8_T)
890// Define members of codecvt<char16_t, char8_t, mbstate_t> specialization.
891// Converts from UTF-8 to UTF-16.
892
893locale::id codecvt<char16_t, char8_t, mbstate_t>::id;
894
895codecvt<char16_t, char8_t, mbstate_t>::~codecvt() { }
896
897codecvt_base::result
898codecvt<char16_t, char8_t, mbstate_t>::
899do_out(state_type&,
900 const intern_type* __from,
901 const intern_type* __from_end, const intern_type*& __from_next,
902 extern_type* __to, extern_type* __to_end,
903 extern_type*& __to_next) const
904{
905 range<const char16_t> from{ __from, __from_end };
906 range<char8_t> to{ __to, __to_end };
907 auto res = utf16_out(from, to);
908 __from_next = from.next;
909 __to_next = to.next;
910 return res;
911}
912
913codecvt_base::result
914codecvt<char16_t, char8_t, mbstate_t>::
915do_unshift(state_type&, extern_type* __to, extern_type*,
916 extern_type*& __to_next) const
917{
918 __to_next = __to;
919 return noconv; // we don't use mbstate_t for the unicode facets
920}
921
922codecvt_base::result
923codecvt<char16_t, char8_t, mbstate_t>::
924do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
925 const extern_type*& __from_next,
926 intern_type* __to, intern_type* __to_end,
927 intern_type*& __to_next) const
928{
929 range<const char8_t> from{ __from, __from_end };
930 range<char16_t> to{ __to, __to_end };
931#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
932 codecvt_mode mode = {};
933#else
934 codecvt_mode mode = little_endian;
935#endif
936 auto res = utf16_in(from, to, max_code_point, mode);
937 __from_next = from.next;
938 __to_next = to.next;
939 return res;
940}
941
942int
943codecvt<char16_t, char8_t, mbstate_t>::do_encoding() const throw()
944{ return 0; } // UTF-8 is not a fixed-width encoding
945
946bool
947codecvt<char16_t, char8_t, mbstate_t>::do_always_noconv() const throw()
948{ return false; }
949
950int
951codecvt<char16_t, char8_t, mbstate_t>::
952do_length(state_type&, const extern_type* __from,
953 const extern_type* __end, size_t __max) const
954{
955 __end = utf16_span(__from, __end, __max);
956 return __end - __from;
957}
958
959int
960codecvt<char16_t, char8_t, mbstate_t>::do_max_length() const throw()
961{
962 // A single character (one or two UTF-16 code units) requires
963 // up to four UTF-8 code units.
964 return 4;
965}
966
967// Define members of codecvt<char32_t, char8_t, mbstate_t> specialization.
968// Converts from UTF-8 to UTF-32 (aka UCS-4).
969
// Definition of the static locale::id member of this specialization.
970 locale::id codecvt<char32_t, char8_t, mbstate_t>::id;
971
972codecvt<char32_t, char8_t, mbstate_t>::~codecvt() { }
973
974codecvt_base::result
975codecvt<char32_t, char8_t, mbstate_t>::
976do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
977 const intern_type*& __from_next,
978 extern_type* __to, extern_type* __to_end,
979 extern_type*& __to_next) const
980{
981 range<const char32_t> from{ __from, __from_end };
982 range<char8_t> to{ __to, __to_end };
983 auto res = ucs4_out(from, to);
984 __from_next = from.next;
985 __to_next = to.next;
986 return res;
987}
988
989codecvt_base::result
990codecvt<char32_t, char8_t, mbstate_t>::
991do_unshift(state_type&, extern_type* __to, extern_type*,
992 extern_type*& __to_next) const
993{
994 __to_next = __to;
995 return noconv;
996}
997
998codecvt_base::result
999codecvt<char32_t, char8_t, mbstate_t>::
1000do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1001 const extern_type*& __from_next,
1002 intern_type* __to, intern_type* __to_end,
1003 intern_type*& __to_next) const
1004{
1005 range<const char8_t> from{ __from, __from_end };
1006 range<char32_t> to{ __to, __to_end };
1007 auto res = ucs4_in(from, to);
1008 __from_next = from.next;
1009 __to_next = to.next;
1010 return res;
1011}
1012
1013int
1014codecvt<char32_t, char8_t, mbstate_t>::do_encoding() const throw()
1015{ return 0; } // UTF-8 is not a fixed-width encoding
1016
1017bool
1018codecvt<char32_t, char8_t, mbstate_t>::do_always_noconv() const throw()
1019{ return false; }
1020
1021int
1022codecvt<char32_t, char8_t, mbstate_t>::
1023do_length(state_type&, const extern_type* __from,
1024 const extern_type* __end, size_t __max) const
1025{
1026 __end = ucs4_span(__from, __end, __max);
1027 return __end - __from;
1028}
1029
1030int
1031codecvt<char32_t, char8_t, mbstate_t>::do_max_length() const throw()
1032{
1033 // A single character (one UTF-32 code unit) requires
1034 // up to 4 UTF-8 code units.
1035 return 4;
1036}
1037#endif // _GLIBCXX_USE_CHAR8_T
1038
28af1fb3
JW
1039// Define members of codecvt_utf8<char16_t> base class implementation.
1040// Converts from UTF-8 to UCS-2.
1041
1042__codecvt_utf8_base<char16_t>::~__codecvt_utf8_base() { }
1043
1044codecvt_base::result
1045__codecvt_utf8_base<char16_t>::
1046do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1047 const intern_type*& __from_next,
1048 extern_type* __to, extern_type* __to_end,
1049 extern_type*& __to_next) const
1050{
1051 range<const char16_t> from{ __from, __from_end };
1052 range<char> to{ __to, __to_end };
1053 auto res = ucs2_out(from, to, _M_maxcode, _M_mode);
1054 __from_next = from.next;
1055 __to_next = to.next;
1056 return res;
1057}
1058
1059codecvt_base::result
1060__codecvt_utf8_base<char16_t>::
1061do_unshift(state_type&, extern_type* __to, extern_type*,
1062 extern_type*& __to_next) const
1063{
1064 __to_next = __to;
1065 return noconv;
1066}
1067
1068codecvt_base::result
1069__codecvt_utf8_base<char16_t>::
1070do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1071 const extern_type*& __from_next,
1072 intern_type* __to, intern_type* __to_end,
1073 intern_type*& __to_next) const
1074{
1075 range<const char> from{ __from, __from_end };
1076 range<char16_t> to{ __to, __to_end };
a1e1ec76 1077 codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header));
29ca91f7
JW
1078#if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
1079 mode = codecvt_mode(mode | little_endian);
1080#endif
1081 auto res = ucs2_in(from, to, _M_maxcode, mode);
28af1fb3
JW
1082 __from_next = from.next;
1083 __to_next = to.next;
1084 return res;
1085}
1086
1087int
1088__codecvt_utf8_base<char16_t>::do_encoding() const throw()
516231de 1089{ return 0; } // UTF-8 is not a fixed-width encoding
28af1fb3
JW
1090
1091bool
1092__codecvt_utf8_base<char16_t>::do_always_noconv() const throw()
1093{ return false; }
1094
1095int
1096__codecvt_utf8_base<char16_t>::
1097do_length(state_type&, const extern_type* __from,
1098 const extern_type* __end, size_t __max) const
1099{
1100 __end = ucs2_span(__from, __end, __max, _M_maxcode, _M_mode);
1101 return __end - __from;
1102}
1103
1104int
1105__codecvt_utf8_base<char16_t>::do_max_length() const throw()
516231de
JW
1106{
1107 // A single UCS-2 character requires up to three UTF-8 code units.
1108 // (UCS-2 cannot represent characters that use four UTF-8 code units).
1109 int max = 3;
1110 if (_M_mode & consume_header)
1111 max += sizeof(utf8_bom);
1112 return max;
1113}
28af1fb3
JW
1114
1115// Define members of codecvt_utf8<char32_t> base class implementation.
1116// Converts from UTF-8 to UTF-32 (aka UCS-4).
1117
1118__codecvt_utf8_base<char32_t>::~__codecvt_utf8_base() { }
1119
1120codecvt_base::result
1121__codecvt_utf8_base<char32_t>::
1122do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1123 const intern_type*& __from_next,
1124 extern_type* __to, extern_type* __to_end,
1125 extern_type*& __to_next) const
1126{
1127 range<const char32_t> from{ __from, __from_end };
1128 range<char> to{ __to, __to_end };
1129 auto res = ucs4_out(from, to, _M_maxcode, _M_mode);
1130 __from_next = from.next;
1131 __to_next = to.next;
1132 return res;
1133}
1134
1135codecvt_base::result
1136__codecvt_utf8_base<char32_t>::
1137do_unshift(state_type&, extern_type* __to, extern_type*,
1138 extern_type*& __to_next) const
1139{
1140 __to_next = __to;
1141 return noconv;
1142}
1143
1144codecvt_base::result
1145__codecvt_utf8_base<char32_t>::
1146do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1147 const extern_type*& __from_next,
1148 intern_type* __to, intern_type* __to_end,
1149 intern_type*& __to_next) const
1150{
1151 range<const char> from{ __from, __from_end };
1152 range<char32_t> to{ __to, __to_end };
1153 auto res = ucs4_in(from, to, _M_maxcode, _M_mode);
1154 __from_next = from.next;
1155 __to_next = to.next;
1156 return res;
1157}
1158
1159int
1160__codecvt_utf8_base<char32_t>::do_encoding() const throw()
516231de 1161{ return 0; } // UTF-8 is not a fixed-width encoding
28af1fb3
JW
1162
1163bool
1164__codecvt_utf8_base<char32_t>::do_always_noconv() const throw()
1165{ return false; }
1166
1167int
1168__codecvt_utf8_base<char32_t>::
1169do_length(state_type&, const extern_type* __from,
1170 const extern_type* __end, size_t __max) const
1171{
1172 __end = ucs4_span(__from, __end, __max, _M_maxcode, _M_mode);
1173 return __end - __from;
1174}
1175
1176int
1177__codecvt_utf8_base<char32_t>::do_max_length() const throw()
516231de
JW
1178{
1179 // A single UCS-4 character requires up to four UTF-8 code units.
1180 int max = 4;
1181 if (_M_mode & consume_header)
1182 max += sizeof(utf8_bom);
1183 return max;
1184}
28af1fb3
JW
1185
1186#ifdef _GLIBCXX_USE_WCHAR_T
d951e75d
JW
1187
// The wchar_t facets below reinterpret wchar_t buffers as char16_t or
// char32_t buffers, so check that the matching type has the same size.
1188#if __SIZEOF_WCHAR_T__ == 2
1189static_assert(sizeof(wchar_t) == sizeof(char16_t), "");
1190#elif __SIZEOF_WCHAR_T__ == 4
1191static_assert(sizeof(wchar_t) == sizeof(char32_t), "");
1192#endif
1193
28af1fb3
JW
1194// Define members of codecvt_utf8<wchar_t> base class implementation.
1195// Converts from UTF-8 to UCS-2 or UCS-4 depending on sizeof(wchar_t).
1196
1197__codecvt_utf8_base<wchar_t>::~__codecvt_utf8_base() { }
1198
1199codecvt_base::result
1200__codecvt_utf8_base<wchar_t>::
1201do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1202 const intern_type*& __from_next,
1203 extern_type* __to, extern_type* __to_end,
1204 extern_type*& __to_next) const
1205{
1206 range<char> to{ __to, __to_end };
1207#if __SIZEOF_WCHAR_T__ == 2
1208 range<const char16_t> from{
1209 reinterpret_cast<const char16_t*>(__from),
1210 reinterpret_cast<const char16_t*>(__from_end)
1211 };
1212 auto res = ucs2_out(from, to, _M_maxcode, _M_mode);
1213#elif __SIZEOF_WCHAR_T__ == 4
1214 range<const char32_t> from{
1215 reinterpret_cast<const char32_t*>(__from),
1216 reinterpret_cast<const char32_t*>(__from_end)
1217 };
1218 auto res = ucs4_out(from, to, _M_maxcode, _M_mode);
1219#else
1220 return codecvt_base::error;
1221#endif
1222 __from_next = reinterpret_cast<const wchar_t*>(from.next);
1223 __to_next = to.next;
1224 return res;
1225}
1226
1227codecvt_base::result
1228__codecvt_utf8_base<wchar_t>::
1229do_unshift(state_type&, extern_type* __to, extern_type*,
1230 extern_type*& __to_next) const
1231{
1232 __to_next = __to;
1233 return noconv;
1234}
1235
1236codecvt_base::result
1237__codecvt_utf8_base<wchar_t>::
1238do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1239 const extern_type*& __from_next,
1240 intern_type* __to, intern_type* __to_end,
1241 intern_type*& __to_next) const
1242{
1243 range<const char> from{ __from, __from_end };
1244#if __SIZEOF_WCHAR_T__ == 2
1245 range<char16_t> to{
1246 reinterpret_cast<char16_t*>(__to),
1247 reinterpret_cast<char16_t*>(__to_end)
1248 };
a76400f4
JW
1249#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
1250 codecvt_mode mode = {};
1251#else
1252 codecvt_mode mode = little_endian;
1253#endif
1254 auto res = ucs2_in(from, to, _M_maxcode, mode);
28af1fb3
JW
1255#elif __SIZEOF_WCHAR_T__ == 4
1256 range<char32_t> to{
1257 reinterpret_cast<char32_t*>(__to),
1258 reinterpret_cast<char32_t*>(__to_end)
1259 };
1260 auto res = ucs4_in(from, to, _M_maxcode, _M_mode);
1261#else
1262 return codecvt_base::error;
1263#endif
1264 __from_next = from.next;
1265 __to_next = reinterpret_cast<wchar_t*>(to.next);
1266 return res;
1267}
1268
1269int
1270__codecvt_utf8_base<wchar_t>::do_encoding() const throw()
516231de 1271{ return 0; } // UTF-8 is not a fixed-width encoding
28af1fb3
JW
1272
1273bool
1274__codecvt_utf8_base<wchar_t>::do_always_noconv() const throw()
1275{ return false; }
1276
1277int
1278__codecvt_utf8_base<wchar_t>::
1279do_length(state_type&, const extern_type* __from,
1280 const extern_type* __end, size_t __max) const
1281{
1282#if __SIZEOF_WCHAR_T__ == 2
1283 __end = ucs2_span(__from, __end, __max, _M_maxcode, _M_mode);
1284#elif __SIZEOF_WCHAR_T__ == 4
1285 __end = ucs4_span(__from, __end, __max, _M_maxcode, _M_mode);
1286#else
1287 __end = __from;
1288#endif
1289 return __end - __from;
1290}
1291
1292int
1293__codecvt_utf8_base<wchar_t>::do_max_length() const throw()
516231de
JW
1294{
1295#if __SIZEOF_WCHAR_T__ == 2
1296 int max = 3; // See __codecvt_utf8_base<char16_t>::do_max_length()
1297#else
1298 int max = 4; // See __codecvt_utf8_base<char32_t>::do_max_length()
1299#endif
1300 if (_M_mode & consume_header)
1301 max += sizeof(utf8_bom);
1302 return max;
1303}
28af1fb3
JW
1304#endif
1305
1306// Define members of codecvt_utf16<char16_t> base class implementation.
1307// Converts from UTF-16 to UCS-2.
1308
1309__codecvt_utf16_base<char16_t>::~__codecvt_utf16_base() { }
1310
1311codecvt_base::result
1312__codecvt_utf16_base<char16_t>::
1313do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1314 const intern_type*& __from_next,
1315 extern_type* __to, extern_type* __to_end,
1316 extern_type*& __to_next) const
1317{
1318 range<const char16_t> from{ __from, __from_end };
d951e75d 1319 range<char16_t, false> to{ __to, __to_end };
28af1fb3
JW
1320 auto res = ucs2_out(from, to, _M_maxcode, _M_mode);
1321 __from_next = from.next;
1322 __to_next = reinterpret_cast<char*>(to.next);
1323 return res;
1324}
1325
1326codecvt_base::result
1327__codecvt_utf16_base<char16_t>::
1328do_unshift(state_type&, extern_type* __to, extern_type*,
1329 extern_type*& __to_next) const
1330{
1331 __to_next = __to;
1332 return noconv;
1333}
1334
1335codecvt_base::result
1336__codecvt_utf16_base<char16_t>::
1337do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1338 const extern_type*& __from_next,
1339 intern_type* __to, intern_type* __to_end,
1340 intern_type*& __to_next) const
1341{
d951e75d 1342 range<const char16_t, false> from{ __from, __from_end };
28af1fb3
JW
1343 range<char16_t> to{ __to, __to_end };
1344 auto res = ucs2_in(from, to, _M_maxcode, _M_mode);
1345 __from_next = reinterpret_cast<const char*>(from.next);
1346 __to_next = to.next;
d951e75d
JW
1347 if (res == codecvt_base::ok && __from_next != __from_end)
1348 res = codecvt_base::error;
28af1fb3
JW
1349 return res;
1350}
1351
1352int
1353__codecvt_utf16_base<char16_t>::do_encoding() const throw()
516231de 1354{ return 0; } // UTF-16 is not a fixed-width encoding
28af1fb3
JW
1355
1356bool
1357__codecvt_utf16_base<char16_t>::do_always_noconv() const throw()
1358{ return false; }
1359
1360int
1361__codecvt_utf16_base<char16_t>::
1362do_length(state_type&, const extern_type* __from,
1363 const extern_type* __end, size_t __max) const
1364{
d951e75d
JW
1365 range<const char16_t, false> from{ __from, __end };
1366 const char16_t* next = ucs2_span(from, __max, _M_maxcode, _M_mode);
28af1fb3
JW
1367 return reinterpret_cast<const char*>(next) - __from;
1368}
1369
1370int
1371__codecvt_utf16_base<char16_t>::do_max_length() const throw()
516231de
JW
1372{
1373 // A single UCS-2 character requires one UTF-16 code unit (so two chars).
1374 // (UCS-2 cannot represent characters that use multiple UTF-16 code units).
1375 int max = 2;
1376 if (_M_mode & consume_header)
1377 max += sizeof(utf16_bom);
1378 return max;
1379}
28af1fb3
JW
1380
1381// Define members of codecvt_utf16<char32_t> base class implementation.
1382// Converts from UTF-16 to UTF-32 (aka UCS-4).
1383
1384__codecvt_utf16_base<char32_t>::~__codecvt_utf16_base() { }
1385
1386codecvt_base::result
1387__codecvt_utf16_base<char32_t>::
1388do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1389 const intern_type*& __from_next,
1390 extern_type* __to, extern_type* __to_end,
1391 extern_type*& __to_next) const
1392{
1393 range<const char32_t> from{ __from, __from_end };
d951e75d 1394 range<char16_t, false> to{ __to, __to_end };
28af1fb3
JW
1395 auto res = ucs4_out(from, to, _M_maxcode, _M_mode);
1396 __from_next = from.next;
1397 __to_next = reinterpret_cast<char*>(to.next);
1398 return res;
1399}
1400
1401codecvt_base::result
1402__codecvt_utf16_base<char32_t>::
1403do_unshift(state_type&, extern_type* __to, extern_type*,
1404 extern_type*& __to_next) const
1405{
1406 __to_next = __to;
1407 return noconv;
1408}
1409
1410codecvt_base::result
1411__codecvt_utf16_base<char32_t>::
1412do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1413 const extern_type*& __from_next,
1414 intern_type* __to, intern_type* __to_end,
1415 intern_type*& __to_next) const
1416{
d951e75d 1417 range<const char16_t, false> from{ __from, __from_end };
28af1fb3
JW
1418 range<char32_t> to{ __to, __to_end };
1419 auto res = ucs4_in(from, to, _M_maxcode, _M_mode);
1420 __from_next = reinterpret_cast<const char*>(from.next);
1421 __to_next = to.next;
d951e75d
JW
1422 if (res == codecvt_base::ok && __from_next != __from_end)
1423 res = codecvt_base::error;
28af1fb3
JW
1424 return res;
1425}
1426
1427int
1428__codecvt_utf16_base<char32_t>::do_encoding() const throw()
516231de 1429{ return 0; } // UTF-16 is not a fixed-width encoding
28af1fb3
JW
1430
1431bool
1432__codecvt_utf16_base<char32_t>::do_always_noconv() const throw()
1433{ return false; }
1434
1435int
1436__codecvt_utf16_base<char32_t>::
1437do_length(state_type&, const extern_type* __from,
1438 const extern_type* __end, size_t __max) const
1439{
d951e75d
JW
1440 range<const char16_t, false> from{ __from, __end };
1441 const char16_t* next = ucs4_span(from, __max, _M_maxcode, _M_mode);
28af1fb3
JW
1442 return reinterpret_cast<const char*>(next) - __from;
1443}
1444
1445int
1446__codecvt_utf16_base<char32_t>::do_max_length() const throw()
516231de
JW
1447{
1448 // A single UCS-4 character requires one or two UTF-16 code units
1449 // (so up to four chars).
1450 int max = 4;
1451 if (_M_mode & consume_header)
1452 max += sizeof(utf16_bom);
1453 return max;
1454}
28af1fb3
JW
1455
1456#ifdef _GLIBCXX_USE_WCHAR_T
1457// Define members of codecvt_utf16<wchar_t> base class implementation.
1458// Converts from UTF-16 to UCS-2 or UCS-4 depending on sizeof(wchar_t).
1459
1460__codecvt_utf16_base<wchar_t>::~__codecvt_utf16_base() { }
1461
1462codecvt_base::result
1463__codecvt_utf16_base<wchar_t>::
1464do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1465 const intern_type*& __from_next,
1466 extern_type* __to, extern_type* __to_end,
1467 extern_type*& __to_next) const
1468{
d951e75d 1469 range<char16_t, false> to{ __to, __to_end };
28af1fb3
JW
1470#if __SIZEOF_WCHAR_T__ == 2
1471 range<const char16_t> from{
1472 reinterpret_cast<const char16_t*>(__from),
d951e75d 1473 reinterpret_cast<const char16_t*>(__from_end),
28af1fb3
JW
1474 };
1475 auto res = ucs2_out(from, to, _M_maxcode, _M_mode);
1476#elif __SIZEOF_WCHAR_T__ == 4
1477 range<const char32_t> from{
1478 reinterpret_cast<const char32_t*>(__from),
d951e75d 1479 reinterpret_cast<const char32_t*>(__from_end),
28af1fb3
JW
1480 };
1481 auto res = ucs4_out(from, to, _M_maxcode, _M_mode);
1482#else
1483 return codecvt_base::error;
1484#endif
1485 __from_next = reinterpret_cast<const wchar_t*>(from.next);
a4c687d6 1486 __to_next = reinterpret_cast<char*>(to.next);
28af1fb3
JW
1487 return res;
1488}
1489
1490codecvt_base::result
1491__codecvt_utf16_base<wchar_t>::
1492do_unshift(state_type&, extern_type* __to, extern_type*,
1493 extern_type*& __to_next) const
1494{
1495 __to_next = __to;
1496 return noconv;
1497}
1498
1499codecvt_base::result
1500__codecvt_utf16_base<wchar_t>::
1501do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1502 const extern_type*& __from_next,
1503 intern_type* __to, intern_type* __to_end,
1504 intern_type*& __to_next) const
1505{
d951e75d 1506 range<const char16_t, false> from{ __from, __from_end };
28af1fb3
JW
1507#if __SIZEOF_WCHAR_T__ == 2
1508 range<char16_t> to{
1509 reinterpret_cast<char16_t*>(__to),
d951e75d 1510 reinterpret_cast<char16_t*>(__to_end),
28af1fb3
JW
1511 };
1512 auto res = ucs2_in(from, to, _M_maxcode, _M_mode);
1513#elif __SIZEOF_WCHAR_T__ == 4
1514 range<char32_t> to{
1515 reinterpret_cast<char32_t*>(__to),
d951e75d 1516 reinterpret_cast<char32_t*>(__to_end),
28af1fb3
JW
1517 };
1518 auto res = ucs4_in(from, to, _M_maxcode, _M_mode);
1519#else
1520 return codecvt_base::error;
1521#endif
a4c687d6 1522 __from_next = reinterpret_cast<const char*>(from.next);
28af1fb3 1523 __to_next = reinterpret_cast<wchar_t*>(to.next);
d951e75d
JW
1524 if (res == codecvt_base::ok && __from_next != __from_end)
1525 res = codecvt_base::error;
28af1fb3
JW
1526 return res;
1527}
1528
1529int
1530__codecvt_utf16_base<wchar_t>::do_encoding() const throw()
516231de 1531{ return 0; } // UTF-16 is not a fixed-width encoding
28af1fb3
JW
1532
1533bool
1534__codecvt_utf16_base<wchar_t>::do_always_noconv() const throw()
1535{ return false; }
1536
1537int
1538__codecvt_utf16_base<wchar_t>::
1539do_length(state_type&, const extern_type* __from,
1540 const extern_type* __end, size_t __max) const
1541{
d951e75d 1542 range<const char16_t, false> from{ __from, __end };
28af1fb3 1543#if __SIZEOF_WCHAR_T__ == 2
d951e75d 1544 const char16_t* next = ucs2_span(from, __max, _M_maxcode, _M_mode);
28af1fb3 1545#elif __SIZEOF_WCHAR_T__ == 4
d951e75d 1546 const char16_t* next = ucs4_span(from, __max, _M_maxcode, _M_mode);
28af1fb3
JW
1547#endif
1548 return reinterpret_cast<const char*>(next) - __from;
1549}
1550
1551int
1552__codecvt_utf16_base<wchar_t>::do_max_length() const throw()
516231de
JW
1553{
1554#if __SIZEOF_WCHAR_T__ == 2
1555 int max = 2; // See __codecvt_utf16_base<char16_t>::do_max_length()
1556#else
1557 int max = 4; // See __codecvt_utf16_base<char32_t>::do_max_length()
1558#endif
1559 if (_M_mode & consume_header)
1560 max += sizeof(utf16_bom);
1561 return max;
1562}
28af1fb3
JW
1563#endif
1564
1565// Define members of codecvt_utf8_utf16<char16_t> base class implementation.
1566// Converts from UTF-8 to UTF-16.
1567
1568__codecvt_utf8_utf16_base<char16_t>::~__codecvt_utf8_utf16_base() { }
1569
1570codecvt_base::result
1571__codecvt_utf8_utf16_base<char16_t>::
1572do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1573 const intern_type*& __from_next,
1574 extern_type* __to, extern_type* __to_end,
1575 extern_type*& __to_next) const
1576{
1577 range<const char16_t> from{ __from, __from_end };
1578 range<char> to{ __to, __to_end };
1579 auto res = utf16_out(from, to, _M_maxcode, _M_mode);
1580 __from_next = from.next;
1581 __to_next = to.next;
1582 return res;
1583}
1584
1585codecvt_base::result
1586__codecvt_utf8_utf16_base<char16_t>::
1587do_unshift(state_type&, extern_type* __to, extern_type*,
1588 extern_type*& __to_next) const
1589{
1590 __to_next = __to;
1591 return noconv;
1592}
1593
1594codecvt_base::result
1595__codecvt_utf8_utf16_base<char16_t>::
1596do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1597 const extern_type*& __from_next,
1598 intern_type* __to, intern_type* __to_end,
1599 intern_type*& __to_next) const
1600{
1601 range<const char> from{ __from, __from_end };
1602 range<char16_t> to{ __to, __to_end };
a1e1ec76 1603 codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header));
795038b7
JW
1604#if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
1605 mode = codecvt_mode(mode | little_endian);
1606#endif
1607 auto res = utf16_in(from, to, _M_maxcode, mode);
28af1fb3
JW
1608 __from_next = from.next;
1609 __to_next = to.next;
1610 return res;
1611}
1612
1613int
1614__codecvt_utf8_utf16_base<char16_t>::do_encoding() const throw()
516231de 1615{ return 0; } // UTF-8 is not a fixed-width encoding
28af1fb3
JW
1616
1617bool
1618__codecvt_utf8_utf16_base<char16_t>::do_always_noconv() const throw()
1619{ return false; }
1620
1621int
1622__codecvt_utf8_utf16_base<char16_t>::
1623do_length(state_type&, const extern_type* __from,
1624 const extern_type* __end, size_t __max) const
1625{
1626 __end = utf16_span(__from, __end, __max, _M_maxcode, _M_mode);
1627 return __end - __from;
1628}
1629
1630int
1631__codecvt_utf8_utf16_base<char16_t>::do_max_length() const throw()
1632{
516231de
JW
1633 // A single character can be 1 or 2 UTF-16 code units,
1634 // requiring up to 4 UTF-8 code units.
1635 int max = 4;
1636 if (_M_mode & consume_header)
1637 max += sizeof(utf8_bom);
1638 return max;
28af1fb3
JW
1639}
1640
1641// Define members of codecvt_utf8_utf16<char32_t> base class implementation.
1642// Converts from UTF-8 to UTF-16.
1643
1644__codecvt_utf8_utf16_base<char32_t>::~__codecvt_utf8_utf16_base() { }
1645
1646codecvt_base::result
1647__codecvt_utf8_utf16_base<char32_t>::
1648do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1649 const intern_type*& __from_next,
1650 extern_type* __to, extern_type* __to_end,
1651 extern_type*& __to_next) const
1652{
1653 range<const char32_t> from{ __from, __from_end };
1654 range<char> to{ __to, __to_end };
1655 auto res = utf16_out(from, to, _M_maxcode, _M_mode);
1656 __from_next = from.next;
1657 __to_next = to.next;
1658 return res;
1659}
1660
1661codecvt_base::result
1662__codecvt_utf8_utf16_base<char32_t>::
1663do_unshift(state_type&, extern_type* __to, extern_type*,
1664 extern_type*& __to_next) const
1665{
1666 __to_next = __to;
1667 return noconv;
1668}
1669
1670codecvt_base::result
1671__codecvt_utf8_utf16_base<char32_t>::
1672do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1673 const extern_type*& __from_next,
1674 intern_type* __to, intern_type* __to_end,
1675 intern_type*& __to_next) const
1676{
1677 range<const char> from{ __from, __from_end };
1678 range<char32_t> to{ __to, __to_end };
02e12bda
JW
1679 codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header));
1680#if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
1681 mode = codecvt_mode(mode | little_endian);
1682#endif
1683 auto res = utf16_in(from, to, _M_maxcode, mode);
28af1fb3
JW
1684 __from_next = from.next;
1685 __to_next = to.next;
1686 return res;
1687}
1688
1689int
1690__codecvt_utf8_utf16_base<char32_t>::do_encoding() const throw()
516231de 1691{ return 0; } // UTF-8 is not a fixed-width encoding
28af1fb3
JW
1692
1693bool
1694__codecvt_utf8_utf16_base<char32_t>::do_always_noconv() const throw()
1695{ return false; }
1696
1697int
1698__codecvt_utf8_utf16_base<char32_t>::
1699do_length(state_type&, const extern_type* __from,
1700 const extern_type* __end, size_t __max) const
1701{
1702 __end = utf16_span(__from, __end, __max, _M_maxcode, _M_mode);
1703 return __end - __from;
1704}
1705
1706int
1707__codecvt_utf8_utf16_base<char32_t>::do_max_length() const throw()
1708{
516231de
JW
1709 // A single character can be 1 or 2 UTF-16 code units,
1710 // requiring up to 4 UTF-8 code units.
1711 int max = 4;
1712 if (_M_mode & consume_header)
1713 max += sizeof(utf8_bom);
1714 return max;
28af1fb3
JW
1715}
1716
1717#ifdef _GLIBCXX_USE_WCHAR_T
1718// Define members of codecvt_utf8_utf16<wchar_t> base class implementation.
1719// Converts from UTF-8 to UTF-16.
1720
1721__codecvt_utf8_utf16_base<wchar_t>::~__codecvt_utf8_utf16_base() { }
1722
1723codecvt_base::result
1724__codecvt_utf8_utf16_base<wchar_t>::
1725do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1726 const intern_type*& __from_next,
1727 extern_type* __to, extern_type* __to_end,
1728 extern_type*& __to_next) const
1729{
1730 range<const wchar_t> from{ __from, __from_end };
1731 range<char> to{ __to, __to_end };
1732 auto res = utf16_out(from, to, _M_maxcode, _M_mode);
1733 __from_next = from.next;
1734 __to_next = to.next;
1735 return res;
1736}
1737
1738codecvt_base::result
1739__codecvt_utf8_utf16_base<wchar_t>::
1740do_unshift(state_type&, extern_type* __to, extern_type*,
1741 extern_type*& __to_next) const
1742{
1743 __to_next = __to;
1744 return noconv;
1745}
1746
1747codecvt_base::result
1748__codecvt_utf8_utf16_base<wchar_t>::
1749do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1750 const extern_type*& __from_next,
1751 intern_type* __to, intern_type* __to_end,
1752 intern_type*& __to_next) const
1753{
1754 range<const char> from{ __from, __from_end };
1755 range<wchar_t> to{ __to, __to_end };
02e12bda
JW
1756 codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header));
1757#if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
1758 mode = codecvt_mode(mode | little_endian);
1759#endif
1760 auto res = utf16_in(from, to, _M_maxcode, mode);
28af1fb3
JW
1761 __from_next = from.next;
1762 __to_next = to.next;
1763 return res;
1764}
1765
1766int
1767__codecvt_utf8_utf16_base<wchar_t>::do_encoding() const throw()
516231de 1768{ return 0; } // UTF-8 is not a fixed-width encoding
28af1fb3
JW
1769
1770bool
1771__codecvt_utf8_utf16_base<wchar_t>::do_always_noconv() const throw()
1772{ return false; }
1773
1774int
1775__codecvt_utf8_utf16_base<wchar_t>::
1776do_length(state_type&, const extern_type* __from,
1777 const extern_type* __end, size_t __max) const
1778{
1779 __end = utf16_span(__from, __end, __max, _M_maxcode, _M_mode);
1780 return __end - __from;
1781}
1782
1783int
1784__codecvt_utf8_utf16_base<wchar_t>::do_max_length() const throw()
1785{
516231de
JW
1786 // A single character can be 1 or 2 UTF-16 code units,
1787 // requiring up to 4 UTF-8 code units.
1788 int max = 4;
1789 if (_M_mode & consume_header)
1790 max += sizeof(utf8_bom);
1791 return max;
28af1fb3
JW
1792}
1793#endif
1794
bb93f35d
JW
// Explicit instantiations for the char16_t and char32_t facets whose
// external type is char.  NOTE(review): `inline template` is a GNU
// extension; presumably only the class support data is emitted for the
// abstract base -- confirm against the GCC manual.
1795inline template class __codecvt_abstract_base<char16_t, char, mbstate_t>;
1796inline template class __codecvt_abstract_base<char32_t, char, mbstate_t>;
71a16cd8
JW
1797template class codecvt_byname<char16_t, char, mbstate_t>;
1798template class codecvt_byname<char32_t, char, mbstate_t>;
bb93f35d 1799
c124af93
TH
// The same explicit instantiations for the facets whose external type is
// char8_t, compiled only when _GLIBCXX_USE_CHAR8_T is defined.
1800#if defined(_GLIBCXX_USE_CHAR8_T)
1801inline template class __codecvt_abstract_base<char16_t, char8_t, mbstate_t>;
1802inline template class __codecvt_abstract_base<char32_t, char8_t, mbstate_t>;
1803template class codecvt_byname<char16_t, char8_t, mbstate_t>;
1804template class codecvt_byname<char32_t, char8_t, mbstate_t>;
1805#endif
1806
bb93f35d
JW
1807_GLIBCXX_END_NAMESPACE_VERSION
1808}