]> git.ipfire.org Git - thirdparty/gcc.git/blob - libstdc++-v3/src/c++11/codecvt.cc
Update copyright years.
[thirdparty/gcc.git] / libstdc++-v3 / src / c++11 / codecvt.cc
1 // Locale support (codecvt) -*- C++ -*-
2
3 // Copyright (C) 2015-2020 Free Software Foundation, Inc.
4 //
5 // This file is part of the GNU ISO C++ Library. This library is free
6 // software; you can redistribute it and/or modify it under the
7 // terms of the GNU General Public License as published by the
8 // Free Software Foundation; either version 3, or (at your option)
9 // any later version.
10
11 // This library is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 // GNU General Public License for more details.
15
16 // Under Section 7 of GPL version 3, you are granted additional
17 // permissions described in the GCC Runtime Library Exception, version
18 // 3.1, as published by the Free Software Foundation.
19
20 // You should have received a copy of the GNU General Public License and
21 // a copy of the GCC Runtime Library Exception along with this program;
22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 // <http://www.gnu.org/licenses/>.
24
25 #include <codecvt>
26 #include <cstring> // std::memcpy, std::memcmp
27 #include <bits/stl_algobase.h> // std::min
28
29 namespace std _GLIBCXX_VISIBILITY(default)
30 {
31 _GLIBCXX_BEGIN_NAMESPACE_VERSION
32
// The standard provides no compound-assignment or complement operators
// for codecvt_mode, so this file defines its own. They are built on this
// helper, which exposes the enumerator's underlying integer value.
static underlying_type<codecvt_mode>::type
to_integer(codecvt_mode m)
{
  using int_type = underlying_type<codecvt_mode>::type;
  return static_cast<int_type>(m);
}
37
38 static codecvt_mode& operator&=(codecvt_mode& m, codecvt_mode n)
39 { return m = codecvt_mode(to_integer(m) & to_integer(n)); }
40
41 static codecvt_mode& operator|=(codecvt_mode& m, codecvt_mode n)
42 { return m = codecvt_mode(to_integer(m) | to_integer(n)); }
43
44 static codecvt_mode operator~(codecvt_mode m)
45 { return codecvt_mode(~to_integer(m)); }
46
47 namespace
48 {
// The highest code point that a single UTF-16 code unit can hold.
constexpr char32_t max_single_utf16_unit = 0xFFFF;

// The highest code point accepted by default by the conversions below.
constexpr char32_t max_code_point = 0x10FFFF;

// Sentinel values returned by the read_*_code_point functions instead of
// a code point. Callers detect them with "result > maxcode", which relies
// on maxcode < incomplete_mb_character (the codecvt_utf* classes enforce
// that on construction).
constexpr char32_t incomplete_mb_character = static_cast<char32_t>(-2);
constexpr char32_t invalid_mb_sequence = static_cast<char32_t>(-1);
58
// A cursor over the half-open range [next, end) of code units of type
// Elem. Assigning a code unit writes it and advances; the other members
// read, skip and measure what remains. This primary template accesses
// the units directly through Elem pointers.
template<typename Elem, bool Aligned = true>
  struct range
  {
    Elem* next; // current position
    Elem* end;  // one past the last code unit

    // Store e at the current position and step past it.
    range& operator=(Elem e)
    {
      *next = e;
      ++next;
      return *this;
    }

    // The code unit at the current position.
    Elem operator*() const { return next[0]; }

    // The code unit n positions after the current one.
    Elem operator[](size_t n) const { return *(next + n); }

    // Step past one code unit.
    range& operator++()
    {
      next += 1;
      return *this;
    }

    // Step past n code units.
    range& operator+=(size_t n)
    {
      next = next + n;
      return *this;
    }

    // How many code units are left.
    size_t size() const { return end - next; }

    // How many bytes are left.
    size_t nbytes() const
    {
      return reinterpret_cast<const char*>(end)
        - reinterpret_cast<const char*>(next);
    }
  };
100
101 // This specialization is used when accessing char16_t values through
102 // pointers to char, which might not be correctly aligned for char16_t.
103 template<typename Elem>
104 struct range<Elem, false>
105 {
106 using value_type = typename remove_const<Elem>::type;
107
108 using char_pointer = typename
109 conditional<is_const<Elem>::value, const char*, char*>::type;
110
111 char_pointer next;
112 char_pointer end;
113
114 // Write a code unit.
115 range& operator=(Elem e)
116 {
117 memcpy(next, &e, sizeof(Elem));
118 ++*this;
119 return *this;
120 }
121
122 // Read the next code unit.
123 Elem operator*() const
124 {
125 value_type e;
126 memcpy(&e, next, sizeof(Elem));
127 return e;
128 }
129
130 // Read the Nth code unit.
131 Elem operator[](size_t n) const
132 {
133 value_type e;
134 memcpy(&e, next + n * sizeof(Elem), sizeof(Elem));
135 return e;
136 }
137
138 // Move to the next code unit.
139 range& operator++()
140 {
141 next += sizeof(Elem);
142 return *this;
143 }
144
145 // Move to the Nth code unit.
146 range& operator+=(size_t n)
147 {
148 next += n * sizeof(Elem);
149 return *this;
150 }
151
152 // The number of code units remaining.
153 size_t size() const { return nbytes() / sizeof(Elem); }
154
155 // The number of bytes remaining.
156 size_t nbytes() const { return end - next; }
157 };
158
// Byte Order Marks: an encoded sequence may begin with one of these as a
// "header".
constexpr unsigned char utf8_bom[3] = { 0xEF, 0xBB, 0xBF };
constexpr unsigned char utf16_bom[2] = { 0xFE, 0xFF };   // big-endian
constexpr unsigned char utf16le_bom[2] = { 0xFF, 0xFE }; // little-endian
163
164 // Write a BOM (space permitting).
165 template<typename C, bool A, size_t N>
166 bool
167 write_bom(range<C, A>& to, const unsigned char (&bom)[N])
168 {
169 static_assert( (N / sizeof(C)) != 0, "" );
170 static_assert( (N % sizeof(C)) == 0, "" );
171
172 if (to.nbytes() < N)
173 return false;
174 memcpy(to.next, bom, N);
175 to += (N / sizeof(C));
176 return true;
177 }
178
179 // Try to read a BOM.
180 template<typename C, bool A, size_t N>
181 bool
182 read_bom(range<C, A>& from, const unsigned char (&bom)[N])
183 {
184 static_assert( (N / sizeof(C)) != 0, "" );
185 static_assert( (N % sizeof(C)) == 0, "" );
186
187 if (from.nbytes() >= N && !memcmp(from.next, bom, N))
188 {
189 from += (N / sizeof(C));
190 return true;
191 }
192 return false;
193 }
194
195 // If generate_header is set in mode write out UTF-8 BOM.
196 template<typename C>
197 bool
198 write_utf8_bom(range<C>& to, codecvt_mode mode)
199 {
200 if (mode & generate_header)
201 return write_bom(to, utf8_bom);
202 return true;
203 }
204
205 // If generate_header is set in mode write out the UTF-16 BOM indicated
206 // by whether little_endian is set in mode.
207 template<bool Aligned>
208 bool
209 write_utf16_bom(range<char16_t, Aligned>& to, codecvt_mode mode)
210 {
211 if (mode & generate_header)
212 {
213 if (mode & little_endian)
214 return write_bom(to, utf16le_bom);
215 else
216 return write_bom(to, utf16_bom);
217 }
218 return true;
219 }
220
221 // If consume_header is set in mode update from.next to after any BOM.
222 template<typename C>
223 void
224 read_utf8_bom(range<const C>& from, codecvt_mode mode)
225 {
226 if (mode & consume_header)
227 read_bom(from, utf8_bom);
228 }
229
230 // If consume_header is not set in mode, no effects.
231 // Otherwise, if *from.next is a UTF-16 BOM increment from.next and then:
232 // - if the UTF-16BE BOM was found unset little_endian in mode, or
233 // - if the UTF-16LE BOM was found set little_endian in mode.
234 template<bool Aligned>
235 void
236 read_utf16_bom(range<const char16_t, Aligned>& from, codecvt_mode& mode)
237 {
238 if (mode & consume_header)
239 {
240 if (read_bom(from, utf16_bom))
241 mode &= ~little_endian;
242 else if (read_bom(from, utf16le_bom))
243 mode |= little_endian;
244 }
245 }
246
247 // Read a codepoint from a UTF-8 multibyte sequence.
248 // Updates from.next if the codepoint is not greater than maxcode.
249 // Returns invalid_mb_sequence, incomplete_mb_character or the code point.
250 template<typename C>
251 char32_t
252 read_utf8_code_point(range<const C>& from, unsigned long maxcode)
253 {
254 const size_t avail = from.size();
255 if (avail == 0)
256 return incomplete_mb_character;
257 unsigned char c1 = from[0];
258 // https://en.wikipedia.org/wiki/UTF-8#Sample_code
259 if (c1 < 0x80)
260 {
261 ++from;
262 return c1;
263 }
264 else if (c1 < 0xC2) // continuation or overlong 2-byte sequence
265 return invalid_mb_sequence;
266 else if (c1 < 0xE0) // 2-byte sequence
267 {
268 if (avail < 2)
269 return incomplete_mb_character;
270 unsigned char c2 = from[1];
271 if ((c2 & 0xC0) != 0x80)
272 return invalid_mb_sequence;
273 char32_t c = (c1 << 6) + c2 - 0x3080;
274 if (c <= maxcode)
275 from += 2;
276 return c;
277 }
278 else if (c1 < 0xF0) // 3-byte sequence
279 {
280 if (avail < 3)
281 return incomplete_mb_character;
282 unsigned char c2 = from[1];
283 if ((c2 & 0xC0) != 0x80)
284 return invalid_mb_sequence;
285 if (c1 == 0xE0 && c2 < 0xA0) // overlong
286 return invalid_mb_sequence;
287 unsigned char c3 = from[2];
288 if ((c3 & 0xC0) != 0x80)
289 return invalid_mb_sequence;
290 char32_t c = (c1 << 12) + (c2 << 6) + c3 - 0xE2080;
291 if (c <= maxcode)
292 from += 3;
293 return c;
294 }
295 else if (c1 < 0xF5) // 4-byte sequence
296 {
297 if (avail < 4)
298 return incomplete_mb_character;
299 unsigned char c2 = from[1];
300 if ((c2 & 0xC0) != 0x80)
301 return invalid_mb_sequence;
302 if (c1 == 0xF0 && c2 < 0x90) // overlong
303 return invalid_mb_sequence;
304 if (c1 == 0xF4 && c2 >= 0x90) // > U+10FFFF
305 return invalid_mb_sequence;
306 unsigned char c3 = from[2];
307 if ((c3 & 0xC0) != 0x80)
308 return invalid_mb_sequence;
309 unsigned char c4 = from[3];
310 if ((c4 & 0xC0) != 0x80)
311 return invalid_mb_sequence;
312 char32_t c = (c1 << 18) + (c2 << 12) + (c3 << 6) + c4 - 0x3C82080;
313 if (c <= maxcode)
314 from += 4;
315 return c;
316 }
317 else // > U+10FFFF
318 return invalid_mb_sequence;
319 }
320
321 template<typename C>
322 bool
323 write_utf8_code_point(range<C>& to, char32_t code_point)
324 {
325 if (code_point < 0x80)
326 {
327 if (to.size() < 1)
328 return false;
329 to = code_point;
330 }
331 else if (code_point <= 0x7FF)
332 {
333 if (to.size() < 2)
334 return false;
335 to = (code_point >> 6) + 0xC0;
336 to = (code_point & 0x3F) + 0x80;
337 }
338 else if (code_point <= 0xFFFF)
339 {
340 if (to.size() < 3)
341 return false;
342 to = (code_point >> 12) + 0xE0;
343 to = ((code_point >> 6) & 0x3F) + 0x80;
344 to = (code_point & 0x3F) + 0x80;
345 }
346 else if (code_point <= 0x10FFFF)
347 {
348 if (to.size() < 4)
349 return false;
350 to = (code_point >> 18) + 0xF0;
351 to = ((code_point >> 12) & 0x3F) + 0x80;
352 to = ((code_point >> 6) & 0x3F) + 0x80;
353 to = (code_point & 0x3F) + 0x80;
354 }
355 else
356 return false;
357 return true;
358 }
359
// Convert a UTF-16 code unit between the host byte order and the byte
// order selected by (mode & little_endian). The bytes are swapped only
// when the two orders differ, so the same call serves both directions.
inline char16_t
adjust_byte_order(char16_t c, codecvt_mode mode)
{
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  const bool same_as_host = !(mode & little_endian);
#else
  const bool same_as_host = (mode & little_endian) != 0;
#endif
  return same_as_host ? c : __builtin_bswap16(c);
}
369
// True if c is a high (leading) surrogate, i.e. in [0xD800, 0xDBFF].
inline bool
is_high_surrogate(char32_t c)
{
  // Every value in that block shares the same bits above the low ten.
  return (c & 0xFFFFFC00u) == 0xD800u;
}
376
// True if c is a low (trailing) surrogate, i.e. in [0xDC00, 0xDFFF].
inline bool
is_low_surrogate(char32_t c)
{
  // Every value in that block shares the same bits above the low ten.
  return (c & 0xFFFFFC00u) == 0xDC00u;
}
383
// Combine a high and a low surrogate into the code point they encode.
// The caller is expected to have checked both with is_high_surrogate and
// is_low_surrogate.
inline char32_t
surrogate_pair_to_code_point(char32_t high, char32_t low)
{
  const char32_t upper_bits = (high - 0xD800) << 10;
  const char32_t lower_bits = low - 0xDC00;
  return 0x10000 + upper_bits + lower_bits;
}
389
390 // Read a codepoint from a UTF-16 multibyte sequence.
391 // The sequence's endianness is indicated by (mode & little_endian).
392 // Updates from.next if the codepoint is not greater than maxcode.
393 // Returns invalid_mb_sequence, incomplete_mb_character or the code point.
394 template<bool Aligned>
395 char32_t
396 read_utf16_code_point(range<const char16_t, Aligned>& from,
397 unsigned long maxcode, codecvt_mode mode)
398 {
399 const size_t avail = from.size();
400 if (avail == 0)
401 return incomplete_mb_character;
402 int inc = 1;
403 char32_t c = adjust_byte_order(from[0], mode);
404 if (is_high_surrogate(c))
405 {
406 if (avail < 2)
407 return incomplete_mb_character;
408 const char16_t c2 = adjust_byte_order(from[1], mode);
409 if (is_low_surrogate(c2))
410 {
411 c = surrogate_pair_to_code_point(c, c2);
412 inc = 2;
413 }
414 else
415 return invalid_mb_sequence;
416 }
417 else if (is_low_surrogate(c))
418 return invalid_mb_sequence;
419 if (c <= maxcode)
420 from += inc;
421 return c;
422 }
423
424 template<typename C, bool A>
425 bool
426 write_utf16_code_point(range<C, A>& to, char32_t codepoint, codecvt_mode mode)
427 {
428 static_assert(sizeof(C) >= 2, "a code unit must be at least 16-bit");
429
430 if (codepoint <= max_single_utf16_unit)
431 {
432 if (to.size() > 0)
433 {
434 to = adjust_byte_order(codepoint, mode);
435 return true;
436 }
437 }
438 else if (to.size() > 1)
439 {
440 // Algorithm from http://www.unicode.org/faq/utf_bom.html#utf16-4
441 const char32_t LEAD_OFFSET = 0xD800 - (0x10000 >> 10);
442 char16_t lead = LEAD_OFFSET + (codepoint >> 10);
443 char16_t trail = 0xDC00 + (codepoint & 0x3FF);
444 to = adjust_byte_order(lead, mode);
445 to = adjust_byte_order(trail, mode);
446 return true;
447 }
448 return false;
449 }
450
451 // utf8 -> ucs4
452 template<typename C>
453 codecvt_base::result
454 ucs4_in(range<const C>& from, range<char32_t>& to,
455 unsigned long maxcode = max_code_point, codecvt_mode mode = {})
456 {
457 read_utf8_bom(from, mode);
458 while (from.size() && to.size())
459 {
460 const char32_t codepoint = read_utf8_code_point(from, maxcode);
461 if (codepoint == incomplete_mb_character)
462 return codecvt_base::partial;
463 if (codepoint > maxcode)
464 return codecvt_base::error;
465 to = codepoint;
466 }
467 return from.size() ? codecvt_base::partial : codecvt_base::ok;
468 }
469
470 // ucs4 -> utf8
471 template<typename C>
472 codecvt_base::result
473 ucs4_out(range<const char32_t>& from, range<C>& to,
474 unsigned long maxcode = max_code_point, codecvt_mode mode = {})
475 {
476 if (!write_utf8_bom(to, mode))
477 return codecvt_base::partial;
478 while (from.size())
479 {
480 const char32_t c = from[0];
481 if (c > maxcode)
482 return codecvt_base::error;
483 if (!write_utf8_code_point(to, c))
484 return codecvt_base::partial;
485 ++from;
486 }
487 return codecvt_base::ok;
488 }
489
490 // utf16 -> ucs4
491 codecvt_base::result
492 ucs4_in(range<const char16_t, false>& from, range<char32_t>& to,
493 unsigned long maxcode = max_code_point, codecvt_mode mode = {})
494 {
495 read_utf16_bom(from, mode);
496 while (from.size() && to.size())
497 {
498 const char32_t codepoint = read_utf16_code_point(from, maxcode, mode);
499 if (codepoint == incomplete_mb_character)
500 return codecvt_base::partial;
501 if (codepoint > maxcode)
502 return codecvt_base::error;
503 to = codepoint;
504 }
505 return from.size() ? codecvt_base::partial : codecvt_base::ok;
506 }
507
508 // ucs4 -> utf16
509 codecvt_base::result
510 ucs4_out(range<const char32_t>& from, range<char16_t, false>& to,
511 unsigned long maxcode = max_code_point, codecvt_mode mode = {})
512 {
513 if (!write_utf16_bom(to, mode))
514 return codecvt_base::partial;
515 while (from.size())
516 {
517 const char32_t c = from[0];
518 if (c > maxcode)
519 return codecvt_base::error;
520 if (!write_utf16_code_point(to, c, mode))
521 return codecvt_base::partial;
522 ++from;
523 }
524 return codecvt_base::ok;
525 }
526
// Selects how the 16-bit side of a conversion is treated: as UTF-16,
// where surrogate pairs are allowed, or as UCS-2, where they are not.
enum class surrogates
{
  allowed,   // UTF-16
  disallowed // UCS-2
};
529
530 // utf8 -> utf16 (or utf8 -> ucs2 if s == surrogates::disallowed)
531 template<typename C8, typename C16>
532 codecvt_base::result
533 utf16_in(range<const C8>& from, range<C16>& to,
534 unsigned long maxcode = max_code_point, codecvt_mode mode = {},
535 surrogates s = surrogates::allowed)
536 {
537 read_utf8_bom(from, mode);
538 while (from.size() && to.size())
539 {
540 auto orig = from;
541 const char32_t codepoint = read_utf8_code_point(from, maxcode);
542 if (codepoint == incomplete_mb_character)
543 {
544 if (s == surrogates::allowed)
545 return codecvt_base::partial;
546 else
547 return codecvt_base::error; // No surrogates in UCS2
548 }
549 if (codepoint > maxcode)
550 return codecvt_base::error;
551 if (!write_utf16_code_point(to, codepoint, mode))
552 {
553 from = orig; // rewind to previous position
554 return codecvt_base::partial;
555 }
556 }
557 return codecvt_base::ok;
558 }
559
560 // utf16 -> utf8 (or ucs2 -> utf8 if s == surrogates::disallowed)
561 template<typename C16, typename C8>
562 codecvt_base::result
563 utf16_out(range<const C16>& from, range<C8>& to,
564 unsigned long maxcode = max_code_point, codecvt_mode mode = {},
565 surrogates s = surrogates::allowed)
566 {
567 if (!write_utf8_bom(to, mode))
568 return codecvt_base::partial;
569 while (from.size())
570 {
571 char32_t c = from[0];
572 int inc = 1;
573 if (is_high_surrogate(c))
574 {
575 if (s == surrogates::disallowed)
576 return codecvt_base::error; // No surrogates in UCS-2
577
578 if (from.size() < 2)
579 return codecvt_base::ok; // stop converting at this point
580
581 const char32_t c2 = from[1];
582 if (is_low_surrogate(c2))
583 {
584 c = surrogate_pair_to_code_point(c, c2);
585 inc = 2;
586 }
587 else
588 return codecvt_base::error;
589 }
590 else if (is_low_surrogate(c))
591 return codecvt_base::error;
592 if (c > maxcode)
593 return codecvt_base::error;
594 if (!write_utf8_code_point(to, c))
595 return codecvt_base::partial;
596 from += inc;
597 }
598 return codecvt_base::ok;
599 }
600
601 // return pos such that [begin,pos) is valid UTF-16 string no longer than max
602 template<typename C>
603 const C*
604 utf16_span(const C* begin, const C* end, size_t max,
605 char32_t maxcode = max_code_point, codecvt_mode mode = {})
606 {
607 range<const C> from{ begin, end };
608 read_utf8_bom(from, mode);
609 size_t count = 0;
610 while (count+1 < max)
611 {
612 char32_t c = read_utf8_code_point(from, maxcode);
613 if (c > maxcode)
614 return from.next;
615 else if (c > max_single_utf16_unit)
616 ++count;
617 ++count;
618 }
619 if (count+1 == max) // take one more character if it fits in a single unit
620 read_utf8_code_point(from, std::min(max_single_utf16_unit, maxcode));
621 return from.next;
622 }
623
624 // utf8 -> ucs2
625 template<typename C>
626 codecvt_base::result
627 ucs2_in(range<const C>& from, range<char16_t>& to,
628 char32_t maxcode = max_code_point, codecvt_mode mode = {})
629 {
630 // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
631 maxcode = std::min(max_single_utf16_unit, maxcode);
632 return utf16_in(from, to, maxcode, mode, surrogates::disallowed);
633 }
634
635 // ucs2 -> utf8
636 template<typename C>
637 codecvt_base::result
638 ucs2_out(range<const char16_t>& from, range<C>& to,
639 char32_t maxcode = max_code_point, codecvt_mode mode = {})
640 {
641 // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
642 maxcode = std::min(max_single_utf16_unit, maxcode);
643 return utf16_out(from, to, maxcode, mode, surrogates::disallowed);
644 }
645
646 // ucs2 -> utf16
647 codecvt_base::result
648 ucs2_out(range<const char16_t>& from, range<char16_t, false>& to,
649 char32_t maxcode = max_code_point, codecvt_mode mode = {})
650 {
651 if (!write_utf16_bom(to, mode))
652 return codecvt_base::partial;
653 while (from.size() && to.size())
654 {
655 char16_t c = from[0];
656 if (is_high_surrogate(c))
657 return codecvt_base::error;
658 if (c > maxcode)
659 return codecvt_base::error;
660 to = adjust_byte_order(c, mode);
661 ++from;
662 }
663 return from.size() == 0 ? codecvt_base::ok : codecvt_base::partial;
664 }
665
666 // utf16 -> ucs2
667 codecvt_base::result
668 ucs2_in(range<const char16_t, false>& from, range<char16_t>& to,
669 char32_t maxcode = max_code_point, codecvt_mode mode = {})
670 {
671 read_utf16_bom(from, mode);
672 // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
673 maxcode = std::min(max_single_utf16_unit, maxcode);
674 while (from.size() && to.size())
675 {
676 const char32_t c = read_utf16_code_point(from, maxcode, mode);
677 if (c == incomplete_mb_character)
678 return codecvt_base::error; // UCS-2 only supports single units.
679 if (c > maxcode)
680 return codecvt_base::error;
681 to = c;
682 }
683 return from.size() == 0 ? codecvt_base::ok : codecvt_base::partial;
684 }
685
686 const char16_t*
687 ucs2_span(range<const char16_t, false>& from, size_t max,
688 char32_t maxcode, codecvt_mode mode)
689 {
690 read_utf16_bom(from, mode);
691 // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
692 maxcode = std::min(max_single_utf16_unit, maxcode);
693 char32_t c = 0;
694 while (max-- && c <= maxcode)
695 c = read_utf16_code_point(from, maxcode, mode);
696 return reinterpret_cast<const char16_t*>(from.next);
697 }
698
699 template<typename C>
700 const C*
701 ucs2_span(const C* begin, const C* end, size_t max,
702 char32_t maxcode, codecvt_mode mode)
703 {
704 range<const C> from{ begin, end };
705 read_utf8_bom(from, mode);
706 // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
707 maxcode = std::min(max_single_utf16_unit, maxcode);
708 char32_t c = 0;
709 while (max-- && c <= maxcode)
710 c = read_utf8_code_point(from, maxcode);
711 return from.next;
712 }
713
714 // return pos such that [begin,pos) is valid UCS-4 string no longer than max
715 template<typename C>
716 const C*
717 ucs4_span(const C* begin, const C* end, size_t max,
718 char32_t maxcode = max_code_point, codecvt_mode mode = {})
719 {
720 range<const C> from{ begin, end };
721 read_utf8_bom(from, mode);
722 char32_t c = 0;
723 while (max-- && c <= maxcode)
724 c = read_utf8_code_point(from, maxcode);
725 return from.next;
726 }
727
728 // return pos such that [begin,pos) is valid UCS-4 string no longer than max
729 const char16_t*
730 ucs4_span(range<const char16_t, false>& from, size_t max,
731 char32_t maxcode = max_code_point, codecvt_mode mode = {})
732 {
733 read_utf16_bom(from, mode);
734 char32_t c = 0;
735 while (max-- && c <= maxcode)
736 c = read_utf16_code_point(from, maxcode, mode);
737 return reinterpret_cast<const char16_t*>(from.next);
738 }
739 }
740
// Members of the codecvt<char16_t, char, mbstate_t> specialization,
// which converts between UTF-8 (external) and UTF-16 (internal).

// The facet's locale::id object.
locale::id codecvt<char16_t, char, mbstate_t>::id;

// Out-of-line destructor; there is nothing to release.
codecvt<char16_t, char, mbstate_t>::~codecvt()
{ }
747
748 codecvt_base::result
749 codecvt<char16_t, char, mbstate_t>::
750 do_out(state_type&,
751 const intern_type* __from,
752 const intern_type* __from_end, const intern_type*& __from_next,
753 extern_type* __to, extern_type* __to_end,
754 extern_type*& __to_next) const
755 {
756 range<const char16_t> from{ __from, __from_end };
757 range<char> to{ __to, __to_end };
758 auto res = utf16_out(from, to);
759 __from_next = from.next;
760 __to_next = to.next;
761 return res;
762 }
763
// The Unicode facets do not use mbstate_t, so there is never an unshift
// sequence to write: the output position is reported unchanged.
codecvt_base::result
codecvt<char16_t, char, mbstate_t>::
do_unshift(state_type&, extern_type* __to, extern_type*,
           extern_type*& __to_next) const
{
  __to_next = __to; // nothing written
  return noconv;
}
772
773 codecvt_base::result
774 codecvt<char16_t, char, mbstate_t>::
775 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
776 const extern_type*& __from_next,
777 intern_type* __to, intern_type* __to_end,
778 intern_type*& __to_next) const
779 {
780 range<const char> from{ __from, __from_end };
781 range<char16_t> to{ __to, __to_end };
782 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
783 codecvt_mode mode = {};
784 #else
785 codecvt_mode mode = little_endian;
786 #endif
787 auto res = utf16_in(from, to, max_code_point, mode);
788 __from_next = from.next;
789 __to_next = to.next;
790 return res;
791 }
792
// Returns 0 because UTF-8 is not a fixed-width encoding.
int
codecvt<char16_t, char, mbstate_t>::do_encoding() const throw()
{
  return 0;
}
796
// Returns false: this facet performs a real conversion.
bool
codecvt<char16_t, char, mbstate_t>::do_always_noconv() const throw()
{
  return false;
}
800
801 int
802 codecvt<char16_t, char, mbstate_t>::
803 do_length(state_type&, const extern_type* __from,
804 const extern_type* __end, size_t __max) const
805 {
806 __end = utf16_span(__from, __end, __max);
807 return __end - __from;
808 }
809
// Returns 4: one character, whether it takes one or two UTF-16 code
// units, needs at most four UTF-8 code units.
int
codecvt<char16_t, char, mbstate_t>::do_max_length() const throw()
{
  const int max_utf8_units = 4;
  return max_utf8_units;
}
817
// Members of the codecvt<char32_t, char, mbstate_t> specialization,
// which converts between UTF-8 (external) and UTF-32, aka UCS-4
// (internal).

// The facet's locale::id object.
locale::id codecvt<char32_t, char, mbstate_t>::id;

// Out-of-line destructor; there is nothing to release.
codecvt<char32_t, char, mbstate_t>::~codecvt()
{ }
824
825 codecvt_base::result
826 codecvt<char32_t, char, mbstate_t>::
827 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
828 const intern_type*& __from_next,
829 extern_type* __to, extern_type* __to_end,
830 extern_type*& __to_next) const
831 {
832 range<const char32_t> from{ __from, __from_end };
833 range<char> to{ __to, __to_end };
834 auto res = ucs4_out(from, to);
835 __from_next = from.next;
836 __to_next = to.next;
837 return res;
838 }
839
// Writes nothing: reports the output position unchanged and returns
// noconv.
codecvt_base::result
codecvt<char32_t, char, mbstate_t>::
do_unshift(state_type&, extern_type* __to, extern_type*,
           extern_type*& __to_next) const
{
  __to_next = __to; // nothing written
  return noconv;
}
848
849 codecvt_base::result
850 codecvt<char32_t, char, mbstate_t>::
851 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
852 const extern_type*& __from_next,
853 intern_type* __to, intern_type* __to_end,
854 intern_type*& __to_next) const
855 {
856 range<const char> from{ __from, __from_end };
857 range<char32_t> to{ __to, __to_end };
858 auto res = ucs4_in(from, to);
859 __from_next = from.next;
860 __to_next = to.next;
861 return res;
862 }
863
// Returns 0 because UTF-8 is not a fixed-width encoding.
int
codecvt<char32_t, char, mbstate_t>::do_encoding() const throw()
{
  return 0;
}
867
// Returns false: this facet performs a real conversion.
bool
codecvt<char32_t, char, mbstate_t>::do_always_noconv() const throw()
{
  return false;
}
871
872 int
873 codecvt<char32_t, char, mbstate_t>::
874 do_length(state_type&, const extern_type* __from,
875 const extern_type* __end, size_t __max) const
876 {
877 __end = ucs4_span(__from, __end, __max);
878 return __end - __from;
879 }
880
// Returns 4: one character (a single UTF-32 code unit) needs at most
// four UTF-8 code units.
int
codecvt<char32_t, char, mbstate_t>::do_max_length() const throw()
{
  const int max_utf8_units = 4;
  return max_utf8_units;
}
888
889 #if defined(_GLIBCXX_USE_CHAR8_T)
890 // Define members of codecvt<char16_t, char8_t, mbstate_t> specialization.
891 // Converts from UTF-8 to UTF-16.
892
893 locale::id codecvt<char16_t, char8_t, mbstate_t>::id;
894
895 codecvt<char16_t, char8_t, mbstate_t>::~codecvt() { }
896
897 codecvt_base::result
898 codecvt<char16_t, char8_t, mbstate_t>::
899 do_out(state_type&,
900 const intern_type* __from,
901 const intern_type* __from_end, const intern_type*& __from_next,
902 extern_type* __to, extern_type* __to_end,
903 extern_type*& __to_next) const
904 {
905 range<const char16_t> from{ __from, __from_end };
906 range<char8_t> to{ __to, __to_end };
907 auto res = utf16_out(from, to);
908 __from_next = from.next;
909 __to_next = to.next;
910 return res;
911 }
912
913 codecvt_base::result
914 codecvt<char16_t, char8_t, mbstate_t>::
915 do_unshift(state_type&, extern_type* __to, extern_type*,
916 extern_type*& __to_next) const
917 {
918 __to_next = __to;
919 return noconv; // we don't use mbstate_t for the unicode facets
920 }
921
922 codecvt_base::result
923 codecvt<char16_t, char8_t, mbstate_t>::
924 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
925 const extern_type*& __from_next,
926 intern_type* __to, intern_type* __to_end,
927 intern_type*& __to_next) const
928 {
929 range<const char8_t> from{ __from, __from_end };
930 range<char16_t> to{ __to, __to_end };
931 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
932 codecvt_mode mode = {};
933 #else
934 codecvt_mode mode = little_endian;
935 #endif
936 auto res = utf16_in(from, to, max_code_point, mode);
937 __from_next = from.next;
938 __to_next = to.next;
939 return res;
940 }
941
942 int
943 codecvt<char16_t, char8_t, mbstate_t>::do_encoding() const throw()
944 { return 0; } // UTF-8 is not a fixed-width encoding
945
946 bool
947 codecvt<char16_t, char8_t, mbstate_t>::do_always_noconv() const throw()
948 { return false; }
949
950 int
951 codecvt<char16_t, char8_t, mbstate_t>::
952 do_length(state_type&, const extern_type* __from,
953 const extern_type* __end, size_t __max) const
954 {
955 __end = utf16_span(__from, __end, __max);
956 return __end - __from;
957 }
958
959 int
960 codecvt<char16_t, char8_t, mbstate_t>::do_max_length() const throw()
961 {
962 // A single character (one or two UTF-16 code units) requires
963 // up to four UTF-8 code units.
964 return 4;
965 }
966
967 // Define members of codecvt<char32_t, char8_t, mbstate_t> specialization.
968 // Converts from UTF-8 to UTF-32 (aka UCS-4).
969
970 locale::id codecvt<char32_t, char8_t, mbstate_t>::id;
971
972 codecvt<char32_t, char8_t, mbstate_t>::~codecvt() { }
973
974 codecvt_base::result
975 codecvt<char32_t, char8_t, mbstate_t>::
976 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
977 const intern_type*& __from_next,
978 extern_type* __to, extern_type* __to_end,
979 extern_type*& __to_next) const
980 {
981 range<const char32_t> from{ __from, __from_end };
982 range<char8_t> to{ __to, __to_end };
983 auto res = ucs4_out(from, to);
984 __from_next = from.next;
985 __to_next = to.next;
986 return res;
987 }
988
989 codecvt_base::result
990 codecvt<char32_t, char8_t, mbstate_t>::
991 do_unshift(state_type&, extern_type* __to, extern_type*,
992 extern_type*& __to_next) const
993 {
994 __to_next = __to;
995 return noconv;
996 }
997
998 codecvt_base::result
999 codecvt<char32_t, char8_t, mbstate_t>::
1000 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1001 const extern_type*& __from_next,
1002 intern_type* __to, intern_type* __to_end,
1003 intern_type*& __to_next) const
1004 {
1005 range<const char8_t> from{ __from, __from_end };
1006 range<char32_t> to{ __to, __to_end };
1007 auto res = ucs4_in(from, to);
1008 __from_next = from.next;
1009 __to_next = to.next;
1010 return res;
1011 }
1012
1013 int
1014 codecvt<char32_t, char8_t, mbstate_t>::do_encoding() const throw()
1015 { return 0; } // UTF-8 is not a fixed-width encoding
1016
1017 bool
1018 codecvt<char32_t, char8_t, mbstate_t>::do_always_noconv() const throw()
1019 { return false; }
1020
1021 int
1022 codecvt<char32_t, char8_t, mbstate_t>::
1023 do_length(state_type&, const extern_type* __from,
1024 const extern_type* __end, size_t __max) const
1025 {
1026 __end = ucs4_span(__from, __end, __max);
1027 return __end - __from;
1028 }
1029
1030 int
1031 codecvt<char32_t, char8_t, mbstate_t>::do_max_length() const throw()
1032 {
1033 // A single character (one UTF-32 code unit) requires
1034 // up to 4 UTF-8 code units.
1035 return 4;
1036 }
1037 #endif // _GLIBCXX_USE_CHAR8_T
1038
// Members of __codecvt_utf8_base<char16_t>, the base class implementing
// codecvt_utf8<char16_t>. It converts between UTF-8 (external) and UCS-2
// (internal).

// Out-of-line destructor; there is nothing to release.
__codecvt_utf8_base<char16_t>::~__codecvt_utf8_base()
{ }
1043
1044 codecvt_base::result
1045 __codecvt_utf8_base<char16_t>::
1046 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1047 const intern_type*& __from_next,
1048 extern_type* __to, extern_type* __to_end,
1049 extern_type*& __to_next) const
1050 {
1051 range<const char16_t> from{ __from, __from_end };
1052 range<char> to{ __to, __to_end };
1053 auto res = ucs2_out(from, to, _M_maxcode, _M_mode);
1054 __from_next = from.next;
1055 __to_next = to.next;
1056 return res;
1057 }
1058
// Writes nothing: reports the output position unchanged and returns
// noconv.
codecvt_base::result
__codecvt_utf8_base<char16_t>::
do_unshift(state_type&, extern_type* __to, extern_type*,
           extern_type*& __to_next) const
{
  __to_next = __to; // nothing written
  return noconv;
}
1067
1068 codecvt_base::result
1069 __codecvt_utf8_base<char16_t>::
1070 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1071 const extern_type*& __from_next,
1072 intern_type* __to, intern_type* __to_end,
1073 intern_type*& __to_next) const
1074 {
1075 range<const char> from{ __from, __from_end };
1076 range<char16_t> to{ __to, __to_end };
1077 codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header));
1078 #if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
1079 mode = codecvt_mode(mode | little_endian);
1080 #endif
1081 auto res = ucs2_in(from, to, _M_maxcode, mode);
1082 __from_next = from.next;
1083 __to_next = to.next;
1084 return res;
1085 }
1086
1087 int
1088 __codecvt_utf8_base<char16_t>::do_encoding() const throw()
1089 { return 0; } // UTF-8 is not a fixed-width encoding
1090
1091 bool
1092 __codecvt_utf8_base<char16_t>::do_always_noconv() const throw()
1093 { return false; }
1094
1095 int
1096 __codecvt_utf8_base<char16_t>::
1097 do_length(state_type&, const extern_type* __from,
1098 const extern_type* __end, size_t __max) const
1099 {
1100 __end = ucs2_span(__from, __end, __max, _M_maxcode, _M_mode);
1101 return __end - __from;
1102 }
1103
1104 int
1105 __codecvt_utf8_base<char16_t>::do_max_length() const throw()
1106 {
1107 // A single UCS-2 character requires up to three UTF-8 code units.
1108 // (UCS-2 cannot represent characters that use four UTF-8 code units).
1109 int max = 3;
1110 if (_M_mode & consume_header)
1111 max += sizeof(utf8_bom);
1112 return max;
1113 }
1114
1115 // Define members of codecvt_utf8<char32_t> base class implementation.
1116 // Converts from UTF-8 to UTF-32 (aka UCS-4).
1117
1118 __codecvt_utf8_base<char32_t>::~__codecvt_utf8_base() { }
1119
1120 codecvt_base::result
1121 __codecvt_utf8_base<char32_t>::
1122 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1123 const intern_type*& __from_next,
1124 extern_type* __to, extern_type* __to_end,
1125 extern_type*& __to_next) const
1126 {
1127 range<const char32_t> from{ __from, __from_end };
1128 range<char> to{ __to, __to_end };
1129 auto res = ucs4_out(from, to, _M_maxcode, _M_mode);
1130 __from_next = from.next;
1131 __to_next = to.next;
1132 return res;
1133 }
1134
1135 codecvt_base::result
1136 __codecvt_utf8_base<char32_t>::
1137 do_unshift(state_type&, extern_type* __to, extern_type*,
1138 extern_type*& __to_next) const
1139 {
1140 __to_next = __to;
1141 return noconv;
1142 }
1143
1144 codecvt_base::result
1145 __codecvt_utf8_base<char32_t>::
1146 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1147 const extern_type*& __from_next,
1148 intern_type* __to, intern_type* __to_end,
1149 intern_type*& __to_next) const
1150 {
1151 range<const char> from{ __from, __from_end };
1152 range<char32_t> to{ __to, __to_end };
1153 auto res = ucs4_in(from, to, _M_maxcode, _M_mode);
1154 __from_next = from.next;
1155 __to_next = to.next;
1156 return res;
1157 }
1158
1159 int
1160 __codecvt_utf8_base<char32_t>::do_encoding() const throw()
1161 { return 0; } // UTF-8 is not a fixed-width encoding
1162
1163 bool
1164 __codecvt_utf8_base<char32_t>::do_always_noconv() const throw()
1165 { return false; }
1166
1167 int
1168 __codecvt_utf8_base<char32_t>::
1169 do_length(state_type&, const extern_type* __from,
1170 const extern_type* __end, size_t __max) const
1171 {
1172 __end = ucs4_span(__from, __end, __max, _M_maxcode, _M_mode);
1173 return __end - __from;
1174 }
1175
1176 int
1177 __codecvt_utf8_base<char32_t>::do_max_length() const throw()
1178 {
1179 // A single UCS-4 character requires up to four UTF-8 code units.
1180 int max = 4;
1181 if (_M_mode & consume_header)
1182 max += sizeof(utf8_bom);
1183 return max;
1184 }
1185
1186 #ifdef _GLIBCXX_USE_WCHAR_T
1187
// The wchar_t facets in this file reinterpret wchar_t buffers as char16_t or
// char32_t buffers, so the element sizes must agree.  The messages name the
// violated assumption so that a failure is self-explanatory.
#if __SIZEOF_WCHAR_T__ == 2
static_assert(sizeof(wchar_t) == sizeof(char16_t),
              "wchar_t must have the same size as char16_t");
#elif __SIZEOF_WCHAR_T__ == 4
static_assert(sizeof(wchar_t) == sizeof(char32_t),
              "wchar_t must have the same size as char32_t");
#endif
1193
1194 // Define members of codecvt_utf8<wchar_t> base class implementation.
1195 // Converts from UTF-8 to UCS-2 or UCS-4 depending on sizeof(wchar_t).
1196
1197 __codecvt_utf8_base<wchar_t>::~__codecvt_utf8_base() { }
1198
1199 codecvt_base::result
1200 __codecvt_utf8_base<wchar_t>::
1201 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1202 const intern_type*& __from_next,
1203 extern_type* __to, extern_type* __to_end,
1204 extern_type*& __to_next) const
1205 {
1206 range<char> to{ __to, __to_end };
1207 #if __SIZEOF_WCHAR_T__ == 2
1208 range<const char16_t> from{
1209 reinterpret_cast<const char16_t*>(__from),
1210 reinterpret_cast<const char16_t*>(__from_end)
1211 };
1212 auto res = ucs2_out(from, to, _M_maxcode, _M_mode);
1213 #elif __SIZEOF_WCHAR_T__ == 4
1214 range<const char32_t> from{
1215 reinterpret_cast<const char32_t*>(__from),
1216 reinterpret_cast<const char32_t*>(__from_end)
1217 };
1218 auto res = ucs4_out(from, to, _M_maxcode, _M_mode);
1219 #else
1220 return codecvt_base::error;
1221 #endif
1222 __from_next = reinterpret_cast<const wchar_t*>(from.next);
1223 __to_next = to.next;
1224 return res;
1225 }
1226
1227 codecvt_base::result
1228 __codecvt_utf8_base<wchar_t>::
1229 do_unshift(state_type&, extern_type* __to, extern_type*,
1230 extern_type*& __to_next) const
1231 {
1232 __to_next = __to;
1233 return noconv;
1234 }
1235
1236 codecvt_base::result
1237 __codecvt_utf8_base<wchar_t>::
1238 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1239 const extern_type*& __from_next,
1240 intern_type* __to, intern_type* __to_end,
1241 intern_type*& __to_next) const
1242 {
1243 range<const char> from{ __from, __from_end };
1244 #if __SIZEOF_WCHAR_T__ == 2
1245 range<char16_t> to{
1246 reinterpret_cast<char16_t*>(__to),
1247 reinterpret_cast<char16_t*>(__to_end)
1248 };
1249 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
1250 codecvt_mode mode = {};
1251 #else
1252 codecvt_mode mode = little_endian;
1253 #endif
1254 auto res = ucs2_in(from, to, _M_maxcode, mode);
1255 #elif __SIZEOF_WCHAR_T__ == 4
1256 range<char32_t> to{
1257 reinterpret_cast<char32_t*>(__to),
1258 reinterpret_cast<char32_t*>(__to_end)
1259 };
1260 auto res = ucs4_in(from, to, _M_maxcode, _M_mode);
1261 #else
1262 return codecvt_base::error;
1263 #endif
1264 __from_next = from.next;
1265 __to_next = reinterpret_cast<wchar_t*>(to.next);
1266 return res;
1267 }
1268
1269 int
1270 __codecvt_utf8_base<wchar_t>::do_encoding() const throw()
1271 { return 0; } // UTF-8 is not a fixed-width encoding
1272
1273 bool
1274 __codecvt_utf8_base<wchar_t>::do_always_noconv() const throw()
1275 { return false; }
1276
1277 int
1278 __codecvt_utf8_base<wchar_t>::
1279 do_length(state_type&, const extern_type* __from,
1280 const extern_type* __end, size_t __max) const
1281 {
1282 #if __SIZEOF_WCHAR_T__ == 2
1283 __end = ucs2_span(__from, __end, __max, _M_maxcode, _M_mode);
1284 #elif __SIZEOF_WCHAR_T__ == 4
1285 __end = ucs4_span(__from, __end, __max, _M_maxcode, _M_mode);
1286 #else
1287 __end = __from;
1288 #endif
1289 return __end - __from;
1290 }
1291
1292 int
1293 __codecvt_utf8_base<wchar_t>::do_max_length() const throw()
1294 {
1295 #if __SIZEOF_WCHAR_T__ == 2
1296 int max = 3; // See __codecvt_utf8_base<char16_t>::do_max_length()
1297 #else
1298 int max = 4; // See __codecvt_utf8_base<char32_t>::do_max_length()
1299 #endif
1300 if (_M_mode & consume_header)
1301 max += sizeof(utf8_bom);
1302 return max;
1303 }
1304 #endif
1305
1306 // Define members of codecvt_utf16<char16_t> base class implementation.
1307 // Converts from UTF-16 to UCS-2.
1308
1309 __codecvt_utf16_base<char16_t>::~__codecvt_utf16_base() { }
1310
1311 codecvt_base::result
1312 __codecvt_utf16_base<char16_t>::
1313 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1314 const intern_type*& __from_next,
1315 extern_type* __to, extern_type* __to_end,
1316 extern_type*& __to_next) const
1317 {
1318 range<const char16_t> from{ __from, __from_end };
1319 range<char16_t, false> to{ __to, __to_end };
1320 auto res = ucs2_out(from, to, _M_maxcode, _M_mode);
1321 __from_next = from.next;
1322 __to_next = reinterpret_cast<char*>(to.next);
1323 return res;
1324 }
1325
1326 codecvt_base::result
1327 __codecvt_utf16_base<char16_t>::
1328 do_unshift(state_type&, extern_type* __to, extern_type*,
1329 extern_type*& __to_next) const
1330 {
1331 __to_next = __to;
1332 return noconv;
1333 }
1334
1335 codecvt_base::result
1336 __codecvt_utf16_base<char16_t>::
1337 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1338 const extern_type*& __from_next,
1339 intern_type* __to, intern_type* __to_end,
1340 intern_type*& __to_next) const
1341 {
1342 range<const char16_t, false> from{ __from, __from_end };
1343 range<char16_t> to{ __to, __to_end };
1344 auto res = ucs2_in(from, to, _M_maxcode, _M_mode);
1345 __from_next = reinterpret_cast<const char*>(from.next);
1346 __to_next = to.next;
1347 if (res == codecvt_base::ok && __from_next != __from_end)
1348 res = codecvt_base::error;
1349 return res;
1350 }
1351
1352 int
1353 __codecvt_utf16_base<char16_t>::do_encoding() const throw()
1354 { return 0; } // UTF-16 is not a fixed-width encoding
1355
1356 bool
1357 __codecvt_utf16_base<char16_t>::do_always_noconv() const throw()
1358 { return false; }
1359
1360 int
1361 __codecvt_utf16_base<char16_t>::
1362 do_length(state_type&, const extern_type* __from,
1363 const extern_type* __end, size_t __max) const
1364 {
1365 range<const char16_t, false> from{ __from, __end };
1366 const char16_t* next = ucs2_span(from, __max, _M_maxcode, _M_mode);
1367 return reinterpret_cast<const char*>(next) - __from;
1368 }
1369
1370 int
1371 __codecvt_utf16_base<char16_t>::do_max_length() const throw()
1372 {
1373 // A single UCS-2 character requires one UTF-16 code unit (so two chars).
1374 // (UCS-2 cannot represent characters that use multiple UTF-16 code units).
1375 int max = 2;
1376 if (_M_mode & consume_header)
1377 max += sizeof(utf16_bom);
1378 return max;
1379 }
1380
1381 // Define members of codecvt_utf16<char32_t> base class implementation.
1382 // Converts from UTF-16 to UTF-32 (aka UCS-4).
1383
1384 __codecvt_utf16_base<char32_t>::~__codecvt_utf16_base() { }
1385
1386 codecvt_base::result
1387 __codecvt_utf16_base<char32_t>::
1388 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1389 const intern_type*& __from_next,
1390 extern_type* __to, extern_type* __to_end,
1391 extern_type*& __to_next) const
1392 {
1393 range<const char32_t> from{ __from, __from_end };
1394 range<char16_t, false> to{ __to, __to_end };
1395 auto res = ucs4_out(from, to, _M_maxcode, _M_mode);
1396 __from_next = from.next;
1397 __to_next = reinterpret_cast<char*>(to.next);
1398 return res;
1399 }
1400
1401 codecvt_base::result
1402 __codecvt_utf16_base<char32_t>::
1403 do_unshift(state_type&, extern_type* __to, extern_type*,
1404 extern_type*& __to_next) const
1405 {
1406 __to_next = __to;
1407 return noconv;
1408 }
1409
1410 codecvt_base::result
1411 __codecvt_utf16_base<char32_t>::
1412 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1413 const extern_type*& __from_next,
1414 intern_type* __to, intern_type* __to_end,
1415 intern_type*& __to_next) const
1416 {
1417 range<const char16_t, false> from{ __from, __from_end };
1418 range<char32_t> to{ __to, __to_end };
1419 auto res = ucs4_in(from, to, _M_maxcode, _M_mode);
1420 __from_next = reinterpret_cast<const char*>(from.next);
1421 __to_next = to.next;
1422 if (res == codecvt_base::ok && __from_next != __from_end)
1423 res = codecvt_base::error;
1424 return res;
1425 }
1426
1427 int
1428 __codecvt_utf16_base<char32_t>::do_encoding() const throw()
1429 { return 0; } // UTF-16 is not a fixed-width encoding
1430
1431 bool
1432 __codecvt_utf16_base<char32_t>::do_always_noconv() const throw()
1433 { return false; }
1434
1435 int
1436 __codecvt_utf16_base<char32_t>::
1437 do_length(state_type&, const extern_type* __from,
1438 const extern_type* __end, size_t __max) const
1439 {
1440 range<const char16_t, false> from{ __from, __end };
1441 const char16_t* next = ucs4_span(from, __max, _M_maxcode, _M_mode);
1442 return reinterpret_cast<const char*>(next) - __from;
1443 }
1444
1445 int
1446 __codecvt_utf16_base<char32_t>::do_max_length() const throw()
1447 {
1448 // A single UCS-4 character requires one or two UTF-16 code units
1449 // (so up to four chars).
1450 int max = 4;
1451 if (_M_mode & consume_header)
1452 max += sizeof(utf16_bom);
1453 return max;
1454 }
1455
1456 #ifdef _GLIBCXX_USE_WCHAR_T
1457 // Define members of codecvt_utf16<wchar_t> base class implementation.
// Converts from UTF-16 to UCS-2 or UCS-4 depending on sizeof(wchar_t).
1459
1460 __codecvt_utf16_base<wchar_t>::~__codecvt_utf16_base() { }
1461
1462 codecvt_base::result
1463 __codecvt_utf16_base<wchar_t>::
1464 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1465 const intern_type*& __from_next,
1466 extern_type* __to, extern_type* __to_end,
1467 extern_type*& __to_next) const
1468 {
1469 range<char16_t, false> to{ __to, __to_end };
1470 #if __SIZEOF_WCHAR_T__ == 2
1471 range<const char16_t> from{
1472 reinterpret_cast<const char16_t*>(__from),
1473 reinterpret_cast<const char16_t*>(__from_end),
1474 };
1475 auto res = ucs2_out(from, to, _M_maxcode, _M_mode);
1476 #elif __SIZEOF_WCHAR_T__ == 4
1477 range<const char32_t> from{
1478 reinterpret_cast<const char32_t*>(__from),
1479 reinterpret_cast<const char32_t*>(__from_end),
1480 };
1481 auto res = ucs4_out(from, to, _M_maxcode, _M_mode);
1482 #else
1483 return codecvt_base::error;
1484 #endif
1485 __from_next = reinterpret_cast<const wchar_t*>(from.next);
1486 __to_next = reinterpret_cast<char*>(to.next);
1487 return res;
1488 }
1489
1490 codecvt_base::result
1491 __codecvt_utf16_base<wchar_t>::
1492 do_unshift(state_type&, extern_type* __to, extern_type*,
1493 extern_type*& __to_next) const
1494 {
1495 __to_next = __to;
1496 return noconv;
1497 }
1498
1499 codecvt_base::result
1500 __codecvt_utf16_base<wchar_t>::
1501 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1502 const extern_type*& __from_next,
1503 intern_type* __to, intern_type* __to_end,
1504 intern_type*& __to_next) const
1505 {
1506 range<const char16_t, false> from{ __from, __from_end };
1507 #if __SIZEOF_WCHAR_T__ == 2
1508 range<char16_t> to{
1509 reinterpret_cast<char16_t*>(__to),
1510 reinterpret_cast<char16_t*>(__to_end),
1511 };
1512 auto res = ucs2_in(from, to, _M_maxcode, _M_mode);
1513 #elif __SIZEOF_WCHAR_T__ == 4
1514 range<char32_t> to{
1515 reinterpret_cast<char32_t*>(__to),
1516 reinterpret_cast<char32_t*>(__to_end),
1517 };
1518 auto res = ucs4_in(from, to, _M_maxcode, _M_mode);
1519 #else
1520 return codecvt_base::error;
1521 #endif
1522 __from_next = reinterpret_cast<const char*>(from.next);
1523 __to_next = reinterpret_cast<wchar_t*>(to.next);
1524 if (res == codecvt_base::ok && __from_next != __from_end)
1525 res = codecvt_base::error;
1526 return res;
1527 }
1528
1529 int
1530 __codecvt_utf16_base<wchar_t>::do_encoding() const throw()
1531 { return 0; } // UTF-16 is not a fixed-width encoding
1532
1533 bool
1534 __codecvt_utf16_base<wchar_t>::do_always_noconv() const throw()
1535 { return false; }
1536
1537 int
1538 __codecvt_utf16_base<wchar_t>::
1539 do_length(state_type&, const extern_type* __from,
1540 const extern_type* __end, size_t __max) const
1541 {
1542 range<const char16_t, false> from{ __from, __end };
1543 #if __SIZEOF_WCHAR_T__ == 2
1544 const char16_t* next = ucs2_span(from, __max, _M_maxcode, _M_mode);
1545 #elif __SIZEOF_WCHAR_T__ == 4
1546 const char16_t* next = ucs4_span(from, __max, _M_maxcode, _M_mode);
1547 #endif
1548 return reinterpret_cast<const char*>(next) - __from;
1549 }
1550
1551 int
1552 __codecvt_utf16_base<wchar_t>::do_max_length() const throw()
1553 {
1554 #if __SIZEOF_WCHAR_T__ == 2
1555 int max = 2; // See __codecvt_utf16_base<char16_t>::do_max_length()
1556 #else
1557 int max = 4; // See __codecvt_utf16_base<char32_t>::do_max_length()
1558 #endif
1559 if (_M_mode & consume_header)
1560 max += sizeof(utf16_bom);
1561 return max;
1562 }
1563 #endif
1564
1565 // Define members of codecvt_utf8_utf16<char16_t> base class implementation.
1566 // Converts from UTF-8 to UTF-16.
1567
1568 __codecvt_utf8_utf16_base<char16_t>::~__codecvt_utf8_utf16_base() { }
1569
1570 codecvt_base::result
1571 __codecvt_utf8_utf16_base<char16_t>::
1572 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1573 const intern_type*& __from_next,
1574 extern_type* __to, extern_type* __to_end,
1575 extern_type*& __to_next) const
1576 {
1577 range<const char16_t> from{ __from, __from_end };
1578 range<char> to{ __to, __to_end };
1579 auto res = utf16_out(from, to, _M_maxcode, _M_mode);
1580 __from_next = from.next;
1581 __to_next = to.next;
1582 return res;
1583 }
1584
1585 codecvt_base::result
1586 __codecvt_utf8_utf16_base<char16_t>::
1587 do_unshift(state_type&, extern_type* __to, extern_type*,
1588 extern_type*& __to_next) const
1589 {
1590 __to_next = __to;
1591 return noconv;
1592 }
1593
1594 codecvt_base::result
1595 __codecvt_utf8_utf16_base<char16_t>::
1596 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1597 const extern_type*& __from_next,
1598 intern_type* __to, intern_type* __to_end,
1599 intern_type*& __to_next) const
1600 {
1601 range<const char> from{ __from, __from_end };
1602 range<char16_t> to{ __to, __to_end };
1603 codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header));
1604 #if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
1605 mode = codecvt_mode(mode | little_endian);
1606 #endif
1607 auto res = utf16_in(from, to, _M_maxcode, mode);
1608 __from_next = from.next;
1609 __to_next = to.next;
1610 return res;
1611 }
1612
1613 int
1614 __codecvt_utf8_utf16_base<char16_t>::do_encoding() const throw()
1615 { return 0; } // UTF-8 is not a fixed-width encoding
1616
1617 bool
1618 __codecvt_utf8_utf16_base<char16_t>::do_always_noconv() const throw()
1619 { return false; }
1620
1621 int
1622 __codecvt_utf8_utf16_base<char16_t>::
1623 do_length(state_type&, const extern_type* __from,
1624 const extern_type* __end, size_t __max) const
1625 {
1626 __end = utf16_span(__from, __end, __max, _M_maxcode, _M_mode);
1627 return __end - __from;
1628 }
1629
1630 int
1631 __codecvt_utf8_utf16_base<char16_t>::do_max_length() const throw()
1632 {
1633 // A single character can be 1 or 2 UTF-16 code units,
1634 // requiring up to 4 UTF-8 code units.
1635 int max = 4;
1636 if (_M_mode & consume_header)
1637 max += sizeof(utf8_bom);
1638 return max;
1639 }
1640
1641 // Define members of codecvt_utf8_utf16<char32_t> base class implementation.
1642 // Converts from UTF-8 to UTF-16.
1643
1644 __codecvt_utf8_utf16_base<char32_t>::~__codecvt_utf8_utf16_base() { }
1645
1646 codecvt_base::result
1647 __codecvt_utf8_utf16_base<char32_t>::
1648 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1649 const intern_type*& __from_next,
1650 extern_type* __to, extern_type* __to_end,
1651 extern_type*& __to_next) const
1652 {
1653 range<const char32_t> from{ __from, __from_end };
1654 range<char> to{ __to, __to_end };
1655 auto res = utf16_out(from, to, _M_maxcode, _M_mode);
1656 __from_next = from.next;
1657 __to_next = to.next;
1658 return res;
1659 }
1660
1661 codecvt_base::result
1662 __codecvt_utf8_utf16_base<char32_t>::
1663 do_unshift(state_type&, extern_type* __to, extern_type*,
1664 extern_type*& __to_next) const
1665 {
1666 __to_next = __to;
1667 return noconv;
1668 }
1669
1670 codecvt_base::result
1671 __codecvt_utf8_utf16_base<char32_t>::
1672 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1673 const extern_type*& __from_next,
1674 intern_type* __to, intern_type* __to_end,
1675 intern_type*& __to_next) const
1676 {
1677 range<const char> from{ __from, __from_end };
1678 range<char32_t> to{ __to, __to_end };
1679 codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header));
1680 #if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
1681 mode = codecvt_mode(mode | little_endian);
1682 #endif
1683 auto res = utf16_in(from, to, _M_maxcode, mode);
1684 __from_next = from.next;
1685 __to_next = to.next;
1686 return res;
1687 }
1688
1689 int
1690 __codecvt_utf8_utf16_base<char32_t>::do_encoding() const throw()
1691 { return 0; } // UTF-8 is not a fixed-width encoding
1692
1693 bool
1694 __codecvt_utf8_utf16_base<char32_t>::do_always_noconv() const throw()
1695 { return false; }
1696
1697 int
1698 __codecvt_utf8_utf16_base<char32_t>::
1699 do_length(state_type&, const extern_type* __from,
1700 const extern_type* __end, size_t __max) const
1701 {
1702 __end = utf16_span(__from, __end, __max, _M_maxcode, _M_mode);
1703 return __end - __from;
1704 }
1705
1706 int
1707 __codecvt_utf8_utf16_base<char32_t>::do_max_length() const throw()
1708 {
1709 // A single character can be 1 or 2 UTF-16 code units,
1710 // requiring up to 4 UTF-8 code units.
1711 int max = 4;
1712 if (_M_mode & consume_header)
1713 max += sizeof(utf8_bom);
1714 return max;
1715 }
1716
1717 #ifdef _GLIBCXX_USE_WCHAR_T
1718 // Define members of codecvt_utf8_utf16<wchar_t> base class implementation.
1719 // Converts from UTF-8 to UTF-16.
1720
1721 __codecvt_utf8_utf16_base<wchar_t>::~__codecvt_utf8_utf16_base() { }
1722
1723 codecvt_base::result
1724 __codecvt_utf8_utf16_base<wchar_t>::
1725 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1726 const intern_type*& __from_next,
1727 extern_type* __to, extern_type* __to_end,
1728 extern_type*& __to_next) const
1729 {
1730 range<const wchar_t> from{ __from, __from_end };
1731 range<char> to{ __to, __to_end };
1732 auto res = utf16_out(from, to, _M_maxcode, _M_mode);
1733 __from_next = from.next;
1734 __to_next = to.next;
1735 return res;
1736 }
1737
1738 codecvt_base::result
1739 __codecvt_utf8_utf16_base<wchar_t>::
1740 do_unshift(state_type&, extern_type* __to, extern_type*,
1741 extern_type*& __to_next) const
1742 {
1743 __to_next = __to;
1744 return noconv;
1745 }
1746
1747 codecvt_base::result
1748 __codecvt_utf8_utf16_base<wchar_t>::
1749 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1750 const extern_type*& __from_next,
1751 intern_type* __to, intern_type* __to_end,
1752 intern_type*& __to_next) const
1753 {
1754 range<const char> from{ __from, __from_end };
1755 range<wchar_t> to{ __to, __to_end };
1756 codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header));
1757 #if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
1758 mode = codecvt_mode(mode | little_endian);
1759 #endif
1760 auto res = utf16_in(from, to, _M_maxcode, mode);
1761 __from_next = from.next;
1762 __to_next = to.next;
1763 return res;
1764 }
1765
1766 int
1767 __codecvt_utf8_utf16_base<wchar_t>::do_encoding() const throw()
1768 { return 0; } // UTF-8 is not a fixed-width encoding
1769
1770 bool
1771 __codecvt_utf8_utf16_base<wchar_t>::do_always_noconv() const throw()
1772 { return false; }
1773
1774 int
1775 __codecvt_utf8_utf16_base<wchar_t>::
1776 do_length(state_type&, const extern_type* __from,
1777 const extern_type* __end, size_t __max) const
1778 {
1779 __end = utf16_span(__from, __end, __max, _M_maxcode, _M_mode);
1780 return __end - __from;
1781 }
1782
1783 int
1784 __codecvt_utf8_utf16_base<wchar_t>::do_max_length() const throw()
1785 {
1786 // A single character can be 1 or 2 UTF-16 code units,
1787 // requiring up to 4 UTF-8 code units.
1788 int max = 4;
1789 if (_M_mode & consume_header)
1790 max += sizeof(utf8_bom);
1791 return max;
1792 }
1793 #endif
1794
1795 inline template class __codecvt_abstract_base<char16_t, char, mbstate_t>;
1796 inline template class __codecvt_abstract_base<char32_t, char, mbstate_t>;
1797 template class codecvt_byname<char16_t, char, mbstate_t>;
1798 template class codecvt_byname<char32_t, char, mbstate_t>;
1799
1800 #if defined(_GLIBCXX_USE_CHAR8_T)
1801 inline template class __codecvt_abstract_base<char16_t, char8_t, mbstate_t>;
1802 inline template class __codecvt_abstract_base<char32_t, char8_t, mbstate_t>;
1803 template class codecvt_byname<char16_t, char8_t, mbstate_t>;
1804 template class codecvt_byname<char32_t, char8_t, mbstate_t>;
1805 #endif
1806
1807 _GLIBCXX_END_NAMESPACE_VERSION
1808 }