// Locale support (codecvt) -*- C++ -*-

// Copyright (C) 2015-2021 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library.  This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.

// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.

// Under Section 7 of GPL version 3, you are granted additional
// permissions described in the GCC Runtime Library Exception, version
// 3.1, as published by the Free Software Foundation.

// You should have received a copy of the GNU General Public License and
// a copy of the GCC Runtime Library Exception along with this program;
// see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
// <http://www.gnu.org/licenses/>.
#include <codecvt>             // std::codecvt_mode, std::codecvt_base
#include <cstring>             // std::memcpy, std::memcmp
#include <bits/stl_algobase.h> // std::min
29 namespace std
_GLIBCXX_VISIBILITY(default)
31 _GLIBCXX_BEGIN_NAMESPACE_VERSION
// The standard doesn't define these operators, which is annoying.

// Convert a codecvt_mode value to its underlying integer type, so that mode
// flags can be combined with the built-in bitwise operators.
static underlying_type<codecvt_mode>::type
to_integer(codecvt_mode m)
{ return static_cast<underlying_type<codecvt_mode>::type>(m); }
38 static codecvt_mode
& operator&=(codecvt_mode
& m
, codecvt_mode n
)
39 { return m
= codecvt_mode(to_integer(m
) & to_integer(n
)); }
41 static codecvt_mode
& operator|=(codecvt_mode
& m
, codecvt_mode n
)
42 { return m
= codecvt_mode(to_integer(m
) | to_integer(n
)); }
44 static codecvt_mode
operator~(codecvt_mode m
)
45 { return codecvt_mode(~to_integer(m
)); }
// Largest code point that fits in a single UTF-16 code unit.
const char32_t max_single_utf16_unit = 0xFFFF;

// Largest code point accepted by the conversions (U+10FFFF).
const char32_t max_code_point = 0x10FFFF;

// Sentinel values returned by the read_*_code_point functions in place of
// a code point.
// The functions below rely on maxcode < incomplete_mb_character
// (which is enforced by the codecvt_utf* classes on construction).
const char32_t incomplete_mb_character = char32_t(-2);
const char32_t invalid_mb_sequence = char32_t(-1);
// Utility type for reading and writing code units of type Elem from
// a range defined by a pair of pointers.
//
// `next` is the current position and `end` is one past the last code unit.
// No member performs bounds checking: callers test size() before reading
// or writing.
template<typename Elem, bool Aligned = true>
struct range
{
  Elem* next;
  Elem* end;

  // Write a code unit and advance past it.
  range& operator=(Elem e)
  {
    *next++ = e;
    return *this;
  }

  // Read the next code unit.
  Elem operator*() const { return *next; }

  // Read the Nth code unit.
  Elem operator[](size_t n) const { return next[n]; }

  // Move to the next code unit.
  range& operator++()
  {
    ++next;
    return *this;
  }

  // Move to the Nth code unit.
  range& operator+=(size_t n)
  {
    next += n;
    return *this;
  }

  // The number of code units remaining.
  size_t size() const { return end - next; }

  // The number of bytes remaining.
  size_t nbytes() const
  {
    return reinterpret_cast<const char*>(end)
      - reinterpret_cast<const char*>(next);
  }
};
101 // This specialization is used when accessing char16_t values through
102 // pointers to char, which might not be correctly aligned for char16_t.
103 template<typename Elem
>
104 struct range
<Elem
, false>
106 using value_type
= typename remove_const
<Elem
>::type
;
108 using char_pointer
= typename
109 conditional
<is_const
<Elem
>::value
, const char*, char*>::type
;
114 // Write a code unit.
115 range
& operator=(Elem e
)
117 memcpy(next
, &e
, sizeof(Elem
));
122 // Read the next code unit.
123 Elem
operator*() const
126 memcpy(&e
, next
, sizeof(Elem
));
130 // Read the Nth code unit.
131 Elem
operator[](size_t n
) const
134 memcpy(&e
, next
+ n
* sizeof(Elem
), sizeof(Elem
));
138 // Move to the next code unit.
141 next
+= sizeof(Elem
);
145 // Move to the Nth code unit.
146 range
& operator+=(size_t n
)
148 next
+= n
* sizeof(Elem
);
152 // The number of code units remaining.
153 size_t size() const { return nbytes() / sizeof(Elem
); }
155 // The number of bytes remaining.
156 size_t nbytes() const { return end
- next
; }
// Multibyte sequences can have "header" consisting of Byte Order Mark
const unsigned char utf8_bom[3] = { 0xEF, 0xBB, 0xBF };
const unsigned char utf16_bom[2] = { 0xFE, 0xFF };    // big-endian order
const unsigned char utf16le_bom[2] = { 0xFF, 0xFE };  // little-endian order
164 // Write a BOM (space permitting).
165 template<typename C
, bool A
, size_t N
>
167 write_bom(range
<C
, A
>& to
, const unsigned char (&bom
)[N
])
169 static_assert( (N
/ sizeof(C
)) != 0, "" );
170 static_assert( (N
% sizeof(C
)) == 0, "" );
174 memcpy(to
.next
, bom
, N
);
175 to
+= (N
/ sizeof(C
));
179 // Try to read a BOM.
180 template<typename C
, bool A
, size_t N
>
182 read_bom(range
<C
, A
>& from
, const unsigned char (&bom
)[N
])
184 static_assert( (N
/ sizeof(C
)) != 0, "" );
185 static_assert( (N
% sizeof(C
)) == 0, "" );
187 if (from
.nbytes() >= N
&& !memcmp(from
.next
, bom
, N
))
189 from
+= (N
/ sizeof(C
));
195 // If generate_header is set in mode write out UTF-8 BOM.
198 write_utf8_bom(range
<C
>& to
, codecvt_mode mode
)
200 if (mode
& generate_header
)
201 return write_bom(to
, utf8_bom
);
205 // If generate_header is set in mode write out the UTF-16 BOM indicated
206 // by whether little_endian is set in mode.
207 template<bool Aligned
>
209 write_utf16_bom(range
<char16_t
, Aligned
>& to
, codecvt_mode mode
)
211 if (mode
& generate_header
)
213 if (mode
& little_endian
)
214 return write_bom(to
, utf16le_bom
);
216 return write_bom(to
, utf16_bom
);
221 // If consume_header is set in mode update from.next to after any BOM.
224 read_utf8_bom(range
<const C
>& from
, codecvt_mode mode
)
226 if (mode
& consume_header
)
227 read_bom(from
, utf8_bom
);
230 // If consume_header is not set in mode, no effects.
231 // Otherwise, if *from.next is a UTF-16 BOM increment from.next and then:
232 // - if the UTF-16BE BOM was found unset little_endian in mode, or
233 // - if the UTF-16LE BOM was found set little_endian in mode.
234 template<bool Aligned
>
236 read_utf16_bom(range
<const char16_t
, Aligned
>& from
, codecvt_mode
& mode
)
238 if (mode
& consume_header
)
240 if (read_bom(from
, utf16_bom
))
241 mode
&= ~little_endian
;
242 else if (read_bom(from
, utf16le_bom
))
243 mode
|= little_endian
;
247 // Read a codepoint from a UTF-8 multibyte sequence.
248 // Updates from.next if the codepoint is not greater than maxcode.
249 // Returns invalid_mb_sequence, incomplete_mb_character or the code point.
252 read_utf8_code_point(range
<const C
>& from
, unsigned long maxcode
)
254 const size_t avail
= from
.size();
256 return incomplete_mb_character
;
257 unsigned char c1
= from
[0];
258 // https://en.wikipedia.org/wiki/UTF-8#Sample_code
264 else if (c1
< 0xC2) // continuation or overlong 2-byte sequence
265 return invalid_mb_sequence
;
266 else if (c1
< 0xE0) // 2-byte sequence
269 return incomplete_mb_character
;
270 unsigned char c2
= from
[1];
271 if ((c2
& 0xC0) != 0x80)
272 return invalid_mb_sequence
;
273 char32_t c
= (c1
<< 6) + c2
- 0x3080;
278 else if (c1
< 0xF0) // 3-byte sequence
281 return incomplete_mb_character
;
282 unsigned char c2
= from
[1];
283 if ((c2
& 0xC0) != 0x80)
284 return invalid_mb_sequence
;
285 if (c1
== 0xE0 && c2
< 0xA0) // overlong
286 return invalid_mb_sequence
;
287 unsigned char c3
= from
[2];
288 if ((c3
& 0xC0) != 0x80)
289 return invalid_mb_sequence
;
290 char32_t c
= (c1
<< 12) + (c2
<< 6) + c3
- 0xE2080;
295 else if (c1
< 0xF5) // 4-byte sequence
298 return incomplete_mb_character
;
299 unsigned char c2
= from
[1];
300 if ((c2
& 0xC0) != 0x80)
301 return invalid_mb_sequence
;
302 if (c1
== 0xF0 && c2
< 0x90) // overlong
303 return invalid_mb_sequence
;
304 if (c1
== 0xF4 && c2
>= 0x90) // > U+10FFFF
305 return invalid_mb_sequence
;
306 unsigned char c3
= from
[2];
307 if ((c3
& 0xC0) != 0x80)
308 return invalid_mb_sequence
;
309 unsigned char c4
= from
[3];
310 if ((c4
& 0xC0) != 0x80)
311 return invalid_mb_sequence
;
312 char32_t c
= (c1
<< 18) + (c2
<< 12) + (c3
<< 6) + c4
- 0x3C82080;
318 return invalid_mb_sequence
;
323 write_utf8_code_point(range
<C
>& to
, char32_t code_point
)
325 if (code_point
< 0x80)
331 else if (code_point
<= 0x7FF)
335 to
= (code_point
>> 6) + 0xC0;
336 to
= (code_point
& 0x3F) + 0x80;
338 else if (code_point
<= 0xFFFF)
342 to
= (code_point
>> 12) + 0xE0;
343 to
= ((code_point
>> 6) & 0x3F) + 0x80;
344 to
= (code_point
& 0x3F) + 0x80;
346 else if (code_point
<= 0x10FFFF)
350 to
= (code_point
>> 18) + 0xF0;
351 to
= ((code_point
>> 12) & 0x3F) + 0x80;
352 to
= ((code_point
>> 6) & 0x3F) + 0x80;
353 to
= (code_point
& 0x3F) + 0x80;
// Convert a UTF-16 code unit between the host's byte order and the byte
// order selected by (mode & little_endian). The bytes are swapped only
// when the two orders differ, so the same call serves for both reading
// and writing.
inline char16_t
adjust_byte_order(char16_t c, codecvt_mode mode)
{
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  return (mode & little_endian) ? __builtin_bswap16(c) : c;
#else
  return (mode & little_endian) ? c : __builtin_bswap16(c);
#endif
}
// Return true if c is a high-surrogate (aka leading) code point,
// i.e. in the range [0xD800, 0xDBFF].
inline bool
is_high_surrogate(char32_t c)
{
  return c >= 0xD800 && c <= 0xDBFF;
}
// Return true if c is a low-surrogate (aka trailing) code point,
// i.e. in the range [0xDC00, 0xDFFF].
inline bool
is_low_surrogate(char32_t c)
{
  return c >= 0xDC00 && c <= 0xDFFF;
}
// Combine a high and a low surrogate into the code point they encode.
// The caller must already have checked that `high` and `low` are valid
// surrogates. The constant folds the surrogate bases and the 0x10000
// offset into one term: 0x35FDC00 == (0xD800 << 10) + 0xDC00 - 0x10000.
inline char32_t
surrogate_pair_to_code_point(char32_t high, char32_t low)
{
  return (high << 10) + low - 0x35FDC00;
}
390 // Read a codepoint from a UTF-16 multibyte sequence.
391 // The sequence's endianness is indicated by (mode & little_endian).
392 // Updates from.next if the codepoint is not greater than maxcode.
393 // Returns invalid_mb_sequence, incomplete_mb_character or the code point.
394 template<bool Aligned
>
396 read_utf16_code_point(range
<const char16_t
, Aligned
>& from
,
397 unsigned long maxcode
, codecvt_mode mode
)
399 const size_t avail
= from
.size();
401 return incomplete_mb_character
;
403 char32_t c
= adjust_byte_order(from
[0], mode
);
404 if (is_high_surrogate(c
))
407 return incomplete_mb_character
;
408 const char16_t c2
= adjust_byte_order(from
[1], mode
);
409 if (is_low_surrogate(c2
))
411 c
= surrogate_pair_to_code_point(c
, c2
);
415 return invalid_mb_sequence
;
417 else if (is_low_surrogate(c
))
418 return invalid_mb_sequence
;
424 template<typename C
, bool A
>
426 write_utf16_code_point(range
<C
, A
>& to
, char32_t codepoint
, codecvt_mode mode
)
428 static_assert(sizeof(C
) >= 2, "a code unit must be at least 16-bit");
430 if (codepoint
<= max_single_utf16_unit
)
434 to
= adjust_byte_order(codepoint
, mode
);
438 else if (to
.size() > 1)
440 // Algorithm from http://www.unicode.org/faq/utf_bom.html#utf16-4
441 const char32_t LEAD_OFFSET
= 0xD800 - (0x10000 >> 10);
442 char16_t lead
= LEAD_OFFSET
+ (codepoint
>> 10);
443 char16_t trail
= 0xDC00 + (codepoint
& 0x3FF);
444 to
= adjust_byte_order(lead
, mode
);
445 to
= adjust_byte_order(trail
, mode
);
454 ucs4_in(range
<const C
>& from
, range
<char32_t
>& to
,
455 unsigned long maxcode
= max_code_point
, codecvt_mode mode
= {})
457 read_utf8_bom(from
, mode
);
458 while (from
.size() && to
.size())
460 const char32_t codepoint
= read_utf8_code_point(from
, maxcode
);
461 if (codepoint
== incomplete_mb_character
)
462 return codecvt_base::partial
;
463 if (codepoint
> maxcode
)
464 return codecvt_base::error
;
467 return from
.size() ? codecvt_base::partial
: codecvt_base::ok
;
473 ucs4_out(range
<const char32_t
>& from
, range
<C
>& to
,
474 unsigned long maxcode
= max_code_point
, codecvt_mode mode
= {})
476 if (!write_utf8_bom(to
, mode
))
477 return codecvt_base::partial
;
480 const char32_t c
= from
[0];
482 return codecvt_base::error
;
483 if (!write_utf8_code_point(to
, c
))
484 return codecvt_base::partial
;
487 return codecvt_base::ok
;
492 ucs4_in(range
<const char16_t
, false>& from
, range
<char32_t
>& to
,
493 unsigned long maxcode
= max_code_point
, codecvt_mode mode
= {})
495 read_utf16_bom(from
, mode
);
496 while (from
.size() && to
.size())
498 const char32_t codepoint
= read_utf16_code_point(from
, maxcode
, mode
);
499 if (codepoint
== incomplete_mb_character
)
500 return codecvt_base::partial
;
501 if (codepoint
> maxcode
)
502 return codecvt_base::error
;
505 return from
.size() ? codecvt_base::partial
: codecvt_base::ok
;
510 ucs4_out(range
<const char32_t
>& from
, range
<char16_t
, false>& to
,
511 unsigned long maxcode
= max_code_point
, codecvt_mode mode
= {})
513 if (!write_utf16_bom(to
, mode
))
514 return codecvt_base::partial
;
517 const char32_t c
= from
[0];
519 return codecvt_base::error
;
520 if (!write_utf16_code_point(to
, c
, mode
))
521 return codecvt_base::partial
;
524 return codecvt_base::ok
;
// Flag indicating whether to process UTF-16 or UCS2
// (allowed: UTF-16, surrogate pairs are accepted;
//  disallowed: UCS-2, any surrogate is an error).
enum class surrogates { allowed, disallowed };
530 // utf8 -> utf16 (or utf8 -> ucs2 if s == surrogates::disallowed)
531 template<typename C8
, typename C16
>
533 utf16_in(range
<const C8
>& from
, range
<C16
>& to
,
534 unsigned long maxcode
= max_code_point
, codecvt_mode mode
= {},
535 surrogates s
= surrogates::allowed
)
537 read_utf8_bom(from
, mode
);
538 while (from
.size() && to
.size())
541 const char32_t codepoint
= read_utf8_code_point(from
, maxcode
);
542 if (codepoint
== incomplete_mb_character
)
544 if (s
== surrogates::allowed
)
545 return codecvt_base::partial
;
547 return codecvt_base::error
; // No surrogates in UCS2
549 if (codepoint
> maxcode
)
550 return codecvt_base::error
;
551 if (!write_utf16_code_point(to
, codepoint
, mode
))
553 from
= orig
; // rewind to previous position
554 return codecvt_base::partial
;
557 return codecvt_base::ok
;
560 // utf16 -> utf8 (or ucs2 -> utf8 if s == surrogates::disallowed)
561 template<typename C16
, typename C8
>
563 utf16_out(range
<const C16
>& from
, range
<C8
>& to
,
564 unsigned long maxcode
= max_code_point
, codecvt_mode mode
= {},
565 surrogates s
= surrogates::allowed
)
567 if (!write_utf8_bom(to
, mode
))
568 return codecvt_base::partial
;
571 char32_t c
= from
[0];
573 if (is_high_surrogate(c
))
575 if (s
== surrogates::disallowed
)
576 return codecvt_base::error
; // No surrogates in UCS-2
579 return codecvt_base::ok
; // stop converting at this point
581 const char32_t c2
= from
[1];
582 if (is_low_surrogate(c2
))
584 c
= surrogate_pair_to_code_point(c
, c2
);
588 return codecvt_base::error
;
590 else if (is_low_surrogate(c
))
591 return codecvt_base::error
;
593 return codecvt_base::error
;
594 if (!write_utf8_code_point(to
, c
))
595 return codecvt_base::partial
;
598 return codecvt_base::ok
;
601 // return pos such that [begin,pos) is valid UTF-16 string no longer than max
604 utf16_span(const C
* begin
, const C
* end
, size_t max
,
605 char32_t maxcode
= max_code_point
, codecvt_mode mode
= {})
607 range
<const C
> from
{ begin
, end
};
608 read_utf8_bom(from
, mode
);
610 while (count
+1 < max
)
612 char32_t c
= read_utf8_code_point(from
, maxcode
);
615 else if (c
> max_single_utf16_unit
)
619 if (count
+1 == max
) // take one more character if it fits in a single unit
620 read_utf8_code_point(from
, std::min(max_single_utf16_unit
, maxcode
));
627 ucs2_in(range
<const C
>& from
, range
<char16_t
>& to
,
628 char32_t maxcode
= max_code_point
, codecvt_mode mode
= {})
630 // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
631 maxcode
= std::min(max_single_utf16_unit
, maxcode
);
632 return utf16_in(from
, to
, maxcode
, mode
, surrogates::disallowed
);
638 ucs2_out(range
<const char16_t
>& from
, range
<C
>& to
,
639 char32_t maxcode
= max_code_point
, codecvt_mode mode
= {})
641 // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
642 maxcode
= std::min(max_single_utf16_unit
, maxcode
);
643 return utf16_out(from
, to
, maxcode
, mode
, surrogates::disallowed
);
648 ucs2_out(range
<const char16_t
>& from
, range
<char16_t
, false>& to
,
649 char32_t maxcode
= max_code_point
, codecvt_mode mode
= {})
651 if (!write_utf16_bom(to
, mode
))
652 return codecvt_base::partial
;
653 while (from
.size() && to
.size())
655 char16_t c
= from
[0];
656 if (is_high_surrogate(c
))
657 return codecvt_base::error
;
659 return codecvt_base::error
;
660 to
= adjust_byte_order(c
, mode
);
663 return from
.size() == 0 ? codecvt_base::ok
: codecvt_base::partial
;
668 ucs2_in(range
<const char16_t
, false>& from
, range
<char16_t
>& to
,
669 char32_t maxcode
= max_code_point
, codecvt_mode mode
= {})
671 read_utf16_bom(from
, mode
);
672 // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
673 maxcode
= std::min(max_single_utf16_unit
, maxcode
);
674 while (from
.size() && to
.size())
676 const char32_t c
= read_utf16_code_point(from
, maxcode
, mode
);
677 if (c
== incomplete_mb_character
)
678 return codecvt_base::error
; // UCS-2 only supports single units.
680 return codecvt_base::error
;
683 return from
.size() == 0 ? codecvt_base::ok
: codecvt_base::partial
;
687 ucs2_span(range
<const char16_t
, false>& from
, size_t max
,
688 char32_t maxcode
, codecvt_mode mode
)
690 read_utf16_bom(from
, mode
);
691 // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
692 maxcode
= std::min(max_single_utf16_unit
, maxcode
);
694 while (max
-- && c
<= maxcode
)
695 c
= read_utf16_code_point(from
, maxcode
, mode
);
696 return reinterpret_cast<const char16_t
*>(from
.next
);
701 ucs2_span(const C
* begin
, const C
* end
, size_t max
,
702 char32_t maxcode
, codecvt_mode mode
)
704 range
<const C
> from
{ begin
, end
};
705 read_utf8_bom(from
, mode
);
706 // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
707 maxcode
= std::min(max_single_utf16_unit
, maxcode
);
709 while (max
-- && c
<= maxcode
)
710 c
= read_utf8_code_point(from
, maxcode
);
714 // return pos such that [begin,pos) is valid UCS-4 string no longer than max
717 ucs4_span(const C
* begin
, const C
* end
, size_t max
,
718 char32_t maxcode
= max_code_point
, codecvt_mode mode
= {})
720 range
<const C
> from
{ begin
, end
};
721 read_utf8_bom(from
, mode
);
723 while (max
-- && c
<= maxcode
)
724 c
= read_utf8_code_point(from
, maxcode
);
728 // return pos such that [begin,pos) is valid UCS-4 string no longer than max
730 ucs4_span(range
<const char16_t
, false>& from
, size_t max
,
731 char32_t maxcode
= max_code_point
, codecvt_mode mode
= {})
733 read_utf16_bom(from
, mode
);
735 while (max
-- && c
<= maxcode
)
736 c
= read_utf16_code_point(from
, maxcode
, mode
);
737 return reinterpret_cast<const char16_t
*>(from
.next
);
741 // Define members of codecvt<char16_t, char, mbstate_t> specialization.
742 // Converts from UTF-8 to UTF-16.
744 locale::id codecvt
<char16_t
, char, mbstate_t>::id
;
746 codecvt
<char16_t
, char, mbstate_t>::~codecvt() { }
749 codecvt
<char16_t
, char, mbstate_t>::
751 const intern_type
* __from
,
752 const intern_type
* __from_end
, const intern_type
*& __from_next
,
753 extern_type
* __to
, extern_type
* __to_end
,
754 extern_type
*& __to_next
) const
756 range
<const char16_t
> from
{ __from
, __from_end
};
757 range
<char> to
{ __to
, __to_end
};
758 auto res
= utf16_out(from
, to
);
759 __from_next
= from
.next
;
765 codecvt
<char16_t
, char, mbstate_t>::
766 do_unshift(state_type
&, extern_type
* __to
, extern_type
*,
767 extern_type
*& __to_next
) const
770 return noconv
; // we don't use mbstate_t for the unicode facets
774 codecvt
<char16_t
, char, mbstate_t>::
775 do_in(state_type
&, const extern_type
* __from
, const extern_type
* __from_end
,
776 const extern_type
*& __from_next
,
777 intern_type
* __to
, intern_type
* __to_end
,
778 intern_type
*& __to_next
) const
780 range
<const char> from
{ __from
, __from_end
};
781 range
<char16_t
> to
{ __to
, __to_end
};
782 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
783 codecvt_mode mode
= {};
785 codecvt_mode mode
= little_endian
;
787 auto res
= utf16_in(from
, to
, max_code_point
, mode
);
788 __from_next
= from
.next
;
794 codecvt
<char16_t
, char, mbstate_t>::do_encoding() const throw()
795 { return 0; } // UTF-8 is not a fixed-width encoding
798 codecvt
<char16_t
, char, mbstate_t>::do_always_noconv() const throw()
802 codecvt
<char16_t
, char, mbstate_t>::
803 do_length(state_type
&, const extern_type
* __from
,
804 const extern_type
* __end
, size_t __max
) const
806 __end
= utf16_span(__from
, __end
, __max
);
807 return __end
- __from
;
811 codecvt
<char16_t
, char, mbstate_t>::do_max_length() const throw()
813 // A single character (one or two UTF-16 code units) requires
814 // up to four UTF-8 code units.
818 // Define members of codecvt<char32_t, char, mbstate_t> specialization.
819 // Converts from UTF-8 to UTF-32 (aka UCS-4).
821 locale::id codecvt
<char32_t
, char, mbstate_t>::id
;
823 codecvt
<char32_t
, char, mbstate_t>::~codecvt() { }
826 codecvt
<char32_t
, char, mbstate_t>::
827 do_out(state_type
&, const intern_type
* __from
, const intern_type
* __from_end
,
828 const intern_type
*& __from_next
,
829 extern_type
* __to
, extern_type
* __to_end
,
830 extern_type
*& __to_next
) const
832 range
<const char32_t
> from
{ __from
, __from_end
};
833 range
<char> to
{ __to
, __to_end
};
834 auto res
= ucs4_out(from
, to
);
835 __from_next
= from
.next
;
841 codecvt
<char32_t
, char, mbstate_t>::
842 do_unshift(state_type
&, extern_type
* __to
, extern_type
*,
843 extern_type
*& __to_next
) const
850 codecvt
<char32_t
, char, mbstate_t>::
851 do_in(state_type
&, const extern_type
* __from
, const extern_type
* __from_end
,
852 const extern_type
*& __from_next
,
853 intern_type
* __to
, intern_type
* __to_end
,
854 intern_type
*& __to_next
) const
856 range
<const char> from
{ __from
, __from_end
};
857 range
<char32_t
> to
{ __to
, __to_end
};
858 auto res
= ucs4_in(from
, to
);
859 __from_next
= from
.next
;
865 codecvt
<char32_t
, char, mbstate_t>::do_encoding() const throw()
866 { return 0; } // UTF-8 is not a fixed-width encoding
869 codecvt
<char32_t
, char, mbstate_t>::do_always_noconv() const throw()
873 codecvt
<char32_t
, char, mbstate_t>::
874 do_length(state_type
&, const extern_type
* __from
,
875 const extern_type
* __end
, size_t __max
) const
877 __end
= ucs4_span(__from
, __end
, __max
);
878 return __end
- __from
;
882 codecvt
<char32_t
, char, mbstate_t>::do_max_length() const throw()
884 // A single character (one UTF-32 code unit) requires
885 // up to 4 UTF-8 code units.
889 #if defined(_GLIBCXX_USE_CHAR8_T)
890 // Define members of codecvt<char16_t, char8_t, mbstate_t> specialization.
891 // Converts from UTF-8 to UTF-16.
893 locale::id codecvt
<char16_t
, char8_t
, mbstate_t>::id
;
895 codecvt
<char16_t
, char8_t
, mbstate_t>::~codecvt() { }
898 codecvt
<char16_t
, char8_t
, mbstate_t>::
900 const intern_type
* __from
,
901 const intern_type
* __from_end
, const intern_type
*& __from_next
,
902 extern_type
* __to
, extern_type
* __to_end
,
903 extern_type
*& __to_next
) const
905 range
<const char16_t
> from
{ __from
, __from_end
};
906 range
<char8_t
> to
{ __to
, __to_end
};
907 auto res
= utf16_out(from
, to
);
908 __from_next
= from
.next
;
914 codecvt
<char16_t
, char8_t
, mbstate_t>::
915 do_unshift(state_type
&, extern_type
* __to
, extern_type
*,
916 extern_type
*& __to_next
) const
919 return noconv
; // we don't use mbstate_t for the unicode facets
923 codecvt
<char16_t
, char8_t
, mbstate_t>::
924 do_in(state_type
&, const extern_type
* __from
, const extern_type
* __from_end
,
925 const extern_type
*& __from_next
,
926 intern_type
* __to
, intern_type
* __to_end
,
927 intern_type
*& __to_next
) const
929 range
<const char8_t
> from
{ __from
, __from_end
};
930 range
<char16_t
> to
{ __to
, __to_end
};
931 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
932 codecvt_mode mode
= {};
934 codecvt_mode mode
= little_endian
;
936 auto res
= utf16_in(from
, to
, max_code_point
, mode
);
937 __from_next
= from
.next
;
943 codecvt
<char16_t
, char8_t
, mbstate_t>::do_encoding() const throw()
944 { return 0; } // UTF-8 is not a fixed-width encoding
947 codecvt
<char16_t
, char8_t
, mbstate_t>::do_always_noconv() const throw()
951 codecvt
<char16_t
, char8_t
, mbstate_t>::
952 do_length(state_type
&, const extern_type
* __from
,
953 const extern_type
* __end
, size_t __max
) const
955 __end
= utf16_span(__from
, __end
, __max
);
956 return __end
- __from
;
960 codecvt
<char16_t
, char8_t
, mbstate_t>::do_max_length() const throw()
962 // A single character (one or two UTF-16 code units) requires
963 // up to four UTF-8 code units.
967 // Define members of codecvt<char32_t, char8_t, mbstate_t> specialization.
968 // Converts from UTF-8 to UTF-32 (aka UCS-4).
970 locale::id codecvt
<char32_t
, char8_t
, mbstate_t>::id
;
972 codecvt
<char32_t
, char8_t
, mbstate_t>::~codecvt() { }
975 codecvt
<char32_t
, char8_t
, mbstate_t>::
976 do_out(state_type
&, const intern_type
* __from
, const intern_type
* __from_end
,
977 const intern_type
*& __from_next
,
978 extern_type
* __to
, extern_type
* __to_end
,
979 extern_type
*& __to_next
) const
981 range
<const char32_t
> from
{ __from
, __from_end
};
982 range
<char8_t
> to
{ __to
, __to_end
};
983 auto res
= ucs4_out(from
, to
);
984 __from_next
= from
.next
;
990 codecvt
<char32_t
, char8_t
, mbstate_t>::
991 do_unshift(state_type
&, extern_type
* __to
, extern_type
*,
992 extern_type
*& __to_next
) const
999 codecvt
<char32_t
, char8_t
, mbstate_t>::
1000 do_in(state_type
&, const extern_type
* __from
, const extern_type
* __from_end
,
1001 const extern_type
*& __from_next
,
1002 intern_type
* __to
, intern_type
* __to_end
,
1003 intern_type
*& __to_next
) const
1005 range
<const char8_t
> from
{ __from
, __from_end
};
1006 range
<char32_t
> to
{ __to
, __to_end
};
1007 auto res
= ucs4_in(from
, to
);
1008 __from_next
= from
.next
;
1009 __to_next
= to
.next
;
1014 codecvt
<char32_t
, char8_t
, mbstate_t>::do_encoding() const throw()
1015 { return 0; } // UTF-8 is not a fixed-width encoding
1018 codecvt
<char32_t
, char8_t
, mbstate_t>::do_always_noconv() const throw()
1022 codecvt
<char32_t
, char8_t
, mbstate_t>::
1023 do_length(state_type
&, const extern_type
* __from
,
1024 const extern_type
* __end
, size_t __max
) const
1026 __end
= ucs4_span(__from
, __end
, __max
);
1027 return __end
- __from
;
1031 codecvt
<char32_t
, char8_t
, mbstate_t>::do_max_length() const throw()
1033 // A single character (one UTF-32 code unit) requires
1034 // up to 4 UTF-8 code units.
1037 #endif // _GLIBCXX_USE_CHAR8_T
1039 // Define members of codecvt_utf8<char16_t> base class implementation.
1040 // Converts from UTF-8 to UCS-2.
1042 __codecvt_utf8_base
<char16_t
>::~__codecvt_utf8_base() { }
1044 codecvt_base::result
1045 __codecvt_utf8_base
<char16_t
>::
1046 do_out(state_type
&, const intern_type
* __from
, const intern_type
* __from_end
,
1047 const intern_type
*& __from_next
,
1048 extern_type
* __to
, extern_type
* __to_end
,
1049 extern_type
*& __to_next
) const
1051 range
<const char16_t
> from
{ __from
, __from_end
};
1052 range
<char> to
{ __to
, __to_end
};
1053 auto res
= ucs2_out(from
, to
, _M_maxcode
, _M_mode
);
1054 __from_next
= from
.next
;
1055 __to_next
= to
.next
;
1059 codecvt_base::result
1060 __codecvt_utf8_base
<char16_t
>::
1061 do_unshift(state_type
&, extern_type
* __to
, extern_type
*,
1062 extern_type
*& __to_next
) const
1068 codecvt_base::result
1069 __codecvt_utf8_base
<char16_t
>::
1070 do_in(state_type
&, const extern_type
* __from
, const extern_type
* __from_end
,
1071 const extern_type
*& __from_next
,
1072 intern_type
* __to
, intern_type
* __to_end
,
1073 intern_type
*& __to_next
) const
1075 range
<const char> from
{ __from
, __from_end
};
1076 range
<char16_t
> to
{ __to
, __to_end
};
1077 codecvt_mode mode
= codecvt_mode(_M_mode
& (consume_header
|generate_header
));
1078 #if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
1079 mode
= codecvt_mode(mode
| little_endian
);
1081 auto res
= ucs2_in(from
, to
, _M_maxcode
, mode
);
1082 __from_next
= from
.next
;
1083 __to_next
= to
.next
;
1088 __codecvt_utf8_base
<char16_t
>::do_encoding() const throw()
1089 { return 0; } // UTF-8 is not a fixed-width encoding
1092 __codecvt_utf8_base
<char16_t
>::do_always_noconv() const throw()
1096 __codecvt_utf8_base
<char16_t
>::
1097 do_length(state_type
&, const extern_type
* __from
,
1098 const extern_type
* __end
, size_t __max
) const
1100 __end
= ucs2_span(__from
, __end
, __max
, _M_maxcode
, _M_mode
);
1101 return __end
- __from
;
1105 __codecvt_utf8_base
<char16_t
>::do_max_length() const throw()
1107 // A single UCS-2 character requires up to three UTF-8 code units.
1108 // (UCS-2 cannot represent characters that use four UTF-8 code units).
1110 if (_M_mode
& consume_header
)
1111 max
+= sizeof(utf8_bom
);
1115 // Define members of codecvt_utf8<char32_t> base class implementation.
1116 // Converts from UTF-8 to UTF-32 (aka UCS-4).
1118 __codecvt_utf8_base
<char32_t
>::~__codecvt_utf8_base() { }
1120 codecvt_base::result
1121 __codecvt_utf8_base
<char32_t
>::
1122 do_out(state_type
&, const intern_type
* __from
, const intern_type
* __from_end
,
1123 const intern_type
*& __from_next
,
1124 extern_type
* __to
, extern_type
* __to_end
,
1125 extern_type
*& __to_next
) const
1127 range
<const char32_t
> from
{ __from
, __from_end
};
1128 range
<char> to
{ __to
, __to_end
};
1129 auto res
= ucs4_out(from
, to
, _M_maxcode
, _M_mode
);
1130 __from_next
= from
.next
;
1131 __to_next
= to
.next
;
1135 codecvt_base::result
1136 __codecvt_utf8_base
<char32_t
>::
1137 do_unshift(state_type
&, extern_type
* __to
, extern_type
*,
1138 extern_type
*& __to_next
) const
1144 codecvt_base::result
1145 __codecvt_utf8_base
<char32_t
>::
1146 do_in(state_type
&, const extern_type
* __from
, const extern_type
* __from_end
,
1147 const extern_type
*& __from_next
,
1148 intern_type
* __to
, intern_type
* __to_end
,
1149 intern_type
*& __to_next
) const
1151 range
<const char> from
{ __from
, __from_end
};
1152 range
<char32_t
> to
{ __to
, __to_end
};
1153 auto res
= ucs4_in(from
, to
, _M_maxcode
, _M_mode
);
1154 __from_next
= from
.next
;
1155 __to_next
= to
.next
;
1160 __codecvt_utf8_base
<char32_t
>::do_encoding() const throw()
1161 { return 0; } // UTF-8 is not a fixed-width encoding
1164 __codecvt_utf8_base
<char32_t
>::do_always_noconv() const throw()
1168 __codecvt_utf8_base
<char32_t
>::
1169 do_length(state_type
&, const extern_type
* __from
,
1170 const extern_type
* __end
, size_t __max
) const
1172 __end
= ucs4_span(__from
, __end
, __max
, _M_maxcode
, _M_mode
);
1173 return __end
- __from
;
1177 __codecvt_utf8_base
<char32_t
>::do_max_length() const throw()
1179 // A single UCS-4 character requires up to four UTF-8 code units.
1181 if (_M_mode
& consume_header
)
1182 max
+= sizeof(utf8_bom
);
1186 #ifdef _GLIBCXX_USE_WCHAR_T
1188 #if __SIZEOF_WCHAR_T__ == 2
1189 static_assert(sizeof(wchar_t) == sizeof(char16_t
), "");
1190 #elif __SIZEOF_WCHAR_T__ == 4
1191 static_assert(sizeof(wchar_t) == sizeof(char32_t
), "");
1194 // Define members of codecvt_utf8<wchar_t> base class implementation.
1195 // Converts from UTF-8 to UCS-2 or UCS-4 depending on sizeof(wchar_t).
1197 __codecvt_utf8_base
<wchar_t>::~__codecvt_utf8_base() { }
1199 codecvt_base::result
1200 __codecvt_utf8_base
<wchar_t>::
1201 do_out(state_type
&, const intern_type
* __from
, const intern_type
* __from_end
,
1202 const intern_type
*& __from_next
,
1203 extern_type
* __to
, extern_type
* __to_end
,
1204 extern_type
*& __to_next
) const
1206 range
<char> to
{ __to
, __to_end
};
1207 #if __SIZEOF_WCHAR_T__ == 2
1208 range
<const char16_t
> from
{
1209 reinterpret_cast<const char16_t
*>(__from
),
1210 reinterpret_cast<const char16_t
*>(__from_end
)
1212 auto res
= ucs2_out(from
, to
, _M_maxcode
, _M_mode
);
1213 #elif __SIZEOF_WCHAR_T__ == 4
1214 range
<const char32_t
> from
{
1215 reinterpret_cast<const char32_t
*>(__from
),
1216 reinterpret_cast<const char32_t
*>(__from_end
)
1218 auto res
= ucs4_out(from
, to
, _M_maxcode
, _M_mode
);
1220 return codecvt_base::error
;
1222 __from_next
= reinterpret_cast<const wchar_t*>(from
.next
);
1223 __to_next
= to
.next
;
1227 codecvt_base::result
1228 __codecvt_utf8_base
<wchar_t>::
1229 do_unshift(state_type
&, extern_type
* __to
, extern_type
*,
1230 extern_type
*& __to_next
) const
// Converts external chars [__from, __from_end) into the internal wchar_t
// buffer [__to, __to_end).  The output pointers are reinterpreted as
// char16_t or char32_t according to __SIZEOF_WCHAR_T__ and passed to ucs2_in
// or ucs4_in; the remaining preprocessor branch returns codecvt_base::error.
// Afterwards __from_next and __to_next are set from the ranges' `next`
// members.
// NOTE(review): the 2-byte path passes a locally built `mode` (empty on
// big-endian targets, little_endian otherwise) instead of _M_mode, whereas
// the 4-byte path passes _M_mode -- confirm this difference is intended.
// NOTE(review): the declarations of `to`, the braces, the #else/#endif lines
// and the final return are not visible in this extract.
1236 codecvt_base::result
1237 __codecvt_utf8_base
<wchar_t>::
1238 do_in(state_type
&, const extern_type
* __from
, const extern_type
* __from_end
,
1239 const extern_type
*& __from_next
,
1240 intern_type
* __to
, intern_type
* __to_end
,
1241 intern_type
*& __to_next
) const
1243 range
<const char> from
{ __from
, __from_end
};
1244 #if __SIZEOF_WCHAR_T__ == 2
1246 reinterpret_cast<char16_t
*>(__to
),
1247 reinterpret_cast<char16_t
*>(__to_end
)
1249 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
1250 codecvt_mode mode
= {};
1252 codecvt_mode mode
= little_endian
;
1254 auto res
= ucs2_in(from
, to
, _M_maxcode
, mode
);
1255 #elif __SIZEOF_WCHAR_T__ == 4
1257 reinterpret_cast<char32_t
*>(__to
),
1258 reinterpret_cast<char32_t
*>(__to_end
)
1260 auto res
= ucs4_in(from
, to
, _M_maxcode
, _M_mode
);
1262 return codecvt_base::error
;
1264 __from_next
= from
.next
;
1265 __to_next
= reinterpret_cast<wchar_t*>(to
.next
);
// do_encoding override: unconditionally returns 0.
// NOTE(review): the return-type line is not visible in this extract.
1270 __codecvt_utf8_base
<wchar_t>::do_encoding() const throw()
1271 { return 0; } // UTF-8 is not a fixed-width encoding
// do_always_noconv override.
// NOTE(review): only the signature is visible in this extract; the return
// type and body are missing -- confirm against the full file.
1274 __codecvt_utf8_base
<wchar_t>::do_always_noconv() const throw()
// Calls ucs2_span or ucs4_span (chosen by __SIZEOF_WCHAR_T__) with __from,
// __end, __max, _M_maxcode and _M_mode, stores the result back into __end,
// and returns the distance __end - __from in external chars.
// NOTE(review): the return type, the braces and the #else/#endif part of the
// conditional are not visible in this extract.
1278 __codecvt_utf8_base
<wchar_t>::
1279 do_length(state_type
&, const extern_type
* __from
,
1280 const extern_type
* __end
, size_t __max
) const
1282 #if __SIZEOF_WCHAR_T__ == 2
1283 __end
= ucs2_span(__from
, __end
, __max
, _M_maxcode
, _M_mode
);
1284 #elif __SIZEOF_WCHAR_T__ == 4
1285 __end
= ucs4_span(__from
, __end
, __max
, _M_maxcode
, _M_mode
);
1289 return __end
- __from
;
// do_max_length override.  `max` starts at 3 when wchar_t is 2 bytes and at
// 4 in the other visible initialisation, and grows by sizeof(utf8_bom) when
// consume_header is set in _M_mode.
// NOTE(review): the return type, the #else/#endif lines separating the two
// initialisations and the return statement are not visible in this extract.
1293 __codecvt_utf8_base
<wchar_t>::do_max_length() const throw()
1295 #if __SIZEOF_WCHAR_T__ == 2
1296 int max
= 3; // See __codecvt_utf8_base<char16_t>::do_max_length()
1298 int max
= 4; // See __codecvt_utf8_base<char32_t>::do_max_length()
1300 if (_M_mode
& consume_header
)
1301 max
+= sizeof(utf8_bom
);
1306 // Define members of codecvt_utf16<char16_t> base class implementation.
1307 // Converts from UTF-16 to UCS-2.
// Out-of-line destructor definition with an empty body.
1309 __codecvt_utf16_base
<char16_t
>::~__codecvt_utf16_base() { }
// Writes the internal char16_t sequence [__from, __from_end) into the
// external char buffer [__to, __to_end) by calling ucs2_out with _M_maxcode
// and _M_mode.  The output buffer is viewed as range<char16_t, false>; its
// `next` pointer is cast back to char* for __to_next, and __from_next is
// taken from from.next.
// NOTE(review): the `false` template argument presumably marks the byte
// buffer as not suitably aligned for char16_t -- confirm against range's
// definition.  The braces and the return statement are not visible in this
// extract.
1311 codecvt_base::result
1312 __codecvt_utf16_base
<char16_t
>::
1313 do_out(state_type
&, const intern_type
* __from
, const intern_type
* __from_end
,
1314 const intern_type
*& __from_next
,
1315 extern_type
* __to
, extern_type
* __to_end
,
1316 extern_type
*& __to_next
) const
1318 range
<const char16_t
> from
{ __from
, __from_end
};
1319 range
<char16_t
, false> to
{ __to
, __to_end
};
1320 auto res
= ucs2_out(from
, to
, _M_maxcode
, _M_mode
);
1321 __from_next
= from
.next
;
1322 __to_next
= reinterpret_cast<char*>(to
.next
);
// do_unshift override.  The state_type& and the third extern_type* parameter
// are unnamed, so the body cannot refer to them.
// NOTE(review): only the signature is visible in this extract; the body is
// missing -- confirm against the full file.
1326 codecvt_base::result
1327 __codecvt_utf16_base
<char16_t
>::
1328 do_unshift(state_type
&, extern_type
* __to
, extern_type
*,
1329 extern_type
*& __to_next
) const
// Reads external chars [__from, __from_end), viewed as
// range<const char16_t, false>, into the internal char16_t buffer
// [__to, __to_end) via ucs2_in with _M_maxcode and _M_mode, then publishes
// the ranges' `next` pointers through __from_next and __to_next.
// NOTE(review): the braces and the return statement are not visible in this
// extract -- confirm against the full file.
1335 codecvt_base::result
1336 __codecvt_utf16_base
<char16_t
>::
1337 do_in(state_type
&, const extern_type
* __from
, const extern_type
* __from_end
,
1338 const extern_type
*& __from_next
,
1339 intern_type
* __to
, intern_type
* __to_end
,
1340 intern_type
*& __to_next
) const
1342 range
<const char16_t
, false> from
{ __from
, __from_end
};
1343 range
<char16_t
> to
{ __to
, __to_end
};
1344 auto res
= ucs2_in(from
, to
, _M_maxcode
, _M_mode
);
1345 __from_next
= reinterpret_cast<const char*>(from
.next
);
1346 __to_next
= to
.next
;
// An ok result that still leaves input unconsumed is turned into error
// (presumably a trailing partial code unit -- confirm).
1347 if (res
== codecvt_base::ok
&& __from_next
!= __from_end
)
1348 res
= codecvt_base::error
;
// do_encoding override: unconditionally returns 0.
// NOTE(review): the return-type line is not visible in this extract.
1353 __codecvt_utf16_base
<char16_t
>::do_encoding() const throw()
1354 { return 0; } // UTF-16 is not a fixed-width encoding
// do_always_noconv override.
// NOTE(review): only the signature is visible in this extract; the return
// type and body are missing -- confirm against the full file.
1357 __codecvt_utf16_base
<char16_t
>::do_always_noconv() const throw()
// Calls ucs2_span over the external bytes [__from, __end), viewed as
// range<const char16_t, false>, with __max, _M_maxcode and _M_mode, and
// returns the distance in chars from __from to the pointer it yields.
// NOTE(review): the return type and the braces are not visible in this
// extract.
1361 __codecvt_utf16_base
<char16_t
>::
1362 do_length(state_type
&, const extern_type
* __from
,
1363 const extern_type
* __end
, size_t __max
) const
1365 range
<const char16_t
, false> from
{ __from
, __end
};
1366 const char16_t
* next
= ucs2_span(from
, __max
, _M_maxcode
, _M_mode
);
1367 return reinterpret_cast<const char*>(next
) - __from
;
// do_max_length override for the UTF-16 <-> UCS-2 facet.  The visible code
// grows the bound `max` by sizeof(utf16_bom) when consume_header is set in
// _M_mode.
// NOTE(review): the return type, the initialisation of `max` and the return
// statement are not visible in this extract -- confirm against the full file.
1371 __codecvt_utf16_base
<char16_t
>::do_max_length() const throw()
1373 // A single UCS-2 character requires one UTF-16 code unit (so two chars).
1374 // (UCS-2 cannot represent characters that use multiple UTF-16 code units).
1376 if (_M_mode
& consume_header
)
1377 max
+= sizeof(utf16_bom
);
1381 // Define members of codecvt_utf16<char32_t> base class implementation.
1382 // Converts from UTF-16 to UTF-32 (aka UCS-4).
// Out-of-line destructor definition with an empty body.
1384 __codecvt_utf16_base
<char32_t
>::~__codecvt_utf16_base() { }
// Writes the internal char32_t sequence [__from, __from_end) into the
// external char buffer [__to, __to_end) by calling ucs4_out with _M_maxcode
// and _M_mode.  The output buffer is viewed as range<char16_t, false>; its
// `next` pointer is cast back to char* for __to_next, and __from_next is
// taken from from.next.
// NOTE(review): the braces and the return statement are not visible in this
// extract -- confirm against the full file.
1386 codecvt_base::result
1387 __codecvt_utf16_base
<char32_t
>::
1388 do_out(state_type
&, const intern_type
* __from
, const intern_type
* __from_end
,
1389 const intern_type
*& __from_next
,
1390 extern_type
* __to
, extern_type
* __to_end
,
1391 extern_type
*& __to_next
) const
1393 range
<const char32_t
> from
{ __from
, __from_end
};
1394 range
<char16_t
, false> to
{ __to
, __to_end
};
1395 auto res
= ucs4_out(from
, to
, _M_maxcode
, _M_mode
);
1396 __from_next
= from
.next
;
1397 __to_next
= reinterpret_cast<char*>(to
.next
);
// do_unshift override.  The state_type& and the third extern_type* parameter
// are unnamed, so the body cannot refer to them.
// NOTE(review): only the signature is visible in this extract; the body is
// missing -- confirm against the full file.
1401 codecvt_base::result
1402 __codecvt_utf16_base
<char32_t
>::
1403 do_unshift(state_type
&, extern_type
* __to
, extern_type
*,
1404 extern_type
*& __to_next
) const
// Reads external chars [__from, __from_end), viewed as
// range<const char16_t, false>, into the internal char32_t buffer
// [__to, __to_end) via ucs4_in with _M_maxcode and _M_mode, then publishes
// the ranges' `next` pointers through __from_next and __to_next.
// NOTE(review): the braces and the return statement are not visible in this
// extract -- confirm against the full file.
1410 codecvt_base::result
1411 __codecvt_utf16_base
<char32_t
>::
1412 do_in(state_type
&, const extern_type
* __from
, const extern_type
* __from_end
,
1413 const extern_type
*& __from_next
,
1414 intern_type
* __to
, intern_type
* __to_end
,
1415 intern_type
*& __to_next
) const
1417 range
<const char16_t
, false> from
{ __from
, __from_end
};
1418 range
<char32_t
> to
{ __to
, __to_end
};
1419 auto res
= ucs4_in(from
, to
, _M_maxcode
, _M_mode
);
1420 __from_next
= reinterpret_cast<const char*>(from
.next
);
1421 __to_next
= to
.next
;
// An ok result that still leaves input unconsumed is turned into error
// (presumably a trailing partial code unit -- confirm).
1422 if (res
== codecvt_base::ok
&& __from_next
!= __from_end
)
1423 res
= codecvt_base::error
;
// do_encoding override: unconditionally returns 0.
// NOTE(review): the return-type line is not visible in this extract.
1428 __codecvt_utf16_base
<char32_t
>::do_encoding() const throw()
1429 { return 0; } // UTF-16 is not a fixed-width encoding
// do_always_noconv override.
// NOTE(review): only the signature is visible in this extract; the return
// type and body are missing -- confirm against the full file.
1432 __codecvt_utf16_base
<char32_t
>::do_always_noconv() const throw()
// Calls ucs4_span over the external bytes [__from, __end), viewed as
// range<const char16_t, false>, with __max, _M_maxcode and _M_mode, and
// returns the distance in chars from __from to the pointer it yields.
// NOTE(review): the return type and the braces are not visible in this
// extract.
1436 __codecvt_utf16_base
<char32_t
>::
1437 do_length(state_type
&, const extern_type
* __from
,
1438 const extern_type
* __end
, size_t __max
) const
1440 range
<const char16_t
, false> from
{ __from
, __end
};
1441 const char16_t
* next
= ucs4_span(from
, __max
, _M_maxcode
, _M_mode
);
1442 return reinterpret_cast<const char*>(next
) - __from
;
// do_max_length override for the UTF-16 <-> UCS-4 facet.  The visible code
// grows the bound `max` by sizeof(utf16_bom) when consume_header is set in
// _M_mode.
// NOTE(review): the return type, the initialisation of `max` and the return
// statement are not visible in this extract -- confirm against the full file.
1446 __codecvt_utf16_base
<char32_t
>::do_max_length() const throw()
1448 // A single UCS-4 character requires one or two UTF-16 code units
1449 // (so up to four chars).
1451 if (_M_mode
& consume_header
)
1452 max
+= sizeof(utf16_bom
);
1456 #ifdef _GLIBCXX_USE_WCHAR_T
1457 // Define members of codecvt_utf16<wchar_t> base class implementation.
1458 // Converts from UTF-16 to UCS-2 or UCS-4 depending on sizeof(wchar_t).
// Out-of-line destructor definition with an empty body.
1460 __codecvt_utf16_base
<wchar_t>::~__codecvt_utf16_base() { }
// Writes the internal wchar_t sequence [__from, __from_end) into the
// external char buffer [__to, __to_end), which is viewed as
// range<char16_t, false>.  The wchar_t pointers are reinterpreted as
// char16_t or char32_t according to __SIZEOF_WCHAR_T__ and passed to
// ucs2_out or ucs4_out with _M_maxcode and _M_mode; the remaining
// preprocessor branch returns codecvt_base::error.  Afterwards __from_next
// and __to_next are set from the ranges' `next` members, cast back to the
// caller's pointer types.
// NOTE(review): the braces, the #else/#endif lines and the final return are
// not visible in this extract -- confirm against the full file.
1462 codecvt_base::result
1463 __codecvt_utf16_base
<wchar_t>::
1464 do_out(state_type
&, const intern_type
* __from
, const intern_type
* __from_end
,
1465 const intern_type
*& __from_next
,
1466 extern_type
* __to
, extern_type
* __to_end
,
1467 extern_type
*& __to_next
) const
1469 range
<char16_t
, false> to
{ __to
, __to_end
};
1470 #if __SIZEOF_WCHAR_T__ == 2
1471 range
<const char16_t
> from
{
1472 reinterpret_cast<const char16_t
*>(__from
),
1473 reinterpret_cast<const char16_t
*>(__from_end
),
1475 auto res
= ucs2_out(from
, to
, _M_maxcode
, _M_mode
);
1476 #elif __SIZEOF_WCHAR_T__ == 4
1477 range
<const char32_t
> from
{
1478 reinterpret_cast<const char32_t
*>(__from
),
1479 reinterpret_cast<const char32_t
*>(__from_end
),
1481 auto res
= ucs4_out(from
, to
, _M_maxcode
, _M_mode
);
1483 return codecvt_base::error
;
1485 __from_next
= reinterpret_cast<const wchar_t*>(from
.next
);
1486 __to_next
= reinterpret_cast<char*>(to
.next
);
// do_unshift override.  The state_type& and the third extern_type* parameter
// are unnamed, so the body cannot refer to them.
// NOTE(review): only the signature is visible in this extract; the body is
// missing -- confirm against the full file.
1490 codecvt_base::result
1491 __codecvt_utf16_base
<wchar_t>::
1492 do_unshift(state_type
&, extern_type
* __to
, extern_type
*,
1493 extern_type
*& __to_next
) const
// Reads external chars [__from, __from_end), viewed as
// range<const char16_t, false>, into the internal wchar_t buffer
// [__to, __to_end).  The output pointers are reinterpreted as char16_t or
// char32_t according to __SIZEOF_WCHAR_T__ and passed to ucs2_in or ucs4_in
// with _M_maxcode and _M_mode; the remaining preprocessor branch returns
// codecvt_base::error.  Afterwards __from_next and __to_next are set from
// the ranges' `next` members.
// NOTE(review): the declarations of `to`, the braces, the #else/#endif lines
// and the final return are not visible in this extract.
1499 codecvt_base::result
1500 __codecvt_utf16_base
<wchar_t>::
1501 do_in(state_type
&, const extern_type
* __from
, const extern_type
* __from_end
,
1502 const extern_type
*& __from_next
,
1503 intern_type
* __to
, intern_type
* __to_end
,
1504 intern_type
*& __to_next
) const
1506 range
<const char16_t
, false> from
{ __from
, __from_end
};
1507 #if __SIZEOF_WCHAR_T__ == 2
1509 reinterpret_cast<char16_t
*>(__to
),
1510 reinterpret_cast<char16_t
*>(__to_end
),
1512 auto res
= ucs2_in(from
, to
, _M_maxcode
, _M_mode
);
1513 #elif __SIZEOF_WCHAR_T__ == 4
1515 reinterpret_cast<char32_t
*>(__to
),
1516 reinterpret_cast<char32_t
*>(__to_end
),
1518 auto res
= ucs4_in(from
, to
, _M_maxcode
, _M_mode
);
1520 return codecvt_base::error
;
1522 __from_next
= reinterpret_cast<const char*>(from
.next
);
1523 __to_next
= reinterpret_cast<wchar_t*>(to
.next
);
// An ok result that still leaves input unconsumed is turned into error
// (presumably a trailing partial code unit -- confirm).
1524 if (res
== codecvt_base::ok
&& __from_next
!= __from_end
)
1525 res
= codecvt_base::error
;
// do_encoding override: unconditionally returns 0.
// NOTE(review): the return-type line is not visible in this extract.
1530 __codecvt_utf16_base
<wchar_t>::do_encoding() const throw()
1531 { return 0; } // UTF-16 is not a fixed-width encoding
// do_always_noconv override.
// NOTE(review): only the signature is visible in this extract; the return
// type and body are missing -- confirm against the full file.
1534 __codecvt_utf16_base
<wchar_t>::do_always_noconv() const throw()
// Views the external bytes [__from, __end) as range<const char16_t, false>,
// calls ucs2_span or ucs4_span (chosen by __SIZEOF_WCHAR_T__) with __max,
// _M_maxcode and _M_mode, and returns the distance in chars from __from to
// the pointer that call yields.
// NOTE(review): the return type, the braces and the #else/#endif part of the
// conditional are not visible in this extract.
1538 __codecvt_utf16_base
<wchar_t>::
1539 do_length(state_type
&, const extern_type
* __from
,
1540 const extern_type
* __end
, size_t __max
) const
1542 range
<const char16_t
, false> from
{ __from
, __end
};
1543 #if __SIZEOF_WCHAR_T__ == 2
1544 const char16_t
* next
= ucs2_span(from
, __max
, _M_maxcode
, _M_mode
);
1545 #elif __SIZEOF_WCHAR_T__ == 4
1546 const char16_t
* next
= ucs4_span(from
, __max
, _M_maxcode
, _M_mode
);
1548 return reinterpret_cast<const char*>(next
) - __from
;
// do_max_length override.  `max` starts at 2 when wchar_t is 2 bytes and at
// 4 in the other visible initialisation, and grows by sizeof(utf16_bom) when
// consume_header is set in _M_mode.
// NOTE(review): the return type, the #else/#endif lines separating the two
// initialisations and the return statement are not visible in this extract.
1552 __codecvt_utf16_base
<wchar_t>::do_max_length() const throw()
1554 #if __SIZEOF_WCHAR_T__ == 2
1555 int max
= 2; // See __codecvt_utf16_base<char16_t>::do_max_length()
1557 int max
= 4; // See __codecvt_utf16_base<char32_t>::do_max_length()
1559 if (_M_mode
& consume_header
)
1560 max
+= sizeof(utf16_bom
);
1565 // Define members of codecvt_utf8_utf16<char16_t> base class implementation.
1566 // Converts from UTF-8 to UTF-16.
// Out-of-line destructor definition with an empty body.
1568 __codecvt_utf8_utf16_base
<char16_t
>::~__codecvt_utf8_utf16_base() { }
// Converts the internal char16_t sequence [__from, __from_end) to external
// chars in [__to, __to_end) by calling utf16_out with _M_maxcode and
// _M_mode, then sets __from_next and __to_next from the ranges' `next`
// members.
// NOTE(review): the braces and the return statement are not visible in this
// extract -- confirm against the full file.
1570 codecvt_base::result
1571 __codecvt_utf8_utf16_base
<char16_t
>::
1572 do_out(state_type
&, const intern_type
* __from
, const intern_type
* __from_end
,
1573 const intern_type
*& __from_next
,
1574 extern_type
* __to
, extern_type
* __to_end
,
1575 extern_type
*& __to_next
) const
1577 range
<const char16_t
> from
{ __from
, __from_end
};
1578 range
<char> to
{ __to
, __to_end
};
1579 auto res
= utf16_out(from
, to
, _M_maxcode
, _M_mode
);
1580 __from_next
= from
.next
;
1581 __to_next
= to
.next
;
// do_unshift override.  The state_type& and the third extern_type* parameter
// are unnamed, so the body cannot refer to them.
// NOTE(review): only the signature is visible in this extract; the body is
// missing -- confirm against the full file.
1585 codecvt_base::result
1586 __codecvt_utf8_utf16_base
<char16_t
>::
1587 do_unshift(state_type
&, extern_type
* __to
, extern_type
*,
1588 extern_type
*& __to_next
) const
// Converts external chars [__from, __from_end) into the internal char16_t
// buffer [__to, __to_end) by calling utf16_in with _M_maxcode and a locally
// built mode, then sets __from_next and __to_next from the ranges' `next`
// members.
// NOTE(review): the braces, the #endif line and the return statement are not
// visible in this extract -- confirm against the full file.
1594 codecvt_base::result
1595 __codecvt_utf8_utf16_base
<char16_t
>::
1596 do_in(state_type
&, const extern_type
* __from
, const extern_type
* __from_end
,
1597 const extern_type
*& __from_next
,
1598 intern_type
* __to
, intern_type
* __to_end
,
1599 intern_type
*& __to_next
) const
1601 range
<const char> from
{ __from
, __from_end
};
1602 range
<char16_t
> to
{ __to
, __to_end
};
// Keep only the header bits of _M_mode; little_endian is decided below by
// the compile target's byte order rather than by the caller's mode.
1603 codecvt_mode mode
= codecvt_mode(_M_mode
& (consume_header
|generate_header
));
1604 #if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
1605 mode
= codecvt_mode(mode
| little_endian
);
1607 auto res
= utf16_in(from
, to
, _M_maxcode
, mode
);
1608 __from_next
= from
.next
;
1609 __to_next
= to
.next
;
// do_encoding override: unconditionally returns 0.
// NOTE(review): the return-type line is not visible in this extract.
1614 __codecvt_utf8_utf16_base
<char16_t
>::do_encoding() const throw()
1615 { return 0; } // UTF-8 is not a fixed-width encoding
// do_always_noconv override.
// NOTE(review): only the signature is visible in this extract; the return
// type and body are missing -- confirm against the full file.
1618 __codecvt_utf8_utf16_base
<char16_t
>::do_always_noconv() const throw()
// Calls utf16_span with __from, __end, __max, _M_maxcode and _M_mode, stores
// the result back into __end, and returns the distance __end - __from in
// external chars.
// NOTE(review): the return type and the braces are not visible in this
// extract.
1622 __codecvt_utf8_utf16_base
<char16_t
>::
1623 do_length(state_type
&, const extern_type
* __from
,
1624 const extern_type
* __end
, size_t __max
) const
1626 __end
= utf16_span(__from
, __end
, __max
, _M_maxcode
, _M_mode
);
1627 return __end
- __from
;
// do_max_length override for the UTF-8 <-> UTF-16 facet.  The visible code
// grows the bound `max` by sizeof(utf8_bom) when consume_header is set in
// _M_mode.
// NOTE(review): the return type, the initialisation of `max` and the return
// statement are not visible in this extract -- confirm against the full file.
1631 __codecvt_utf8_utf16_base
<char16_t
>::do_max_length() const throw()
1633 // A single character can be 1 or 2 UTF-16 code units,
1634 // requiring up to 4 UTF-8 code units.
1636 if (_M_mode
& consume_header
)
1637 max
+= sizeof(utf8_bom
);
1641 // Define members of codecvt_utf8_utf16<char32_t> base class implementation.
1642 // Converts from UTF-8 to UTF-16.
// Out-of-line destructor definition with an empty body.
1644 __codecvt_utf8_utf16_base
<char32_t
>::~__codecvt_utf8_utf16_base() { }
// Converts the internal char32_t sequence [__from, __from_end) to external
// chars in [__to, __to_end) by calling utf16_out with _M_maxcode and
// _M_mode, then sets __from_next and __to_next from the ranges' `next`
// members.
// NOTE(review): the braces and the return statement are not visible in this
// extract -- confirm against the full file.
1646 codecvt_base::result
1647 __codecvt_utf8_utf16_base
<char32_t
>::
1648 do_out(state_type
&, const intern_type
* __from
, const intern_type
* __from_end
,
1649 const intern_type
*& __from_next
,
1650 extern_type
* __to
, extern_type
* __to_end
,
1651 extern_type
*& __to_next
) const
1653 range
<const char32_t
> from
{ __from
, __from_end
};
1654 range
<char> to
{ __to
, __to_end
};
1655 auto res
= utf16_out(from
, to
, _M_maxcode
, _M_mode
);
1656 __from_next
= from
.next
;
1657 __to_next
= to
.next
;
// do_unshift override.  The state_type& and the third extern_type* parameter
// are unnamed, so the body cannot refer to them.
// NOTE(review): only the signature is visible in this extract; the body is
// missing -- confirm against the full file.
1661 codecvt_base::result
1662 __codecvt_utf8_utf16_base
<char32_t
>::
1663 do_unshift(state_type
&, extern_type
* __to
, extern_type
*,
1664 extern_type
*& __to_next
) const
// Converts external chars [__from, __from_end) into the internal char32_t
// buffer [__to, __to_end) by calling utf16_in with _M_maxcode and a locally
// built mode, then sets __from_next and __to_next from the ranges' `next`
// members.
// NOTE(review): the braces, the #endif line and the return statement are not
// visible in this extract -- confirm against the full file.
1670 codecvt_base::result
1671 __codecvt_utf8_utf16_base
<char32_t
>::
1672 do_in(state_type
&, const extern_type
* __from
, const extern_type
* __from_end
,
1673 const extern_type
*& __from_next
,
1674 intern_type
* __to
, intern_type
* __to_end
,
1675 intern_type
*& __to_next
) const
1677 range
<const char> from
{ __from
, __from_end
};
1678 range
<char32_t
> to
{ __to
, __to_end
};
// Keep only the header bits of _M_mode; little_endian is decided below by
// the compile target's byte order rather than by the caller's mode.
1679 codecvt_mode mode
= codecvt_mode(_M_mode
& (consume_header
|generate_header
));
1680 #if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
1681 mode
= codecvt_mode(mode
| little_endian
);
1683 auto res
= utf16_in(from
, to
, _M_maxcode
, mode
);
1684 __from_next
= from
.next
;
1685 __to_next
= to
.next
;
// do_encoding override: unconditionally returns 0.
// NOTE(review): the return-type line is not visible in this extract.
1690 __codecvt_utf8_utf16_base
<char32_t
>::do_encoding() const throw()
1691 { return 0; } // UTF-8 is not a fixed-width encoding
// do_always_noconv override.
// NOTE(review): only the signature is visible in this extract; the return
// type and body are missing -- confirm against the full file.
1694 __codecvt_utf8_utf16_base
<char32_t
>::do_always_noconv() const throw()
// Calls utf16_span with __from, __end, __max, _M_maxcode and _M_mode, stores
// the result back into __end, and returns the distance __end - __from in
// external chars.
// NOTE(review): the return type and the braces are not visible in this
// extract.
1698 __codecvt_utf8_utf16_base
<char32_t
>::
1699 do_length(state_type
&, const extern_type
* __from
,
1700 const extern_type
* __end
, size_t __max
) const
1702 __end
= utf16_span(__from
, __end
, __max
, _M_maxcode
, _M_mode
);
1703 return __end
- __from
;
// do_max_length override for the UTF-8 <-> UTF-16 facet with char32_t as the
// internal type.  The visible code grows the bound `max` by sizeof(utf8_bom)
// when consume_header is set in _M_mode.
// NOTE(review): the return type, the initialisation of `max` and the return
// statement are not visible in this extract -- confirm against the full file.
1707 __codecvt_utf8_utf16_base
<char32_t
>::do_max_length() const throw()
1709 // A single character can be 1 or 2 UTF-16 code units,
1710 // requiring up to 4 UTF-8 code units.
1712 if (_M_mode
& consume_header
)
1713 max
+= sizeof(utf8_bom
);
1717 #ifdef _GLIBCXX_USE_WCHAR_T
1718 // Define members of codecvt_utf8_utf16<wchar_t> base class implementation.
1719 // Converts from UTF-8 to UTF-16.
// Out-of-line destructor definition with an empty body.
1721 __codecvt_utf8_utf16_base
<wchar_t>::~__codecvt_utf8_utf16_base() { }
// Converts the internal wchar_t sequence [__from, __from_end) to external
// chars in [__to, __to_end) by calling utf16_out with _M_maxcode and
// _M_mode.  Unlike the other wchar_t facets in this file, the input is
// wrapped as range<const wchar_t> directly, without a reinterpret_cast.
// __from_next and __to_next are then set from the ranges' `next` members.
// NOTE(review): the braces and the return statement are not visible in this
// extract -- confirm against the full file.
1723 codecvt_base::result
1724 __codecvt_utf8_utf16_base
<wchar_t>::
1725 do_out(state_type
&, const intern_type
* __from
, const intern_type
* __from_end
,
1726 const intern_type
*& __from_next
,
1727 extern_type
* __to
, extern_type
* __to_end
,
1728 extern_type
*& __to_next
) const
1730 range
<const wchar_t> from
{ __from
, __from_end
};
1731 range
<char> to
{ __to
, __to_end
};
1732 auto res
= utf16_out(from
, to
, _M_maxcode
, _M_mode
);
1733 __from_next
= from
.next
;
1734 __to_next
= to
.next
;
// do_unshift override.  The state_type& and the third extern_type* parameter
// are unnamed, so the body cannot refer to them.
// NOTE(review): only the signature is visible in this extract; the body is
// missing -- confirm against the full file.
1738 codecvt_base::result
1739 __codecvt_utf8_utf16_base
<wchar_t>::
1740 do_unshift(state_type
&, extern_type
* __to
, extern_type
*,
1741 extern_type
*& __to_next
) const
// Converts external chars [__from, __from_end) into the internal wchar_t
// buffer [__to, __to_end), wrapped as range<wchar_t> directly, by calling
// utf16_in with _M_maxcode and a locally built mode.  __from_next and
// __to_next are then set from the ranges' `next` members.
// NOTE(review): the braces, the #endif line and the return statement are not
// visible in this extract -- confirm against the full file.
1747 codecvt_base::result
1748 __codecvt_utf8_utf16_base
<wchar_t>::
1749 do_in(state_type
&, const extern_type
* __from
, const extern_type
* __from_end
,
1750 const extern_type
*& __from_next
,
1751 intern_type
* __to
, intern_type
* __to_end
,
1752 intern_type
*& __to_next
) const
1754 range
<const char> from
{ __from
, __from_end
};
1755 range
<wchar_t> to
{ __to
, __to_end
};
// Keep only the header bits of _M_mode; little_endian is decided below by
// the compile target's byte order rather than by the caller's mode.
1756 codecvt_mode mode
= codecvt_mode(_M_mode
& (consume_header
|generate_header
));
1757 #if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
1758 mode
= codecvt_mode(mode
| little_endian
);
1760 auto res
= utf16_in(from
, to
, _M_maxcode
, mode
);
1761 __from_next
= from
.next
;
1762 __to_next
= to
.next
;
// do_encoding override: unconditionally returns 0.
// NOTE(review): the return-type line is not visible in this extract.
1767 __codecvt_utf8_utf16_base
<wchar_t>::do_encoding() const throw()
1768 { return 0; } // UTF-8 is not a fixed-width encoding
// do_always_noconv override.
// NOTE(review): only the signature is visible in this extract; the return
// type and body are missing -- confirm against the full file.
1771 __codecvt_utf8_utf16_base
<wchar_t>::do_always_noconv() const throw()
// Calls utf16_span with __from, __end, __max, _M_maxcode and _M_mode, stores
// the result back into __end, and returns the distance __end - __from in
// external chars.
// NOTE(review): the return type and the braces are not visible in this
// extract.
1775 __codecvt_utf8_utf16_base
<wchar_t>::
1776 do_length(state_type
&, const extern_type
* __from
,
1777 const extern_type
* __end
, size_t __max
) const
1779 __end
= utf16_span(__from
, __end
, __max
, _M_maxcode
, _M_mode
);
1780 return __end
- __from
;
// do_max_length override for the UTF-8 <-> UTF-16 facet with wchar_t as the
// internal type.  The visible code grows the bound `max` by sizeof(utf8_bom)
// when consume_header is set in _M_mode.
// NOTE(review): the return type, the initialisation of `max` and the return
// statement are not visible in this extract -- confirm against the full file.
1784 __codecvt_utf8_utf16_base
<wchar_t>::do_max_length() const throw()
1786 // A single character can be 1 or 2 UTF-16 code units,
1787 // requiring up to 4 UTF-8 code units.
1789 if (_M_mode
& consume_header
)
1790 max
+= sizeof(utf8_bom
);
// Explicit instantiations of __codecvt_abstract_base and codecvt_byname for
// char16_t and char32_t internal types with char as the external type and
// mbstate_t as the state type; the same four are repeated with char8_t as
// the external type when _GLIBCXX_USE_CHAR8_T is defined.
// NOTE(review): `inline template class` is not standard C++; it is presumably
// the GNU explicit-instantiation extension -- confirm against the GCC manual.
// The #endif closing the char8_t section is not visible in this extract.
1795 inline template class __codecvt_abstract_base
<char16_t
, char, mbstate_t>;
1796 inline template class __codecvt_abstract_base
<char32_t
, char, mbstate_t>;
1797 template class codecvt_byname
<char16_t
, char, mbstate_t>;
1798 template class codecvt_byname
<char32_t
, char, mbstate_t>;
1800 #if defined(_GLIBCXX_USE_CHAR8_T)
1801 inline template class __codecvt_abstract_base
<char16_t
, char8_t
, mbstate_t>;
1802 inline template class __codecvt_abstract_base
<char32_t
, char8_t
, mbstate_t>;
1803 template class codecvt_byname
<char16_t
, char8_t
, mbstate_t>;
1804 template class codecvt_byname
<char32_t
, char8_t
, mbstate_t>;
1807 _GLIBCXX_END_NAMESPACE_VERSION