]> git.ipfire.org Git - thirdparty/gcc.git/blame - libstdc++-v3/src/c++11/codecvt.cc
Update copyright years.
[thirdparty/gcc.git] / libstdc++-v3 / src / c++11 / codecvt.cc
CommitLineData
bb93f35d
JW
1// Locale support (codecvt) -*- C++ -*-
2
a5544970 3// Copyright (C) 2015-2019 Free Software Foundation, Inc.
bb93f35d
JW
4//
5// This file is part of the GNU ISO C++ Library. This library is free
6// software; you can redistribute it and/or modify it under the
7// terms of the GNU General Public License as published by the
8// Free Software Foundation; either version 3, or (at your option)
9// any later version.
10
11// This library is distributed in the hope that it will be useful,
12// but WITHOUT ANY WARRANTY; without even the implied warranty of
13// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14// GNU General Public License for more details.
15
16// Under Section 7 of GPL version 3, you are granted additional
17// permissions described in the GCC Runtime Library Exception, version
18// 3.1, as published by the Free Software Foundation.
19
20// You should have received a copy of the GNU General Public License and
21// a copy of the GCC Runtime Library Exception along with this program;
22// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23// <http://www.gnu.org/licenses/>.
24
28af1fb3 25#include <codecvt>
bb93f35d 26#include <cstring> // std::memcpy, std::memcmp
bcd682e1 27#include <bits/stl_algobase.h> // std::min
bb93f35d 28
bb93f35d
JW
29namespace std _GLIBCXX_VISIBILITY(default)
30{
31_GLIBCXX_BEGIN_NAMESPACE_VERSION
32
bcd682e1
JW
33 // The standard doesn't define these operators, which is annoying.
// Convert a codecvt_mode value to its underlying integer type so that
// bitwise arithmetic can be performed on it.
static underlying_type<codecvt_mode>::type
to_integer(codecvt_mode m)
{
  using int_type = underlying_type<codecvt_mode>::type;
  return static_cast<int_type>(m);
}
bcd682e1
JW
37
38 static codecvt_mode& operator&=(codecvt_mode& m, codecvt_mode n)
39 { return m = codecvt_mode(to_integer(m) & to_integer(n)); }
40
41 static codecvt_mode& operator|=(codecvt_mode& m, codecvt_mode n)
42 { return m = codecvt_mode(to_integer(m) | to_integer(n)); }
43
44 static codecvt_mode operator~(codecvt_mode m)
45 { return codecvt_mode(~to_integer(m)); }
46
bb93f35d
JW
47namespace
48{
// The greatest code point that can be held by one UTF-16 code unit.
constexpr char32_t max_single_utf16_unit = 0xFFFF;

// Default upper limit (U+10FFFF) used for the maxcode arguments below.
constexpr char32_t max_code_point = 0x10FFFF;

// Sentinel results of the read_*_code_point functions.
// The functions below rely on maxcode < incomplete_mb_character
// (which is enforced by the codecvt_utf* classes on construction).
constexpr char32_t incomplete_mb_character = static_cast<char32_t>(-2);
constexpr char32_t invalid_mb_sequence = static_cast<char32_t>(-1);
58
d951e75d
JW
// Helper that treats the pointer pair [next, end) as a sequence of code
// units of type Elem and offers operations to read, write and step
// through that sequence.
template<typename Elem, bool Aligned = true>
  struct range
  {
    Elem* next;
    Elem* end;

    // Store one code unit at the current position, then step past it.
    range& operator=(Elem e)
    {
      *next = e;
      ++next;
      return *this;
    }

    // The code unit at the current position.
    Elem operator*() const { return next[0]; }

    // The code unit n positions after the current one.
    Elem operator[](size_t n) const { return *(next + n); }

    // Step forward by one code unit.
    range& operator++()
    {
      next += 1;
      return *this;
    }

    // Step forward by n code units.
    range& operator+=(size_t n)
    {
      next = next + n;
      return *this;
    }

    // How many code units are left before end.
    size_t size() const { return end - next; }

    // How many bytes are left before end.
    size_t nbytes() const
    {
      const char* const last = (const char*)end;
      const char* const first = (const char*)next;
      return last - first;
    }
  };
100
101 // This specialization is used when accessing char16_t values through
102 // pointers to char, which might not be correctly aligned for char16_t.
103 template<typename Elem>
104 struct range<Elem, false>
105 {
106 using value_type = typename remove_const<Elem>::type;
107
108 using char_pointer = typename
109 conditional<is_const<Elem>::value, const char*, char*>::type;
110
111 char_pointer next;
112 char_pointer end;
113
114 // Write a code unit.
115 range& operator=(Elem e)
116 {
117 memcpy(next, &e, sizeof(Elem));
118 ++*this;
119 return *this;
120 }
121
122 // Read the next code unit.
123 Elem operator*() const
124 {
125 value_type e;
126 memcpy(&e, next, sizeof(Elem));
127 return e;
128 }
129
130 // Read the Nth code unit.
131 Elem operator[](size_t n) const
132 {
133 value_type e;
134 memcpy(&e, next + n * sizeof(Elem), sizeof(Elem));
135 return e;
136 }
137
138 // Move to the next code unit.
139 range& operator++()
140 {
141 next += sizeof(Elem);
142 return *this;
143 }
144
145 // Move to the Nth code unit.
146 range& operator+=(size_t n)
147 {
148 next += n * sizeof(Elem);
149 return *this;
150 }
151
152 // The number of code units remaining.
153 size_t size() const { return nbytes() / sizeof(Elem); }
154
155 // The number of bytes remaining.
156 size_t nbytes() const { return end - next; }
bb93f35d
JW
157 };
158
28af1fb3
JW
// Byte Order Marks that may appear as a "header" at the start of a
// multibyte sequence: UTF-8, UTF-16 (big endian) and UTF-16 little endian.
constexpr unsigned char utf8_bom[] = { 0xEF, 0xBB, 0xBF };
constexpr unsigned char utf16_bom[] = { 0xFE, 0xFF };
constexpr unsigned char utf16le_bom[] = { 0xFF, 0xFE };
28af1fb3 163
d951e75d
JW
164 // Write a BOM (space permitting).
165 template<typename C, bool A, size_t N>
166 bool
167 write_bom(range<C, A>& to, const unsigned char (&bom)[N])
28af1fb3 168 {
d951e75d
JW
169 static_assert( (N / sizeof(C)) != 0, "" );
170 static_assert( (N % sizeof(C)) == 0, "" );
171
172 if (to.nbytes() < N)
28af1fb3
JW
173 return false;
174 memcpy(to.next, bom, N);
d951e75d 175 to += (N / sizeof(C));
28af1fb3
JW
176 return true;
177 }
178
d951e75d
JW
179 // Try to read a BOM.
180 template<typename C, bool A, size_t N>
181 bool
182 read_bom(range<C, A>& from, const unsigned char (&bom)[N])
183 {
184 static_assert( (N / sizeof(C)) != 0, "" );
185 static_assert( (N % sizeof(C)) == 0, "" );
186
187 if (from.nbytes() >= N && !memcmp(from.next, bom, N))
188 {
189 from += (N / sizeof(C));
190 return true;
191 }
192 return false;
193 }
194
28af1fb3
JW
195 // If generate_header is set in mode write out UTF-8 BOM.
196 bool
197 write_utf8_bom(range<char>& to, codecvt_mode mode)
198 {
199 if (mode & generate_header)
200 return write_bom(to, utf8_bom);
201 return true;
202 }
203
204 // If generate_header is set in mode write out the UTF-16 BOM indicated
205 // by whether little_endian is set in mode.
d951e75d 206 template<bool Aligned>
28af1fb3 207 bool
d951e75d 208 write_utf16_bom(range<char16_t, Aligned>& to, codecvt_mode mode)
28af1fb3
JW
209 {
210 if (mode & generate_header)
211 {
d951e75d
JW
212 if (mode & little_endian)
213 return write_bom(to, utf16le_bom);
214 else
215 return write_bom(to, utf16_bom);
28af1fb3
JW
216 }
217 return true;
218 }
219
28af1fb3
JW
220 // If consume_header is set in mode update from.next to after any BOM.
221 void
222 read_utf8_bom(range<const char>& from, codecvt_mode mode)
223 {
224 if (mode & consume_header)
225 read_bom(from, utf8_bom);
226 }
227
bcd682e1
JW
228 // If consume_header is not set in mode, no effects.
229 // Otherwise, if *from.next is a UTF-16 BOM increment from.next and then:
230 // - if the UTF-16BE BOM was found unset little_endian in mode, or
231 // - if the UTF-16LE BOM was found set little_endian in mode.
d951e75d 232 template<bool Aligned>
bcd682e1 233 void
d951e75d 234 read_utf16_bom(range<const char16_t, Aligned>& from, codecvt_mode& mode)
28af1fb3 235 {
d951e75d 236 if (mode & consume_header)
28af1fb3 237 {
d951e75d
JW
238 if (read_bom(from, utf16_bom))
239 mode &= ~little_endian;
240 else if (read_bom(from, utf16le_bom))
241 mode |= little_endian;
28af1fb3 242 }
28af1fb3
JW
243 }
244
245 // Read a codepoint from a UTF-8 multibyte sequence.
246 // Updates from.next if the codepoint is not greater than maxcode.
b6584a72 247 // Returns invalid_mb_sequence, incomplete_mb_character or the code point.
bb93f35d
JW
248 char32_t
249 read_utf8_code_point(range<const char>& from, unsigned long maxcode)
250 {
b6584a72 251 const size_t avail = from.size();
bb93f35d 252 if (avail == 0)
b6584a72 253 return incomplete_mb_character;
d951e75d 254 unsigned char c1 = from[0];
bb93f35d
JW
255 // https://en.wikipedia.org/wiki/UTF-8#Sample_code
256 if (c1 < 0x80)
257 {
d951e75d 258 ++from;
bb93f35d
JW
259 return c1;
260 }
261 else if (c1 < 0xC2) // continuation or overlong 2-byte sequence
b6584a72 262 return invalid_mb_sequence;
bb93f35d
JW
263 else if (c1 < 0xE0) // 2-byte sequence
264 {
265 if (avail < 2)
b6584a72 266 return incomplete_mb_character;
d951e75d 267 unsigned char c2 = from[1];
bb93f35d 268 if ((c2 & 0xC0) != 0x80)
b6584a72 269 return invalid_mb_sequence;
bb93f35d 270 char32_t c = (c1 << 6) + c2 - 0x3080;
28af1fb3 271 if (c <= maxcode)
d951e75d 272 from += 2;
bb93f35d
JW
273 return c;
274 }
275 else if (c1 < 0xF0) // 3-byte sequence
276 {
277 if (avail < 3)
b6584a72 278 return incomplete_mb_character;
d951e75d 279 unsigned char c2 = from[1];
bb93f35d 280 if ((c2 & 0xC0) != 0x80)
b6584a72 281 return invalid_mb_sequence;
bb93f35d 282 if (c1 == 0xE0 && c2 < 0xA0) // overlong
b6584a72 283 return invalid_mb_sequence;
d951e75d 284 unsigned char c3 = from[2];
bb93f35d 285 if ((c3 & 0xC0) != 0x80)
b6584a72 286 return invalid_mb_sequence;
bb93f35d 287 char32_t c = (c1 << 12) + (c2 << 6) + c3 - 0xE2080;
28af1fb3 288 if (c <= maxcode)
d951e75d 289 from += 3;
bb93f35d
JW
290 return c;
291 }
292 else if (c1 < 0xF5) // 4-byte sequence
293 {
294 if (avail < 4)
b6584a72 295 return incomplete_mb_character;
d951e75d 296 unsigned char c2 = from[1];
bb93f35d 297 if ((c2 & 0xC0) != 0x80)
b6584a72 298 return invalid_mb_sequence;
bb93f35d 299 if (c1 == 0xF0 && c2 < 0x90) // overlong
b6584a72 300 return invalid_mb_sequence;
bb93f35d 301 if (c1 == 0xF4 && c2 >= 0x90) // > U+10FFFF
b6584a72 302 return invalid_mb_sequence;
d951e75d 303 unsigned char c3 = from[2];
bb93f35d 304 if ((c3 & 0xC0) != 0x80)
b6584a72 305 return invalid_mb_sequence;
d951e75d 306 unsigned char c4 = from[3];
bb93f35d 307 if ((c4 & 0xC0) != 0x80)
b6584a72 308 return invalid_mb_sequence;
bb93f35d 309 char32_t c = (c1 << 18) + (c2 << 12) + (c3 << 6) + c4 - 0x3C82080;
28af1fb3 310 if (c <= maxcode)
d951e75d 311 from += 4;
bb93f35d
JW
312 return c;
313 }
314 else // > U+10FFFF
b6584a72 315 return invalid_mb_sequence;
bb93f35d
JW
316 }
317
318 bool
319 write_utf8_code_point(range<char>& to, char32_t code_point)
320 {
321 if (code_point < 0x80)
322 {
323 if (to.size() < 1)
324 return false;
d951e75d 325 to = code_point;
bb93f35d
JW
326 }
327 else if (code_point <= 0x7FF)
328 {
329 if (to.size() < 2)
330 return false;
d951e75d
JW
331 to = (code_point >> 6) + 0xC0;
332 to = (code_point & 0x3F) + 0x80;
bb93f35d
JW
333 }
334 else if (code_point <= 0xFFFF)
335 {
336 if (to.size() < 3)
337 return false;
d951e75d
JW
338 to = (code_point >> 12) + 0xE0;
339 to = ((code_point >> 6) & 0x3F) + 0x80;
340 to = (code_point & 0x3F) + 0x80;
bb93f35d
JW
341 }
342 else if (code_point <= 0x10FFFF)
343 {
344 if (to.size() < 4)
345 return false;
d951e75d
JW
346 to = (code_point >> 18) + 0xF0;
347 to = ((code_point >> 12) & 0x3F) + 0x80;
348 to = ((code_point >> 6) & 0x3F) + 0x80;
349 to = (code_point & 0x3F) + 0x80;
bb93f35d
JW
350 }
351 else
352 return false;
353 return true;
354 }
355
28af1fb3
JW
// Convert a code unit between the host byte order and the byte order
// selected by mode: the bytes are swapped exactly when little_endian in
// mode disagrees with the byte order of the host.
inline char16_t
adjust_byte_order(char16_t c, codecvt_mode mode)
{
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  const bool swap_bytes = (mode & little_endian) != 0;
#else
  const bool swap_bytes = (mode & little_endian) == 0;
#endif
  if (swap_bytes)
    return __builtin_bswap16(c);
  return c;
}
365
b6584a72
JW
// True when c is a high-surrogate (leading) code point, i.e. lies in
// [0xD800, 0xDBFF].
inline bool
is_high_surrogate(char32_t c)
{
  // All values in that range, and no others, have 0x36 above the low 10 bits.
  return (c >> 10) == 0x36;
}
372
// True when c is a low-surrogate (trailing) code point, i.e. lies in
// [0xDC00, 0xDFFF].
inline bool
is_low_surrogate(char32_t c)
{
  // All values in that range, and no others, have 0x37 above the low 10 bits.
  return (c >> 10) == 0x37;
}
379
// Combine a high surrogate and a low surrogate into the code point that
// the pair encodes.
inline char32_t
surrogate_pair_to_code_point(char32_t high, char32_t low)
{
  const char32_t upper_bits = (high - 0xD800) << 10;
  const char32_t lower_bits = low - 0xDC00;
  return 0x10000 + upper_bits + lower_bits;
}
385
28af1fb3
JW
386 // Read a codepoint from a UTF-16 multibyte sequence.
387 // The sequence's endianness is indicated by (mode & little_endian).
388 // Updates from.next if the codepoint is not greater than maxcode.
b6584a72 389 // Returns invalid_mb_sequence, incomplete_mb_character or the code point.
d951e75d
JW
390 template<bool Aligned>
391 char32_t
392 read_utf16_code_point(range<const char16_t, Aligned>& from,
393 unsigned long maxcode, codecvt_mode mode)
394 {
395 const size_t avail = from.size();
396 if (avail == 0)
397 return incomplete_mb_character;
398 int inc = 1;
399 char32_t c = adjust_byte_order(from[0], mode);
400 if (is_high_surrogate(c))
401 {
402 if (avail < 2)
403 return incomplete_mb_character;
404 const char16_t c2 = adjust_byte_order(from[1], mode);
405 if (is_low_surrogate(c2))
406 {
407 c = surrogate_pair_to_code_point(c, c2);
408 inc = 2;
409 }
410 else
411 return invalid_mb_sequence;
412 }
413 else if (is_low_surrogate(c))
414 return invalid_mb_sequence;
415 if (c <= maxcode)
416 from += inc;
417 return c;
418 }
28af1fb3 419
d951e75d 420 template<typename C, bool A>
bb93f35d 421 bool
d951e75d 422 write_utf16_code_point(range<C, A>& to, char32_t codepoint, codecvt_mode mode)
bb93f35d 423 {
28af1fb3
JW
424 static_assert(sizeof(C) >= 2, "a code unit must be at least 16-bit");
425
02e12bda 426 if (codepoint <= max_single_utf16_unit)
bb93f35d
JW
427 {
428 if (to.size() > 0)
429 {
d951e75d 430 to = adjust_byte_order(codepoint, mode);
bb93f35d
JW
431 return true;
432 }
433 }
434 else if (to.size() > 1)
435 {
436 // Algorithm from http://www.unicode.org/faq/utf_bom.html#utf16-4
437 const char32_t LEAD_OFFSET = 0xD800 - (0x10000 >> 10);
bb93f35d
JW
438 char16_t lead = LEAD_OFFSET + (codepoint >> 10);
439 char16_t trail = 0xDC00 + (codepoint & 0x3FF);
d951e75d
JW
440 to = adjust_byte_order(lead, mode);
441 to = adjust_byte_order(trail, mode);
bb93f35d
JW
442 return true;
443 }
444 return false;
445 }
446
447 // utf8 -> ucs4
448 codecvt_base::result
449 ucs4_in(range<const char>& from, range<char32_t>& to,
28af1fb3 450 unsigned long maxcode = max_code_point, codecvt_mode mode = {})
bb93f35d 451 {
28af1fb3 452 read_utf8_bom(from, mode);
bb93f35d
JW
453 while (from.size() && to.size())
454 {
455 const char32_t codepoint = read_utf8_code_point(from, maxcode);
b6584a72
JW
456 if (codepoint == incomplete_mb_character)
457 return codecvt_base::partial;
28af1fb3 458 if (codepoint > maxcode)
bb93f35d 459 return codecvt_base::error;
d951e75d 460 to = codepoint;
bb93f35d
JW
461 }
462 return from.size() ? codecvt_base::partial : codecvt_base::ok;
463 }
464
465 // ucs4 -> utf8
466 codecvt_base::result
467 ucs4_out(range<const char32_t>& from, range<char>& to,
28af1fb3 468 unsigned long maxcode = max_code_point, codecvt_mode mode = {})
bb93f35d 469 {
28af1fb3
JW
470 if (!write_utf8_bom(to, mode))
471 return codecvt_base::partial;
bb93f35d
JW
472 while (from.size())
473 {
d951e75d 474 const char32_t c = from[0];
bb93f35d
JW
475 if (c > maxcode)
476 return codecvt_base::error;
477 if (!write_utf8_code_point(to, c))
478 return codecvt_base::partial;
d951e75d 479 ++from;
bb93f35d
JW
480 }
481 return codecvt_base::ok;
482 }
483
28af1fb3
JW
484 // utf16 -> ucs4
485 codecvt_base::result
d951e75d 486 ucs4_in(range<const char16_t, false>& from, range<char32_t>& to,
28af1fb3
JW
487 unsigned long maxcode = max_code_point, codecvt_mode mode = {})
488 {
bcd682e1 489 read_utf16_bom(from, mode);
28af1fb3
JW
490 while (from.size() && to.size())
491 {
492 const char32_t codepoint = read_utf16_code_point(from, maxcode, mode);
b6584a72
JW
493 if (codepoint == incomplete_mb_character)
494 return codecvt_base::partial;
28af1fb3
JW
495 if (codepoint > maxcode)
496 return codecvt_base::error;
d951e75d 497 to = codepoint;
28af1fb3
JW
498 }
499 return from.size() ? codecvt_base::partial : codecvt_base::ok;
500 }
501
502 // ucs4 -> utf16
503 codecvt_base::result
d951e75d 504 ucs4_out(range<const char32_t>& from, range<char16_t, false>& to,
28af1fb3
JW
505 unsigned long maxcode = max_code_point, codecvt_mode mode = {})
506 {
507 if (!write_utf16_bom(to, mode))
508 return codecvt_base::partial;
509 while (from.size())
510 {
d951e75d 511 const char32_t c = from[0];
28af1fb3
JW
512 if (c > maxcode)
513 return codecvt_base::error;
514 if (!write_utf16_code_point(to, c, mode))
515 return codecvt_base::partial;
d951e75d 516 ++from;
28af1fb3
JW
517 }
518 return codecvt_base::ok;
519 }
520
bcd682e1
JW
// Selects between UTF-16 processing (surrogates allowed) and UCS-2
// processing (surrogates disallowed).
enum class surrogates
{
  allowed,
  disallowed
};
523
524 // utf8 -> utf16 (or utf8 -> ucs2 if s == surrogates::disallowed)
28af1fb3 525 template<typename C>
bb93f35d 526 codecvt_base::result
28af1fb3 527 utf16_in(range<const char>& from, range<C>& to,
bcd682e1
JW
528 unsigned long maxcode = max_code_point, codecvt_mode mode = {},
529 surrogates s = surrogates::allowed)
bb93f35d 530 {
28af1fb3 531 read_utf8_bom(from, mode);
bb93f35d
JW
532 while (from.size() && to.size())
533 {
d951e75d 534 auto orig = from;
bb93f35d 535 const char32_t codepoint = read_utf8_code_point(from, maxcode);
b6584a72 536 if (codepoint == incomplete_mb_character)
bcd682e1
JW
537 {
538 if (s == surrogates::allowed)
539 return codecvt_base::partial;
540 else
541 return codecvt_base::error; // No surrogates in UCS2
542 }
28af1fb3 543 if (codepoint > maxcode)
bb93f35d 544 return codecvt_base::error;
7f971f18 545 if (!write_utf16_code_point(to, codepoint, mode))
bb93f35d 546 {
d951e75d 547 from = orig; // rewind to previous position
bb93f35d
JW
548 return codecvt_base::partial;
549 }
550 }
551 return codecvt_base::ok;
552 }
553
bcd682e1 554 // utf16 -> utf8 (or ucs2 -> utf8 if s == surrogates::disallowed)
28af1fb3 555 template<typename C>
bb93f35d 556 codecvt_base::result
28af1fb3 557 utf16_out(range<const C>& from, range<char>& to,
bcd682e1
JW
558 unsigned long maxcode = max_code_point, codecvt_mode mode = {},
559 surrogates s = surrogates::allowed)
bb93f35d 560 {
28af1fb3
JW
561 if (!write_utf8_bom(to, mode))
562 return codecvt_base::partial;
bb93f35d
JW
563 while (from.size())
564 {
d951e75d 565 char32_t c = from[0];
bb93f35d 566 int inc = 1;
b6584a72 567 if (is_high_surrogate(c))
bb93f35d 568 {
bcd682e1
JW
569 if (s == surrogates::disallowed)
570 return codecvt_base::error; // No surrogates in UCS-2
571
bb93f35d
JW
572 if (from.size() < 2)
573 return codecvt_base::ok; // stop converting at this point
574
d951e75d 575 const char32_t c2 = from[1];
b6584a72 576 if (is_low_surrogate(c2))
bb93f35d 577 {
b6584a72 578 c = surrogate_pair_to_code_point(c, c2);
bb93f35d 579 inc = 2;
bb93f35d
JW
580 }
581 else
582 return codecvt_base::error;
583 }
b6584a72
JW
584 else if (is_low_surrogate(c))
585 return codecvt_base::error;
bb93f35d
JW
586 if (c > maxcode)
587 return codecvt_base::error;
588 if (!write_utf8_code_point(to, c))
589 return codecvt_base::partial;
d951e75d 590 from += inc;
bb93f35d
JW
591 }
592 return codecvt_base::ok;
593 }
594
595 // return pos such that [begin,pos) is valid UTF-16 string no longer than max
28af1fb3
JW
596 const char*
597 utf16_span(const char* begin, const char* end, size_t max,
598 char32_t maxcode = max_code_point, codecvt_mode mode = {})
bb93f35d
JW
599 {
600 range<const char> from{ begin, end };
28af1fb3 601 read_utf8_bom(from, mode);
bb93f35d
JW
602 size_t count = 0;
603 while (count+1 < max)
604 {
605 char32_t c = read_utf8_code_point(from, maxcode);
b6584a72
JW
606 if (c > maxcode)
607 return from.next;
bb93f35d
JW
608 else if (c > max_single_utf16_unit)
609 ++count;
610 ++count;
611 }
612 if (count+1 == max) // take one more character if it fits in a single unit
bcd682e1 613 read_utf8_code_point(from, std::min(max_single_utf16_unit, maxcode));
28af1fb3 614 return from.next;
bb93f35d
JW
615 }
616
28af1fb3
JW
617 // utf8 -> ucs2
618 codecvt_base::result
619 ucs2_in(range<const char>& from, range<char16_t>& to,
620 char32_t maxcode = max_code_point, codecvt_mode mode = {})
bb93f35d 621 {
bcd682e1
JW
622 // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
623 maxcode = std::min(max_single_utf16_unit, maxcode);
624 return utf16_in(from, to, maxcode, mode, surrogates::disallowed);
28af1fb3
JW
625 }
626
627 // ucs2 -> utf8
628 codecvt_base::result
629 ucs2_out(range<const char16_t>& from, range<char>& to,
630 char32_t maxcode = max_code_point, codecvt_mode mode = {})
631 {
bcd682e1
JW
632 // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
633 maxcode = std::min(max_single_utf16_unit, maxcode);
634 return utf16_out(from, to, maxcode, mode, surrogates::disallowed);
28af1fb3
JW
635 }
636
637 // ucs2 -> utf16
638 codecvt_base::result
d951e75d 639 ucs2_out(range<const char16_t>& from, range<char16_t, false>& to,
28af1fb3
JW
640 char32_t maxcode = max_code_point, codecvt_mode mode = {})
641 {
642 if (!write_utf16_bom(to, mode))
643 return codecvt_base::partial;
644 while (from.size() && to.size())
bb93f35d 645 {
d951e75d 646 char16_t c = from[0];
b6584a72 647 if (is_high_surrogate(c))
28af1fb3
JW
648 return codecvt_base::error;
649 if (c > maxcode)
650 return codecvt_base::error;
d951e75d
JW
651 to = adjust_byte_order(c, mode);
652 ++from;
28af1fb3
JW
653 }
654 return from.size() == 0 ? codecvt_base::ok : codecvt_base::partial;
655 }
656
657 // utf16 -> ucs2
658 codecvt_base::result
d951e75d 659 ucs2_in(range<const char16_t, false>& from, range<char16_t>& to,
28af1fb3
JW
660 char32_t maxcode = max_code_point, codecvt_mode mode = {})
661 {
bcd682e1
JW
662 read_utf16_bom(from, mode);
663 // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
664 maxcode = std::min(max_single_utf16_unit, maxcode);
28af1fb3
JW
665 while (from.size() && to.size())
666 {
667 const char32_t c = read_utf16_code_point(from, maxcode, mode);
b6584a72 668 if (c == incomplete_mb_character)
bcd682e1 669 return codecvt_base::error; // UCS-2 only supports single units.
b6584a72 670 if (c > maxcode)
28af1fb3 671 return codecvt_base::error;
d951e75d 672 to = c;
bb93f35d 673 }
28af1fb3
JW
674 return from.size() == 0 ? codecvt_base::ok : codecvt_base::partial;
675 }
676
677 const char16_t*
d951e75d 678 ucs2_span(range<const char16_t, false>& from, size_t max,
28af1fb3
JW
679 char32_t maxcode, codecvt_mode mode)
680 {
bcd682e1
JW
681 read_utf16_bom(from, mode);
682 // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
683 maxcode = std::min(max_single_utf16_unit, maxcode);
28af1fb3
JW
684 char32_t c = 0;
685 while (max-- && c <= maxcode)
686 c = read_utf16_code_point(from, maxcode, mode);
d951e75d 687 return reinterpret_cast<const char16_t*>(from.next);
28af1fb3
JW
688 }
689
690 const char*
691 ucs2_span(const char* begin, const char* end, size_t max,
692 char32_t maxcode, codecvt_mode mode)
693 {
694 range<const char> from{ begin, end };
695 read_utf8_bom(from, mode);
bcd682e1
JW
696 // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
697 maxcode = std::min(max_single_utf16_unit, maxcode);
28af1fb3
JW
698 char32_t c = 0;
699 while (max-- && c <= maxcode)
700 c = read_utf8_code_point(from, maxcode);
701 return from.next;
702 }
703
704 // return pos such that [begin,pos) is valid UCS-4 string no longer than max
705 const char*
706 ucs4_span(const char* begin, const char* end, size_t max,
707 char32_t maxcode = max_code_point, codecvt_mode mode = {})
708 {
709 range<const char> from{ begin, end };
710 read_utf8_bom(from, mode);
711 char32_t c = 0;
712 while (max-- && c <= maxcode)
713 c = read_utf8_code_point(from, maxcode);
714 return from.next;
715 }
716
717 // return pos such that [begin,pos) is valid UCS-4 string no longer than max
718 const char16_t*
d951e75d 719 ucs4_span(range<const char16_t, false>& from, size_t max,
28af1fb3
JW
720 char32_t maxcode = max_code_point, codecvt_mode mode = {})
721 {
bcd682e1 722 read_utf16_bom(from, mode);
28af1fb3
JW
723 char32_t c = 0;
724 while (max-- && c <= maxcode)
725 c = read_utf16_code_point(from, maxcode, mode);
d951e75d 726 return reinterpret_cast<const char16_t*>(from.next);
bb93f35d
JW
727 }
728}
729
730// Define members of codecvt<char16_t, char, mbstate_t> specialization.
731// Converts from UTF-8 to UTF-16.
732
733locale::id codecvt<char16_t, char, mbstate_t>::id;
734
735codecvt<char16_t, char, mbstate_t>::~codecvt() { }
736
737codecvt_base::result
738codecvt<char16_t, char, mbstate_t>::
739do_out(state_type&,
740 const intern_type* __from,
741 const intern_type* __from_end, const intern_type*& __from_next,
742 extern_type* __to, extern_type* __to_end,
743 extern_type*& __to_next) const
744{
745 range<const char16_t> from{ __from, __from_end };
746 range<char> to{ __to, __to_end };
747 auto res = utf16_out(from, to);
748 __from_next = from.next;
749 __to_next = to.next;
750 return res;
751}
752
753codecvt_base::result
754codecvt<char16_t, char, mbstate_t>::
755do_unshift(state_type&, extern_type* __to, extern_type*,
756 extern_type*& __to_next) const
757{
758 __to_next = __to;
759 return noconv; // we don't use mbstate_t for the unicode facets
760}
761
762codecvt_base::result
763codecvt<char16_t, char, mbstate_t>::
764do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
765 const extern_type*& __from_next,
766 intern_type* __to, intern_type* __to_end,
767 intern_type*& __to_next) const
768{
769 range<const char> from{ __from, __from_end };
770 range<char16_t> to{ __to, __to_end };
7f971f18
JW
771#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
772 codecvt_mode mode = {};
773#else
774 codecvt_mode mode = little_endian;
775#endif
776 auto res = utf16_in(from, to, max_code_point, mode);
bb93f35d
JW
777 __from_next = from.next;
778 __to_next = to.next;
779 return res;
780}
781
782int
783codecvt<char16_t, char, mbstate_t>::do_encoding() const throw()
516231de 784{ return 0; } // UTF-8 is not a fixed-width encoding
bb93f35d
JW
785
786bool
787codecvt<char16_t, char, mbstate_t>::do_always_noconv() const throw()
788{ return false; }
789
790int
791codecvt<char16_t, char, mbstate_t>::
792do_length(state_type&, const extern_type* __from,
793 const extern_type* __end, size_t __max) const
794{
28af1fb3
JW
795 __end = utf16_span(__from, __end, __max);
796 return __end - __from;
bb93f35d
JW
797}
798
799int
800codecvt<char16_t, char, mbstate_t>::do_max_length() const throw()
801{
516231de
JW
802 // A single character (one or two UTF-16 code units) requires
803 // up to four UTF-8 code units.
804 return 4;
bb93f35d
JW
805}
806
807// Define members of codecvt<char32_t, char, mbstate_t> specialization.
808// Converts from UTF-8 to UTF-32 (aka UCS-4).
809
810locale::id codecvt<char32_t, char, mbstate_t>::id;
811
812codecvt<char32_t, char, mbstate_t>::~codecvt() { }
813
814codecvt_base::result
815codecvt<char32_t, char, mbstate_t>::
816do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
817 const intern_type*& __from_next,
818 extern_type* __to, extern_type* __to_end,
819 extern_type*& __to_next) const
820{
821 range<const char32_t> from{ __from, __from_end };
822 range<char> to{ __to, __to_end };
823 auto res = ucs4_out(from, to);
824 __from_next = from.next;
825 __to_next = to.next;
826 return res;
827}
828
829codecvt_base::result
830codecvt<char32_t, char, mbstate_t>::
831do_unshift(state_type&, extern_type* __to, extern_type*,
832 extern_type*& __to_next) const
833{
834 __to_next = __to;
835 return noconv;
836}
837
838codecvt_base::result
839codecvt<char32_t, char, mbstate_t>::
840do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
841 const extern_type*& __from_next,
842 intern_type* __to, intern_type* __to_end,
843 intern_type*& __to_next) const
844{
845 range<const char> from{ __from, __from_end };
846 range<char32_t> to{ __to, __to_end };
847 auto res = ucs4_in(from, to);
848 __from_next = from.next;
849 __to_next = to.next;
850 return res;
851}
852
853int
854codecvt<char32_t, char, mbstate_t>::do_encoding() const throw()
516231de 855{ return 0; } // UTF-8 is not a fixed-width encoding
bb93f35d
JW
856
857bool
858codecvt<char32_t, char, mbstate_t>::do_always_noconv() const throw()
859{ return false; }
860
861int
862codecvt<char32_t, char, mbstate_t>::
863do_length(state_type&, const extern_type* __from,
864 const extern_type* __end, size_t __max) const
865{
28af1fb3
JW
866 __end = ucs4_span(__from, __end, __max);
867 return __end - __from;
bb93f35d
JW
868}
869
870int
871codecvt<char32_t, char, mbstate_t>::do_max_length() const throw()
516231de
JW
872{
873 // A single character (one UTF-32 code unit) requires
874 // up to 4 UTF-8 code units.
875 return 4;
876}
bb93f35d 877
28af1fb3
JW
878// Define members of codecvt_utf8<char16_t> base class implementation.
879// Converts from UTF-8 to UCS-2.
880
881__codecvt_utf8_base<char16_t>::~__codecvt_utf8_base() { }
882
883codecvt_base::result
884__codecvt_utf8_base<char16_t>::
885do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
886 const intern_type*& __from_next,
887 extern_type* __to, extern_type* __to_end,
888 extern_type*& __to_next) const
889{
890 range<const char16_t> from{ __from, __from_end };
891 range<char> to{ __to, __to_end };
892 auto res = ucs2_out(from, to, _M_maxcode, _M_mode);
893 __from_next = from.next;
894 __to_next = to.next;
895 return res;
896}
897
898codecvt_base::result
899__codecvt_utf8_base<char16_t>::
900do_unshift(state_type&, extern_type* __to, extern_type*,
901 extern_type*& __to_next) const
902{
903 __to_next = __to;
904 return noconv;
905}
906
907codecvt_base::result
908__codecvt_utf8_base<char16_t>::
909do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
910 const extern_type*& __from_next,
911 intern_type* __to, intern_type* __to_end,
912 intern_type*& __to_next) const
913{
914 range<const char> from{ __from, __from_end };
915 range<char16_t> to{ __to, __to_end };
a1e1ec76 916 codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header));
29ca91f7
JW
917#if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
918 mode = codecvt_mode(mode | little_endian);
919#endif
920 auto res = ucs2_in(from, to, _M_maxcode, mode);
28af1fb3
JW
921 __from_next = from.next;
922 __to_next = to.next;
923 return res;
924}
925
926int
927__codecvt_utf8_base<char16_t>::do_encoding() const throw()
516231de 928{ return 0; } // UTF-8 is not a fixed-width encoding
28af1fb3
JW
929
930bool
931__codecvt_utf8_base<char16_t>::do_always_noconv() const throw()
932{ return false; }
933
934int
935__codecvt_utf8_base<char16_t>::
936do_length(state_type&, const extern_type* __from,
937 const extern_type* __end, size_t __max) const
938{
939 __end = ucs2_span(__from, __end, __max, _M_maxcode, _M_mode);
940 return __end - __from;
941}
942
943int
944__codecvt_utf8_base<char16_t>::do_max_length() const throw()
516231de
JW
945{
946 // A single UCS-2 character requires up to three UTF-8 code units.
947 // (UCS-2 cannot represent characters that use four UTF-8 code units).
948 int max = 3;
949 if (_M_mode & consume_header)
950 max += sizeof(utf8_bom);
951 return max;
952}
28af1fb3
JW
953
954// Define members of codecvt_utf8<char32_t> base class implementation.
955// Converts from UTF-8 to UTF-32 (aka UCS-4).
956
957__codecvt_utf8_base<char32_t>::~__codecvt_utf8_base() { }
958
959codecvt_base::result
960__codecvt_utf8_base<char32_t>::
961do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
962 const intern_type*& __from_next,
963 extern_type* __to, extern_type* __to_end,
964 extern_type*& __to_next) const
965{
966 range<const char32_t> from{ __from, __from_end };
967 range<char> to{ __to, __to_end };
968 auto res = ucs4_out(from, to, _M_maxcode, _M_mode);
969 __from_next = from.next;
970 __to_next = to.next;
971 return res;
972}
973
974codecvt_base::result
975__codecvt_utf8_base<char32_t>::
976do_unshift(state_type&, extern_type* __to, extern_type*,
977 extern_type*& __to_next) const
978{
979 __to_next = __to;
980 return noconv;
981}
982
983codecvt_base::result
984__codecvt_utf8_base<char32_t>::
985do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
986 const extern_type*& __from_next,
987 intern_type* __to, intern_type* __to_end,
988 intern_type*& __to_next) const
989{
990 range<const char> from{ __from, __from_end };
991 range<char32_t> to{ __to, __to_end };
992 auto res = ucs4_in(from, to, _M_maxcode, _M_mode);
993 __from_next = from.next;
994 __to_next = to.next;
995 return res;
996}
997
998int
999__codecvt_utf8_base<char32_t>::do_encoding() const throw()
516231de 1000{ return 0; } // UTF-8 is not a fixed-width encoding
28af1fb3
JW
1001
1002bool
1003__codecvt_utf8_base<char32_t>::do_always_noconv() const throw()
1004{ return false; }
1005
1006int
1007__codecvt_utf8_base<char32_t>::
1008do_length(state_type&, const extern_type* __from,
1009 const extern_type* __end, size_t __max) const
1010{
1011 __end = ucs4_span(__from, __end, __max, _M_maxcode, _M_mode);
1012 return __end - __from;
1013}
1014
1015int
1016__codecvt_utf8_base<char32_t>::do_max_length() const throw()
516231de
JW
1017{
1018 // A single UCS-4 character requires up to four UTF-8 code units.
1019 int max = 4;
1020 if (_M_mode & consume_header)
1021 max += sizeof(utf8_bom);
1022 return max;
1023}
28af1fb3
JW
1024
1025#ifdef _GLIBCXX_USE_WCHAR_T
d951e75d
JW
1026
1027#if __SIZEOF_WCHAR_T__ == 2
1028static_assert(sizeof(wchar_t) == sizeof(char16_t), "");
1029#elif __SIZEOF_WCHAR_T__ == 4
1030static_assert(sizeof(wchar_t) == sizeof(char32_t), "");
1031#endif
1032
28af1fb3
JW
1033// Define members of codecvt_utf8<wchar_t> base class implementation.
1034// Converts from UTF-8 to UCS-2 or UCS-4 depending on sizeof(wchar_t).
1035
1036__codecvt_utf8_base<wchar_t>::~__codecvt_utf8_base() { }
1037
1038codecvt_base::result
1039__codecvt_utf8_base<wchar_t>::
1040do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1041 const intern_type*& __from_next,
1042 extern_type* __to, extern_type* __to_end,
1043 extern_type*& __to_next) const
1044{
1045 range<char> to{ __to, __to_end };
1046#if __SIZEOF_WCHAR_T__ == 2
1047 range<const char16_t> from{
1048 reinterpret_cast<const char16_t*>(__from),
1049 reinterpret_cast<const char16_t*>(__from_end)
1050 };
1051 auto res = ucs2_out(from, to, _M_maxcode, _M_mode);
1052#elif __SIZEOF_WCHAR_T__ == 4
1053 range<const char32_t> from{
1054 reinterpret_cast<const char32_t*>(__from),
1055 reinterpret_cast<const char32_t*>(__from_end)
1056 };
1057 auto res = ucs4_out(from, to, _M_maxcode, _M_mode);
1058#else
1059 return codecvt_base::error;
1060#endif
1061 __from_next = reinterpret_cast<const wchar_t*>(from.next);
1062 __to_next = to.next;
1063 return res;
1064}
1065
1066codecvt_base::result
1067__codecvt_utf8_base<wchar_t>::
1068do_unshift(state_type&, extern_type* __to, extern_type*,
1069 extern_type*& __to_next) const
1070{
1071 __to_next = __to;
1072 return noconv;
1073}
1074
1075codecvt_base::result
1076__codecvt_utf8_base<wchar_t>::
1077do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1078 const extern_type*& __from_next,
1079 intern_type* __to, intern_type* __to_end,
1080 intern_type*& __to_next) const
1081{
1082 range<const char> from{ __from, __from_end };
1083#if __SIZEOF_WCHAR_T__ == 2
1084 range<char16_t> to{
1085 reinterpret_cast<char16_t*>(__to),
1086 reinterpret_cast<char16_t*>(__to_end)
1087 };
a76400f4
JW
1088#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
1089 codecvt_mode mode = {};
1090#else
1091 codecvt_mode mode = little_endian;
1092#endif
1093 auto res = ucs2_in(from, to, _M_maxcode, mode);
28af1fb3
JW
1094#elif __SIZEOF_WCHAR_T__ == 4
1095 range<char32_t> to{
1096 reinterpret_cast<char32_t*>(__to),
1097 reinterpret_cast<char32_t*>(__to_end)
1098 };
1099 auto res = ucs4_in(from, to, _M_maxcode, _M_mode);
1100#else
1101 return codecvt_base::error;
1102#endif
1103 __from_next = from.next;
1104 __to_next = reinterpret_cast<wchar_t*>(to.next);
1105 return res;
1106}
1107
1108int
1109__codecvt_utf8_base<wchar_t>::do_encoding() const throw()
516231de 1110{ return 0; } // UTF-8 is not a fixed-width encoding
28af1fb3
JW
1111
1112bool
1113__codecvt_utf8_base<wchar_t>::do_always_noconv() const throw()
1114{ return false; }
1115
1116int
1117__codecvt_utf8_base<wchar_t>::
1118do_length(state_type&, const extern_type* __from,
1119 const extern_type* __end, size_t __max) const
1120{
1121#if __SIZEOF_WCHAR_T__ == 2
1122 __end = ucs2_span(__from, __end, __max, _M_maxcode, _M_mode);
1123#elif __SIZEOF_WCHAR_T__ == 4
1124 __end = ucs4_span(__from, __end, __max, _M_maxcode, _M_mode);
1125#else
1126 __end = __from;
1127#endif
1128 return __end - __from;
1129}
1130
1131int
1132__codecvt_utf8_base<wchar_t>::do_max_length() const throw()
516231de
JW
1133{
1134#if __SIZEOF_WCHAR_T__ == 2
1135 int max = 3; // See __codecvt_utf8_base<char16_t>::do_max_length()
1136#else
1137 int max = 4; // See __codecvt_utf8_base<char32_t>::do_max_length()
1138#endif
1139 if (_M_mode & consume_header)
1140 max += sizeof(utf8_bom);
1141 return max;
1142}
28af1fb3
JW
1143#endif
1144
1145// Define members of codecvt_utf16<char16_t> base class implementation.
1146// Converts from UTF-16 to UCS-2.
1147
1148__codecvt_utf16_base<char16_t>::~__codecvt_utf16_base() { }
1149
1150codecvt_base::result
1151__codecvt_utf16_base<char16_t>::
1152do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1153 const intern_type*& __from_next,
1154 extern_type* __to, extern_type* __to_end,
1155 extern_type*& __to_next) const
1156{
1157 range<const char16_t> from{ __from, __from_end };
d951e75d 1158 range<char16_t, false> to{ __to, __to_end };
28af1fb3
JW
1159 auto res = ucs2_out(from, to, _M_maxcode, _M_mode);
1160 __from_next = from.next;
1161 __to_next = reinterpret_cast<char*>(to.next);
1162 return res;
1163}
1164
1165codecvt_base::result
1166__codecvt_utf16_base<char16_t>::
1167do_unshift(state_type&, extern_type* __to, extern_type*,
1168 extern_type*& __to_next) const
1169{
1170 __to_next = __to;
1171 return noconv;
1172}
1173
1174codecvt_base::result
1175__codecvt_utf16_base<char16_t>::
1176do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1177 const extern_type*& __from_next,
1178 intern_type* __to, intern_type* __to_end,
1179 intern_type*& __to_next) const
1180{
d951e75d 1181 range<const char16_t, false> from{ __from, __from_end };
28af1fb3
JW
1182 range<char16_t> to{ __to, __to_end };
1183 auto res = ucs2_in(from, to, _M_maxcode, _M_mode);
1184 __from_next = reinterpret_cast<const char*>(from.next);
1185 __to_next = to.next;
d951e75d
JW
1186 if (res == codecvt_base::ok && __from_next != __from_end)
1187 res = codecvt_base::error;
28af1fb3
JW
1188 return res;
1189}
1190
1191int
1192__codecvt_utf16_base<char16_t>::do_encoding() const throw()
516231de 1193{ return 0; } // UTF-16 is not a fixed-width encoding
28af1fb3
JW
1194
1195bool
1196__codecvt_utf16_base<char16_t>::do_always_noconv() const throw()
1197{ return false; }
1198
1199int
1200__codecvt_utf16_base<char16_t>::
1201do_length(state_type&, const extern_type* __from,
1202 const extern_type* __end, size_t __max) const
1203{
d951e75d
JW
1204 range<const char16_t, false> from{ __from, __end };
1205 const char16_t* next = ucs2_span(from, __max, _M_maxcode, _M_mode);
28af1fb3
JW
1206 return reinterpret_cast<const char*>(next) - __from;
1207}
1208
1209int
1210__codecvt_utf16_base<char16_t>::do_max_length() const throw()
516231de
JW
1211{
1212 // A single UCS-2 character requires one UTF-16 code unit (so two chars).
1213 // (UCS-2 cannot represent characters that use multiple UTF-16 code units).
1214 int max = 2;
1215 if (_M_mode & consume_header)
1216 max += sizeof(utf16_bom);
1217 return max;
1218}
28af1fb3
JW
1219
1220// Define members of codecvt_utf16<char32_t> base class implementation.
1221// Converts from UTF-16 to UTF-32 (aka UCS-4).
1222
1223__codecvt_utf16_base<char32_t>::~__codecvt_utf16_base() { }
1224
1225codecvt_base::result
1226__codecvt_utf16_base<char32_t>::
1227do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1228 const intern_type*& __from_next,
1229 extern_type* __to, extern_type* __to_end,
1230 extern_type*& __to_next) const
1231{
1232 range<const char32_t> from{ __from, __from_end };
d951e75d 1233 range<char16_t, false> to{ __to, __to_end };
28af1fb3
JW
1234 auto res = ucs4_out(from, to, _M_maxcode, _M_mode);
1235 __from_next = from.next;
1236 __to_next = reinterpret_cast<char*>(to.next);
1237 return res;
1238}
1239
1240codecvt_base::result
1241__codecvt_utf16_base<char32_t>::
1242do_unshift(state_type&, extern_type* __to, extern_type*,
1243 extern_type*& __to_next) const
1244{
1245 __to_next = __to;
1246 return noconv;
1247}
1248
1249codecvt_base::result
1250__codecvt_utf16_base<char32_t>::
1251do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1252 const extern_type*& __from_next,
1253 intern_type* __to, intern_type* __to_end,
1254 intern_type*& __to_next) const
1255{
d951e75d 1256 range<const char16_t, false> from{ __from, __from_end };
28af1fb3
JW
1257 range<char32_t> to{ __to, __to_end };
1258 auto res = ucs4_in(from, to, _M_maxcode, _M_mode);
1259 __from_next = reinterpret_cast<const char*>(from.next);
1260 __to_next = to.next;
d951e75d
JW
1261 if (res == codecvt_base::ok && __from_next != __from_end)
1262 res = codecvt_base::error;
28af1fb3
JW
1263 return res;
1264}
1265
1266int
1267__codecvt_utf16_base<char32_t>::do_encoding() const throw()
516231de 1268{ return 0; } // UTF-16 is not a fixed-width encoding
28af1fb3
JW
1269
1270bool
1271__codecvt_utf16_base<char32_t>::do_always_noconv() const throw()
1272{ return false; }
1273
1274int
1275__codecvt_utf16_base<char32_t>::
1276do_length(state_type&, const extern_type* __from,
1277 const extern_type* __end, size_t __max) const
1278{
d951e75d
JW
1279 range<const char16_t, false> from{ __from, __end };
1280 const char16_t* next = ucs4_span(from, __max, _M_maxcode, _M_mode);
28af1fb3
JW
1281 return reinterpret_cast<const char*>(next) - __from;
1282}
1283
1284int
1285__codecvt_utf16_base<char32_t>::do_max_length() const throw()
516231de
JW
1286{
1287 // A single UCS-4 character requires one or two UTF-16 code units
1288 // (so up to four chars).
1289 int max = 4;
1290 if (_M_mode & consume_header)
1291 max += sizeof(utf16_bom);
1292 return max;
1293}
28af1fb3
JW
1294
1295#ifdef _GLIBCXX_USE_WCHAR_T
1296// Define members of codecvt_utf16<wchar_t> base class implementation.
1297// Converts from UTF-16 to UCS-2 or UCS-4 depending on sizeof(wchar_t).
1298
1299__codecvt_utf16_base<wchar_t>::~__codecvt_utf16_base() { }
1300
1301codecvt_base::result
1302__codecvt_utf16_base<wchar_t>::
1303do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1304 const intern_type*& __from_next,
1305 extern_type* __to, extern_type* __to_end,
1306 extern_type*& __to_next) const
1307{
d951e75d 1308 range<char16_t, false> to{ __to, __to_end };
28af1fb3
JW
1309#if __SIZEOF_WCHAR_T__ == 2
1310 range<const char16_t> from{
1311 reinterpret_cast<const char16_t*>(__from),
d951e75d 1312 reinterpret_cast<const char16_t*>(__from_end),
28af1fb3
JW
1313 };
1314 auto res = ucs2_out(from, to, _M_maxcode, _M_mode);
1315#elif __SIZEOF_WCHAR_T__ == 4
1316 range<const char32_t> from{
1317 reinterpret_cast<const char32_t*>(__from),
d951e75d 1318 reinterpret_cast<const char32_t*>(__from_end),
28af1fb3
JW
1319 };
1320 auto res = ucs4_out(from, to, _M_maxcode, _M_mode);
1321#else
1322 return codecvt_base::error;
1323#endif
1324 __from_next = reinterpret_cast<const wchar_t*>(from.next);
a4c687d6 1325 __to_next = reinterpret_cast<char*>(to.next);
28af1fb3
JW
1326 return res;
1327}
1328
1329codecvt_base::result
1330__codecvt_utf16_base<wchar_t>::
1331do_unshift(state_type&, extern_type* __to, extern_type*,
1332 extern_type*& __to_next) const
1333{
1334 __to_next = __to;
1335 return noconv;
1336}
1337
1338codecvt_base::result
1339__codecvt_utf16_base<wchar_t>::
1340do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1341 const extern_type*& __from_next,
1342 intern_type* __to, intern_type* __to_end,
1343 intern_type*& __to_next) const
1344{
d951e75d 1345 range<const char16_t, false> from{ __from, __from_end };
28af1fb3
JW
1346#if __SIZEOF_WCHAR_T__ == 2
1347 range<char16_t> to{
1348 reinterpret_cast<char16_t*>(__to),
d951e75d 1349 reinterpret_cast<char16_t*>(__to_end),
28af1fb3
JW
1350 };
1351 auto res = ucs2_in(from, to, _M_maxcode, _M_mode);
1352#elif __SIZEOF_WCHAR_T__ == 4
1353 range<char32_t> to{
1354 reinterpret_cast<char32_t*>(__to),
d951e75d 1355 reinterpret_cast<char32_t*>(__to_end),
28af1fb3
JW
1356 };
1357 auto res = ucs4_in(from, to, _M_maxcode, _M_mode);
1358#else
1359 return codecvt_base::error;
1360#endif
a4c687d6 1361 __from_next = reinterpret_cast<const char*>(from.next);
28af1fb3 1362 __to_next = reinterpret_cast<wchar_t*>(to.next);
d951e75d
JW
1363 if (res == codecvt_base::ok && __from_next != __from_end)
1364 res = codecvt_base::error;
28af1fb3
JW
1365 return res;
1366}
1367
1368int
1369__codecvt_utf16_base<wchar_t>::do_encoding() const throw()
516231de 1370{ return 0; } // UTF-16 is not a fixed-width encoding
28af1fb3
JW
1371
1372bool
1373__codecvt_utf16_base<wchar_t>::do_always_noconv() const throw()
1374{ return false; }
1375
1376int
1377__codecvt_utf16_base<wchar_t>::
1378do_length(state_type&, const extern_type* __from,
1379 const extern_type* __end, size_t __max) const
1380{
d951e75d 1381 range<const char16_t, false> from{ __from, __end };
28af1fb3 1382#if __SIZEOF_WCHAR_T__ == 2
d951e75d 1383 const char16_t* next = ucs2_span(from, __max, _M_maxcode, _M_mode);
28af1fb3 1384#elif __SIZEOF_WCHAR_T__ == 4
d951e75d 1385 const char16_t* next = ucs4_span(from, __max, _M_maxcode, _M_mode);
28af1fb3
JW
1386#endif
1387 return reinterpret_cast<const char*>(next) - __from;
1388}
1389
1390int
1391__codecvt_utf16_base<wchar_t>::do_max_length() const throw()
516231de
JW
1392{
1393#if __SIZEOF_WCHAR_T__ == 2
1394 int max = 2; // See __codecvt_utf16_base<char16_t>::do_max_length()
1395#else
1396 int max = 4; // See __codecvt_utf16_base<char32_t>::do_max_length()
1397#endif
1398 if (_M_mode & consume_header)
1399 max += sizeof(utf16_bom);
1400 return max;
1401}
28af1fb3
JW
1402#endif
1403
1404// Define members of codecvt_utf8_utf16<char16_t> base class implementation.
1405// Converts from UTF-8 to UTF-16.
1406
1407__codecvt_utf8_utf16_base<char16_t>::~__codecvt_utf8_utf16_base() { }
1408
1409codecvt_base::result
1410__codecvt_utf8_utf16_base<char16_t>::
1411do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1412 const intern_type*& __from_next,
1413 extern_type* __to, extern_type* __to_end,
1414 extern_type*& __to_next) const
1415{
1416 range<const char16_t> from{ __from, __from_end };
1417 range<char> to{ __to, __to_end };
1418 auto res = utf16_out(from, to, _M_maxcode, _M_mode);
1419 __from_next = from.next;
1420 __to_next = to.next;
1421 return res;
1422}
1423
1424codecvt_base::result
1425__codecvt_utf8_utf16_base<char16_t>::
1426do_unshift(state_type&, extern_type* __to, extern_type*,
1427 extern_type*& __to_next) const
1428{
1429 __to_next = __to;
1430 return noconv;
1431}
1432
1433codecvt_base::result
1434__codecvt_utf8_utf16_base<char16_t>::
1435do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1436 const extern_type*& __from_next,
1437 intern_type* __to, intern_type* __to_end,
1438 intern_type*& __to_next) const
1439{
1440 range<const char> from{ __from, __from_end };
1441 range<char16_t> to{ __to, __to_end };
a1e1ec76 1442 codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header));
795038b7
JW
1443#if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
1444 mode = codecvt_mode(mode | little_endian);
1445#endif
1446 auto res = utf16_in(from, to, _M_maxcode, mode);
28af1fb3
JW
1447 __from_next = from.next;
1448 __to_next = to.next;
1449 return res;
1450}
1451
1452int
1453__codecvt_utf8_utf16_base<char16_t>::do_encoding() const throw()
516231de 1454{ return 0; } // UTF-8 is not a fixed-width encoding
28af1fb3
JW
1455
1456bool
1457__codecvt_utf8_utf16_base<char16_t>::do_always_noconv() const throw()
1458{ return false; }
1459
1460int
1461__codecvt_utf8_utf16_base<char16_t>::
1462do_length(state_type&, const extern_type* __from,
1463 const extern_type* __end, size_t __max) const
1464{
1465 __end = utf16_span(__from, __end, __max, _M_maxcode, _M_mode);
1466 return __end - __from;
1467}
1468
1469int
1470__codecvt_utf8_utf16_base<char16_t>::do_max_length() const throw()
1471{
516231de
JW
1472 // A single character can be 1 or 2 UTF-16 code units,
1473 // requiring up to 4 UTF-8 code units.
1474 int max = 4;
1475 if (_M_mode & consume_header)
1476 max += sizeof(utf8_bom);
1477 return max;
28af1fb3
JW
1478}
1479
1480// Define members of codecvt_utf8_utf16<char32_t> base class implementation.
1481// Converts from UTF-8 to UTF-16.
1482
1483__codecvt_utf8_utf16_base<char32_t>::~__codecvt_utf8_utf16_base() { }
1484
1485codecvt_base::result
1486__codecvt_utf8_utf16_base<char32_t>::
1487do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1488 const intern_type*& __from_next,
1489 extern_type* __to, extern_type* __to_end,
1490 extern_type*& __to_next) const
1491{
1492 range<const char32_t> from{ __from, __from_end };
1493 range<char> to{ __to, __to_end };
1494 auto res = utf16_out(from, to, _M_maxcode, _M_mode);
1495 __from_next = from.next;
1496 __to_next = to.next;
1497 return res;
1498}
1499
1500codecvt_base::result
1501__codecvt_utf8_utf16_base<char32_t>::
1502do_unshift(state_type&, extern_type* __to, extern_type*,
1503 extern_type*& __to_next) const
1504{
1505 __to_next = __to;
1506 return noconv;
1507}
1508
1509codecvt_base::result
1510__codecvt_utf8_utf16_base<char32_t>::
1511do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1512 const extern_type*& __from_next,
1513 intern_type* __to, intern_type* __to_end,
1514 intern_type*& __to_next) const
1515{
1516 range<const char> from{ __from, __from_end };
1517 range<char32_t> to{ __to, __to_end };
02e12bda
JW
1518 codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header));
1519#if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
1520 mode = codecvt_mode(mode | little_endian);
1521#endif
1522 auto res = utf16_in(from, to, _M_maxcode, mode);
28af1fb3
JW
1523 __from_next = from.next;
1524 __to_next = to.next;
1525 return res;
1526}
1527
1528int
1529__codecvt_utf8_utf16_base<char32_t>::do_encoding() const throw()
516231de 1530{ return 0; } // UTF-8 is not a fixed-width encoding
28af1fb3
JW
1531
1532bool
1533__codecvt_utf8_utf16_base<char32_t>::do_always_noconv() const throw()
1534{ return false; }
1535
1536int
1537__codecvt_utf8_utf16_base<char32_t>::
1538do_length(state_type&, const extern_type* __from,
1539 const extern_type* __end, size_t __max) const
1540{
1541 __end = utf16_span(__from, __end, __max, _M_maxcode, _M_mode);
1542 return __end - __from;
1543}
1544
1545int
1546__codecvt_utf8_utf16_base<char32_t>::do_max_length() const throw()
1547{
516231de
JW
1548 // A single character can be 1 or 2 UTF-16 code units,
1549 // requiring up to 4 UTF-8 code units.
1550 int max = 4;
1551 if (_M_mode & consume_header)
1552 max += sizeof(utf8_bom);
1553 return max;
28af1fb3
JW
1554}
1555
1556#ifdef _GLIBCXX_USE_WCHAR_T
1557// Define members of codecvt_utf8_utf16<wchar_t> base class implementation.
1558// Converts from UTF-8 to UTF-16.
1559
1560__codecvt_utf8_utf16_base<wchar_t>::~__codecvt_utf8_utf16_base() { }
1561
1562codecvt_base::result
1563__codecvt_utf8_utf16_base<wchar_t>::
1564do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1565 const intern_type*& __from_next,
1566 extern_type* __to, extern_type* __to_end,
1567 extern_type*& __to_next) const
1568{
1569 range<const wchar_t> from{ __from, __from_end };
1570 range<char> to{ __to, __to_end };
1571 auto res = utf16_out(from, to, _M_maxcode, _M_mode);
1572 __from_next = from.next;
1573 __to_next = to.next;
1574 return res;
1575}
1576
1577codecvt_base::result
1578__codecvt_utf8_utf16_base<wchar_t>::
1579do_unshift(state_type&, extern_type* __to, extern_type*,
1580 extern_type*& __to_next) const
1581{
1582 __to_next = __to;
1583 return noconv;
1584}
1585
1586codecvt_base::result
1587__codecvt_utf8_utf16_base<wchar_t>::
1588do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1589 const extern_type*& __from_next,
1590 intern_type* __to, intern_type* __to_end,
1591 intern_type*& __to_next) const
1592{
1593 range<const char> from{ __from, __from_end };
1594 range<wchar_t> to{ __to, __to_end };
02e12bda
JW
1595 codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header));
1596#if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
1597 mode = codecvt_mode(mode | little_endian);
1598#endif
1599 auto res = utf16_in(from, to, _M_maxcode, mode);
28af1fb3
JW
1600 __from_next = from.next;
1601 __to_next = to.next;
1602 return res;
1603}
1604
1605int
1606__codecvt_utf8_utf16_base<wchar_t>::do_encoding() const throw()
516231de 1607{ return 0; } // UTF-8 is not a fixed-width encoding
28af1fb3
JW
1608
1609bool
1610__codecvt_utf8_utf16_base<wchar_t>::do_always_noconv() const throw()
1611{ return false; }
1612
1613int
1614__codecvt_utf8_utf16_base<wchar_t>::
1615do_length(state_type&, const extern_type* __from,
1616 const extern_type* __end, size_t __max) const
1617{
1618 __end = utf16_span(__from, __end, __max, _M_maxcode, _M_mode);
1619 return __end - __from;
1620}
1621
1622int
1623__codecvt_utf8_utf16_base<wchar_t>::do_max_length() const throw()
1624{
516231de
JW
1625 // A single character can be 1 or 2 UTF-16 code units,
1626 // requiring up to 4 UTF-8 code units.
1627 int max = 4;
1628 if (_M_mode & consume_header)
1629 max += sizeof(utf8_bom);
1630 return max;
28af1fb3
JW
1631}
1632#endif
1633
bb93f35d
JW
1634inline template class __codecvt_abstract_base<char16_t, char, mbstate_t>;
1635inline template class __codecvt_abstract_base<char32_t, char, mbstate_t>;
71a16cd8
JW
1636template class codecvt_byname<char16_t, char, mbstate_t>;
1637template class codecvt_byname<char32_t, char, mbstate_t>;
bb93f35d
JW
1638
1639_GLIBCXX_END_NAMESPACE_VERSION
1640}