]>
Commit | Line | Data |
---|---|---|
bb93f35d JW |
1 | // Locale support (codecvt) -*- C++ -*- |
2 | ||
83ffe9cd | 3 | // Copyright (C) 2015-2023 Free Software Foundation, Inc. |
bb93f35d JW |
4 | // |
5 | // This file is part of the GNU ISO C++ Library. This library is free | |
6 | // software; you can redistribute it and/or modify it under the | |
7 | // terms of the GNU General Public License as published by the | |
8 | // Free Software Foundation; either version 3, or (at your option) | |
9 | // any later version. | |
10 | ||
11 | // This library is distributed in the hope that it will be useful, | |
12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | // GNU General Public License for more details. | |
15 | ||
16 | // Under Section 7 of GPL version 3, you are granted additional | |
17 | // permissions described in the GCC Runtime Library Exception, version | |
18 | // 3.1, as published by the Free Software Foundation. | |
19 | ||
20 | // You should have received a copy of the GNU General Public License and | |
21 | // a copy of the GCC Runtime Library Exception along with this program; | |
22 | // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see | |
23 | // <http://www.gnu.org/licenses/>. | |
24 | ||
28af1fb3 | 25 | #include <codecvt> |
bb93f35d | 26 | #include <cstring> // std::memcpy, std::memcmp |
bcd682e1 | 27 | #include <bits/stl_algobase.h> // std::min |
bb93f35d | 28 | |
bb93f35d JW |
29 | namespace std _GLIBCXX_VISIBILITY(default) |
30 | { | |
31 | _GLIBCXX_BEGIN_NAMESPACE_VERSION | |
32 | ||
bcd682e1 JW |
33 | // The standard doesn't define these operators, which is annoying. |
// Convert a codecvt_mode value to its underlying integer type so that
// bitwise arithmetic can be performed on it.
static underlying_type<codecvt_mode>::type
to_integer(codecvt_mode m)
{
  using int_type = underlying_type<codecvt_mode>::type;
  return static_cast<int_type>(m);
}
bcd682e1 JW |
37 | |
38 | static codecvt_mode& operator&=(codecvt_mode& m, codecvt_mode n) | |
39 | { return m = codecvt_mode(to_integer(m) & to_integer(n)); } | |
40 | ||
41 | static codecvt_mode& operator|=(codecvt_mode& m, codecvt_mode n) | |
42 | { return m = codecvt_mode(to_integer(m) | to_integer(n)); } | |
43 | ||
44 | static codecvt_mode operator~(codecvt_mode m) | |
45 | { return codecvt_mode(~to_integer(m)); } | |
46 | ||
bb93f35d JW |
47 | namespace |
48 | { | |
// Largest code point that can be stored in one UTF-16 code unit.
const char32_t max_single_utf16_unit = 0xFFFF;

// Default upper bound on code points accepted by the conversions.
const char32_t max_code_point = 0x10FFFF;

// Sentinel values returned by the read_*_code_point functions.
// Those functions rely on maxcode < incomplete_mb_character
// (which is enforced by the codecvt_utf* classes on construction),
// so both sentinels always compare greater than maxcode.
const char32_t incomplete_mb_character = static_cast<char32_t>(-2);
const char32_t invalid_mb_sequence = static_cast<char32_t>(-1);
58 | ||
d951e75d JW |
// A cursor over the half-open range [next, end) of code units of type
// Elem, used both for reading input and for writing output.
template<typename Elem, bool Aligned = true>
  struct range
  {
    Elem* next;	// current position
    Elem* end;	// one past the last code unit

    // Store e at the current position, then step past it.
    range& operator=(Elem e)
    {
      *next = e;
      ++next;
      return *this;
    }

    // The code unit at the current position.
    Elem operator*() const { return next[0]; }

    // The code unit n positions after the current one.
    Elem operator[](size_t n) const { return *(next + n); }

    // Advance by one code unit.
    range& operator++()
    {
      next += 1;
      return *this;
    }

    // Advance by n code units.
    range& operator+=(size_t n)
    {
      next = next + n;
      return *this;
    }

    // How many code units remain.
    size_t size() const { return end - next; }

    // How many bytes remain.
    size_t nbytes() const
    {
      return reinterpret_cast<const char*>(end)
	- reinterpret_cast<const char*>(next);
    }
  };
100 | ||
101 | // This specialization is used when accessing char16_t values through | |
102 | // pointers to char, which might not be correctly aligned for char16_t. | |
103 | template<typename Elem> | |
104 | struct range<Elem, false> | |
105 | { | |
106 | using value_type = typename remove_const<Elem>::type; | |
107 | ||
108 | using char_pointer = typename | |
109 | conditional<is_const<Elem>::value, const char*, char*>::type; | |
110 | ||
111 | char_pointer next; | |
112 | char_pointer end; | |
113 | ||
114 | // Write a code unit. | |
115 | range& operator=(Elem e) | |
116 | { | |
117 | memcpy(next, &e, sizeof(Elem)); | |
118 | ++*this; | |
119 | return *this; | |
120 | } | |
121 | ||
122 | // Read the next code unit. | |
123 | Elem operator*() const | |
124 | { | |
125 | value_type e; | |
126 | memcpy(&e, next, sizeof(Elem)); | |
127 | return e; | |
128 | } | |
129 | ||
130 | // Read the Nth code unit. | |
131 | Elem operator[](size_t n) const | |
132 | { | |
133 | value_type e; | |
134 | memcpy(&e, next + n * sizeof(Elem), sizeof(Elem)); | |
135 | return e; | |
136 | } | |
137 | ||
138 | // Move to the next code unit. | |
139 | range& operator++() | |
140 | { | |
141 | next += sizeof(Elem); | |
142 | return *this; | |
143 | } | |
144 | ||
145 | // Move to the Nth code unit. | |
146 | range& operator+=(size_t n) | |
147 | { | |
148 | next += n * sizeof(Elem); | |
149 | return *this; | |
150 | } | |
151 | ||
152 | // The number of code units remaining. | |
153 | size_t size() const { return nbytes() / sizeof(Elem); } | |
154 | ||
155 | // The number of bytes remaining. | |
156 | size_t nbytes() const { return end - next; } | |
bb93f35d JW |
157 | }; |
158 | ||
28af1fb3 JW |
// A multibyte sequence may begin with a "header" consisting of a
// Byte Order Mark. These are the byte patterns that are written by
// write_bom and recognized by read_bom.
const unsigned char utf8_bom[3] = { 0xEF, 0xBB, 0xBF };
const unsigned char utf16_bom[2] = { 0xFE, 0xFF };	// used for big-endian
const unsigned char utf16le_bom[2] = { 0xFF, 0xFE };	// used for little-endian
28af1fb3 | 163 | |
d951e75d JW |
164 | // Write a BOM (space permitting). |
165 | template<typename C, bool A, size_t N> | |
166 | bool | |
167 | write_bom(range<C, A>& to, const unsigned char (&bom)[N]) | |
28af1fb3 | 168 | { |
d951e75d JW |
169 | static_assert( (N / sizeof(C)) != 0, "" ); |
170 | static_assert( (N % sizeof(C)) == 0, "" ); | |
171 | ||
172 | if (to.nbytes() < N) | |
28af1fb3 JW |
173 | return false; |
174 | memcpy(to.next, bom, N); | |
d951e75d | 175 | to += (N / sizeof(C)); |
28af1fb3 JW |
176 | return true; |
177 | } | |
178 | ||
d951e75d JW |
179 | // Try to read a BOM. |
180 | template<typename C, bool A, size_t N> | |
181 | bool | |
182 | read_bom(range<C, A>& from, const unsigned char (&bom)[N]) | |
183 | { | |
184 | static_assert( (N / sizeof(C)) != 0, "" ); | |
185 | static_assert( (N % sizeof(C)) == 0, "" ); | |
186 | ||
187 | if (from.nbytes() >= N && !memcmp(from.next, bom, N)) | |
188 | { | |
189 | from += (N / sizeof(C)); | |
190 | return true; | |
191 | } | |
192 | return false; | |
193 | } | |
194 | ||
28af1fb3 | 195 | // If generate_header is set in mode write out UTF-8 BOM. |
c124af93 | 196 | template<typename C> |
28af1fb3 | 197 | bool |
c124af93 | 198 | write_utf8_bom(range<C>& to, codecvt_mode mode) |
28af1fb3 JW |
199 | { |
200 | if (mode & generate_header) | |
201 | return write_bom(to, utf8_bom); | |
202 | return true; | |
203 | } | |
204 | ||
205 | // If generate_header is set in mode write out the UTF-16 BOM indicated | |
206 | // by whether little_endian is set in mode. | |
d951e75d | 207 | template<bool Aligned> |
28af1fb3 | 208 | bool |
d951e75d | 209 | write_utf16_bom(range<char16_t, Aligned>& to, codecvt_mode mode) |
28af1fb3 JW |
210 | { |
211 | if (mode & generate_header) | |
212 | { | |
d951e75d JW |
213 | if (mode & little_endian) |
214 | return write_bom(to, utf16le_bom); | |
215 | else | |
216 | return write_bom(to, utf16_bom); | |
28af1fb3 JW |
217 | } |
218 | return true; | |
219 | } | |
220 | ||
28af1fb3 | 221 | // If consume_header is set in mode update from.next to after any BOM. |
c124af93 | 222 | template<typename C> |
28af1fb3 | 223 | void |
c124af93 | 224 | read_utf8_bom(range<const C>& from, codecvt_mode mode) |
28af1fb3 JW |
225 | { |
226 | if (mode & consume_header) | |
227 | read_bom(from, utf8_bom); | |
228 | } | |
229 | ||
bcd682e1 JW |
230 | // If consume_header is not set in mode, no effects. |
231 | // Otherwise, if *from.next is a UTF-16 BOM increment from.next and then: | |
232 | // - if the UTF-16BE BOM was found unset little_endian in mode, or | |
233 | // - if the UTF-16LE BOM was found set little_endian in mode. | |
d951e75d | 234 | template<bool Aligned> |
bcd682e1 | 235 | void |
d951e75d | 236 | read_utf16_bom(range<const char16_t, Aligned>& from, codecvt_mode& mode) |
28af1fb3 | 237 | { |
d951e75d | 238 | if (mode & consume_header) |
28af1fb3 | 239 | { |
d951e75d JW |
240 | if (read_bom(from, utf16_bom)) |
241 | mode &= ~little_endian; | |
242 | else if (read_bom(from, utf16le_bom)) | |
243 | mode |= little_endian; | |
28af1fb3 | 244 | } |
28af1fb3 JW |
245 | } |
246 | ||
247 | // Read a codepoint from a UTF-8 multibyte sequence. | |
248 | // Updates from.next if the codepoint is not greater than maxcode. | |
b6584a72 | 249 | // Returns invalid_mb_sequence, incomplete_mb_character or the code point. |
c124af93 | 250 | template<typename C> |
bb93f35d | 251 | char32_t |
c124af93 | 252 | read_utf8_code_point(range<const C>& from, unsigned long maxcode) |
bb93f35d | 253 | { |
b6584a72 | 254 | const size_t avail = from.size(); |
bb93f35d | 255 | if (avail == 0) |
b6584a72 | 256 | return incomplete_mb_character; |
8f7b7c14 | 257 | char32_t c1 = (unsigned char) from[0]; |
bb93f35d JW |
258 | // https://en.wikipedia.org/wiki/UTF-8#Sample_code |
259 | if (c1 < 0x80) | |
260 | { | |
d951e75d | 261 | ++from; |
bb93f35d JW |
262 | return c1; |
263 | } | |
264 | else if (c1 < 0xC2) // continuation or overlong 2-byte sequence | |
b6584a72 | 265 | return invalid_mb_sequence; |
bb93f35d JW |
266 | else if (c1 < 0xE0) // 2-byte sequence |
267 | { | |
268 | if (avail < 2) | |
b6584a72 | 269 | return incomplete_mb_character; |
8f7b7c14 | 270 | char32_t c2 = (unsigned char) from[1]; |
bb93f35d | 271 | if ((c2 & 0xC0) != 0x80) |
b6584a72 | 272 | return invalid_mb_sequence; |
bb93f35d | 273 | char32_t c = (c1 << 6) + c2 - 0x3080; |
28af1fb3 | 274 | if (c <= maxcode) |
d951e75d | 275 | from += 2; |
bb93f35d JW |
276 | return c; |
277 | } | |
278 | else if (c1 < 0xF0) // 3-byte sequence | |
279 | { | |
02dab998 | 280 | if (avail < 2) |
b6584a72 | 281 | return incomplete_mb_character; |
8f7b7c14 | 282 | char32_t c2 = (unsigned char) from[1]; |
bb93f35d | 283 | if ((c2 & 0xC0) != 0x80) |
b6584a72 | 284 | return invalid_mb_sequence; |
bb93f35d | 285 | if (c1 == 0xE0 && c2 < 0xA0) // overlong |
b6584a72 | 286 | return invalid_mb_sequence; |
02dab998 DM |
287 | if (avail < 3) |
288 | return incomplete_mb_character; | |
8f7b7c14 | 289 | char32_t c3 = (unsigned char) from[2]; |
bb93f35d | 290 | if ((c3 & 0xC0) != 0x80) |
b6584a72 | 291 | return invalid_mb_sequence; |
bb93f35d | 292 | char32_t c = (c1 << 12) + (c2 << 6) + c3 - 0xE2080; |
28af1fb3 | 293 | if (c <= maxcode) |
d951e75d | 294 | from += 3; |
bb93f35d JW |
295 | return c; |
296 | } | |
02dab998 | 297 | else if (c1 < 0xF5 && maxcode > 0xFFFF) // 4-byte sequence |
bb93f35d | 298 | { |
02dab998 | 299 | if (avail < 2) |
b6584a72 | 300 | return incomplete_mb_character; |
8f7b7c14 | 301 | char32_t c2 = (unsigned char) from[1]; |
bb93f35d | 302 | if ((c2 & 0xC0) != 0x80) |
b6584a72 | 303 | return invalid_mb_sequence; |
bb93f35d | 304 | if (c1 == 0xF0 && c2 < 0x90) // overlong |
b6584a72 | 305 | return invalid_mb_sequence; |
bb93f35d | 306 | if (c1 == 0xF4 && c2 >= 0x90) // > U+10FFFF |
02dab998 DM |
307 | return invalid_mb_sequence; |
308 | if (avail < 3) | |
309 | return incomplete_mb_character; | |
8f7b7c14 | 310 | char32_t c3 = (unsigned char) from[2]; |
bb93f35d | 311 | if ((c3 & 0xC0) != 0x80) |
b6584a72 | 312 | return invalid_mb_sequence; |
02dab998 DM |
313 | if (avail < 4) |
314 | return incomplete_mb_character; | |
8f7b7c14 | 315 | char32_t c4 = (unsigned char) from[3]; |
bb93f35d | 316 | if ((c4 & 0xC0) != 0x80) |
b6584a72 | 317 | return invalid_mb_sequence; |
bb93f35d | 318 | char32_t c = (c1 << 18) + (c2 << 12) + (c3 << 6) + c4 - 0x3C82080; |
28af1fb3 | 319 | if (c <= maxcode) |
d951e75d | 320 | from += 4; |
bb93f35d JW |
321 | return c; |
322 | } | |
323 | else // > U+10FFFF | |
b6584a72 | 324 | return invalid_mb_sequence; |
bb93f35d JW |
325 | } |
326 | ||
c124af93 | 327 | template<typename C> |
bb93f35d | 328 | bool |
c124af93 | 329 | write_utf8_code_point(range<C>& to, char32_t code_point) |
bb93f35d JW |
330 | { |
331 | if (code_point < 0x80) | |
332 | { | |
333 | if (to.size() < 1) | |
334 | return false; | |
d951e75d | 335 | to = code_point; |
bb93f35d JW |
336 | } |
337 | else if (code_point <= 0x7FF) | |
338 | { | |
339 | if (to.size() < 2) | |
340 | return false; | |
d951e75d JW |
341 | to = (code_point >> 6) + 0xC0; |
342 | to = (code_point & 0x3F) + 0x80; | |
bb93f35d JW |
343 | } |
344 | else if (code_point <= 0xFFFF) | |
345 | { | |
346 | if (to.size() < 3) | |
347 | return false; | |
d951e75d JW |
348 | to = (code_point >> 12) + 0xE0; |
349 | to = ((code_point >> 6) & 0x3F) + 0x80; | |
350 | to = (code_point & 0x3F) + 0x80; | |
bb93f35d JW |
351 | } |
352 | else if (code_point <= 0x10FFFF) | |
353 | { | |
354 | if (to.size() < 4) | |
355 | return false; | |
d951e75d JW |
356 | to = (code_point >> 18) + 0xF0; |
357 | to = ((code_point >> 12) & 0x3F) + 0x80; | |
358 | to = ((code_point >> 6) & 0x3F) + 0x80; | |
359 | to = (code_point & 0x3F) + 0x80; | |
bb93f35d JW |
360 | } |
361 | else | |
362 | return false; | |
363 | return true; | |
364 | } | |
365 | ||
28af1fb3 JW |
// Convert a code unit between the host byte order and the byte order
// selected by (mode & little_endian). The bytes are swapped only when
// the two orders differ, so the same function serves for both reading
// and writing.
inline char16_t
adjust_byte_order(char16_t c, codecvt_mode mode)
{
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  const bool swap_bytes = (mode & little_endian) != 0;
#else
  const bool swap_bytes = (mode & little_endian) == 0;
#endif
  return swap_bytes ? __builtin_bswap16(c) : c;
}
375 | ||
b6584a72 JW |
// Return true if c is a high-surrogate (aka leading) code point,
// i.e. lies in [0xD800, 0xDBFF]. All values in that range share the
// same bits above the low ten.
inline bool
is_high_surrogate(char32_t c)
{
  return (c & 0xFFFFFC00u) == 0xD800u;
}
382 | ||
// Return true if c is a low-surrogate (aka trailing) code point,
// i.e. lies in [0xDC00, 0xDFFF]. All values in that range share the
// same bits above the low ten.
inline bool
is_low_surrogate(char32_t c)
{
  return (c & 0xFFFFFC00u) == 0xDC00u;
}
389 | ||
// Combine a high/low surrogate pair into the code point it encodes.
// The arguments are not validated here.
inline char32_t
surrogate_pair_to_code_point(char32_t high, char32_t low)
{
  // Equals 0x35FDC00: removes the two surrogate bases and adds back
  // the 0x10000 offset of the supplementary planes.
  const char32_t surrogate_offset = (0xD800 << 10) + 0xDC00 - 0x10000;
  return (high << 10) + low - surrogate_offset;
}
395 | ||
28af1fb3 JW |
396 | // Read a codepoint from a UTF-16 multibyte sequence. |
397 | // The sequence's endianness is indicated by (mode & little_endian). | |
398 | // Updates from.next if the codepoint is not greater than maxcode. | |
b6584a72 | 399 | // Returns invalid_mb_sequence, incomplete_mb_character or the code point. |
d951e75d JW |
400 | template<bool Aligned> |
401 | char32_t | |
402 | read_utf16_code_point(range<const char16_t, Aligned>& from, | |
403 | unsigned long maxcode, codecvt_mode mode) | |
404 | { | |
405 | const size_t avail = from.size(); | |
406 | if (avail == 0) | |
407 | return incomplete_mb_character; | |
408 | int inc = 1; | |
409 | char32_t c = adjust_byte_order(from[0], mode); | |
410 | if (is_high_surrogate(c)) | |
411 | { | |
412 | if (avail < 2) | |
413 | return incomplete_mb_character; | |
414 | const char16_t c2 = adjust_byte_order(from[1], mode); | |
415 | if (is_low_surrogate(c2)) | |
416 | { | |
417 | c = surrogate_pair_to_code_point(c, c2); | |
418 | inc = 2; | |
419 | } | |
420 | else | |
421 | return invalid_mb_sequence; | |
422 | } | |
423 | else if (is_low_surrogate(c)) | |
424 | return invalid_mb_sequence; | |
425 | if (c <= maxcode) | |
426 | from += inc; | |
427 | return c; | |
428 | } | |
28af1fb3 | 429 | |
d951e75d | 430 | template<typename C, bool A> |
bb93f35d | 431 | bool |
d951e75d | 432 | write_utf16_code_point(range<C, A>& to, char32_t codepoint, codecvt_mode mode) |
bb93f35d | 433 | { |
28af1fb3 JW |
434 | static_assert(sizeof(C) >= 2, "a code unit must be at least 16-bit"); |
435 | ||
02e12bda | 436 | if (codepoint <= max_single_utf16_unit) |
bb93f35d JW |
437 | { |
438 | if (to.size() > 0) | |
439 | { | |
d951e75d | 440 | to = adjust_byte_order(codepoint, mode); |
bb93f35d JW |
441 | return true; |
442 | } | |
443 | } | |
444 | else if (to.size() > 1) | |
445 | { | |
446 | // Algorithm from http://www.unicode.org/faq/utf_bom.html#utf16-4 | |
447 | const char32_t LEAD_OFFSET = 0xD800 - (0x10000 >> 10); | |
bb93f35d JW |
448 | char16_t lead = LEAD_OFFSET + (codepoint >> 10); |
449 | char16_t trail = 0xDC00 + (codepoint & 0x3FF); | |
d951e75d JW |
450 | to = adjust_byte_order(lead, mode); |
451 | to = adjust_byte_order(trail, mode); | |
bb93f35d JW |
452 | return true; |
453 | } | |
454 | return false; | |
455 | } | |
456 | ||
457 | // utf8 -> ucs4 | |
c124af93 | 458 | template<typename C> |
bb93f35d | 459 | codecvt_base::result |
c124af93 | 460 | ucs4_in(range<const C>& from, range<char32_t>& to, |
28af1fb3 | 461 | unsigned long maxcode = max_code_point, codecvt_mode mode = {}) |
bb93f35d | 462 | { |
28af1fb3 | 463 | read_utf8_bom(from, mode); |
bb93f35d JW |
464 | while (from.size() && to.size()) |
465 | { | |
466 | const char32_t codepoint = read_utf8_code_point(from, maxcode); | |
b6584a72 JW |
467 | if (codepoint == incomplete_mb_character) |
468 | return codecvt_base::partial; | |
28af1fb3 | 469 | if (codepoint > maxcode) |
bb93f35d | 470 | return codecvt_base::error; |
d951e75d | 471 | to = codepoint; |
bb93f35d JW |
472 | } |
473 | return from.size() ? codecvt_base::partial : codecvt_base::ok; | |
474 | } | |
475 | ||
476 | // ucs4 -> utf8 | |
c124af93 | 477 | template<typename C> |
bb93f35d | 478 | codecvt_base::result |
c124af93 | 479 | ucs4_out(range<const char32_t>& from, range<C>& to, |
28af1fb3 | 480 | unsigned long maxcode = max_code_point, codecvt_mode mode = {}) |
bb93f35d | 481 | { |
28af1fb3 JW |
482 | if (!write_utf8_bom(to, mode)) |
483 | return codecvt_base::partial; | |
bb93f35d JW |
484 | while (from.size()) |
485 | { | |
d951e75d | 486 | const char32_t c = from[0]; |
bb93f35d JW |
487 | if (c > maxcode) |
488 | return codecvt_base::error; | |
489 | if (!write_utf8_code_point(to, c)) | |
490 | return codecvt_base::partial; | |
d951e75d | 491 | ++from; |
bb93f35d JW |
492 | } |
493 | return codecvt_base::ok; | |
494 | } | |
495 | ||
28af1fb3 JW |
496 | // utf16 -> ucs4 |
497 | codecvt_base::result | |
d951e75d | 498 | ucs4_in(range<const char16_t, false>& from, range<char32_t>& to, |
28af1fb3 JW |
499 | unsigned long maxcode = max_code_point, codecvt_mode mode = {}) |
500 | { | |
bcd682e1 | 501 | read_utf16_bom(from, mode); |
28af1fb3 JW |
502 | while (from.size() && to.size()) |
503 | { | |
504 | const char32_t codepoint = read_utf16_code_point(from, maxcode, mode); | |
b6584a72 JW |
505 | if (codepoint == incomplete_mb_character) |
506 | return codecvt_base::partial; | |
28af1fb3 JW |
507 | if (codepoint > maxcode) |
508 | return codecvt_base::error; | |
d951e75d | 509 | to = codepoint; |
28af1fb3 JW |
510 | } |
511 | return from.size() ? codecvt_base::partial : codecvt_base::ok; | |
512 | } | |
513 | ||
514 | // ucs4 -> utf16 | |
515 | codecvt_base::result | |
d951e75d | 516 | ucs4_out(range<const char32_t>& from, range<char16_t, false>& to, |
28af1fb3 JW |
517 | unsigned long maxcode = max_code_point, codecvt_mode mode = {}) |
518 | { | |
519 | if (!write_utf16_bom(to, mode)) | |
520 | return codecvt_base::partial; | |
521 | while (from.size()) | |
522 | { | |
d951e75d | 523 | const char32_t c = from[0]; |
28af1fb3 JW |
524 | if (c > maxcode) |
525 | return codecvt_base::error; | |
526 | if (!write_utf16_code_point(to, c, mode)) | |
527 | return codecvt_base::partial; | |
d951e75d | 528 | ++from; |
28af1fb3 JW |
529 | } |
530 | return codecvt_base::ok; | |
531 | } | |
532 | ||
bcd682e1 JW |
// Flag indicating whether the 16-bit side of a conversion is UTF-16
// (surrogate pairs allowed) or UCS-2 (surrogates disallowed).
enum class surrogates
{
  allowed,
  disallowed
};
535 | ||
02dab998 DM |
536 | // utf8 -> utf16 (or utf8 -> ucs2 if maxcode <= 0xFFFF) |
537 | template <typename C8, typename C16> | |
bb93f35d | 538 | codecvt_base::result |
02dab998 DM |
539 | utf16_in(range<const C8> &from, range<C16> &to, |
540 | unsigned long maxcode = max_code_point, codecvt_mode mode = {}) | |
bb93f35d | 541 | { |
28af1fb3 | 542 | read_utf8_bom(from, mode); |
bb93f35d JW |
543 | while (from.size() && to.size()) |
544 | { | |
d951e75d | 545 | auto orig = from; |
bb93f35d | 546 | const char32_t codepoint = read_utf8_code_point(from, maxcode); |
b6584a72 | 547 | if (codepoint == incomplete_mb_character) |
02dab998 | 548 | return codecvt_base::partial; |
28af1fb3 | 549 | if (codepoint > maxcode) |
bb93f35d | 550 | return codecvt_base::error; |
7f971f18 | 551 | if (!write_utf16_code_point(to, codepoint, mode)) |
bb93f35d | 552 | { |
d951e75d | 553 | from = orig; // rewind to previous position |
bb93f35d JW |
554 | return codecvt_base::partial; |
555 | } | |
556 | } | |
02dab998 | 557 | return from.size() ? codecvt_base::partial : codecvt_base::ok; |
bb93f35d JW |
558 | } |
559 | ||
bcd682e1 | 560 | // utf16 -> utf8 (or ucs2 -> utf8 if s == surrogates::disallowed) |
c124af93 | 561 | template<typename C16, typename C8> |
bb93f35d | 562 | codecvt_base::result |
c124af93 | 563 | utf16_out(range<const C16>& from, range<C8>& to, |
bcd682e1 JW |
564 | unsigned long maxcode = max_code_point, codecvt_mode mode = {}, |
565 | surrogates s = surrogates::allowed) | |
bb93f35d | 566 | { |
28af1fb3 JW |
567 | if (!write_utf8_bom(to, mode)) |
568 | return codecvt_base::partial; | |
bb93f35d JW |
569 | while (from.size()) |
570 | { | |
d951e75d | 571 | char32_t c = from[0]; |
bb93f35d | 572 | int inc = 1; |
b6584a72 | 573 | if (is_high_surrogate(c)) |
bb93f35d | 574 | { |
bcd682e1 JW |
575 | if (s == surrogates::disallowed) |
576 | return codecvt_base::error; // No surrogates in UCS-2 | |
577 | ||
bb93f35d | 578 | if (from.size() < 2) |
02dab998 | 579 | return codecvt_base::partial; // stop converting at this point |
bb93f35d | 580 | |
d951e75d | 581 | const char32_t c2 = from[1]; |
b6584a72 | 582 | if (is_low_surrogate(c2)) |
bb93f35d | 583 | { |
b6584a72 | 584 | c = surrogate_pair_to_code_point(c, c2); |
bb93f35d | 585 | inc = 2; |
bb93f35d JW |
586 | } |
587 | else | |
588 | return codecvt_base::error; | |
589 | } | |
b6584a72 JW |
590 | else if (is_low_surrogate(c)) |
591 | return codecvt_base::error; | |
bb93f35d JW |
592 | if (c > maxcode) |
593 | return codecvt_base::error; | |
594 | if (!write_utf8_code_point(to, c)) | |
595 | return codecvt_base::partial; | |
d951e75d | 596 | from += inc; |
bb93f35d JW |
597 | } |
598 | return codecvt_base::ok; | |
599 | } | |
600 | ||
601 | // return pos such that [begin,pos) is valid UTF-16 string no longer than max | |
c124af93 TH |
602 | template<typename C> |
603 | const C* | |
604 | utf16_span(const C* begin, const C* end, size_t max, | |
28af1fb3 | 605 | char32_t maxcode = max_code_point, codecvt_mode mode = {}) |
bb93f35d | 606 | { |
c124af93 | 607 | range<const C> from{ begin, end }; |
28af1fb3 | 608 | read_utf8_bom(from, mode); |
bb93f35d JW |
609 | size_t count = 0; |
610 | while (count+1 < max) | |
611 | { | |
612 | char32_t c = read_utf8_code_point(from, maxcode); | |
b6584a72 JW |
613 | if (c > maxcode) |
614 | return from.next; | |
bb93f35d JW |
615 | else if (c > max_single_utf16_unit) |
616 | ++count; | |
617 | ++count; | |
618 | } | |
619 | if (count+1 == max) // take one more character if it fits in a single unit | |
bcd682e1 | 620 | read_utf8_code_point(from, std::min(max_single_utf16_unit, maxcode)); |
28af1fb3 | 621 | return from.next; |
bb93f35d JW |
622 | } |
623 | ||
28af1fb3 | 624 | // utf8 -> ucs2 |
c124af93 | 625 | template<typename C> |
28af1fb3 | 626 | codecvt_base::result |
c124af93 | 627 | ucs2_in(range<const C>& from, range<char16_t>& to, |
28af1fb3 | 628 | char32_t maxcode = max_code_point, codecvt_mode mode = {}) |
bb93f35d | 629 | { |
bcd682e1 JW |
630 | // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit: |
631 | maxcode = std::min(max_single_utf16_unit, maxcode); | |
02dab998 | 632 | return utf16_in(from, to, maxcode, mode); |
28af1fb3 JW |
633 | } |
634 | ||
635 | // ucs2 -> utf8 | |
c124af93 | 636 | template<typename C> |
28af1fb3 | 637 | codecvt_base::result |
c124af93 | 638 | ucs2_out(range<const char16_t>& from, range<C>& to, |
28af1fb3 JW |
639 | char32_t maxcode = max_code_point, codecvt_mode mode = {}) |
640 | { | |
bcd682e1 JW |
641 | // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit: |
642 | maxcode = std::min(max_single_utf16_unit, maxcode); | |
643 | return utf16_out(from, to, maxcode, mode, surrogates::disallowed); | |
28af1fb3 JW |
644 | } |
645 | ||
646 | // ucs2 -> utf16 | |
647 | codecvt_base::result | |
d951e75d | 648 | ucs2_out(range<const char16_t>& from, range<char16_t, false>& to, |
28af1fb3 JW |
649 | char32_t maxcode = max_code_point, codecvt_mode mode = {}) |
650 | { | |
651 | if (!write_utf16_bom(to, mode)) | |
652 | return codecvt_base::partial; | |
653 | while (from.size() && to.size()) | |
bb93f35d | 654 | { |
d951e75d | 655 | char16_t c = from[0]; |
b6584a72 | 656 | if (is_high_surrogate(c)) |
28af1fb3 JW |
657 | return codecvt_base::error; |
658 | if (c > maxcode) | |
659 | return codecvt_base::error; | |
d951e75d JW |
660 | to = adjust_byte_order(c, mode); |
661 | ++from; | |
28af1fb3 JW |
662 | } |
663 | return from.size() == 0 ? codecvt_base::ok : codecvt_base::partial; | |
664 | } | |
665 | ||
666 | // utf16 -> ucs2 | |
667 | codecvt_base::result | |
d951e75d | 668 | ucs2_in(range<const char16_t, false>& from, range<char16_t>& to, |
28af1fb3 JW |
669 | char32_t maxcode = max_code_point, codecvt_mode mode = {}) |
670 | { | |
bcd682e1 JW |
671 | read_utf16_bom(from, mode); |
672 | // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit: | |
673 | maxcode = std::min(max_single_utf16_unit, maxcode); | |
28af1fb3 JW |
674 | while (from.size() && to.size()) |
675 | { | |
676 | const char32_t c = read_utf16_code_point(from, maxcode, mode); | |
b6584a72 | 677 | if (c == incomplete_mb_character) |
bcd682e1 | 678 | return codecvt_base::error; // UCS-2 only supports single units. |
b6584a72 | 679 | if (c > maxcode) |
28af1fb3 | 680 | return codecvt_base::error; |
d951e75d | 681 | to = c; |
bb93f35d | 682 | } |
28af1fb3 JW |
683 | return from.size() == 0 ? codecvt_base::ok : codecvt_base::partial; |
684 | } | |
685 | ||
686 | const char16_t* | |
d951e75d | 687 | ucs2_span(range<const char16_t, false>& from, size_t max, |
28af1fb3 JW |
688 | char32_t maxcode, codecvt_mode mode) |
689 | { | |
bcd682e1 JW |
690 | read_utf16_bom(from, mode); |
691 | // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit: | |
692 | maxcode = std::min(max_single_utf16_unit, maxcode); | |
28af1fb3 JW |
693 | char32_t c = 0; |
694 | while (max-- && c <= maxcode) | |
695 | c = read_utf16_code_point(from, maxcode, mode); | |
d951e75d | 696 | return reinterpret_cast<const char16_t*>(from.next); |
28af1fb3 JW |
697 | } |
698 | ||
c124af93 TH |
699 | template<typename C> |
700 | const C* | |
701 | ucs2_span(const C* begin, const C* end, size_t max, | |
28af1fb3 JW |
702 | char32_t maxcode, codecvt_mode mode) |
703 | { | |
c124af93 | 704 | range<const C> from{ begin, end }; |
28af1fb3 | 705 | read_utf8_bom(from, mode); |
bcd682e1 JW |
706 | // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit: |
707 | maxcode = std::min(max_single_utf16_unit, maxcode); | |
28af1fb3 JW |
708 | char32_t c = 0; |
709 | while (max-- && c <= maxcode) | |
710 | c = read_utf8_code_point(from, maxcode); | |
711 | return from.next; | |
712 | } | |
713 | ||
714 | // return pos such that [begin,pos) is valid UCS-4 string no longer than max | |
c124af93 TH |
715 | template<typename C> |
716 | const C* | |
717 | ucs4_span(const C* begin, const C* end, size_t max, | |
28af1fb3 JW |
718 | char32_t maxcode = max_code_point, codecvt_mode mode = {}) |
719 | { | |
c124af93 | 720 | range<const C> from{ begin, end }; |
28af1fb3 JW |
721 | read_utf8_bom(from, mode); |
722 | char32_t c = 0; | |
723 | while (max-- && c <= maxcode) | |
724 | c = read_utf8_code_point(from, maxcode); | |
725 | return from.next; | |
726 | } | |
727 | ||
728 | // return pos such that [begin,pos) is valid UCS-4 string no longer than max | |
729 | const char16_t* | |
d951e75d | 730 | ucs4_span(range<const char16_t, false>& from, size_t max, |
28af1fb3 JW |
731 | char32_t maxcode = max_code_point, codecvt_mode mode = {}) |
732 | { | |
bcd682e1 | 733 | read_utf16_bom(from, mode); |
28af1fb3 JW |
734 | char32_t c = 0; |
735 | while (max-- && c <= maxcode) | |
736 | c = read_utf16_code_point(from, maxcode, mode); | |
d951e75d | 737 | return reinterpret_cast<const char16_t*>(from.next); |
bb93f35d JW |
738 | } |
739 | } | |
740 | ||
741 | // Define members of codecvt<char16_t, char, mbstate_t> specialization. | |
742 | // Converts from UTF-8 to UTF-16. | |
743 | ||
744 | locale::id codecvt<char16_t, char, mbstate_t>::id; | |
745 | ||
746 | codecvt<char16_t, char, mbstate_t>::~codecvt() { } | |
747 | ||
748 | codecvt_base::result | |
749 | codecvt<char16_t, char, mbstate_t>:: | |
750 | do_out(state_type&, | |
751 | const intern_type* __from, | |
752 | const intern_type* __from_end, const intern_type*& __from_next, | |
753 | extern_type* __to, extern_type* __to_end, | |
754 | extern_type*& __to_next) const | |
755 | { | |
756 | range<const char16_t> from{ __from, __from_end }; | |
757 | range<char> to{ __to, __to_end }; | |
758 | auto res = utf16_out(from, to); | |
759 | __from_next = from.next; | |
760 | __to_next = to.next; | |
761 | return res; | |
762 | } | |
763 | ||
764 | codecvt_base::result | |
765 | codecvt<char16_t, char, mbstate_t>:: | |
766 | do_unshift(state_type&, extern_type* __to, extern_type*, | |
767 | extern_type*& __to_next) const | |
768 | { | |
769 | __to_next = __to; | |
770 | return noconv; // we don't use mbstate_t for the unicode facets | |
771 | } | |
772 | ||
773 | codecvt_base::result | |
774 | codecvt<char16_t, char, mbstate_t>:: | |
775 | do_in(state_type&, const extern_type* __from, const extern_type* __from_end, | |
776 | const extern_type*& __from_next, | |
777 | intern_type* __to, intern_type* __to_end, | |
778 | intern_type*& __to_next) const | |
779 | { | |
780 | range<const char> from{ __from, __from_end }; | |
781 | range<char16_t> to{ __to, __to_end }; | |
7f971f18 JW |
782 | #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ |
783 | codecvt_mode mode = {}; | |
784 | #else | |
785 | codecvt_mode mode = little_endian; | |
786 | #endif | |
787 | auto res = utf16_in(from, to, max_code_point, mode); | |
bb93f35d JW |
788 | __from_next = from.next; |
789 | __to_next = to.next; | |
790 | return res; | |
791 | } | |
792 | ||
793 | int | |
794 | codecvt<char16_t, char, mbstate_t>::do_encoding() const throw() | |
516231de | 795 | { return 0; } // UTF-8 is not a fixed-width encoding |
bb93f35d JW |
796 | |
797 | bool | |
798 | codecvt<char16_t, char, mbstate_t>::do_always_noconv() const throw() | |
799 | { return false; } | |
800 | ||
801 | int | |
802 | codecvt<char16_t, char, mbstate_t>:: | |
803 | do_length(state_type&, const extern_type* __from, | |
804 | const extern_type* __end, size_t __max) const | |
805 | { | |
28af1fb3 JW |
806 | __end = utf16_span(__from, __end, __max); |
807 | return __end - __from; | |
bb93f35d JW |
808 | } |
809 | ||
810 | int | |
811 | codecvt<char16_t, char, mbstate_t>::do_max_length() const throw() | |
812 | { | |
516231de JW |
813 | // A single character (one or two UTF-16 code units) requires |
814 | // up to four UTF-8 code units. | |
815 | return 4; | |
bb93f35d JW |
816 | } |
817 | ||
818 | // Define members of codecvt<char32_t, char, mbstate_t> specialization. | |
819 | // Converts from UTF-8 to UTF-32 (aka UCS-4). | |
820 | ||
821 | locale::id codecvt<char32_t, char, mbstate_t>::id; | |
822 | ||
823 | codecvt<char32_t, char, mbstate_t>::~codecvt() { } | |
824 | ||
825 | codecvt_base::result | |
826 | codecvt<char32_t, char, mbstate_t>:: | |
827 | do_out(state_type&, const intern_type* __from, const intern_type* __from_end, | |
828 | const intern_type*& __from_next, | |
829 | extern_type* __to, extern_type* __to_end, | |
830 | extern_type*& __to_next) const | |
831 | { | |
832 | range<const char32_t> from{ __from, __from_end }; | |
833 | range<char> to{ __to, __to_end }; | |
834 | auto res = ucs4_out(from, to); | |
835 | __from_next = from.next; | |
836 | __to_next = to.next; | |
837 | return res; | |
838 | } | |
839 | ||
840 | codecvt_base::result | |
841 | codecvt<char32_t, char, mbstate_t>:: | |
842 | do_unshift(state_type&, extern_type* __to, extern_type*, | |
843 | extern_type*& __to_next) const | |
844 | { | |
845 | __to_next = __to; | |
846 | return noconv; | |
847 | } | |
848 | ||
849 | codecvt_base::result | |
850 | codecvt<char32_t, char, mbstate_t>:: | |
851 | do_in(state_type&, const extern_type* __from, const extern_type* __from_end, | |
852 | const extern_type*& __from_next, | |
853 | intern_type* __to, intern_type* __to_end, | |
854 | intern_type*& __to_next) const | |
855 | { | |
856 | range<const char> from{ __from, __from_end }; | |
857 | range<char32_t> to{ __to, __to_end }; | |
858 | auto res = ucs4_in(from, to); | |
859 | __from_next = from.next; | |
860 | __to_next = to.next; | |
861 | return res; | |
862 | } | |
863 | ||
864 | int | |
865 | codecvt<char32_t, char, mbstate_t>::do_encoding() const throw() | |
516231de | 866 | { return 0; } // UTF-8 is not a fixed-width encoding |
bb93f35d JW |
867 | |
868 | bool | |
869 | codecvt<char32_t, char, mbstate_t>::do_always_noconv() const throw() | |
870 | { return false; } | |
871 | ||
872 | int | |
873 | codecvt<char32_t, char, mbstate_t>:: | |
874 | do_length(state_type&, const extern_type* __from, | |
875 | const extern_type* __end, size_t __max) const | |
876 | { | |
28af1fb3 JW |
877 | __end = ucs4_span(__from, __end, __max); |
878 | return __end - __from; | |
bb93f35d JW |
879 | } |
880 | ||
881 | int | |
882 | codecvt<char32_t, char, mbstate_t>::do_max_length() const throw() | |
516231de JW |
883 | { |
884 | // A single character (one UTF-32 code unit) requires | |
885 | // up to 4 UTF-8 code units. | |
886 | return 4; | |
887 | } | |
bb93f35d | 888 | |
c124af93 TH |
889 | #if defined(_GLIBCXX_USE_CHAR8_T) |
890 | // Define members of codecvt<char16_t, char8_t, mbstate_t> specialization. | |
891 | // Converts from UTF-8 to UTF-16. | |
892 | ||
893 | locale::id codecvt<char16_t, char8_t, mbstate_t>::id; | |
894 | ||
895 | codecvt<char16_t, char8_t, mbstate_t>::~codecvt() { } | |
896 | ||
897 | codecvt_base::result | |
898 | codecvt<char16_t, char8_t, mbstate_t>:: | |
899 | do_out(state_type&, | |
900 | const intern_type* __from, | |
901 | const intern_type* __from_end, const intern_type*& __from_next, | |
902 | extern_type* __to, extern_type* __to_end, | |
903 | extern_type*& __to_next) const | |
904 | { | |
905 | range<const char16_t> from{ __from, __from_end }; | |
906 | range<char8_t> to{ __to, __to_end }; | |
907 | auto res = utf16_out(from, to); | |
908 | __from_next = from.next; | |
909 | __to_next = to.next; | |
910 | return res; | |
911 | } | |
912 | ||
913 | codecvt_base::result | |
914 | codecvt<char16_t, char8_t, mbstate_t>:: | |
915 | do_unshift(state_type&, extern_type* __to, extern_type*, | |
916 | extern_type*& __to_next) const | |
917 | { | |
918 | __to_next = __to; | |
919 | return noconv; // we don't use mbstate_t for the unicode facets | |
920 | } | |
921 | ||
922 | codecvt_base::result | |
923 | codecvt<char16_t, char8_t, mbstate_t>:: | |
924 | do_in(state_type&, const extern_type* __from, const extern_type* __from_end, | |
925 | const extern_type*& __from_next, | |
926 | intern_type* __to, intern_type* __to_end, | |
927 | intern_type*& __to_next) const | |
928 | { | |
929 | range<const char8_t> from{ __from, __from_end }; | |
930 | range<char16_t> to{ __to, __to_end }; | |
931 | #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ | |
932 | codecvt_mode mode = {}; | |
933 | #else | |
934 | codecvt_mode mode = little_endian; | |
935 | #endif | |
936 | auto res = utf16_in(from, to, max_code_point, mode); | |
937 | __from_next = from.next; | |
938 | __to_next = to.next; | |
939 | return res; | |
940 | } | |
941 | ||
942 | int | |
943 | codecvt<char16_t, char8_t, mbstate_t>::do_encoding() const throw() | |
944 | { return 0; } // UTF-8 is not a fixed-width encoding | |
945 | ||
946 | bool | |
947 | codecvt<char16_t, char8_t, mbstate_t>::do_always_noconv() const throw() | |
948 | { return false; } | |
949 | ||
950 | int | |
951 | codecvt<char16_t, char8_t, mbstate_t>:: | |
952 | do_length(state_type&, const extern_type* __from, | |
953 | const extern_type* __end, size_t __max) const | |
954 | { | |
955 | __end = utf16_span(__from, __end, __max); | |
956 | return __end - __from; | |
957 | } | |
958 | ||
959 | int | |
960 | codecvt<char16_t, char8_t, mbstate_t>::do_max_length() const throw() | |
961 | { | |
962 | // A single character (one or two UTF-16 code units) requires | |
963 | // up to four UTF-8 code units. | |
964 | return 4; | |
965 | } | |
966 | ||
967 | // Define members of codecvt<char32_t, char8_t, mbstate_t> specialization. | |
968 | // Converts from UTF-8 to UTF-32 (aka UCS-4). | |
969 | ||
970 | locale::id codecvt<char32_t, char8_t, mbstate_t>::id; | |
971 | ||
972 | codecvt<char32_t, char8_t, mbstate_t>::~codecvt() { } | |
973 | ||
974 | codecvt_base::result | |
975 | codecvt<char32_t, char8_t, mbstate_t>:: | |
976 | do_out(state_type&, const intern_type* __from, const intern_type* __from_end, | |
977 | const intern_type*& __from_next, | |
978 | extern_type* __to, extern_type* __to_end, | |
979 | extern_type*& __to_next) const | |
980 | { | |
981 | range<const char32_t> from{ __from, __from_end }; | |
982 | range<char8_t> to{ __to, __to_end }; | |
983 | auto res = ucs4_out(from, to); | |
984 | __from_next = from.next; | |
985 | __to_next = to.next; | |
986 | return res; | |
987 | } | |
988 | ||
989 | codecvt_base::result | |
990 | codecvt<char32_t, char8_t, mbstate_t>:: | |
991 | do_unshift(state_type&, extern_type* __to, extern_type*, | |
992 | extern_type*& __to_next) const | |
993 | { | |
994 | __to_next = __to; | |
995 | return noconv; | |
996 | } | |
997 | ||
998 | codecvt_base::result | |
999 | codecvt<char32_t, char8_t, mbstate_t>:: | |
1000 | do_in(state_type&, const extern_type* __from, const extern_type* __from_end, | |
1001 | const extern_type*& __from_next, | |
1002 | intern_type* __to, intern_type* __to_end, | |
1003 | intern_type*& __to_next) const | |
1004 | { | |
1005 | range<const char8_t> from{ __from, __from_end }; | |
1006 | range<char32_t> to{ __to, __to_end }; | |
1007 | auto res = ucs4_in(from, to); | |
1008 | __from_next = from.next; | |
1009 | __to_next = to.next; | |
1010 | return res; | |
1011 | } | |
1012 | ||
1013 | int | |
1014 | codecvt<char32_t, char8_t, mbstate_t>::do_encoding() const throw() | |
1015 | { return 0; } // UTF-8 is not a fixed-width encoding | |
1016 | ||
1017 | bool | |
1018 | codecvt<char32_t, char8_t, mbstate_t>::do_always_noconv() const throw() | |
1019 | { return false; } | |
1020 | ||
1021 | int | |
1022 | codecvt<char32_t, char8_t, mbstate_t>:: | |
1023 | do_length(state_type&, const extern_type* __from, | |
1024 | const extern_type* __end, size_t __max) const | |
1025 | { | |
1026 | __end = ucs4_span(__from, __end, __max); | |
1027 | return __end - __from; | |
1028 | } | |
1029 | ||
1030 | int | |
1031 | codecvt<char32_t, char8_t, mbstate_t>::do_max_length() const throw() | |
1032 | { | |
1033 | // A single character (one UTF-32 code unit) requires | |
1034 | // up to 4 UTF-8 code units. | |
1035 | return 4; | |
1036 | } | |
1037 | #endif // _GLIBCXX_USE_CHAR8_T | |
1038 | ||
28af1fb3 JW |
1039 | // Define members of codecvt_utf8<char16_t> base class implementation. |
1040 | // Converts from UTF-8 to UCS-2. | |
1041 | ||
1042 | __codecvt_utf8_base<char16_t>::~__codecvt_utf8_base() { } | |
1043 | ||
1044 | codecvt_base::result | |
1045 | __codecvt_utf8_base<char16_t>:: | |
1046 | do_out(state_type&, const intern_type* __from, const intern_type* __from_end, | |
1047 | const intern_type*& __from_next, | |
1048 | extern_type* __to, extern_type* __to_end, | |
1049 | extern_type*& __to_next) const | |
1050 | { | |
1051 | range<const char16_t> from{ __from, __from_end }; | |
1052 | range<char> to{ __to, __to_end }; | |
1053 | auto res = ucs2_out(from, to, _M_maxcode, _M_mode); | |
1054 | __from_next = from.next; | |
1055 | __to_next = to.next; | |
1056 | return res; | |
1057 | } | |
1058 | ||
1059 | codecvt_base::result | |
1060 | __codecvt_utf8_base<char16_t>:: | |
1061 | do_unshift(state_type&, extern_type* __to, extern_type*, | |
1062 | extern_type*& __to_next) const | |
1063 | { | |
1064 | __to_next = __to; | |
1065 | return noconv; | |
1066 | } | |
1067 | ||
1068 | codecvt_base::result | |
1069 | __codecvt_utf8_base<char16_t>:: | |
1070 | do_in(state_type&, const extern_type* __from, const extern_type* __from_end, | |
1071 | const extern_type*& __from_next, | |
1072 | intern_type* __to, intern_type* __to_end, | |
1073 | intern_type*& __to_next) const | |
1074 | { | |
1075 | range<const char> from{ __from, __from_end }; | |
1076 | range<char16_t> to{ __to, __to_end }; | |
a1e1ec76 | 1077 | codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header)); |
29ca91f7 JW |
1078 | #if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__ |
1079 | mode = codecvt_mode(mode | little_endian); | |
1080 | #endif | |
1081 | auto res = ucs2_in(from, to, _M_maxcode, mode); | |
28af1fb3 JW |
1082 | __from_next = from.next; |
1083 | __to_next = to.next; | |
1084 | return res; | |
1085 | } | |
1086 | ||
1087 | int | |
1088 | __codecvt_utf8_base<char16_t>::do_encoding() const throw() | |
516231de | 1089 | { return 0; } // UTF-8 is not a fixed-width encoding |
28af1fb3 JW |
1090 | |
1091 | bool | |
1092 | __codecvt_utf8_base<char16_t>::do_always_noconv() const throw() | |
1093 | { return false; } | |
1094 | ||
1095 | int | |
1096 | __codecvt_utf8_base<char16_t>:: | |
1097 | do_length(state_type&, const extern_type* __from, | |
1098 | const extern_type* __end, size_t __max) const | |
1099 | { | |
1100 | __end = ucs2_span(__from, __end, __max, _M_maxcode, _M_mode); | |
1101 | return __end - __from; | |
1102 | } | |
1103 | ||
1104 | int | |
1105 | __codecvt_utf8_base<char16_t>::do_max_length() const throw() | |
516231de JW |
1106 | { |
1107 | // A single UCS-2 character requires up to three UTF-8 code units. | |
1108 | // (UCS-2 cannot represent characters that use four UTF-8 code units). | |
1109 | int max = 3; | |
1110 | if (_M_mode & consume_header) | |
1111 | max += sizeof(utf8_bom); | |
1112 | return max; | |
1113 | } | |
28af1fb3 JW |
1114 | |
1115 | // Define members of codecvt_utf8<char32_t> base class implementation. | |
1116 | // Converts from UTF-8 to UTF-32 (aka UCS-4). | |
1117 | ||
1118 | __codecvt_utf8_base<char32_t>::~__codecvt_utf8_base() { } | |
1119 | ||
1120 | codecvt_base::result | |
1121 | __codecvt_utf8_base<char32_t>:: | |
1122 | do_out(state_type&, const intern_type* __from, const intern_type* __from_end, | |
1123 | const intern_type*& __from_next, | |
1124 | extern_type* __to, extern_type* __to_end, | |
1125 | extern_type*& __to_next) const | |
1126 | { | |
1127 | range<const char32_t> from{ __from, __from_end }; | |
1128 | range<char> to{ __to, __to_end }; | |
1129 | auto res = ucs4_out(from, to, _M_maxcode, _M_mode); | |
1130 | __from_next = from.next; | |
1131 | __to_next = to.next; | |
1132 | return res; | |
1133 | } | |
1134 | ||
1135 | codecvt_base::result | |
1136 | __codecvt_utf8_base<char32_t>:: | |
1137 | do_unshift(state_type&, extern_type* __to, extern_type*, | |
1138 | extern_type*& __to_next) const | |
1139 | { | |
1140 | __to_next = __to; | |
1141 | return noconv; | |
1142 | } | |
1143 | ||
1144 | codecvt_base::result | |
1145 | __codecvt_utf8_base<char32_t>:: | |
1146 | do_in(state_type&, const extern_type* __from, const extern_type* __from_end, | |
1147 | const extern_type*& __from_next, | |
1148 | intern_type* __to, intern_type* __to_end, | |
1149 | intern_type*& __to_next) const | |
1150 | { | |
1151 | range<const char> from{ __from, __from_end }; | |
1152 | range<char32_t> to{ __to, __to_end }; | |
1153 | auto res = ucs4_in(from, to, _M_maxcode, _M_mode); | |
1154 | __from_next = from.next; | |
1155 | __to_next = to.next; | |
1156 | return res; | |
1157 | } | |
1158 | ||
1159 | int | |
1160 | __codecvt_utf8_base<char32_t>::do_encoding() const throw() | |
516231de | 1161 | { return 0; } // UTF-8 is not a fixed-width encoding |
28af1fb3 JW |
1162 | |
1163 | bool | |
1164 | __codecvt_utf8_base<char32_t>::do_always_noconv() const throw() | |
1165 | { return false; } | |
1166 | ||
1167 | int | |
1168 | __codecvt_utf8_base<char32_t>:: | |
1169 | do_length(state_type&, const extern_type* __from, | |
1170 | const extern_type* __end, size_t __max) const | |
1171 | { | |
1172 | __end = ucs4_span(__from, __end, __max, _M_maxcode, _M_mode); | |
1173 | return __end - __from; | |
1174 | } | |
1175 | ||
1176 | int | |
1177 | __codecvt_utf8_base<char32_t>::do_max_length() const throw() | |
516231de JW |
1178 | { |
1179 | // A single UCS-4 character requires up to four UTF-8 code units. | |
1180 | int max = 4; | |
1181 | if (_M_mode & consume_header) | |
1182 | max += sizeof(utf8_bom); | |
1183 | return max; | |
1184 | } | |
28af1fb3 JW |
1185 | |
1186 | #ifdef _GLIBCXX_USE_WCHAR_T | |
d951e75d JW |
1187 | |
1188 | #if __SIZEOF_WCHAR_T__ == 2 | |
1189 | static_assert(sizeof(wchar_t) == sizeof(char16_t), ""); | |
1190 | #elif __SIZEOF_WCHAR_T__ == 4 | |
1191 | static_assert(sizeof(wchar_t) == sizeof(char32_t), ""); | |
1192 | #endif | |
1193 | ||
28af1fb3 JW |
1194 | // Define members of codecvt_utf8<wchar_t> base class implementation. |
1195 | // Converts from UTF-8 to UCS-2 or UCS-4 depending on sizeof(wchar_t). | |
1196 | ||
1197 | __codecvt_utf8_base<wchar_t>::~__codecvt_utf8_base() { } | |
1198 | ||
1199 | codecvt_base::result | |
1200 | __codecvt_utf8_base<wchar_t>:: | |
1201 | do_out(state_type&, const intern_type* __from, const intern_type* __from_end, | |
1202 | const intern_type*& __from_next, | |
1203 | extern_type* __to, extern_type* __to_end, | |
1204 | extern_type*& __to_next) const | |
1205 | { | |
1206 | range<char> to{ __to, __to_end }; | |
1207 | #if __SIZEOF_WCHAR_T__ == 2 | |
1208 | range<const char16_t> from{ | |
1209 | reinterpret_cast<const char16_t*>(__from), | |
1210 | reinterpret_cast<const char16_t*>(__from_end) | |
1211 | }; | |
1212 | auto res = ucs2_out(from, to, _M_maxcode, _M_mode); | |
1213 | #elif __SIZEOF_WCHAR_T__ == 4 | |
1214 | range<const char32_t> from{ | |
1215 | reinterpret_cast<const char32_t*>(__from), | |
1216 | reinterpret_cast<const char32_t*>(__from_end) | |
1217 | }; | |
1218 | auto res = ucs4_out(from, to, _M_maxcode, _M_mode); | |
1219 | #else | |
1220 | return codecvt_base::error; | |
1221 | #endif | |
1222 | __from_next = reinterpret_cast<const wchar_t*>(from.next); | |
1223 | __to_next = to.next; | |
1224 | return res; | |
1225 | } | |
1226 | ||
1227 | codecvt_base::result | |
1228 | __codecvt_utf8_base<wchar_t>:: | |
1229 | do_unshift(state_type&, extern_type* __to, extern_type*, | |
1230 | extern_type*& __to_next) const | |
1231 | { | |
1232 | __to_next = __to; | |
1233 | return noconv; | |
1234 | } | |
1235 | ||
1236 | codecvt_base::result | |
1237 | __codecvt_utf8_base<wchar_t>:: | |
1238 | do_in(state_type&, const extern_type* __from, const extern_type* __from_end, | |
1239 | const extern_type*& __from_next, | |
1240 | intern_type* __to, intern_type* __to_end, | |
1241 | intern_type*& __to_next) const | |
1242 | { | |
1243 | range<const char> from{ __from, __from_end }; | |
1244 | #if __SIZEOF_WCHAR_T__ == 2 | |
1245 | range<char16_t> to{ | |
1246 | reinterpret_cast<char16_t*>(__to), | |
1247 | reinterpret_cast<char16_t*>(__to_end) | |
1248 | }; | |
a76400f4 JW |
1249 | #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ |
1250 | codecvt_mode mode = {}; | |
1251 | #else | |
1252 | codecvt_mode mode = little_endian; | |
1253 | #endif | |
1254 | auto res = ucs2_in(from, to, _M_maxcode, mode); | |
28af1fb3 JW |
1255 | #elif __SIZEOF_WCHAR_T__ == 4 |
1256 | range<char32_t> to{ | |
1257 | reinterpret_cast<char32_t*>(__to), | |
1258 | reinterpret_cast<char32_t*>(__to_end) | |
1259 | }; | |
1260 | auto res = ucs4_in(from, to, _M_maxcode, _M_mode); | |
1261 | #else | |
1262 | return codecvt_base::error; | |
1263 | #endif | |
1264 | __from_next = from.next; | |
1265 | __to_next = reinterpret_cast<wchar_t*>(to.next); | |
1266 | return res; | |
1267 | } | |
1268 | ||
1269 | int | |
1270 | __codecvt_utf8_base<wchar_t>::do_encoding() const throw() | |
516231de | 1271 | { return 0; } // UTF-8 is not a fixed-width encoding |
28af1fb3 JW |
1272 | |
1273 | bool | |
1274 | __codecvt_utf8_base<wchar_t>::do_always_noconv() const throw() | |
1275 | { return false; } | |
1276 | ||
1277 | int | |
1278 | __codecvt_utf8_base<wchar_t>:: | |
1279 | do_length(state_type&, const extern_type* __from, | |
1280 | const extern_type* __end, size_t __max) const | |
1281 | { | |
1282 | #if __SIZEOF_WCHAR_T__ == 2 | |
1283 | __end = ucs2_span(__from, __end, __max, _M_maxcode, _M_mode); | |
1284 | #elif __SIZEOF_WCHAR_T__ == 4 | |
1285 | __end = ucs4_span(__from, __end, __max, _M_maxcode, _M_mode); | |
1286 | #else | |
1287 | __end = __from; | |
1288 | #endif | |
1289 | return __end - __from; | |
1290 | } | |
1291 | ||
1292 | int | |
1293 | __codecvt_utf8_base<wchar_t>::do_max_length() const throw() | |
516231de JW |
1294 | { |
1295 | #if __SIZEOF_WCHAR_T__ == 2 | |
1296 | int max = 3; // See __codecvt_utf8_base<char16_t>::do_max_length() | |
1297 | #else | |
1298 | int max = 4; // See __codecvt_utf8_base<char32_t>::do_max_length() | |
1299 | #endif | |
1300 | if (_M_mode & consume_header) | |
1301 | max += sizeof(utf8_bom); | |
1302 | return max; | |
1303 | } | |
28af1fb3 JW |
1304 | #endif |
1305 | ||
1306 | // Define members of codecvt_utf16<char16_t> base class implementation. | |
1307 | // Converts from UTF-16 to UCS-2. | |
1308 | ||
1309 | __codecvt_utf16_base<char16_t>::~__codecvt_utf16_base() { } | |
1310 | ||
1311 | codecvt_base::result | |
1312 | __codecvt_utf16_base<char16_t>:: | |
1313 | do_out(state_type&, const intern_type* __from, const intern_type* __from_end, | |
1314 | const intern_type*& __from_next, | |
1315 | extern_type* __to, extern_type* __to_end, | |
1316 | extern_type*& __to_next) const | |
1317 | { | |
1318 | range<const char16_t> from{ __from, __from_end }; | |
d951e75d | 1319 | range<char16_t, false> to{ __to, __to_end }; |
28af1fb3 JW |
1320 | auto res = ucs2_out(from, to, _M_maxcode, _M_mode); |
1321 | __from_next = from.next; | |
1322 | __to_next = reinterpret_cast<char*>(to.next); | |
1323 | return res; | |
1324 | } | |
1325 | ||
1326 | codecvt_base::result | |
1327 | __codecvt_utf16_base<char16_t>:: | |
1328 | do_unshift(state_type&, extern_type* __to, extern_type*, | |
1329 | extern_type*& __to_next) const | |
1330 | { | |
1331 | __to_next = __to; | |
1332 | return noconv; | |
1333 | } | |
1334 | ||
1335 | codecvt_base::result | |
1336 | __codecvt_utf16_base<char16_t>:: | |
1337 | do_in(state_type&, const extern_type* __from, const extern_type* __from_end, | |
1338 | const extern_type*& __from_next, | |
1339 | intern_type* __to, intern_type* __to_end, | |
1340 | intern_type*& __to_next) const | |
1341 | { | |
d951e75d | 1342 | range<const char16_t, false> from{ __from, __from_end }; |
28af1fb3 JW |
1343 | range<char16_t> to{ __to, __to_end }; |
1344 | auto res = ucs2_in(from, to, _M_maxcode, _M_mode); | |
1345 | __from_next = reinterpret_cast<const char*>(from.next); | |
1346 | __to_next = to.next; | |
d951e75d JW |
1347 | if (res == codecvt_base::ok && __from_next != __from_end) |
1348 | res = codecvt_base::error; | |
28af1fb3 JW |
1349 | return res; |
1350 | } | |
1351 | ||
1352 | int | |
1353 | __codecvt_utf16_base<char16_t>::do_encoding() const throw() | |
516231de | 1354 | { return 0; } // UTF-16 is not a fixed-width encoding |
28af1fb3 JW |
1355 | |
1356 | bool | |
1357 | __codecvt_utf16_base<char16_t>::do_always_noconv() const throw() | |
1358 | { return false; } | |
1359 | ||
1360 | int | |
1361 | __codecvt_utf16_base<char16_t>:: | |
1362 | do_length(state_type&, const extern_type* __from, | |
1363 | const extern_type* __end, size_t __max) const | |
1364 | { | |
d951e75d JW |
1365 | range<const char16_t, false> from{ __from, __end }; |
1366 | const char16_t* next = ucs2_span(from, __max, _M_maxcode, _M_mode); | |
28af1fb3 JW |
1367 | return reinterpret_cast<const char*>(next) - __from; |
1368 | } | |
1369 | ||
1370 | int | |
1371 | __codecvt_utf16_base<char16_t>::do_max_length() const throw() | |
516231de JW |
1372 | { |
1373 | // A single UCS-2 character requires one UTF-16 code unit (so two chars). | |
1374 | // (UCS-2 cannot represent characters that use multiple UTF-16 code units). | |
1375 | int max = 2; | |
1376 | if (_M_mode & consume_header) | |
1377 | max += sizeof(utf16_bom); | |
1378 | return max; | |
1379 | } | |
28af1fb3 JW |
1380 | |
1381 | // Define members of codecvt_utf16<char32_t> base class implementation. | |
1382 | // Converts from UTF-16 to UTF-32 (aka UCS-4). | |
1383 | ||
1384 | __codecvt_utf16_base<char32_t>::~__codecvt_utf16_base() { } | |
1385 | ||
1386 | codecvt_base::result | |
1387 | __codecvt_utf16_base<char32_t>:: | |
1388 | do_out(state_type&, const intern_type* __from, const intern_type* __from_end, | |
1389 | const intern_type*& __from_next, | |
1390 | extern_type* __to, extern_type* __to_end, | |
1391 | extern_type*& __to_next) const | |
1392 | { | |
1393 | range<const char32_t> from{ __from, __from_end }; | |
d951e75d | 1394 | range<char16_t, false> to{ __to, __to_end }; |
28af1fb3 JW |
1395 | auto res = ucs4_out(from, to, _M_maxcode, _M_mode); |
1396 | __from_next = from.next; | |
1397 | __to_next = reinterpret_cast<char*>(to.next); | |
1398 | return res; | |
1399 | } | |
1400 | ||
1401 | codecvt_base::result | |
1402 | __codecvt_utf16_base<char32_t>:: | |
1403 | do_unshift(state_type&, extern_type* __to, extern_type*, | |
1404 | extern_type*& __to_next) const | |
1405 | { | |
1406 | __to_next = __to; | |
1407 | return noconv; | |
1408 | } | |
1409 | ||
1410 | codecvt_base::result | |
1411 | __codecvt_utf16_base<char32_t>:: | |
1412 | do_in(state_type&, const extern_type* __from, const extern_type* __from_end, | |
1413 | const extern_type*& __from_next, | |
1414 | intern_type* __to, intern_type* __to_end, | |
1415 | intern_type*& __to_next) const | |
1416 | { | |
d951e75d | 1417 | range<const char16_t, false> from{ __from, __from_end }; |
28af1fb3 JW |
1418 | range<char32_t> to{ __to, __to_end }; |
1419 | auto res = ucs4_in(from, to, _M_maxcode, _M_mode); | |
1420 | __from_next = reinterpret_cast<const char*>(from.next); | |
1421 | __to_next = to.next; | |
d951e75d JW |
1422 | if (res == codecvt_base::ok && __from_next != __from_end) |
1423 | res = codecvt_base::error; | |
28af1fb3 JW |
1424 | return res; |
1425 | } | |
1426 | ||
1427 | int | |
1428 | __codecvt_utf16_base<char32_t>::do_encoding() const throw() | |
516231de | 1429 | { return 0; } // UTF-16 is not a fixed-width encoding |
28af1fb3 JW |
1430 | |
1431 | bool | |
1432 | __codecvt_utf16_base<char32_t>::do_always_noconv() const throw() | |
1433 | { return false; } | |
1434 | ||
1435 | int | |
1436 | __codecvt_utf16_base<char32_t>:: | |
1437 | do_length(state_type&, const extern_type* __from, | |
1438 | const extern_type* __end, size_t __max) const | |
1439 | { | |
d951e75d JW |
1440 | range<const char16_t, false> from{ __from, __end }; |
1441 | const char16_t* next = ucs4_span(from, __max, _M_maxcode, _M_mode); | |
28af1fb3 JW |
1442 | return reinterpret_cast<const char*>(next) - __from; |
1443 | } | |
1444 | ||
1445 | int | |
1446 | __codecvt_utf16_base<char32_t>::do_max_length() const throw() | |
516231de JW |
1447 | { |
1448 | // A single UCS-4 character requires one or two UTF-16 code units | |
1449 | // (so up to four chars). | |
1450 | int max = 4; | |
1451 | if (_M_mode & consume_header) | |
1452 | max += sizeof(utf16_bom); | |
1453 | return max; | |
1454 | } | |
28af1fb3 JW |
1455 | |
1456 | #ifdef _GLIBCXX_USE_WCHAR_T | |
1457 | // Define members of codecvt_utf16<wchar_t> base class implementation. | |
1458 | // Converts from UTF-16 to UCS-2 or UCS-4 depending on sizeof(wchar_t). | |
1459 | ||
1460 | __codecvt_utf16_base<wchar_t>::~__codecvt_utf16_base() { } | |
1461 | ||
1462 | codecvt_base::result | |
1463 | __codecvt_utf16_base<wchar_t>:: | |
1464 | do_out(state_type&, const intern_type* __from, const intern_type* __from_end, | |
1465 | const intern_type*& __from_next, | |
1466 | extern_type* __to, extern_type* __to_end, | |
1467 | extern_type*& __to_next) const | |
1468 | { | |
d951e75d | 1469 | range<char16_t, false> to{ __to, __to_end }; |
28af1fb3 JW |
1470 | #if __SIZEOF_WCHAR_T__ == 2 |
1471 | range<const char16_t> from{ | |
1472 | reinterpret_cast<const char16_t*>(__from), | |
d951e75d | 1473 | reinterpret_cast<const char16_t*>(__from_end), |
28af1fb3 JW |
1474 | }; |
1475 | auto res = ucs2_out(from, to, _M_maxcode, _M_mode); | |
1476 | #elif __SIZEOF_WCHAR_T__ == 4 | |
1477 | range<const char32_t> from{ | |
1478 | reinterpret_cast<const char32_t*>(__from), | |
d951e75d | 1479 | reinterpret_cast<const char32_t*>(__from_end), |
28af1fb3 JW |
1480 | }; |
1481 | auto res = ucs4_out(from, to, _M_maxcode, _M_mode); | |
1482 | #else | |
1483 | return codecvt_base::error; | |
1484 | #endif | |
1485 | __from_next = reinterpret_cast<const wchar_t*>(from.next); | |
a4c687d6 | 1486 | __to_next = reinterpret_cast<char*>(to.next); |
28af1fb3 JW |
1487 | return res; |
1488 | } | |
1489 | ||
1490 | codecvt_base::result | |
1491 | __codecvt_utf16_base<wchar_t>:: | |
1492 | do_unshift(state_type&, extern_type* __to, extern_type*, | |
1493 | extern_type*& __to_next) const | |
1494 | { | |
1495 | __to_next = __to; | |
1496 | return noconv; | |
1497 | } | |
1498 | ||
1499 | codecvt_base::result | |
1500 | __codecvt_utf16_base<wchar_t>:: | |
1501 | do_in(state_type&, const extern_type* __from, const extern_type* __from_end, | |
1502 | const extern_type*& __from_next, | |
1503 | intern_type* __to, intern_type* __to_end, | |
1504 | intern_type*& __to_next) const | |
1505 | { | |
d951e75d | 1506 | range<const char16_t, false> from{ __from, __from_end }; |
28af1fb3 JW |
1507 | #if __SIZEOF_WCHAR_T__ == 2 |
1508 | range<char16_t> to{ | |
1509 | reinterpret_cast<char16_t*>(__to), | |
d951e75d | 1510 | reinterpret_cast<char16_t*>(__to_end), |
28af1fb3 JW |
1511 | }; |
1512 | auto res = ucs2_in(from, to, _M_maxcode, _M_mode); | |
1513 | #elif __SIZEOF_WCHAR_T__ == 4 | |
1514 | range<char32_t> to{ | |
1515 | reinterpret_cast<char32_t*>(__to), | |
d951e75d | 1516 | reinterpret_cast<char32_t*>(__to_end), |
28af1fb3 JW |
1517 | }; |
1518 | auto res = ucs4_in(from, to, _M_maxcode, _M_mode); | |
1519 | #else | |
1520 | return codecvt_base::error; | |
1521 | #endif | |
a4c687d6 | 1522 | __from_next = reinterpret_cast<const char*>(from.next); |
28af1fb3 | 1523 | __to_next = reinterpret_cast<wchar_t*>(to.next); |
d951e75d JW |
1524 | if (res == codecvt_base::ok && __from_next != __from_end) |
1525 | res = codecvt_base::error; | |
28af1fb3 JW |
1526 | return res; |
1527 | } | |
1528 | ||
1529 | int | |
1530 | __codecvt_utf16_base<wchar_t>::do_encoding() const throw() | |
516231de | 1531 | { return 0; } // UTF-16 is not a fixed-width encoding |
28af1fb3 JW |
1532 | |
1533 | bool | |
1534 | __codecvt_utf16_base<wchar_t>::do_always_noconv() const throw() | |
1535 | { return false; } | |
1536 | ||
1537 | int | |
1538 | __codecvt_utf16_base<wchar_t>:: | |
1539 | do_length(state_type&, const extern_type* __from, | |
1540 | const extern_type* __end, size_t __max) const | |
1541 | { | |
d951e75d | 1542 | range<const char16_t, false> from{ __from, __end }; |
28af1fb3 | 1543 | #if __SIZEOF_WCHAR_T__ == 2 |
d951e75d | 1544 | const char16_t* next = ucs2_span(from, __max, _M_maxcode, _M_mode); |
28af1fb3 | 1545 | #elif __SIZEOF_WCHAR_T__ == 4 |
d951e75d | 1546 | const char16_t* next = ucs4_span(from, __max, _M_maxcode, _M_mode); |
28af1fb3 JW |
1547 | #endif |
1548 | return reinterpret_cast<const char*>(next) - __from; | |
1549 | } | |
1550 | ||
1551 | int | |
1552 | __codecvt_utf16_base<wchar_t>::do_max_length() const throw() | |
516231de JW |
1553 | { |
1554 | #if __SIZEOF_WCHAR_T__ == 2 | |
1555 | int max = 2; // See __codecvt_utf16_base<char16_t>::do_max_length() | |
1556 | #else | |
1557 | int max = 4; // See __codecvt_utf16_base<char32_t>::do_max_length() | |
1558 | #endif | |
1559 | if (_M_mode & consume_header) | |
1560 | max += sizeof(utf16_bom); | |
1561 | return max; | |
1562 | } | |
28af1fb3 JW |
1563 | #endif |
1564 | ||
1565 | // Define members of codecvt_utf8_utf16<char16_t> base class implementation. | |
1566 | // Converts from UTF-8 to UTF-16. | |
1567 | ||
1568 | __codecvt_utf8_utf16_base<char16_t>::~__codecvt_utf8_utf16_base() { } | |
1569 | ||
1570 | codecvt_base::result | |
1571 | __codecvt_utf8_utf16_base<char16_t>:: | |
1572 | do_out(state_type&, const intern_type* __from, const intern_type* __from_end, | |
1573 | const intern_type*& __from_next, | |
1574 | extern_type* __to, extern_type* __to_end, | |
1575 | extern_type*& __to_next) const | |
1576 | { | |
1577 | range<const char16_t> from{ __from, __from_end }; | |
1578 | range<char> to{ __to, __to_end }; | |
1579 | auto res = utf16_out(from, to, _M_maxcode, _M_mode); | |
1580 | __from_next = from.next; | |
1581 | __to_next = to.next; | |
1582 | return res; | |
1583 | } | |
1584 | ||
1585 | codecvt_base::result | |
1586 | __codecvt_utf8_utf16_base<char16_t>:: | |
1587 | do_unshift(state_type&, extern_type* __to, extern_type*, | |
1588 | extern_type*& __to_next) const | |
1589 | { | |
1590 | __to_next = __to; | |
1591 | return noconv; | |
1592 | } | |
1593 | ||
1594 | codecvt_base::result | |
1595 | __codecvt_utf8_utf16_base<char16_t>:: | |
1596 | do_in(state_type&, const extern_type* __from, const extern_type* __from_end, | |
1597 | const extern_type*& __from_next, | |
1598 | intern_type* __to, intern_type* __to_end, | |
1599 | intern_type*& __to_next) const | |
1600 | { | |
1601 | range<const char> from{ __from, __from_end }; | |
1602 | range<char16_t> to{ __to, __to_end }; | |
a1e1ec76 | 1603 | codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header)); |
795038b7 JW |
1604 | #if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__ |
1605 | mode = codecvt_mode(mode | little_endian); | |
1606 | #endif | |
1607 | auto res = utf16_in(from, to, _M_maxcode, mode); | |
28af1fb3 JW |
1608 | __from_next = from.next; |
1609 | __to_next = to.next; | |
1610 | return res; | |
1611 | } | |
1612 | ||
1613 | int | |
1614 | __codecvt_utf8_utf16_base<char16_t>::do_encoding() const throw() | |
516231de | 1615 | { return 0; } // UTF-8 is not a fixed-width encoding |
28af1fb3 JW |
1616 | |
1617 | bool | |
1618 | __codecvt_utf8_utf16_base<char16_t>::do_always_noconv() const throw() | |
1619 | { return false; } | |
1620 | ||
1621 | int | |
1622 | __codecvt_utf8_utf16_base<char16_t>:: | |
1623 | do_length(state_type&, const extern_type* __from, | |
1624 | const extern_type* __end, size_t __max) const | |
1625 | { | |
1626 | __end = utf16_span(__from, __end, __max, _M_maxcode, _M_mode); | |
1627 | return __end - __from; | |
1628 | } | |
1629 | ||
1630 | int | |
1631 | __codecvt_utf8_utf16_base<char16_t>::do_max_length() const throw() | |
1632 | { | |
516231de JW |
1633 | // A single character can be 1 or 2 UTF-16 code units, |
1634 | // requiring up to 4 UTF-8 code units. | |
1635 | int max = 4; | |
1636 | if (_M_mode & consume_header) | |
1637 | max += sizeof(utf8_bom); | |
1638 | return max; | |
28af1fb3 JW |
1639 | } |
1640 | ||
1641 | // Define members of codecvt_utf8_utf16<char32_t> base class implementation. | |
1642 | // Converts from UTF-8 to UTF-16. | |
1643 | ||
1644 | __codecvt_utf8_utf16_base<char32_t>::~__codecvt_utf8_utf16_base() { } | |
1645 | ||
1646 | codecvt_base::result | |
1647 | __codecvt_utf8_utf16_base<char32_t>:: | |
1648 | do_out(state_type&, const intern_type* __from, const intern_type* __from_end, | |
1649 | const intern_type*& __from_next, | |
1650 | extern_type* __to, extern_type* __to_end, | |
1651 | extern_type*& __to_next) const | |
1652 | { | |
1653 | range<const char32_t> from{ __from, __from_end }; | |
1654 | range<char> to{ __to, __to_end }; | |
1655 | auto res = utf16_out(from, to, _M_maxcode, _M_mode); | |
1656 | __from_next = from.next; | |
1657 | __to_next = to.next; | |
1658 | return res; | |
1659 | } | |
1660 | ||
1661 | codecvt_base::result | |
1662 | __codecvt_utf8_utf16_base<char32_t>:: | |
1663 | do_unshift(state_type&, extern_type* __to, extern_type*, | |
1664 | extern_type*& __to_next) const | |
1665 | { | |
1666 | __to_next = __to; | |
1667 | return noconv; | |
1668 | } | |
1669 | ||
1670 | codecvt_base::result | |
1671 | __codecvt_utf8_utf16_base<char32_t>:: | |
1672 | do_in(state_type&, const extern_type* __from, const extern_type* __from_end, | |
1673 | const extern_type*& __from_next, | |
1674 | intern_type* __to, intern_type* __to_end, | |
1675 | intern_type*& __to_next) const | |
1676 | { | |
1677 | range<const char> from{ __from, __from_end }; | |
1678 | range<char32_t> to{ __to, __to_end }; | |
02e12bda JW |
1679 | codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header)); |
1680 | #if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__ | |
1681 | mode = codecvt_mode(mode | little_endian); | |
1682 | #endif | |
1683 | auto res = utf16_in(from, to, _M_maxcode, mode); | |
28af1fb3 JW |
1684 | __from_next = from.next; |
1685 | __to_next = to.next; | |
1686 | return res; | |
1687 | } | |
1688 | ||
1689 | int | |
1690 | __codecvt_utf8_utf16_base<char32_t>::do_encoding() const throw() | |
516231de | 1691 | { return 0; } // UTF-8 is not a fixed-width encoding |
28af1fb3 JW |
1692 | |
1693 | bool | |
1694 | __codecvt_utf8_utf16_base<char32_t>::do_always_noconv() const throw() | |
1695 | { return false; } | |
1696 | ||
1697 | int | |
1698 | __codecvt_utf8_utf16_base<char32_t>:: | |
1699 | do_length(state_type&, const extern_type* __from, | |
1700 | const extern_type* __end, size_t __max) const | |
1701 | { | |
1702 | __end = utf16_span(__from, __end, __max, _M_maxcode, _M_mode); | |
1703 | return __end - __from; | |
1704 | } | |
1705 | ||
1706 | int | |
1707 | __codecvt_utf8_utf16_base<char32_t>::do_max_length() const throw() | |
1708 | { | |
516231de JW |
1709 | // A single character can be 1 or 2 UTF-16 code units, |
1710 | // requiring up to 4 UTF-8 code units. | |
1711 | int max = 4; | |
1712 | if (_M_mode & consume_header) | |
1713 | max += sizeof(utf8_bom); | |
1714 | return max; | |
28af1fb3 JW |
1715 | } |
1716 | ||
1717 | #ifdef _GLIBCXX_USE_WCHAR_T | |
1718 | // Define members of codecvt_utf8_utf16<wchar_t> base class implementation. | |
1719 | // Converts from UTF-8 to UTF-16. | |
1720 | ||
1721 | __codecvt_utf8_utf16_base<wchar_t>::~__codecvt_utf8_utf16_base() { } | |
1722 | ||
1723 | codecvt_base::result | |
1724 | __codecvt_utf8_utf16_base<wchar_t>:: | |
1725 | do_out(state_type&, const intern_type* __from, const intern_type* __from_end, | |
1726 | const intern_type*& __from_next, | |
1727 | extern_type* __to, extern_type* __to_end, | |
1728 | extern_type*& __to_next) const | |
1729 | { | |
1730 | range<const wchar_t> from{ __from, __from_end }; | |
1731 | range<char> to{ __to, __to_end }; | |
1732 | auto res = utf16_out(from, to, _M_maxcode, _M_mode); | |
1733 | __from_next = from.next; | |
1734 | __to_next = to.next; | |
1735 | return res; | |
1736 | } | |
1737 | ||
1738 | codecvt_base::result | |
1739 | __codecvt_utf8_utf16_base<wchar_t>:: | |
1740 | do_unshift(state_type&, extern_type* __to, extern_type*, | |
1741 | extern_type*& __to_next) const | |
1742 | { | |
1743 | __to_next = __to; | |
1744 | return noconv; | |
1745 | } | |
1746 | ||
1747 | codecvt_base::result | |
1748 | __codecvt_utf8_utf16_base<wchar_t>:: | |
1749 | do_in(state_type&, const extern_type* __from, const extern_type* __from_end, | |
1750 | const extern_type*& __from_next, | |
1751 | intern_type* __to, intern_type* __to_end, | |
1752 | intern_type*& __to_next) const | |
1753 | { | |
1754 | range<const char> from{ __from, __from_end }; | |
1755 | range<wchar_t> to{ __to, __to_end }; | |
02e12bda JW |
1756 | codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header)); |
1757 | #if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__ | |
1758 | mode = codecvt_mode(mode | little_endian); | |
1759 | #endif | |
1760 | auto res = utf16_in(from, to, _M_maxcode, mode); | |
28af1fb3 JW |
1761 | __from_next = from.next; |
1762 | __to_next = to.next; | |
1763 | return res; | |
1764 | } | |
1765 | ||
1766 | int | |
1767 | __codecvt_utf8_utf16_base<wchar_t>::do_encoding() const throw() | |
516231de | 1768 | { return 0; } // UTF-8 is not a fixed-width encoding |
28af1fb3 JW |
1769 | |
1770 | bool | |
1771 | __codecvt_utf8_utf16_base<wchar_t>::do_always_noconv() const throw() | |
1772 | { return false; } | |
1773 | ||
1774 | int | |
1775 | __codecvt_utf8_utf16_base<wchar_t>:: | |
1776 | do_length(state_type&, const extern_type* __from, | |
1777 | const extern_type* __end, size_t __max) const | |
1778 | { | |
1779 | __end = utf16_span(__from, __end, __max, _M_maxcode, _M_mode); | |
1780 | return __end - __from; | |
1781 | } | |
1782 | ||
1783 | int | |
1784 | __codecvt_utf8_utf16_base<wchar_t>::do_max_length() const throw() | |
1785 | { | |
516231de JW |
1786 | // A single character can be 1 or 2 UTF-16 code units, |
1787 | // requiring up to 4 UTF-8 code units. | |
1788 | int max = 4; | |
1789 | if (_M_mode & consume_header) | |
1790 | max += sizeof(utf8_bom); | |
1791 | return max; | |
28af1fb3 JW |
1792 | } |
1793 | #endif | |
1794 | ||
bb93f35d JW |
1795 | inline template class __codecvt_abstract_base<char16_t, char, mbstate_t>; |
1796 | inline template class __codecvt_abstract_base<char32_t, char, mbstate_t>; | |
71a16cd8 JW |
1797 | template class codecvt_byname<char16_t, char, mbstate_t>; |
1798 | template class codecvt_byname<char32_t, char, mbstate_t>; | |
bb93f35d | 1799 | |
c124af93 TH |
1800 | #if defined(_GLIBCXX_USE_CHAR8_T) |
1801 | inline template class __codecvt_abstract_base<char16_t, char8_t, mbstate_t>; | |
1802 | inline template class __codecvt_abstract_base<char32_t, char8_t, mbstate_t>; | |
1803 | template class codecvt_byname<char16_t, char8_t, mbstate_t>; | |
1804 | template class codecvt_byname<char32_t, char8_t, mbstate_t>; | |
1805 | #endif | |
1806 | ||
bb93f35d JW |
1807 | _GLIBCXX_END_NAMESPACE_VERSION |
1808 | } |