]>
Commit | Line | Data |
---|---|---|
bb93f35d JW |
1 | // Locale support (codecvt) -*- C++ -*- |
2 | ||
a5544970 | 3 | // Copyright (C) 2015-2019 Free Software Foundation, Inc. |
bb93f35d JW |
4 | // |
5 | // This file is part of the GNU ISO C++ Library. This library is free | |
6 | // software; you can redistribute it and/or modify it under the | |
7 | // terms of the GNU General Public License as published by the | |
8 | // Free Software Foundation; either version 3, or (at your option) | |
9 | // any later version. | |
10 | ||
11 | // This library is distributed in the hope that it will be useful, | |
12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | // GNU General Public License for more details. | |
15 | ||
16 | // Under Section 7 of GPL version 3, you are granted additional | |
17 | // permissions described in the GCC Runtime Library Exception, version | |
18 | // 3.1, as published by the Free Software Foundation. | |
19 | ||
20 | // You should have received a copy of the GNU General Public License and | |
21 | // a copy of the GCC Runtime Library Exception along with this program; | |
22 | // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see | |
23 | // <http://www.gnu.org/licenses/>. | |
24 | ||
28af1fb3 | 25 | #include <codecvt> |
bb93f35d | 26 | #include <cstring> // std::memcpy, std::memcmp |
bcd682e1 | 27 | #include <bits/stl_algobase.h> // std::min |
bb93f35d | 28 | |
bb93f35d JW |
29 | namespace std _GLIBCXX_VISIBILITY(default) |
30 | { | |
31 | _GLIBCXX_BEGIN_NAMESPACE_VERSION | |
32 | ||
bcd682e1 JW |
33 | // The standard doesn't define these operators, which is annoying. |
// Convert a codecvt_mode value to a value of its underlying integer type,
// so that bitwise arithmetic can be done on it.
static underlying_type<codecvt_mode>::type
to_integer(codecvt_mode m)
{
  using int_type = underlying_type<codecvt_mode>::type;
  return static_cast<int_type>(m);
}
bcd682e1 JW |
37 | |
// Bitwise-AND n into m and return a reference to m.
static codecvt_mode& operator&=(codecvt_mode& m, codecvt_mode n)
{
  using int_type = underlying_type<codecvt_mode>::type;
  const int_type lhs = static_cast<int_type>(m);
  const int_type rhs = static_cast<int_type>(n);
  m = codecvt_mode(lhs & rhs);
  return m;
}
40 | ||
// Bitwise-OR n into m and return a reference to m.
static codecvt_mode& operator|=(codecvt_mode& m, codecvt_mode n)
{
  using int_type = underlying_type<codecvt_mode>::type;
  const int_type lhs = static_cast<int_type>(m);
  const int_type rhs = static_cast<int_type>(n);
  m = codecvt_mode(lhs | rhs);
  return m;
}
43 | ||
// Return the complement of m within the set of defined codecvt_mode flags.
//
// codecvt_mode has no fixed underlying type, so converting an integer that
// lies outside the range of the enumeration's values back to codecvt_mode
// is undefined.  A plain bitwise NOT sets every high bit, so the result is
// masked to the three defined flags before the conversion.  For the
// "mode &= ~flag" idiom the outcome is the same as without the mask.
static codecvt_mode operator~(codecvt_mode m)
{
  using int_type = underlying_type<codecvt_mode>::type;
  const int_type all_flags = static_cast<int_type>(consume_header)
    | static_cast<int_type>(generate_header)
    | static_cast<int_type>(little_endian);
  return codecvt_mode(~static_cast<int_type>(m) & all_flags);
}
46 | ||
bb93f35d JW |
47 | namespace |
48 | { | |
49 | // Largest code point that fits in a single UTF-16 code unit. | |
50 | const char32_t max_single_utf16_unit = 0xFFFF; | |
b6584a72 | 51 | |
bb93f35d JW |
52 | const char32_t max_code_point = 0x10FFFF; |
53 | ||
b6584a72 JW |
54 | // The functions below rely on maxcode < incomplete_mb_character |
55 | // (which is enforced by the codecvt_utf* classes on construction). | |
56 | const char32_t incomplete_mb_character = char32_t(-2); | |
57 | const char32_t invalid_mb_sequence = char32_t(-1); | |
58 | ||
d951e75d JW |
// Cursor over the pointer range [next, end) of code units of type Elem.
// This primary template reads and writes through Elem pointers directly.
template<typename Elem, bool Aligned = true>
  struct range
  {
    Elem* next;  // current position
    Elem* end;   // one past the last code unit

    // Store one code unit at the current position and step past it.
    range& operator=(Elem e)
    {
      *next = e;
      ++next;
      return *this;
    }

    // Peek at the code unit at the current position.
    Elem operator*() const { return next[0]; }

    // Peek at the code unit n positions after the current one.
    Elem operator[](size_t n) const { return *(next + n); }

    // Step past one code unit.
    range& operator++()
    {
      next += 1;
      return *this;
    }

    // Step past n code units.
    range& operator+=(size_t n)
    {
      next = next + n;
      return *this;
    }

    // Count of code units between the current position and the end.
    size_t size() const { return end - next; }

    // Count of bytes between the current position and the end.
    size_t nbytes() const
    {
      return reinterpret_cast<const char*>(end)
        - reinterpret_cast<const char*>(next);
    }
  };
100 | ||
101 | // This specialization is used when accessing char16_t values through | |
102 | // pointers to char, which might not be correctly aligned for char16_t. | |
103 | template<typename Elem> | |
104 | struct range<Elem, false> | |
105 | { | |
106 | using value_type = typename remove_const<Elem>::type; | |
107 | ||
108 | using char_pointer = typename | |
109 | conditional<is_const<Elem>::value, const char*, char*>::type; | |
110 | ||
111 | char_pointer next; | |
112 | char_pointer end; | |
113 | ||
114 | // Write a code unit. | |
115 | range& operator=(Elem e) | |
116 | { | |
117 | memcpy(next, &e, sizeof(Elem)); | |
118 | ++*this; | |
119 | return *this; | |
120 | } | |
121 | ||
122 | // Read the next code unit. | |
123 | Elem operator*() const | |
124 | { | |
125 | value_type e; | |
126 | memcpy(&e, next, sizeof(Elem)); | |
127 | return e; | |
128 | } | |
129 | ||
130 | // Read the Nth code unit. | |
131 | Elem operator[](size_t n) const | |
132 | { | |
133 | value_type e; | |
134 | memcpy(&e, next + n * sizeof(Elem), sizeof(Elem)); | |
135 | return e; | |
136 | } | |
137 | ||
138 | // Move to the next code unit. | |
139 | range& operator++() | |
140 | { | |
141 | next += sizeof(Elem); | |
142 | return *this; | |
143 | } | |
144 | ||
145 | // Move to the Nth code unit. | |
146 | range& operator+=(size_t n) | |
147 | { | |
148 | next += n * sizeof(Elem); | |
149 | return *this; | |
150 | } | |
151 | ||
152 | // The number of code units remaining. | |
153 | size_t size() const { return nbytes() / sizeof(Elem); } | |
154 | ||
155 | // The number of bytes remaining. | |
156 | size_t nbytes() const { return end - next; } | |
bb93f35d JW |
157 | }; |
158 | ||
28af1fb3 JW |
159 | // Multibyte sequences can have "header" consisting of Byte Order Mark |
160 | const unsigned char utf8_bom[3] = { 0xEF, 0xBB, 0xBF }; | |
516231de JW |
161 | const unsigned char utf16_bom[2] = { 0xFE, 0xFF }; |
162 | const unsigned char utf16le_bom[2] = { 0xFF, 0xFE }; | |
28af1fb3 | 163 | |
d951e75d JW |
164 | // Write a BOM (space permitting). |
165 | template<typename C, bool A, size_t N> | |
166 | bool | |
167 | write_bom(range<C, A>& to, const unsigned char (&bom)[N]) | |
28af1fb3 | 168 | { |
d951e75d JW |
169 | static_assert( (N / sizeof(C)) != 0, "" ); |
170 | static_assert( (N % sizeof(C)) == 0, "" ); | |
171 | ||
172 | if (to.nbytes() < N) | |
28af1fb3 JW |
173 | return false; |
174 | memcpy(to.next, bom, N); | |
d951e75d | 175 | to += (N / sizeof(C)); |
28af1fb3 JW |
176 | return true; |
177 | } | |
178 | ||
d951e75d JW |
179 | // Try to read a BOM. |
180 | template<typename C, bool A, size_t N> | |
181 | bool | |
182 | read_bom(range<C, A>& from, const unsigned char (&bom)[N]) | |
183 | { | |
184 | static_assert( (N / sizeof(C)) != 0, "" ); | |
185 | static_assert( (N % sizeof(C)) == 0, "" ); | |
186 | ||
187 | if (from.nbytes() >= N && !memcmp(from.next, bom, N)) | |
188 | { | |
189 | from += (N / sizeof(C)); | |
190 | return true; | |
191 | } | |
192 | return false; | |
193 | } | |
194 | ||
28af1fb3 JW |
195 | // If generate_header is set in mode write out UTF-8 BOM. |
196 | bool | |
197 | write_utf8_bom(range<char>& to, codecvt_mode mode) | |
198 | { | |
199 | if (mode & generate_header) | |
200 | return write_bom(to, utf8_bom); | |
201 | return true; | |
202 | } | |
203 | ||
204 | // If generate_header is set in mode write out the UTF-16 BOM indicated | |
205 | // by whether little_endian is set in mode. | |
d951e75d | 206 | template<bool Aligned> |
28af1fb3 | 207 | bool |
d951e75d | 208 | write_utf16_bom(range<char16_t, Aligned>& to, codecvt_mode mode) |
28af1fb3 JW |
209 | { |
210 | if (mode & generate_header) | |
211 | { | |
d951e75d JW |
212 | if (mode & little_endian) |
213 | return write_bom(to, utf16le_bom); | |
214 | else | |
215 | return write_bom(to, utf16_bom); | |
28af1fb3 JW |
216 | } |
217 | return true; | |
218 | } | |
219 | ||
28af1fb3 JW |
220 | // If consume_header is set in mode update from.next to after any BOM. |
221 | void | |
222 | read_utf8_bom(range<const char>& from, codecvt_mode mode) | |
223 | { | |
224 | if (mode & consume_header) | |
225 | read_bom(from, utf8_bom); | |
226 | } | |
227 | ||
bcd682e1 JW |
228 | // If consume_header is not set in mode, no effects. |
229 | // Otherwise, if *from.next is a UTF-16 BOM increment from.next and then: | |
230 | // - if the UTF-16BE BOM was found unset little_endian in mode, or | |
231 | // - if the UTF-16LE BOM was found set little_endian in mode. | |
d951e75d | 232 | template<bool Aligned> |
bcd682e1 | 233 | void |
d951e75d | 234 | read_utf16_bom(range<const char16_t, Aligned>& from, codecvt_mode& mode) |
28af1fb3 | 235 | { |
d951e75d | 236 | if (mode & consume_header) |
28af1fb3 | 237 | { |
d951e75d JW |
238 | if (read_bom(from, utf16_bom)) |
239 | mode &= ~little_endian; | |
240 | else if (read_bom(from, utf16le_bom)) | |
241 | mode |= little_endian; | |
28af1fb3 | 242 | } |
28af1fb3 JW |
243 | } |
244 | ||
245 | // Read a codepoint from a UTF-8 multibyte sequence. | |
246 | // Updates from.next if the codepoint is not greater than maxcode. | |
b6584a72 | 247 | // Returns invalid_mb_sequence, incomplete_mb_character or the code point. |
bb93f35d JW |
248 | char32_t |
249 | read_utf8_code_point(range<const char>& from, unsigned long maxcode) | |
250 | { | |
b6584a72 | 251 | const size_t avail = from.size(); |
bb93f35d | 252 | if (avail == 0) |
b6584a72 | 253 | return incomplete_mb_character; |
d951e75d | 254 | unsigned char c1 = from[0]; |
bb93f35d JW |
255 | // https://en.wikipedia.org/wiki/UTF-8#Sample_code |
256 | if (c1 < 0x80) | |
257 | { | |
d951e75d | 258 | ++from; |
bb93f35d JW |
259 | return c1; |
260 | } | |
261 | else if (c1 < 0xC2) // continuation or overlong 2-byte sequence | |
b6584a72 | 262 | return invalid_mb_sequence; |
bb93f35d JW |
263 | else if (c1 < 0xE0) // 2-byte sequence |
264 | { | |
265 | if (avail < 2) | |
b6584a72 | 266 | return incomplete_mb_character; |
d951e75d | 267 | unsigned char c2 = from[1]; |
bb93f35d | 268 | if ((c2 & 0xC0) != 0x80) |
b6584a72 | 269 | return invalid_mb_sequence; |
bb93f35d | 270 | char32_t c = (c1 << 6) + c2 - 0x3080; |
28af1fb3 | 271 | if (c <= maxcode) |
d951e75d | 272 | from += 2; |
bb93f35d JW |
273 | return c; |
274 | } | |
275 | else if (c1 < 0xF0) // 3-byte sequence | |
276 | { | |
277 | if (avail < 3) | |
b6584a72 | 278 | return incomplete_mb_character; |
d951e75d | 279 | unsigned char c2 = from[1]; |
bb93f35d | 280 | if ((c2 & 0xC0) != 0x80) |
b6584a72 | 281 | return invalid_mb_sequence; |
bb93f35d | 282 | if (c1 == 0xE0 && c2 < 0xA0) // overlong |
b6584a72 | 283 | return invalid_mb_sequence; |
d951e75d | 284 | unsigned char c3 = from[2]; |
bb93f35d | 285 | if ((c3 & 0xC0) != 0x80) |
b6584a72 | 286 | return invalid_mb_sequence; |
bb93f35d | 287 | char32_t c = (c1 << 12) + (c2 << 6) + c3 - 0xE2080; |
28af1fb3 | 288 | if (c <= maxcode) |
d951e75d | 289 | from += 3; |
bb93f35d JW |
290 | return c; |
291 | } | |
292 | else if (c1 < 0xF5) // 4-byte sequence | |
293 | { | |
294 | if (avail < 4) | |
b6584a72 | 295 | return incomplete_mb_character; |
d951e75d | 296 | unsigned char c2 = from[1]; |
bb93f35d | 297 | if ((c2 & 0xC0) != 0x80) |
b6584a72 | 298 | return invalid_mb_sequence; |
bb93f35d | 299 | if (c1 == 0xF0 && c2 < 0x90) // overlong |
b6584a72 | 300 | return invalid_mb_sequence; |
bb93f35d | 301 | if (c1 == 0xF4 && c2 >= 0x90) // > U+10FFFF |
b6584a72 | 302 | return invalid_mb_sequence; |
d951e75d | 303 | unsigned char c3 = from[2]; |
bb93f35d | 304 | if ((c3 & 0xC0) != 0x80) |
b6584a72 | 305 | return invalid_mb_sequence; |
d951e75d | 306 | unsigned char c4 = from[3]; |
bb93f35d | 307 | if ((c4 & 0xC0) != 0x80) |
b6584a72 | 308 | return invalid_mb_sequence; |
bb93f35d | 309 | char32_t c = (c1 << 18) + (c2 << 12) + (c3 << 6) + c4 - 0x3C82080; |
28af1fb3 | 310 | if (c <= maxcode) |
d951e75d | 311 | from += 4; |
bb93f35d JW |
312 | return c; |
313 | } | |
314 | else // > U+10FFFF | |
b6584a72 | 315 | return invalid_mb_sequence; |
bb93f35d JW |
316 | } |
317 | ||
318 | bool | |
319 | write_utf8_code_point(range<char>& to, char32_t code_point) | |
320 | { | |
321 | if (code_point < 0x80) | |
322 | { | |
323 | if (to.size() < 1) | |
324 | return false; | |
d951e75d | 325 | to = code_point; |
bb93f35d JW |
326 | } |
327 | else if (code_point <= 0x7FF) | |
328 | { | |
329 | if (to.size() < 2) | |
330 | return false; | |
d951e75d JW |
331 | to = (code_point >> 6) + 0xC0; |
332 | to = (code_point & 0x3F) + 0x80; | |
bb93f35d JW |
333 | } |
334 | else if (code_point <= 0xFFFF) | |
335 | { | |
336 | if (to.size() < 3) | |
337 | return false; | |
d951e75d JW |
338 | to = (code_point >> 12) + 0xE0; |
339 | to = ((code_point >> 6) & 0x3F) + 0x80; | |
340 | to = (code_point & 0x3F) + 0x80; | |
bb93f35d JW |
341 | } |
342 | else if (code_point <= 0x10FFFF) | |
343 | { | |
344 | if (to.size() < 4) | |
345 | return false; | |
d951e75d JW |
346 | to = (code_point >> 18) + 0xF0; |
347 | to = ((code_point >> 12) & 0x3F) + 0x80; | |
348 | to = ((code_point >> 6) & 0x3F) + 0x80; | |
349 | to = (code_point & 0x3F) + 0x80; | |
bb93f35d JW |
350 | } |
351 | else | |
352 | return false; | |
353 | return true; | |
354 | } | |
355 | ||
28af1fb3 JW |
// Convert a UTF-16 code unit between the host byte order and the byte
// order selected by the little_endian flag in mode.  The bytes are
// swapped exactly when the two orders differ.
inline char16_t
adjust_byte_order(char16_t c, codecvt_mode mode)
{
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  const bool swap_needed = (mode & little_endian) != 0;
#else
  const bool swap_needed = (mode & little_endian) == 0;
#endif
  if (swap_needed)
    return __builtin_bswap16(c);
  return c;
}
365 | ||
b6584a72 JW |
// True when c lies in the high (leading) surrogate range U+D800..U+DBFF.
inline bool
is_high_surrogate(char32_t c)
{
  // All 1024 values in the range share the same bits above the low ten.
  return (c >> 10) == (0xD800 >> 10);
}
372 | ||
// True when c lies in the low (trailing) surrogate range U+DC00..U+DFFF.
inline bool
is_low_surrogate(char32_t c)
{
  // All 1024 values in the range share the same bits above the low ten.
  return (c >> 10) == (0xDC00 >> 10);
}
379 | ||
// Combine a high and a low surrogate into the code point they encode.
// No validation is done here; the arguments are combined arithmetically.
inline char32_t
surrogate_pair_to_code_point(char32_t high, char32_t low)
{
  // (0xD800 << 10) + 0xDC00 - 0x10000: strips both surrogate tags and
  // adds back the 0x10000 offset of the supplementary planes.
  const char32_t bias = 0x35FDC00;
  const char32_t combined = (high << 10) + low;
  return combined - bias;
}
385 | ||
28af1fb3 JW |
386 | // Read a codepoint from a UTF-16 multibyte sequence. |
387 | // The sequence's endianness is indicated by (mode & little_endian). | |
388 | // Updates from.next if the codepoint is not greater than maxcode. | |
b6584a72 | 389 | // Returns invalid_mb_sequence, incomplete_mb_character or the code point. |
d951e75d JW |
390 | template<bool Aligned> |
391 | char32_t | |
392 | read_utf16_code_point(range<const char16_t, Aligned>& from, | |
393 | unsigned long maxcode, codecvt_mode mode) | |
394 | { | |
395 | const size_t avail = from.size(); | |
396 | if (avail == 0) | |
397 | return incomplete_mb_character; | |
398 | int inc = 1; | |
399 | char32_t c = adjust_byte_order(from[0], mode); | |
400 | if (is_high_surrogate(c)) | |
401 | { | |
402 | if (avail < 2) | |
403 | return incomplete_mb_character; | |
404 | const char16_t c2 = adjust_byte_order(from[1], mode); | |
405 | if (is_low_surrogate(c2)) | |
406 | { | |
407 | c = surrogate_pair_to_code_point(c, c2); | |
408 | inc = 2; | |
409 | } | |
410 | else | |
411 | return invalid_mb_sequence; | |
412 | } | |
413 | else if (is_low_surrogate(c)) | |
414 | return invalid_mb_sequence; | |
415 | if (c <= maxcode) | |
416 | from += inc; | |
417 | return c; | |
418 | } | |
28af1fb3 | 419 | |
d951e75d | 420 | template<typename C, bool A> |
bb93f35d | 421 | bool |
d951e75d | 422 | write_utf16_code_point(range<C, A>& to, char32_t codepoint, codecvt_mode mode) |
bb93f35d | 423 | { |
28af1fb3 JW |
424 | static_assert(sizeof(C) >= 2, "a code unit must be at least 16-bit"); |
425 | ||
02e12bda | 426 | if (codepoint <= max_single_utf16_unit) |
bb93f35d JW |
427 | { |
428 | if (to.size() > 0) | |
429 | { | |
d951e75d | 430 | to = adjust_byte_order(codepoint, mode); |
bb93f35d JW |
431 | return true; |
432 | } | |
433 | } | |
434 | else if (to.size() > 1) | |
435 | { | |
436 | // Algorithm from http://www.unicode.org/faq/utf_bom.html#utf16-4 | |
437 | const char32_t LEAD_OFFSET = 0xD800 - (0x10000 >> 10); | |
bb93f35d JW |
438 | char16_t lead = LEAD_OFFSET + (codepoint >> 10); |
439 | char16_t trail = 0xDC00 + (codepoint & 0x3FF); | |
d951e75d JW |
440 | to = adjust_byte_order(lead, mode); |
441 | to = adjust_byte_order(trail, mode); | |
bb93f35d JW |
442 | return true; |
443 | } | |
444 | return false; | |
445 | } | |
446 | ||
447 | // utf8 -> ucs4 | |
448 | codecvt_base::result | |
449 | ucs4_in(range<const char>& from, range<char32_t>& to, | |
28af1fb3 | 450 | unsigned long maxcode = max_code_point, codecvt_mode mode = {}) |
bb93f35d | 451 | { |
28af1fb3 | 452 | read_utf8_bom(from, mode); |
bb93f35d JW |
453 | while (from.size() && to.size()) |
454 | { | |
455 | const char32_t codepoint = read_utf8_code_point(from, maxcode); | |
b6584a72 JW |
456 | if (codepoint == incomplete_mb_character) |
457 | return codecvt_base::partial; | |
28af1fb3 | 458 | if (codepoint > maxcode) |
bb93f35d | 459 | return codecvt_base::error; |
d951e75d | 460 | to = codepoint; |
bb93f35d JW |
461 | } |
462 | return from.size() ? codecvt_base::partial : codecvt_base::ok; | |
463 | } | |
464 | ||
465 | // ucs4 -> utf8 | |
466 | codecvt_base::result | |
467 | ucs4_out(range<const char32_t>& from, range<char>& to, | |
28af1fb3 | 468 | unsigned long maxcode = max_code_point, codecvt_mode mode = {}) |
bb93f35d | 469 | { |
28af1fb3 JW |
470 | if (!write_utf8_bom(to, mode)) |
471 | return codecvt_base::partial; | |
bb93f35d JW |
472 | while (from.size()) |
473 | { | |
d951e75d | 474 | const char32_t c = from[0]; |
bb93f35d JW |
475 | if (c > maxcode) |
476 | return codecvt_base::error; | |
477 | if (!write_utf8_code_point(to, c)) | |
478 | return codecvt_base::partial; | |
d951e75d | 479 | ++from; |
bb93f35d JW |
480 | } |
481 | return codecvt_base::ok; | |
482 | } | |
483 | ||
28af1fb3 JW |
484 | // utf16 -> ucs4 |
485 | codecvt_base::result | |
d951e75d | 486 | ucs4_in(range<const char16_t, false>& from, range<char32_t>& to, |
28af1fb3 JW |
487 | unsigned long maxcode = max_code_point, codecvt_mode mode = {}) |
488 | { | |
bcd682e1 | 489 | read_utf16_bom(from, mode); |
28af1fb3 JW |
490 | while (from.size() && to.size()) |
491 | { | |
492 | const char32_t codepoint = read_utf16_code_point(from, maxcode, mode); | |
b6584a72 JW |
493 | if (codepoint == incomplete_mb_character) |
494 | return codecvt_base::partial; | |
28af1fb3 JW |
495 | if (codepoint > maxcode) |
496 | return codecvt_base::error; | |
d951e75d | 497 | to = codepoint; |
28af1fb3 JW |
498 | } |
499 | return from.size() ? codecvt_base::partial : codecvt_base::ok; | |
500 | } | |
501 | ||
502 | // ucs4 -> utf16 | |
503 | codecvt_base::result | |
d951e75d | 504 | ucs4_out(range<const char32_t>& from, range<char16_t, false>& to, |
28af1fb3 JW |
505 | unsigned long maxcode = max_code_point, codecvt_mode mode = {}) |
506 | { | |
507 | if (!write_utf16_bom(to, mode)) | |
508 | return codecvt_base::partial; | |
509 | while (from.size()) | |
510 | { | |
d951e75d | 511 | const char32_t c = from[0]; |
28af1fb3 JW |
512 | if (c > maxcode) |
513 | return codecvt_base::error; | |
514 | if (!write_utf16_code_point(to, c, mode)) | |
515 | return codecvt_base::partial; | |
d951e75d | 516 | ++from; |
28af1fb3 JW |
517 | } |
518 | return codecvt_base::ok; | |
519 | } | |
520 | ||
bcd682e1 JW |
// Selects between UTF-16 processing (surrogate pairs allowed) and UCS-2
// processing (surrogates disallowed) in the conversion helpers.
enum class surrogates
{
  allowed,
  disallowed
};
523 | ||
524 | // utf8 -> utf16 (or utf8 -> ucs2 if s == surrogates::disallowed) | |
28af1fb3 | 525 | template<typename C> |
bb93f35d | 526 | codecvt_base::result |
28af1fb3 | 527 | utf16_in(range<const char>& from, range<C>& to, |
bcd682e1 JW |
528 | unsigned long maxcode = max_code_point, codecvt_mode mode = {}, |
529 | surrogates s = surrogates::allowed) | |
bb93f35d | 530 | { |
28af1fb3 | 531 | read_utf8_bom(from, mode); |
bb93f35d JW |
532 | while (from.size() && to.size()) |
533 | { | |
d951e75d | 534 | auto orig = from; |
bb93f35d | 535 | const char32_t codepoint = read_utf8_code_point(from, maxcode); |
b6584a72 | 536 | if (codepoint == incomplete_mb_character) |
bcd682e1 JW |
537 | { |
538 | if (s == surrogates::allowed) | |
539 | return codecvt_base::partial; | |
540 | else | |
541 | return codecvt_base::error; // No surrogates in UCS2 | |
542 | } | |
28af1fb3 | 543 | if (codepoint > maxcode) |
bb93f35d | 544 | return codecvt_base::error; |
7f971f18 | 545 | if (!write_utf16_code_point(to, codepoint, mode)) |
bb93f35d | 546 | { |
d951e75d | 547 | from = orig; // rewind to previous position |
bb93f35d JW |
548 | return codecvt_base::partial; |
549 | } | |
550 | } | |
551 | return codecvt_base::ok; | |
552 | } | |
553 | ||
bcd682e1 | 554 | // utf16 -> utf8 (or ucs2 -> utf8 if s == surrogates::disallowed) |
28af1fb3 | 555 | template<typename C> |
bb93f35d | 556 | codecvt_base::result |
28af1fb3 | 557 | utf16_out(range<const C>& from, range<char>& to, |
bcd682e1 JW |
558 | unsigned long maxcode = max_code_point, codecvt_mode mode = {}, |
559 | surrogates s = surrogates::allowed) | |
bb93f35d | 560 | { |
28af1fb3 JW |
561 | if (!write_utf8_bom(to, mode)) |
562 | return codecvt_base::partial; | |
bb93f35d JW |
563 | while (from.size()) |
564 | { | |
d951e75d | 565 | char32_t c = from[0]; |
bb93f35d | 566 | int inc = 1; |
b6584a72 | 567 | if (is_high_surrogate(c)) |
bb93f35d | 568 | { |
bcd682e1 JW |
569 | if (s == surrogates::disallowed) |
570 | return codecvt_base::error; // No surrogates in UCS-2 | |
571 | ||
bb93f35d JW |
572 | if (from.size() < 2) |
573 | return codecvt_base::ok; // stop converting at this point | |
574 | ||
d951e75d | 575 | const char32_t c2 = from[1]; |
b6584a72 | 576 | if (is_low_surrogate(c2)) |
bb93f35d | 577 | { |
b6584a72 | 578 | c = surrogate_pair_to_code_point(c, c2); |
bb93f35d | 579 | inc = 2; |
bb93f35d JW |
580 | } |
581 | else | |
582 | return codecvt_base::error; | |
583 | } | |
b6584a72 JW |
584 | else if (is_low_surrogate(c)) |
585 | return codecvt_base::error; | |
bb93f35d JW |
586 | if (c > maxcode) |
587 | return codecvt_base::error; | |
588 | if (!write_utf8_code_point(to, c)) | |
589 | return codecvt_base::partial; | |
d951e75d | 590 | from += inc; |
bb93f35d JW |
591 | } |
592 | return codecvt_base::ok; | |
593 | } | |
594 | ||
595 | // return pos such that [begin,pos) is valid UTF-16 string no longer than max | |
28af1fb3 JW |
596 | const char* |
597 | utf16_span(const char* begin, const char* end, size_t max, | |
598 | char32_t maxcode = max_code_point, codecvt_mode mode = {}) | |
bb93f35d JW |
599 | { |
600 | range<const char> from{ begin, end }; | |
28af1fb3 | 601 | read_utf8_bom(from, mode); |
bb93f35d JW |
602 | size_t count = 0; |
603 | while (count+1 < max) | |
604 | { | |
605 | char32_t c = read_utf8_code_point(from, maxcode); | |
b6584a72 JW |
606 | if (c > maxcode) |
607 | return from.next; | |
bb93f35d JW |
608 | else if (c > max_single_utf16_unit) |
609 | ++count; | |
610 | ++count; | |
611 | } | |
612 | if (count+1 == max) // take one more character if it fits in a single unit | |
bcd682e1 | 613 | read_utf8_code_point(from, std::min(max_single_utf16_unit, maxcode)); |
28af1fb3 | 614 | return from.next; |
bb93f35d JW |
615 | } |
616 | ||
28af1fb3 JW |
617 | // utf8 -> ucs2 |
618 | codecvt_base::result | |
619 | ucs2_in(range<const char>& from, range<char16_t>& to, | |
620 | char32_t maxcode = max_code_point, codecvt_mode mode = {}) | |
bb93f35d | 621 | { |
bcd682e1 JW |
622 | // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit: |
623 | maxcode = std::min(max_single_utf16_unit, maxcode); | |
624 | return utf16_in(from, to, maxcode, mode, surrogates::disallowed); | |
28af1fb3 JW |
625 | } |
626 | ||
627 | // ucs2 -> utf8 | |
628 | codecvt_base::result | |
629 | ucs2_out(range<const char16_t>& from, range<char>& to, | |
630 | char32_t maxcode = max_code_point, codecvt_mode mode = {}) | |
631 | { | |
bcd682e1 JW |
632 | // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit: |
633 | maxcode = std::min(max_single_utf16_unit, maxcode); | |
634 | return utf16_out(from, to, maxcode, mode, surrogates::disallowed); | |
28af1fb3 JW |
635 | } |
636 | ||
637 | // ucs2 -> utf16 | |
638 | codecvt_base::result | |
d951e75d | 639 | ucs2_out(range<const char16_t>& from, range<char16_t, false>& to, |
28af1fb3 JW |
640 | char32_t maxcode = max_code_point, codecvt_mode mode = {}) |
641 | { | |
642 | if (!write_utf16_bom(to, mode)) | |
643 | return codecvt_base::partial; | |
644 | while (from.size() && to.size()) | |
bb93f35d | 645 | { |
d951e75d | 646 | char16_t c = from[0]; |
b6584a72 | 647 | if (is_high_surrogate(c)) |
28af1fb3 JW |
648 | return codecvt_base::error; |
649 | if (c > maxcode) | |
650 | return codecvt_base::error; | |
d951e75d JW |
651 | to = adjust_byte_order(c, mode); |
652 | ++from; | |
28af1fb3 JW |
653 | } |
654 | return from.size() == 0 ? codecvt_base::ok : codecvt_base::partial; | |
655 | } | |
656 | ||
657 | // utf16 -> ucs2 | |
658 | codecvt_base::result | |
d951e75d | 659 | ucs2_in(range<const char16_t, false>& from, range<char16_t>& to, |
28af1fb3 JW |
660 | char32_t maxcode = max_code_point, codecvt_mode mode = {}) |
661 | { | |
bcd682e1 JW |
662 | read_utf16_bom(from, mode); |
663 | // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit: | |
664 | maxcode = std::min(max_single_utf16_unit, maxcode); | |
28af1fb3 JW |
665 | while (from.size() && to.size()) |
666 | { | |
667 | const char32_t c = read_utf16_code_point(from, maxcode, mode); | |
b6584a72 | 668 | if (c == incomplete_mb_character) |
bcd682e1 | 669 | return codecvt_base::error; // UCS-2 only supports single units. |
b6584a72 | 670 | if (c > maxcode) |
28af1fb3 | 671 | return codecvt_base::error; |
d951e75d | 672 | to = c; |
bb93f35d | 673 | } |
28af1fb3 JW |
674 | return from.size() == 0 ? codecvt_base::ok : codecvt_base::partial; |
675 | } | |
676 | ||
677 | const char16_t* | |
d951e75d | 678 | ucs2_span(range<const char16_t, false>& from, size_t max, |
28af1fb3 JW |
679 | char32_t maxcode, codecvt_mode mode) |
680 | { | |
bcd682e1 JW |
681 | read_utf16_bom(from, mode); |
682 | // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit: | |
683 | maxcode = std::min(max_single_utf16_unit, maxcode); | |
28af1fb3 JW |
684 | char32_t c = 0; |
685 | while (max-- && c <= maxcode) | |
686 | c = read_utf16_code_point(from, maxcode, mode); | |
d951e75d | 687 | return reinterpret_cast<const char16_t*>(from.next); |
28af1fb3 JW |
688 | } |
689 | ||
690 | const char* | |
691 | ucs2_span(const char* begin, const char* end, size_t max, | |
692 | char32_t maxcode, codecvt_mode mode) | |
693 | { | |
694 | range<const char> from{ begin, end }; | |
695 | read_utf8_bom(from, mode); | |
bcd682e1 JW |
696 | // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit: |
697 | maxcode = std::min(max_single_utf16_unit, maxcode); | |
28af1fb3 JW |
698 | char32_t c = 0; |
699 | while (max-- && c <= maxcode) | |
700 | c = read_utf8_code_point(from, maxcode); | |
701 | return from.next; | |
702 | } | |
703 | ||
704 | // return pos such that [begin,pos) is valid UCS-4 string no longer than max | |
705 | const char* | |
706 | ucs4_span(const char* begin, const char* end, size_t max, | |
707 | char32_t maxcode = max_code_point, codecvt_mode mode = {}) | |
708 | { | |
709 | range<const char> from{ begin, end }; | |
710 | read_utf8_bom(from, mode); | |
711 | char32_t c = 0; | |
712 | while (max-- && c <= maxcode) | |
713 | c = read_utf8_code_point(from, maxcode); | |
714 | return from.next; | |
715 | } | |
716 | ||
717 | // return pos such that [begin,pos) is valid UCS-4 string no longer than max | |
718 | const char16_t* | |
d951e75d | 719 | ucs4_span(range<const char16_t, false>& from, size_t max, |
28af1fb3 JW |
720 | char32_t maxcode = max_code_point, codecvt_mode mode = {}) |
721 | { | |
bcd682e1 | 722 | read_utf16_bom(from, mode); |
28af1fb3 JW |
723 | char32_t c = 0; |
724 | while (max-- && c <= maxcode) | |
725 | c = read_utf16_code_point(from, maxcode, mode); | |
d951e75d | 726 | return reinterpret_cast<const char16_t*>(from.next); |
bb93f35d JW |
727 | } |
728 | } | |
729 | ||
730 | // Define members of codecvt<char16_t, char, mbstate_t> specialization. | |
731 | // Converts from UTF-8 to UTF-16. | |
732 | ||
733 | locale::id codecvt<char16_t, char, mbstate_t>::id; | |
734 | ||
735 | codecvt<char16_t, char, mbstate_t>::~codecvt() { } | |
736 | ||
737 | codecvt_base::result | |
738 | codecvt<char16_t, char, mbstate_t>:: | |
739 | do_out(state_type&, | |
740 | const intern_type* __from, | |
741 | const intern_type* __from_end, const intern_type*& __from_next, | |
742 | extern_type* __to, extern_type* __to_end, | |
743 | extern_type*& __to_next) const | |
744 | { | |
745 | range<const char16_t> from{ __from, __from_end }; | |
746 | range<char> to{ __to, __to_end }; | |
747 | auto res = utf16_out(from, to); | |
748 | __from_next = from.next; | |
749 | __to_next = to.next; | |
750 | return res; | |
751 | } | |
752 | ||
753 | codecvt_base::result | |
754 | codecvt<char16_t, char, mbstate_t>:: | |
755 | do_unshift(state_type&, extern_type* __to, extern_type*, | |
756 | extern_type*& __to_next) const | |
757 | { | |
758 | __to_next = __to; | |
759 | return noconv; // we don't use mbstate_t for the unicode facets | |
760 | } | |
761 | ||
762 | codecvt_base::result | |
763 | codecvt<char16_t, char, mbstate_t>:: | |
764 | do_in(state_type&, const extern_type* __from, const extern_type* __from_end, | |
765 | const extern_type*& __from_next, | |
766 | intern_type* __to, intern_type* __to_end, | |
767 | intern_type*& __to_next) const | |
768 | { | |
769 | range<const char> from{ __from, __from_end }; | |
770 | range<char16_t> to{ __to, __to_end }; | |
7f971f18 JW |
771 | #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ |
772 | codecvt_mode mode = {}; | |
773 | #else | |
774 | codecvt_mode mode = little_endian; | |
775 | #endif | |
776 | auto res = utf16_in(from, to, max_code_point, mode); | |
bb93f35d JW |
777 | __from_next = from.next; |
778 | __to_next = to.next; | |
779 | return res; | |
780 | } | |
781 | ||
782 | int | |
783 | codecvt<char16_t, char, mbstate_t>::do_encoding() const throw() | |
516231de | 784 | { return 0; } // UTF-8 is not a fixed-width encoding |
bb93f35d JW |
785 | |
786 | bool | |
787 | codecvt<char16_t, char, mbstate_t>::do_always_noconv() const throw() | |
788 | { return false; } | |
789 | ||
790 | int | |
791 | codecvt<char16_t, char, mbstate_t>:: | |
792 | do_length(state_type&, const extern_type* __from, | |
793 | const extern_type* __end, size_t __max) const | |
794 | { | |
28af1fb3 JW |
795 | __end = utf16_span(__from, __end, __max); |
796 | return __end - __from; | |
bb93f35d JW |
797 | } |
798 | ||
799 | int | |
800 | codecvt<char16_t, char, mbstate_t>::do_max_length() const throw() | |
801 | { | |
516231de JW |
802 | // A single character (one or two UTF-16 code units) requires |
803 | // up to four UTF-8 code units. | |
804 | return 4; | |
bb93f35d JW |
805 | } |
806 | ||
807 | // Define members of codecvt<char32_t, char, mbstate_t> specialization. | |
808 | // Converts from UTF-8 to UTF-32 (aka UCS-4). | |
809 | ||
810 | locale::id codecvt<char32_t, char, mbstate_t>::id; | |
811 | ||
812 | codecvt<char32_t, char, mbstate_t>::~codecvt() { } | |
813 | ||
814 | codecvt_base::result | |
815 | codecvt<char32_t, char, mbstate_t>:: | |
816 | do_out(state_type&, const intern_type* __from, const intern_type* __from_end, | |
817 | const intern_type*& __from_next, | |
818 | extern_type* __to, extern_type* __to_end, | |
819 | extern_type*& __to_next) const | |
820 | { | |
821 | range<const char32_t> from{ __from, __from_end }; | |
822 | range<char> to{ __to, __to_end }; | |
823 | auto res = ucs4_out(from, to); | |
824 | __from_next = from.next; | |
825 | __to_next = to.next; | |
826 | return res; | |
827 | } | |
828 | ||
829 | codecvt_base::result | |
830 | codecvt<char32_t, char, mbstate_t>:: | |
831 | do_unshift(state_type&, extern_type* __to, extern_type*, | |
832 | extern_type*& __to_next) const | |
833 | { | |
834 | __to_next = __to; | |
835 | return noconv; | |
836 | } | |
837 | ||
838 | codecvt_base::result | |
839 | codecvt<char32_t, char, mbstate_t>:: | |
840 | do_in(state_type&, const extern_type* __from, const extern_type* __from_end, | |
841 | const extern_type*& __from_next, | |
842 | intern_type* __to, intern_type* __to_end, | |
843 | intern_type*& __to_next) const | |
844 | { | |
845 | range<const char> from{ __from, __from_end }; | |
846 | range<char32_t> to{ __to, __to_end }; | |
847 | auto res = ucs4_in(from, to); | |
848 | __from_next = from.next; | |
849 | __to_next = to.next; | |
850 | return res; | |
851 | } | |
852 | ||
853 | int | |
854 | codecvt<char32_t, char, mbstate_t>::do_encoding() const throw() | |
516231de | 855 | { return 0; } // UTF-8 is not a fixed-width encoding |
bb93f35d JW |
856 | |
857 | bool | |
858 | codecvt<char32_t, char, mbstate_t>::do_always_noconv() const throw() | |
859 | { return false; } | |
860 | ||
861 | int | |
862 | codecvt<char32_t, char, mbstate_t>:: | |
863 | do_length(state_type&, const extern_type* __from, | |
864 | const extern_type* __end, size_t __max) const | |
865 | { | |
28af1fb3 JW |
866 | __end = ucs4_span(__from, __end, __max); |
867 | return __end - __from; | |
bb93f35d JW |
868 | } |
869 | ||
870 | int | |
871 | codecvt<char32_t, char, mbstate_t>::do_max_length() const throw() | |
516231de JW |
872 | { |
873 | // A single character (one UTF-32 code unit) requires | |
874 | // up to 4 UTF-8 code units. | |
875 | return 4; | |
876 | } | |
bb93f35d | 877 | |
28af1fb3 JW |
878 | // Define members of codecvt_utf8<char16_t> base class implementation. |
879 | // Converts from UTF-8 to UCS-2. | |
880 | ||
881 | __codecvt_utf8_base<char16_t>::~__codecvt_utf8_base() { } | |
882 | ||
883 | codecvt_base::result | |
884 | __codecvt_utf8_base<char16_t>:: | |
885 | do_out(state_type&, const intern_type* __from, const intern_type* __from_end, | |
886 | const intern_type*& __from_next, | |
887 | extern_type* __to, extern_type* __to_end, | |
888 | extern_type*& __to_next) const | |
889 | { | |
890 | range<const char16_t> from{ __from, __from_end }; | |
891 | range<char> to{ __to, __to_end }; | |
892 | auto res = ucs2_out(from, to, _M_maxcode, _M_mode); | |
893 | __from_next = from.next; | |
894 | __to_next = to.next; | |
895 | return res; | |
896 | } | |
897 | ||
898 | codecvt_base::result | |
899 | __codecvt_utf8_base<char16_t>:: | |
900 | do_unshift(state_type&, extern_type* __to, extern_type*, | |
901 | extern_type*& __to_next) const | |
902 | { | |
903 | __to_next = __to; | |
904 | return noconv; | |
905 | } | |
906 | ||
907 | codecvt_base::result | |
908 | __codecvt_utf8_base<char16_t>:: | |
909 | do_in(state_type&, const extern_type* __from, const extern_type* __from_end, | |
910 | const extern_type*& __from_next, | |
911 | intern_type* __to, intern_type* __to_end, | |
912 | intern_type*& __to_next) const | |
913 | { | |
914 | range<const char> from{ __from, __from_end }; | |
915 | range<char16_t> to{ __to, __to_end }; | |
a1e1ec76 | 916 | codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header)); |
29ca91f7 JW |
917 | #if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__ |
918 | mode = codecvt_mode(mode | little_endian); | |
919 | #endif | |
920 | auto res = ucs2_in(from, to, _M_maxcode, mode); | |
28af1fb3 JW |
921 | __from_next = from.next; |
922 | __to_next = to.next; | |
923 | return res; | |
924 | } | |
925 | ||
926 | int | |
927 | __codecvt_utf8_base<char16_t>::do_encoding() const throw() | |
516231de | 928 | { return 0; } // UTF-8 is not a fixed-width encoding |
28af1fb3 JW |
929 | |
930 | bool | |
931 | __codecvt_utf8_base<char16_t>::do_always_noconv() const throw() | |
932 | { return false; } | |
933 | ||
934 | int | |
935 | __codecvt_utf8_base<char16_t>:: | |
936 | do_length(state_type&, const extern_type* __from, | |
937 | const extern_type* __end, size_t __max) const | |
938 | { | |
939 | __end = ucs2_span(__from, __end, __max, _M_maxcode, _M_mode); | |
940 | return __end - __from; | |
941 | } | |
942 | ||
943 | int | |
944 | __codecvt_utf8_base<char16_t>::do_max_length() const throw() | |
516231de JW |
945 | { |
946 | // A single UCS-2 character requires up to three UTF-8 code units. | |
947 | // (UCS-2 cannot represent characters that use four UTF-8 code units). | |
948 | int max = 3; | |
949 | if (_M_mode & consume_header) | |
950 | max += sizeof(utf8_bom); | |
951 | return max; | |
952 | } | |
28af1fb3 JW |
953 | |
954 | // Define members of codecvt_utf8<char32_t> base class implementation. | |
955 | // Converts from UTF-8 to UTF-32 (aka UCS-4). | |
956 | ||
957 | __codecvt_utf8_base<char32_t>::~__codecvt_utf8_base() { } | |
958 | ||
959 | codecvt_base::result | |
960 | __codecvt_utf8_base<char32_t>:: | |
961 | do_out(state_type&, const intern_type* __from, const intern_type* __from_end, | |
962 | const intern_type*& __from_next, | |
963 | extern_type* __to, extern_type* __to_end, | |
964 | extern_type*& __to_next) const | |
965 | { | |
966 | range<const char32_t> from{ __from, __from_end }; | |
967 | range<char> to{ __to, __to_end }; | |
968 | auto res = ucs4_out(from, to, _M_maxcode, _M_mode); | |
969 | __from_next = from.next; | |
970 | __to_next = to.next; | |
971 | return res; | |
972 | } | |
973 | ||
974 | codecvt_base::result | |
975 | __codecvt_utf8_base<char32_t>:: | |
976 | do_unshift(state_type&, extern_type* __to, extern_type*, | |
977 | extern_type*& __to_next) const | |
978 | { | |
979 | __to_next = __to; | |
980 | return noconv; | |
981 | } | |
982 | ||
983 | codecvt_base::result | |
984 | __codecvt_utf8_base<char32_t>:: | |
985 | do_in(state_type&, const extern_type* __from, const extern_type* __from_end, | |
986 | const extern_type*& __from_next, | |
987 | intern_type* __to, intern_type* __to_end, | |
988 | intern_type*& __to_next) const | |
989 | { | |
990 | range<const char> from{ __from, __from_end }; | |
991 | range<char32_t> to{ __to, __to_end }; | |
992 | auto res = ucs4_in(from, to, _M_maxcode, _M_mode); | |
993 | __from_next = from.next; | |
994 | __to_next = to.next; | |
995 | return res; | |
996 | } | |
997 | ||
998 | int | |
999 | __codecvt_utf8_base<char32_t>::do_encoding() const throw() | |
516231de | 1000 | { return 0; } // UTF-8 is not a fixed-width encoding |
28af1fb3 JW |
1001 | |
1002 | bool | |
1003 | __codecvt_utf8_base<char32_t>::do_always_noconv() const throw() | |
1004 | { return false; } | |
1005 | ||
1006 | int | |
1007 | __codecvt_utf8_base<char32_t>:: | |
1008 | do_length(state_type&, const extern_type* __from, | |
1009 | const extern_type* __end, size_t __max) const | |
1010 | { | |
1011 | __end = ucs4_span(__from, __end, __max, _M_maxcode, _M_mode); | |
1012 | return __end - __from; | |
1013 | } | |
1014 | ||
1015 | int | |
1016 | __codecvt_utf8_base<char32_t>::do_max_length() const throw() | |
516231de JW |
1017 | { |
1018 | // A single UCS-4 character requires up to four UTF-8 code units. | |
1019 | int max = 4; | |
1020 | if (_M_mode & consume_header) | |
1021 | max += sizeof(utf8_bom); | |
1022 | return max; | |
1023 | } | |
28af1fb3 JW |
1024 | |
1025 | #ifdef _GLIBCXX_USE_WCHAR_T | |
d951e75d JW |
1026 | |
// The wchar_t facets below reinterpret wchar_t buffers as char16_t or
// char32_t buffers, so check that the element sizes really match.
#if __SIZEOF_WCHAR_T__ == 2
static_assert(sizeof(wchar_t) == sizeof(char16_t), "");
#elif __SIZEOF_WCHAR_T__ == 4
static_assert(sizeof(wchar_t) == sizeof(char32_t), "");
#endif
1032 | ||
28af1fb3 JW |
1033 | // Define members of codecvt_utf8<wchar_t> base class implementation. |
1034 | // Converts from UTF-8 to UCS-2 or UCS-4 depending on sizeof(wchar_t). | |
1035 | ||
1036 | __codecvt_utf8_base<wchar_t>::~__codecvt_utf8_base() { } | |
1037 | ||
1038 | codecvt_base::result | |
1039 | __codecvt_utf8_base<wchar_t>:: | |
1040 | do_out(state_type&, const intern_type* __from, const intern_type* __from_end, | |
1041 | const intern_type*& __from_next, | |
1042 | extern_type* __to, extern_type* __to_end, | |
1043 | extern_type*& __to_next) const | |
1044 | { | |
1045 | range<char> to{ __to, __to_end }; | |
1046 | #if __SIZEOF_WCHAR_T__ == 2 | |
1047 | range<const char16_t> from{ | |
1048 | reinterpret_cast<const char16_t*>(__from), | |
1049 | reinterpret_cast<const char16_t*>(__from_end) | |
1050 | }; | |
1051 | auto res = ucs2_out(from, to, _M_maxcode, _M_mode); | |
1052 | #elif __SIZEOF_WCHAR_T__ == 4 | |
1053 | range<const char32_t> from{ | |
1054 | reinterpret_cast<const char32_t*>(__from), | |
1055 | reinterpret_cast<const char32_t*>(__from_end) | |
1056 | }; | |
1057 | auto res = ucs4_out(from, to, _M_maxcode, _M_mode); | |
1058 | #else | |
1059 | return codecvt_base::error; | |
1060 | #endif | |
1061 | __from_next = reinterpret_cast<const wchar_t*>(from.next); | |
1062 | __to_next = to.next; | |
1063 | return res; | |
1064 | } | |
1065 | ||
1066 | codecvt_base::result | |
1067 | __codecvt_utf8_base<wchar_t>:: | |
1068 | do_unshift(state_type&, extern_type* __to, extern_type*, | |
1069 | extern_type*& __to_next) const | |
1070 | { | |
1071 | __to_next = __to; | |
1072 | return noconv; | |
1073 | } | |
1074 | ||
1075 | codecvt_base::result | |
1076 | __codecvt_utf8_base<wchar_t>:: | |
1077 | do_in(state_type&, const extern_type* __from, const extern_type* __from_end, | |
1078 | const extern_type*& __from_next, | |
1079 | intern_type* __to, intern_type* __to_end, | |
1080 | intern_type*& __to_next) const | |
1081 | { | |
1082 | range<const char> from{ __from, __from_end }; | |
1083 | #if __SIZEOF_WCHAR_T__ == 2 | |
1084 | range<char16_t> to{ | |
1085 | reinterpret_cast<char16_t*>(__to), | |
1086 | reinterpret_cast<char16_t*>(__to_end) | |
1087 | }; | |
a76400f4 JW |
1088 | #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ |
1089 | codecvt_mode mode = {}; | |
1090 | #else | |
1091 | codecvt_mode mode = little_endian; | |
1092 | #endif | |
1093 | auto res = ucs2_in(from, to, _M_maxcode, mode); | |
28af1fb3 JW |
1094 | #elif __SIZEOF_WCHAR_T__ == 4 |
1095 | range<char32_t> to{ | |
1096 | reinterpret_cast<char32_t*>(__to), | |
1097 | reinterpret_cast<char32_t*>(__to_end) | |
1098 | }; | |
1099 | auto res = ucs4_in(from, to, _M_maxcode, _M_mode); | |
1100 | #else | |
1101 | return codecvt_base::error; | |
1102 | #endif | |
1103 | __from_next = from.next; | |
1104 | __to_next = reinterpret_cast<wchar_t*>(to.next); | |
1105 | return res; | |
1106 | } | |
1107 | ||
1108 | int | |
1109 | __codecvt_utf8_base<wchar_t>::do_encoding() const throw() | |
516231de | 1110 | { return 0; } // UTF-8 is not a fixed-width encoding |
28af1fb3 JW |
1111 | |
1112 | bool | |
1113 | __codecvt_utf8_base<wchar_t>::do_always_noconv() const throw() | |
1114 | { return false; } | |
1115 | ||
1116 | int | |
1117 | __codecvt_utf8_base<wchar_t>:: | |
1118 | do_length(state_type&, const extern_type* __from, | |
1119 | const extern_type* __end, size_t __max) const | |
1120 | { | |
1121 | #if __SIZEOF_WCHAR_T__ == 2 | |
1122 | __end = ucs2_span(__from, __end, __max, _M_maxcode, _M_mode); | |
1123 | #elif __SIZEOF_WCHAR_T__ == 4 | |
1124 | __end = ucs4_span(__from, __end, __max, _M_maxcode, _M_mode); | |
1125 | #else | |
1126 | __end = __from; | |
1127 | #endif | |
1128 | return __end - __from; | |
1129 | } | |
1130 | ||
1131 | int | |
1132 | __codecvt_utf8_base<wchar_t>::do_max_length() const throw() | |
516231de JW |
1133 | { |
1134 | #if __SIZEOF_WCHAR_T__ == 2 | |
1135 | int max = 3; // See __codecvt_utf8_base<char16_t>::do_max_length() | |
1136 | #else | |
1137 | int max = 4; // See __codecvt_utf8_base<char32_t>::do_max_length() | |
1138 | #endif | |
1139 | if (_M_mode & consume_header) | |
1140 | max += sizeof(utf8_bom); | |
1141 | return max; | |
1142 | } | |
28af1fb3 JW |
1143 | #endif |
1144 | ||
1145 | // Define members of codecvt_utf16<char16_t> base class implementation. | |
1146 | // Converts from UTF-16 to UCS-2. | |
1147 | ||
1148 | __codecvt_utf16_base<char16_t>::~__codecvt_utf16_base() { } | |
1149 | ||
1150 | codecvt_base::result | |
1151 | __codecvt_utf16_base<char16_t>:: | |
1152 | do_out(state_type&, const intern_type* __from, const intern_type* __from_end, | |
1153 | const intern_type*& __from_next, | |
1154 | extern_type* __to, extern_type* __to_end, | |
1155 | extern_type*& __to_next) const | |
1156 | { | |
1157 | range<const char16_t> from{ __from, __from_end }; | |
d951e75d | 1158 | range<char16_t, false> to{ __to, __to_end }; |
28af1fb3 JW |
1159 | auto res = ucs2_out(from, to, _M_maxcode, _M_mode); |
1160 | __from_next = from.next; | |
1161 | __to_next = reinterpret_cast<char*>(to.next); | |
1162 | return res; | |
1163 | } | |
1164 | ||
1165 | codecvt_base::result | |
1166 | __codecvt_utf16_base<char16_t>:: | |
1167 | do_unshift(state_type&, extern_type* __to, extern_type*, | |
1168 | extern_type*& __to_next) const | |
1169 | { | |
1170 | __to_next = __to; | |
1171 | return noconv; | |
1172 | } | |
1173 | ||
1174 | codecvt_base::result | |
1175 | __codecvt_utf16_base<char16_t>:: | |
1176 | do_in(state_type&, const extern_type* __from, const extern_type* __from_end, | |
1177 | const extern_type*& __from_next, | |
1178 | intern_type* __to, intern_type* __to_end, | |
1179 | intern_type*& __to_next) const | |
1180 | { | |
d951e75d | 1181 | range<const char16_t, false> from{ __from, __from_end }; |
28af1fb3 JW |
1182 | range<char16_t> to{ __to, __to_end }; |
1183 | auto res = ucs2_in(from, to, _M_maxcode, _M_mode); | |
1184 | __from_next = reinterpret_cast<const char*>(from.next); | |
1185 | __to_next = to.next; | |
d951e75d JW |
1186 | if (res == codecvt_base::ok && __from_next != __from_end) |
1187 | res = codecvt_base::error; | |
28af1fb3 JW |
1188 | return res; |
1189 | } | |
1190 | ||
1191 | int | |
1192 | __codecvt_utf16_base<char16_t>::do_encoding() const throw() | |
516231de | 1193 | { return 0; } // UTF-16 is not a fixed-width encoding |
28af1fb3 JW |
1194 | |
1195 | bool | |
1196 | __codecvt_utf16_base<char16_t>::do_always_noconv() const throw() | |
1197 | { return false; } | |
1198 | ||
1199 | int | |
1200 | __codecvt_utf16_base<char16_t>:: | |
1201 | do_length(state_type&, const extern_type* __from, | |
1202 | const extern_type* __end, size_t __max) const | |
1203 | { | |
d951e75d JW |
1204 | range<const char16_t, false> from{ __from, __end }; |
1205 | const char16_t* next = ucs2_span(from, __max, _M_maxcode, _M_mode); | |
28af1fb3 JW |
1206 | return reinterpret_cast<const char*>(next) - __from; |
1207 | } | |
1208 | ||
1209 | int | |
1210 | __codecvt_utf16_base<char16_t>::do_max_length() const throw() | |
516231de JW |
1211 | { |
1212 | // A single UCS-2 character requires one UTF-16 code unit (so two chars). | |
1213 | // (UCS-2 cannot represent characters that use multiple UTF-16 code units). | |
1214 | int max = 2; | |
1215 | if (_M_mode & consume_header) | |
1216 | max += sizeof(utf16_bom); | |
1217 | return max; | |
1218 | } | |
28af1fb3 JW |
1219 | |
1220 | // Define members of codecvt_utf16<char32_t> base class implementation. | |
1221 | // Converts from UTF-16 to UTF-32 (aka UCS-4). | |
1222 | ||
1223 | __codecvt_utf16_base<char32_t>::~__codecvt_utf16_base() { } | |
1224 | ||
1225 | codecvt_base::result | |
1226 | __codecvt_utf16_base<char32_t>:: | |
1227 | do_out(state_type&, const intern_type* __from, const intern_type* __from_end, | |
1228 | const intern_type*& __from_next, | |
1229 | extern_type* __to, extern_type* __to_end, | |
1230 | extern_type*& __to_next) const | |
1231 | { | |
1232 | range<const char32_t> from{ __from, __from_end }; | |
d951e75d | 1233 | range<char16_t, false> to{ __to, __to_end }; |
28af1fb3 JW |
1234 | auto res = ucs4_out(from, to, _M_maxcode, _M_mode); |
1235 | __from_next = from.next; | |
1236 | __to_next = reinterpret_cast<char*>(to.next); | |
1237 | return res; | |
1238 | } | |
1239 | ||
1240 | codecvt_base::result | |
1241 | __codecvt_utf16_base<char32_t>:: | |
1242 | do_unshift(state_type&, extern_type* __to, extern_type*, | |
1243 | extern_type*& __to_next) const | |
1244 | { | |
1245 | __to_next = __to; | |
1246 | return noconv; | |
1247 | } | |
1248 | ||
1249 | codecvt_base::result | |
1250 | __codecvt_utf16_base<char32_t>:: | |
1251 | do_in(state_type&, const extern_type* __from, const extern_type* __from_end, | |
1252 | const extern_type*& __from_next, | |
1253 | intern_type* __to, intern_type* __to_end, | |
1254 | intern_type*& __to_next) const | |
1255 | { | |
d951e75d | 1256 | range<const char16_t, false> from{ __from, __from_end }; |
28af1fb3 JW |
1257 | range<char32_t> to{ __to, __to_end }; |
1258 | auto res = ucs4_in(from, to, _M_maxcode, _M_mode); | |
1259 | __from_next = reinterpret_cast<const char*>(from.next); | |
1260 | __to_next = to.next; | |
d951e75d JW |
1261 | if (res == codecvt_base::ok && __from_next != __from_end) |
1262 | res = codecvt_base::error; | |
28af1fb3 JW |
1263 | return res; |
1264 | } | |
1265 | ||
1266 | int | |
1267 | __codecvt_utf16_base<char32_t>::do_encoding() const throw() | |
516231de | 1268 | { return 0; } // UTF-16 is not a fixed-width encoding |
28af1fb3 JW |
1269 | |
1270 | bool | |
1271 | __codecvt_utf16_base<char32_t>::do_always_noconv() const throw() | |
1272 | { return false; } | |
1273 | ||
1274 | int | |
1275 | __codecvt_utf16_base<char32_t>:: | |
1276 | do_length(state_type&, const extern_type* __from, | |
1277 | const extern_type* __end, size_t __max) const | |
1278 | { | |
d951e75d JW |
1279 | range<const char16_t, false> from{ __from, __end }; |
1280 | const char16_t* next = ucs4_span(from, __max, _M_maxcode, _M_mode); | |
28af1fb3 JW |
1281 | return reinterpret_cast<const char*>(next) - __from; |
1282 | } | |
1283 | ||
1284 | int | |
1285 | __codecvt_utf16_base<char32_t>::do_max_length() const throw() | |
516231de JW |
1286 | { |
1287 | // A single UCS-4 character requires one or two UTF-16 code units | |
1288 | // (so up to four chars). | |
1289 | int max = 4; | |
1290 | if (_M_mode & consume_header) | |
1291 | max += sizeof(utf16_bom); | |
1292 | return max; | |
1293 | } | |
28af1fb3 JW |
1294 | |
1295 | #ifdef _GLIBCXX_USE_WCHAR_T | |
1296 | // Define members of codecvt_utf16<wchar_t> base class implementation. | |
1297 | // Converts from UTF-16 to UCS-2 or UCS-4 depending on sizeof(wchar_t). |
1298 | ||
1299 | __codecvt_utf16_base<wchar_t>::~__codecvt_utf16_base() { } | |
1300 | ||
1301 | codecvt_base::result | |
1302 | __codecvt_utf16_base<wchar_t>:: | |
1303 | do_out(state_type&, const intern_type* __from, const intern_type* __from_end, | |
1304 | const intern_type*& __from_next, | |
1305 | extern_type* __to, extern_type* __to_end, | |
1306 | extern_type*& __to_next) const | |
1307 | { | |
d951e75d | 1308 | range<char16_t, false> to{ __to, __to_end }; |
28af1fb3 JW |
1309 | #if __SIZEOF_WCHAR_T__ == 2 |
1310 | range<const char16_t> from{ | |
1311 | reinterpret_cast<const char16_t*>(__from), | |
d951e75d | 1312 | reinterpret_cast<const char16_t*>(__from_end), |
28af1fb3 JW |
1313 | }; |
1314 | auto res = ucs2_out(from, to, _M_maxcode, _M_mode); | |
1315 | #elif __SIZEOF_WCHAR_T__ == 4 | |
1316 | range<const char32_t> from{ | |
1317 | reinterpret_cast<const char32_t*>(__from), | |
d951e75d | 1318 | reinterpret_cast<const char32_t*>(__from_end), |
28af1fb3 JW |
1319 | }; |
1320 | auto res = ucs4_out(from, to, _M_maxcode, _M_mode); | |
1321 | #else | |
1322 | return codecvt_base::error; | |
1323 | #endif | |
1324 | __from_next = reinterpret_cast<const wchar_t*>(from.next); | |
a4c687d6 | 1325 | __to_next = reinterpret_cast<char*>(to.next); |
28af1fb3 JW |
1326 | return res; |
1327 | } | |
1328 | ||
1329 | codecvt_base::result | |
1330 | __codecvt_utf16_base<wchar_t>:: | |
1331 | do_unshift(state_type&, extern_type* __to, extern_type*, | |
1332 | extern_type*& __to_next) const | |
1333 | { | |
1334 | __to_next = __to; | |
1335 | return noconv; | |
1336 | } | |
1337 | ||
1338 | codecvt_base::result | |
1339 | __codecvt_utf16_base<wchar_t>:: | |
1340 | do_in(state_type&, const extern_type* __from, const extern_type* __from_end, | |
1341 | const extern_type*& __from_next, | |
1342 | intern_type* __to, intern_type* __to_end, | |
1343 | intern_type*& __to_next) const | |
1344 | { | |
d951e75d | 1345 | range<const char16_t, false> from{ __from, __from_end }; |
28af1fb3 JW |
1346 | #if __SIZEOF_WCHAR_T__ == 2 |
1347 | range<char16_t> to{ | |
1348 | reinterpret_cast<char16_t*>(__to), | |
d951e75d | 1349 | reinterpret_cast<char16_t*>(__to_end), |
28af1fb3 JW |
1350 | }; |
1351 | auto res = ucs2_in(from, to, _M_maxcode, _M_mode); | |
1352 | #elif __SIZEOF_WCHAR_T__ == 4 | |
1353 | range<char32_t> to{ | |
1354 | reinterpret_cast<char32_t*>(__to), | |
d951e75d | 1355 | reinterpret_cast<char32_t*>(__to_end), |
28af1fb3 JW |
1356 | }; |
1357 | auto res = ucs4_in(from, to, _M_maxcode, _M_mode); | |
1358 | #else | |
1359 | return codecvt_base::error; | |
1360 | #endif | |
a4c687d6 | 1361 | __from_next = reinterpret_cast<const char*>(from.next); |
28af1fb3 | 1362 | __to_next = reinterpret_cast<wchar_t*>(to.next); |
d951e75d JW |
1363 | if (res == codecvt_base::ok && __from_next != __from_end) |
1364 | res = codecvt_base::error; | |
28af1fb3 JW |
1365 | return res; |
1366 | } | |
1367 | ||
1368 | int | |
1369 | __codecvt_utf16_base<wchar_t>::do_encoding() const throw() | |
516231de | 1370 | { return 0; } // UTF-16 is not a fixed-width encoding |
28af1fb3 JW |
1371 | |
1372 | bool | |
1373 | __codecvt_utf16_base<wchar_t>::do_always_noconv() const throw() | |
1374 | { return false; } | |
1375 | ||
1376 | int | |
1377 | __codecvt_utf16_base<wchar_t>:: | |
1378 | do_length(state_type&, const extern_type* __from, | |
1379 | const extern_type* __end, size_t __max) const | |
1380 | { | |
d951e75d | 1381 | range<const char16_t, false> from{ __from, __end }; |
28af1fb3 | 1382 | #if __SIZEOF_WCHAR_T__ == 2 |
d951e75d | 1383 | const char16_t* next = ucs2_span(from, __max, _M_maxcode, _M_mode); |
28af1fb3 | 1384 | #elif __SIZEOF_WCHAR_T__ == 4 |
d951e75d | 1385 | const char16_t* next = ucs4_span(from, __max, _M_maxcode, _M_mode); |
28af1fb3 JW |
1386 | #endif |
1387 | return reinterpret_cast<const char*>(next) - __from; | |
1388 | } | |
1389 | ||
1390 | int | |
1391 | __codecvt_utf16_base<wchar_t>::do_max_length() const throw() | |
516231de JW |
1392 | { |
1393 | #if __SIZEOF_WCHAR_T__ == 2 | |
1394 | int max = 2; // See __codecvt_utf16_base<char16_t>::do_max_length() | |
1395 | #else | |
1396 | int max = 4; // See __codecvt_utf16_base<char32_t>::do_max_length() | |
1397 | #endif | |
1398 | if (_M_mode & consume_header) | |
1399 | max += sizeof(utf16_bom); | |
1400 | return max; | |
1401 | } | |
28af1fb3 JW |
1402 | #endif |
1403 | ||
1404 | // Define members of codecvt_utf8_utf16<char16_t> base class implementation. | |
1405 | // Converts from UTF-8 to UTF-16. | |
1406 | ||
1407 | __codecvt_utf8_utf16_base<char16_t>::~__codecvt_utf8_utf16_base() { } | |
1408 | ||
1409 | codecvt_base::result | |
1410 | __codecvt_utf8_utf16_base<char16_t>:: | |
1411 | do_out(state_type&, const intern_type* __from, const intern_type* __from_end, | |
1412 | const intern_type*& __from_next, | |
1413 | extern_type* __to, extern_type* __to_end, | |
1414 | extern_type*& __to_next) const | |
1415 | { | |
1416 | range<const char16_t> from{ __from, __from_end }; | |
1417 | range<char> to{ __to, __to_end }; | |
1418 | auto res = utf16_out(from, to, _M_maxcode, _M_mode); | |
1419 | __from_next = from.next; | |
1420 | __to_next = to.next; | |
1421 | return res; | |
1422 | } | |
1423 | ||
1424 | codecvt_base::result | |
1425 | __codecvt_utf8_utf16_base<char16_t>:: | |
1426 | do_unshift(state_type&, extern_type* __to, extern_type*, | |
1427 | extern_type*& __to_next) const | |
1428 | { | |
1429 | __to_next = __to; | |
1430 | return noconv; | |
1431 | } | |
1432 | ||
1433 | codecvt_base::result | |
1434 | __codecvt_utf8_utf16_base<char16_t>:: | |
1435 | do_in(state_type&, const extern_type* __from, const extern_type* __from_end, | |
1436 | const extern_type*& __from_next, | |
1437 | intern_type* __to, intern_type* __to_end, | |
1438 | intern_type*& __to_next) const | |
1439 | { | |
1440 | range<const char> from{ __from, __from_end }; | |
1441 | range<char16_t> to{ __to, __to_end }; | |
a1e1ec76 | 1442 | codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header)); |
795038b7 JW |
1443 | #if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__ |
1444 | mode = codecvt_mode(mode | little_endian); | |
1445 | #endif | |
1446 | auto res = utf16_in(from, to, _M_maxcode, mode); | |
28af1fb3 JW |
1447 | __from_next = from.next; |
1448 | __to_next = to.next; | |
1449 | return res; | |
1450 | } | |
1451 | ||
1452 | int | |
1453 | __codecvt_utf8_utf16_base<char16_t>::do_encoding() const throw() | |
516231de | 1454 | { return 0; } // UTF-8 is not a fixed-width encoding |
28af1fb3 JW |
1455 | |
1456 | bool | |
1457 | __codecvt_utf8_utf16_base<char16_t>::do_always_noconv() const throw() | |
1458 | { return false; } | |
1459 | ||
1460 | int | |
1461 | __codecvt_utf8_utf16_base<char16_t>:: | |
1462 | do_length(state_type&, const extern_type* __from, | |
1463 | const extern_type* __end, size_t __max) const | |
1464 | { | |
1465 | __end = utf16_span(__from, __end, __max, _M_maxcode, _M_mode); | |
1466 | return __end - __from; | |
1467 | } | |
1468 | ||
1469 | int | |
1470 | __codecvt_utf8_utf16_base<char16_t>::do_max_length() const throw() | |
1471 | { | |
516231de JW |
1472 | // A single character can be 1 or 2 UTF-16 code units, |
1473 | // requiring up to 4 UTF-8 code units. | |
1474 | int max = 4; | |
1475 | if (_M_mode & consume_header) | |
1476 | max += sizeof(utf8_bom); | |
1477 | return max; | |
28af1fb3 JW |
1478 | } |
1479 | ||
1480 | // Define members of codecvt_utf8_utf16<char32_t> base class implementation. | |
1481 | // Converts from UTF-8 to UTF-16. | |
1482 | ||
1483 | __codecvt_utf8_utf16_base<char32_t>::~__codecvt_utf8_utf16_base() { } | |
1484 | ||
1485 | codecvt_base::result | |
1486 | __codecvt_utf8_utf16_base<char32_t>:: | |
1487 | do_out(state_type&, const intern_type* __from, const intern_type* __from_end, | |
1488 | const intern_type*& __from_next, | |
1489 | extern_type* __to, extern_type* __to_end, | |
1490 | extern_type*& __to_next) const | |
1491 | { | |
1492 | range<const char32_t> from{ __from, __from_end }; | |
1493 | range<char> to{ __to, __to_end }; | |
1494 | auto res = utf16_out(from, to, _M_maxcode, _M_mode); | |
1495 | __from_next = from.next; | |
1496 | __to_next = to.next; | |
1497 | return res; | |
1498 | } | |
1499 | ||
1500 | codecvt_base::result | |
1501 | __codecvt_utf8_utf16_base<char32_t>:: | |
1502 | do_unshift(state_type&, extern_type* __to, extern_type*, | |
1503 | extern_type*& __to_next) const | |
1504 | { | |
1505 | __to_next = __to; | |
1506 | return noconv; | |
1507 | } | |
1508 | ||
1509 | codecvt_base::result | |
1510 | __codecvt_utf8_utf16_base<char32_t>:: | |
1511 | do_in(state_type&, const extern_type* __from, const extern_type* __from_end, | |
1512 | const extern_type*& __from_next, | |
1513 | intern_type* __to, intern_type* __to_end, | |
1514 | intern_type*& __to_next) const | |
1515 | { | |
1516 | range<const char> from{ __from, __from_end }; | |
1517 | range<char32_t> to{ __to, __to_end }; | |
02e12bda JW |
1518 | codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header)); |
1519 | #if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__ | |
1520 | mode = codecvt_mode(mode | little_endian); | |
1521 | #endif | |
1522 | auto res = utf16_in(from, to, _M_maxcode, mode); | |
28af1fb3 JW |
1523 | __from_next = from.next; |
1524 | __to_next = to.next; | |
1525 | return res; | |
1526 | } | |
1527 | ||
1528 | int | |
1529 | __codecvt_utf8_utf16_base<char32_t>::do_encoding() const throw() | |
516231de | 1530 | { return 0; } // UTF-8 is not a fixed-width encoding |
28af1fb3 JW |
1531 | |
1532 | bool | |
1533 | __codecvt_utf8_utf16_base<char32_t>::do_always_noconv() const throw() | |
1534 | { return false; } | |
1535 | ||
1536 | int | |
1537 | __codecvt_utf8_utf16_base<char32_t>:: | |
1538 | do_length(state_type&, const extern_type* __from, | |
1539 | const extern_type* __end, size_t __max) const | |
1540 | { | |
1541 | __end = utf16_span(__from, __end, __max, _M_maxcode, _M_mode); | |
1542 | return __end - __from; | |
1543 | } | |
1544 | ||
1545 | int | |
1546 | __codecvt_utf8_utf16_base<char32_t>::do_max_length() const throw() | |
1547 | { | |
516231de JW |
1548 | // A single character can be 1 or 2 UTF-16 code units, |
1549 | // requiring up to 4 UTF-8 code units. | |
1550 | int max = 4; | |
1551 | if (_M_mode & consume_header) | |
1552 | max += sizeof(utf8_bom); | |
1553 | return max; | |
28af1fb3 JW |
1554 | } |
1555 | ||
1556 | #ifdef _GLIBCXX_USE_WCHAR_T | |
1557 | // Define members of codecvt_utf8_utf16<wchar_t> base class implementation. | |
1558 | // Converts from UTF-8 to UTF-16. | |
1559 | ||
1560 | __codecvt_utf8_utf16_base<wchar_t>::~__codecvt_utf8_utf16_base() { } | |
1561 | ||
1562 | codecvt_base::result | |
1563 | __codecvt_utf8_utf16_base<wchar_t>:: | |
1564 | do_out(state_type&, const intern_type* __from, const intern_type* __from_end, | |
1565 | const intern_type*& __from_next, | |
1566 | extern_type* __to, extern_type* __to_end, | |
1567 | extern_type*& __to_next) const | |
1568 | { | |
1569 | range<const wchar_t> from{ __from, __from_end }; | |
1570 | range<char> to{ __to, __to_end }; | |
1571 | auto res = utf16_out(from, to, _M_maxcode, _M_mode); | |
1572 | __from_next = from.next; | |
1573 | __to_next = to.next; | |
1574 | return res; | |
1575 | } | |
1576 | ||
1577 | codecvt_base::result | |
1578 | __codecvt_utf8_utf16_base<wchar_t>:: | |
1579 | do_unshift(state_type&, extern_type* __to, extern_type*, | |
1580 | extern_type*& __to_next) const | |
1581 | { | |
1582 | __to_next = __to; | |
1583 | return noconv; | |
1584 | } | |
1585 | ||
1586 | codecvt_base::result | |
1587 | __codecvt_utf8_utf16_base<wchar_t>:: | |
1588 | do_in(state_type&, const extern_type* __from, const extern_type* __from_end, | |
1589 | const extern_type*& __from_next, | |
1590 | intern_type* __to, intern_type* __to_end, | |
1591 | intern_type*& __to_next) const | |
1592 | { | |
1593 | range<const char> from{ __from, __from_end }; | |
1594 | range<wchar_t> to{ __to, __to_end }; | |
02e12bda JW |
1595 | codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header)); |
1596 | #if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__ | |
1597 | mode = codecvt_mode(mode | little_endian); | |
1598 | #endif | |
1599 | auto res = utf16_in(from, to, _M_maxcode, mode); | |
28af1fb3 JW |
1600 | __from_next = from.next; |
1601 | __to_next = to.next; | |
1602 | return res; | |
1603 | } | |
1604 | ||
1605 | int | |
1606 | __codecvt_utf8_utf16_base<wchar_t>::do_encoding() const throw() | |
516231de | 1607 | { return 0; } // UTF-8 is not a fixed-width encoding |
28af1fb3 JW |
1608 | |
1609 | bool | |
1610 | __codecvt_utf8_utf16_base<wchar_t>::do_always_noconv() const throw() | |
1611 | { return false; } | |
1612 | ||
1613 | int | |
1614 | __codecvt_utf8_utf16_base<wchar_t>:: | |
1615 | do_length(state_type&, const extern_type* __from, | |
1616 | const extern_type* __end, size_t __max) const | |
1617 | { | |
1618 | __end = utf16_span(__from, __end, __max, _M_maxcode, _M_mode); | |
1619 | return __end - __from; | |
1620 | } | |
1621 | ||
1622 | int | |
1623 | __codecvt_utf8_utf16_base<wchar_t>::do_max_length() const throw() | |
1624 | { | |
516231de JW |
1625 | // A single character can be 1 or 2 UTF-16 code units, |
1626 | // requiring up to 4 UTF-8 code units. | |
1627 | int max = 4; | |
1628 | if (_M_mode & consume_header) | |
1629 | max += sizeof(utf8_bom); | |
1630 | return max; | |
28af1fb3 JW |
1631 | } |
1632 | #endif | |
1633 | ||
bb93f35d JW |
1634 | inline template class __codecvt_abstract_base<char16_t, char, mbstate_t>; |
1635 | inline template class __codecvt_abstract_base<char32_t, char, mbstate_t>; | |
71a16cd8 JW |
1636 | template class codecvt_byname<char16_t, char, mbstate_t>; |
1637 | template class codecvt_byname<char32_t, char, mbstate_t>; | |
bb93f35d JW |
1638 | |
1639 | _GLIBCXX_END_NAMESPACE_VERSION | |
1640 | } |