From 9c24e93e4350da302ffa8c0ea899f67a3485ee30 Mon Sep 17 00:00:00 2001 From: Jonathan Wakely Date: Fri, 17 Mar 2017 19:28:42 +0000 Subject: [PATCH] Backport fixes from trunk Fix alignment bugs in std::codecvt_utf16 * src/c++11/codecvt.cc (range): Add non-type template parameter and define overloaded operators for reading and writing code units. (range): Define partial specialization for accessing wide characters in potentially unaligned byte ranges. (ucs2_span(const char16_t*, const char16_t*, ...)) (ucs4_span(const char16_t*, const char16_t*, ...)): Change parameters to range in order to avoid unaligned reads. (__codecvt_utf16_base::do_out) (__codecvt_utf16_base::do_out) (__codecvt_utf16_base::do_out): Use range specialization for unaligned data to avoid unaligned writes. (__codecvt_utf16_base::do_in) (__codecvt_utf16_base::do_in) (__codecvt_utf16_base::do_in): Likewise for writes. Return error if there are unprocessable trailing bytes. (__codecvt_utf16_base::do_length) (__codecvt_utf16_base::do_length) (__codecvt_utf16_base::do_length): Pass arguments of type range to span functions. * testsuite/22_locale/codecvt/codecvt_utf16/misaligned.cc: New test. PR libstdc++/79980 fix target type of cast PR libstdc++/79980 * src/c++11/codecvt.cc (to_integer(codecvt_mode)): Fix target type. PR libstdc++/80041 fix codecvt_utf16 to use UTF-16 not UTF-8 PR libstdc++/80041 * src/c++11/codecvt.cc (__codecvt_utf16_base::do_out) (__codecvt_utf16_base::do_in): Convert char arguments to char16_t to work with UTF-16 instead of UTF-8. * testsuite/22_locale/codecvt/codecvt_utf16/80041.cc: New test. Fix encoding() and max_length() values for codecvt facets * src/c++11/codecvt.cc (codecvt) (codecvt, __codecvt_utf8_base) (__codecvt_utf8_base, __codecvt_utf8_base) (__codecvt_utf16_base, __codecvt_utf16_base) (__codecvt_utf16_base, __codecvt_utf8_utf16_base) (__codecvt_utf8_utf16_base) (__codecvt_utf8_utf16_base): Fix do_encoding() and do_max_length() return values. 
* testsuite/22_locale/codecvt/codecvt_utf16/members.cc: New test. * testsuite/22_locale/codecvt/codecvt_utf8/members.cc: New test. * testsuite/22_locale/codecvt/codecvt_utf8_utf16/members.cc: New test. PR libstdc++/79980 fix BOM detection, maxcode checks, UCS2 handling PR libstdc++/79980 * include/bits/locale_conv.h (__do_str_codecvt): Set __count on error path. * src/c++11/codecvt.cc (operator&=, operator|=, operator~): Overloads for manipulating codecvt_mode values. (read_utf16_bom): Compare input to BOM constants instead of integral constants that depend on endianness. Take mode parameter by reference and adjust it, to distinguish between no BOM present and UTF-16BE BOM present. (ucs4_in, ucs2_span, ucs4_span): Adjust calls to read_utf16_bom. (surrogates): New enumeration type. (utf16_in, utf16_out): Add surrogates parameter to choose between UTF-16 and UCS2 behaviour. (utf16_span, ucs2_span): Use std::min not std::max. (ucs2_out): Use std::min not std::max. Disallow surrogate pairs. (ucs2_in): Likewise. Adjust calls to read_utf16_bom. * testsuite/22_locale/codecvt/codecvt_utf16/79980.cc: New test. * testsuite/22_locale/codecvt/codecvt_utf8/79980.cc: New test. PR libstdc++/79511 fix endianness of UTF-16 data PR libstdc++/79511 * src/c++11/codecvt.cc (write_utf16_code_point): Don't write 0xffff as a surrogate pair. (__codecvt_utf8_utf16_base::do_in): Use native endianness for internal representation. (__codecvt_utf8_utf16_base::do_in): Likewise. * testsuite/22_locale/codecvt/codecvt_utf8_utf16/79511.cc: New test. 
From-SVN: r246247 --- libstdc++-v3/ChangeLog | 77 +++ libstdc++-v3/include/bits/locale_conv.h | 5 +- libstdc++-v3/src/c++11/codecvt.cc | 606 ++++++++++++------ .../testsuite/22_locale/codecvt/char16_t.cc | 2 +- .../22_locale/codecvt/codecvt_utf16/79980.cc | 142 ++++ .../22_locale/codecvt/codecvt_utf16/80041.cc | 87 +++ .../codecvt/codecvt_utf16/members.cc | 81 +++ .../codecvt/codecvt_utf16/misaligned.cc | 289 +++++++++ .../22_locale/codecvt/codecvt_utf8/79980.cc | 94 +++ .../22_locale/codecvt/codecvt_utf8/members.cc | 81 +++ .../codecvt/codecvt_utf8_utf16/79511.cc | 60 ++ .../codecvt/codecvt_utf8_utf16/members.cc | 76 +++ 12 files changed, 1387 insertions(+), 213 deletions(-) create mode 100644 libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/79980.cc create mode 100644 libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/80041.cc create mode 100644 libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/members.cc create mode 100644 libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/misaligned.cc create mode 100644 libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8/79980.cc create mode 100644 libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8/members.cc create mode 100644 libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8_utf16/79511.cc create mode 100644 libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8_utf16/members.cc diff --git a/libstdc++-v3/ChangeLog b/libstdc++-v3/ChangeLog index 01446915800a..537100e59f97 100644 --- a/libstdc++-v3/ChangeLog +++ b/libstdc++-v3/ChangeLog @@ -1,3 +1,80 @@ +2017-03-17 Jonathan Wakely + + Backport from mainline + 2017-03-17 Jonathan Wakely + + * src/c++11/codecvt.cc (range): Add non-type template parameter and + define oerloaded operators for reading and writing code units. + (range): Define partial specialization for accessing + wide characters in potentially unaligned byte ranges. 
+ (ucs2_span(const char16_t*, const char16_t*, ...)) + (ucs4_span(const char16_t*, const char16_t*, ...)): Change parameters + to range in order to avoid unaligned reads. + (__codecvt_utf16_base::do_out) + (__codecvt_utf16_base::do_out) + (__codecvt_utf16_base::do_out): Use range specialization for + unaligned data to avoid unaligned writes. + (__codecvt_utf16_base::do_in) + (__codecvt_utf16_base::do_in) + (__codecvt_utf16_base::do_in): Likewise for writes. Return + error if there are unprocessable trailing bytes. + (__codecvt_utf16_base::do_length) + (__codecvt_utf16_base::do_length) + (__codecvt_utf16_base::do_length): Pass arguments of type + range to span functions. + * testsuite/22_locale/codecvt/codecvt_utf16/misaligned.cc: New test. + + Backport from mainline + 2017-03-16 Jonathan Wakely + + PR libstdc++/79980 + * src/c++11/codecvt.cc (to_integer(codecvt_mode)): Fix target type. + + PR libstdc++/80041 + * src/c++11/codecvt.cc (__codecvt_utf16_base::do_out) + (__codecvt_utf16_base::do_in): Convert char arguments to + char16_t to work with UTF-16 instead of UTF-8. + * testsuite/22_locale/codecvt/codecvt_utf16/80041.cc: New test. + + * src/c++11/codecvt.cc (codecvt) + (codecvt, __codecvt_utf8_base) + (__codecvt_utf8_base, __codecvt_utf8_base) + (__codecvt_utf16_base, __codecvt_utf16_base) + (__codecvt_utf16_base, __codecvt_utf8_utf16_base) + (__codecvt_utf8_utf16_base) + (__codecvt_utf8_utf16_base): Fix do_encoding() and + do_max_length() return values. + * testsuite/22_locale/codecvt/codecvt_utf16/members.cc: New test. + * testsuite/22_locale/codecvt/codecvt_utf8/members.cc: New test. + * testsuite/22_locale/codecvt/codecvt_utf8_utf16/members.cc: New test. + + PR libstdc++/79980 + * include/bits/locale_conv.h (__do_str_codecvt): Set __count on + error path. + * src/c++11/codecvt.cc (operator&=, operator|=, operator~): Overloads + for manipulating codecvt_mode values. 
+ (read_utf16_bom): Compare input to BOM constants instead of integral + constants that depend on endianness. Take mode parameter by + reference and adjust it, to distinguish between no BOM present and + UTF-16BE BOM present. + (ucs4_in, ucs2_span, ucs4_span): Adjust calls to read_utf16_bom. + (surrogates): New enumeration type. + (utf16_in, utf16_out): Add surrogates parameter to choose between + UTF-16 and UCS2 behaviour. + (utf16_span, ucs2_span): Use std::min not std::max. + (ucs2_out): Use std::min not std::max. Disallow surrogate pairs. + (ucs2_in): Likewise. Adjust calls to read_utf16_bom. + * testsuite/22_locale/codecvt/codecvt_utf16/79980.cc: New test. + * testsuite/22_locale/codecvt/codecvt_utf8/79980.cc: New test. + + PR libstdc++/79511 + * src/c++11/codecvt.cc (write_utf16_code_point): Don't write 0xffff + as a surrogate pair. + (__codecvt_utf8_utf16_base::do_in): Use native endianness + for internal representation. + (__codecvt_utf8_utf16_base::do_in): Likewise. + * testsuite/22_locale/codecvt/codecvt_utf8_utf16/79511.cc: New test. + 2017-03-14 Jonathan Wakely * testsuite/17_intro/names.cc: Undefine macros that clash with diff --git a/libstdc++-v3/include/bits/locale_conv.h b/libstdc++-v3/include/bits/locale_conv.h index 16b48758e772..2b3f2dcbf417 100644 --- a/libstdc++-v3/include/bits/locale_conv.h +++ b/libstdc++-v3/include/bits/locale_conv.h @@ -81,7 +81,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION && (__outstr.size() - __outchars) < __maxlen); if (__result == codecvt_base::error) - return false; + { + __count = __next - __first; + return false; + } if (__result == codecvt_base::noconv) { diff --git a/libstdc++-v3/src/c++11/codecvt.cc b/libstdc++-v3/src/c++11/codecvt.cc index 8cd8eb46a96c..118733973411 100644 --- a/libstdc++-v3/src/c++11/codecvt.cc +++ b/libstdc++-v3/src/c++11/codecvt.cc @@ -1,6 +1,6 @@ // Locale support (codecvt) -*- C++ -*- -// Copyright (C) 2015 Free Software Foundation, Inc. +// Copyright (C) 2015-2017 Free Software Foundation, Inc. 
// // This file is part of the GNU ISO C++ Library. This library is free // software; you can redistribute it and/or modify it under the @@ -24,13 +24,27 @@ #include #include // std::memcpy, std::memcmp -#include // std::max +#include // std::min #ifdef _GLIBCXX_USE_C99_STDINT_TR1 namespace std _GLIBCXX_VISIBILITY(default) { _GLIBCXX_BEGIN_NAMESPACE_VERSION + // The standard doesn't define these operators, which is annoying. + static underlying_type::type + to_integer(codecvt_mode m) + { return static_cast::type>(m); } + + static codecvt_mode& operator&=(codecvt_mode& m, codecvt_mode n) + { return m = codecvt_mode(to_integer(m) & to_integer(n)); } + + static codecvt_mode& operator|=(codecvt_mode& m, codecvt_mode n) + { return m = codecvt_mode(to_integer(m) | to_integer(n)); } + + static codecvt_mode operator~(codecvt_mode m) + { return codecvt_mode(~to_integer(m)); } + namespace { // Largest code point that fits in a single UTF-16 code unit. @@ -43,35 +57,142 @@ namespace const char32_t incomplete_mb_character = char32_t(-2); const char32_t invalid_mb_sequence = char32_t(-1); - template + // Utility type for reading and writing code units of type Elem from + // a range defined by a pair of pointers. + template struct range { Elem* next; Elem* end; + // Write a code unit. + range& operator=(Elem e) + { + *next++ = e; + return *this; + } + + // Read the next code unit. Elem operator*() const { return *next; } - range& operator++() { ++next; return *this; } + // Read the Nth code unit. + Elem operator[](size_t n) const { return next[n]; } + // Move to the next code unit. + range& operator++() + { + ++next; + return *this; + } + + // Move to the Nth code unit. + range& operator+=(size_t n) + { + next += n; + return *this; + } + + // The number of code units remaining. size_t size() const { return end - next; } + + // The number of bytes remaining. 
+ size_t nbytes() const { return (const char*)end - (const char*)next; } + }; + + // This specialization is used when accessing char16_t values through + // pointers to char, which might not be correctly aligned for char16_t. + template + struct range + { + using value_type = typename remove_const::type; + + using char_pointer = typename + conditional::value, const char*, char*>::type; + + char_pointer next; + char_pointer end; + + // Write a code unit. + range& operator=(Elem e) + { + memcpy(next, &e, sizeof(Elem)); + ++*this; + return *this; + } + + // Read the next code unit. + Elem operator*() const + { + value_type e; + memcpy(&e, next, sizeof(Elem)); + return e; + } + + // Read the Nth code unit. + Elem operator[](size_t n) const + { + value_type e; + memcpy(&e, next + n * sizeof(Elem), sizeof(Elem)); + return e; + } + + // Move to the next code unit. + range& operator++() + { + next += sizeof(Elem); + return *this; + } + + // Move to the Nth code unit. + range& operator+=(size_t n) + { + next += n * sizeof(Elem); + return *this; + } + + // The number of code units remaining. + size_t size() const { return nbytes() / sizeof(Elem); } + + // The number of bytes remaining. + size_t nbytes() const { return end - next; } }; // Multibyte sequences can have "header" consisting of Byte Order Mark const unsigned char utf8_bom[3] = { 0xEF, 0xBB, 0xBF }; - const unsigned char utf16_bom[4] = { 0xFE, 0xFF }; - const unsigned char utf16le_bom[4] = { 0xFF, 0xFE }; + const unsigned char utf16_bom[2] = { 0xFE, 0xFF }; + const unsigned char utf16le_bom[2] = { 0xFF, 0xFE }; - template - inline bool - write_bom(range& to, const unsigned char (&bom)[N]) + // Write a BOM (space permitting). 
+ template + bool + write_bom(range& to, const unsigned char (&bom)[N]) { - if (to.size() < N) + static_assert( (N / sizeof(C)) != 0, "" ); + static_assert( (N % sizeof(C)) == 0, "" ); + + if (to.nbytes() < N) return false; memcpy(to.next, bom, N); - to.next += N; + to += (N / sizeof(C)); return true; } + // Try to read a BOM. + template + bool + read_bom(range& from, const unsigned char (&bom)[N]) + { + static_assert( (N / sizeof(C)) != 0, "" ); + static_assert( (N % sizeof(C)) == 0, "" ); + + if (from.nbytes() >= N && !memcmp(from.next, bom, N)) + { + from += (N / sizeof(C)); + return true; + } + return false; + } + // If generate_header is set in mode write out UTF-8 BOM. bool write_utf8_bom(range& to, codecvt_mode mode) @@ -83,32 +204,20 @@ namespace // If generate_header is set in mode write out the UTF-16 BOM indicated // by whether little_endian is set in mode. + template bool - write_utf16_bom(range& to, codecvt_mode mode) + write_utf16_bom(range& to, codecvt_mode mode) { if (mode & generate_header) { - if (!to.size()) - return false; - auto* bom = (mode & little_endian) ? utf16le_bom : utf16_bom; - std::memcpy(to.next, bom, 2); - ++to.next; + if (mode & little_endian) + return write_bom(to, utf16le_bom); + else + return write_bom(to, utf16_bom); } return true; } - template - inline bool - read_bom(range& from, const unsigned char (&bom)[N]) - { - if (from.size() >= N && !memcmp(from.next, bom, N)) - { - from.next += N; - return true; - } - return false; - } - // If consume_header is set in mode update from.next to after any BOM. void read_utf8_bom(range& from, codecvt_mode mode) @@ -117,22 +226,21 @@ namespace read_bom(from, utf8_bom); } - // If consume_header is set in mode update from.next to after any BOM. - // Return little_endian iff the UTF-16LE BOM was present. - codecvt_mode - read_utf16_bom(range& from, codecvt_mode mode) + // If consume_header is not set in mode, no effects. 
+ // Otherwise, if *from.next is a UTF-16 BOM increment from.next and then: + // - if the UTF-16BE BOM was found unset little_endian in mode, or + // - if the UTF-16LE BOM was found set little_endian in mode. + template + void + read_utf16_bom(range& from, codecvt_mode& mode) { - if (mode & consume_header && from.size()) + if (mode & consume_header) { - if (*from.next == 0xFEFF) - ++from.next; - else if (*from.next == 0xFFFE) - { - ++from.next; - return little_endian; - } + if (read_bom(from, utf16_bom)) + mode &= ~little_endian; + else if (read_bom(from, utf16le_bom)) + mode |= little_endian; } - return {}; } // Read a codepoint from a UTF-8 multibyte sequence. @@ -144,11 +252,11 @@ namespace const size_t avail = from.size(); if (avail == 0) return incomplete_mb_character; - unsigned char c1 = from.next[0]; + unsigned char c1 = from[0]; // https://en.wikipedia.org/wiki/UTF-8#Sample_code if (c1 < 0x80) { - ++from.next; + ++from; return c1; } else if (c1 < 0xC2) // continuation or overlong 2-byte sequence @@ -157,51 +265,51 @@ namespace { if (avail < 2) return incomplete_mb_character; - unsigned char c2 = from.next[1]; + unsigned char c2 = from[1]; if ((c2 & 0xC0) != 0x80) return invalid_mb_sequence; char32_t c = (c1 << 6) + c2 - 0x3080; if (c <= maxcode) - from.next += 2; + from += 2; return c; } else if (c1 < 0xF0) // 3-byte sequence { if (avail < 3) return incomplete_mb_character; - unsigned char c2 = from.next[1]; + unsigned char c2 = from[1]; if ((c2 & 0xC0) != 0x80) return invalid_mb_sequence; if (c1 == 0xE0 && c2 < 0xA0) // overlong return invalid_mb_sequence; - unsigned char c3 = from.next[2]; + unsigned char c3 = from[2]; if ((c3 & 0xC0) != 0x80) return invalid_mb_sequence; char32_t c = (c1 << 12) + (c2 << 6) + c3 - 0xE2080; if (c <= maxcode) - from.next += 3; + from += 3; return c; } else if (c1 < 0xF5) // 4-byte sequence { if (avail < 4) return incomplete_mb_character; - unsigned char c2 = from.next[1]; + unsigned char c2 = from[1]; if ((c2 & 0xC0) != 
0x80) return invalid_mb_sequence; if (c1 == 0xF0 && c2 < 0x90) // overlong return invalid_mb_sequence; if (c1 == 0xF4 && c2 >= 0x90) // > U+10FFFF return invalid_mb_sequence; - unsigned char c3 = from.next[2]; + unsigned char c3 = from[2]; if ((c3 & 0xC0) != 0x80) return invalid_mb_sequence; - unsigned char c4 = from.next[3]; + unsigned char c4 = from[3]; if ((c4 & 0xC0) != 0x80) return invalid_mb_sequence; char32_t c = (c1 << 18) + (c2 << 12) + (c3 << 6) + c4 - 0x3C82080; if (c <= maxcode) - from.next += 4; + from += 4; return c; } else // > U+10FFFF @@ -215,31 +323,31 @@ namespace { if (to.size() < 1) return false; - *to.next++ = code_point; + to = code_point; } else if (code_point <= 0x7FF) { if (to.size() < 2) return false; - *to.next++ = (code_point >> 6) + 0xC0; - *to.next++ = (code_point & 0x3F) + 0x80; + to = (code_point >> 6) + 0xC0; + to = (code_point & 0x3F) + 0x80; } else if (code_point <= 0xFFFF) { if (to.size() < 3) return false; - *to.next++ = (code_point >> 12) + 0xE0; - *to.next++ = ((code_point >> 6) & 0x3F) + 0x80; - *to.next++ = (code_point & 0x3F) + 0x80; + to = (code_point >> 12) + 0xE0; + to = ((code_point >> 6) & 0x3F) + 0x80; + to = (code_point & 0x3F) + 0x80; } else if (code_point <= 0x10FFFF) { if (to.size() < 4) return false; - *to.next++ = (code_point >> 18) + 0xF0; - *to.next++ = ((code_point >> 12) & 0x3F) + 0x80; - *to.next++ = ((code_point >> 6) & 0x3F) + 0x80; - *to.next++ = (code_point & 0x3F) + 0x80; + to = (code_point >> 18) + 0xF0; + to = ((code_point >> 12) & 0x3F) + 0x80; + to = ((code_point >> 6) & 0x3F) + 0x80; + to = (code_point & 0x3F) + 0x80; } else return false; @@ -280,47 +388,47 @@ namespace // The sequence's endianness is indicated by (mode & little_endian). // Updates from.next if the codepoint is not greater than maxcode. // Returns invalid_mb_sequence, incomplete_mb_character or the code point. 
- char32_t - read_utf16_code_point(range& from, unsigned long maxcode, - codecvt_mode mode) - { - const size_t avail = from.size(); - if (avail == 0) - return incomplete_mb_character; - int inc = 1; - char32_t c = adjust_byte_order(from.next[0], mode); - if (is_high_surrogate(c)) - { - if (avail < 2) - return incomplete_mb_character; - const char16_t c2 = adjust_byte_order(from.next[1], mode); - if (is_low_surrogate(c2)) - { - c = surrogate_pair_to_code_point(c, c2); - inc = 2; - } - else - return invalid_mb_sequence; - } - else if (is_low_surrogate(c)) - return invalid_mb_sequence; - if (c <= maxcode) - from.next += inc; - return c; - } + template + char32_t + read_utf16_code_point(range& from, + unsigned long maxcode, codecvt_mode mode) + { + const size_t avail = from.size(); + if (avail == 0) + return incomplete_mb_character; + int inc = 1; + char32_t c = adjust_byte_order(from[0], mode); + if (is_high_surrogate(c)) + { + if (avail < 2) + return incomplete_mb_character; + const char16_t c2 = adjust_byte_order(from[1], mode); + if (is_low_surrogate(c2)) + { + c = surrogate_pair_to_code_point(c, c2); + inc = 2; + } + else + return invalid_mb_sequence; + } + else if (is_low_surrogate(c)) + return invalid_mb_sequence; + if (c <= maxcode) + from += inc; + return c; + } - template + template bool - write_utf16_code_point(range& to, char32_t codepoint, codecvt_mode mode) + write_utf16_code_point(range& to, char32_t codepoint, codecvt_mode mode) { static_assert(sizeof(C) >= 2, "a code unit must be at least 16-bit"); - if (codepoint < max_single_utf16_unit) + if (codepoint <= max_single_utf16_unit) { if (to.size() > 0) { - *to.next = adjust_byte_order(codepoint, mode); - ++to.next; + to = adjust_byte_order(codepoint, mode); return true; } } @@ -330,9 +438,8 @@ namespace const char32_t LEAD_OFFSET = 0xD800 - (0x10000 >> 10); char16_t lead = LEAD_OFFSET + (codepoint >> 10); char16_t trail = 0xDC00 + (codepoint & 0x3FF); - to.next[0] = adjust_byte_order(lead, mode); - 
to.next[1] = adjust_byte_order(trail, mode); - to.next += 2; + to = adjust_byte_order(lead, mode); + to = adjust_byte_order(trail, mode); return true; } return false; @@ -351,7 +458,7 @@ namespace return codecvt_base::partial; if (codepoint > maxcode) return codecvt_base::error; - *to.next++ = codepoint; + to = codepoint; } return from.size() ? codecvt_base::partial : codecvt_base::ok; } @@ -365,23 +472,22 @@ namespace return codecvt_base::partial; while (from.size()) { - const char32_t c = from.next[0]; + const char32_t c = from[0]; if (c > maxcode) return codecvt_base::error; if (!write_utf8_code_point(to, c)) return codecvt_base::partial; - ++from.next; + ++from; } return codecvt_base::ok; } // utf16 -> ucs4 codecvt_base::result - ucs4_in(range& from, range& to, + ucs4_in(range& from, range& to, unsigned long maxcode = max_code_point, codecvt_mode mode = {}) { - if (read_utf16_bom(from, mode) == little_endian) - mode = codecvt_mode(mode & little_endian); + read_utf16_bom(from, mode); while (from.size() && to.size()) { const char32_t codepoint = read_utf16_code_point(from, maxcode, mode); @@ -389,72 +495,85 @@ namespace return codecvt_base::partial; if (codepoint > maxcode) return codecvt_base::error; - *to.next++ = codepoint; + to = codepoint; } return from.size() ? 
codecvt_base::partial : codecvt_base::ok; } // ucs4 -> utf16 codecvt_base::result - ucs4_out(range& from, range& to, + ucs4_out(range& from, range& to, unsigned long maxcode = max_code_point, codecvt_mode mode = {}) { if (!write_utf16_bom(to, mode)) return codecvt_base::partial; while (from.size()) { - const char32_t c = from.next[0]; + const char32_t c = from[0]; if (c > maxcode) return codecvt_base::error; if (!write_utf16_code_point(to, c, mode)) return codecvt_base::partial; - ++from.next; + ++from; } return codecvt_base::ok; } - // utf8 -> utf16 + // Flag indicating whether to process UTF-16 or UCS2 + enum class surrogates { allowed, disallowed }; + + // utf8 -> utf16 (or utf8 -> ucs2 if s == surrogates::disallowed) template codecvt_base::result utf16_in(range& from, range& to, - unsigned long maxcode = max_code_point, codecvt_mode mode = {}) + unsigned long maxcode = max_code_point, codecvt_mode mode = {}, + surrogates s = surrogates::allowed) { read_utf8_bom(from, mode); while (from.size() && to.size()) { - const char* const first = from.next; + auto orig = from; const char32_t codepoint = read_utf8_code_point(from, maxcode); if (codepoint == incomplete_mb_character) - return codecvt_base::partial; + { + if (s == surrogates::allowed) + return codecvt_base::partial; + else + return codecvt_base::error; // No surrogates in UCS2 + } if (codepoint > maxcode) return codecvt_base::error; if (!write_utf16_code_point(to, codepoint, mode)) { - from.next = first; + from = orig; // rewind to previous position return codecvt_base::partial; } } return codecvt_base::ok; } - // utf16 -> utf8 + // utf16 -> utf8 (or ucs2 -> utf8 if s == surrogates::disallowed) template codecvt_base::result utf16_out(range& from, range& to, - unsigned long maxcode = max_code_point, codecvt_mode mode = {}) + unsigned long maxcode = max_code_point, codecvt_mode mode = {}, + surrogates s = surrogates::allowed) { if (!write_utf8_bom(to, mode)) return codecvt_base::partial; while (from.size()) { - 
char32_t c = from.next[0]; + char32_t c = from[0]; int inc = 1; if (is_high_surrogate(c)) { + if (s == surrogates::disallowed) + return codecvt_base::error; // No surrogates in UCS-2 + if (from.size() < 2) return codecvt_base::ok; // stop converting at this point - const char32_t c2 = from.next[1]; + const char32_t c2 = from[1]; if (is_low_surrogate(c2)) { c = surrogate_pair_to_code_point(c, c2); @@ -469,7 +588,7 @@ namespace return codecvt_base::error; if (!write_utf8_code_point(to, c)) return codecvt_base::partial; - from.next += inc; + from += inc; } return codecvt_base::ok; } @@ -492,7 +611,7 @@ namespace ++count; } if (count+1 == max) // take one more character if it fits in a single unit - read_utf8_code_point(from, std::max(max_single_utf16_unit, maxcode)); + read_utf8_code_point(from, std::min(max_single_utf16_unit, maxcode)); return from.next; } @@ -501,7 +620,9 @@ namespace ucs2_in(range& from, range& to, char32_t maxcode = max_code_point, codecvt_mode mode = {}) { - return utf16_in(from, to, std::max(max_single_utf16_unit, maxcode), mode); + // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit: + maxcode = std::min(max_single_utf16_unit, maxcode); + return utf16_in(from, to, maxcode, mode, surrogates::disallowed); } // ucs2 -> utf8 @@ -509,61 +630,62 @@ namespace ucs2_out(range& from, range& to, char32_t maxcode = max_code_point, codecvt_mode mode = {}) { - return utf16_out(from, to, std::max(max_single_utf16_unit, maxcode), mode); + // UCS-2 only supports characters in the BMP, i.e. 
one UTF-16 code unit: + maxcode = std::min(max_single_utf16_unit, maxcode); + return utf16_out(from, to, maxcode, mode, surrogates::disallowed); } // ucs2 -> utf16 codecvt_base::result - ucs2_out(range& from, range& to, + ucs2_out(range& from, range& to, char32_t maxcode = max_code_point, codecvt_mode mode = {}) { if (!write_utf16_bom(to, mode)) return codecvt_base::partial; while (from.size() && to.size()) { - char16_t c = from.next[0]; + char16_t c = from[0]; if (is_high_surrogate(c)) return codecvt_base::error; if (c > maxcode) return codecvt_base::error; - *to.next++ = adjust_byte_order(c, mode); - ++from.next; + to = adjust_byte_order(c, mode); + ++from; } return from.size() == 0 ? codecvt_base::ok : codecvt_base::partial; } // utf16 -> ucs2 codecvt_base::result - ucs2_in(range& from, range& to, + ucs2_in(range& from, range& to, char32_t maxcode = max_code_point, codecvt_mode mode = {}) { - if (read_utf16_bom(from, mode) == little_endian) - mode = codecvt_mode(mode & little_endian); - maxcode = std::max(max_single_utf16_unit, maxcode); + read_utf16_bom(from, mode); + // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit: + maxcode = std::min(max_single_utf16_unit, maxcode); while (from.size() && to.size()) { const char32_t c = read_utf16_code_point(from, maxcode, mode); if (c == incomplete_mb_character) - return codecvt_base::partial; + return codecvt_base::error; // UCS-2 only supports single units. if (c > maxcode) return codecvt_base::error; - *to.next++ = c; + to = c; } return from.size() == 0 ? 
codecvt_base::ok : codecvt_base::partial; } const char16_t* - ucs2_span(const char16_t* begin, const char16_t* end, size_t max, + ucs2_span(range& from, size_t max, char32_t maxcode, codecvt_mode mode) { - range from{ begin, end }; - if (read_utf16_bom(from, mode) == little_endian) - mode = codecvt_mode(mode & little_endian); - maxcode = std::max(max_single_utf16_unit, maxcode); + read_utf16_bom(from, mode); + // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit: + maxcode = std::min(max_single_utf16_unit, maxcode); char32_t c = 0; while (max-- && c <= maxcode) c = read_utf16_code_point(from, maxcode, mode); - return from.next; + return reinterpret_cast(from.next); } const char* @@ -572,7 +694,8 @@ namespace { range from{ begin, end }; read_utf8_bom(from, mode); - maxcode = std::max(max_single_utf16_unit, maxcode); + // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit: + maxcode = std::min(max_single_utf16_unit, maxcode); char32_t c = 0; while (max-- && c <= maxcode) c = read_utf8_code_point(from, maxcode); @@ -594,16 +717,14 @@ namespace // return pos such that [begin,pos) is valid UCS-4 string no longer than max const char16_t* - ucs4_span(const char16_t* begin, const char16_t* end, size_t max, + ucs4_span(range& from, size_t max, char32_t maxcode = max_code_point, codecvt_mode mode = {}) { - range from{ begin, end }; - if (read_utf16_bom(from, mode) == little_endian) - mode = codecvt_mode(mode & little_endian); + read_utf16_bom(from, mode); char32_t c = 0; while (max-- && c <= maxcode) c = read_utf16_code_point(from, maxcode, mode); - return from.next; + return reinterpret_cast(from.next); } } @@ -661,7 +782,7 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end, int codecvt::do_encoding() const throw() -{ return 0; } +{ return 0; } // UTF-8 is not a fixed-width encoding bool codecvt::do_always_noconv() const throw() @@ -679,9 +800,9 @@ do_length(state_type&, const extern_type* __from, int 
codecvt::do_max_length() const throw() { - // Any valid UTF-8 sequence of 3 bytes fits in a single 16-bit code unit, - // whereas 4 byte sequences require two 16-bit code units. - return 3; + // A single character (one or two UTF-16 code units) requires + // up to four UTF-8 code units. + return 4; } // Define members of codecvt specialization. @@ -732,7 +853,7 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end, int codecvt::do_encoding() const throw() -{ return 0; } +{ return 0; } // UTF-8 is not a fixed-width encoding bool codecvt::do_always_noconv() const throw() @@ -749,7 +870,11 @@ do_length(state_type&, const extern_type* __from, int codecvt::do_max_length() const throw() -{ return 4; } +{ + // A single character (one UTF-32 code unit) requires + // up to 4 UTF-8 code units. + return 4; +} // Define members of codecvt_utf8 base class implementation. // Converts from UTF-8 to UCS-2. @@ -801,7 +926,7 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end, int __codecvt_utf8_base::do_encoding() const throw() -{ return 0; } +{ return 0; } // UTF-8 is not a fixed-width encoding bool __codecvt_utf8_base::do_always_noconv() const throw() @@ -818,7 +943,14 @@ do_length(state_type&, const extern_type* __from, int __codecvt_utf8_base::do_max_length() const throw() -{ return 3; } +{ + // A single UCS-2 character requires up to three UTF-8 code units. + // (UCS-2 cannot represent characters that use four UTF-8 code units). + int max = 3; + if (_M_mode & consume_header) + max += sizeof(utf8_bom); + return max; +} // Define members of codecvt_utf8 base class implementation. // Converts from UTF-8 to UTF-32 (aka UCS-4). 
@@ -866,7 +998,7 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end, int __codecvt_utf8_base::do_encoding() const throw() -{ return 0; } +{ return 0; } // UTF-8 is not a fixed-width encoding bool __codecvt_utf8_base::do_always_noconv() const throw() @@ -883,9 +1015,22 @@ do_length(state_type&, const extern_type* __from, int __codecvt_utf8_base::do_max_length() const throw() -{ return 4; } +{ + // A single UCS-4 character requires up to four UTF-8 code units. + int max = 4; + if (_M_mode & consume_header) + max += sizeof(utf8_bom); + return max; +} #ifdef _GLIBCXX_USE_WCHAR_T + +#if __SIZEOF_WCHAR_T__ == 2 +static_assert(sizeof(wchar_t) == sizeof(char16_t), ""); +#elif __SIZEOF_WCHAR_T__ == 4 +static_assert(sizeof(wchar_t) == sizeof(char32_t), ""); +#endif + // Define members of codecvt_utf8 base class implementation. // Converts from UTF-8 to UCS-2 or UCS-4 depending on sizeof(wchar_t). @@ -958,7 +1103,7 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end, int __codecvt_utf8_base::do_encoding() const throw() -{ return 0; } +{ return 0; } // UTF-8 is not a fixed-width encoding bool __codecvt_utf8_base::do_always_noconv() const throw() @@ -981,7 +1126,16 @@ do_length(state_type&, const extern_type* __from, int __codecvt_utf8_base::do_max_length() const throw() -{ return 4; } +{ +#if __SIZEOF_WCHAR_T__ == 2 + int max = 3; // See __codecvt_utf8_base::do_max_length() +#else + int max = 4; // See __codecvt_utf8_base::do_max_length() +#endif + if (_M_mode & consume_header) + max += sizeof(utf8_bom); + return max; +} #endif // Define members of codecvt_utf16 base class implementation. 
@@ -997,10 +1151,7 @@ do_out(state_type&, const intern_type* __from, const intern_type* __from_end, extern_type*& __to_next) const { range from{ __from, __from_end }; - range to{ - reinterpret_cast(__to), - reinterpret_cast(__to_end) - }; + range to{ __to, __to_end }; auto res = ucs2_out(from, to, _M_maxcode, _M_mode); __from_next = from.next; __to_next = reinterpret_cast(to.next); @@ -1023,20 +1174,19 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end, intern_type* __to, intern_type* __to_end, intern_type*& __to_next) const { - range from{ - reinterpret_cast(__from), - reinterpret_cast(__from_end) - }; + range from{ __from, __from_end }; range to{ __to, __to_end }; auto res = ucs2_in(from, to, _M_maxcode, _M_mode); __from_next = reinterpret_cast(from.next); __to_next = to.next; + if (res == codecvt_base::ok && __from_next != __from_end) + res = codecvt_base::error; return res; } int __codecvt_utf16_base::do_encoding() const throw() -{ return 1; } +{ return 0; } // UTF-16 is not a fixed-width encoding bool __codecvt_utf16_base::do_always_noconv() const throw() @@ -1047,15 +1197,21 @@ __codecvt_utf16_base:: do_length(state_type&, const extern_type* __from, const extern_type* __end, size_t __max) const { - auto next = reinterpret_cast(__from); - next = ucs2_span(next, reinterpret_cast(__end), __max, - _M_maxcode, _M_mode); + range from{ __from, __end }; + const char16_t* next = ucs2_span(from, __max, _M_maxcode, _M_mode); return reinterpret_cast(next) - __from; } int __codecvt_utf16_base::do_max_length() const throw() -{ return 3; } +{ + // A single UCS-2 character requires one UTF-16 code unit (so two chars). + // (UCS-2 cannot represent characters that use multiple UTF-16 code units). + int max = 2; + if (_M_mode & consume_header) + max += sizeof(utf16_bom); + return max; +} // Define members of codecvt_utf16 base class implementation. // Converts from UTF-16 to UTF-32 (aka UCS-4). 
@@ -1070,10 +1226,7 @@ do_out(state_type&, const intern_type* __from, const intern_type* __from_end, extern_type*& __to_next) const { range from{ __from, __from_end }; - range to{ - reinterpret_cast(__to), - reinterpret_cast(__to_end) - }; + range to{ __to, __to_end }; auto res = ucs4_out(from, to, _M_maxcode, _M_mode); __from_next = from.next; __to_next = reinterpret_cast(to.next); @@ -1096,20 +1249,19 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end, intern_type* __to, intern_type* __to_end, intern_type*& __to_next) const { - range from{ - reinterpret_cast(__from), - reinterpret_cast(__from_end) - }; + range from{ __from, __from_end }; range to{ __to, __to_end }; auto res = ucs4_in(from, to, _M_maxcode, _M_mode); __from_next = reinterpret_cast(from.next); __to_next = to.next; + if (res == codecvt_base::ok && __from_next != __from_end) + res = codecvt_base::error; return res; } int __codecvt_utf16_base::do_encoding() const throw() -{ return 0; } +{ return 0; } // UTF-16 is not a fixed-width encoding bool __codecvt_utf16_base::do_always_noconv() const throw() @@ -1120,15 +1272,21 @@ __codecvt_utf16_base:: do_length(state_type&, const extern_type* __from, const extern_type* __end, size_t __max) const { - auto next = reinterpret_cast(__from); - next = ucs4_span(next, reinterpret_cast(__end), __max, - _M_maxcode, _M_mode); + range from{ __from, __end }; + const char16_t* next = ucs4_span(from, __max, _M_maxcode, _M_mode); return reinterpret_cast(next) - __from; } int __codecvt_utf16_base::do_max_length() const throw() -{ return 4; } +{ + // A single UCS-4 character requires one or two UTF-16 code units + // (so up to four chars). + int max = 4; + if (_M_mode & consume_header) + max += sizeof(utf16_bom); + return max; +} #ifdef _GLIBCXX_USE_WCHAR_T // Define members of codecvt_utf16 base class implementation. 
@@ -1143,24 +1301,24 @@ do_out(state_type&, const intern_type* __from, const intern_type* __from_end, extern_type* __to, extern_type* __to_end, extern_type*& __to_next) const { - range to{ __to, __to_end }; + range to{ __to, __to_end }; #if __SIZEOF_WCHAR_T__ == 2 range from{ reinterpret_cast(__from), - reinterpret_cast(__from_end) + reinterpret_cast(__from_end), }; auto res = ucs2_out(from, to, _M_maxcode, _M_mode); #elif __SIZEOF_WCHAR_T__ == 4 range from{ reinterpret_cast(__from), - reinterpret_cast(__from_end) + reinterpret_cast(__from_end), }; auto res = ucs4_out(from, to, _M_maxcode, _M_mode); #else return codecvt_base::error; #endif __from_next = reinterpret_cast(from.next); - __to_next = to.next; + __to_next = reinterpret_cast(to.next); return res; } @@ -1180,30 +1338,32 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end, intern_type* __to, intern_type* __to_end, intern_type*& __to_next) const { - range from{ __from, __from_end }; + range from{ __from, __from_end }; #if __SIZEOF_WCHAR_T__ == 2 range to{ reinterpret_cast(__to), - reinterpret_cast(__to_end) + reinterpret_cast(__to_end), }; auto res = ucs2_in(from, to, _M_maxcode, _M_mode); #elif __SIZEOF_WCHAR_T__ == 4 range to{ reinterpret_cast(__to), - reinterpret_cast(__to_end) + reinterpret_cast(__to_end), }; auto res = ucs4_in(from, to, _M_maxcode, _M_mode); #else return codecvt_base::error; #endif - __from_next = from.next; + __from_next = reinterpret_cast(from.next); __to_next = reinterpret_cast(to.next); + if (res == codecvt_base::ok && __from_next != __from_end) + res = codecvt_base::error; return res; } int __codecvt_utf16_base::do_encoding() const throw() -{ return 0; } +{ return 0; } // UTF-16 is not a fixed-width encoding bool __codecvt_utf16_base::do_always_noconv() const throw() @@ -1214,20 +1374,27 @@ __codecvt_utf16_base:: do_length(state_type&, const extern_type* __from, const extern_type* __end, size_t __max) const { - auto next = reinterpret_cast(__from); + 
range from{ __from, __end }; #if __SIZEOF_WCHAR_T__ == 2 - next = ucs2_span(next, reinterpret_cast(__end), __max, - _M_maxcode, _M_mode); + const char16_t* next = ucs2_span(from, __max, _M_maxcode, _M_mode); #elif __SIZEOF_WCHAR_T__ == 4 - next = ucs4_span(next, reinterpret_cast(__end), __max, - _M_maxcode, _M_mode); + const char16_t* next = ucs4_span(from, __max, _M_maxcode, _M_mode); #endif return reinterpret_cast(next) - __from; } int __codecvt_utf16_base::do_max_length() const throw() -{ return 4; } +{ +#if __SIZEOF_WCHAR_T__ == 2 + int max = 2; // See __codecvt_utf16_base::do_max_length() +#else + int max = 4; // See __codecvt_utf16_base::do_max_length() +#endif + if (_M_mode & consume_header) + max += sizeof(utf16_bom); + return max; +} #endif // Define members of codecvt_utf8_utf16 base class implementation. @@ -1280,7 +1447,7 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end, int __codecvt_utf8_utf16_base::do_encoding() const throw() -{ return 0; } +{ return 0; } // UTF-8 is not a fixed-width encoding bool __codecvt_utf8_utf16_base::do_always_noconv() const throw() @@ -1298,9 +1465,12 @@ do_length(state_type&, const extern_type* __from, int __codecvt_utf8_utf16_base::do_max_length() const throw() { - // Any valid UTF-8 sequence of 3 bytes fits in a single 16-bit code unit, - // whereas 4 byte sequences require two 16-bit code units. - return 3; + // A single character can be 1 or 2 UTF-16 code units, + // requiring up to 4 UTF-8 code units. + int max = 4; + if (_M_mode & consume_header) + max += sizeof(utf8_bom); + return max; } // Define members of codecvt_utf8_utf16 base class implementation. 
@@ -1341,7 +1511,11 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end, { range from{ __from, __from_end }; range to{ __to, __to_end }; - auto res = utf16_in(from, to, _M_maxcode, _M_mode); + codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header)); +#if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__ + mode = codecvt_mode(mode | little_endian); +#endif + auto res = utf16_in(from, to, _M_maxcode, mode); __from_next = from.next; __to_next = to.next; return res; @@ -1349,7 +1523,7 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end, int __codecvt_utf8_utf16_base::do_encoding() const throw() -{ return 0; } +{ return 0; } // UTF-8 is not a fixed-width encoding bool __codecvt_utf8_utf16_base::do_always_noconv() const throw() @@ -1367,9 +1541,12 @@ do_length(state_type&, const extern_type* __from, int __codecvt_utf8_utf16_base::do_max_length() const throw() { - // Any valid UTF-8 sequence of 3 bytes fits in a single 16-bit code unit, - // whereas 4 byte sequences require two 16-bit code units. - return 3; + // A single character can be 1 or 2 UTF-16 code units, + // requiring up to 4 UTF-8 code units. 
+ int max = 4; + if (_M_mode & consume_header) + max += sizeof(utf8_bom); + return max; } #ifdef _GLIBCXX_USE_WCHAR_T @@ -1411,7 +1588,11 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end, { range from{ __from, __from_end }; range to{ __to, __to_end }; - auto res = utf16_in(from, to, _M_maxcode, _M_mode); + codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header)); +#if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__ + mode = codecvt_mode(mode | little_endian); +#endif + auto res = utf16_in(from, to, _M_maxcode, mode); __from_next = from.next; __to_next = to.next; return res; @@ -1419,7 +1600,7 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end, int __codecvt_utf8_utf16_base::do_encoding() const throw() -{ return 0; } +{ return 0; } // UTF-8 is not a fixed-width encoding bool __codecvt_utf8_utf16_base::do_always_noconv() const throw() @@ -1437,9 +1618,12 @@ do_length(state_type&, const extern_type* __from, int __codecvt_utf8_utf16_base::do_max_length() const throw() { - // Any valid UTF-8 sequence of 3 bytes fits in a single 16-bit code unit, - // whereas 4 byte sequences require two 16-bit code units. - return 3; + // A single character can be 1 or 2 UTF-16 code units, + // requiring up to 4 UTF-8 code units. 
+ int max = 4; + if (_M_mode & consume_header) + max += sizeof(utf8_bom); + return max; } #endif diff --git a/libstdc++-v3/testsuite/22_locale/codecvt/char16_t.cc b/libstdc++-v3/testsuite/22_locale/codecvt/char16_t.cc index a21a8385629f..a66a35a5964b 100644 --- a/libstdc++-v3/testsuite/22_locale/codecvt/char16_t.cc +++ b/libstdc++-v3/testsuite/22_locale/codecvt/char16_t.cc @@ -34,7 +34,7 @@ test01() const codecvt_c16* const cvt = &use_facet(loc_c); VERIFY(!cvt->always_noconv()); - VERIFY(cvt->max_length() == 3); + VERIFY(cvt->max_length() == 4); VERIFY(cvt->encoding() == 0); const char u8dat[] = u8"H\U000000E4ll\U000000F6 \U0001F63F \U000056FD " diff --git a/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/79980.cc b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/79980.cc new file mode 100644 index 000000000000..d8b9729ed5b2 --- /dev/null +++ b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/79980.cc @@ -0,0 +1,142 @@ +// Copyright (C) 2017 Free Software Foundation, Inc. +// +// This file is part of the GNU ISO C++ Library. This library is free +// software; you can redistribute it and/or modify it under the +// terms of the GNU General Public License as published by the +// Free Software Foundation; either version 3, or (at your option) +// any later version. + +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License along +// with this library; see the file COPYING3. If not see +// . 
+ +// { dg-do run { target c++11 } } + +#include +#include +#include + +// PR libstdc++/79980 + +constexpr std::codecvt_mode mode(std::codecvt_mode m) +{ return static_cast(m | std::consume_header); } + +template + using Conv + = std::wstring_convert, WCh>; + +void +test01() +{ + const char src[] = "\xFE\xFF\xAB\xCD"; + Conv conv; + auto dst = conv.from_bytes(src, src+4); + VERIFY( dst[0] == 0xabcd ); +} + +void +test02() +{ + const char src[] = "\xFF\xFE\xAB\xCD"; + Conv conv; + auto dst = conv.from_bytes(src, src+4); + VERIFY( dst[0] == 0xcdab ); +} + +void +test03() +{ + const char src[] = "\xFE\xFF\xAB\xCD"; + Conv conv; + auto dst = conv.from_bytes(src, src+4); + VERIFY( dst[0] == 0xabcd ); +} + +void +test04() +{ + const char src[] = "\xFF\xFE\xAB\xCD"; + Conv conv; + auto dst = conv.from_bytes(src, src+4); + VERIFY( dst[0] == 0xcdab ); +} + +void +test05() +{ + const char src[] = "\0\x61\xAB\xCD"; // character greater than 0x00FF + Conv conv("to_bytes failed", u"from_bytes failed"); + std::u16string result = conv.from_bytes(src, src+4); + VERIFY( result == u"from_bytes failed" ); + VERIFY( conv.converted() == 2 ); +} + +void +test06() +{ + const char src[] = "\0\x61\xAB\xCD"; + Conv conv("to_bytes failed", u"from_bytes failed"); + std::u16string result = conv.from_bytes(src, src+3); // incomplete character + VERIFY( result == u"from_bytes failed" ); + VERIFY( conv.converted() == 2 ); +} + +void +test07() +{ + Conv conv("to_bytes failed", u"from_bytes failed"); + // ucs2 to utf-16 conversion should fail on invalid ucs2 input: + std::u16string utf16 = u"1234\U00001111\U0001ffff"; + auto out = conv.to_bytes(utf16); + VERIFY( out == "to_bytes failed" ); + VERIFY( conv.converted() == 5 ); + + // And should also fail on incomplete surrogate pair (not return partial): + out = conv.to_bytes(utf16.substr(0, utf16.size()-1)); + VERIFY( out == "to_bytes failed" ); + VERIFY( conv.converted() == 5 ); +} + +void +test08() +{ + // Read/write UTF-16 code units from data not 
correctly aligned for char16_t + Conv conv; + const char src[] = "-\xFE\xFF\0\x61\xAB\xCD"; + auto out = conv.from_bytes(src + 1, src + 7); + VERIFY( out[0] == 0x0061 ); + VERIFY( out[1] == 0xabcd ); + auto bytes = conv.to_bytes(out); + VERIFY( bytes == std::string(src + 1, 6) ); +} + +void +test09() +{ + // Read/write UTF-16 code units from data not correctly aligned for char16_t + Conv conv; + const char src[] = "-\xFE\xFF\xD8\x08\xDF\x45"; + auto out = conv.from_bytes(src + 1, src + 7); + VERIFY( out == U"\U00012345" ); + auto bytes = conv.to_bytes(out); + VERIFY( bytes == std::string(src + 1, 6) ); +} + +int main() +{ + test01(); + test02(); + test03(); + test04(); + test05(); + test06(); + test07(); + test08(); + test09(); +} diff --git a/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/80041.cc b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/80041.cc new file mode 100644 index 000000000000..a78b194212dc --- /dev/null +++ b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/80041.cc @@ -0,0 +1,87 @@ +// Copyright (C) 2017 Free Software Foundation, Inc. +// +// This file is part of the GNU ISO C++ Library. This library is free +// software; you can redistribute it and/or modify it under the +// terms of the GNU General Public License as published by the +// Free Software Foundation; either version 3, or (at your option) +// any later version. + +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License along +// with this library; see the file COPYING3. If not see +// . 
+ +// { dg-do run { target c++11 } } + +#include +#include + +void +test01() +{ +#ifdef _GLIBCXX_USE_WCHAR_T + std::codecvt_utf16 conv; + const wchar_t wc = 0x6557; + char bytes[2] = {0}; + const wchar_t* wcnext; + std::mbstate_t st{}; + char* next = nullptr; + auto res = conv.out(st, &wc, &wc+ 1, wcnext, bytes, std::end(bytes), next); + VERIFY( res == std::codecvt_base::ok ); + VERIFY( wcnext == &wc + 1 ); + VERIFY( next == std::end(bytes) ); + VERIFY( bytes[0] == 0x65 ); + VERIFY( bytes[1] == 0x57 ); + VERIFY( conv.length(st, bytes, next, 1) == (next - bytes) ); + + wchar_t w; + wchar_t* wnext; + const char* cnext; + st = {}; + res = conv.in(st, bytes, next, cnext, &w, &w + 1, wnext); + VERIFY( res == std::codecvt_base::ok ); + VERIFY( wnext == &w + 1 ); + VERIFY( cnext == next ); + VERIFY( w == wc ); +#endif +} + +void +test02() +{ +#ifdef _GLIBCXX_USE_WCHAR_T + std::codecvt_utf16 conv; + wchar_t wc = 0x6557; + char bytes[2] = {0}; + const wchar_t* wcnext; + std::mbstate_t st{}; + char* next = nullptr; + auto res = conv.out(st, &wc, &wc+ 1, wcnext, bytes, std::end(bytes), next); + VERIFY( res == std::codecvt_base::ok ); + VERIFY( wcnext == &wc + 1 ); + VERIFY( next == std::end(bytes) ); + VERIFY( bytes[0] == 0x57 ); + VERIFY( bytes[1] == 0x65 ); + VERIFY( conv.length(st, bytes, next, 1) == (next - bytes) ); + + wchar_t w; + wchar_t* wnext; + const char* cnext; + st = {}; + res = conv.in(st, bytes, next, cnext, &w, &w + 1, wnext); + VERIFY( res == std::codecvt_base::ok ); + VERIFY( wnext == &w + 1 ); + VERIFY( cnext == next ); + VERIFY( w == wc ); +#endif +} + +int main() +{ + test01(); + test02(); +} diff --git a/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/members.cc b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/members.cc new file mode 100644 index 000000000000..993c86082c17 --- /dev/null +++ b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/members.cc @@ -0,0 +1,81 @@ +// Copyright (C) 2017 Free Software Foundation, Inc. 
+// +// This file is part of the GNU ISO C++ Library. This library is free +// software; you can redistribute it and/or modify it under the +// terms of the GNU General Public License as published by the +// Free Software Foundation; either version 3, or (at your option) +// any later version. + +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License along +// with this library; see the file COPYING3. If not see +// . + +// { dg-do run { target c++11 } } + +#include +#include + +const int bomlen = 2; // UTF-16 BOM is 16 bits + +void +test01() +{ + const int maxlen = 2; + + std::codecvt_utf16 c; + VERIFY( c.always_noconv() == false ); + VERIFY( c.encoding() == 0 ); + VERIFY( c.max_length() == maxlen ); + + std::codecvt_utf16 c_bom; + VERIFY( c_bom.always_noconv() == false ); + VERIFY( c_bom.encoding() == 0 ); + VERIFY( c_bom.max_length() == (maxlen + bomlen) ); +} + +void +test02() +{ + const int maxlen = 4; + + std::codecvt_utf16 c; + VERIFY( c.always_noconv() == false ); + VERIFY( c.encoding() == 0 ); + VERIFY( c.max_length() == maxlen ); + + std::codecvt_utf16 c_bom; + VERIFY( c_bom.always_noconv() == false ); + VERIFY( c_bom.encoding() == 0 ); + VERIFY( c_bom.max_length() == (maxlen + bomlen) ); +} + +void +test03() +{ +#ifdef _GLIBCXX_USE_WCHAR_T + const int maxlen = sizeof(wchar_t) == 4 ? 
4 : 2; + + std::codecvt_utf16 c; + VERIFY( c.always_noconv() == false ); + VERIFY( c.encoding() == 0 ); + VERIFY( c.max_length() == maxlen ); + + std::codecvt_utf16 c_bom; + VERIFY( c_bom.always_noconv() == false ); + VERIFY( c_bom.encoding() == 0 ); + VERIFY( c_bom.max_length() == (maxlen + bomlen) ); +#endif +} + +int +main() +{ + test01(); + test02(); + test03(); +} diff --git a/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/misaligned.cc b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/misaligned.cc new file mode 100644 index 000000000000..0179c184c202 --- /dev/null +++ b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/misaligned.cc @@ -0,0 +1,289 @@ +// Copyright (C) 2017 Free Software Foundation, Inc. +// +// This file is part of the GNU ISO C++ Library. This library is free +// software; you can redistribute it and/or modify it under the +// terms of the GNU General Public License as published by the +// Free Software Foundation; either version 3, or (at your option) +// any later version. + +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License along +// with this library; see the file COPYING3. If not see +// . 
+ +// { dg-do run { target c++11 } } + +#include +#include +#include + +using std::codecvt_base; +using std::codecvt_mode; +using std::codecvt_utf16; +using std::wstring_convert; +using std::mbstate_t; + +constexpr codecvt_mode +operator|(codecvt_mode m1, codecvt_mode m2) +{ + using underlying = std::underlying_type::type; + return static_cast(static_cast(m1) | m2); +} + +// Read/write UTF-16 code units from data not correctly aligned for char16_t + +void +test01() +{ + mbstate_t st; + constexpr codecvt_mode m = std::consume_header|std::generate_header; + codecvt_utf16 conv; + const char src[] = "-\xFE\xFF\0\x61\xAB\xCD"; + const char* const src_end = src + 7; + + int len = conv.length(st, src + 1, src_end, 1); + VERIFY( len == 4 ); + len = conv.length(st, src + 1, src_end, 2); + VERIFY( len == 6 ); + + char16_t dst[2]; + char16_t* const dst_end = dst + 2; + char16_t* dst_next; + const char* src_cnext; + auto res = conv.in(st, src + 1, src_end, src_cnext, dst, dst_end, dst_next); + VERIFY( res == codecvt_base::ok ); + VERIFY( dst[0] == 0x0061 ); + VERIFY( dst[1] == 0xabcd ); + VERIFY( src_cnext == src_end ); + VERIFY( dst_next == dst_end ); + + char out[sizeof(src)] = { src[0] }; + char* const out_end = out + 7; + char* out_next; + const char16_t* dst_cnext; + res = conv.out(st, dst, dst_end, dst_cnext, out + 1, out_end, out_next); + VERIFY( res == codecvt_base::ok ); + VERIFY( out_next == out_end ); + VERIFY( dst_cnext == dst_end ); + VERIFY( out[1] == src[1] ); + VERIFY( out[2] == src[2] ); + VERIFY( out[3] == src[3] ); + VERIFY( out[4] == src[4] ); + VERIFY( out[5] == src[5] ); + VERIFY( out[6] == src[6] ); + + codecvt_utf16 conv_le; + + len = conv_le.length(st, src + 1, src_end, 1); + VERIFY( len == 4 ); + len = conv_le.length(st, src + 1, src_end, 2); + VERIFY( len == 6 ); + + res = conv_le.in(st, src + 1, src_end, src_cnext, dst, dst_end, dst_next); + VERIFY( res == codecvt_base::ok ); + VERIFY( dst[0] == 0x0061 ); + VERIFY( dst[1] == 0xabcd ); + VERIFY( 
src_cnext == src_end ); + VERIFY( dst_next == dst_end ); + + res = conv_le.out(st, dst, dst_end, dst_cnext, out + 1, out_end, out_next); + VERIFY( res == codecvt_base::ok ); + VERIFY( out_next == out_end ); + VERIFY( dst_cnext == dst_end ); + VERIFY( out[1] == src[2] ); + VERIFY( out[2] == src[1] ); + VERIFY( out[3] == src[4] ); + VERIFY( out[4] == src[3] ); + VERIFY( out[5] == src[6] ); + VERIFY( out[6] == src[5] ); +} + +void +test02() +{ + mbstate_t st; + constexpr codecvt_mode m = std::consume_header|std::generate_header; + codecvt_utf16 conv; + const char src[] = "-\xFE\xFF\0\x61\xAB\xCD\xD8\x08\xDF\x45"; + const char* const src_end = src + 11; + + int len = conv.length(st, src + 1, src_end, 1); + VERIFY( len == 4 ); + len = conv.length(st, src + 1, src_end, 2); + VERIFY( len == 6 ); + len = conv.length(st, src + 1, src_end, -1ul); + VERIFY( len == 10 ); + + char32_t dst[3]; + char32_t* const dst_end = dst + 3; + char32_t* dst_next; + const char* src_cnext; + auto res = conv.in(st, src + 1, src_end, src_cnext, dst, dst_end, dst_next); + VERIFY( res == codecvt_base::ok ); + VERIFY( dst[0] == 0x0061 ); + VERIFY( dst[1] == 0xabcd ); + VERIFY( dst[2] == 0x012345 ); + VERIFY( src_cnext == src_end ); + VERIFY( dst_next == dst_end ); + + char out[sizeof(src)] = { src[0] }; + char* const out_end = out + 11; + char* out_next; + const char32_t* dst_cnext; + res = conv.out(st, dst, dst_end, dst_cnext, out + 1, out_end, out_next); + VERIFY( res == codecvt_base::ok ); + VERIFY( out_next == out_end ); + VERIFY( dst_cnext == dst_end ); + VERIFY( out[1] == src[1] ); + VERIFY( out[2] == src[2] ); + VERIFY( out[3] == src[3] ); + VERIFY( out[4] == src[4] ); + VERIFY( out[5] == src[5] ); + VERIFY( out[6] == src[6] ); + VERIFY( out[7] == src[7] ); + VERIFY( out[8] == src[8] ); + VERIFY( out[9] == src[9] ); + VERIFY( out[10] == src[10] ); + + codecvt_utf16 conv_le; + + len = conv_le.length(st, src + 1, src_end, 1); + VERIFY( len == 4 ); + len = conv_le.length(st, src + 1, src_end, 
2); + VERIFY( len == 6 ); + len = conv.length(st, src + 1, src_end, -1ul); + VERIFY( len == 10 ); + + res = conv_le.in(st, src + 1, src_end, src_cnext, dst, dst_end, dst_next); + VERIFY( res == codecvt_base::ok ); + VERIFY( dst[0] == 0x0061 ); + VERIFY( dst[1] == 0xabcd ); + VERIFY( dst[2] == 0x012345 ); + VERIFY( src_cnext == src_end ); + VERIFY( dst_next == dst_end ); + + res = conv_le.out(st, dst, dst_end, dst_cnext, out + 1, out_end, out_next); + VERIFY( res == codecvt_base::ok ); + VERIFY( out_next == out_end ); + VERIFY( dst_cnext == dst_end ); + VERIFY( out[1] == src[2] ); + VERIFY( out[2] == src[1] ); + VERIFY( out[3] == src[4] ); + VERIFY( out[4] == src[3] ); + VERIFY( out[5] == src[6] ); + VERIFY( out[6] == src[5] ); + VERIFY( out[7] == src[8] ); + VERIFY( out[8] == src[7] ); + VERIFY( out[9] == src[10] ); + VERIFY( out[10] == src[9] ); +} + +void +test03() +{ +#ifdef _GLIBCXX_USE_WCHAR_T + mbstate_t st; + constexpr codecvt_mode m = std::consume_header|std::generate_header; + codecvt_utf16 conv; + const char src[] = "-\xFE\xFF\0\x61\xAB\xCD\xD8\x08\xDF\x45"; + const size_t in_len = sizeof(wchar_t) == 4 ? 11 : 7; + const size_t out_len = sizeof(wchar_t) == 4 ? 
3 : 2; + const char* const src_end = src + in_len; + + int len = conv.length(st, src + 1, src_end, 1); + VERIFY( len == 4 ); + len = conv.length(st, src + 1, src_end, 2); + VERIFY( len == 6 ); + if (sizeof(wchar_t) == 4) + { + len = conv.length(st, src + 1, src_end, -1ul); + VERIFY( len == 10 ); + } + + wchar_t dst[out_len]; + wchar_t* const dst_end = dst + out_len; + wchar_t* dst_next; + const char* src_cnext; + auto res = conv.in(st, src + 1, src_end, src_cnext, dst, dst_end, dst_next); + VERIFY( res == codecvt_base::ok ); + VERIFY( dst[0] == 0x0061 ); + VERIFY( dst[1] == 0xabcd ); + if (sizeof(wchar_t) == 4) + VERIFY( dst[2] == 0x012345 ); + VERIFY( src_cnext == src_end ); + VERIFY( dst_next == dst_end ); + + char out[sizeof(src)] = { src[0] }; + char* const out_end = out + in_len; + char* out_next; + const wchar_t* dst_cnext; + res = conv.out(st, dst, dst_end, dst_cnext, out + 1, out_end, out_next); + VERIFY( res == codecvt_base::ok ); + VERIFY( out_next == out_end ); + VERIFY( dst_cnext == dst_end ); + VERIFY( out[1] == src[1] ); + VERIFY( out[2] == src[2] ); + VERIFY( out[3] == src[3] ); + VERIFY( out[4] == src[4] ); + VERIFY( out[5] == src[5] ); + VERIFY( out[6] == src[6] ); + if (sizeof(wchar_t) == 4) + { + VERIFY( out[7] == src[7] ); + VERIFY( out[8] == src[8] ); + VERIFY( out[9] == src[9] ); + VERIFY( out[10] == src[10] ); + } + + codecvt_utf16 conv_le; + + len = conv_le.length(st, src + 1, src_end, 1); + VERIFY( len == 4 ); + len = conv_le.length(st, src + 1, src_end, 2); + VERIFY( len == 6 ); + if (sizeof(wchar_t) == 4) + { + len = conv.length(st, src + 1, src_end, -1ul); + VERIFY( len == 10 ); + } + + res = conv_le.in(st, src + 1, src_end, src_cnext, dst, dst_end, dst_next); + VERIFY( res == codecvt_base::ok ); + VERIFY( dst[0] == 0x0061 ); + VERIFY( dst[1] == 0xabcd ); + if (sizeof(wchar_t) == 4) + VERIFY( dst[2] == 0x012345 ); + VERIFY( src_cnext == src_end ); + VERIFY( dst_next == dst_end ); + + res = conv_le.out(st, dst, dst_end, dst_cnext, out + 
1, out_end, out_next); + VERIFY( res == codecvt_base::ok ); + VERIFY( out_next == out_end ); + VERIFY( dst_cnext == dst_end ); + VERIFY( out[1] == src[2] ); + VERIFY( out[2] == src[1] ); + VERIFY( out[3] == src[4] ); + VERIFY( out[4] == src[3] ); + VERIFY( out[5] == src[6] ); + VERIFY( out[6] == src[5] ); + if (sizeof(wchar_t) == 4) + { + VERIFY( out[7] == src[8] ); + VERIFY( out[8] == src[7] ); + VERIFY( out[9] == src[10] ); + VERIFY( out[10] == src[9] ); + } +#endif +} + +int +main() +{ + test01(); + test02(); + test03(); +} diff --git a/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8/79980.cc b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8/79980.cc new file mode 100644 index 000000000000..1251acb85bee --- /dev/null +++ b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8/79980.cc @@ -0,0 +1,94 @@ +// Copyright (C) 2017 Free Software Foundation, Inc. +// +// This file is part of the GNU ISO C++ Library. This library is free +// software; you can redistribute it and/or modify it under the +// terms of the GNU General Public License as published by the +// Free Software Foundation; either version 3, or (at your option) +// any later version. + +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License along +// with this library; see the file COPYING3. If not see +// . 
+ +// { dg-do run { target c++11 } } + +#include +#include +#include +#include + +using std::wstring_convert; +using std::codecvt_utf8; + +void +test01() +{ + std::string src = u8"1234\U00001111\U0001ffff"; + wstring_convert, char16_t> c("bad", u"BAD"); + + // utf-8 to ucs2 conversion should fail on character outside BMP + auto ucs2 = c.from_bytes(src); + VERIFY( ucs2 == u"BAD" ); + VERIFY( c.converted() == 7 ); + + // ucs2 to utf-8 conversion should fail on invalid ucs2 input: + std::u16string utf16 = u"1234\U00001111\U0001ffff"; + auto out = c.to_bytes(utf16); + VERIFY( out == "bad" ); + VERIFY( c.converted() == 5 ); + + // And should also fail on incomplete surrogate pair (not return partial): + out = c.to_bytes(utf16.substr(0, utf16.size()-1)); + VERIFY( out == "bad" ); + VERIFY( c.converted() == 5 ); +} + +void +test02() +{ + std::string src = u8"1234\U00001111\U0001ffff"; + wstring_convert, char16_t> c("bad", u"BAD"); + + // utf-8 to ucs2 conversion should fail on character above Maxcode=0x1000 + auto ucs2 = c.from_bytes(src); + VERIFY( ucs2 == u"BAD" ); + VERIFY( c.converted() == 4 ); +} + +void +test03() +{ + std::string src = u8"1234\U00001111\U0001ffff"; + wstring_convert, char32_t> c("bad", U"BAD"); + + // utf-8 to ucs4 conversion should fail on character above Maxcode=0x10000 + auto ucs4 = c.from_bytes(src); + VERIFY( ucs4 == U"BAD" ); + VERIFY( c.converted() == 7 ); +} + +void +test04() +{ + std::string src = u8"1234\U00001111\U0001ffff"; + wstring_convert, char32_t> c("bad", U"BAD"); + + // utf-8 to ucs4 conversion should fail on character above Maxcode=0x1000 + auto ucs4 = c.from_bytes(src); + VERIFY( ucs4 == U"BAD" ); + VERIFY( c.converted() == 4 ); +} + +int +main() +{ + test01(); + test02(); + test03(); + test04(); +} diff --git a/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8/members.cc b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8/members.cc new file mode 100644 index 000000000000..baeb049861a1 --- /dev/null +++ 
b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8/members.cc @@ -0,0 +1,81 @@ +// Copyright (C) 2017 Free Software Foundation, Inc. +// +// This file is part of the GNU ISO C++ Library. This library is free +// software; you can redistribute it and/or modify it under the +// terms of the GNU General Public License as published by the +// Free Software Foundation; either version 3, or (at your option) +// any later version. + +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License along +// with this library; see the file COPYING3. If not see +// . + +// { dg-do run { target c++11 } } + +#include +#include + +const int bomlen = 3; // UTF-8 BOM is 24 bits + +void +test01() +{ + const int maxlen = 3; + + std::codecvt_utf8 c; + VERIFY( c.always_noconv() == false ); + VERIFY( c.encoding() == 0 ); + VERIFY( c.max_length() == maxlen ); + + std::codecvt_utf8 c_bom; + VERIFY( c_bom.always_noconv() == false ); + VERIFY( c_bom.encoding() == 0 ); + VERIFY( c_bom.max_length() == (maxlen + bomlen) ); +} + +void +test02() +{ + const int maxlen = 4; + + std::codecvt_utf8 c; + VERIFY( c.always_noconv() == false ); + VERIFY( c.encoding() == 0 ); + VERIFY( c.max_length() == maxlen ); + + std::codecvt_utf8 c_bom; + VERIFY( c_bom.always_noconv() == false ); + VERIFY( c_bom.encoding() == 0 ); + VERIFY( c_bom.max_length() == (maxlen + bomlen) ); +} + +void +test03() +{ +#ifdef _GLIBCXX_USE_WCHAR_T + const int maxlen = sizeof(wchar_t) == 4 ? 
4 : 3; + + std::codecvt_utf8 c; + VERIFY( c.always_noconv() == false ); + VERIFY( c.encoding() == 0 ); + VERIFY( c.max_length() == maxlen ); + + std::codecvt_utf8 c_bom; + VERIFY( c_bom.always_noconv() == false ); + VERIFY( c_bom.encoding() == 0 ); + VERIFY( c_bom.max_length() == (maxlen + bomlen) ); +#endif +} + +int +main() +{ + test01(); + test02(); + test03(); +} diff --git a/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8_utf16/79511.cc b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8_utf16/79511.cc new file mode 100644 index 000000000000..5555bcba6fe5 --- /dev/null +++ b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8_utf16/79511.cc @@ -0,0 +1,60 @@ +// Copyright (C) 2017 Free Software Foundation, Inc. +// +// This file is part of the GNU ISO C++ Library. This library is free +// software; you can redistribute it and/or modify it under the +// terms of the GNU General Public License as published by the +// Free Software Foundation; either version 3, or (at your option) +// any later version. + +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License along +// with this library; see the file COPYING3. If not see +// . 
+ +// { dg-do run { target c++11 } } + +#include +#include +#include + +// PR libstdc++/79511 + +template + std::basic_string conv(const char* src) + { + std::wstring_convert, ElemT> conv; + return conv.from_bytes(src); + } + +void +test01() +{ + static char const src[] = "\xEF\xBF\xBF"; + VERIFY( conv(src) == u"\xffff" ); + VERIFY( conv(src) == U"\xffff" ); +#ifdef _GLIBCXX_USE_WCHAR_T + VERIFY( conv(src) == L"\xffff" ); +#endif +} + +void +test02() +{ + static char const src[] = "\xE2\x82\xAC"; + VERIFY( conv(src) == u"\x20ac" ); + VERIFY( conv(src) == U"\x20ac" ); +#ifdef _GLIBCXX_USE_WCHAR_T + VERIFY( conv(src) == L"\x20ac" ); +#endif +} + +int +main() +{ + test01(); + test02(); +} diff --git a/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8_utf16/members.cc b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8_utf16/members.cc new file mode 100644 index 000000000000..8fcdfff2cc1a --- /dev/null +++ b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8_utf16/members.cc @@ -0,0 +1,76 @@ +// Copyright (C) 2017 Free Software Foundation, Inc. +// +// This file is part of the GNU ISO C++ Library. This library is free +// software; you can redistribute it and/or modify it under the +// terms of the GNU General Public License as published by the +// Free Software Foundation; either version 3, or (at your option) +// any later version. + +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License along +// with this library; see the file COPYING3. If not see +// . 
+ +// { dg-do run { target c++11 } } + +#include +#include + +const int bomlen = 3; // UTF-8 BOM is 24 bits +const int maxlen = 4; + +void +test01() +{ + std::codecvt_utf8_utf16 c; + VERIFY( c.always_noconv() == false ); + VERIFY( c.encoding() == 0 ); + VERIFY( c.max_length() == maxlen ); + + std::codecvt_utf8_utf16 c_bom; + VERIFY( c_bom.always_noconv() == false ); + VERIFY( c_bom.encoding() == 0 ); + VERIFY( c_bom.max_length() == (maxlen + bomlen) ); +} + +void +test02() +{ + std::codecvt_utf8_utf16 c; + VERIFY( c.always_noconv() == false ); + VERIFY( c.encoding() == 0 ); + VERIFY( c.max_length() == maxlen ); + + std::codecvt_utf8_utf16 c_bom; + VERIFY( c_bom.always_noconv() == false ); + VERIFY( c_bom.encoding() == 0 ); + VERIFY( c_bom.max_length() == (maxlen + bomlen) ); +} + +void +test03() +{ +#ifdef _GLIBCXX_USE_WCHAR_T + std::codecvt_utf8_utf16 c; + VERIFY( c.always_noconv() == false ); + VERIFY( c.encoding() == 0 ); + VERIFY( c.max_length() == maxlen ); + + std::codecvt_utf8_utf16 c_bom; + VERIFY( c_bom.always_noconv() == false ); + VERIFY( c_bom.encoding() == 0 ); + VERIFY( c_bom.max_length() == (maxlen + bomlen) ); +#endif +} + +int +main() +{ + test01(); + test02(); + test03(); +} -- 2.47.2