Backport <codecvt> fixes from trunk

author Jonathan Wakely <jwakely@redhat.com>

Fri, 17 Mar 2017 19:28:42 +0000 (19:28 +0000)

committer Jonathan Wakely <redi@gcc.gnu.org>

Fri, 17 Mar 2017 19:28:42 +0000 (19:28 +0000)
author Jonathan Wakely <jwakely@redhat.com>
Fri, 17 Mar 2017 19:28:42 +0000 (19:28 +0000)
committer Jonathan Wakely <redi@gcc.gnu.org>
Fri, 17 Mar 2017 19:28:42 +0000 (19:28 +0000)
diff --git a/libstdc++-v3/ChangeLog b/libstdc++-v3/ChangeLog

index 01446915800a7ec6dd142fa765c84696c4d74c61..537100e59f973641d2b52adb9d2be711f56fc0f4 100644 (file)
--- a/libstdc++-v3/ChangeLog
+++ b/libstdc++-v3/ChangeLog
@@ -1,3 +1,80 @@
+2017-03-17  Jonathan Wakely  <jwakely@redhat.com>
+
+       Backport from mainline
+       2017-03-17  Jonathan Wakely  <jwakely@redhat.com>
+
+       * src/c++11/codecvt.cc (range): Add non-type template parameter and
+       define overloaded operators for reading and writing code units.
+       (range<Elem, false>): Define partial specialization for accessing
+       wide characters in potentially unaligned byte ranges.
+       (ucs2_span(const char16_t*, const char16_t*, ...))
+       (ucs4_span(const char16_t*, const char16_t*, ...)): Change parameters
+       to range<const char16_t, false> in order to avoid unaligned reads.
+       (__codecvt_utf16_base<char16_t>::do_out)
+       (__codecvt_utf16_base<char32_t>::do_out)
+       (__codecvt_utf16_base<wchar_t>::do_out): Use range specialization for
+       unaligned data to avoid unaligned writes.
+       (__codecvt_utf16_base<char16_t>::do_in)
+       (__codecvt_utf16_base<char32_t>::do_in)
+       (__codecvt_utf16_base<wchar_t>::do_in): Likewise for reads. Return
+       error if there are unprocessable trailing bytes.
+       (__codecvt_utf16_base<char16_t>::do_length)
+       (__codecvt_utf16_base<char32_t>::do_length)
+       (__codecvt_utf16_base<wchar_t>::do_length): Pass arguments of type
+       range<const char16_t, false> to span functions.
+       * testsuite/22_locale/codecvt/codecvt_utf16/misaligned.cc: New test.
+
+       Backport from mainline
+       2017-03-16  Jonathan Wakely  <jwakely@redhat.com>
+
+       PR libstdc++/79980
+       * src/c++11/codecvt.cc (to_integer(codecvt_mode)): Fix target type.
+
+       PR libstdc++/80041
+       * src/c++11/codecvt.cc (__codecvt_utf16_base<wchar_t>::do_out)
+       (__codecvt_utf16_base<wchar_t>::do_in): Convert char arguments to
+       char16_t to work with UTF-16 instead of UTF-8.
+       * testsuite/22_locale/codecvt/codecvt_utf16/80041.cc: New test.
+
+       * src/c++11/codecvt.cc (codecvt<char16_t, char, mbstate_t>)
+       (codecvt<char32_t, char, mbstate_t>, __codecvt_utf8_base<char16_t>)
+       (__codecvt_utf8_base<char32_t>, __codecvt_utf8_base<wchar_t>)
+       (__codecvt_utf16_base<char16_t>, __codecvt_utf16_base<char32_t>)
+       (__codecvt_utf16_base<wchar_t>, __codecvt_utf8_utf16_base<char16_t>)
+       (__codecvt_utf8_utf16_base<char32_t>)
+       (__codecvt_utf8_utf16_base<wchar_t>): Fix do_encoding() and
+       do_max_length() return values.
+       * testsuite/22_locale/codecvt/codecvt_utf16/members.cc: New test.
+       * testsuite/22_locale/codecvt/codecvt_utf8/members.cc: New test.
+       * testsuite/22_locale/codecvt/codecvt_utf8_utf16/members.cc: New test.
+
+       PR libstdc++/79980
+       * include/bits/locale_conv.h (__do_str_codecvt): Set __count on
+       error path.
+       * src/c++11/codecvt.cc (operator&=, operator|=, operator~): Overloads
+       for manipulating codecvt_mode values.
+       (read_utf16_bom): Compare input to BOM constants instead of integral
+       constants that depend on endianness.  Take mode parameter by
+       reference and adjust it, to distinguish between no BOM present and
+       UTF-16BE BOM present.
+       (ucs4_in, ucs2_span, ucs4_span): Adjust calls to read_utf16_bom.
+       (surrogates): New enumeration type.
+       (utf16_in, utf16_out): Add surrogates parameter to choose between
+       UTF-16 and UCS2 behaviour.
+       (utf16_span, ucs2_span): Use std::min not std::max.
+       (ucs2_out): Use std::min not std::max.  Disallow surrogate pairs.
+       (ucs2_in): Likewise. Adjust calls to read_utf16_bom.
+       * testsuite/22_locale/codecvt/codecvt_utf16/79980.cc: New test.
+       * testsuite/22_locale/codecvt/codecvt_utf8/79980.cc: New test.
+
+       PR libstdc++/79511
+       * src/c++11/codecvt.cc (write_utf16_code_point): Don't write 0xffff
+       as a surrogate pair.
+       (__codecvt_utf8_utf16_base<char32_t>::do_in): Use native endianness
+       for internal representation.
+       (__codecvt_utf8_utf16_base<wchar_t>::do_in): Likewise.
+       * testsuite/22_locale/codecvt/codecvt_utf8_utf16/79511.cc: New test.
+
  2017-03-14  Jonathan Wakely  <jwakely@redhat.com>
  
         * testsuite/17_intro/names.cc: Undefine macros that clash with
diff --git a/libstdc++-v3/include/bits/locale_conv.h b/libstdc++-v3/include/bits/locale_conv.h

index 16b48758e772af88332d603795dfda52bbcfe746..2b3f2dcbf417239b15cf1684ce69c0568a847679 100644 (file)
--- a/libstdc++-v3/include/bits/locale_conv.h
+++ b/libstdc++-v3/include/bits/locale_conv.h
@@ -81,7 +81,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
              && (__outstr.size() - __outchars) < __maxlen);
  
        if (__result == codecvt_base::error)
-       return false;
+       {
+         __count = __next - __first;
+         return false;
+       }
  
        if (__result == codecvt_base::noconv)
         {
diff --git a/libstdc++-v3/src/c++11/codecvt.cc b/libstdc++-v3/src/c++11/codecvt.cc

index 8cd8eb46a96c423ea40b33fd89c4d5dca996bfe4..11873397341114f0693946ed46578e5e6f720f08 100644 (file)
--- a/libstdc++-v3/src/c++11/codecvt.cc
+++ b/libstdc++-v3/src/c++11/codecvt.cc
@@ -1,6 +1,6 @@
  // Locale support (codecvt) -*- C++ -*-
  
-// Copyright (C) 2015 Free Software Foundation, Inc.
+// Copyright (C) 2015-2017 Free Software Foundation, Inc.
  //
  // This file is part of the GNU ISO C++ Library.  This library is free
  // software; you can redistribute it and/or modify it under the
@@ -24,13 +24,27 @@
  
  #include <codecvt>
  #include <cstring>             // std::memcpy, std::memcmp
-#include <bits/stl_algobase.h> // std::max
+#include <bits/stl_algobase.h> // std::min
  
  #ifdef _GLIBCXX_USE_C99_STDINT_TR1
  namespace std _GLIBCXX_VISIBILITY(default)
  {
  _GLIBCXX_BEGIN_NAMESPACE_VERSION
  
+  // The standard doesn't define these operators, which is annoying.
+  static underlying_type<codecvt_mode>::type
+  to_integer(codecvt_mode m)
+  { return static_cast<underlying_type<codecvt_mode>::type>(m); }
+
+  static codecvt_mode& operator&=(codecvt_mode& m, codecvt_mode n)
+  { return m = codecvt_mode(to_integer(m) & to_integer(n)); }
+
+  static codecvt_mode& operator|=(codecvt_mode& m, codecvt_mode n)
+  { return m = codecvt_mode(to_integer(m) | to_integer(n)); }
+
+  static codecvt_mode operator~(codecvt_mode m)
+  { return codecvt_mode(~to_integer(m)); }
+
  namespace
  {
    // Largest code point that fits in a single UTF-16 code unit.
@@ -43,35 +57,142 @@ namespace
    const char32_t incomplete_mb_character = char32_t(-2);
    const char32_t invalid_mb_sequence = char32_t(-1);
  
-  template<typename Elem>
+  // Utility type for reading and writing code units of type Elem from
+  // a range defined by a pair of pointers.
+  template<typename Elem, bool Aligned = true>
      struct range
      {
        Elem* next;
        Elem* end;
  
+      // Write a code unit.
+      range& operator=(Elem e)
+      {
+       *next++ = e;
+       return *this;
+      }
+
+      // Read the next code unit.
        Elem operator*() const { return *next; }
  
-      range& operator++() { ++next; return *this; }
+      // Read the Nth code unit.
+      Elem operator[](size_t n) const { return next[n]; }
  
+      // Move to the next code unit.
+      range& operator++()
+      {
+       ++next;
+       return *this;
+      }
+
+      // Move to the Nth code unit.
+      range& operator+=(size_t n)
+      {
+       next += n;
+       return *this;
+      }
+
+      // The number of code units remaining.
        size_t size() const { return end - next; }
+
+      // The number of bytes remaining.
+      size_t nbytes() const { return (const char*)end - (const char*)next; }
+    };
+
+  // This specialization is used when accessing char16_t values through
+  // pointers to char, which might not be correctly aligned for char16_t.
+  template<typename Elem>
+    struct range<Elem, false>
+    {
+      using value_type = typename remove_const<Elem>::type;
+
+      using char_pointer = typename
+       conditional<is_const<Elem>::value, const char*, char*>::type;
+
+      char_pointer next;
+      char_pointer end;
+
+      // Write a code unit.
+      range& operator=(Elem e)
+      {
+       memcpy(next, &e, sizeof(Elem));
+       ++*this;
+       return *this;
+      }
+
+      // Read the next code unit.
+      Elem operator*() const
+      {
+       value_type e;
+       memcpy(&e, next, sizeof(Elem));
+       return e;
+      }
+
+      // Read the Nth code unit.
+      Elem operator[](size_t n) const
+      {
+       value_type e;
+       memcpy(&e, next + n * sizeof(Elem), sizeof(Elem));
+       return e;
+      }
+
+      // Move to the next code unit.
+      range& operator++()
+      {
+       next += sizeof(Elem);
+       return *this;
+      }
+
+      // Move to the Nth code unit.
+      range& operator+=(size_t n)
+      {
+       next += n * sizeof(Elem);
+       return *this;
+      }
+
+      // The number of code units remaining.
+      size_t size() const { return nbytes() / sizeof(Elem); }
+
+      // The number of bytes remaining.
+      size_t nbytes() const { return end - next; }
      };
  
    // Multibyte sequences can have "header" consisting of Byte Order Mark
    const unsigned char utf8_bom[3] = { 0xEF, 0xBB, 0xBF };
-  const unsigned char utf16_bom[4] = { 0xFE, 0xFF };
-  const unsigned char utf16le_bom[4] = { 0xFF, 0xFE };
+  const unsigned char utf16_bom[2] = { 0xFE, 0xFF };
+  const unsigned char utf16le_bom[2] = { 0xFF, 0xFE };
  
-  template<size_t N>
-    inline bool
-    write_bom(range<char>& to, const unsigned char (&bom)[N])
+  // Write a BOM (space permitting).
+  template<typename C, bool A, size_t N>
+    bool
+    write_bom(range<C, A>& to, const unsigned char (&bom)[N])
      {
-      if (to.size() < N)
+      static_assert( (N / sizeof(C)) != 0, "" );
+      static_assert( (N % sizeof(C)) == 0, "" );
+
+      if (to.nbytes() < N)
         return false;
        memcpy(to.next, bom, N);
-      to.next += N;
+      to += (N / sizeof(C));
        return true;
      }
  
+  // Try to read a BOM.
+  template<typename C, bool A, size_t N>
+    bool
+    read_bom(range<C, A>& from, const unsigned char (&bom)[N])
+    {
+      static_assert( (N / sizeof(C)) != 0, "" );
+      static_assert( (N % sizeof(C)) == 0, "" );
+
+      if (from.nbytes() >= N && !memcmp(from.next, bom, N))
+       {
+         from += (N / sizeof(C));
+         return true;
+       }
+      return false;
+    }
+
    // If generate_header is set in mode write out UTF-8 BOM.
    bool
    write_utf8_bom(range<char>& to, codecvt_mode mode)
@@ -83,32 +204,20 @@ namespace
  
    // If generate_header is set in mode write out the UTF-16 BOM indicated
    // by whether little_endian is set in mode.
+  template<bool Aligned>
    bool
-  write_utf16_bom(range<char16_t>& to, codecvt_mode mode)
+  write_utf16_bom(range<char16_t, Aligned>& to, codecvt_mode mode)
    {
      if (mode & generate_header)
      {
-      if (!to.size())
-       return false;
-      auto* bom = (mode & little_endian) ? utf16le_bom : utf16_bom;
-      std::memcpy(to.next, bom, 2);
-      ++to.next;
+      if (mode & little_endian)
+       return write_bom(to, utf16le_bom);
+      else
+       return write_bom(to, utf16_bom);
      }
      return true;
    }
  
-  template<size_t N>
-    inline bool
-    read_bom(range<const char>& from, const unsigned char (&bom)[N])
-    {
-      if (from.size() >= N && !memcmp(from.next, bom, N))
-       {
-         from.next += N;
-         return true;
-       }
-      return false;
-    }
-
    // If consume_header is set in mode update from.next to after any BOM.
    void
    read_utf8_bom(range<const char>& from, codecvt_mode mode)
@@ -117,22 +226,21 @@ namespace
        read_bom(from, utf8_bom);
    }
  
-  // If consume_header is set in mode update from.next to after any BOM.
-  // Return little_endian iff the UTF-16LE BOM was present.
-  codecvt_mode
-  read_utf16_bom(range<const char16_t>& from, codecvt_mode mode)
+  // If consume_header is not set in mode, no effects.
+  // Otherwise, if *from.next is a UTF-16 BOM increment from.next and then:
+  // - if the UTF-16BE BOM was found unset little_endian in mode, or
+  // - if the UTF-16LE BOM was found set little_endian in mode.
+  template<bool Aligned>
+  void
+  read_utf16_bom(range<const char16_t, Aligned>& from, codecvt_mode& mode)
    {
-    if (mode & consume_header && from.size())
+    if (mode & consume_header)
        {
-       if (*from.next == 0xFEFF)
-         ++from.next;
-       else if (*from.next == 0xFFFE)
-         {
-           ++from.next;
-           return little_endian;
-         }
+       if (read_bom(from, utf16_bom))
+         mode &= ~little_endian;
+       else if (read_bom(from, utf16le_bom))
+         mode |= little_endian;
        }
-    return {};
    }
  
    // Read a codepoint from a UTF-8 multibyte sequence.
@@ -144,11 +252,11 @@ namespace
      const size_t avail = from.size();
      if (avail == 0)
        return incomplete_mb_character;
-    unsigned char c1 = from.next[0];
+    unsigned char c1 = from[0];
      // https://en.wikipedia.org/wiki/UTF-8#Sample_code
      if (c1 < 0x80)
      {
-      ++from.next;
+      ++from;
        return c1;
      }
      else if (c1 < 0xC2) // continuation or overlong 2-byte sequence
@@ -157,51 +265,51 @@ namespace
      {
        if (avail < 2)
         return incomplete_mb_character;
-      unsigned char c2 = from.next[1];
+      unsigned char c2 = from[1];
        if ((c2 & 0xC0) != 0x80)
         return invalid_mb_sequence;
        char32_t c = (c1 << 6) + c2 - 0x3080;
        if (c <= maxcode)
-       from.next += 2;
+       from += 2;
        return c;
      }
      else if (c1 < 0xF0) // 3-byte sequence
      {
        if (avail < 3)
         return incomplete_mb_character;
-      unsigned char c2 = from.next[1];
+      unsigned char c2 = from[1];
        if ((c2 & 0xC0) != 0x80)
         return invalid_mb_sequence;
        if (c1 == 0xE0 && c2 < 0xA0) // overlong
         return invalid_mb_sequence;
-      unsigned char c3 = from.next[2];
+      unsigned char c3 = from[2];
        if ((c3 & 0xC0) != 0x80)
         return invalid_mb_sequence;
        char32_t c = (c1 << 12) + (c2 << 6) + c3 - 0xE2080;
        if (c <= maxcode)
-       from.next += 3;
+       from += 3;
        return c;
      }
      else if (c1 < 0xF5) // 4-byte sequence
      {
        if (avail < 4)
         return incomplete_mb_character;
-      unsigned char c2 = from.next[1];
+      unsigned char c2 = from[1];
        if ((c2 & 0xC0) != 0x80)
         return invalid_mb_sequence;
        if (c1 == 0xF0 && c2 < 0x90) // overlong
         return invalid_mb_sequence;
        if (c1 == 0xF4 && c2 >= 0x90) // > U+10FFFF
        return invalid_mb_sequence;
-      unsigned char c3 = from.next[2];
+      unsigned char c3 = from[2];
        if ((c3 & 0xC0) != 0x80)
         return invalid_mb_sequence;
-      unsigned char c4 = from.next[3];
+      unsigned char c4 = from[3];
        if ((c4 & 0xC0) != 0x80)
         return invalid_mb_sequence;
        char32_t c = (c1 << 18) + (c2 << 12) + (c3 << 6) + c4 - 0x3C82080;
        if (c <= maxcode)
-       from.next += 4;
+       from += 4;
        return c;
      }
      else // > U+10FFFF
@@ -215,31 +323,31 @@ namespace
        {
         if (to.size() < 1)
           return false;
-       *to.next++ = code_point;
+       to = code_point;
        }
      else if (code_point <= 0x7FF)
        {
         if (to.size() < 2)
           return false;
-       *to.next++ = (code_point >> 6) + 0xC0;
-       *to.next++ = (code_point & 0x3F) + 0x80;
+       to = (code_point >> 6) + 0xC0;
+       to = (code_point & 0x3F) + 0x80;
        }
      else if (code_point <= 0xFFFF)
        {
         if (to.size() < 3)
           return false;
-       *to.next++ = (code_point >> 12) + 0xE0;
-       *to.next++ = ((code_point >> 6) & 0x3F) + 0x80;
-       *to.next++ = (code_point & 0x3F) + 0x80;
+       to = (code_point >> 12) + 0xE0;
+       to = ((code_point >> 6) & 0x3F) + 0x80;
+       to = (code_point & 0x3F) + 0x80;
        }
      else if (code_point <= 0x10FFFF)
        {
         if (to.size() < 4)
           return false;
-       *to.next++ = (code_point >> 18) + 0xF0;
-       *to.next++ = ((code_point >> 12) & 0x3F) + 0x80;
-       *to.next++ = ((code_point >> 6) & 0x3F) + 0x80;
-       *to.next++ = (code_point & 0x3F) + 0x80;
+       to = (code_point >> 18) + 0xF0;
+       to = ((code_point >> 12) & 0x3F) + 0x80;
+       to = ((code_point >> 6) & 0x3F) + 0x80;
+       to = (code_point & 0x3F) + 0x80;
        }
      else
        return false;
@@ -280,47 +388,47 @@ namespace
    // The sequence's endianness is indicated by (mode & little_endian).
    // Updates from.next if the codepoint is not greater than maxcode.
    // Returns invalid_mb_sequence, incomplete_mb_character or the code point.
-  char32_t
-  read_utf16_code_point(range<const char16_t>& from, unsigned long maxcode,
-                       codecvt_mode mode)
-  {
-    const size_t avail = from.size();
-    if (avail == 0)
-      return incomplete_mb_character;
-    int inc = 1;
-    char32_t c = adjust_byte_order(from.next[0], mode);
-    if (is_high_surrogate(c))
-      {
-       if (avail < 2)
-         return incomplete_mb_character;
-       const char16_t c2 = adjust_byte_order(from.next[1], mode);
-       if (is_low_surrogate(c2))
-         {
-           c = surrogate_pair_to_code_point(c, c2);
-           inc = 2;
-         }
-       else
-         return invalid_mb_sequence;
-      }
-    else if (is_low_surrogate(c))
-      return invalid_mb_sequence;
-    if (c <= maxcode)
-      from.next += inc;
-    return c;
-  }
+  template<bool Aligned>
+    char32_t
+    read_utf16_code_point(range<const char16_t, Aligned>& from,
+                         unsigned long maxcode, codecvt_mode mode)
+    {
+      const size_t avail = from.size();
+      if (avail == 0)
+       return incomplete_mb_character;
+      int inc = 1;
+      char32_t c = adjust_byte_order(from[0], mode);
+      if (is_high_surrogate(c))
+       {
+         if (avail < 2)
+           return incomplete_mb_character;
+         const char16_t c2 = adjust_byte_order(from[1], mode);
+         if (is_low_surrogate(c2))
+           {
+             c = surrogate_pair_to_code_point(c, c2);
+             inc = 2;
+           }
+         else
+           return invalid_mb_sequence;
+       }
+      else if (is_low_surrogate(c))
+       return invalid_mb_sequence;
+      if (c <= maxcode)
+       from += inc;
+      return c;
+    }
  
-  template<typename C>
+  template<typename C, bool A>
    bool
-  write_utf16_code_point(range<C>& to, char32_t codepoint, codecvt_mode mode)
+  write_utf16_code_point(range<C, A>& to, char32_t codepoint, codecvt_mode mode)
    {
      static_assert(sizeof(C) >= 2, "a code unit must be at least 16-bit");
  
-    if (codepoint < max_single_utf16_unit)
+    if (codepoint <= max_single_utf16_unit)
        {
         if (to.size() > 0)
           {
-           *to.next = adjust_byte_order(codepoint, mode);
-           ++to.next;
+           to = adjust_byte_order(codepoint, mode);
             return true;
           }
        }
@@ -330,9 +438,8 @@ namespace
         const char32_t LEAD_OFFSET = 0xD800 - (0x10000 >> 10);
         char16_t lead = LEAD_OFFSET + (codepoint >> 10);
         char16_t trail = 0xDC00 + (codepoint & 0x3FF);
-       to.next[0] = adjust_byte_order(lead, mode);
-       to.next[1] = adjust_byte_order(trail, mode);
-       to.next += 2;
+       to = adjust_byte_order(lead, mode);
+       to = adjust_byte_order(trail, mode);
         return true;
        }
      return false;
@@ -351,7 +458,7 @@ namespace
           return codecvt_base::partial;
         if (codepoint > maxcode)
           return codecvt_base::error;
-       *to.next++ = codepoint;
+       to = codepoint;
        }
      return from.size() ? codecvt_base::partial : codecvt_base::ok;
    }
@@ -365,23 +472,22 @@ namespace
        return codecvt_base::partial;
      while (from.size())
        {
-       const char32_t c = from.next[0];
+       const char32_t c = from[0];
         if (c > maxcode)
           return codecvt_base::error;
         if (!write_utf8_code_point(to, c))
           return codecvt_base::partial;
-       ++from.next;
+       ++from;
        }
      return codecvt_base::ok;
    }
  
    // utf16 -> ucs4
    codecvt_base::result
-  ucs4_in(range<const char16_t>& from, range<char32_t>& to,
+  ucs4_in(range<const char16_t, false>& from, range<char32_t>& to,
            unsigned long maxcode = max_code_point, codecvt_mode mode = {})
    {
-    if (read_utf16_bom(from, mode) == little_endian)
-      mode = codecvt_mode(mode & little_endian);
+    read_utf16_bom(from, mode);
      while (from.size() && to.size())
        {
         const char32_t codepoint = read_utf16_code_point(from, maxcode, mode);
@@ -389,72 +495,85 @@ namespace
           return codecvt_base::partial;
         if (codepoint > maxcode)
           return codecvt_base::error;
-       *to.next++ = codepoint;
+       to = codepoint;
        }
      return from.size() ? codecvt_base::partial : codecvt_base::ok;
    }
  
    // ucs4 -> utf16
    codecvt_base::result
-  ucs4_out(range<const char32_t>& from, range<char16_t>& to,
+  ucs4_out(range<const char32_t>& from, range<char16_t, false>& to,
             unsigned long maxcode = max_code_point, codecvt_mode mode = {})
    {
      if (!write_utf16_bom(to, mode))
        return codecvt_base::partial;
      while (from.size())
        {
-       const char32_t c = from.next[0];
+       const char32_t c = from[0];
         if (c > maxcode)
           return codecvt_base::error;
         if (!write_utf16_code_point(to, c, mode))
           return codecvt_base::partial;
-       ++from.next;
+       ++from;
        }
      return codecvt_base::ok;
    }
  
-  // utf8 -> utf16
+  // Flag indicating whether to process UTF-16 or UCS2
+  enum class surrogates { allowed, disallowed };
+
+  // utf8 -> utf16 (or utf8 -> ucs2 if s == surrogates::disallowed)
    template<typename C>
    codecvt_base::result
    utf16_in(range<const char>& from, range<C>& to,
-           unsigned long maxcode = max_code_point, codecvt_mode mode = {})
+          unsigned long maxcode = max_code_point, codecvt_mode mode = {},
+          surrogates s = surrogates::allowed)
    {
      read_utf8_bom(from, mode);
      while (from.size() && to.size())
        {
-       const char* const first = from.next;
+       auto orig = from;
         const char32_t codepoint = read_utf8_code_point(from, maxcode);
         if (codepoint == incomplete_mb_character)
-         return codecvt_base::partial;
+         {
+           if (s == surrogates::allowed)
+             return codecvt_base::partial;
+           else
+             return codecvt_base::error; // No surrogates in UCS2
+         }
         if (codepoint > maxcode)
           return codecvt_base::error;
         if (!write_utf16_code_point(to, codepoint, mode))
           {
-           from.next = first;
+           from = orig; // rewind to previous position
             return codecvt_base::partial;
           }
        }
      return codecvt_base::ok;
    }
  
-  // utf16 -> utf8
+  // utf16 -> utf8 (or ucs2 -> utf8 if s == surrogates::disallowed)
    template<typename C>
    codecvt_base::result
    utf16_out(range<const C>& from, range<char>& to,
-            unsigned long maxcode = max_code_point, codecvt_mode mode = {})
+           unsigned long maxcode = max_code_point, codecvt_mode mode = {},
+           surrogates s = surrogates::allowed)
    {
      if (!write_utf8_bom(to, mode))
        return codecvt_base::partial;
      while (from.size())
        {
-       char32_t c = from.next[0];
+       char32_t c = from[0];
         int inc = 1;
         if (is_high_surrogate(c))
           {
+           if (s == surrogates::disallowed)
+             return codecvt_base::error; // No surrogates in UCS-2
+
             if (from.size() < 2)
               return codecvt_base::ok; // stop converting at this point
  
-           const char32_t c2 = from.next[1];
+           const char32_t c2 = from[1];
             if (is_low_surrogate(c2))
               {
                 c = surrogate_pair_to_code_point(c, c2);
@@ -469,7 +588,7 @@ namespace
           return codecvt_base::error;
         if (!write_utf8_code_point(to, c))
           return codecvt_base::partial;
-       from.next += inc;
+       from += inc;
        }
      return codecvt_base::ok;
    }
@@ -492,7 +611,7 @@ namespace
         ++count;
        }
      if (count+1 == max) // take one more character if it fits in a single unit
-      read_utf8_code_point(from, std::max(max_single_utf16_unit, maxcode));
+      read_utf8_code_point(from, std::min(max_single_utf16_unit, maxcode));
      return from.next;
    }
  
@@ -501,7 +620,9 @@ namespace
    ucs2_in(range<const char>& from, range<char16_t>& to,
           char32_t maxcode = max_code_point, codecvt_mode mode = {})
    {
-    return utf16_in(from, to, std::max(max_single_utf16_unit, maxcode), mode);
+    // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
+    maxcode = std::min(max_single_utf16_unit, maxcode);
+    return utf16_in(from, to, maxcode, mode, surrogates::disallowed);
    }
  
    // ucs2 -> utf8
@@ -509,61 +630,62 @@ namespace
    ucs2_out(range<const char16_t>& from, range<char>& to,
            char32_t maxcode = max_code_point, codecvt_mode mode = {})
    {
-    return utf16_out(from, to, std::max(max_single_utf16_unit, maxcode), mode);
+    // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
+    maxcode = std::min(max_single_utf16_unit, maxcode);
+    return utf16_out(from, to, maxcode, mode, surrogates::disallowed);
    }
  
    // ucs2 -> utf16
    codecvt_base::result
-  ucs2_out(range<const char16_t>& from, range<char16_t>& to,
+  ucs2_out(range<const char16_t>& from, range<char16_t, false>& to,
            char32_t maxcode = max_code_point, codecvt_mode mode = {})
    {
      if (!write_utf16_bom(to, mode))
        return codecvt_base::partial;
      while (from.size() && to.size())
        {
-       char16_t c = from.next[0];
+       char16_t c = from[0];
         if (is_high_surrogate(c))
           return codecvt_base::error;
         if (c > maxcode)
           return codecvt_base::error;
-       *to.next++ = adjust_byte_order(c, mode);
-       ++from.next;
+       to = adjust_byte_order(c, mode);
+       ++from;
        }
      return from.size() == 0 ? codecvt_base::ok : codecvt_base::partial;
    }
  
    // utf16 -> ucs2
    codecvt_base::result
-  ucs2_in(range<const char16_t>& from, range<char16_t>& to,
+  ucs2_in(range<const char16_t, false>& from, range<char16_t>& to,
           char32_t maxcode = max_code_point, codecvt_mode mode = {})
    {
-    if (read_utf16_bom(from, mode) == little_endian)
-      mode = codecvt_mode(mode & little_endian);
-    maxcode = std::max(max_single_utf16_unit, maxcode);
+    read_utf16_bom(from, mode);
+    // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
+    maxcode = std::min(max_single_utf16_unit, maxcode);
      while (from.size() && to.size())
        {
         const char32_t c = read_utf16_code_point(from, maxcode, mode);
         if (c == incomplete_mb_character)
-         return codecvt_base::partial;
+         return codecvt_base::error; // UCS-2 only supports single units.
         if (c > maxcode)
           return codecvt_base::error;
-       *to.next++ = c;
+       to = c;
        }
      return from.size() == 0 ? codecvt_base::ok : codecvt_base::partial;
    }
  
    const char16_t*
-  ucs2_span(const char16_t* begin, const char16_t* end, size_t max,
+  ucs2_span(range<const char16_t, false>& from, size_t max,
              char32_t maxcode, codecvt_mode mode)
    {
-    range<const char16_t> from{ begin, end };
-    if (read_utf16_bom(from, mode) == little_endian)
-      mode = codecvt_mode(mode & little_endian);
-    maxcode = std::max(max_single_utf16_unit, maxcode);
+    read_utf16_bom(from, mode);
+    // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
+    maxcode = std::min(max_single_utf16_unit, maxcode);
      char32_t c = 0;
      while (max-- && c <= maxcode)
        c = read_utf16_code_point(from, maxcode, mode);
-    return from.next;
+    return reinterpret_cast<const char16_t*>(from.next);
    }
  
    const char*
@@ -572,7 +694,8 @@ namespace
    {
      range<const char> from{ begin, end };
      read_utf8_bom(from, mode);
-    maxcode = std::max(max_single_utf16_unit, maxcode);
+    // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
+    maxcode = std::min(max_single_utf16_unit, maxcode);
      char32_t c = 0;
      while (max-- && c <= maxcode)
        c = read_utf8_code_point(from, maxcode);
@@ -594,16 +717,14 @@ namespace
  
    // return pos such that [begin,pos) is valid UCS-4 string no longer than max
    const char16_t*
-  ucs4_span(const char16_t* begin, const char16_t* end, size_t max,
+  ucs4_span(range<const char16_t, false>& from, size_t max,
              char32_t maxcode = max_code_point, codecvt_mode mode = {})
    {
-    range<const char16_t> from{ begin, end };
-    if (read_utf16_bom(from, mode) == little_endian)
-      mode = codecvt_mode(mode & little_endian);
+    read_utf16_bom(from, mode);
      char32_t c = 0;
      while (max-- && c <= maxcode)
        c = read_utf16_code_point(from, maxcode, mode);
-    return from.next;
+    return reinterpret_cast<const char16_t*>(from.next);
    }
  }
  
@@ -661,7 +782,7 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
  
  int
  codecvt<char16_t, char, mbstate_t>::do_encoding() const throw()
-{ return 0; }
+{ return 0; } // UTF-8 is not a fixed-width encoding
  
  bool
  codecvt<char16_t, char, mbstate_t>::do_always_noconv() const throw()
@@ -679,9 +800,9 @@ do_length(state_type&, const extern_type* __from,
  int
  codecvt<char16_t, char, mbstate_t>::do_max_length() const throw()
  {
-  // Any valid UTF-8 sequence of 3 bytes fits in a single 16-bit code unit,
-  // whereas 4 byte sequences require two 16-bit code units.
-  return 3;
+  // A single character (one or two UTF-16 code units) requires
+  // up to four UTF-8 code units.
+  return 4;
  }
  
  // Define members of codecvt<char32_t, char, mbstate_t> specialization.
@@ -732,7 +853,7 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
  
  int
  codecvt<char32_t, char, mbstate_t>::do_encoding() const throw()
-{ return 0; }
+{ return 0; } // UTF-8 is not a fixed-width encoding
  
  bool
  codecvt<char32_t, char, mbstate_t>::do_always_noconv() const throw()
@@ -749,7 +870,11 @@ do_length(state_type&, const extern_type* __from,
  
  int
  codecvt<char32_t, char, mbstate_t>::do_max_length() const throw()
-{ return 4; }
+{
+  // A single character (one UTF-32 code unit) requires
+  // up to 4 UTF-8 code units.
+  return 4;
+}
  
  // Define members of codecvt_utf8<char16_t> base class implementation.
  // Converts from UTF-8 to UCS-2.
@@ -801,7 +926,7 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
  
  int
  __codecvt_utf8_base<char16_t>::do_encoding() const throw()
-{ return 0; }
+{ return 0; } // UTF-8 is not a fixed-width encoding
  
  bool
  __codecvt_utf8_base<char16_t>::do_always_noconv() const throw()
@@ -818,7 +943,14 @@ do_length(state_type&, const extern_type* __from,
  
  int
  __codecvt_utf8_base<char16_t>::do_max_length() const throw()
-{ return 3; }
+{
+  // A single UCS-2 character requires up to three UTF-8 code units.
+  // (UCS-2 cannot represent characters that use four UTF-8 code units).
+  int max = 3;
+  if (_M_mode & consume_header)
+    max += sizeof(utf8_bom);
+  return max;
+}
  
  // Define members of codecvt_utf8<char32_t> base class implementation.
  // Converts from UTF-8 to UTF-32 (aka UCS-4).
@@ -866,7 +998,7 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
  
  int
  __codecvt_utf8_base<char32_t>::do_encoding() const throw()
-{ return 0; }
+{ return 0; } // UTF-8 is not a fixed-width encoding
  
  bool
  __codecvt_utf8_base<char32_t>::do_always_noconv() const throw()
@@ -883,9 +1015,22 @@ do_length(state_type&, const extern_type* __from,
  
  int
  __codecvt_utf8_base<char32_t>::do_max_length() const throw()
-{ return 4; }
+{
+  // A single UCS-4 character requires up to four UTF-8 code units.
+  int max = 4;
+  if (_M_mode & consume_header)
+    max += sizeof(utf8_bom);
+  return max;
+}
  
  #ifdef _GLIBCXX_USE_WCHAR_T
+
+#if __SIZEOF_WCHAR_T__ == 2
+static_assert(sizeof(wchar_t) == sizeof(char16_t), "");
+#elif __SIZEOF_WCHAR_T__ == 4
+static_assert(sizeof(wchar_t) == sizeof(char32_t), "");
+#endif
+
  // Define members of codecvt_utf8<wchar_t> base class implementation.
  // Converts from UTF-8 to UCS-2 or UCS-4 depending on sizeof(wchar_t).
  
@@ -958,7 +1103,7 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
  
  int
  __codecvt_utf8_base<wchar_t>::do_encoding() const throw()
-{ return 0; }
+{ return 0; } // UTF-8 is not a fixed-width encoding
  
  bool
  __codecvt_utf8_base<wchar_t>::do_always_noconv() const throw()
@@ -981,7 +1126,16 @@ do_length(state_type&, const extern_type* __from,
  
  int
  __codecvt_utf8_base<wchar_t>::do_max_length() const throw()
-{ return 4; }
+{
+#if __SIZEOF_WCHAR_T__ == 2
+  int max = 3; // See __codecvt_utf8_base<char16_t>::do_max_length()
+#else
+  int max = 4; // See __codecvt_utf8_base<char32_t>::do_max_length()
+#endif
+  if (_M_mode & consume_header)
+    max += sizeof(utf8_bom);
+  return max;
+}
  #endif
  
  // Define members of codecvt_utf16<char16_t> base class implementation.
@@ -997,10 +1151,7 @@ do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
         extern_type*& __to_next) const
  {
    range<const char16_t> from{ __from, __from_end };
-  range<char16_t> to{
-    reinterpret_cast<char16_t*>(__to),
-    reinterpret_cast<char16_t*>(__to_end)
-  };
+  range<char16_t, false> to{ __to, __to_end };
    auto res = ucs2_out(from, to, _M_maxcode, _M_mode);
    __from_next = from.next;
    __to_next = reinterpret_cast<char*>(to.next);
@@ -1023,20 +1174,19 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
        intern_type* __to, intern_type* __to_end,
        intern_type*& __to_next) const
  {
-  range<const char16_t> from{
-    reinterpret_cast<const char16_t*>(__from),
-    reinterpret_cast<const char16_t*>(__from_end)
-  };
+  range<const char16_t, false> from{ __from, __from_end };
    range<char16_t> to{ __to, __to_end };
    auto res = ucs2_in(from, to, _M_maxcode, _M_mode);
    __from_next = reinterpret_cast<const char*>(from.next);
    __to_next = to.next;
+  if (res == codecvt_base::ok && __from_next != __from_end)
+    res = codecvt_base::error;
    return res;
  }
  
  int
  __codecvt_utf16_base<char16_t>::do_encoding() const throw()
-{ return 1; }
+{ return 0; } // UTF-16 is not a fixed-width encoding
  
  bool
  __codecvt_utf16_base<char16_t>::do_always_noconv() const throw()
@@ -1047,15 +1197,21 @@ __codecvt_utf16_base<char16_t>::
  do_length(state_type&, const extern_type* __from,
           const extern_type* __end, size_t __max) const
  {
-  auto next = reinterpret_cast<const char16_t*>(__from);
-  next = ucs2_span(next, reinterpret_cast<const char16_t*>(__end), __max,
-                  _M_maxcode, _M_mode);
+  range<const char16_t, false> from{ __from, __end };
+  const char16_t* next = ucs2_span(from, __max, _M_maxcode, _M_mode);
    return reinterpret_cast<const char*>(next) - __from;
  }
  
  int
  __codecvt_utf16_base<char16_t>::do_max_length() const throw()
-{ return 3; }
+{
+  // A single UCS-2 character requires one UTF-16 code unit (so two chars).
+  // (UCS-2 cannot represent characters that use multiple UTF-16 code units).
+  int max = 2;
+  if (_M_mode & consume_header)
+    max += sizeof(utf16_bom);
+  return max;
+}
  
  // Define members of codecvt_utf16<char32_t> base class implementation.
  // Converts from UTF-16 to UTF-32 (aka UCS-4).
@@ -1070,10 +1226,7 @@ do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
         extern_type*& __to_next) const
  {
    range<const char32_t> from{ __from, __from_end };
-  range<char16_t> to{
-    reinterpret_cast<char16_t*>(__to),
-    reinterpret_cast<char16_t*>(__to_end)
-  };
+  range<char16_t, false> to{ __to, __to_end };
    auto res = ucs4_out(from, to, _M_maxcode, _M_mode);
    __from_next = from.next;
    __to_next = reinterpret_cast<char*>(to.next);
@@ -1096,20 +1249,19 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
        intern_type* __to, intern_type* __to_end,
        intern_type*& __to_next) const
  {
-  range<const char16_t> from{
-    reinterpret_cast<const char16_t*>(__from),
-    reinterpret_cast<const char16_t*>(__from_end)
-  };
+  range<const char16_t, false> from{ __from, __from_end };
    range<char32_t> to{ __to, __to_end };
    auto res = ucs4_in(from, to, _M_maxcode, _M_mode);
    __from_next = reinterpret_cast<const char*>(from.next);
    __to_next = to.next;
+  if (res == codecvt_base::ok && __from_next != __from_end)
+    res = codecvt_base::error;
    return res;
  }
  
  int
  __codecvt_utf16_base<char32_t>::do_encoding() const throw()
-{ return 0; }
+{ return 0; } // UTF-16 is not a fixed-width encoding
  
  bool
  __codecvt_utf16_base<char32_t>::do_always_noconv() const throw()
@@ -1120,15 +1272,21 @@ __codecvt_utf16_base<char32_t>::
  do_length(state_type&, const extern_type* __from,
           const extern_type* __end, size_t __max) const
  {
-  auto next = reinterpret_cast<const char16_t*>(__from);
-  next = ucs4_span(next, reinterpret_cast<const char16_t*>(__end), __max,
-                  _M_maxcode, _M_mode);
+  range<const char16_t, false> from{ __from, __end };
+  const char16_t* next = ucs4_span(from, __max, _M_maxcode, _M_mode);
    return reinterpret_cast<const char*>(next) - __from;
  }
  
  int
  __codecvt_utf16_base<char32_t>::do_max_length() const throw()
-{ return 4; }
+{
+  // A single UCS-4 character requires one or two UTF-16 code units
+  // (so up to four chars).
+  int max = 4;
+  if (_M_mode & consume_header)
+    max += sizeof(utf16_bom);
+  return max;
+}
  
  #ifdef _GLIBCXX_USE_WCHAR_T
  // Define members of codecvt_utf16<wchar_t> base class implementation.
@@ -1143,24 +1301,24 @@ do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
         extern_type* __to, extern_type* __to_end,
         extern_type*& __to_next) const
  {
-  range<char> to{ __to, __to_end };
+  range<char16_t, false> to{ __to, __to_end };
  #if __SIZEOF_WCHAR_T__ == 2
    range<const char16_t> from{
      reinterpret_cast<const char16_t*>(__from),
-    reinterpret_cast<const char16_t*>(__from_end)
+    reinterpret_cast<const char16_t*>(__from_end),
    };
    auto res = ucs2_out(from, to, _M_maxcode, _M_mode);
  #elif __SIZEOF_WCHAR_T__ == 4
    range<const char32_t> from{
      reinterpret_cast<const char32_t*>(__from),
-    reinterpret_cast<const char32_t*>(__from_end)
+    reinterpret_cast<const char32_t*>(__from_end),
    };
    auto res = ucs4_out(from, to, _M_maxcode, _M_mode);
  #else
    return codecvt_base::error;
  #endif
    __from_next = reinterpret_cast<const wchar_t*>(from.next);
-  __to_next = to.next;
+  __to_next = reinterpret_cast<char*>(to.next);
    return res;
  }
  
@@ -1180,30 +1338,32 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
        intern_type* __to, intern_type* __to_end,
        intern_type*& __to_next) const
  {
-  range<const char> from{ __from, __from_end };
+  range<const char16_t, false> from{ __from, __from_end };
  #if __SIZEOF_WCHAR_T__ == 2
    range<char16_t> to{
      reinterpret_cast<char16_t*>(__to),
-    reinterpret_cast<char16_t*>(__to_end)
+    reinterpret_cast<char16_t*>(__to_end),
    };
    auto res = ucs2_in(from, to, _M_maxcode, _M_mode);
  #elif __SIZEOF_WCHAR_T__ == 4
    range<char32_t> to{
      reinterpret_cast<char32_t*>(__to),
-    reinterpret_cast<char32_t*>(__to_end)
+    reinterpret_cast<char32_t*>(__to_end),
    };
    auto res = ucs4_in(from, to, _M_maxcode, _M_mode);
  #else
    return codecvt_base::error;
  #endif
-  __from_next = from.next;
+  __from_next = reinterpret_cast<const char*>(from.next);
    __to_next = reinterpret_cast<wchar_t*>(to.next);
+  if (res == codecvt_base::ok && __from_next != __from_end)
+    res = codecvt_base::error;
    return res;
  }
  
  int
  __codecvt_utf16_base<wchar_t>::do_encoding() const throw()
-{ return 0; }
+{ return 0; } // UTF-16 is not a fixed-width encoding
  
  bool
  __codecvt_utf16_base<wchar_t>::do_always_noconv() const throw()
@@ -1214,20 +1374,27 @@ __codecvt_utf16_base<wchar_t>::
  do_length(state_type&, const extern_type* __from,
           const extern_type* __end, size_t __max) const
  {
-  auto next = reinterpret_cast<const char16_t*>(__from);
+  range<const char16_t, false> from{ __from, __end };
  #if __SIZEOF_WCHAR_T__ == 2
-  next = ucs2_span(next, reinterpret_cast<const char16_t*>(__end), __max,
-                  _M_maxcode, _M_mode);
+  const char16_t* next = ucs2_span(from, __max, _M_maxcode, _M_mode);
  #elif __SIZEOF_WCHAR_T__ == 4
-  next = ucs4_span(next, reinterpret_cast<const char16_t*>(__end), __max,
-                  _M_maxcode, _M_mode);
+  const char16_t* next = ucs4_span(from, __max, _M_maxcode, _M_mode);
  #endif
    return reinterpret_cast<const char*>(next) - __from;
  }
  
  int
  __codecvt_utf16_base<wchar_t>::do_max_length() const throw()
-{ return 4; }
+{
+#if __SIZEOF_WCHAR_T__ == 2
+  int max = 2; // See __codecvt_utf16_base<char16_t>::do_max_length()
+#else
+  int max = 4; // See __codecvt_utf16_base<char32_t>::do_max_length()
+#endif
+  if (_M_mode & consume_header)
+    max += sizeof(utf16_bom);
+  return max;
+}
  #endif
  
  // Define members of codecvt_utf8_utf16<char16_t> base class implementation.
@@ -1280,7 +1447,7 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
  
  int
  __codecvt_utf8_utf16_base<char16_t>::do_encoding() const throw()
-{ return 0; }
+{ return 0; } // UTF-8 is not a fixed-width encoding
  
  bool
  __codecvt_utf8_utf16_base<char16_t>::do_always_noconv() const throw()
@@ -1298,9 +1465,12 @@ do_length(state_type&, const extern_type* __from,
  int
  __codecvt_utf8_utf16_base<char16_t>::do_max_length() const throw()
  {
-  // Any valid UTF-8 sequence of 3 bytes fits in a single 16-bit code unit,
-  // whereas 4 byte sequences require two 16-bit code units.
-  return 3;
+  // A single character can be 1 or 2 UTF-16 code units,
+  // requiring up to 4 UTF-8 code units.
+  int max = 4;
+  if (_M_mode & consume_header)
+    max += sizeof(utf8_bom);
+  return max;
  }
  
  // Define members of codecvt_utf8_utf16<char32_t> base class implementation.
@@ -1341,7 +1511,11 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
  {
    range<const char> from{ __from, __from_end };
    range<char32_t> to{ __to, __to_end };
-  auto res = utf16_in(from, to, _M_maxcode, _M_mode);
+  codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header));
+#if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
+  mode = codecvt_mode(mode | little_endian);
+#endif
+  auto res = utf16_in(from, to, _M_maxcode, mode);
    __from_next = from.next;
    __to_next = to.next;
    return res;
@@ -1349,7 +1523,7 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
  
  int
  __codecvt_utf8_utf16_base<char32_t>::do_encoding() const throw()
-{ return 0; }
+{ return 0; } // UTF-8 is not a fixed-width encoding
  
  bool
  __codecvt_utf8_utf16_base<char32_t>::do_always_noconv() const throw()
@@ -1367,9 +1541,12 @@ do_length(state_type&, const extern_type* __from,
  int
  __codecvt_utf8_utf16_base<char32_t>::do_max_length() const throw()
  {
-  // Any valid UTF-8 sequence of 3 bytes fits in a single 16-bit code unit,
-  // whereas 4 byte sequences require two 16-bit code units.
-  return 3;
+  // A single character can be 1 or 2 UTF-16 code units,
+  // requiring up to 4 UTF-8 code units.
+  int max = 4;
+  if (_M_mode & consume_header)
+    max += sizeof(utf8_bom);
+  return max;
  }
  
  #ifdef _GLIBCXX_USE_WCHAR_T
@@ -1411,7 +1588,11 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
  {
    range<const char> from{ __from, __from_end };
    range<wchar_t> to{ __to, __to_end };
-  auto res = utf16_in(from, to, _M_maxcode, _M_mode);
+  codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header));
+#if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
+  mode = codecvt_mode(mode | little_endian);
+#endif
+  auto res = utf16_in(from, to, _M_maxcode, mode);
    __from_next = from.next;
    __to_next = to.next;
    return res;
@@ -1419,7 +1600,7 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
  
  int
  __codecvt_utf8_utf16_base<wchar_t>::do_encoding() const throw()
-{ return 0; }
+{ return 0; } // UTF-8 is not a fixed-width encoding
  
  bool
  __codecvt_utf8_utf16_base<wchar_t>::do_always_noconv() const throw()
@@ -1437,9 +1618,12 @@ do_length(state_type&, const extern_type* __from,
  int
  __codecvt_utf8_utf16_base<wchar_t>::do_max_length() const throw()
  {
-  // Any valid UTF-8 sequence of 3 bytes fits in a single 16-bit code unit,
-  // whereas 4 byte sequences require two 16-bit code units.
-  return 3;
+  // A single character can be 1 or 2 UTF-16 code units,
+  // requiring up to 4 UTF-8 code units.
+  int max = 4;
+  if (_M_mode & consume_header)
+    max += sizeof(utf8_bom);
+  return max;
  }
  #endif
  
diff --git a/libstdc++-v3/testsuite/22_locale/codecvt/char16_t.cc b/libstdc++-v3/testsuite/22_locale/codecvt/char16_t.cc

index a21a8385629fec110b06eb6d181d56ed82a32066..a66a35a5964b8a62aab010cd62691b14b6c900e1 100644 (file)
--- a/libstdc++-v3/testsuite/22_locale/codecvt/char16_t.cc
+++ b/libstdc++-v3/testsuite/22_locale/codecvt/char16_t.cc
@@ -34,7 +34,7 @@ test01()
    const codecvt_c16* const cvt = &use_facet<codecvt_c16>(loc_c);
  
    VERIFY(!cvt->always_noconv());
-  VERIFY(cvt->max_length() == 3);
+  VERIFY(cvt->max_length() == 4);
    VERIFY(cvt->encoding() == 0);
  
    const char u8dat[] = u8"H\U000000E4ll\U000000F6 \U0001F63F \U000056FD "
diff --git a/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/79980.cc b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/79980.cc

new file mode 100644 (file)

index 0000000..d8b9729
--- /dev/null
+++ b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/79980.cc
@@ -0,0 +1,142 @@
+// Copyright (C) 2017 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
+// { dg-do run { target c++11 } }
+
+#include <locale>
+#include <codecvt>
+#include <testsuite_hooks.h>
+
+// PR libstdc++/79980
+
+constexpr std::codecvt_mode mode(std::codecvt_mode m)
+{ return static_cast<std::codecvt_mode>(m | std::consume_header); }
+
+template<typename WCh, unsigned long Max = 0x10FFFF,
+        std::codecvt_mode Mode = std::consume_header>
+  using Conv
+    = std::wstring_convert<std::codecvt_utf16<WCh, Max, mode(Mode)>, WCh>;
+
+void
+test01()
+{
+  const char src[] = "\xFE\xFF\xAB\xCD";
+  Conv<char16_t> conv;
+  auto dst = conv.from_bytes(src, src+4);
+  VERIFY( dst[0] == 0xabcd );
+}
+
+void
+test02()
+{
+  const char src[] = "\xFF\xFE\xAB\xCD";
+  Conv<char16_t> conv;
+  auto dst = conv.from_bytes(src, src+4);
+  VERIFY( dst[0] == 0xcdab );
+}
+
+void
+test03()
+{
+  const char src[] = "\xFE\xFF\xAB\xCD";
+  Conv<char16_t, 0x10FFFF, std::little_endian> conv;
+  auto dst = conv.from_bytes(src, src+4);
+  VERIFY( dst[0] == 0xabcd );
+}
+
+void
+test04()
+{
+  const char src[] = "\xFF\xFE\xAB\xCD";
+  Conv<char16_t, 0x10FFFF, std::little_endian> conv;
+  auto dst = conv.from_bytes(src, src+4);
+  VERIFY( dst[0] == 0xcdab );
+}
+
+void
+test05()
+{
+  const char src[] = "\0\x61\xAB\xCD"; // character greater than 0x00FF
+  Conv<char16_t, 0xFF> conv("to_bytes failed", u"from_bytes failed");
+  std::u16string result = conv.from_bytes(src, src+4);
+  VERIFY( result == u"from_bytes failed" );
+  VERIFY( conv.converted() == 2 );
+}
+
+void
+test06()
+{
+  const char src[] = "\0\x61\xAB\xCD";
+  Conv<char16_t> conv("to_bytes failed", u"from_bytes failed");
+  std::u16string result = conv.from_bytes(src, src+3); // incomplete character
+  VERIFY( result == u"from_bytes failed" );
+  VERIFY( conv.converted() == 2 );
+}
+
+void
+test07()
+{
+  Conv<char16_t> conv("to_bytes failed", u"from_bytes failed");
+  // ucs2 to utf-16 conversion should fail on invalid ucs2 input:
+  std::u16string utf16 = u"1234\U00001111\U0001ffff";
+  auto out = conv.to_bytes(utf16);
+  VERIFY( out == "to_bytes failed" );
+  VERIFY( conv.converted() == 5 );
+
+  // And should also fail on incomplete surrogate pair (not return partial):
+  out = conv.to_bytes(utf16.substr(0, utf16.size()-1));
+  VERIFY( out == "to_bytes failed" );
+  VERIFY( conv.converted() == 5 );
+}
+
+void
+test08()
+{
+  // Read/write UTF-16 code units from data not correctly aligned for char16_t
+  Conv<char16_t, 0x10FFFF, std::generate_header> conv;
+  const char src[] = "-\xFE\xFF\0\x61\xAB\xCD";
+  auto out = conv.from_bytes(src + 1, src + 7);
+  VERIFY( out[0] == 0x0061 );
+  VERIFY( out[1] == 0xabcd );
+  auto bytes = conv.to_bytes(out);
+  VERIFY( bytes == std::string(src + 1, 6) );
+}
+
+void
+test09()
+{
+  // Read/write UTF-16 code units from data not correctly aligned for char16_t
+  Conv<char32_t, 0x10FFFF, std::generate_header> conv;
+  const char src[] = "-\xFE\xFF\xD8\x08\xDF\x45";
+  auto out = conv.from_bytes(src + 1, src + 7);
+  VERIFY( out == U"\U00012345" );
+  auto bytes = conv.to_bytes(out);
+  VERIFY( bytes == std::string(src + 1, 6) );
+}
+
+int main()
+{
+  test01();
+  test02();
+  test03();
+  test04();
+  test05();
+  test06();
+  test07();
+  test08();
+  test09();
+}
diff --git a/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/80041.cc b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/80041.cc

new file mode 100644 (file)

index 0000000..a78b194
--- /dev/null
+++ b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/80041.cc
@@ -0,0 +1,87 @@
+// Copyright (C) 2017 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
+// { dg-do run { target c++11 } }
+
+#include <codecvt>
+#include <testsuite_hooks.h>
+
+void
+test01()
+{
+#ifdef _GLIBCXX_USE_WCHAR_T
+  std::codecvt_utf16<wchar_t> conv;
+  const wchar_t wc = 0x6557;
+  char bytes[2] = {0};
+  const wchar_t* wcnext;
+  std::mbstate_t st{};
+  char* next = nullptr;
+  auto res = conv.out(st, &wc, &wc+ 1, wcnext, bytes, std::end(bytes), next);
+  VERIFY( res == std::codecvt_base::ok );
+  VERIFY( wcnext == &wc + 1 );
+  VERIFY( next == std::end(bytes) );
+  VERIFY( bytes[0] == 0x65 );
+  VERIFY( bytes[1] == 0x57 );
+  VERIFY( conv.length(st, bytes, next, 1) == (next - bytes) );
+
+  wchar_t w;
+  wchar_t* wnext;
+  const char* cnext;
+  st = {};
+  res = conv.in(st, bytes, next, cnext, &w, &w + 1, wnext);
+  VERIFY( res == std::codecvt_base::ok );
+  VERIFY( wnext == &w + 1 );
+  VERIFY( cnext == next );
+  VERIFY( w == wc );
+#endif
+}
+
+void
+test02()
+{
+#ifdef _GLIBCXX_USE_WCHAR_T
+  std::codecvt_utf16<wchar_t, 0x10FFFF, std::little_endian> conv;
+  wchar_t wc = 0x6557;
+  char bytes[2] = {0};
+  const wchar_t* wcnext;
+  std::mbstate_t st{};
+  char* next = nullptr;
+  auto res = conv.out(st, &wc, &wc+ 1, wcnext, bytes, std::end(bytes), next);
+  VERIFY( res == std::codecvt_base::ok );
+  VERIFY( wcnext == &wc + 1 );
+  VERIFY( next == std::end(bytes) );
+  VERIFY( bytes[0] == 0x57 );
+  VERIFY( bytes[1] == 0x65 );
+  VERIFY( conv.length(st, bytes, next, 1) == (next - bytes) );
+
+  wchar_t w;
+  wchar_t* wnext;
+  const char* cnext;
+  st = {};
+  res = conv.in(st, bytes, next, cnext, &w, &w + 1, wnext);
+  VERIFY( res == std::codecvt_base::ok );
+  VERIFY( wnext == &w + 1 );
+  VERIFY( cnext == next );
+  VERIFY( w == wc );
+#endif
+}
+
+int main()
+{
+  test01();
+  test02();
+}
diff --git a/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/members.cc b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/members.cc

new file mode 100644 (file)

index 0000000..993c860
--- /dev/null
+++ b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/members.cc
@@ -0,0 +1,81 @@
+// Copyright (C) 2017 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
+// { dg-do run { target c++11 } }
+
+#include <codecvt>
+#include <testsuite_hooks.h>
+
+const int bomlen = 2; // UTF-16 BOM is 16 bits
+
+void
+test01()
+{
+  const int maxlen = 2;
+
+  std::codecvt_utf16<char16_t> c;
+  VERIFY( c.always_noconv() == false );
+  VERIFY( c.encoding() == 0 );
+  VERIFY( c.max_length() == maxlen );
+
+  std::codecvt_utf16<char16_t, 0x10ffff, std::consume_header> c_bom;
+  VERIFY( c_bom.always_noconv() == false );
+  VERIFY( c_bom.encoding() == 0 );
+  VERIFY( c_bom.max_length() == (maxlen + bomlen) );
+}
+
+void
+test02()
+{
+  const int maxlen = 4;
+
+  std::codecvt_utf16<char32_t> c;
+  VERIFY( c.always_noconv() == false );
+  VERIFY( c.encoding() == 0 );
+  VERIFY( c.max_length() == maxlen );
+
+  std::codecvt_utf16<char32_t, 0x10ffff, std::consume_header> c_bom;
+  VERIFY( c_bom.always_noconv() == false );
+  VERIFY( c_bom.encoding() == 0 );
+  VERIFY( c_bom.max_length() == (maxlen + bomlen) );
+}
+
+void
+test03()
+{
+#ifdef _GLIBCXX_USE_WCHAR_T
+  const int maxlen = sizeof(wchar_t) == 4 ? 4 : 2;
+
+  std::codecvt_utf16<wchar_t> c;
+  VERIFY( c.always_noconv() == false );
+  VERIFY( c.encoding() == 0 );
+  VERIFY( c.max_length() == maxlen );
+
+  std::codecvt_utf16<wchar_t, 0x10ffff, std::consume_header> c_bom;
+  VERIFY( c_bom.always_noconv() == false );
+  VERIFY( c_bom.encoding() == 0 );
+  VERIFY( c_bom.max_length() == (maxlen + bomlen) );
+#endif
+}
+
+int
+main()
+{
+  test01();
+  test02();
+  test03();
+}
diff --git a/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/misaligned.cc b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/misaligned.cc

new file mode 100644 (file)

index 0000000..0179c18
--- /dev/null
+++ b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/misaligned.cc
@@ -0,0 +1,289 @@
+// Copyright (C) 2017 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
+// { dg-do run { target c++11 } }
+
+#include <locale>
+#include <codecvt>
+#include <testsuite_hooks.h>
+
+using std::codecvt_base;
+using std::codecvt_mode;
+using std::codecvt_utf16;
+using std::wstring_convert;
+using std::mbstate_t;
+
+constexpr codecvt_mode
+operator|(codecvt_mode m1, codecvt_mode m2)
+{
+  using underlying = std::underlying_type<codecvt_mode>::type;
+  return static_cast<codecvt_mode>(static_cast<underlying>(m1) | m2);
+}
+
+// Read/write UTF-16 code units from data not correctly aligned for char16_t
+
+void
+test01()
+{
+  mbstate_t st{}; // codecvt members require an initial conversion state
+  constexpr codecvt_mode m = std::consume_header|std::generate_header;
+  codecvt_utf16<char16_t, 0x10FFFF, m> conv;
+  const char src[] = "-\xFE\xFF\0\x61\xAB\xCD";
+  const char* const src_end = src + 7;
+  // All input starts at src + 1; expected lengths are in bytes, BOM included.
+  int len = conv.length(st, src + 1, src_end, 1);
+  VERIFY( len == 4 );
+  len = conv.length(st, src + 1, src_end, 2);
+  VERIFY( len == 6 );
+
+  char16_t dst[2];
+  char16_t* const dst_end = dst + 2;
+  char16_t* dst_next;
+  const char* src_cnext;
+  auto res = conv.in(st, src + 1, src_end, src_cnext, dst, dst_end, dst_next);
+  VERIFY( res == codecvt_base::ok );
+  VERIFY( dst[0] == 0x0061 );
+  VERIFY( dst[1] == 0xabcd );
+  VERIFY( src_cnext == src_end );
+  VERIFY( dst_next == dst_end );
+  // Re-encode at out + 1; the result must reproduce src byte for byte.
+  char out[sizeof(src)] = { src[0] };
+  char* const out_end = out + 7;
+  char* out_next;
+  const char16_t* dst_cnext;
+  res = conv.out(st, dst, dst_end, dst_cnext, out + 1, out_end, out_next);
+  VERIFY( res == codecvt_base::ok );
+  VERIFY( out_next == out_end );
+  VERIFY( dst_cnext == dst_end );
+  VERIFY( out[1] == src[1] );
+  VERIFY( out[2] == src[2] );
+  VERIFY( out[3] == src[3] );
+  VERIFY( out[4] == src[4] );
+  VERIFY( out[5] == src[5] );
+  VERIFY( out[6] == src[6] );
+
+  codecvt_utf16<char16_t, 0x10FFFF, m|std::little_endian> conv_le;
+  // Same input through the little_endian facet: identical decoded values.
+  len = conv_le.length(st, src + 1, src_end, 1);
+  VERIFY( len == 4 );
+  len = conv_le.length(st, src + 1, src_end, 2);
+  VERIFY( len == 6 );
+
+  res = conv_le.in(st, src + 1, src_end, src_cnext, dst, dst_end, dst_next);
+  VERIFY( res == codecvt_base::ok );
+  VERIFY( dst[0] == 0x0061 );
+  VERIFY( dst[1] == 0xabcd );
+  VERIFY( src_cnext == src_end );
+  VERIFY( dst_next == dst_end );
+  // Little-endian output: each 16-bit unit of src byte-swapped, BOM included.
+  res = conv_le.out(st, dst, dst_end, dst_cnext, out + 1, out_end, out_next);
+  VERIFY( res == codecvt_base::ok );
+  VERIFY( out_next == out_end );
+  VERIFY( dst_cnext == dst_end );
+  VERIFY( out[1] == src[2] );
+  VERIFY( out[2] == src[1] );
+  VERIFY( out[3] == src[4] );
+  VERIFY( out[4] == src[3] );
+  VERIFY( out[5] == src[6] );
+  VERIFY( out[6] == src[5] );
+}
+
+void
+test02()
+{
+  mbstate_t st{}; // codecvt members require an initial conversion state
+  constexpr codecvt_mode m = std::consume_header|std::generate_header;
+  codecvt_utf16<char32_t, 0x10FFFF, m> conv;
+  const char src[] = "-\xFE\xFF\0\x61\xAB\xCD\xD8\x08\xDF\x45";
+  const char* const src_end = src + 11;
+  // All input starts at src + 1; expected lengths are in bytes, BOM included.
+  int len = conv.length(st, src + 1, src_end, 1);
+  VERIFY( len == 4 );
+  len = conv.length(st, src + 1, src_end, 2);
+  VERIFY( len == 6 );
+  len = conv.length(st, src + 1, src_end, -1ul);
+  VERIFY( len == 10 );
+
+  char32_t dst[3];
+  char32_t* const dst_end = dst + 3;
+  char32_t* dst_next;
+  const char* src_cnext;
+  auto res = conv.in(st, src + 1, src_end, src_cnext, dst, dst_end, dst_next);
+  VERIFY( res == codecvt_base::ok );
+  VERIFY( dst[0] == 0x0061 );
+  VERIFY( dst[1] == 0xabcd );
+  VERIFY( dst[2] == 0x012345 );
+  VERIFY( src_cnext == src_end );
+  VERIFY( dst_next == dst_end );
+  // Re-encode at out + 1; the result must reproduce src byte for byte.
+  char out[sizeof(src)] = { src[0] };
+  char* const out_end = out + 11;
+  char* out_next;
+  const char32_t* dst_cnext;
+  res = conv.out(st, dst, dst_end, dst_cnext, out + 1, out_end, out_next);
+  VERIFY( res == codecvt_base::ok );
+  VERIFY( out_next == out_end );
+  VERIFY( dst_cnext == dst_end );
+  VERIFY( out[1] == src[1] );
+  VERIFY( out[2] == src[2] );
+  VERIFY( out[3] == src[3] );
+  VERIFY( out[4] == src[4] );
+  VERIFY( out[5] == src[5] );
+  VERIFY( out[6] == src[6] );
+  VERIFY( out[7] == src[7] );
+  VERIFY( out[8] == src[8] );
+  VERIFY( out[9] == src[9] );
+  VERIFY( out[10] == src[10] );
+
+  codecvt_utf16<char32_t, 0x10FFFF, m|std::little_endian> conv_le;
+  // Same input through the little_endian facet: identical decoded values.
+  len = conv_le.length(st, src + 1, src_end, 1);
+  VERIFY( len == 4 );
+  len = conv_le.length(st, src + 1, src_end, 2);
+  VERIFY( len == 6 );
+  len = conv_le.length(st, src + 1, src_end, -1ul);
+  VERIFY( len == 10 );
+
+  res = conv_le.in(st, src + 1, src_end, src_cnext, dst, dst_end, dst_next);
+  VERIFY( res == codecvt_base::ok );
+  VERIFY( dst[0] == 0x0061 );
+  VERIFY( dst[1] == 0xabcd );
+  VERIFY( dst[2] == 0x012345 );
+  VERIFY( src_cnext == src_end );
+  VERIFY( dst_next == dst_end );
+  // Little-endian output: each 16-bit unit of src byte-swapped, BOM included.
+  res = conv_le.out(st, dst, dst_end, dst_cnext, out + 1, out_end, out_next);
+  VERIFY( res == codecvt_base::ok );
+  VERIFY( out_next == out_end );
+  VERIFY( dst_cnext == dst_end );
+  VERIFY( out[1] == src[2] );
+  VERIFY( out[2] == src[1] );
+  VERIFY( out[3] == src[4] );
+  VERIFY( out[4] == src[3] );
+  VERIFY( out[5] == src[6] );
+  VERIFY( out[6] == src[5] );
+  VERIFY( out[7] == src[8] );
+  VERIFY( out[8] == src[7] );
+  VERIFY( out[9] == src[10] );
+  VERIFY( out[10] == src[9] );
+}
+
+void
+test03()
+{
+#ifdef _GLIBCXX_USE_WCHAR_T
+  mbstate_t st{}; // codecvt members require an initial conversion state
+  constexpr codecvt_mode m = std::consume_header|std::generate_header;
+  codecvt_utf16<wchar_t, 0x10FFFF, m> conv;
+  const char src[] = "-\xFE\xFF\0\x61\xAB\xCD\xD8\x08\xDF\x45";
+  const size_t in_len = sizeof(wchar_t) == 4 ? 11 : 7;
+  const size_t out_len = sizeof(wchar_t) == 4 ? 3 : 2;
+  const char* const src_end = src + in_len;
+  // A 2-byte wchar_t stops before the surrogate pair (in_len 7, out_len 2).
+  int len = conv.length(st, src + 1, src_end, 1);
+  VERIFY( len == 4 );
+  len = conv.length(st, src + 1, src_end, 2);
+  VERIFY( len == 6 );
+  if (sizeof(wchar_t) == 4)
+  {
+    len = conv.length(st, src + 1, src_end, -1ul);
+    VERIFY( len == 10 );
+  }
+
+  wchar_t dst[out_len];
+  wchar_t* const dst_end = dst + out_len;
+  wchar_t* dst_next;
+  const char* src_cnext;
+  auto res = conv.in(st, src + 1, src_end, src_cnext, dst, dst_end, dst_next);
+  VERIFY( res == codecvt_base::ok );
+  VERIFY( dst[0] == 0x0061 );
+  VERIFY( dst[1] == 0xabcd );
+  if (sizeof(wchar_t) == 4)
+    VERIFY( dst[2] == 0x012345 );
+  VERIFY( src_cnext == src_end );
+  VERIFY( dst_next == dst_end );
+  // Re-encode at out + 1; the result must reproduce src byte for byte.
+  char out[sizeof(src)] = { src[0] };
+  char* const out_end = out + in_len;
+  char* out_next;
+  const wchar_t* dst_cnext;
+  res = conv.out(st, dst, dst_end, dst_cnext, out + 1, out_end, out_next);
+  VERIFY( res == codecvt_base::ok );
+  VERIFY( out_next == out_end );
+  VERIFY( dst_cnext == dst_end );
+  VERIFY( out[1] == src[1] );
+  VERIFY( out[2] == src[2] );
+  VERIFY( out[3] == src[3] );
+  VERIFY( out[4] == src[4] );
+  VERIFY( out[5] == src[5] );
+  VERIFY( out[6] == src[6] );
+  if (sizeof(wchar_t) == 4)
+  {
+    VERIFY( out[7] == src[7] );
+    VERIFY( out[8] == src[8] );
+    VERIFY( out[9] == src[9] );
+    VERIFY( out[10] == src[10] );
+  }
+
+  codecvt_utf16<wchar_t, 0x10FFFF, m|std::little_endian> conv_le;
+  // Same input through the little_endian facet: identical decoded values.
+  len = conv_le.length(st, src + 1, src_end, 1);
+  VERIFY( len == 4 );
+  len = conv_le.length(st, src + 1, src_end, 2);
+  VERIFY( len == 6 );
+  if (sizeof(wchar_t) == 4)
+  {
+    len = conv_le.length(st, src + 1, src_end, -1ul);
+    VERIFY( len == 10 );
+  }
+
+  res = conv_le.in(st, src + 1, src_end, src_cnext, dst, dst_end, dst_next);
+  VERIFY( res == codecvt_base::ok );
+  VERIFY( dst[0] == 0x0061 );
+  VERIFY( dst[1] == 0xabcd );
+  if (sizeof(wchar_t) == 4)
+    VERIFY( dst[2] == 0x012345 );
+  VERIFY( src_cnext == src_end );
+  VERIFY( dst_next == dst_end );
+  // Little-endian output: each 16-bit unit of src byte-swapped, BOM included.
+  res = conv_le.out(st, dst, dst_end, dst_cnext, out + 1, out_end, out_next);
+  VERIFY( res == codecvt_base::ok );
+  VERIFY( out_next == out_end );
+  VERIFY( dst_cnext == dst_end );
+  VERIFY( out[1] == src[2] );
+  VERIFY( out[2] == src[1] );
+  VERIFY( out[3] == src[4] );
+  VERIFY( out[4] == src[3] );
+  VERIFY( out[5] == src[6] );
+  VERIFY( out[6] == src[5] );
+  if (sizeof(wchar_t) == 4)
+  {
+    VERIFY( out[7] == src[8] );
+    VERIFY( out[8] == src[7] );
+    VERIFY( out[9] == src[10] );
+    VERIFY( out[10] == src[9] );
+  }
+#endif
+}
+
+int
+main()
+{
+  test01();
+  test02();
+  test03();
+}
diff --git a/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8/79980.cc b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8/79980.cc

new file mode 100644 (file)

index 0000000..1251acb
--- /dev/null
+++ b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8/79980.cc
@@ -0,0 +1,94 @@
+// Copyright (C) 2017 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
+// { dg-do run { target c++11 } }
+
+#include <codecvt>
+#include <locale>
+#include <string>
+#include <testsuite_hooks.h>
+
+using std::wstring_convert;
+using std::codecvt_utf8;
+
+void
+test01()  // codecvt_utf8<char16_t>: failed conversions in either direction must yield the error strings.
+{
+  std::string src = u8"1234\U00001111\U0001ffff";  // four ASCII chars, U+1111, then non-BMP U+1FFFF
+  wstring_convert<codecvt_utf8<char16_t>, char16_t> c("bad", u"BAD");  // arguments are the byte and wide error strings
+
+  // UTF-8 to UCS-2 conversion should fail on a character outside the BMP (U+1FFFF here).
+  auto ucs2 = c.from_bytes(src);
+  VERIFY( ucs2 == u"BAD" );
+  VERIFY( c.converted() == 7 );  // 4 ASCII bytes + the 3-byte encoding of U+1111 precede the failure
+
+  // UCS-2 to UTF-8 conversion should fail on invalid UCS-2 input (the surrogate pair of U+1FFFF):
+  std::u16string utf16 = u"1234\U00001111\U0001ffff";
+  auto out = c.to_bytes(utf16);
+  VERIFY( out == "bad" );
+  VERIFY( c.converted() == 5 );  // four ASCII code units + U+1111 precede the failure
+
+  // It should also fail on an incomplete surrogate pair (and not return a partial result):
+  out = c.to_bytes(utf16.substr(0, utf16.size()-1));  // input with its final code unit dropped
+  VERIFY( out == "bad" );
+  VERIFY( c.converted() == 5 );
+}
+
+void
+test02()  // codecvt_utf8<char16_t, 0x1000>: from_bytes must fail at the first character above Maxcode.
+{
+  std::string src = u8"1234\U00001111\U0001ffff";  // U+1111 is the first character above 0x1000
+  wstring_convert<codecvt_utf8<char16_t, 0x1000>, char16_t> c("bad", u"BAD");
+
+  // UTF-8 to UCS-2 conversion should fail on a character above Maxcode=0x1000.
+  auto ucs2 = c.from_bytes(src);
+  VERIFY( ucs2 == u"BAD" );
+  VERIFY( c.converted() == 4 );  // only the four ASCII bytes precede the failing U+1111
+}
+
+void
+test03()  // codecvt_utf8<char32_t, 0x10000>: from_bytes must fail at the first character above Maxcode.
+{
+  std::string src = u8"1234\U00001111\U0001ffff";  // U+1FFFF is the first character above 0x10000
+  wstring_convert<codecvt_utf8<char32_t, 0x10000>, char32_t> c("bad", U"BAD");
+
+  // UTF-8 to UCS-4 conversion should fail on a character above Maxcode=0x10000.
+  auto ucs4 = c.from_bytes(src);
+  VERIFY( ucs4 == U"BAD" );
+  VERIFY( c.converted() == 7 );  // 4 ASCII bytes + the 3-byte encoding of U+1111 precede the failure
+}
+
+void
+test04()  // codecvt_utf8<char32_t, 0x1000>: from_bytes must fail at the first character above Maxcode.
+{
+  std::string src = u8"1234\U00001111\U0001ffff";  // U+1111 is the first character above 0x1000
+  wstring_convert<codecvt_utf8<char32_t, 0x1000>, char32_t> c("bad", U"BAD");
+
+  // UTF-8 to UCS-4 conversion should fail on a character above Maxcode=0x1000.
+  auto ucs4 = c.from_bytes(src);
+  VERIFY( ucs4 == U"BAD" );
+  VERIFY( c.converted() == 4 );  // only the four ASCII bytes precede the failing U+1111
+}
+
+int
+main()  // Test driver: runs the four test cases of this file in sequence.
+{
+  test01();
+  test02();
+  test03();
+  test04();
+}
diff --git a/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8/members.cc b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8/members.cc

new file mode 100644 (file)

index 0000000..baeb049
--- /dev/null
+++ b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8/members.cc
@@ -0,0 +1,81 @@
+// Copyright (C) 2017 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
+// { dg-do run { target c++11 } }
+
+#include <codecvt>
+#include <testsuite_hooks.h>
+
+const int bomlen = 3; // UTF-8 BOM is 24 bits, i.e. three bytes
+
+void
+test01()  // Checks always_noconv(), encoding() and max_length() of codecvt_utf8<char16_t>.
+{
+  const int maxlen = 3;  // expected max_length() of the facet without consume_header
+
+  std::codecvt_utf8<char16_t> c;
+  VERIFY( c.always_noconv() == false );
+  VERIFY( c.encoding() == 0 );
+  VERIFY( c.max_length() == maxlen );
+
+  std::codecvt_utf8<char16_t, 0x10ffff, std::consume_header> c_bom;  // facet with std::consume_header set
+  VERIFY( c_bom.always_noconv() == false );
+  VERIFY( c_bom.encoding() == 0 );
+  VERIFY( c_bom.max_length() == (maxlen + bomlen) );  // expected to grow by the length of the BOM
+}
+
+void
+test02()  // Checks always_noconv(), encoding() and max_length() of codecvt_utf8<char32_t>.
+{
+  const int maxlen = 4;  // expected max_length() of the facet without consume_header
+
+  std::codecvt_utf8<char32_t> c;
+  VERIFY( c.always_noconv() == false );
+  VERIFY( c.encoding() == 0 );
+  VERIFY( c.max_length() == maxlen );
+
+  std::codecvt_utf8<char32_t, 0x10ffff, std::consume_header> c_bom;  // facet with std::consume_header set
+  VERIFY( c_bom.always_noconv() == false );
+  VERIFY( c_bom.encoding() == 0 );
+  VERIFY( c_bom.max_length() == (maxlen + bomlen) );  // expected to grow by the length of the BOM
+}
+
+void
+test03()  // Checks always_noconv(), encoding() and max_length() of codecvt_utf8<wchar_t>.
+{
+#ifdef _GLIBCXX_USE_WCHAR_T  // the body is compiled only when this macro is defined
+  const int maxlen = sizeof(wchar_t) == 4 ? 4 : 3;  // expectation depends on the width of wchar_t
+
+  std::codecvt_utf8<wchar_t> c;
+  VERIFY( c.always_noconv() == false );
+  VERIFY( c.encoding() == 0 );
+  VERIFY( c.max_length() == maxlen );
+
+  std::codecvt_utf8<wchar_t, 0x10ffff, std::consume_header> c_bom;  // facet with std::consume_header set
+  VERIFY( c_bom.always_noconv() == false );
+  VERIFY( c_bom.encoding() == 0 );
+  VERIFY( c_bom.max_length() == (maxlen + bomlen) );  // expected to grow by the length of the BOM
+#endif
+}
+
+int
+main()  // Test driver: runs the three test cases of this file in sequence.
+{
+  test01();
+  test02();
+  test03();
+}
diff --git a/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8_utf16/79511.cc b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8_utf16/79511.cc

new file mode 100644 (file)

index 0000000..5555bcb
--- /dev/null
+++ b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8_utf16/79511.cc
@@ -0,0 +1,60 @@
+// Copyright (C) 2017 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
+// { dg-do run { target c++11 } }
+
+#include <locale>
+#include <codecvt>
+#include <testsuite_hooks.h>
+
+// PR libstdc++/79511
+
+template<typename ElemT>  // ElemT: character type of the returned string
+  std::basic_string<ElemT> conv(const char* src)  // returns src converted by wstring_convert over codecvt_utf8_utf16<ElemT>
+  {
+    std::wstring_convert<std::codecvt_utf8_utf16<ElemT>, ElemT> conv;  // local converter; shadows this function's name
+    return conv.from_bytes(src);
+  }
+
+void
+test01()  // The UTF-8 encoding of U+FFFF must convert to a single 0xffff element for each character type.
+{
+  static char const src[] = "\xEF\xBF\xBF";  // UTF-8 bytes of U+FFFF
+  VERIFY( conv<char16_t>(src) == u"\xffff" );
+  VERIFY( conv<char32_t>(src) == U"\xffff" );
+#ifdef _GLIBCXX_USE_WCHAR_T  // wchar_t check is compiled only when this macro is defined
+  VERIFY( conv<wchar_t>(src) == L"\xffff" );
+#endif
+}
+
+void
+test02()  // The UTF-8 encoding of U+20AC must convert to a single 0x20ac element for each character type.
+{
+  static char const src[] = "\xE2\x82\xAC";  // UTF-8 bytes of U+20AC
+  VERIFY( conv<char16_t>(src) == u"\x20ac" );
+  VERIFY( conv<char32_t>(src) == U"\x20ac" );
+#ifdef _GLIBCXX_USE_WCHAR_T  // wchar_t check is compiled only when this macro is defined
+  VERIFY( conv<wchar_t>(src) == L"\x20ac" );
+#endif
+}
+
+int
+main()  // Test driver: runs both test cases of this file in sequence.
+{
+  test01();
+  test02();
+}
diff --git a/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8_utf16/members.cc b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8_utf16/members.cc

new file mode 100644 (file)

index 0000000..8fcdfff
--- /dev/null
+++ b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8_utf16/members.cc
@@ -0,0 +1,76 @@
+// Copyright (C) 2017 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
+// { dg-do run { target c++11 } }
+
+#include <codecvt>
+#include <testsuite_hooks.h>
+
+const int bomlen = 3; // UTF-8 BOM is 24 bits, i.e. three bytes
+const int maxlen = 4; // expected max_length() of a facet without consume_header
+
+void
+test01()  // Checks always_noconv(), encoding() and max_length() of codecvt_utf8_utf16<char16_t>.
+{
+  std::codecvt_utf8_utf16<char16_t> c;
+  VERIFY( c.always_noconv() == false );
+  VERIFY( c.encoding() == 0 );
+  VERIFY( c.max_length() == maxlen );
+
+  std::codecvt_utf8_utf16<char16_t, 0x10ffff, std::consume_header> c_bom;  // facet with std::consume_header set
+  VERIFY( c_bom.always_noconv() == false );
+  VERIFY( c_bom.encoding() == 0 );
+  VERIFY( c_bom.max_length() == (maxlen + bomlen) );  // expected to grow by the length of the BOM
+}
+
+void
+test02()  // Checks always_noconv(), encoding() and max_length() of codecvt_utf8_utf16<char32_t>.
+{
+  std::codecvt_utf8_utf16<char32_t> c;
+  VERIFY( c.always_noconv() == false );
+  VERIFY( c.encoding() == 0 );
+  VERIFY( c.max_length() == maxlen );
+
+  std::codecvt_utf8_utf16<char32_t, 0x10ffff, std::consume_header> c_bom;  // facet with std::consume_header set
+  VERIFY( c_bom.always_noconv() == false );
+  VERIFY( c_bom.encoding() == 0 );
+  VERIFY( c_bom.max_length() == (maxlen + bomlen) );  // expected to grow by the length of the BOM
+}
+
+void
+test03()  // Checks always_noconv(), encoding() and max_length() of codecvt_utf8_utf16<wchar_t>.
+{
+#ifdef _GLIBCXX_USE_WCHAR_T  // the body is compiled only when this macro is defined
+  std::codecvt_utf8_utf16<wchar_t> c;
+  VERIFY( c.always_noconv() == false );
+  VERIFY( c.encoding() == 0 );
+  VERIFY( c.max_length() == maxlen );
+
+  std::codecvt_utf8_utf16<wchar_t, 0x10ffff, std::consume_header> c_bom;  // facet with std::consume_header set
+  VERIFY( c_bom.always_noconv() == false );
+  VERIFY( c_bom.encoding() == 0 );
+  VERIFY( c_bom.max_length() == (maxlen + bomlen) );  // expected to grow by the length of the BOM
+#endif
+}
+
+int
+main()  // Test driver: runs the three test cases of this file in sequence.
+{
+  test01();
+  test02();
+  test03();
+}
author	Jonathan Wakely <jwakely@redhat.com>
	Fri, 17 Mar 2017 19:28:42 +0000 (19:28 +0000)
committer	Jonathan Wakely <redi@gcc.gnu.org>
	Fri, 17 Mar 2017 19:28:42 +0000 (19:28 +0000)
libstdc++-v3/ChangeLog		patch \| blob \| blame \| history
libstdc++-v3/include/bits/locale_conv.h		patch \| blob \| blame \| history
libstdc++-v3/src/c++11/codecvt.cc		patch \| blob \| blame \| history
libstdc++-v3/testsuite/22_locale/codecvt/char16_t.cc		patch \| blob \| blame \| history
libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/79980.cc	[new file with mode: 0644]	patch \| blob
libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/80041.cc	[new file with mode: 0644]	patch \| blob
libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/members.cc	[new file with mode: 0644]	patch \| blob
libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/misaligned.cc	[new file with mode: 0644]	patch \| blob
libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8/79980.cc	[new file with mode: 0644]	patch \| blob
libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8/members.cc	[new file with mode: 0644]	patch \| blob
libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8_utf16/79511.cc	[new file with mode: 0644]	patch \| blob
libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8_utf16/members.cc	[new file with mode: 0644]	patch \| blob