constexpr _Utf_iterator&
operator--() requires bidirectional_iterator<_Iter>
{
- if (!_M_buf_index && _M_curr() != _M_first())
- _M_read_reverse();
- else if (_M_buf_index)
+ if (_M_buf_index > 0)
--_M_buf_index;
+ else if (_M_curr() != _M_first())
+ {
+ _M_read_reverse();
+ _M_buf_index = _M_buf_last - 1;
+ ranges::advance(_M_curr(), -_M_to_increment);
+ }
+ // else erroneous, but ignored for now.
return *this;
}
}
constexpr void
- _M_read_reverse(); // TODO
+ _M_read_reverse() requires bidirectional_iterator<_Iter>
+ {
+ if constexpr (sizeof(_FromFmt) == sizeof(uint8_t))
+ _M_read_reverse_utf8();
+ else if constexpr (sizeof(_FromFmt) == sizeof(uint16_t))
+ _M_read_reverse_utf16();
+ else
+ {
+ static_assert(sizeof(_FromFmt) == sizeof(uint32_t));
+ _M_read_reverse_utf32();
+ }
+ }
template<typename>
struct _Guard
_It _M_orig;
};
- constexpr void
+ constexpr char32_t
_M_read_utf8()
{
_Guard<_Iter> __g{this, _M_curr()};
__c = _S_error();
_M_update(__c, __to_incr);
+
+ return __c;
}
constexpr void
_M_update(__c, 1);
}
+ constexpr void
+ _M_read_reverse_utf8() requires bidirectional_iterator<_Iter>
+ {
+ const auto __first = _M_first();
+ auto __curr = _M_curr();
+ // The code point we decode:
+ char32_t __c{};
+ // The last code unit read:
+ uint8_t __u = *--__curr;
+ // Count of bytes read:
+ uint8_t __to_incr = 1;
+
+ if (__u <= 0x7F) [[likely]]
+ {
+ _M_update(__u, 1);
+ return;
+ }
+
+ // Continuation bytes match 10xxxxxx
+ auto __is_continuation = [](uint8_t __b) {
+ return (__b & 0xC0) == 0x80;
+ };
+ // 0xC0 and 0xC1 would produce overlong encodings of ASCII characters.
+ // 0xF5-0xFF would produce code points above U+10FFFF
+ auto __is_invalid = [](uint8_t __b) {
+ return (__b & 0xFE) == 0xC0 || __b >= 0xF5;
+ };
+
+ // No valid or invalid multibyte sequence is longer than 4 bytes,
+ // so skip back over at most four bytes.
+ while (__is_continuation(__u) && __to_incr < 4 && __curr != __first)
+ {
+ ++__to_incr;
+ __u = *--__curr;
+ }
+
+ // If the last byte read was a continuation byte then either we read
+ // four continuation bytes, or stopped at the start of the sequence.
+ // Either way, the maximal subparts are the individual continuation
+ // bytes so each one should be replaced with U+FFFD.
+ if (__is_continuation(__u) || __is_invalid(__u)) [[unlikely]]
+ {
+ // Either found four continuation bytes (maximum allowed is three)
+ // or first non-continuation byte is an invalid UTF-8 code unit.
+ _M_update(_S_error(), 1);
+ return;
+ }
+ // __u is a valid start byte so use countl_one to get the expected
+ // length of the multibyte sequence that starts with this byte.
+ int __seq_length = std::countl_one((unsigned char)__u);
+ if (__seq_length < __to_incr) [[unlikely]]
+ {
+ // If the expected number of continuation bytes is less than
+ // the number we found, then the last continuation byte is a
+ // maximal subpart and the decremented iterator points to it.
+ _M_update(_S_error(), 1);
+ return;
+ }
+
+ auto __orig = std::__exchange(_M_curr(), std::move(__curr));
+ if (_M_read_utf8() == _S_error()) [[unlikely]]
+ {
+ if (_M_to_increment < __to_incr) // Read truncated sequence, set
+ _M_to_increment = 1; // curr to last continuation byte.
+ }
+
+ _M_curr() = std::move(__orig);
+ // operator--() will move back by _M_to_increment
+ }
+
+ constexpr void
+ _M_read_reverse_utf16() requires bidirectional_iterator<_Iter>
+ {
+ _Guard<_Iter> __g{this, _M_curr()};
+ char32_t __c{};
+ uint16_t __u = *--_M_curr();
+ uint8_t __to_incr = 1;
+
+ if (__u < 0xD800 || __u > 0xDFFF) [[likely]]
+ __c = __u;
+ else if (__u >= 0xDC00 && _M_curr() != _M_first()) [[likely]]
+ {
+ // read a low surrogate, expect a high surrogate before it.
+ uint16_t __u2 = *--_M_curr();
+ if (__u2 < 0xD800 || __u2 > 0xDC00) [[unlikely]]
+ __c = _S_error(); // unpaired low surrogate
+ else
+ {
+ __to_incr = 2;
+ uint32_t __x = (__u2 & 0x3F) << 10 | (__u & 0x3FF);
+ uint32_t __w = (__u2 >> 6) & 0x1F;
+ __c = (__w + 1) << 16 | __x;
+ }
+ }
+ else
+ __c = _S_error(); // unpaired surrogate
+
+ _M_update(__c, __to_incr);
+ }
+
+ constexpr void
+ _M_read_reverse_utf32() requires bidirectional_iterator<_Iter>
+ {
+ _Guard<_Iter> __g{this, _M_curr()};
+ char32_t __c = *--_M_curr();
+ if (!__is_scalar_value(__c)) [[unlikely]]
+ __c = _S_error();
+ _M_update(__c, 1);
+ }
+
// Encode the code point __c as one or more code units in _M_buf.
constexpr void
_M_update(char32_t __c, uint8_t __to_incr)
namespace uc = std::__unicode;
using namespace std::string_view_literals;
+template<std::ranges::range View>
+constexpr void
+compare(View v, std::basic_string_view<std::ranges::range_value_t<View>> s)
+{
+ long size = s.size();
+ VERIFY( std::ranges::distance(v) == size );
+ VERIFY( std::ranges::equal(v, s) );
+ auto rev = std::views::reverse(v);
+ VERIFY( std::ranges::distance(rev) == size );
+ VERIFY( std::ranges::equal(rev, s | std::views::reverse) );
+}
+
constexpr void
test_utf8_to_utf8()
{
const auto s8 = u8"£🇬🇧 €🇪🇺 æбçδé ♠♥♦♣ 🤡"sv;
uc::_Utf8_view v(s8);
- VERIFY( std::ranges::distance(v) == s8.size() );
- VERIFY( std::ranges::equal(v, s8) );
+ compare(v, s8);
}
constexpr void
const auto s8 = u8"£🇬🇧 €🇪🇺 æбçδé ♠♥♦♣ 🤡"sv;
const std::u16string_view s16 = u"£🇬🇧 €🇪🇺 æбçδé ♠♥♦♣ 🤡";
uc::_Utf16_view v(s8);
- VERIFY( std::ranges::distance(v) == s16.size() );
- VERIFY( std::ranges::equal(v, s16) );
+ compare(v, s16);
}
constexpr void
const auto s8 = u8"£🇬🇧 €🇪🇺 æбçδé ♠♥♦♣ 🤡"sv;
const auto s32 = U"£🇬🇧 €🇪🇺 æбçδé ♠♥♦♣ 🤡"sv;
uc::_Utf32_view v(s8);
- VERIFY( std::ranges::distance(v) == s32.size() );
- VERIFY( std::ranges::equal(v, s32) );
+ compare(v, s32);
}
constexpr void
test_illformed_utf8()
{
uc::_Utf32_view v("\xa3 10.99 \xee \xdd"sv);
- VERIFY( std::ranges::equal(v, U"\uFFFD 10.99 \uFFFD \uFFFD"sv) );
+ compare(v, U"\uFFFD 10.99 \uFFFD \uFFFD"sv);
uc::_Utf16_view v2(" \xf8\x80\x80\x80 "sv);
- VERIFY( std::ranges::distance(v2) == 6 );
- VERIFY( std::ranges::equal(v2, U" \uFFFD\uFFFD\uFFFD\uFFFD "sv) );
+ compare(v2, u" \uFFFD\uFFFD\uFFFD\uFFFD "sv);
// Examples of U+FFFD substitution from Unicode standard.
uc::_Utf8_view v3("\xc0\xaf\xe0\x80\xbf\xf0\x81\x82\x41"sv); // Table 3-8
- VERIFY( std::ranges::equal(v3, u8"\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\x41"sv) );
+ compare(v3, u8"\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\x41"sv);
uc::_Utf8_view v4("\xed\xa0\x80\xed\xbf\xbf\xed\xaf\x41"sv); // Table 3-9
- VERIFY( std::ranges::equal(v4, u8"\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\x41"sv) );
+ compare(v4, u8"\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\x41"sv);
uc::_Utf8_view v5("\xf4\x91\x92\x93\xff\x41\x80\xbf\x42"sv); // Table 3-10
- VERIFY( std::ranges::equal(v5, u8"\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\x41\uFFFD\uFFFD\x42"sv) );
+ compare(v5, u8"\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\x41\uFFFD\uFFFD\x42"sv);
uc::_Utf8_view v6("\xe1\x80\xe2\xf0\x91\x92\xf1\xbf\x41"sv); // Table 3-11
- VERIFY( std::ranges::equal(v6, u8"\uFFFD\uFFFD\uFFFD\uFFFD\x41"sv) );
+ compare(v6, u8"\uFFFD\uFFFD\uFFFD\uFFFD\x41"sv);
uc::_Utf32_view v7("\xe1\x80"sv);
- VERIFY( std::ranges::equal(v7, U"\uFFFD"sv) );
+ compare(v7, U"\uFFFD"sv);
uc::_Utf32_view v8("\xf1\x80"sv);
- VERIFY( std::ranges::equal(v8, U"\uFFFD"sv) );
+ compare(v8, U"\uFFFD"sv);
uc::_Utf32_view v9("\xf1\x80\x80"sv);
- VERIFY( std::ranges::equal(v9, U"\uFFFD"sv) );
+ compare(v9, U"\uFFFD"sv);
+
+ uc::_Utf32_view v10("\xcf\x80\x80\x81\x82\x83 \x84\x85\x86\x87\x88 "sv);
+ compare(v10, U"\u03C0\uFFFD\uFFFD\uFFFD\uFFFD \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD "sv);
+ uc::_Utf16_view v11("\xcf\x80\x80\x81\x82\x83 \x84\x85\x86\x87\x88 "sv);
+ compare(v11, u"\u03C0\uFFFD\uFFFD\uFFFD\uFFFD \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD "sv);
+ uc::_Utf8_view v12("\xcf\x80\x80\x81\x82\x83 \x84\x85\x86\x87\x88 "sv);
+ compare(v12, u8"\u03C0\uFFFD\uFFFD\uFFFD\uFFFD \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD "sv);
}
constexpr void
{
std::u16string_view s = u"\N{CLOWN FACE}";
std::u16string_view r = u"\uFFFD";
- VERIFY( std::ranges::equal(uc::_Utf16_view(s.substr(0, 1)), r) );
- VERIFY( std::ranges::equal(uc::_Utf16_view(s.substr(1, 1)), r) );
+ compare(uc::_Utf16_view(s.substr(0, 1)), r);
+ compare(uc::_Utf16_view(s.substr(1, 1)), r);
std::array s2{ s[0], s[0] };
- VERIFY( std::ranges::equal(uc::_Utf16_view(s2), u"\uFFFD\uFFFD"sv) );
+ compare(uc::_Utf16_view(s2), u"\uFFFD\uFFFD"sv);
std::array s3{ s[0], s[0], s[1] };
- VERIFY( std::ranges::equal(uc::_Utf16_view(s3), u"\uFFFD\N{CLOWN FACE}"sv) );
+ compare(uc::_Utf16_view(s3), u"\uFFFD\N{CLOWN FACE}"sv);
std::array s4{ s[1], s[0] };
- VERIFY( std::ranges::equal(uc::_Utf16_view(s4), u"\uFFFD\uFFFD"sv) );
+ compare(uc::_Utf16_view(s4), u"\uFFFD\uFFFD"sv);
std::array s5{ s[1], s[0], s[1] };
- VERIFY( std::ranges::equal(uc::_Utf16_view(s5), u"\uFFFD\N{CLOWN FACE}"sv) );
+ compare(uc::_Utf16_view(s5), u"\uFFFD\N{CLOWN FACE}"sv);
}
constexpr void
test_illformed_utf32()
{
std::u32string_view s = U"\x110000";
- VERIFY( std::ranges::equal(uc::_Utf32_view(s), U"\uFFFD"sv) );
+ compare(uc::_Utf32_view(s), U"\uFFFD"sv);
s = U"\xFFFFFF";
- VERIFY( std::ranges::equal(uc::_Utf32_view(s), U"\uFFFD"sv) );
+ compare(uc::_Utf32_view(s), U"\uFFFD"sv);
s = U"\xFFFFFFF0";
- VERIFY( std::ranges::equal(uc::_Utf32_view(s), U"\uFFFD"sv) );
+ compare(uc::_Utf32_view(s), U"\uFFFD"sv);
}
constexpr void
iter++;
VERIFY( iter == v.end() );
VERIFY( *iter == U'4' );
+ std::ranges::advance(iter, -4);
+ VERIFY( *iter == U'1' );
+ // Incrementing before begin has well-defined behaviour.
+ iter--;
+ VERIFY( *iter == U'1' );
+ iter--;
+ VERIFY( *iter == U'1' );
std::string_view empty;
uc::_Utf32_view v2(empty);
iter++;
VERIFY( iter2 == v2.end() );
VERIFY( *iter2 == U'\0' );
+ iter--;
+ VERIFY( iter2 == v2.end() );
+ VERIFY( *iter2 == U'\0' );
}
int main()