From: Tim Shen Date: Sat, 14 Sep 2013 14:23:44 +0000 (+0000) Subject: regex.h (regex_match<>, [...]): Change regex_executor caller. X-Git-Tag: releases/gcc-4.9.0~4066 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=7b86458e38ee3c88ca4e222c85ef6fa883267315;p=thirdparty%2Fgcc.git regex.h (regex_match<>, [...]): Change regex_executor caller. 2013-09-14 Tim Shen * include/bits/regex.h (regex_match<>, regex_search<>): Change regex_executor caller. Now use their return value instead of checking __m[0].matched to find out if it's successful. (regex_search<>): Move the search logic to regex_executor. * include/bits/regex_automaton.h: Add some new _Opcode. Refactor _NFA::_M_insert_*. * include/bits/regex_automaton.tcc: Add DEBUG dump for new _Opcode. Refactor _NFA::_M_insert_*. * include/bits/regex_compiler.h (_Compiler<>::_M_get_nfa): Use make_shared instead of construct by hand. * include/bits/regex_compiler.tcc: Implement _Compiler<>::_M_assertion. * include/bits/regex_constants.h: Fix indentation and line breaking. * include/bits/regex_executor.h: Add _ResultsEntry to support greedy/ungreedy mode. Move regex_search logic here. * include/bits/regex_executor.tcc: Implement assertions and greedy/ungreedy matching. * include/bits/regex_scanner.h: Add a new token _S_token_ungreedy. * include/bits/regex_scanner.tcc: Parse a new token _S_token_ungreedy. * testsuite/28_regex/algorithms/regex_search/ecma/assertion.cc: New. * testsuite/28_regex/algorithms/regex_search/ecma/greedy.cc: New. * testsuite/28_regex/algorithms/regex_search/ecma/string_01.cc: Fix comment. From-SVN: r202591 --- diff --git a/libstdc++-v3/ChangeLog b/libstdc++-v3/ChangeLog index 8e0bfb70f8c9..96891056ac7e 100644 --- a/libstdc++-v3/ChangeLog +++ b/libstdc++-v3/ChangeLog @@ -1,3 +1,28 @@ +2013-09-14 Tim Shen + + * include/bits/regex.h (regex_match<>, regex_search<>): + Change regex_executor caller. Now use their return value instead + of checking __m[0].matched to find out if it's successful. 
+ (regex_search<>): Move the search logic to regex_executor. + * include/bits/regex_automaton.h: Add some new _Opcode. Refactor + _NFA::_M_insert_*. + * include/bits/regex_automaton.tcc: Add DEBUG dump for new + _Opcode. Refactor _NFA::_M_insert_*. + * include/bits/regex_compiler.h (_Compiler<>::_M_get_nfa): + Use make_shared instead of construct by hand. + * include/bits/regex_compiler.tcc: Implement _Compiler<>::_M_assertion. + * include/bits/regex_constants.h: Fix indentation and line breaking. + * include/bits/regex_executor.h: Add _ResultsEntry to support + greedy/ungreedy mode. Move regex_search logic here. + * include/bits/regex_executor.tcc: Implement assertions and + greedy/ungreedy matching. + * include/bits/regex_scanner.h: Add a new token _S_token_ungreedy. + * include/bits/regex_scanner.tcc: Parse a new token _S_token_ungreedy. + * testsuite/28_regex/algorithms/regex_search/ecma/assertion.cc: New. + * testsuite/28_regex/algorithms/regex_search/ecma/greedy.cc: New. + * testsuite/28_regex/algorithms/regex_search/ecma/string_01.cc: + Fix comment. 
+ 2013-09-13 Paolo Carlini PR libstdc++/58415 diff --git a/libstdc++-v3/include/bits/regex.h b/libstdc++-v3/include/bits/regex.h index 412465adfa29..659bee131208 100644 --- a/libstdc++-v3/include/bits/regex.h +++ b/libstdc++-v3/include/bits/regex.h @@ -2106,14 +2106,16 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION template friend class __detail::_BFSExecutor; - template + template friend bool regex_match(_Bp, _Bp, match_results<_Bp, _Ap>&, const basic_regex<_Ch_type, _Rx_traits>&, regex_constants::match_flag_type); - template + template friend bool regex_search(_Bp, _Bp, match_results<_Bp, _Ap>&, const basic_regex<_Ch_type, @@ -2213,8 +2215,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION { if (__re._M_automaton == nullptr) return false; - __detail::__get_executor(__s, __e, __m, __re, __flags)->_M_match(); - if (__m.size() > 0 && __m[0].matched) + if (__detail::__get_executor(__s, __e, __m, __re, __flags)->_M_match()) { for (auto __it : __m) if (!__it.matched) @@ -2373,29 +2374,22 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION { if (__re._M_automaton == nullptr) return false; - auto __cur = __first; - // Continue when __cur == __last - do + if (__detail::__get_executor(__first, __last, __m, __re, __flags) + ->_M_search()) { - __detail::__get_executor(__cur, __last, __m, __re, __flags) - ->_M_search_from_first(); - if (__m.size() > 0 && __m[0].matched) - { - for (auto __it : __m) - if (!__it.matched) - __it.first = __it.second = __last; - __m.at(__m.size()).first = __first; - __m.at(__m.size()).second = __m[0].first; - __m.at(__m.size()+1).first = __m[0].second; - __m.at(__m.size()+1).second = __last; - __m.at(__m.size()).matched = - (__m.prefix().first != __m.prefix().second); - __m.at(__m.size()+1).matched = - (__m.suffix().first != __m.suffix().second); - return true; - } + for (auto __it : __m) + if (!__it.matched) + __it.first = __it.second = __last; + __m.at(__m.size()).first = __first; + __m.at(__m.size()).second = __m[0].first; + __m.at(__m.size()+1).first = __m[0].second; + 
__m.at(__m.size()+1).second = __last; + __m.at(__m.size()).matched = + (__m.prefix().first != __m.prefix().second); + __m.at(__m.size()+1).matched = + (__m.suffix().first != __m.suffix().second); + return true; } - while (__cur++ != __last); return false; } diff --git a/libstdc++-v3/include/bits/regex_automaton.h b/libstdc++-v3/include/bits/regex_automaton.h index 77551756f65a..94a14ce96aa0 100644 --- a/libstdc++-v3/include/bits/regex_automaton.h +++ b/libstdc++-v3/include/bits/regex_automaton.h @@ -51,14 +51,18 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION /// that represents the regular expression. enum _Opcode { - _S_opcode_unknown = 0, - _S_opcode_alternative = 1, - _S_opcode_backref = 2, - _S_opcode_subexpr_begin = 4, - _S_opcode_subexpr_end = 5, - _S_opcode_dummy = 6, - _S_opcode_match = 100, - _S_opcode_accept = 255 + _S_opcode_unknown, + _S_opcode_alternative, + _S_opcode_backref, + _S_opcode_line_begin_assertion, + _S_opcode_line_end_assertion, + _S_opcode_word_boundry, + _S_opcode_subexpr_lookahead, + _S_opcode_subexpr_begin, + _S_opcode_subexpr_end, + _S_opcode_dummy, + _S_opcode_match, + _S_opcode_accept, }; template @@ -72,35 +76,25 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _StateIdT _M_next; // outgoing transition union // Since they are mutually exclusive. { - _StateIdT _M_alt; // for _S_opcode_alternative unsigned int _M_subexpr; // for _S_opcode_subexpr_* unsigned int _M_backref_index; // for _S_opcode_backref + struct + { + // for _S_opcode_alternative. 
+ _StateIdT _M_quant_index; + // for _S_opcode_alternative or _S_opcode_subexpr_lookahead + _StateIdT _M_alt; + // for _S_opcode_word_boundry or _S_opcode_subexpr_lookahead or + // quantifiers(ungreedy if set true) + bool _M_neg; + }; }; - _MatcherT _M_matches; // for _S_opcode_match + _MatcherT _M_matches; // for _S_opcode_match explicit _State(_OpcodeT __opcode) : _M_opcode(__opcode), _M_next(_S_invalid_state_id) { } - _State(const _MatcherT& __m) - : _M_opcode(_S_opcode_match), _M_next(_S_invalid_state_id), - _M_matches(__m) - { } - - _State(_OpcodeT __opcode, unsigned __index) - : _M_opcode(__opcode), _M_next(_S_invalid_state_id) - { - if (__opcode == _S_opcode_subexpr_begin - || __opcode == _S_opcode_subexpr_end) - _M_subexpr = __index; - else if (__opcode == _S_opcode_backref) - _M_backref_index = __index; - } - - _State(_StateIdT __next, _StateIdT __alt) - : _M_opcode(_S_opcode_alternative), _M_next(__next), _M_alt(__alt) - { } - #ifdef _GLIBCXX_DEBUG std::ostream& _M_print(std::ostream& ostr) const; @@ -141,7 +135,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _NFA(_FlagT __f) : _M_flags(__f), _M_start_state(0), _M_subexpr_count(0), - _M_has_backref(false) + _M_has_backref(false), _M_quant_count(0) { } _FlagT @@ -163,23 +157,30 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _StateIdT _M_insert_accept() { - this->push_back(_StateT(_S_opcode_accept)); - _M_accepting_states.insert(this->size()-1); - return this->size()-1; + auto __ret = _M_insert_state(_StateT(_S_opcode_accept)); + _M_accepting_states.insert(__ret); + return __ret; } _StateIdT - _M_insert_alt(_StateIdT __next, _StateIdT __alt) + _M_insert_alt(_StateIdT __next, _StateIdT __alt, bool __neg) { - this->push_back(_StateT(__next, __alt)); - return this->size()-1; + _StateT __tmp(_S_opcode_alternative); + // It labels every quantifier to make greedy comparison easier in BFS + // approach. 
+ __tmp._M_quant_index = _M_quant_count++; + __tmp._M_next = __next; + __tmp._M_alt = __alt; + __tmp._M_neg = __neg; + return _M_insert_state(__tmp); } _StateIdT _M_insert_matcher(_MatcherT __m) { - this->push_back(_StateT(__m)); - return this->size()-1; + _StateT __tmp(_S_opcode_match); + __tmp._M_matches = __m; + return _M_insert_state(__tmp); } _StateIdT @@ -187,28 +188,52 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION { auto __id = _M_subexpr_count++; _M_paren_stack.push_back(__id); - this->push_back(_StateT(_S_opcode_subexpr_begin, __id)); - return this->size()-1; + _StateT __tmp(_S_opcode_subexpr_begin); + __tmp._M_subexpr = __id; + return _M_insert_state(__tmp); } _StateIdT _M_insert_subexpr_end() { - this->push_back(_StateT(_S_opcode_subexpr_end, _M_paren_stack.back())); + _StateT __tmp(_S_opcode_subexpr_end); + __tmp._M_subexpr = _M_paren_stack.back(); _M_paren_stack.pop_back(); - return this->size()-1; + return _M_insert_state(__tmp); } _StateIdT _M_insert_backref(unsigned int __index); _StateIdT - _M_insert_dummy() + _M_insert_line_begin() + { return _M_insert_state(_StateT(_S_opcode_line_begin_assertion)); } + + _StateIdT + _M_insert_line_end() + { return _M_insert_state(_StateT(_S_opcode_line_end_assertion)); } + + _StateIdT + _M_insert_word_bound(bool __neg) { - this->push_back(_StateT(_S_opcode_dummy)); - return this->size()-1; + _StateT __tmp(_S_opcode_word_boundry); + __tmp._M_neg = __neg; + return _M_insert_state(__tmp); } + _StateIdT + _M_insert_lookahead(_StateIdT __alt, bool __neg) + { + _StateT __tmp(_S_opcode_subexpr_lookahead); + __tmp._M_alt = __alt; + __tmp._M_neg = __neg; + return _M_insert_state(__tmp); + } + + _StateIdT + _M_insert_dummy() + { return _M_insert_state(_StateT(_S_opcode_dummy)); } + _StateIdT _M_insert_state(_StateT __s) { @@ -230,6 +255,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _FlagT _M_flags; _StateIdT _M_start_state; _SizeT _M_subexpr_count; + _SizeT _M_quant_count; bool _M_has_backref; }; diff --git 
a/libstdc++-v3/include/bits/regex_automaton.tcc b/libstdc++-v3/include/bits/regex_automaton.tcc index 2d34b95cdba6..13af984c273e 100644 --- a/libstdc++-v3/include/bits/regex_automaton.tcc +++ b/libstdc++-v3/include/bits/regex_automaton.tcc @@ -80,6 +80,31 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION << __id << " -> " << _M_alt << " [label=\"epsilon\", tailport=\"n\"];\n"; break; + case _S_opcode_backref: + __ostr << __id << " [label=\"" << __id << "\\nBACKREF " + << _M_subexpr << "\"];\n" + << __id << " -> " << _M_next << " [label=\"\"];\n"; + break; + case _S_opcode_line_begin_assertion: + __ostr << __id << " [label=\"" << __id << "\\nLINE_BEGIN \"];\n" + << __id << " -> " << _M_next << " [label=\"epsilon\"];\n"; + break; + case _S_opcode_line_end_assertion: + __ostr << __id << " [label=\"" << __id << "\\nLINE_END \"];\n" + << __id << " -> " << _M_next << " [label=\"epsilon\"];\n"; + break; + case _S_opcode_word_boundry: + __ostr << __id << " [label=\"" << __id << "\\nWORD_BOUNDRY " + << _M_neg << "\"];\n" + << __id << " -> " << _M_next << " [label=\"epsilon\"];\n"; + break; + case _S_opcode_subexpr_lookahead: + __ostr << __id << " [label=\"" << __id << "\\nLOOK_AHEAD\"];\n" + << __id << " -> " << _M_next + << " [label=\"epsilon\", tailport=\"s\"];\n" + << __id << " -> " << _M_alt + << " [label=\"\", tailport=\"n\"];\n"; + break; case _S_opcode_subexpr_begin: __ostr << __id << " [label=\"" << __id << "\\nSBEGIN " << _M_subexpr << "\"];\n" @@ -90,10 +115,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION << _M_subexpr << "\"];\n" << __id << " -> " << _M_next << " [label=\"epsilon\"];\n"; break; - case _S_opcode_backref: - __ostr << __id << " [label=\"" << __id << "\\nBACKREF " - << _M_subexpr << "\"];\n" - << __id << " -> " << _M_next << " [label=\"\"];\n"; + case _S_opcode_dummy: break; case _S_opcode_match: __ostr << __id << " [label=\"" << __id << "\\nMATCH\"];\n" @@ -102,8 +124,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION case _S_opcode_accept: __ostr << __id << " [label=\"" << __id << 
"\\nACC\"];\n" ; break; - case _S_opcode_dummy: - break; default: _GLIBCXX_DEBUG_ASSERT(false); break; @@ -141,8 +161,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION if (__index == __it) __throw_regex_error(regex_constants::error_backref); _M_has_backref = true; - this->push_back(_StateT(_S_opcode_backref, __index)); - return this->size()-1; + _StateT __tmp(_S_opcode_backref); + __tmp._M_backref_index = __index; + return _M_insert_state(__tmp); } template @@ -152,7 +173,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION for (auto& __it : *this) { while (__it._M_next >= 0 && (*this)[__it._M_next]._M_opcode - == _S_opcode_dummy) + == _S_opcode_dummy) __it._M_next = (*this)[__it._M_next]._M_next; if (__it._M_opcode == _S_opcode_alternative) while (__it._M_alt >= 0 && (*this)[__it._M_alt]._M_opcode diff --git a/libstdc++-v3/include/bits/regex_compiler.h b/libstdc++-v3/include/bits/regex_compiler.h index 96a0d2941775..3b85d3a46c3b 100644 --- a/libstdc++-v3/include/bits/regex_compiler.h +++ b/libstdc++-v3/include/bits/regex_compiler.h @@ -56,7 +56,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION std::shared_ptr<_RegexT> _M_get_nfa() const - { return std::shared_ptr<_RegexT>(new _RegexT(_M_nfa)); } + { return make_shared<_RegexT>(_M_nfa); } private: typedef _Scanner<_FwdIter> _ScannerT; diff --git a/libstdc++-v3/include/bits/regex_compiler.tcc b/libstdc++-v3/include/bits/regex_compiler.tcc index a574e8e5ddd7..8dc779b68e17 100644 --- a/libstdc++-v3/include/bits/regex_compiler.tcc +++ b/libstdc++-v3/include/bits/regex_compiler.tcc @@ -96,7 +96,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION __alt2._M_append(__end); _M_stack.push(_StateSeqT(_M_nfa, _M_nfa._M_insert_alt(__alt1._M_start, - __alt2._M_start), + __alt2._M_start, false), __end)); } } @@ -132,25 +132,34 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION return false; } - // TODO Implement it. template bool _Compiler<_FwdIter, _CharT, _TraitsT>:: _M_assertion() { - // temporary place holders. 
if (_M_match_token(_ScannerT::_S_token_line_begin)) - _M_stack.push(_StateSeqT(_M_nfa, _M_nfa._M_insert_dummy())); + _M_stack.push(_StateSeqT(_M_nfa, _M_nfa. + _M_insert_line_begin())); else if (_M_match_token(_ScannerT::_S_token_line_end)) - _M_stack.push(_StateSeqT(_M_nfa, _M_nfa._M_insert_dummy())); + _M_stack.push(_StateSeqT(_M_nfa, _M_nfa. + _M_insert_line_end())); else if (_M_match_token(_ScannerT::_S_token_word_bound)) - _M_stack.push(_StateSeqT(_M_nfa, _M_nfa._M_insert_dummy())); - else if (_M_match_token(_ScannerT::_S_token_neg_word_bound)) - _M_stack.push(_StateSeqT(_M_nfa, _M_nfa._M_insert_dummy())); + // _M_value[0] == 'n' means it's negtive, say "not word boundary". + _M_stack.push(_StateSeqT(_M_nfa, _M_nfa. + _M_insert_word_bound(_M_value[0] == 'n'))); else if (_M_match_token(_ScannerT::_S_token_subexpr_lookahead_begin)) - _M_stack.push(_StateSeqT(_M_nfa, _M_nfa._M_insert_dummy())); - else if (_M_match_token(_ScannerT::_S_token_subexpr_neg_lookahead_begin)) - _M_stack.push(_StateSeqT(_M_nfa, _M_nfa._M_insert_dummy())); + { + auto __neg = _M_value[0] == 'n'; + this->_M_disjunction(); + if (!_M_match_token(_ScannerT::_S_token_subexpr_end)) + __throw_regex_error(regex_constants::error_paren); + auto __tmp = _M_pop(); + __tmp._M_append(_M_nfa._M_insert_accept()); + _M_stack.push( + _StateSeqT( + _M_nfa, + _M_nfa._M_insert_lookahead(__tmp._M_start, __neg))); + } else return false; return true; @@ -161,40 +170,44 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _Compiler<_FwdIter, _CharT, _TraitsT>:: _M_quantifier() { - if (_M_match_token(_ScannerT::_S_token_closure0)) + bool __neg = regex_constants::ECMAScript; + auto __init = [this, &__neg]() { if (_M_stack.empty()) __throw_regex_error(regex_constants::error_badrepeat); + __neg = __neg && _M_match_token(_ScannerT::_S_token_opt); + }; + if (_M_match_token(_ScannerT::_S_token_closure0)) + { + __init(); auto __e = _M_pop(); _StateSeqT __r(_M_nfa, _M_nfa._M_insert_alt(_S_invalid_state_id, - __e._M_start)); + __e._M_start, 
__neg)); __e._M_append(__r); _M_stack.push(__r); } else if (_M_match_token(_ScannerT::_S_token_closure1)) { - if (_M_stack.empty()) - __throw_regex_error(regex_constants::error_badrepeat); + __init(); auto __e = _M_pop(); - __e._M_append(_M_nfa._M_insert_alt(_S_invalid_state_id, __e._M_start)); + __e._M_append(_M_nfa._M_insert_alt(_S_invalid_state_id, __e._M_start, + __neg)); _M_stack.push(__e); } else if (_M_match_token(_ScannerT::_S_token_opt)) { - if (_M_stack.empty()) - __throw_regex_error(regex_constants::error_badrepeat); + __init(); auto __e = _M_pop(); auto __end = _M_nfa._M_insert_dummy(); _StateSeqT __r(_M_nfa, _M_nfa._M_insert_alt(_S_invalid_state_id, - __e._M_start)); + __e._M_start, __neg)); __e._M_append(__end); __r._M_append(__end); _M_stack.push(__r); } else if (_M_match_token(_ScannerT::_S_token_interval_begin)) { - if (_M_stack.empty()) - __throw_regex_error(regex_constants::error_badrepeat); + __init(); if (!_M_match_token(_ScannerT::_S_token_dup_count)) __throw_regex_error(regex_constants::error_badbrace); _StateSeqT __r(_M_pop()); @@ -206,23 +219,27 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION if (_M_match_token(_ScannerT::_S_token_comma)) if (_M_match_token(_ScannerT::_S_token_dup_count)) // {3,7} { - int __n = _M_cur_int_value(10) - __min_rep; - if (__n < 0) - __throw_regex_error(regex_constants::error_badbrace); - auto __end = _M_nfa._M_insert_dummy(); - for (int __i = 0; __i < __n; ++__i) - { + int __n = _M_cur_int_value(10) - __min_rep; + if (__n < 0) + __throw_regex_error(regex_constants::error_badbrace); + auto __end = _M_nfa._M_insert_dummy(); + for (int __i = 0; __i < __n; ++__i) + { auto __tmp = __r._M_clone(); - __e._M_append(_StateSeqT(_M_nfa, _M_nfa. 
- _M_insert_alt(__tmp._M_start, __end), __tmp._M_end)); - } + __e._M_append + (_StateSeqT(_M_nfa, + _M_nfa._M_insert_alt(__tmp._M_start, + __end, __neg), + __tmp._M_end)); + } __e._M_append(__end); } else // {3,} { auto __tmp = __r._M_clone(); - _StateSeqT __s(_M_nfa, _M_nfa._M_insert_alt(_S_invalid_state_id, - __tmp._M_start)); + _StateSeqT __s(_M_nfa, + _M_nfa._M_insert_alt(_S_invalid_state_id, + __tmp._M_start, __neg)); __tmp._M_append(__s); __e._M_append(__s); } diff --git a/libstdc++-v3/include/bits/regex_constants.h b/libstdc++-v3/include/bits/regex_constants.h index 23174becdf96..10b962ad21a2 100644 --- a/libstdc++-v3/include/bits/regex_constants.h +++ b/libstdc++-v3/include/bits/regex_constants.h @@ -78,87 +78,87 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION * %set. */ enum syntax_option_type : unsigned int - { - /** - * Specifies that the matching of regular expressions against a character - * sequence shall be performed without regard to case. - */ - icase = 1 << _S_icase, - - /** - * Specifies that when a regular expression is matched against a character - * container sequence, no sub-expression matches are to be stored in the - * supplied match_results structure. - */ - nosubs = 1 << _S_nosubs, - - /** - * Specifies that the regular expression engine should pay more attention to - * the speed with which regular expressions are matched, and less to the - * speed with which regular expression objects are constructed. Otherwise - * it has no detectable effect on the program output. - */ - optimize = 1 << _S_optimize, - - /** - * Specifies that character ranges of the form [a-b] should be locale - * sensitive. - */ - collate = 1 << _S_collate, - - /** - * Specifies that the grammar recognized by the regular expression engine is - * that used by ECMAScript in ECMA-262 [Ecma International, ECMAScript - * Language Specification, Standard Ecma-262, third edition, 1999], as - * modified in section [28.13]. 
This grammar is similar to that defined - * in the PERL scripting language but extended with elements found in the - * POSIX regular expression grammar. - */ - ECMAScript = 1 << _S_ECMAScript, - - /** - * Specifies that the grammar recognized by the regular expression engine is - * that used by POSIX basic regular expressions in IEEE Std 1003.1-2001, - * Portable Operating System Interface (POSIX), Base Definitions and - * Headers, Section 9, Regular Expressions [IEEE, Information Technology -- - * Portable Operating System Interface (POSIX), IEEE Standard 1003.1-2001]. - */ - basic = 1 << _S_basic, - - /** - * Specifies that the grammar recognized by the regular expression engine is - * that used by POSIX extended regular expressions in IEEE Std 1003.1-2001, - * Portable Operating System Interface (POSIX), Base Definitions and Headers, - * Section 9, Regular Expressions. - */ - extended = 1 << _S_extended, - - /** - * Specifies that the grammar recognized by the regular expression engine is - * that used by POSIX utility awk in IEEE Std 1003.1-2001. This option is - * identical to syntax_option_type extended, except that C-style escape - * sequences are supported. These sequences are: - * \\\\, \\a, \\b, \\f, \\n, \\r, \\t , \\v, \\&apos,, &apos,, - * and \\ddd (where ddd is one, two, or three octal digits). - */ - awk = 1 << _S_awk, - - /** - * Specifies that the grammar recognized by the regular expression engine is - * that used by POSIX utility grep in IEEE Std 1003.1-2001. This option is - * identical to syntax_option_type basic, except that newlines are treated - * as whitespace. - */ - grep = 1 << _S_grep, - - /** - * Specifies that the grammar recognized by the regular expression engine is - * that used by POSIX utility grep when given the -E option in - * IEEE Std 1003.1-2001. This option is identical to syntax_option_type - * extended, except that newlines are treated as whitespace. 
- */ - egrep = 1 << _S_egrep, - }; + { + /** + * Specifies that the matching of regular expressions against a character + * sequence shall be performed without regard to case. + */ + icase = 1 << _S_icase, + + /** + * Specifies that when a regular expression is matched against a character + * container sequence, no sub-expression matches are to be stored in the + * supplied match_results structure. + */ + nosubs = 1 << _S_nosubs, + + /** + * Specifies that the regular expression engine should pay more attention to + * the speed with which regular expressions are matched, and less to the + * speed with which regular expression objects are constructed. Otherwise + * it has no detectable effect on the program output. + */ + optimize = 1 << _S_optimize, + + /** + * Specifies that character ranges of the form [a-b] should be locale + * sensitive. + */ + collate = 1 << _S_collate, + + /** + * Specifies that the grammar recognized by the regular expression engine is + * that used by ECMAScript in ECMA-262 [Ecma International, ECMAScript + * Language Specification, Standard Ecma-262, third edition, 1999], as + * modified in section [28.13]. This grammar is similar to that defined + * in the PERL scripting language but extended with elements found in the + * POSIX regular expression grammar. + */ + ECMAScript = 1 << _S_ECMAScript, + + /** + * Specifies that the grammar recognized by the regular expression engine is + * that used by POSIX basic regular expressions in IEEE Std 1003.1-2001, + * Portable Operating System Interface (POSIX), Base Definitions and + * Headers, Section 9, Regular Expressions [IEEE, Information Technology -- + * Portable Operating System Interface (POSIX), IEEE Standard 1003.1-2001]. 
+ */ + basic = 1 << _S_basic, + + /** + * Specifies that the grammar recognized by the regular expression engine is + * that used by POSIX extended regular expressions in IEEE Std 1003.1-2001, + * Portable Operating System Interface (POSIX), Base Definitions and + * Headers, Section 9, Regular Expressions. + */ + extended = 1 << _S_extended, + + /** + * Specifies that the grammar recognized by the regular expression engine is + * that used by POSIX utility awk in IEEE Std 1003.1-2001. This option is + * identical to syntax_option_type extended, except that C-style escape + * sequences are supported. These sequences are: + * \\\\, \\a, \\b, \\f, \\n, \\r, \\t , \\v, \\&apos,, &apos,, + * and \\ddd (where ddd is one, two, or three octal digits). + */ + awk = 1 << _S_awk, + + /** + * Specifies that the grammar recognized by the regular expression engine is + * that used by POSIX utility grep in IEEE Std 1003.1-2001. This option is + * identical to syntax_option_type basic, except that newlines are treated + * as whitespace. + */ + grep = 1 << _S_grep, + + /** + * Specifies that the grammar recognized by the regular expression engine is + * that used by POSIX utility grep when given the -E option in + * IEEE Std 1003.1-2001. This option is identical to syntax_option_type + * extended, except that newlines are treated as whitespace. + */ + egrep = 1 << _S_egrep, + }; constexpr inline syntax_option_type operator&(syntax_option_type __a, syntax_option_type __b) diff --git a/libstdc++-v3/include/bits/regex_executor.h b/libstdc++-v3/include/bits/regex_executor.h index 6d66d8815846..3df33e030245 100644 --- a/libstdc++-v3/include/bits/regex_executor.h +++ b/libstdc++-v3/include/bits/regex_executor.h @@ -66,33 +66,46 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION { } // Set matched when string exactly match the pattern. - virtual void + virtual bool _M_match() = 0; // Set matched when some prefix of the string matches the pattern. 
- virtual void - _M_search_from_first() = 0; + virtual bool + _M_search() = 0; protected: typedef typename _NFA<_CharT, _TraitsT>::_SizeT _SizeT; - _Executor(_BiIter __begin, - _BiIter __end, - _ResultsT& __results, - _FlagT __flags, - _SizeT __size) - : _M_current(__begin), _M_end(__end), _M_results(__results), - _M_flags(__flags) + typedef typename _TraitsT::char_class_type _ClassT; + + _Executor(_BiIter __begin, + _BiIter __end, + _ResultsT& __results, + _FlagT __flags, + _SizeT __size, + const _TraitsT& __traits) + : _M_current(__begin), _M_begin(__begin), _M_end(__end), + _M_results(__results), _M_flags(__flags), _M_traits(__traits) { __size += 2; _M_results.resize(__size); - for (auto __i = 0; __i < __size; __i++) + for (_SizeT __i = 0; __i < __size; ++__i) _M_results[__i].matched = false; } - _BiIter _M_current; - _BiIter _M_end; - _ResultsVec& _M_results; - _FlagT _M_flags; + bool + _M_is_word(_CharT __ch) + { + static const _CharT __s = 'w'; + return _M_traits.isctype(__ch, + _M_traits.lookup_classname(&__s, &__s+1)); + } + + _BiIter _M_current; + const _BiIter _M_begin; + const _BiIter _M_end; + _ResultsVec& _M_results; + const _TraitsT& _M_traits; + _FlagT _M_flags; }; // A _DFSExecutor perform a DFS on given NFA and input string. 
At the very @@ -126,26 +139,51 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION const _RegexT& __nfa, const _TraitsT& __traits, _FlagT __flags) - : _BaseT(__begin, __end, __results, __flags, __nfa._M_sub_count()), - _M_traits(__traits), _M_nfa(__nfa), _M_results_ret(this->_M_results) + : _BaseT(__begin, __end, __results, __flags, __nfa._M_sub_count(), + __traits), + _M_traits(__traits), _M_nfa(__nfa), _M_cur_results(this->_M_results), + _M_start_state(__nfa._M_start()) { } - void + bool _M_match() - { _M_dfs(_M_nfa._M_start()); } + { + this->_M_current = this->_M_begin; + return _M_dfs(_M_start_state); + } - void + bool _M_search_from_first() - { _M_dfs(_M_nfa._M_start()); } + { + this->_M_current = this->_M_begin; + return _M_dfs(_M_start_state); + } + + bool + _M_search() + { + auto __cur = this->_M_begin; + do + { + this->_M_current = __cur; + if (_M_dfs(_M_start_state)) + return true; + } + // Continue when __cur == _M_end + while (__cur++ != this->_M_end); + return false; + } private: template bool _M_dfs(_StateIdT __i); - _ResultsVec _M_results_ret; + // To record current solution. + _ResultsVec _M_cur_results; const _TraitsT& _M_traits; const _RegexT& _M_nfa; + _StateIdT _M_start_state; }; // Like the DFS approach, it try every possible state transition; Unlike DFS, @@ -170,35 +208,129 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION typedef _Executor<_BiIter, _Alloc, _CharT, _TraitsT> _BaseT; typedef _NFA<_CharT, _TraitsT> _RegexT; typedef typename _BaseT::_ResultsT _ResultsT; - typedef typename _BaseT::_ResultsVec _ResultsVec; - typedef std::unique_ptr<_ResultsVec> _ResultsPtr; + // Here's a solution for greedy/ungreedy mode in BFS approach. We need to + // carefully work out how to compare to conflict matching states. + // + // A matching state is a pair(where, when); `where` is a NFA node; `when` + // is a _BiIter, indicating which char is the next to be mathed one. Two + // matching states conflict means that they have equivalent `where` and + // `when`. 
+ // + // Now since we need to drop one and keep another, because at most one of + // them could be the final optimal solution. This behavior is affected by + // greedy policy. + // + // The definition of `greedy`: + // For the sequence of quantifiers in NFA sorted by there start position, + // now maintain a vector in a matching state, with equal length to + // quantifier seq, recording repeating times of every quantifier. Now to + // compare two matching states, we just lexically compare these two + // vectors. To win the compare(to survive), one matching state needs to + // make its greedy quantifier count larger, and ungreedy quantifiers + // count smaller. + // + // In the implementation, we recorded negtive numbers for greedy + // quantifiers and positive numbers of ungreedy ones. Now a simple + // operator<() for lexicographical_compare will emit the answer. + // + // When two vectors equal, it means the `where`, `when` and quantifier + // counts are identical, it indicates the same answer, so just return + // false. + struct _ResultsEntry + : private _BaseT::_ResultsVec + { + public: + _ResultsEntry(unsigned int __res_sz, unsigned int __sz) + : _BaseT::_ResultsVec(__res_sz), _M_quant_keys(__sz) + { } + + sub_match<_BiIter>& + operator[](unsigned int __idx) + { return this->_BaseT::_ResultsVec::operator[](__idx); } + + bool + operator<(const _ResultsEntry& __rhs) const + { + _GLIBCXX_DEBUG_ASSERT(_M_quant_keys.size() + == __rhs._M_quant_keys.size()); + return lexicographical_compare(_M_quant_keys.begin(), + _M_quant_keys.end(), + __rhs._M_quant_keys.begin(), + __rhs._M_quant_keys.end()); + } + + void + _M_inc(unsigned int __idx, bool __neg) + { _M_quant_keys[__idx] += __neg ? 
1 : -1; } + + typename _BaseT::_ResultsVec + _M_get() + { return *this; } + + public: + std::vector _M_quant_keys; + }; + + typedef std::unique_ptr<_ResultsEntry> _ResultsPtr; typedef regex_constants::match_flag_type _FlagT; - _BFSExecutor(_BiIter __begin, - _BiIter __end, - _ResultsT& __results, - const _RegexT& __nfa, - _FlagT __flags) - : _BaseT(__begin, __end, __results, __flags, __nfa._M_sub_count()), - _M_nfa(__nfa) - { - if (_M_nfa._M_start() != _S_invalid_state_id) - _M_covered[_M_nfa._M_start()] = - _ResultsPtr(new _ResultsVec(this->_M_results)); - _M_e_closure(); - } + _BFSExecutor(_BiIter __begin, + _BiIter __end, + _ResultsT& __results, + const _RegexT& __nfa, + const _TraitsT& __traits, + _FlagT __flags) + : _BaseT(__begin, __end, __results, __flags, __nfa._M_sub_count(), + __traits), + _M_nfa(__nfa), + _M_cur_results(nullptr), + _M_start_state(__nfa._M_start()) + { } - void + bool _M_match() - { _M_main_loop(); } + { + _M_init(this->_M_begin); + return _M_main_loop(); + } - void + bool _M_search_from_first() - { _M_main_loop(); } + { + _M_init(this->_M_begin); + return _M_main_loop(); + } + + bool + _M_search() + { + auto __cur = this->_M_begin; + do + { + _M_init(__cur); + if (_M_main_loop()) + return true; + } + // Continue when __cur == _M_end + while (__cur++ != this->_M_end); + return false; + } private: + void + _M_init(_BiIter __cur) + { + _GLIBCXX_DEBUG_ASSERT(_M_start_state != _S_invalid_state_id); + this->_M_current = __cur; + _M_covered.clear(); + _M_covered[_M_start_state] = + _ResultsPtr(new _ResultsEntry(this->_M_results.size(), + _M_nfa._M_quant_count)); + _M_e_closure(); + } + template - void + bool _M_main_loop(); void @@ -208,13 +340,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _M_move(); bool - _M_match_less_than(const _ResultsVec& __u, const _ResultsVec& __v) const; - - bool - _M_includes_some() const; + _M_includes_some(); - std::map<_StateIdT, _ResultsPtr> _M_covered; - const _RegexT& _M_nfa; + std::map<_StateIdT, _ResultsPtr> 
_M_covered; + // To record global optimal solution. + _ResultsPtr _M_cur_results; + const _RegexT& _M_nfa; + _StateIdT _M_start_state; }; //@} regex-detail diff --git a/libstdc++-v3/include/bits/regex_executor.tcc b/libstdc++-v3/include/bits/regex_executor.tcc index 788d65e54de2..b110c5dc2f0a 100644 --- a/libstdc++-v3/include/bits/regex_executor.tcc +++ b/libstdc++-v3/include/bits/regex_executor.tcc @@ -44,18 +44,21 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION // This is not that certain. Need deeper investigate. return false; auto& __current = this->_M_current; + auto& __begin = this->_M_begin; auto& __end = this->_M_end; - auto& __results = _M_results_ret; + auto& __results = _M_cur_results; const auto& __state = _M_nfa[__i]; bool __ret = false; switch (__state._M_opcode) { case _S_opcode_alternative: - // Greedy mode by default. For non-greedy mode, - // swap _M_alt and _M_next. - // TODO: Add greedy mode option. - __ret = _M_dfs<__match_mode>(__state._M_alt) - || _M_dfs<__match_mode>(__state._M_next); + // Greedy or not, this is a question ;) + if (!__state._M_neg) + __ret = _M_dfs<__match_mode>(__state._M_alt) + || _M_dfs<__match_mode>(__state._M_next); + else + __ret = _M_dfs<__match_mode>(__state._M_next) + || _M_dfs<__match_mode>(__state._M_alt); break; case _S_opcode_subexpr_begin: // Here's the critical part: if there's nothing changed since last @@ -86,6 +89,52 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION else __ret = _M_dfs<__match_mode>(__state._M_next); break; + case _S_opcode_line_begin_assertion: + if (__current == __begin) + __ret = _M_dfs<__match_mode>(__state._M_next); + break; + case _S_opcode_line_end_assertion: + if (__current == __end) + __ret = _M_dfs<__match_mode>(__state._M_next); + break; + // By definition. 
+ case _S_opcode_word_boundry: + { + bool __ans = false; + if (__current == __begin && this->_M_is_word(*__current)) + __ans = true; + else if (__current == __end && this->_M_is_word(*__current)) + __ans = true; + else + { + auto __pre = __current; + --__pre; + if (this->_M_is_word(*__current) + != this->_M_is_word(*__pre)) + __ans = true; + } + if (__ans == !__state._M_neg) + __ret = _M_dfs<__match_mode>(__state._M_next); + } + break; + // Here __state._M_alt offers a single start node for a sub-NFA. + // We recursivly invoke our algorithm to match the sub-NFA. + case _S_opcode_subexpr_lookahead: + { + _ResultsT __m; + // FIXME Here's not necessarily a DFSExecutor. But we need to + // refactor the whole NFA to a recursive tree structure first. + _DFSExecutor __sub(this->_M_current, + this->_M_end, + __m, + this->_M_nfa, + this->_M_traits, + this->_M_flags); + __sub._M_start_state = __state._M_alt; + if (__sub._M_search_from_first() == !__state._M_neg) + __ret = _M_dfs<__match_mode>(__state._M_next); + } + break; case _S_opcode_match: if (__current != __end && __state._M_matches(*__current)) { @@ -138,19 +187,23 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION template template - void _BFSExecutor<_BiIter, _Alloc, _CharT, _TraitsT>:: + bool _BFSExecutor<_BiIter, _Alloc, _CharT, _TraitsT>:: _M_main_loop() { + bool __ret = false; while (this->_M_current != this->_M_end) { if (!__match_mode) - if (_M_includes_some()) - return; + // To keep regex_search greedy, no "return true" here. 
+ __ret = _M_includes_some() || __ret; _M_move(); ++this->_M_current; _M_e_closure(); } - _M_includes_some(); + __ret = _M_includes_some() || __ret; + if (__ret) + this->_M_results = _M_cur_results->_M_get(); + return __ret; } template_M_current; std::queue<_StateIdT> __q; std::vector __in_q(_M_nfa.size(), false); + auto& __begin = this->_M_begin; + auto& __end = this->_M_end; + for (auto& __it : _M_covered) { __in_q[__it.first] = true; @@ -173,18 +229,19 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION __in_q[__u] = false; const auto& __state = _M_nfa[__u]; - // Can be implemented using method, but there're too much arguments. - // I would use macro function before C++11, but lambda is a better - // choice, since hopefully compiler can inline it. + // Can be implemented using method, but there will be too many + // arguments. I would use macro function before C++11, but lambda is + // a better choice, since hopefully compiler can inline it. auto __add_visited_state = [&](_StateIdT __v) { if (__v == _S_invalid_state_id) return; if (_M_covered.count(__u) != 0 && (_M_covered.count(__v) == 0 - || _M_match_less_than(*_M_covered[__u], *_M_covered[__v]))) + || *_M_covered[__u] < *_M_covered[__v])) { - _M_covered[__v] = _ResultsPtr(new _ResultsVec(*_M_covered[__u])); + _M_covered[__v] = + _ResultsPtr(new _ResultsEntry(*_M_covered[__u])); // if a state is updated, it's outgoing neighbors should be // reconsidered too. Push them to the queue. if (!__in_q[__v]) @@ -195,19 +252,33 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION } }; + // Identical to DFS's switch part. switch (__state._M_opcode) { + // Needs to maintain quantifier count vector here. A quantifier + // must be concerned with a alt node. 
case _S_opcode_alternative: - __add_visited_state(__state._M_next); - __add_visited_state(__state._M_alt); + { + __add_visited_state(__state._M_next); + auto __back = + _M_covered[__u]->_M_quant_keys[__state._M_quant_index]; + _M_covered[__u]->_M_inc(__state._M_quant_index, + __state._M_neg); + __add_visited_state(__state._M_alt); + _M_covered[__u]->_M_quant_keys[__state._M_quant_index] + = __back; + } break; case _S_opcode_subexpr_begin: { - auto& __cu = *_M_covered[__u]; - auto __back = __cu[__state._M_subexpr].first; - __cu[__state._M_subexpr].first = __current; - __add_visited_state(__state._M_next); - __cu[__state._M_subexpr].first = __back; + auto& __sub = (*_M_covered[__u])[__state._M_subexpr]; + if (!__sub.matched || __sub.first != __current) + { + auto __back = __sub.first; + __sub.first = __current; + __add_visited_state(__state._M_next); + __sub.first = __back; + } } break; case _S_opcode_subexpr_end: @@ -220,10 +291,51 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION __cu[__state._M_subexpr] = __back; } break; + case _S_opcode_line_begin_assertion: + if (__current == __begin) + __add_visited_state(__state._M_next); + break; + case _S_opcode_line_end_assertion: + if (__current == __end) + __add_visited_state(__state._M_next); + break; + case _S_opcode_word_boundry: + { + bool __ans = false; + if (__current == __begin && this->_M_is_word(*__current)) + __ans = true; + else if (__current == __end && this->_M_is_word(*__current)) + __ans = true; + else + { + auto __pre = __current; + --__pre; + if (this->_M_is_word(*__current) + != this->_M_is_word(*__pre)) + __ans = true; + } + if (__ans == !__state._M_neg) + __add_visited_state(__state._M_next); + } + break; + case _S_opcode_subexpr_lookahead: + { + _ResultsT __m; + // Same comment as in DFS. 
+ _BFSExecutor __sub(this->_M_current, + this->_M_end, + __m, + this->_M_nfa, + this->_M_traits, + this->_M_flags); + __sub._M_start_state = __state._M_alt; + if (__sub._M_search_from_first() == !__state._M_neg) + __add_visited_state(__state._M_next); + } + break; case _S_opcode_match: break; case _S_opcode_accept: - __add_visited_state(__state._M_next); break; default: _GLIBCXX_DEBUG_ASSERT(false); @@ -244,7 +356,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION && __state._M_matches(*this->_M_current)) if (__state._M_next != _S_invalid_state_id) if (__next.count(__state._M_next) == 0 - || _M_match_less_than(*__it.second, *__next[__state._M_next])) + || *__it.second < *__next[__state._M_next]) __next[__state._M_next] = move(__it.second); } _M_covered = move(__next); @@ -253,37 +365,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION template bool _BFSExecutor<_BiIter, _Alloc, _CharT, _TraitsT>:: - _M_match_less_than(const _ResultsVec& __u, const _ResultsVec& __v) const - { - // TODO: Greedy and Non-greedy support - _GLIBCXX_DEBUG_ASSERT(__u.size() == __v.size()); - auto __size = __u.size(); - for (auto __i = 0; __i < __size; __i++) - { - auto __uit = __u[__i], __vit = __v[__i]; - if (__uit.matched && !__vit.matched) - return true; - if (!__uit.matched && __vit.matched) - return false; - if (__uit.matched && __vit.matched) - { - // GREEDY - if (__uit.first != __vit.first) - return __uit.first < __vit.first; - if (__uit.second != __vit.second) - return __uit.second > __vit.second; - } - } - return false; - } - - template - bool _BFSExecutor<_BiIter, _Alloc, _CharT, _TraitsT>:: - _M_includes_some() const + _M_includes_some() { auto& __s = _M_nfa._M_final_states(); auto& __t = _M_covered; + bool __succ = false; if (__s.size() > 0 && __t.size() > 0) { auto __first = __s.begin(); @@ -292,16 +378,21 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION { if (*__first < __second->first) ++__first; - else if (__second->first < *__first) + else if (*__first > __second->first) ++__second; else { - this->_M_results = 
*__second->second; - return true; + if (_M_cur_results == nullptr + || *__second->second < *_M_cur_results) + _M_cur_results = + _ResultsPtr(new _ResultsEntry(*__second->second)); + __succ = true; + ++__first; + ++__second; } } } - return false; + return __succ; } template_M_has_backref) return _ExecutorPtr(new _DFSExecutorT(__b, __e, __m, *__p, __re._M_traits, __flags)); - return _ExecutorPtr(new _BFSExecutorT(__b, __e, __m, *__p, __flags)); + return _ExecutorPtr(new _BFSExecutorT(__b, __e, __m, *__p, + __re._M_traits, __flags)); } _GLIBCXX_END_NAMESPACE_VERSION diff --git a/libstdc++-v3/include/bits/regex_scanner.h b/libstdc++-v3/include/bits/regex_scanner.h index 064c1832796f..824d6ce10815 100644 --- a/libstdc++-v3/include/bits/regex_scanner.h +++ b/libstdc++-v3/include/bits/regex_scanner.h @@ -69,7 +69,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _S_token_subexpr_begin, _S_token_subexpr_no_group_begin, _S_token_subexpr_lookahead_begin, - _S_token_subexpr_neg_lookahead_begin, _S_token_subexpr_end, _S_token_bracket_begin, _S_token_bracket_neg_begin, @@ -84,10 +83,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _S_token_or, _S_token_closure0, _S_token_closure1, + _S_token_ungreedy, _S_token_line_begin, _S_token_line_end, _S_token_word_bound, - _S_token_neg_word_bound, _S_token_comma, _S_token_dup_count, _S_token_eof, diff --git a/libstdc++-v3/include/bits/regex_scanner.tcc b/libstdc++-v3/include/bits/regex_scanner.tcc index 3303aa56a38a..4b66157278b1 100644 --- a/libstdc++-v3/include/bits/regex_scanner.tcc +++ b/libstdc++-v3/include/bits/regex_scanner.tcc @@ -210,11 +210,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION { ++_M_current; _M_token = _S_token_subexpr_lookahead_begin; + _M_value.assign(1, 'p'); } else if (*_M_current == '!') { ++_M_current; - _M_token = _S_token_subexpr_neg_lookahead_begin; + _M_token = _S_token_subexpr_lookahead_begin; + _M_value.assign(1, 'n'); } else __throw_regex_error(regex_constants::error_paren); @@ -371,9 +373,15 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION 
_M_value.assign(1, _M_escape_map.at(__c)); } else if (__c == 'b') - _M_token = _S_token_word_bound; + { + _M_token = _S_token_word_bound; + _M_value.assign(1, 'p'); + } else if (__c == 'B') - _M_token = _S_token_neg_word_bound; + { + _M_token = _S_token_word_bound; + _M_value.assign(1, 'n'); + } // N3376 28.13 else if (__c == 'd' || __c == 'D' @@ -581,9 +589,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION case _S_token_subexpr_lookahead_begin: ostr << "lookahead subexpr begin\n"; break; - case _S_token_subexpr_neg_lookahead_begin: - ostr << "neg lookahead subexpr begin\n"; - break; case _S_token_subexpr_end: ostr << "subexpr end\n"; break; diff --git a/libstdc++-v3/testsuite/28_regex/algorithms/regex_search/ecma/assertion.cc b/libstdc++-v3/testsuite/28_regex/algorithms/regex_search/ecma/assertion.cc new file mode 100644 index 000000000000..82e99058743c --- /dev/null +++ b/libstdc++-v3/testsuite/28_regex/algorithms/regex_search/ecma/assertion.cc @@ -0,0 +1,80 @@ +// { dg-options "-std=gnu++11" } +// { dg-do run { xfail *-*-* } } + +// +// 2013-09-14 Tim Shen +// +// Copyright (C) 2013 Free Software Foundation, Inc. +// +// This file is part of the GNU ISO C++ Library. This library is free +// software; you can redistribute it and/or modify it under the +// terms of the GNU General Public License as published by the +// Free Software Foundation; either version 3, or (at your option) +// any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License along +// with this library; see the file COPYING3. If not see +// . + +// 28.11.3 regex_search +// Tests ECMAScript assertion. 
+ +#include <regex> +#include <testsuite_hooks.h> + +using namespace std; + +void +test01() +{ + bool test __attribute__((unused)) = true; + + VERIFY(!regex_search("2123456", regex("^1234"))); + VERIFY(regex_search("123456", regex("^1234"))); + VERIFY(regex_search("123456", regex("(5|^)1234"))); + VERIFY(regex_search("5123456", regex("(5|^)1234"))); + VERIFY(!regex_search("1234562", regex("3456$"))); + VERIFY(regex_search("123456", regex("3456$"))); + VERIFY(!regex_search("123456", regex("(?=1234)56"))); + VERIFY(regex_search("123456", regex("(?=1234)123456"))); + VERIFY(regex_search("123456", regex("(?!1234)56"))); + VERIFY(!regex_search("123456", regex("(?!1234)123456"))); + + VERIFY(regex_search("a-", regex("a\\b-"))); + VERIFY(!regex_search("ab", regex("a\\bb"))); + VERIFY(!regex_search("a-", regex("a\\B-"))); + VERIFY(regex_search("ab", regex("a\\Bb"))); + + string s("This is a regular expression"); + string sol[] = + { + "This", + "is", + "a", + "regular", + "expression", + }; + + regex re("\\b\\w*\\b"); + int i = 0; + for (auto it = sregex_iterator(s.begin(), s.end(), re); + it != sregex_iterator() && i < 5; + ++it) + { + string s((*it)[0].first, (*it)[0].second); + VERIFY(s == sol[i++]); + } + VERIFY(i == 5); +} + +int +main() +{ + test01(); + return 0; +} diff --git a/libstdc++-v3/testsuite/28_regex/algorithms/regex_search/ecma/greedy.cc b/libstdc++-v3/testsuite/28_regex/algorithms/regex_search/ecma/greedy.cc new file mode 100644 index 000000000000..ad37ec8649a8 --- /dev/null +++ b/libstdc++-v3/testsuite/28_regex/algorithms/regex_search/ecma/greedy.cc @@ -0,0 +1,71 @@ +// { dg-options "-std=gnu++11" } + +// +// 2013-09-14 Tim Shen +// +// Copyright (C) 2013 Free Software Foundation, Inc. +// +// This file is part of the GNU ISO C++ Library. This library is free +// software; you can redistribute it and/or modify it under the +// terms of the GNU General Public License as published by the +// Free Software Foundation; either version 3, or (at your option) +// any later version.
+// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License along +// with this library; see the file COPYING3. If not see +// <http://www.gnu.org/licenses/>. + +// 28.11.3 regex_search +// Tests ECMAScript greedy and ungreedy quantifiers. + +#include <regex> +#include <testsuite_hooks.h> + +using namespace std; + +void +test01() +{ + bool test __attribute__((unused)) = true; + + cmatch m; +#define TEST(i, s) VERIFY(m[i].matched && string(m[i].first, m[i].second) == s) + VERIFY(regex_search("aaaa", m, regex("a*"))); + TEST(0, "aaaa"); + VERIFY(regex_search("aaaa", m, regex("a*?"))); + TEST(0, ""); + VERIFY(regex_search("aaaa", m, regex("a+"))); + TEST(0, "aaaa"); + VERIFY(regex_search("aaaa", m, regex("a+?"))); + TEST(0, "a"); + VERIFY(regex_search("a", m, regex("a?"))); + TEST(0, "a"); + VERIFY(regex_search("a", m, regex("a??"))); + TEST(0, ""); + VERIFY(regex_search("", m, regex("a??"))); + TEST(0, ""); + VERIFY(regex_search("aaaa", m, regex("(a+)(a+)"))); + TEST(1, "aaa"); + TEST(2, "a"); + VERIFY(regex_search("aaaa", m, regex("(a+?)(a+)"))); + TEST(1, "a"); + TEST(2, "aaa"); + VERIFY(regex_search("aaaa", m, regex("(a+?)(a+)"))); + TEST(1, "a"); + TEST(2, "aaa"); + VERIFY(regex_search("aaaa", m, regex("(a+?)(a+?)"))); + TEST(1, "a"); + TEST(2, "a"); +} + +int +main() +{ + test01(); + return 0; +} diff --git a/libstdc++-v3/testsuite/28_regex/algorithms/regex_search/ecma/string_01.cc b/libstdc++-v3/testsuite/28_regex/algorithms/regex_search/ecma/string_01.cc index a2d290db2839..ec25875fdee0 100644 --- a/libstdc++-v3/testsuite/28_regex/algorithms/regex_search/ecma/string_01.cc +++ b/libstdc++-v3/testsuite/28_regex/algorithms/regex_search/ecma/string_01.cc @@ -21,7 +21,7 @@ // <http://www.gnu.org/licenses/>. // 28.11.3 regex_search -// Tests BRE against a std::string target.
+// Tests ECMAScript against a std::string target. #include <regex> #include <testsuite_hooks.h>