// class template regex -*- C++ -*-
-// Copyright (C) 2013 Free Software Foundation, Inc.
+// Copyright (C) 2013-2014 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library. This library is free
// software; you can redistribute it and/or modify it under the
* Do not attempt to use it directly. @headername{regex}
*/
+// FIXME make comments doxygen format.
+
+// This compiler refers to "Regular Expression Matching Can Be Simple And Fast"
+// (http://swtch.com/~rsc/regexp/regexp1.html"),
+// but doesn't strictly follow it.
+//
+// When compiling, states are *chained* instead of tree- or graph-constructed.
+// It's more like structured programs: there's if statement and loop statement.
+//
+// For alternative structure(say "a|b"), aka "if statement", two branchs should
+// be constructed. However, these two shall merge to an "end_tag" at the end of
+// this operator:
+//
+// branch1
+// / \
+// => begin_tag end_tag =>
+// \ /
+// branch2
+//
+// This is the difference between this implementation and that in Russ's
+// article.
+//
+// That's why we introduced dummy node here ------ "end_tag" is a dummy node.
+// All dummy node will be eliminated at the end of compiling process.
+
namespace std _GLIBCXX_VISIBILITY(default)
{
namespace __detail
{
_GLIBCXX_BEGIN_NAMESPACE_VERSION
- template<typename _BiIter>
- void
- _Scanner<_BiIter>::
- _M_advance()
- {
- if (_M_current == _M_end)
- {
- _M_curToken = _S_token_eof;
- return;
- }
-
- _CharT __c = *_M_current;
- if (_M_state & _S_state_in_bracket)
- {
- _M_scan_in_bracket();
- return;
- }
- if (_M_state & _S_state_in_brace)
- {
- _M_scan_in_brace();
- return;
- }
-#if 0
- // TODO: re-enable line anchors when _M_assertion is implemented.
- // See PR libstdc++/47724
- else if (_M_state & _S_state_at_start && __c == _M_ctype.widen('^'))
- {
- _M_curToken = _S_token_line_begin;
- ++_M_current;
- return;
- }
- else if (__c == _M_ctype.widen('$'))
- {
- _M_curToken = _S_token_line_end;
- ++_M_current;
- return;
- }
-#endif
- else if (__c == _M_ctype.widen('.'))
- {
- _M_curToken = _S_token_anychar;
- ++_M_current;
- return;
- }
- else if (__c == _M_ctype.widen('*'))
- {
- _M_curToken = _S_token_closure0;
- ++_M_current;
- return;
- }
- else if (__c == _M_ctype.widen('+'))
- {
- _M_curToken = _S_token_closure1;
- ++_M_current;
- return;
- }
- else if (__c == _M_ctype.widen('|'))
- {
- _M_curToken = _S_token_or;
- ++_M_current;
- return;
- }
- else if (__c == _M_ctype.widen('['))
- {
- if (*++_M_current == _M_ctype.widen('^'))
- {
- _M_curToken = _S_token_bracket_inverse_begin;
- ++_M_current;
- }
- else
- _M_curToken = _S_token_bracket_begin;
- _M_state |= _S_state_in_bracket;
- return;
- }
- else if (__c == _M_ctype.widen('\\'))
- {
- _M_eat_escape();
- return;
- }
- else if (!(_M_flags & (regex_constants::basic | regex_constants::grep)))
- {
- if (__c == _M_ctype.widen('('))
- {
- _M_curToken = _S_token_subexpr_begin;
- ++_M_current;
- return;
- }
- else if (__c == _M_ctype.widen(')'))
- {
- _M_curToken = _S_token_subexpr_end;
- ++_M_current;
- return;
- }
- else if (__c == _M_ctype.widen('{'))
- {
- _M_curToken = _S_token_interval_begin;
- _M_state |= _S_state_in_brace;
- ++_M_current;
- return;
- }
- }
-
- _M_curToken = _S_token_ord_char;
- _M_curValue.assign(1, __c);
- ++_M_current;
- }
-
- template<typename _BiIter>
- void
- _Scanner<_BiIter>::
- _M_scan_in_brace()
- {
- if (_M_ctype.is(_CtypeT::digit, *_M_current))
- {
- _M_curToken = _S_token_dup_count;
- _M_curValue.assign(1, *_M_current);
- ++_M_current;
- while (_M_current != _M_end
- && _M_ctype.is(_CtypeT::digit, *_M_current))
- {
- _M_curValue += *_M_current;
- ++_M_current;
- }
- return;
- }
- else if (*_M_current == _M_ctype.widen(','))
- {
- _M_curToken = _S_token_comma;
- ++_M_current;
- return;
- }
- if (_M_flags & (regex_constants::basic | regex_constants::grep))
- {
- if (*_M_current == _M_ctype.widen('\\'))
- _M_eat_escape();
- }
- else
- {
- if (*_M_current == _M_ctype.widen('}'))
- {
- _M_curToken = _S_token_interval_end;
- _M_state &= ~_S_state_in_brace;
- ++_M_current;
- return;
- }
- }
- }
-
- template<typename _BiIter>
- void
- _Scanner<_BiIter>::
- _M_scan_in_bracket()
- {
- if (*_M_current == _M_ctype.widen('['))
- {
- ++_M_current;
- if (_M_current == _M_end)
- {
- _M_curToken = _S_token_eof;
- return;
- }
-
- if (*_M_current == _M_ctype.widen('.'))
- {
- _M_curToken = _S_token_collsymbol;
- _M_eat_collsymbol();
- return;
- }
- else if (*_M_current == _M_ctype.widen(':'))
- {
- _M_curToken = _S_token_char_class_name;
- _M_eat_charclass();
- return;
- }
- else if (*_M_current == _M_ctype.widen('='))
- {
- _M_curToken = _S_token_equiv_class_name;
- _M_eat_equivclass();
- return;
- }
- }
- else if (*_M_current == _M_ctype.widen('-'))
- {
- _M_curToken = _S_token_dash;
- ++_M_current;
- return;
- }
- else if (*_M_current == _M_ctype.widen(']'))
- {
- _M_curToken = _S_token_bracket_end;
- _M_state &= ~_S_state_in_bracket;
- ++_M_current;
- return;
- }
- else if (*_M_current == _M_ctype.widen('\\'))
- {
- _M_eat_escape();
- return;
- }
- _M_curToken = _S_token_collelem_single;
- _M_curValue.assign(1, *_M_current);
- ++_M_current;
- }
-
- // TODO Complete it.
- template<typename _BiIter>
- void
- _Scanner<_BiIter>::
- _M_eat_escape()
- {
- ++_M_current;
- if (_M_current == _M_end)
- {
- _M_curToken = _S_token_eof;
- return;
- }
- _CharT __c = *_M_current;
- ++_M_current;
-
- if (__c == _M_ctype.widen('('))
- {
- if (!(_M_flags & (regex_constants::basic | regex_constants::grep)))
- {
- _M_curToken = _S_token_ord_char;
- _M_curValue.assign(1, __c);
- }
- else
- _M_curToken = _S_token_subexpr_begin;
- }
- else if (__c == _M_ctype.widen(')'))
- {
- if (!(_M_flags & (regex_constants::basic | regex_constants::grep)))
- {
- _M_curToken = _S_token_ord_char;
- _M_curValue.assign(1, __c);
- }
- else
- _M_curToken = _S_token_subexpr_end;
- }
- else if (__c == _M_ctype.widen('{'))
- {
- if (!(_M_flags & (regex_constants::basic | regex_constants::grep)))
- {
- _M_curToken = _S_token_ord_char;
- _M_curValue.assign(1, __c);
- }
- else
- {
- _M_curToken = _S_token_interval_begin;
- _M_state |= _S_state_in_brace;
- }
- }
- else if (__c == _M_ctype.widen('}'))
- {
- if (!(_M_flags & (regex_constants::basic | regex_constants::grep)))
- {
- _M_curToken = _S_token_ord_char;
- _M_curValue.assign(1, __c);
- }
- else
- {
- if (!(_M_state && _S_state_in_brace))
- __throw_regex_error(regex_constants::error_badbrace);
- _M_state &= ~_S_state_in_brace;
- _M_curToken = _S_token_interval_end;
- }
- }
- else if (__c == _M_ctype.widen('x'))
- {
- ++_M_current;
- if (_M_current == _M_end)
- {
- _M_curToken = _S_token_eof;
- return;
- }
- if (_M_ctype.is(_CtypeT::digit, *_M_current))
- {
- _M_curValue.assign(1, *_M_current);
- ++_M_current;
- if (_M_current == _M_end)
- {
- _M_curToken = _S_token_eof;
- return;
- }
- if (_M_ctype.is(_CtypeT::digit, *_M_current))
- {
- _M_curValue += *_M_current;
- ++_M_current;
- return;
- }
- }
- }
- else if (__c == _M_ctype.widen('^')
- || __c == _M_ctype.widen('.')
- || __c == _M_ctype.widen('*')
- || __c == _M_ctype.widen('$')
- || __c == _M_ctype.widen('\\'))
- {
- _M_curToken = _S_token_ord_char;
- _M_curValue.assign(1, __c);
- }
- else if (_M_ctype.is(_CtypeT::digit, __c))
- {
- _M_curToken = _S_token_backref;
- _M_curValue.assign(1, __c);
- }
- else if (_M_state & _S_state_in_bracket)
- {
- if (__c == _M_ctype.widen('-')
- || __c == _M_ctype.widen('[')
- || __c == _M_ctype.widen(']'))
- {
- _M_curToken = _S_token_ord_char;
- _M_curValue.assign(1, __c);
- }
- else if ((_M_flags & regex_constants::ECMAScript)
- && __c == _M_ctype.widen('b'))
- {
- _M_curToken = _S_token_ord_char;
- _M_curValue.assign(1, _M_ctype.widen(' '));
- }
- else
- __throw_regex_error(regex_constants::error_escape);
- }
- else
- __throw_regex_error(regex_constants::error_escape);
- }
-
- // Eats a character class or throwns an exception.
- // current point to ':' delimiter on entry, char after ']' on return
- template<typename _BiIter>
- void
- _Scanner<_BiIter>::
- _M_eat_charclass()
- {
- ++_M_current; // skip ':'
- if (_M_current == _M_end)
- __throw_regex_error(regex_constants::error_ctype);
- for (_M_curValue.clear();
- _M_current != _M_end && *_M_current != _M_ctype.widen(':');
- ++_M_current)
- _M_curValue += *_M_current;
- if (_M_current == _M_end)
- __throw_regex_error(regex_constants::error_ctype);
- ++_M_current; // skip ':'
- if (*_M_current != _M_ctype.widen(']'))
- __throw_regex_error(regex_constants::error_ctype);
- ++_M_current; // skip ']'
- }
-
-
- template<typename _BiIter>
- void
- _Scanner<_BiIter>::
- _M_eat_equivclass()
- {
- ++_M_current; // skip '='
- if (_M_current == _M_end)
- __throw_regex_error(regex_constants::error_collate);
- for (_M_curValue.clear();
- _M_current != _M_end && *_M_current != _M_ctype.widen('=');
- ++_M_current)
- _M_curValue += *_M_current;
- if (_M_current == _M_end)
- __throw_regex_error(regex_constants::error_collate);
- ++_M_current; // skip '='
- if (*_M_current != _M_ctype.widen(']'))
- __throw_regex_error(regex_constants::error_collate);
- ++_M_current; // skip ']'
- }
-
-
- template<typename _BiIter>
- void
- _Scanner<_BiIter>::
- _M_eat_collsymbol()
- {
- ++_M_current; // skip '.'
- if (_M_current == _M_end)
- __throw_regex_error(regex_constants::error_collate);
- for (_M_curValue.clear();
- _M_current != _M_end && *_M_current != _M_ctype.widen('.');
- ++_M_current)
- _M_curValue += *_M_current;
- if (_M_current == _M_end)
- __throw_regex_error(regex_constants::error_collate);
- ++_M_current; // skip '.'
- if (*_M_current != _M_ctype.widen(']'))
- __throw_regex_error(regex_constants::error_collate);
- ++_M_current; // skip ']'
- }
-
-#ifdef _GLIBCXX_DEBUG
- template<typename _BiIter>
- std::ostream&
- _Scanner<_BiIter>::
- _M_print(std::ostream& ostr)
- {
- switch (_M_curToken)
- {
- case _S_token_anychar:
- ostr << "any-character\n";
- break;
- case _S_token_backref:
- ostr << "backref\n";
- break;
- case _S_token_bracket_begin:
- ostr << "bracket-begin\n";
- break;
- case _S_token_bracket_inverse_begin:
- ostr << "bracket-inverse-begin\n";
- break;
- case _S_token_bracket_end:
- ostr << "bracket-end\n";
- break;
- case _S_token_char_class_name:
- ostr << "char-class-name \"" << _M_curValue << "\"\n";
- break;
- case _S_token_closure0:
- ostr << "closure0\n";
- break;
- case _S_token_closure1:
- ostr << "closure1\n";
- break;
- case _S_token_collelem_multi:
- ostr << "coll-elem-multi \"" << _M_curValue << "\"\n";
- break;
- case _S_token_collelem_single:
- ostr << "coll-elem-single \"" << _M_curValue << "\"\n";
- break;
- case _S_token_collsymbol:
- ostr << "collsymbol \"" << _M_curValue << "\"\n";
- break;
- case _S_token_comma:
- ostr << "comma\n";
- break;
- case _S_token_dash:
- ostr << "dash\n";
- break;
- case _S_token_dup_count:
- ostr << "dup count: " << _M_curValue << "\n";
- break;
- case _S_token_eof:
- ostr << "EOF\n";
- break;
- case _S_token_equiv_class_name:
- ostr << "equiv-class-name \"" << _M_curValue << "\"\n";
- break;
- case _S_token_interval_begin:
- ostr << "interval begin\n";
- break;
- case _S_token_interval_end:
- ostr << "interval end\n";
- break;
- case _S_token_line_begin:
- ostr << "line begin\n";
- break;
- case _S_token_line_end:
- ostr << "line end\n";
- break;
- case _S_token_opt:
- ostr << "opt\n";
- break;
- case _S_token_or:
- ostr << "or\n";
- break;
- case _S_token_ord_char:
- ostr << "ordinary character: \"" << _M_value() << "\"\n";
- break;
- case _S_token_subexpr_begin:
- ostr << "subexpr begin\n";
- break;
- case _S_token_subexpr_end:
- ostr << "subexpr end\n";
- break;
- case _S_token_word_begin:
- ostr << "word begin\n";
- break;
- case _S_token_word_end:
- ostr << "word end\n";
- break;
- case _S_token_unknown:
- ostr << "-- unknown token --\n";
- break;
- default:
- _GLIBCXX_DEBUG_ASSERT(false);
- }
- return ostr;
- }
-#endif
-
- template<typename _InputIter, typename _CharT, typename _TraitsT>
- _Compiler<_InputIter, _CharT, _TraitsT>::
- _Compiler(_InputIter __b, _InputIter __e,
+ template<typename _FwdIter, typename _TraitsT>
+ _Compiler<_FwdIter, _TraitsT>::
+ _Compiler(_FwdIter __b, _FwdIter __e,
const _TraitsT& __traits, _FlagT __flags)
- : _M_traits(__traits), _M_scanner(__b, __e, __flags, _M_traits.getloc()),
- _M_state_store(__flags), _M_flags(__flags)
+ : _M_flags((__flags
+ & (regex_constants::ECMAScript
+ | regex_constants::basic
+ | regex_constants::extended
+ | regex_constants::grep
+ | regex_constants::egrep
+ | regex_constants::awk))
+ ? __flags
+ : __flags | regex_constants::ECMAScript),
+ _M_traits(__traits),
+ _M_ctype(std::use_facet<_CtypeT>(_M_traits.getloc())),
+ _M_scanner(__b, __e, _M_flags, _M_traits.getloc()),
+ _M_nfa(_M_flags)
{
- _StateSeqT __r(_M_state_store,
- _M_state_store._M_insert_subexpr_begin());
- _M_disjunction();
- if (!_M_stack.empty())
- {
- __r._M_append(_M_stack.top());
- _M_stack.pop();
- }
- __r._M_append(_M_state_store._M_insert_subexpr_end());
- __r._M_append(_M_state_store._M_insert_accept());
+ _StateSeqT __r(_M_nfa, _M_nfa._M_start());
+ __r._M_append(_M_nfa._M_insert_subexpr_begin());
+ this->_M_disjunction();
+ if (!_M_match_token(_ScannerT::_S_token_eof))
+ __throw_regex_error(regex_constants::error_paren);
+ __r._M_append(_M_pop());
+ _GLIBCXX_DEBUG_ASSERT(_M_stack.empty());
+ __r._M_append(_M_nfa._M_insert_subexpr_end());
+ __r._M_append(_M_nfa._M_insert_accept());
+ _M_nfa._M_eliminate_dummy();
}
- template<typename _InputIter, typename _CharT, typename _TraitsT>
- bool
- _Compiler<_InputIter, _CharT, _TraitsT>::
- _M_match_token(_Compiler<_InputIter, _CharT, _TraitsT>::_TokenT token)
- {
- if (token == _M_scanner._M_token())
- {
- _M_cur_value = _M_scanner._M_value();
- _M_scanner._M_advance();
- return true;
- }
- return false;
- }
-
- template<typename _InputIter, typename _CharT, typename _TraitsT>
+ template<typename _FwdIter, typename _TraitsT>
void
- _Compiler<_InputIter, _CharT, _TraitsT>::
+ _Compiler<_FwdIter, _TraitsT>::
_M_disjunction()
{
this->_M_alternative();
- if (_M_match_token(_ScannerT::_S_token_or))
+ while (_M_match_token(_ScannerT::_S_token_or))
{
- _StateSeqT __alt1 = _M_stack.top(); _M_stack.pop();
- this->_M_disjunction();
- _StateSeqT __alt2 = _M_stack.top(); _M_stack.pop();
- _M_stack.push(_StateSeqT(__alt1, __alt2));
+ _StateSeqT __alt1 = _M_pop();
+ this->_M_alternative();
+ _StateSeqT __alt2 = _M_pop();
+ auto __end = _M_nfa._M_insert_dummy();
+ __alt1._M_append(__end);
+ __alt2._M_append(__end);
+ _M_stack.push(_StateSeqT(_M_nfa,
+ _M_nfa._M_insert_alt(__alt1._M_start,
+ __alt2._M_start, false),
+ __end));
}
}
- template<typename _InputIter, typename _CharT, typename _TraitsT>
+ template<typename _FwdIter, typename _TraitsT>
void
- _Compiler<_InputIter, _CharT, _TraitsT>::
+ _Compiler<_FwdIter, _TraitsT>::
_M_alternative()
{
if (this->_M_term())
{
- _StateSeqT __re = _M_stack.top(); _M_stack.pop();
+ _StateSeqT __re = _M_pop();
this->_M_alternative();
- if (!_M_stack.empty())
- {
- __re._M_append(_M_stack.top());
- _M_stack.pop();
- }
+ __re._M_append(_M_pop());
_M_stack.push(__re);
}
+ else
+ _M_stack.push(_StateSeqT(_M_nfa, _M_nfa._M_insert_dummy()));
}
- template<typename _InputIter, typename _CharT, typename _TraitsT>
+ template<typename _FwdIter, typename _TraitsT>
bool
- _Compiler<_InputIter, _CharT, _TraitsT>::
+ _Compiler<_FwdIter, _TraitsT>::
_M_term()
{
if (this->_M_assertion())
return false;
}
- template<typename _InputIter, typename _CharT, typename _TraitsT>
+ template<typename _FwdIter, typename _TraitsT>
bool
- _Compiler<_InputIter, _CharT, _TraitsT>::
+ _Compiler<_FwdIter, _TraitsT>::
_M_assertion()
{
if (_M_match_token(_ScannerT::_S_token_line_begin))
- {
- // __m.push(_Matcher::_S_opcode_line_begin);
- return true;
- }
- if (_M_match_token(_ScannerT::_S_token_line_end))
- {
- // __m.push(_Matcher::_S_opcode_line_end);
- return true;
- }
- if (_M_match_token(_ScannerT::_S_token_word_begin))
- {
- // __m.push(_Matcher::_S_opcode_word_begin);
- return true;
- }
- if (_M_match_token(_ScannerT::_S_token_word_end))
- {
- // __m.push(_Matcher::_S_opcode_word_end);
- return true;
+ _M_stack.push(_StateSeqT(_M_nfa, _M_nfa._M_insert_line_begin()));
+ else if (_M_match_token(_ScannerT::_S_token_line_end))
+ _M_stack.push(_StateSeqT(_M_nfa, _M_nfa._M_insert_line_end()));
+ else if (_M_match_token(_ScannerT::_S_token_word_bound))
+ // _M_value[0] == 'n' means it's negtive, say "not word boundary".
+ _M_stack.push(_StateSeqT(_M_nfa, _M_nfa.
+ _M_insert_word_bound(_M_value[0] == 'n')));
+ else if (_M_match_token(_ScannerT::_S_token_subexpr_lookahead_begin))
+ {
+ auto __neg = _M_value[0] == 'n';
+ this->_M_disjunction();
+ if (!_M_match_token(_ScannerT::_S_token_subexpr_end))
+ __throw_regex_error(regex_constants::error_paren);
+ auto __tmp = _M_pop();
+ __tmp._M_append(_M_nfa._M_insert_accept());
+ _M_stack.push(
+ _StateSeqT(
+ _M_nfa,
+ _M_nfa._M_insert_lookahead(__tmp._M_start, __neg)));
}
- return false;
+ else
+ return false;
+ return true;
}
- template<typename _InputIter, typename _CharT, typename _TraitsT>
+ template<typename _FwdIter, typename _TraitsT>
void
- _Compiler<_InputIter, _CharT, _TraitsT>::
+ _Compiler<_FwdIter, _TraitsT>::
_M_quantifier()
{
- if (_M_match_token(_ScannerT::_S_token_closure0))
+ bool __neg = (_M_flags & regex_constants::ECMAScript);
+ auto __init = [this, &__neg]()
{
if (_M_stack.empty())
__throw_regex_error(regex_constants::error_badrepeat);
- _StateSeqT __r(_M_stack.top(), -1);
- __r._M_append(__r._M_front());
- _M_stack.pop();
+ __neg = __neg && _M_match_token(_ScannerT::_S_token_opt);
+ };
+ if (_M_match_token(_ScannerT::_S_token_closure0))
+ {
+ __init();
+ auto __e = _M_pop();
+ _StateSeqT __r(_M_nfa, _M_nfa._M_insert_alt(_S_invalid_state_id,
+ __e._M_start, __neg));
+ __e._M_append(__r);
_M_stack.push(__r);
- return;
}
- if (_M_match_token(_ScannerT::_S_token_closure1))
+ else if (_M_match_token(_ScannerT::_S_token_closure1))
{
- if (_M_stack.empty())
- __throw_regex_error(regex_constants::error_badrepeat);
- _StateSeqT __r(_M_state_store,
- _M_state_store.
- _M_insert_alt(_S_invalid_state_id,
- _M_stack.top()._M_front()));
- _M_stack.top()._M_append(__r);
- return;
+ __init();
+ auto __e = _M_pop();
+ __e._M_append(_M_nfa._M_insert_alt(_S_invalid_state_id, __e._M_start,
+ __neg));
+ _M_stack.push(__e);
}
- if (_M_match_token(_ScannerT::_S_token_opt))
+ else if (_M_match_token(_ScannerT::_S_token_opt))
{
- if (_M_stack.empty())
- __throw_regex_error(regex_constants::error_badrepeat);
- _StateSeqT __r(_M_stack.top(), -1);
- _M_stack.pop();
+ __init();
+ auto __e = _M_pop();
+ auto __end = _M_nfa._M_insert_dummy();
+ _StateSeqT __r(_M_nfa, _M_nfa._M_insert_alt(_S_invalid_state_id,
+ __e._M_start, __neg));
+ __e._M_append(__end);
+ __r._M_append(__end);
_M_stack.push(__r);
- return;
}
- if (_M_match_token(_ScannerT::_S_token_interval_begin))
+ else if (_M_match_token(_ScannerT::_S_token_interval_begin))
{
if (_M_stack.empty())
__throw_regex_error(regex_constants::error_badrepeat);
if (!_M_match_token(_ScannerT::_S_token_dup_count))
__throw_regex_error(regex_constants::error_badbrace);
- _StateSeqT __r(_M_stack.top());
- int __min_rep = _M_cur_int_value(10);
- for (int __i = 1; __i < __min_rep; ++__i)
- _M_stack.top()._M_append(__r._M_clone());
+ _StateSeqT __r(_M_pop());
+ _StateSeqT __e(_M_nfa, _M_nfa._M_insert_dummy());
+ long __min_rep = _M_cur_int_value(10);
+ bool __infi = false;
+ long __n;
+
+ // {3
if (_M_match_token(_ScannerT::_S_token_comma))
- if (_M_match_token(_ScannerT::_S_token_dup_count))
- {
- int __n = _M_cur_int_value(10) - __min_rep;
- if (__n < 0)
- __throw_regex_error(regex_constants::error_badbrace);
- for (int __i = 0; __i < __n; ++__i)
- {
- _StateSeqT __r(_M_state_store,
- _M_state_store.
- _M_insert_alt(_S_invalid_state_id,
- _M_stack.top()._M_front()));
- _M_stack.top()._M_append(__r);
- }
- }
+ if (_M_match_token(_ScannerT::_S_token_dup_count)) // {3,7}
+ __n = _M_cur_int_value(10) - __min_rep;
else
- {
- _StateSeqT __r(_M_stack.top(), -1);
- __r._M_push_back(__r._M_front());
- _M_stack.pop();
- _M_stack.push(__r);
- }
+ __infi = true;
+ else
+ __n = 0;
if (!_M_match_token(_ScannerT::_S_token_interval_end))
__throw_regex_error(regex_constants::error_brace);
- return;
+
+ __neg = __neg && _M_match_token(_ScannerT::_S_token_opt);
+
+ for (long __i = 0; __i < __min_rep; ++__i)
+ __e._M_append(__r._M_clone());
+
+ if (__infi)
+ {
+ auto __tmp = __r._M_clone();
+ _StateSeqT __s(_M_nfa,
+ _M_nfa._M_insert_alt(_S_invalid_state_id,
+ __tmp._M_start, __neg));
+ __tmp._M_append(__s);
+ __e._M_append(__s);
+ }
+ else
+ {
+ if (__n < 0)
+ __throw_regex_error(regex_constants::error_badbrace);
+ auto __end = _M_nfa._M_insert_dummy();
+ // _M_alt is the "match more" branch, and _M_next is the
+ // "match less" one. Switch _M_alt and _M_next of all created
+ // nodes. This is a hacking but IMO works well.
+ std::stack<_StateIdT> __stack;
+ for (long __i = 0; __i < __n; ++__i)
+ {
+ auto __tmp = __r._M_clone();
+ auto __alt = _M_nfa._M_insert_alt(__tmp._M_start,
+ __end, __neg);
+ __stack.push(__alt);
+ __e._M_append(_StateSeqT(_M_nfa, __alt, __tmp._M_end));
+ }
+ __e._M_append(__end);
+ while (!__stack.empty())
+ {
+ auto& __tmp = _M_nfa[__stack.top()];
+ __stack.pop();
+ swap(__tmp._M_next, __tmp._M_alt);
+ }
+ }
+ _M_stack.push(__e);
}
}
- template<typename _InputIter, typename _CharT, typename _TraitsT>
+ template<typename _FwdIter, typename _TraitsT>
bool
- _Compiler<_InputIter, _CharT, _TraitsT>::
+ _Compiler<_FwdIter, _TraitsT>::
_M_atom()
{
if (_M_match_token(_ScannerT::_S_token_anychar))
- {
- const static auto&
- __any_matcher = [](_CharT) -> bool
- { return true; };
-
- _M_stack.push(_StateSeqT(_M_state_store,
- _M_state_store._M_insert_matcher
- (__any_matcher)));
- return true;
- }
- if (_M_match_token(_ScannerT::_S_token_ord_char))
- {
- auto __c = _M_cur_value[0];
- __detail::_Matcher<_CharT> f;
- if (_M_flags & regex_constants::icase)
- {
- auto __traits = this->_M_traits;
- __c = __traits.translate_nocase(__c);
- f = [__traits, __c](_CharT __ch) -> bool
- { return __traits.translate_nocase(__ch) == __c; };
- }
- else
- f = [__c](_CharT __ch) -> bool
- { return __ch == __c; };
-
- _M_stack.push(_StateSeqT(_M_state_store,
- _M_state_store._M_insert_matcher(f)));
- return true;
- }
- if (_M_match_token(_ScannerT::_S_token_backref))
- {
- // __m.push(_Matcher::_S_opcode_ordchar, _M_cur_value);
- _M_stack.push(_StateSeqT(_M_state_store, _M_state_store.
- _M_insert_backref(_M_cur_int_value(10))));
- return true;
+ _M_stack.push(_StateSeqT(_M_nfa,
+ _M_nfa._M_insert_matcher
+ (_AnyMatcher<_TraitsT>(_M_traits))));
+ else if (_M_try_char())
+ _M_stack.push(_StateSeqT(_M_nfa,
+ _M_nfa._M_insert_matcher
+ (_CharMatcher<_TraitsT>(_M_value[0],
+ _M_traits,
+ _M_flags))));
+ else if (_M_match_token(_ScannerT::_S_token_backref))
+ _M_stack.push(_StateSeqT(_M_nfa, _M_nfa.
+ _M_insert_backref(_M_cur_int_value(10))));
+ else if (_M_match_token(_ScannerT::_S_token_quoted_class))
+ {
+ _GLIBCXX_DEBUG_ASSERT(_M_value.size() == 1);
+ _BMatcherT __matcher(_M_ctype.is(_CtypeT::upper, _M_value[0]),
+ _M_traits, _M_flags);
+ __matcher._M_add_character_class(_M_value);
+ _M_stack.push(_StateSeqT(_M_nfa,
+ _M_nfa._M_insert_matcher(std::move(__matcher))));
+ }
+ else if (_M_match_token(_ScannerT::_S_token_subexpr_no_group_begin))
+ {
+ _StateSeqT __r(_M_nfa, _M_nfa._M_insert_dummy());
+ this->_M_disjunction();
+ if (!_M_match_token(_ScannerT::_S_token_subexpr_end))
+ __throw_regex_error(regex_constants::error_paren);
+ __r._M_append(_M_pop());
+ _M_stack.push(__r);
}
- if (_M_match_token(_ScannerT::_S_token_subexpr_begin))
+ else if (_M_match_token(_ScannerT::_S_token_subexpr_begin))
{
- int __mark = _M_state_store._M_sub_count();
- _StateSeqT __r(_M_state_store,
- _M_state_store.
- _M_insert_subexpr_begin());
+ _StateSeqT __r(_M_nfa, _M_nfa._M_insert_subexpr_begin());
this->_M_disjunction();
if (!_M_match_token(_ScannerT::_S_token_subexpr_end))
__throw_regex_error(regex_constants::error_paren);
- if (!_M_stack.empty())
- {
- __r._M_append(_M_stack.top());
- _M_stack.pop();
- }
- __r._M_append(_M_state_store._M_insert_subexpr_end());
+ __r._M_append(_M_pop());
+ __r._M_append(_M_nfa._M_insert_subexpr_end());
_M_stack.push(__r);
- return true;
}
- return _M_bracket_expression();
- }
-
- template<typename _InputIter, typename _CharT, typename _TraitsT>
- bool
- _Compiler<_InputIter, _CharT, _TraitsT>::
- _M_bracket_expression()
- {
- bool __inverse =
- _M_match_token(_ScannerT::_S_token_bracket_inverse_begin);
- if (!(__inverse || _M_match_token(_ScannerT::_S_token_bracket_begin)))
+ else if (!_M_bracket_expression())
return false;
- _BMatcherT __matcher( __inverse, _M_traits, _M_flags);
- // special case: only if _not_ chr first after
- // '[' or '[^' or if ECMAscript
- if (!_M_bracket_list(__matcher) // list is empty
- && !(_M_flags & regex_constants::ECMAScript))
- __throw_regex_error(regex_constants::error_brack);
- _M_stack.push(_StateSeqT(_M_state_store,
- _M_state_store._M_insert_matcher(__matcher)));
return true;
}
- template<typename _InputIter, typename _CharT, typename _TraitsT>
- bool // list is non-empty
- _Compiler<_InputIter, _CharT, _TraitsT>::
- _M_bracket_list(_BMatcherT& __matcher)
+ template<typename _FwdIter, typename _TraitsT>
+ bool
+ _Compiler<_FwdIter, _TraitsT>::
+ _M_bracket_expression()
{
- if (_M_match_token(_ScannerT::_S_token_bracket_end))
+ bool __neg =
+ _M_match_token(_ScannerT::_S_token_bracket_neg_begin);
+ if (!(__neg || _M_match_token(_ScannerT::_S_token_bracket_begin)))
return false;
- _M_expression_term(__matcher);
- _M_bracket_list(__matcher);
+ _BMatcherT __matcher(__neg, _M_traits, _M_flags);
+ while (!_M_match_token(_ScannerT::_S_token_bracket_end))
+ _M_expression_term(__matcher);
+ _M_stack.push(_StateSeqT(_M_nfa,
+ _M_nfa._M_insert_matcher(std::move(__matcher))));
return true;
}
- template<typename _InputIter, typename _CharT, typename _TraitsT>
+ template<typename _FwdIter, typename _TraitsT>
void
- _Compiler<_InputIter, _CharT, _TraitsT>::
+ _Compiler<_FwdIter, _TraitsT>::
_M_expression_term(_BMatcherT& __matcher)
{
if (_M_match_token(_ScannerT::_S_token_collsymbol))
- {
- __matcher._M_add_collating_element(_M_cur_value);
- return;
+ __matcher._M_add_collating_element(_M_value);
+ else if (_M_match_token(_ScannerT::_S_token_equiv_class_name))
+ __matcher._M_add_equivalence_class(_M_value);
+ else if (_M_match_token(_ScannerT::_S_token_char_class_name))
+ __matcher._M_add_character_class(_M_value);
+ else if (_M_try_char()) // [a
+ {
+ auto __ch = _M_value[0];
+ if (_M_try_char())
+ {
+ if (_M_value[0] == '-') // [a-
+ {
+ if (_M_try_char()) // [a-z]
+ {
+ __matcher._M_make_range(__ch, _M_value[0]);
+ return;
+ }
+ // If the dash is the last character in the bracket
+ // expression, it is not special.
+ if (_M_scanner._M_get_token()
+ != _ScannerT::_S_token_bracket_end)
+ __throw_regex_error(regex_constants::error_range);
+ }
+ __matcher._M_add_char(_M_value[0]);
+ }
+ __matcher._M_add_char(__ch);
}
- if (_M_match_token(_ScannerT::_S_token_equiv_class_name))
+ else
+ __throw_regex_error(regex_constants::error_brack);
+ }
+
+ template<typename _FwdIter, typename _TraitsT>
+ bool
+ _Compiler<_FwdIter, _TraitsT>::
+ _M_try_char()
+ {
+ bool __is_char = false;
+ if (_M_match_token(_ScannerT::_S_token_oct_num))
{
- __matcher._M_add_equivalence_class(_M_cur_value);
- return;
+ __is_char = true;
+ _M_value.assign(1, _M_cur_int_value(8));
}
- if (_M_match_token(_ScannerT::_S_token_char_class_name))
+ else if (_M_match_token(_ScannerT::_S_token_hex_num))
{
- __matcher._M_add_character_class(_M_cur_value);
- return;
+ __is_char = true;
+ _M_value.assign(1, _M_cur_int_value(16));
}
- if (_M_match_token(_ScannerT::_S_token_collelem_single)) // [a
+ else if (_M_match_token(_ScannerT::_S_token_ord_char))
+ __is_char = true;
+ return __is_char;
+ }
+
+ template<typename _FwdIter, typename _TraitsT>
+ bool
+ _Compiler<_FwdIter, _TraitsT>::
+ _M_match_token(_TokenT token)
+ {
+ if (token == _M_scanner._M_get_token())
{
- auto __ch = _M_cur_value[0];
- if (_M_match_token(_ScannerT::_S_token_dash)) // [a-
- {
- // If the dash is the last character in the bracket expression,
- // it is not special.
- if (_M_scanner._M_token() == _ScannerT::_S_token_bracket_end)
- __matcher._M_add_char(_M_cur_value[0]); // [a-] <=> [a\-]
- else // [a-z]
- {
- if (!_M_match_token(_ScannerT::_S_token_collelem_single))
- __throw_regex_error(regex_constants::error_range);
- __matcher._M_make_range(__ch, _M_cur_value[0]);
- }
- }
- else // [a]
- __matcher._M_add_char(__ch);
- return;
+ _M_value = _M_scanner._M_get_value();
+ _M_scanner._M_advance();
+ return true;
}
- __throw_regex_error(regex_constants::error_brack);
+ return false;
}
- template<typename _InputIter, typename _CharT, typename _TraitsT>
+ template<typename _FwdIter, typename _TraitsT>
int
- _Compiler<_InputIter, _CharT, _TraitsT>::
+ _Compiler<_FwdIter, _TraitsT>::
_M_cur_int_value(int __radix)
{
- int __v = 0;
+ long __v = 0;
for (typename _StringT::size_type __i = 0;
- __i < _M_cur_value.length(); ++__i)
- __v =__v * __radix + _M_traits.value(_M_cur_value[__i], __radix);
+ __i < _M_value.length(); ++__i)
+ __v =__v * __radix + _M_traits.value(_M_value[__i], __radix);
return __v;
}
- template<typename _CharT, typename _TraitsT>
- bool _BracketMatcher<_CharT, _TraitsT>::
- operator()(_CharT __ch) const
+ template<typename _TraitsT>
+ bool
+ _BracketMatcher<_TraitsT>::operator()(_CharT __ch) const
{
- auto __oldch = __ch;
- if (_M_flags & regex_constants::collate)
- if (_M_is_icase())
- __ch = _M_traits.translate_nocase(__ch);
- else
- __ch = _M_traits.translate(__ch);
-
bool __ret = false;
- for (auto __c : _M_char_set)
- if (__c == __ch)
- {
- __ret = true;
- break;
- }
- if (!__ret && _M_traits.isctype(__oldch, _M_class_set))
+ if (_M_traits.isctype(__ch, _M_class_set)
+ || _M_char_set.count(_M_translate(__ch))
+ || _M_equiv_set.count(_M_traits.transform_primary(&__ch, &__ch+1)))
__ret = true;
else
{
- _StringT __s = _M_get_str(__ch);
+ _StringT __s = _M_get_str(_M_flags & regex_constants::collate
+ ? _M_translate(__ch) : __ch);
for (auto& __it : _M_range_set)
if (__it.first <= __s && __s <= __it.second)
{
}
}
if (_M_is_non_matching)
- __ret = !__ret;
- return __ret;
+ return !__ret;
+ else
+ return __ret;
}
_GLIBCXX_END_NAMESPACE_VERSION