From: Patrick Palka Date: Fri, 30 Jan 2026 17:23:40 +0000 (-0500) Subject: libstdc++/regex: Make DFS executor non-recursive [PR86164] X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=158ad5f96954da5fa24d5c2a91ae92417fb62e20;p=thirdparty%2Fgcc.git libstdc++/regex: Make DFS executor non-recursive [PR86164] This patch replaces the recursive implementation of the DFS executor with an iterative one using an explicit heap-based stack. System stack usage of the executor is now constant with respect to input size rather than linear, avoding stack overflow errors when processing long inputs. PR libstdc++/86164 libstdc++-v3/ChangeLog: * include/bits/regex.h (__detail::_Executor): Use inline namespace _V2. * include/bits/regex_executor.h (__detail::_ExecutorFrame): Declare. (__detail::_Executor): Use inline namespace _V2. (__detail::_Executor::_M_node): Declare. (__detail::_Executor::_M_frames): New data member. * include/bits/regex_executor.tcc (__detail::_ExecutorFrameOpcode): New. (__detail::_ExecutorFrameBase): New. (__detail::_ExecutorFrame): New. (__detail::_Executor): Use inline namespace _V2. (__detail::_Executor::_M_rep_once_more): Replace recursive _M_dfs calls with an _S_opcode_next frame push, and any work after such calls with an appropriate frame push. (__detail::_M_handle_repeat): Likewise. (__detail::_M_handle_subexpr_begin): Likewise. (__detail::_M_handle_subexpr_end): Likewise. (__detail::_M_handle_line_begin_assertion): Likewise. (__detail::_M_handle_line_end_assertion): Likewise. (__detail::_M_handle_word_boundary): Likewise. (__detail::_M_handle_subexpr_lookahead): Likewise. (__detail::_M_handle_match): Likewise. (__detail::_M_handle_backref): Likewise. (__detail::_M_handle_accept): Likewise. (__detail::_M_handle_alternative): Likewise. (__detail::_M_node): Factored out from _M_dfs. (__detail::_M_dfs): Push an initial frame to _M_frames that visits the starting node and pass this stack each subroutine. Pop the latest _ExecutorFrame from _M_frames and handle appropriately according to its _ExecutorFrameOpcode. Loop until _M_frames is empty. Reviewed-by: Tomasz KamiƄski Reviewed-by: Jonathan Wakely --- diff --git a/libstdc++-v3/include/bits/regex.h b/libstdc++-v3/include/bits/regex.h index 68fb6254f36..5bcb9a447ce 100644 --- a/libstdc++-v3/include/bits/regex.h +++ b/libstdc++-v3/include/bits/regex.h @@ -61,8 +61,10 @@ namespace __detail _RegexExecutorPolicy __policy, bool __match_mode); +_GLIBCXX_BEGIN_INLINE_ABI_NAMESPACE(_V2) template class _Executor; +_GLIBCXX_END_INLINE_ABI_NAMESPACE(_V2) template struct __is_contiguous_iter : false_type { }; diff --git a/libstdc++-v3/include/bits/regex_executor.h b/libstdc++-v3/include/bits/regex_executor.h index f9857bfc4c1..94048e1f632 100644 --- a/libstdc++-v3/include/bits/regex_executor.h +++ b/libstdc++-v3/include/bits/regex_executor.h @@ -36,11 +36,15 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION namespace __detail { +_GLIBCXX_BEGIN_INLINE_ABI_NAMESPACE(_V2) /** * @addtogroup regex-detail * @{ */ + template::value> + struct _ExecutorFrame; + /** * @brief Takes a regex and an input string and does the matching. * @@ -142,6 +146,9 @@ namespace __detail void _M_handle_alternative(_Match_mode, _StateIdT); + void + _M_node(_Match_mode, _StateIdT); + void _M_dfs(_Match_mode __match_mode, _StateIdT __start); @@ -290,6 +297,7 @@ namespace __detail }; public: + _GLIBCXX_STD_C::vector<_ExecutorFrame<_BiIter>> _M_frames; _ResultsVec _M_cur_results; _BiIter _M_current; _BiIter _M_begin; @@ -305,6 +313,7 @@ namespace __detail }; ///@} regex-detail +_GLIBCXX_END_INLINE_ABI_NAMESPACE(_V2) } // namespace __detail _GLIBCXX_END_NAMESPACE_VERSION } // namespace std diff --git a/libstdc++-v3/include/bits/regex_executor.tcc b/libstdc++-v3/include/bits/regex_executor.tcc index 421e585f39d..b76f1dce952 100644 --- a/libstdc++-v3/include/bits/regex_executor.tcc +++ b/libstdc++-v3/include/bits/regex_executor.tcc @@ -36,6 +36,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION #pragma GCC diagnostic ignored "-Wc++17-extensions" // if constexpr namespace __detail { +_GLIBCXX_BEGIN_INLINE_ABI_NAMESPACE(_V2) template bool _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: @@ -55,6 +56,85 @@ namespace __detail return false; } + enum _ExecutorFrameOpcode : unsigned char + { + _S_fopcode_next, + _S_fopcode_fallback_next, + _S_fopcode_rep_once_more, + _S_fopcode_fallback_rep_once_more, + _S_fopcode_posix_alternative, + _S_fopcode_merge_sol, + _S_fopcode_restore_current, + _S_fopcode_restore_cur_results, + _S_fopcode_restore_rep_count, + _S_fopcode_decrement_rep_count, + }; + + struct _ExecutorFrameBase + { + _ExecutorFrameBase(_ExecutorFrameOpcode __op, _StateIdT __i) + : _M_op(__op), _M_state_id(__i) + { } + + _ExecutorFrameOpcode _M_op; + union { + unsigned char _M_byte0; + struct { // Used by restore_rep_count frame + unsigned char _M_count : 2; + }; + struct { // Used by restore_cur_results frame + unsigned char _M_end : 1; + unsigned char _M_matched : 1; + }; + }; + unsigned char _M_bytes[6]; + _StateIdT _M_state_id; + }; + + template::value */> + struct _ExecutorFrame : _ExecutorFrameBase + { + _ExecutorFrame(_ExecutorFrameOpcode __op, _StateIdT __i) + : _ExecutorFrameBase(__op, __i) + { } + + _ExecutorFrame(_ExecutorFrameOpcode __op, _StateIdT __i, _BiIter __p) + : _ExecutorFrameBase(__op, __i), _M_pos(__p) + { } + + _ExecutorFrame(_ExecutorFrameOpcode __op, _StateIdT __i, long __v) + : _ExecutorFrameBase(__op, __i), _M_val(__v) + { } + + // _M_pos and _M_val are mutually exclusive, which the optimized + // partial specialization below depends on. + _BiIter _M_pos = _BiIter(); + long _M_val = 0; + }; + + // Space-optimized partial specialization for when the input iterator is + // trivially copyable. + template + struct _ExecutorFrame<_BiIter, true> : _ExecutorFrameBase + { + _ExecutorFrame(_ExecutorFrameOpcode __op, _StateIdT __i) + : _ExecutorFrameBase(__op, __i) + { } + + _ExecutorFrame(_ExecutorFrameOpcode __op, _StateIdT __i, _BiIter __p) + : _ExecutorFrameBase(__op, __i), _M_pos(__p) + { } + + _ExecutorFrame(_ExecutorFrameOpcode __op, _StateIdT __i, long __v) + : _ExecutorFrameBase(__op, __i), _M_val(__v) + { } + + union { + _BiIter _M_pos; + long _M_val; + }; + }; + // The _M_main function operates in different modes, DFS mode or BFS mode, // indicated by template parameter __dfs_mode, and dispatches to one of the // _M_main_dispatch overloads. @@ -181,19 +261,20 @@ namespace __detail auto& __rep_count = _M_rep_count[__i]; if (__rep_count.second == 0 || __rep_count.first != _M_current) { - auto __back = __rep_count; + _M_frames.emplace_back(_S_fopcode_restore_rep_count, + __i, __rep_count.first); + _M_frames.back()._M_count = __rep_count.second; __rep_count.first = _M_current; __rep_count.second = 1; - _M_dfs(__match_mode, __state._M_alt); - __rep_count = __back; + _M_frames.emplace_back(_S_fopcode_next, __state._M_alt); } else { if (__rep_count.second < 2) { __rep_count.second++; - _M_dfs(__match_mode, __state._M_alt); - __rep_count.second--; + _M_frames.emplace_back(_S_fopcode_decrement_rep_count, __i); + _M_frames.emplace_back(_S_fopcode_next, __state._M_alt); } } } @@ -208,23 +289,23 @@ namespace __detail _M_handle_repeat(_Match_mode __match_mode, _StateIdT __i) { const auto& __state = _M_nfa[__i]; - // Greedy. if (!__state._M_neg) { - _M_rep_once_more(__match_mode, __i); - // If it's DFS executor and already accepted, we're done. - if (!__dfs_mode || !_M_has_sol) - _M_dfs(__match_mode, __state._M_next); + if constexpr (__dfs_mode) + // If it's DFS executor and already accepted, we're done. + _M_frames.emplace_back(_S_fopcode_fallback_next, __state._M_next); + else + _M_frames.emplace_back(_S_fopcode_next, __state._M_next); + _M_frames.emplace_back(_S_fopcode_rep_once_more, __i); } else // Non-greedy mode { if constexpr (__dfs_mode) { // vice-versa. - _M_dfs(__match_mode, __state._M_next); - if (!_M_has_sol) - _M_rep_once_more(__match_mode, __i); + _M_frames.emplace_back(_S_fopcode_fallback_rep_once_more, __i); + _M_frames.emplace_back(_S_fopcode_next, __state._M_next); } else { @@ -233,12 +314,11 @@ namespace __detail // be better by attempting its next node. if (!_M_has_sol) { - _M_dfs(__match_mode, __state._M_next); // DON'T attempt anything if it's already accepted. An // accepted state *must* be better than a solution that // matches a non-greedy quantifier one more time. - if (!_M_has_sol) - _M_rep_once_more(__match_mode, __i); + _M_frames.emplace_back(_S_fopcode_fallback_rep_once_more, __i); + _M_frames.emplace_back(_S_fopcode_next, __state._M_next); } } } @@ -250,12 +330,13 @@ namespace __detail _M_handle_subexpr_begin(_Match_mode __match_mode, _StateIdT __i) { const auto& __state = _M_nfa[__i]; - auto& __res = _M_cur_results[__state._M_subexpr]; - auto __back = __res.first; + _M_frames.emplace_back(_S_fopcode_restore_cur_results, + static_cast<_StateIdT>(__state._M_subexpr), + __res.first); + _M_frames.back()._M_end = false; __res.first = _M_current; - _M_dfs(__match_mode, __state._M_next); - __res.first = __back; + _M_frames.emplace_back(_S_fopcode_next, __state._M_next); } template(__state._M_subexpr), + __res.second); + _M_frames.back()._M_end = true; + _M_frames.back()._M_matched = __res.matched; __res.second = _M_current; __res.matched = true; - _M_dfs(__match_mode, __state._M_next); - __res = __back; + _M_frames.emplace_back(_S_fopcode_next, __state._M_next); } template void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: - _M_dfs(_Match_mode __match_mode, _StateIdT __i) + _M_node(_Match_mode __match_mode, _StateIdT __i) { if (_M_states._M_visited(__i)) return; @@ -545,6 +621,72 @@ namespace __detail } } + template + void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: + _M_dfs(_Match_mode __match_mode, _StateIdT __start) + { + _M_frames.emplace_back(_S_fopcode_next, __start); + + while (!_M_frames.empty()) + { + _ExecutorFrame<_BiIter> __frame = std::move(_M_frames.back()); + _M_frames.pop_back(); + + switch (__frame._M_op) + { + case _S_fopcode_fallback_next: + if (_M_has_sol) + break; + [[__fallthrough__]]; + case _S_fopcode_next: + _M_node(__match_mode, __frame._M_state_id); + break; + + case _S_fopcode_fallback_rep_once_more: + if (_M_has_sol) + break; + [[__fallthrough__]]; + case _S_fopcode_rep_once_more: + _M_rep_once_more(__match_mode, __frame._M_state_id); + break; + + case _S_fopcode_posix_alternative: + _M_frames.emplace_back(_S_fopcode_merge_sol, 0, _M_has_sol); + _M_frames.emplace_back(_S_fopcode_next, __frame._M_state_id); + _M_has_sol = false; + break; + + case _S_fopcode_merge_sol: + _M_has_sol |= __frame._M_val; + break; + + case _S_fopcode_restore_current: + _M_current = __frame._M_pos; + break; + + case _S_fopcode_restore_cur_results: + if (!__frame._M_end) + _M_cur_results[__frame._M_state_id].first = __frame._M_pos; + else + { + _M_cur_results[__frame._M_state_id].second = __frame._M_pos; + _M_cur_results[__frame._M_state_id].matched = __frame._M_matched; + } + break; + + case _S_fopcode_restore_rep_count: + _M_rep_count[__frame._M_state_id].first = __frame._M_pos; + _M_rep_count[__frame._M_state_id].second = __frame._M_count; + break; + + case _S_fopcode_decrement_rep_count: + _M_rep_count[__frame._M_state_id].second--; + break; + } + } + } + // Return whether now is at some word boundary. template @@ -569,6 +711,7 @@ namespace __detail return __left_is_word != __right_is_word; } +_GLIBCXX_END_INLINE_ABI_NAMESPACE(_V2) } // namespace __detail #pragma GCC diagnostic pop