]> git.ipfire.org Git - thirdparty/gcc.git/commitdiff
libstdc++/regex: Make DFS executor non-recursive [PR86164]
authorPatrick Palka <ppalka@redhat.com>
Fri, 30 Jan 2026 17:23:40 +0000 (12:23 -0500)
committerPatrick Palka <ppalka@redhat.com>
Fri, 30 Jan 2026 17:23:40 +0000 (12:23 -0500)
This patch replaces the recursive implementation of the DFS executor
with an iterative one using an explicit heap-based stack.  System
stack usage of the executor is now constant with respect to input size
rather than linear, avoding stack overflow errors when processing
long inputs.

PR libstdc++/86164

libstdc++-v3/ChangeLog:

* include/bits/regex.h (__detail::_Executor): Use inline
namespace _V2.
* include/bits/regex_executor.h (__detail::_ExecutorFrame):
Declare.
(__detail::_Executor): Use inline namespace _V2.
(__detail::_Executor::_M_node): Declare.
(__detail::_Executor::_M_frames): New data member.
* include/bits/regex_executor.tcc (__detail::_ExecutorFrameOpcode):
New.
(__detail::_ExecutorFrameBase): New.
(__detail::_ExecutorFrame): New.
(__detail::_Executor): Use inline namespace _V2.
(__detail::_Executor::_M_rep_once_more): Replace recursive
_M_dfs calls with an _S_opcode_next frame push, and any work
after such calls with an appropriate frame push.
(__detail::_M_handle_repeat): Likewise.
(__detail::_M_handle_subexpr_begin): Likewise.
(__detail::_M_handle_subexpr_end): Likewise.
(__detail::_M_handle_line_begin_assertion): Likewise.
(__detail::_M_handle_line_end_assertion): Likewise.
(__detail::_M_handle_word_boundary): Likewise.
(__detail::_M_handle_subexpr_lookahead): Likewise.
(__detail::_M_handle_match): Likewise.
(__detail::_M_handle_backref): Likewise.
(__detail::_M_handle_accept): Likewise.
(__detail::_M_handle_alternative): Likewise.
(__detail::_M_node): Factored out from _M_dfs.
(__detail::_M_dfs): Push an initial frame to _M_frames that
visits the starting node and pass this stack each subroutine.
Pop the latest _ExecutorFrame from _M_frames and handle
appropriately according to its _ExecutorFrameOpcode.  Loop until
_M_frames is empty.

Reviewed-by: Tomasz KamiƄski <tkaminsk@redhat.com>
Reviewed-by: Jonathan Wakely <jwakely@redhat.com>
libstdc++-v3/include/bits/regex.h
libstdc++-v3/include/bits/regex_executor.h
libstdc++-v3/include/bits/regex_executor.tcc

index 68fb6254f362c992c872457688a89821130de2c8..5bcb9a447ce29b62bc296d3a8fc926ceda9d12ec 100644 (file)
@@ -61,8 +61,10 @@ namespace __detail
                      _RegexExecutorPolicy                 __policy,
                      bool                                 __match_mode);
 
+_GLIBCXX_BEGIN_INLINE_ABI_NAMESPACE(_V2)
   template<typename, typename, typename, bool>
     class _Executor;
+_GLIBCXX_END_INLINE_ABI_NAMESPACE(_V2)
 
   template<typename _Tp>
     struct __is_contiguous_iter : false_type { };
index f9857bfc4c1da3405db49d0df636a5181d3975fc..94048e1f632c75d2750b98e778686f0dc76e6576 100644 (file)
@@ -36,11 +36,15 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 
 namespace __detail
 {
+_GLIBCXX_BEGIN_INLINE_ABI_NAMESPACE(_V2)
   /**
    * @addtogroup regex-detail
    * @{
    */
 
+  template<typename _BiIter, bool _Trivial = is_trivially_copyable<_BiIter>::value>
+    struct _ExecutorFrame;
+
   /**
    * @brief Takes a regex and an input string and does the matching.
    *
@@ -142,6 +146,9 @@ namespace __detail
       void
       _M_handle_alternative(_Match_mode, _StateIdT);
 
+      void
+      _M_node(_Match_mode, _StateIdT);
+
       void
       _M_dfs(_Match_mode __match_mode, _StateIdT __start);
 
@@ -290,6 +297,7 @@ namespace __detail
        };
 
     public:
+      _GLIBCXX_STD_C::vector<_ExecutorFrame<_BiIter>>       _M_frames;
       _ResultsVec                                           _M_cur_results;
       _BiIter                                               _M_current;
       _BiIter                                               _M_begin;
@@ -305,6 +313,7 @@ namespace __detail
     };
 
  ///@} regex-detail
+_GLIBCXX_END_INLINE_ABI_NAMESPACE(_V2)
 } // namespace __detail
 _GLIBCXX_END_NAMESPACE_VERSION
 } // namespace std
index 421e585f39d988d91477bf154add32230accb4a2..b76f1dce952c2afd9cf59cb7ea4819ba81848aa8 100644 (file)
@@ -36,6 +36,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 #pragma GCC diagnostic ignored "-Wc++17-extensions" // if constexpr
 namespace __detail
 {
+_GLIBCXX_BEGIN_INLINE_ABI_NAMESPACE(_V2)
   template<typename _BiIter, typename _Alloc, typename _TraitsT,
           bool __dfs_mode>
     bool _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>::
@@ -55,6 +56,85 @@ namespace __detail
       return false;
     }
 
+  enum _ExecutorFrameOpcode : unsigned char
+  {
+    _S_fopcode_next,
+    _S_fopcode_fallback_next,
+    _S_fopcode_rep_once_more,
+    _S_fopcode_fallback_rep_once_more,
+    _S_fopcode_posix_alternative,
+    _S_fopcode_merge_sol,
+    _S_fopcode_restore_current,
+    _S_fopcode_restore_cur_results,
+    _S_fopcode_restore_rep_count,
+    _S_fopcode_decrement_rep_count,
+  };
+
+  struct _ExecutorFrameBase
+  {
+    _ExecutorFrameBase(_ExecutorFrameOpcode __op, _StateIdT __i)
+    : _M_op(__op), _M_state_id(__i)
+    { }
+
+    _ExecutorFrameOpcode _M_op;
+    union {
+      unsigned char _M_byte0;
+      struct { // Used by restore_rep_count frame
+       unsigned char _M_count : 2;
+      };
+      struct { // Used by restore_cur_results frame
+       unsigned char _M_end : 1;
+       unsigned char _M_matched : 1;
+      };
+    };
+    unsigned char _M_bytes[6];
+    _StateIdT _M_state_id;
+  };
+
+  template<typename _BiIter, bool _Trivial /* = is_trivially_copyable<_BiIter>::value */>
+    struct _ExecutorFrame : _ExecutorFrameBase
+    {
+      _ExecutorFrame(_ExecutorFrameOpcode __op, _StateIdT __i)
+      : _ExecutorFrameBase(__op, __i)
+      { }
+
+      _ExecutorFrame(_ExecutorFrameOpcode __op, _StateIdT __i, _BiIter __p)
+      : _ExecutorFrameBase(__op, __i), _M_pos(__p)
+      { }
+
+      _ExecutorFrame(_ExecutorFrameOpcode __op, _StateIdT __i, long __v)
+      : _ExecutorFrameBase(__op, __i), _M_val(__v)
+      { }
+
+      // _M_pos and _M_val are mutually exclusive, which the optimized
+      // partial specialization below depends on.
+      _BiIter _M_pos = _BiIter();
+      long _M_val = 0;
+    };
+
+  // Space-optimized partial specialization for when the input iterator is
+  // trivially copyable.
+  template<typename _BiIter>
+    struct _ExecutorFrame<_BiIter, true> : _ExecutorFrameBase
+    {
+      _ExecutorFrame(_ExecutorFrameOpcode __op, _StateIdT __i)
+      : _ExecutorFrameBase(__op, __i)
+      { }
+
+      _ExecutorFrame(_ExecutorFrameOpcode __op, _StateIdT __i, _BiIter __p)
+      : _ExecutorFrameBase(__op, __i), _M_pos(__p)
+      { }
+
+      _ExecutorFrame(_ExecutorFrameOpcode __op, _StateIdT __i, long __v)
+      : _ExecutorFrameBase(__op, __i), _M_val(__v)
+      { }
+
+      union {
+       _BiIter _M_pos;
+       long _M_val;
+      };
+    };
+
   // The _M_main function operates in different modes, DFS mode or BFS mode,
   // indicated by template parameter __dfs_mode, and dispatches to one of the
   // _M_main_dispatch overloads.
@@ -181,19 +261,20 @@ namespace __detail
       auto& __rep_count = _M_rep_count[__i];
       if (__rep_count.second == 0 || __rep_count.first != _M_current)
        {
-         auto __back = __rep_count;
+         _M_frames.emplace_back(_S_fopcode_restore_rep_count,
+                                __i, __rep_count.first);
+         _M_frames.back()._M_count = __rep_count.second;
          __rep_count.first = _M_current;
          __rep_count.second = 1;
-         _M_dfs(__match_mode, __state._M_alt);
-         __rep_count = __back;
+         _M_frames.emplace_back(_S_fopcode_next, __state._M_alt);
        }
       else
        {
          if (__rep_count.second < 2)
            {
              __rep_count.second++;
-             _M_dfs(__match_mode, __state._M_alt);
-             __rep_count.second--;
+             _M_frames.emplace_back(_S_fopcode_decrement_rep_count, __i);
+             _M_frames.emplace_back(_S_fopcode_next, __state._M_alt);
            }
        }
     }
@@ -208,23 +289,23 @@ namespace __detail
     _M_handle_repeat(_Match_mode __match_mode, _StateIdT __i)
     {
       const auto& __state = _M_nfa[__i];
-
       // Greedy.
       if (!__state._M_neg)
        {
-         _M_rep_once_more(__match_mode, __i);
-         // If it's DFS executor and already accepted, we're done.
-         if (!__dfs_mode || !_M_has_sol)
-           _M_dfs(__match_mode, __state._M_next);
+         if constexpr (__dfs_mode)
+           // If it's DFS executor and already accepted, we're done.
+           _M_frames.emplace_back(_S_fopcode_fallback_next, __state._M_next);
+         else
+           _M_frames.emplace_back(_S_fopcode_next, __state._M_next);
+         _M_frames.emplace_back(_S_fopcode_rep_once_more, __i);
        }
       else // Non-greedy mode
        {
          if constexpr (__dfs_mode)
            {
              // vice-versa.
-             _M_dfs(__match_mode, __state._M_next);
-             if (!_M_has_sol)
-               _M_rep_once_more(__match_mode, __i);
+             _M_frames.emplace_back(_S_fopcode_fallback_rep_once_more, __i);
+             _M_frames.emplace_back(_S_fopcode_next, __state._M_next);
            }
          else
            {
@@ -233,12 +314,11 @@ namespace __detail
              // be better by attempting its next node.
              if (!_M_has_sol)
                {
-                 _M_dfs(__match_mode, __state._M_next);
                  // DON'T attempt anything if it's already accepted. An
                  // accepted state *must* be better than a solution that
                  // matches a non-greedy quantifier one more time.
-                 if (!_M_has_sol)
-                   _M_rep_once_more(__match_mode, __i);
+                 _M_frames.emplace_back(_S_fopcode_fallback_rep_once_more, __i);
+                 _M_frames.emplace_back(_S_fopcode_next, __state._M_next);
                }
            }
        }
@@ -250,12 +330,13 @@ namespace __detail
     _M_handle_subexpr_begin(_Match_mode __match_mode, _StateIdT __i)
     {
       const auto& __state = _M_nfa[__i];
-
       auto& __res = _M_cur_results[__state._M_subexpr];
-      auto __back = __res.first;
+      _M_frames.emplace_back(_S_fopcode_restore_cur_results,
+                            static_cast<_StateIdT>(__state._M_subexpr),
+                            __res.first);
+      _M_frames.back()._M_end = false;
       __res.first = _M_current;
-      _M_dfs(__match_mode, __state._M_next);
-      __res.first = __back;
+      _M_frames.emplace_back(_S_fopcode_next, __state._M_next);
     }
 
   template<typename _BiIter, typename _Alloc, typename _TraitsT,
@@ -264,13 +345,15 @@ namespace __detail
     _M_handle_subexpr_end(_Match_mode __match_mode, _StateIdT __i)
     {
       const auto& __state = _M_nfa[__i];
-
       auto& __res = _M_cur_results[__state._M_subexpr];
-      auto __back = __res;
+      _M_frames.emplace_back(_S_fopcode_restore_cur_results,
+                            static_cast<_StateIdT>(__state._M_subexpr),
+                            __res.second);
+      _M_frames.back()._M_end = true;
+      _M_frames.back()._M_matched = __res.matched;
       __res.second = _M_current;
       __res.matched = true;
-      _M_dfs(__match_mode, __state._M_next);
-      __res = __back;
+      _M_frames.emplace_back(_S_fopcode_next, __state._M_next);
     }
 
   template<typename _BiIter, typename _Alloc, typename _TraitsT,
@@ -280,7 +363,7 @@ namespace __detail
     {
       const auto& __state = _M_nfa[__i];
       if (_M_at_begin())
-       _M_dfs(__match_mode, __state._M_next);
+       _M_frames.emplace_back(_S_fopcode_next, __state._M_next);
     }
 
   template<typename _BiIter, typename _Alloc, typename _TraitsT,
@@ -290,7 +373,7 @@ namespace __detail
     {
       const auto& __state = _M_nfa[__i];
       if (_M_at_end())
-       _M_dfs(__match_mode, __state._M_next);
+       _M_frames.emplace_back(_S_fopcode_next, __state._M_next);
     }
 
   template<typename _BiIter, typename _Alloc, typename _TraitsT,
@@ -300,7 +383,7 @@ namespace __detail
     {
       const auto& __state = _M_nfa[__i];
       if (_M_word_boundary() == !__state._M_neg)
-       _M_dfs(__match_mode, __state._M_next);
+       _M_frames.emplace_back(_S_fopcode_next, __state._M_next);
     }
 
   // Here __state._M_alt offers a single start node for a sub-NFA.
@@ -312,7 +395,7 @@ namespace __detail
     {
       const auto& __state = _M_nfa[__i];
       if (_M_lookahead(__state._M_alt) == !__state._M_neg)
-       _M_dfs(__match_mode, __state._M_next);
+       _M_frames.emplace_back(_S_fopcode_next, __state._M_next);
     }
 
   template<typename _BiIter, typename _Alloc, typename _TraitsT,
@@ -321,16 +404,15 @@ namespace __detail
     _M_handle_match(_Match_mode __match_mode, _StateIdT __i)
     {
       const auto& __state = _M_nfa[__i];
-
       if (_M_current == _M_end)
        return;
       if constexpr (__dfs_mode)
        {
          if (__state._M_matches(*_M_current))
            {
+             _M_frames.emplace_back(_S_fopcode_restore_current, 0, _M_current);
              ++_M_current;
-             _M_dfs(__match_mode, __state._M_next);
-             --_M_current;
+             _M_frames.emplace_back(_S_fopcode_next, __state._M_next);
            }
        }
       else
@@ -413,13 +495,12 @@ namespace __detail
        {
          if (__last != _M_current)
            {
-             auto __backup = _M_current;
+             _M_frames.emplace_back(_S_fopcode_restore_current, 0, _M_current);
              _M_current = __last;
-             _M_dfs(__match_mode, __state._M_next);
-             _M_current = __backup;
+             _M_frames.emplace_back(_S_fopcode_next, __state._M_next);
            }
          else
-           _M_dfs(__match_mode, __state._M_next);
+           _M_frames.emplace_back(_S_fopcode_next, __state._M_next);
        }
     }
 
@@ -483,31 +564,26 @@ namespace __detail
     _M_handle_alternative(_Match_mode __match_mode, _StateIdT __i)
     {
       const auto& __state = _M_nfa[__i];
-
       if (_M_nfa._M_flags & regex_constants::ECMAScript)
        {
          // TODO: Fix BFS support. It is wrong.
-         _M_dfs(__match_mode, __state._M_alt);
          // Pick lhs if it matches. Only try rhs if it doesn't.
-         if (!_M_has_sol)
-           _M_dfs(__match_mode, __state._M_next);
+         _M_frames.emplace_back(_S_fopcode_fallback_next, __state._M_next);
+         _M_frames.emplace_back(_S_fopcode_next, __state._M_alt);
        }
       else
        {
          // Try both and compare the result.
          // See "case _S_opcode_accept:" handling above.
-         _M_dfs(__match_mode, __state._M_alt);
-         auto __has_sol = _M_has_sol;
-         _M_has_sol = false;
-         _M_dfs(__match_mode, __state._M_next);
-         _M_has_sol |= __has_sol;
+         _M_frames.emplace_back(_S_fopcode_posix_alternative, __state._M_next);
+         _M_frames.emplace_back(_S_fopcode_next, __state._M_alt);
        }
     }
 
   template<typename _BiIter, typename _Alloc, typename _TraitsT,
           bool __dfs_mode>
     void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>::
-    _M_dfs(_Match_mode __match_mode, _StateIdT __i)
+    _M_node(_Match_mode __match_mode, _StateIdT __i)
     {
       if (_M_states._M_visited(__i))
        return;
@@ -545,6 +621,72 @@ namespace __detail
        }
     }
 
+  template<typename _BiIter, typename _Alloc, typename _TraitsT,
+          bool __dfs_mode>
+    void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>::
+    _M_dfs(_Match_mode __match_mode, _StateIdT __start)
+    {
+      _M_frames.emplace_back(_S_fopcode_next, __start);
+
+      while (!_M_frames.empty())
+       {
+         _ExecutorFrame<_BiIter> __frame = std::move(_M_frames.back());
+         _M_frames.pop_back();
+
+         switch (__frame._M_op)
+           {
+           case _S_fopcode_fallback_next:
+             if (_M_has_sol)
+               break;
+             [[__fallthrough__]];
+           case _S_fopcode_next:
+             _M_node(__match_mode, __frame._M_state_id);
+             break;
+
+           case _S_fopcode_fallback_rep_once_more:
+             if (_M_has_sol)
+               break;
+             [[__fallthrough__]];
+           case _S_fopcode_rep_once_more:
+             _M_rep_once_more(__match_mode, __frame._M_state_id);
+             break;
+
+           case _S_fopcode_posix_alternative:
+             _M_frames.emplace_back(_S_fopcode_merge_sol, 0, _M_has_sol);
+             _M_frames.emplace_back(_S_fopcode_next, __frame._M_state_id);
+             _M_has_sol = false;
+             break;
+
+           case _S_fopcode_merge_sol:
+             _M_has_sol |= __frame._M_val;
+             break;
+
+           case _S_fopcode_restore_current:
+             _M_current = __frame._M_pos;
+             break;
+
+           case _S_fopcode_restore_cur_results:
+             if (!__frame._M_end)
+               _M_cur_results[__frame._M_state_id].first = __frame._M_pos;
+             else
+               {
+                 _M_cur_results[__frame._M_state_id].second = __frame._M_pos;
+                 _M_cur_results[__frame._M_state_id].matched = __frame._M_matched;
+               }
+             break;
+
+           case _S_fopcode_restore_rep_count:
+             _M_rep_count[__frame._M_state_id].first = __frame._M_pos;
+             _M_rep_count[__frame._M_state_id].second = __frame._M_count;
+             break;
+
+           case _S_fopcode_decrement_rep_count:
+             _M_rep_count[__frame._M_state_id].second--;
+             break;
+           }
+       }
+    }
+
   // Return whether now is at some word boundary.
   template<typename _BiIter, typename _Alloc, typename _TraitsT,
           bool __dfs_mode>
@@ -569,6 +711,7 @@ namespace __detail
 
       return __left_is_word != __right_is_word;
     }
+_GLIBCXX_END_INLINE_ABI_NAMESPACE(_V2)
 } // namespace __detail
 #pragma GCC diagnostic pop