]> git.ipfire.org Git - thirdparty/gcc.git/commitdiff
re PR libstdc++/61424 (std::regex matches right to left, not leftmost longest)
authorTim Shen <timshen@google.com>
Tue, 1 Jul 2014 02:10:31 +0000 (02:10 +0000)
committerTim Shen <timshen@gcc.gnu.org>
Tue, 1 Jul 2014 02:10:31 +0000 (02:10 +0000)
PR libstdc++/61424
* include/bits/regex.tcc (__regex_algo_impl<>): Use DFS for ECMAScript,
not just regex containing back-references.
* include/bits/regex_compiler.tcc (_Compiler<>::_M_disjunction):
exchange _M_next and _M_alt for alternative operator,
making matching from left to right.
* include/bits/regex_executor.h (_State_info<>::_M_get_sol_pos):
Add position tracking fom DFS.
* include/bits/regex_executor.tcc (_Executor<>::_M_main_dispatch,
_Executor<>::_M_dfs): Likewise.
* include/bits/regex_scanner.h: Remove unused enum entry.
* testsuite/28_regex/algorithms/regex_search/61424.cc: New
testcase from PR.

From-SVN: r212184

libstdc++-v3/ChangeLog
libstdc++-v3/include/bits/regex.tcc
libstdc++-v3/include/bits/regex_compiler.tcc
libstdc++-v3/include/bits/regex_executor.h
libstdc++-v3/include/bits/regex_executor.tcc
libstdc++-v3/include/bits/regex_scanner.h
libstdc++-v3/testsuite/28_regex/algorithms/regex_search/61424.cc [new file with mode: 0644]

index 1d6cf7a5bd8c29ca889f0b97f595e41bf6092b5e..75e4841626cae67eb87395e039fda03622d7030f 100644 (file)
@@ -1,3 +1,19 @@
+2014-07-01  Tim Shen  <timshen@google.com>
+
+       PR libstdc++/61424
+       * include/bits/regex.tcc (__regex_algo_impl<>): Use DFS for ECMAScript,
+       not just regex containing back-references.
+       * include/bits/regex_compiler.tcc (_Compiler<>::_M_disjunction):
+       exchange _M_next and _M_alt for alternative operator,
+       making matching from left to right.
+       * include/bits/regex_executor.h (_State_info<>::_M_get_sol_pos):
+       Add position tracking fom DFS.
+       * include/bits/regex_executor.tcc (_Executor<>::_M_main_dispatch,
+       _Executor<>::_M_dfs): Likewise.
+       * include/bits/regex_scanner.h: Remove unused enum entry.
+       * testsuite/28_regex/algorithms/regex_search/61424.cc: New
+       testcase from PR.
+
 2014-06-30  Jason Merrill  <jason@redhat.com>
 
        * libsupc++/cxxabi.h (class __pbase_type_info): __pointer_catch
index a81f51750dc99fe2745f2e7781d4438b13c02927..3322379a5b81c46d01047c8679bfd833e4480348 100644 (file)
@@ -71,6 +71,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
       // _GLIBCXX_REGEX_USE_THOMPSON_NFA if they need to use this approach.
       bool __ret;
       if (!__re._M_automaton->_M_has_backref
+         && !(__re._M_flags & regex_constants::ECMAScript)
 #ifndef _GLIBCXX_REGEX_USE_THOMPSON_NFA
          && __policy == _RegexExecutorPolicy::_S_alternate
 #endif
index 0df10cc1a8b6d40a49a920cebe528d104d87a305..f15f7dd0f7b3dd22d5a358a0adc856be7aa79ffd 100644 (file)
@@ -103,9 +103,12 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
          auto __end = _M_nfa._M_insert_dummy();
          __alt1._M_append(__end);
          __alt2._M_append(__end);
+         // __alt2 is state._M_next, __alt1 is state._M_alt. The executor
+         // executes _M_alt before _M_next, as well as executing left
+         // alternative before right one.
          _M_stack.push(_StateSeqT(_M_nfa,
-                                  _M_nfa._M_insert_alt(__alt1._M_start,
-                                                       __alt2._M_start, false),
+                                  _M_nfa._M_insert_alt(__alt2._M_start,
+                                                       __alt1._M_start, false),
                                   __end));
        }
     }
index 1991c0007a1aee3f1ea6466f18d206734eb2993e..40d3443d9855f139c12b03e70ce264d41d3685b1 100644 (file)
@@ -173,6 +173,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
          void _M_queue(_StateIdT __i, const _ResultsVec& __res)
          { _M_match_queue.emplace_back(__i, __res); }
 
+         // Dummy implementations for BFS mode.
+         _BiIter* _M_get_sol_pos() { return nullptr; }
+
          // Saves states that need to be considered for the next character.
          vector<pair<_StateIdT, _ResultsVec>>  _M_match_queue;
          // Indicates which states are already visited.
@@ -192,11 +195,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
          bool _M_visited(_StateIdT) const { return false; }
          void _M_queue(_StateIdT, const _ResultsVec&) { }
 
+         _BiIter* _M_get_sol_pos() { return &_M_sol_pos; }
+
          // To record current solution.
          _StateIdT _M_start;
+         _BiIter   _M_sol_pos;
        };
 
-
     public:
       _ResultsVec                                           _M_cur_results;
       _BiIter                                               _M_current;
index aefa8f4713776a600691eabecf96b502151d7f9f..38b8ff270130908f3e33d8d55e2169c1d75a2e1c 100644 (file)
@@ -82,6 +82,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
     _M_main_dispatch(_Match_mode __match_mode, __dfs)
     {
       _M_has_sol = false;
+      *_M_states._M_get_sol_pos() = _BiIter();
       _M_cur_results = _M_results;
       _M_dfs(__match_mode, _M_states._M_start);
       return _M_has_sol;
@@ -338,7 +339,29 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
                  && (_M_flags & regex_constants::match_not_null))
                _M_has_sol = false;
              if (_M_has_sol)
-               _M_results = _M_cur_results;
+               {
+                 if (_M_nfa._M_flags & regex_constants::ECMAScript)
+                   _M_results = _M_cur_results;
+                 else // POSIX
+                   {
+                     _GLIBCXX_DEBUG_ASSERT(_M_states._M_get_sol_pos());
+                     // Here's POSIX's logic: match the longest one. However
+                     // we never know which one (lhs or rhs of "|") is longer
+                     // unless we try both of them and compare the results.
+                     // The member variable _M_sol_pos records the end
+                     // position of the last successful match. It's better
+                     // to be larger, because POSIX regex is always greedy.
+                     // TODO: This could be slow.
+                     if (*_M_states._M_get_sol_pos() == _BiIter()
+                         || std::distance(_M_begin,
+                                          *_M_states._M_get_sol_pos())
+                            < std::distance(_M_begin, _M_current))
+                       {
+                         *_M_states._M_get_sol_pos() = _M_current;
+                         _M_results = _M_cur_results;
+                       }
+                   }
+               }
            }
          else
            {
@@ -354,9 +377,25 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
            }
          break;
        case _S_opcode_alternative:
-         _M_dfs(__match_mode, __state._M_alt);
-         if (!__dfs_mode || !_M_has_sol)
-           _M_dfs(__match_mode, __state._M_next);
+         if (_M_nfa._M_flags & regex_constants::ECMAScript)
+           {
+             // TODO: Let DFS support ECMAScript's alternative operation.
+             _GLIBCXX_DEBUG_ASSERT(!__dfs_mode);
+             _M_dfs(__match_mode, __state._M_alt);
+             // Pick lhs if it matches. Only try rhs if it doesn't.
+             if (!_M_has_sol)
+               _M_dfs(__match_mode, __state._M_next);
+           }
+         else
+           {
+             // Try both and compare the result.
+             // See "case _S_opcode_accept:" handling above.
+             _M_dfs(__match_mode, __state._M_alt);
+             auto __has_sol = _M_has_sol;
+             _M_has_sol = false;
+             _M_dfs(__match_mode, __state._M_next);
+             _M_has_sol |= __has_sol;
+           }
          break;
        default:
          _GLIBCXX_DEBUG_ASSERT(false);
index 6627db97ce0576dcf6f73929117210e8a40a2d08..5552226bdfb167da5f523c2ad1d4e415ac5e3041 100644 (file)
@@ -67,7 +67,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
       _S_token_or,
       _S_token_closure0,
       _S_token_closure1,
-      _S_token_ungreedy,
       _S_token_line_begin,
       _S_token_line_end,
       _S_token_word_bound, // neg if _M_value[0] == 'n'
diff --git a/libstdc++-v3/testsuite/28_regex/algorithms/regex_search/61424.cc b/libstdc++-v3/testsuite/28_regex/algorithms/regex_search/61424.cc
new file mode 100644 (file)
index 0000000..bdccb4a
--- /dev/null
@@ -0,0 +1,52 @@
+// { dg-options "-std=gnu++11" }
+
+// Copyright (C) 2014 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
+// PR libstdc++/61424
+
+#include <regex>
+#include <testsuite_hooks.h>
+#include <testsuite_regex.h>
+
+using namespace std;
+using namespace __gnu_test;
+
+int main()
+{
+  regex_constants::syntax_option_type grammar[] = {
+    regex_constants::ECMAScript, regex_constants::extended,
+    regex_constants::awk, regex_constants::egrep
+  };
+
+  string sol[] = {
+      "tour",
+      "tournament",
+      "tournament",
+      "tournament",
+  };
+  int i = 0;
+  for (auto g : grammar)
+  {
+    regex re("tour|tournament|tourn", g);
+    const char str[] = "tournament";
+    cmatch m;
+    VERIFY(regex_search_debug(str, m, re));
+    VERIFY(sol[i] == m[0]);
+    i++;
+  }
+}