]>
Commit | Line | Data |
---|---|---|
6cb784b6 TS |
1 | // class template regex -*- C++ -*- |
2 | ||
aa118a03 | 3 | // Copyright (C) 2013-2014 Free Software Foundation, Inc. |
6cb784b6 TS |
4 | // |
5 | // This file is part of the GNU ISO C++ Library. This library is free | |
6 | // software; you can redistribute it and/or modify it under the | |
7 | // terms of the GNU General Public License as published by the | |
8 | // Free Software Foundation; either version 3, or (at your option) | |
9 | // any later version. | |
10 | ||
11 | // This library is distributed in the hope that it will be useful, | |
12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | // GNU General Public License for more details. | |
15 | ||
16 | // Under Section 7 of GPL version 3, you are granted additional | |
17 | // permissions described in the GCC Runtime Library Exception, version | |
18 | // 3.1, as published by the Free Software Foundation. | |
19 | ||
20 | // You should have received a copy of the GNU General Public License and | |
21 | // a copy of the GCC Runtime Library Exception along with this program; | |
22 | // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see | |
23 | // <http://www.gnu.org/licenses/>. | |
24 | ||
25 | /** | |
26 | * @file bits/regex_executor.tcc | |
27 | * This is an internal header file, included by other library headers. | |
28 | * Do not attempt to use it directly. @headername{regex} | |
29 | */ | |
30 | ||
31 | namespace std _GLIBCXX_VISIBILITY(default) | |
32 | { | |
33 | namespace __detail | |
34 | { | |
35 | _GLIBCXX_BEGIN_NAMESPACE_VERSION | |
36 | ||
9f0d9611 TS |
37 | template<typename _BiIter, typename _Alloc, typename _TraitsT, |
38 | bool __dfs_mode> | |
39 | bool _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: | |
18971f1f TS |
40 | _M_search() |
41 | { | |
42 | if (_M_flags & regex_constants::match_continuous) | |
43 | return _M_search_from_first(); | |
44 | auto __cur = _M_begin; | |
45 | do | |
46 | { | |
9f0d9611 TS |
47 | _M_current = __cur; |
48 | if (_M_main<false>()) | |
18971f1f TS |
49 | return true; |
50 | } | |
51 | // Continue when __cur == _M_end | |
52 | while (__cur++ != _M_end); | |
53 | return false; | |
54 | } | |
55 | ||
caaf33fa TS |
56 | // This function operates in different modes, DFS mode or BFS mode, indicated |
57 | // by template parameter __dfs_mode. See _M_main for details. | |
58 | // | |
59 | // ------------------------------------------------------------ | |
60 | // | |
61 | // DFS mode: | |
62 | // | |
63 | // It applies a Depth-First-Search (aka backtracking) on given NFA and input | |
64 | // string. | |
65 | // At the very beginning the executor stands in the start state, then it tries | |
66 | // every possible state transition in current state recursively. Some state | |
67 | // transitions consume input string, say, a single-char-matcher or a | |
68 | // back-reference matcher; some don't, like assertion or other anchor nodes. | |
69 | // When the input is exhausted and/or the current state is an accepting state, | |
70 | // the whole executor returns true. | |
71 | // | |
72 | // TODO: This approach is exponentially slow for certain input. | |
73 | // Try to compile the NFA to a DFA. | |
74 | // | |
ee54a3b3 | 75 | // Time complexity: \Omega(match_length), O(2^(_M_nfa.size())) |
caaf33fa TS |
76 | // Space complexity: \theta(match_results.size() + match_length) |
77 | // | |
78 | // ------------------------------------------------------------ | |
79 | // | |
80 | // BFS mode: | |
81 | // | |
82 | // Russ Cox's article (http://swtch.com/~rsc/regexp/regexp1.html) | |
83 | // explained this algorithm clearly. | |
84 | // | |
ee54a3b3 TS |
85 | // It first computes epsilon closure (states that can be achieved without |
86 | // consuming characters) for every state that's still matching, | |
87 | // using the same DFS algorithm, but doesn't re-enter states (find a true in | |
caaf33fa TS |
88 | // _M_visited), nor follows _S_opcode_match. |
89 | // | |
90 | // Then apply DFS using every _S_opcode_match (in _M_match_queue) as the start | |
91 | // state. | |
92 | // | |
93 | // It significantly reduces potential duplicate states, so has a better | |
94 | // upper bound; but it requires more overhead. | |
95 | // | |
ee54a3b3 | 96 | // Time complexity: \Omega(match_length * match_results.size()) |
caaf33fa | 97 | // O(match_length * _M_nfa.size() * match_results.size()) |
ee54a3b3 | 98 | // Space complexity: \Omega(_M_nfa.size() + match_results.size()) |
caaf33fa | 99 | // O(_M_nfa.size() * match_results.size()) |
9f0d9611 TS |
100 | template<typename _BiIter, typename _Alloc, typename _TraitsT, |
101 | bool __dfs_mode> | |
102 | template<bool __match_mode> | |
103 | bool _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: | |
104 | _M_main() | |
105 | { | |
106 | if (__dfs_mode) | |
107 | { | |
108 | _M_has_sol = false; | |
109 | _M_cur_results = _M_results; | |
110 | _M_dfs<__match_mode>(_M_start_state); | |
111 | return _M_has_sol; | |
112 | } | |
113 | else | |
114 | { | |
ddf41e9d | 115 | _M_match_queue->push_back(make_pair(_M_start_state, _M_results)); |
9f0d9611 TS |
116 | bool __ret = false; |
117 | while (1) | |
118 | { | |
119 | _M_has_sol = false; | |
120 | if (_M_match_queue->empty()) | |
121 | break; | |
122 | _M_visited->assign(_M_visited->size(), false); | |
123 | auto _M_old_queue = std::move(*_M_match_queue); | |
ddf41e9d | 124 | for (auto __task : _M_old_queue) |
9f0d9611 | 125 | { |
9f0d9611 TS |
126 | _M_cur_results = __task.second; |
127 | _M_dfs<__match_mode>(__task.first); | |
128 | } | |
129 | if (!__match_mode) | |
130 | __ret |= _M_has_sol; | |
131 | if (_M_current == _M_end) | |
132 | break; | |
133 | ++_M_current; | |
134 | } | |
135 | if (__match_mode) | |
136 | __ret = _M_has_sol; | |
137 | return __ret; | |
138 | } | |
139 | } | |
140 | ||
141 | // Return whether now match the given sub-NFA. | |
142 | template<typename _BiIter, typename _Alloc, typename _TraitsT, | |
143 | bool __dfs_mode> | |
144 | bool _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: | |
68e69ce2 | 145 | _M_lookahead(_State<_TraitsT> __state) |
9f0d9611 TS |
146 | { |
147 | _ResultsVec __what(_M_cur_results.size()); | |
148 | auto __sub = std::unique_ptr<_Executor>(new _Executor(_M_current, | |
149 | _M_end, | |
150 | __what, | |
151 | _M_re, | |
152 | _M_flags)); | |
153 | __sub->_M_start_state = __state._M_alt; | |
154 | if (__sub->_M_search_from_first()) | |
155 | { | |
156 | for (size_t __i = 0; __i < __what.size(); __i++) | |
157 | if (__what[__i].matched) | |
158 | _M_cur_results[__i] = __what[__i]; | |
159 | return true; | |
160 | } | |
161 | return false; | |
162 | } | |
163 | ||
a670a9bb TS |
164 | // __rep_count records how many times (__rep_count.second) |
165 | // this node is visited under certain input iterator | |
166 | // (__rep_count.first). This prevent the executor from entering | |
167 | // infinite loop by refusing to continue when it's already been | |
168 | // visited more than twice. It's `twice` instead of `once` because | |
169 | // we need to spare one more time for potential group capture. | |
170 | template<typename _BiIter, typename _Alloc, typename _TraitsT, | |
171 | bool __dfs_mode> | |
172 | template<bool __match_mode> | |
173 | void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: | |
174 | _M_rep_once_more(_StateIdT __i) | |
175 | { | |
176 | const auto& __state = _M_nfa[__i]; | |
177 | auto& __rep_count = _M_rep_count[__i]; | |
178 | if (__rep_count.second == 0 || __rep_count.first != _M_current) | |
179 | { | |
180 | auto __back = __rep_count; | |
181 | __rep_count.first = _M_current; | |
182 | __rep_count.second = 1; | |
183 | _M_dfs<__match_mode>(__state._M_alt); | |
184 | __rep_count = __back; | |
185 | } | |
186 | else | |
187 | { | |
188 | if (__rep_count.second < 2) | |
189 | { | |
190 | __rep_count.second++; | |
191 | _M_dfs<__match_mode>(__state._M_alt); | |
192 | __rep_count.second--; | |
193 | } | |
194 | } | |
195 | }; | |
196 | ||
9f0d9611 TS |
197 | template<typename _BiIter, typename _Alloc, typename _TraitsT, |
198 | bool __dfs_mode> | |
199 | template<bool __match_mode> | |
200 | void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: | |
6cb784b6 TS |
201 | _M_dfs(_StateIdT __i) |
202 | { | |
9f0d9611 TS |
203 | if (!__dfs_mode) |
204 | { | |
205 | if ((*_M_visited)[__i]) | |
206 | return; | |
207 | (*_M_visited)[__i] = true; | |
208 | } | |
209 | ||
210 | const auto& __state = _M_nfa[__i]; | |
caaf33fa TS |
211 | // Every change on _M_cur_results and _M_current will be rolled back after |
212 | // finishing the recursion step. | |
6cb784b6 | 213 | switch (__state._M_opcode) |
e280b6ff | 214 | { |
caaf33fa TS |
215 | // _M_alt branch is "match once more", while _M_next is "get me out |
216 | // of this quantifier". Executing _M_next first or _M_alt first don't | |
217 | // mean the same thing, and we need to choose the correct order under | |
218 | // given greedy mode. | |
a670a9bb TS |
219 | case _S_opcode_repeat: |
220 | { | |
221 | // Greedy. | |
222 | if (!__state._M_neg) | |
223 | { | |
224 | _M_rep_once_more<__match_mode>(__i); | |
225 | // If it's DFS executor and already accepted, we're done. | |
226 | if (!__dfs_mode || !_M_has_sol) | |
9f0d9611 | 227 | _M_dfs<__match_mode>(__state._M_next); |
a670a9bb TS |
228 | } |
229 | else // Non-greedy mode | |
230 | { | |
231 | if (__dfs_mode) | |
232 | { | |
233 | // vice-versa. | |
234 | _M_dfs<__match_mode>(__state._M_next); | |
235 | if (!_M_has_sol) | |
236 | _M_rep_once_more<__match_mode>(__i); | |
237 | } | |
238 | else | |
239 | { | |
240 | // DON'T attempt anything, because there's already another | |
241 | // state with higher priority accepted. This state cannot be | |
242 | // better by attempting its next node. | |
243 | if (!_M_has_sol) | |
244 | { | |
245 | _M_dfs<__match_mode>(__state._M_next); | |
246 | // DON'T attempt anything if it's already accepted. An | |
247 | // accepted state *must* be better than a solution that | |
248 | // matches a non-greedy quantifier one more time. | |
249 | if (!_M_has_sol) | |
250 | _M_rep_once_more<__match_mode>(__i); | |
251 | } | |
252 | } | |
253 | } | |
9f0d9611 | 254 | } |
e280b6ff TS |
255 | break; |
256 | case _S_opcode_subexpr_begin: | |
a670a9bb TS |
257 | { |
258 | auto& __res = _M_cur_results[__state._M_subexpr]; | |
259 | auto __back = __res.first; | |
260 | __res.first = _M_current; | |
261 | _M_dfs<__match_mode>(__state._M_next); | |
262 | __res.first = __back; | |
263 | } | |
e280b6ff TS |
264 | break; |
265 | case _S_opcode_subexpr_end: | |
a670a9bb TS |
266 | { |
267 | auto& __res = _M_cur_results[__state._M_subexpr]; | |
268 | auto __back = __res; | |
269 | __res.second = _M_current; | |
270 | __res.matched = true; | |
9f0d9611 | 271 | _M_dfs<__match_mode>(__state._M_next); |
a670a9bb TS |
272 | __res = __back; |
273 | } | |
e280b6ff | 274 | break; |
7b86458e | 275 | case _S_opcode_line_begin_assertion: |
9f0d9611 TS |
276 | if (_M_at_begin()) |
277 | _M_dfs<__match_mode>(__state._M_next); | |
7b86458e TS |
278 | break; |
279 | case _S_opcode_line_end_assertion: | |
9f0d9611 TS |
280 | if (_M_at_end()) |
281 | _M_dfs<__match_mode>(__state._M_next); | |
7b86458e | 282 | break; |
f054ff5b JW |
283 | case _S_opcode_word_boundary: |
284 | if (_M_word_boundary(__state) == !__state._M_neg) | |
9f0d9611 | 285 | _M_dfs<__match_mode>(__state._M_next); |
7b86458e | 286 | break; |
caaf33fa TS |
287 | // Here __state._M_alt offers a single start node for a sub-NFA. |
288 | // We recursively invoke our algorithm to match the sub-NFA. | |
7b86458e | 289 | case _S_opcode_subexpr_lookahead: |
9f0d9611 TS |
290 | if (_M_lookahead(__state) == !__state._M_neg) |
291 | _M_dfs<__match_mode>(__state._M_next); | |
7b86458e | 292 | break; |
e280b6ff | 293 | case _S_opcode_match: |
9f0d9611 | 294 | if (__dfs_mode) |
e280b6ff | 295 | { |
9f0d9611 TS |
296 | if (_M_current != _M_end && __state._M_matches(*_M_current)) |
297 | { | |
298 | ++_M_current; | |
299 | _M_dfs<__match_mode>(__state._M_next); | |
300 | --_M_current; | |
301 | } | |
e280b6ff | 302 | } |
9f0d9611 TS |
303 | else |
304 | if (__state._M_matches(*_M_current)) | |
ddf41e9d TS |
305 | _M_match_queue->push_back(make_pair(__state._M_next, |
306 | _M_cur_results)); | |
e280b6ff | 307 | break; |
b21abcee | 308 | // First fetch the matched result from _M_cur_results as __submatch; |
e280b6ff | 309 | // then compare it with |
caaf33fa TS |
310 | // (_M_current, _M_current + (__submatch.second - __submatch.first)). |
311 | // If matched, keep going; else just return and try another state. | |
e280b6ff TS |
312 | case _S_opcode_backref: |
313 | { | |
9f0d9611 | 314 | _GLIBCXX_DEBUG_ASSERT(__dfs_mode); |
b21abcee | 315 | auto& __submatch = _M_cur_results[__state._M_backref_index]; |
e280b6ff TS |
316 | if (!__submatch.matched) |
317 | break; | |
9f0d9611 | 318 | auto __last = _M_current; |
e280b6ff | 319 | for (auto __tmp = __submatch.first; |
9f0d9611 | 320 | __last != _M_end && __tmp != __submatch.second; |
e280b6ff TS |
321 | ++__tmp) |
322 | ++__last; | |
9f0d9611 | 323 | if (_M_re._M_traits.transform(__submatch.first, |
b21abcee | 324 | __submatch.second) |
9f0d9611 | 325 | == _M_re._M_traits.transform(_M_current, __last)) |
ab1c993b | 326 | { |
9f0d9611 | 327 | if (__last != _M_current) |
ab1c993b | 328 | { |
9f0d9611 TS |
329 | auto __backup = _M_current; |
330 | _M_current = __last; | |
331 | _M_dfs<__match_mode>(__state._M_next); | |
332 | _M_current = __backup; | |
ab1c993b TS |
333 | } |
334 | else | |
9f0d9611 | 335 | _M_dfs<__match_mode>(__state._M_next); |
ab1c993b | 336 | } |
e280b6ff TS |
337 | } |
338 | break; | |
339 | case _S_opcode_accept: | |
9f0d9611 | 340 | if (__dfs_mode) |
e280b6ff | 341 | { |
9f0d9611 TS |
342 | _GLIBCXX_DEBUG_ASSERT(!_M_has_sol); |
343 | if (__match_mode) | |
344 | _M_has_sol = _M_current == _M_end; | |
345 | else | |
346 | _M_has_sol = true; | |
347 | if (_M_current == _M_begin | |
348 | && (_M_flags & regex_constants::match_not_null)) | |
349 | _M_has_sol = false; | |
350 | if (_M_has_sol) | |
351 | _M_results = _M_cur_results; | |
e280b6ff | 352 | } |
9f0d9611 | 353 | else |
18971f1f | 354 | { |
9f0d9611 TS |
355 | if (_M_current == _M_begin |
356 | && (_M_flags & regex_constants::match_not_null)) | |
357 | break; | |
358 | if (!__match_mode || _M_current == _M_end) | |
359 | if (!_M_has_sol) | |
360 | { | |
361 | _M_has_sol = true; | |
362 | _M_results = _M_cur_results; | |
363 | } | |
18971f1f | 364 | } |
9f0d9611 | 365 | break; |
a670a9bb TS |
366 | case _S_opcode_alternative: |
367 | _M_dfs<__match_mode>(__state._M_alt); | |
368 | if (!__dfs_mode || !_M_has_sol) | |
369 | _M_dfs<__match_mode>(__state._M_next); | |
370 | break; | |
9f0d9611 TS |
371 | default: |
372 | _GLIBCXX_DEBUG_ASSERT(false); | |
e280b6ff | 373 | } |
6cb784b6 TS |
374 | } |
375 | ||
f054ff5b | 376 | // Return whether now is at some word boundary. |
9f0d9611 TS |
377 | template<typename _BiIter, typename _Alloc, typename _TraitsT, |
378 | bool __dfs_mode> | |
379 | bool _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: | |
68e69ce2 | 380 | _M_word_boundary(_State<_TraitsT> __state) const |
b21abcee TS |
381 | { |
382 | // By definition. | |
383 | bool __ans = false; | |
384 | auto __pre = _M_current; | |
385 | --__pre; | |
386 | if (!(_M_at_begin() && _M_at_end())) | |
ab1c993b TS |
387 | { |
388 | if (_M_at_begin()) | |
389 | __ans = _M_is_word(*_M_current) | |
390 | && !(_M_flags & regex_constants::match_not_bow); | |
391 | else if (_M_at_end()) | |
392 | __ans = _M_is_word(*__pre) | |
393 | && !(_M_flags & regex_constants::match_not_eow); | |
394 | else | |
395 | __ans = _M_is_word(*_M_current) | |
396 | != _M_is_word(*__pre); | |
397 | } | |
b21abcee TS |
398 | return __ans; |
399 | } | |
400 | ||
6cb784b6 TS |
401 | _GLIBCXX_END_NAMESPACE_VERSION |
402 | } // namespace __detail | |
403 | } // namespace |