]>
Commit | Line | Data |
---|---|---|
6cb784b6 TS |
1 | // class template regex -*- C++ -*- |
2 | ||
3 | // Copyright (C) 2013 Free Software Foundation, Inc. | |
4 | // | |
5 | // This file is part of the GNU ISO C++ Library. This library is free | |
6 | // software; you can redistribute it and/or modify it under the | |
7 | // terms of the GNU General Public License as published by the | |
8 | // Free Software Foundation; either version 3, or (at your option) | |
9 | // any later version. | |
10 | ||
11 | // This library is distributed in the hope that it will be useful, | |
12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | // GNU General Public License for more details. | |
15 | ||
16 | // Under Section 7 of GPL version 3, you are granted additional | |
17 | // permissions described in the GCC Runtime Library Exception, version | |
18 | // 3.1, as published by the Free Software Foundation. | |
19 | ||
20 | // You should have received a copy of the GNU General Public License and | |
21 | // a copy of the GCC Runtime Library Exception along with this program; | |
22 | // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see | |
23 | // <http://www.gnu.org/licenses/>. | |
24 | ||
25 | /** | |
26 | * @file bits/regex_compiler.tcc | |
27 | * This is an internal header file, included by other library headers. | |
28 | * Do not attempt to use it directly. @headername{regex} | |
29 | */ | |
30 | ||
b21abcee | 31 | // FIXME make comments doxygen format. |
7c812a2a TS |
32 | |
33 | // This compiler refers to "Regular Expression Matching Can Be Simple And Fast" | |
34 | // (http://swtch.com/~rsc/regexp/regexp1.html"), | |
35 | // but doesn't strictly follow it. | |
36 | // | |
37 | // When compiling, states are *chained* instead of tree- or graph-constructed. | |
38 | // It's more like structured programs: there's if statement and loop statement. | |
39 | // | |
40 | // For alternative structure(say "a|b"), aka "if statement", two branchs should | |
41 | // be constructed. However, these two shall merge to an "end_tag" at the end of | |
42 | // this operator: | |
43 | // | |
44 | // branch1 | |
45 | // / \ | |
46 | // => begin_tag end_tag => | |
47 | // \ / | |
48 | // branch2 | |
49 | // | |
50 | // This is the difference between this implementation and that in Russ's | |
51 | // article. | |
52 | // | |
53 | // That's why we introduced dummy node here ------ "end_tag" is a dummy node. | |
54 | // All dummy node will be eliminated at the end of compiling process. | |
55 | ||
6cb784b6 TS |
56 | namespace std _GLIBCXX_VISIBILITY(default) |
57 | { | |
58 | namespace __detail | |
59 | { | |
60 | _GLIBCXX_BEGIN_NAMESPACE_VERSION | |
61 | ||
33fbbb76 TS |
62 | template<typename _FwdIter, typename _CharT, typename _TraitsT> |
63 | _Compiler<_FwdIter, _CharT, _TraitsT>:: | |
64 | _Compiler(_FwdIter __b, _FwdIter __e, | |
e280b6ff | 65 | const _TraitsT& __traits, _FlagT __flags) |
6cb784b6 | 66 | : _M_traits(__traits), _M_scanner(__b, __e, __flags, _M_traits.getloc()), |
7c812a2a TS |
67 | _M_ctype(std::use_facet<std::ctype<_CharT>>(_M_traits.getloc())), |
68 | _M_nfa(__flags), _M_flags(__flags) | |
6cb784b6 | 69 | { |
7c812a2a TS |
70 | _StateSeqT __r(_M_nfa, _M_nfa._M_start()); |
71 | __r._M_append(_M_nfa._M_insert_subexpr_begin()); | |
72 | this->_M_disjunction(); | |
73 | if (!_M_match_token(_ScannerT::_S_token_eof)) | |
74 | __throw_regex_error(regex_constants::error_paren); | |
75 | __r._M_append(_M_pop()); | |
76 | _GLIBCXX_DEBUG_ASSERT(_M_stack.empty()); | |
77 | __r._M_append(_M_nfa._M_insert_subexpr_end()); | |
78 | __r._M_append(_M_nfa._M_insert_accept()); | |
79 | _M_nfa._M_eliminate_dummy(); | |
6cb784b6 TS |
80 | } |
81 | ||
33fbbb76 | 82 | template<typename _FwdIter, typename _CharT, typename _TraitsT> |
6cb784b6 | 83 | void |
33fbbb76 | 84 | _Compiler<_FwdIter, _CharT, _TraitsT>:: |
6cb784b6 TS |
85 | _M_disjunction() |
86 | { | |
87 | this->_M_alternative(); | |
7c812a2a TS |
88 | // TODO empty alternative like, um, "(|asdf)" |
89 | while (_M_match_token(_ScannerT::_S_token_or)) | |
6cb784b6 | 90 | { |
7c812a2a TS |
91 | _StateSeqT __alt1 = _M_pop(); |
92 | this->_M_alternative(); | |
93 | _StateSeqT __alt2 = _M_pop(); | |
94 | auto __end = _M_nfa._M_insert_dummy(); | |
95 | __alt1._M_append(__end); | |
96 | __alt2._M_append(__end); | |
97 | _M_stack.push(_StateSeqT(_M_nfa, | |
98 | _M_nfa._M_insert_alt(__alt1._M_start, | |
7b86458e | 99 | __alt2._M_start, false), |
7c812a2a | 100 | __end)); |
6cb784b6 TS |
101 | } |
102 | } | |
103 | ||
33fbbb76 | 104 | template<typename _FwdIter, typename _CharT, typename _TraitsT> |
6cb784b6 | 105 | void |
33fbbb76 | 106 | _Compiler<_FwdIter, _CharT, _TraitsT>:: |
6cb784b6 TS |
107 | _M_alternative() |
108 | { | |
109 | if (this->_M_term()) | |
110 | { | |
7c812a2a | 111 | _StateSeqT __re = _M_pop(); |
6cb784b6 | 112 | this->_M_alternative(); |
7c812a2a | 113 | __re._M_append(_M_pop()); |
6cb784b6 TS |
114 | _M_stack.push(__re); |
115 | } | |
7c812a2a TS |
116 | else |
117 | _M_stack.push(_StateSeqT(_M_nfa, _M_nfa._M_insert_dummy())); | |
6cb784b6 TS |
118 | } |
119 | ||
33fbbb76 | 120 | template<typename _FwdIter, typename _CharT, typename _TraitsT> |
6cb784b6 | 121 | bool |
33fbbb76 | 122 | _Compiler<_FwdIter, _CharT, _TraitsT>:: |
6cb784b6 TS |
123 | _M_term() |
124 | { | |
125 | if (this->_M_assertion()) | |
126 | return true; | |
127 | if (this->_M_atom()) | |
128 | { | |
129 | this->_M_quantifier(); | |
130 | return true; | |
131 | } | |
132 | return false; | |
133 | } | |
134 | ||
33fbbb76 | 135 | template<typename _FwdIter, typename _CharT, typename _TraitsT> |
6cb784b6 | 136 | bool |
33fbbb76 | 137 | _Compiler<_FwdIter, _CharT, _TraitsT>:: |
6cb784b6 TS |
138 | _M_assertion() |
139 | { | |
7c812a2a | 140 | if (_M_match_token(_ScannerT::_S_token_line_begin)) |
7b86458e TS |
141 | _M_stack.push(_StateSeqT(_M_nfa, _M_nfa. |
142 | _M_insert_line_begin())); | |
7c812a2a | 143 | else if (_M_match_token(_ScannerT::_S_token_line_end)) |
7b86458e TS |
144 | _M_stack.push(_StateSeqT(_M_nfa, _M_nfa. |
145 | _M_insert_line_end())); | |
7c812a2a | 146 | else if (_M_match_token(_ScannerT::_S_token_word_bound)) |
7b86458e TS |
147 | // _M_value[0] == 'n' means it's negtive, say "not word boundary". |
148 | _M_stack.push(_StateSeqT(_M_nfa, _M_nfa. | |
149 | _M_insert_word_bound(_M_value[0] == 'n'))); | |
7c812a2a | 150 | else if (_M_match_token(_ScannerT::_S_token_subexpr_lookahead_begin)) |
7b86458e TS |
151 | { |
152 | auto __neg = _M_value[0] == 'n'; | |
153 | this->_M_disjunction(); | |
154 | if (!_M_match_token(_ScannerT::_S_token_subexpr_end)) | |
155 | __throw_regex_error(regex_constants::error_paren); | |
156 | auto __tmp = _M_pop(); | |
157 | __tmp._M_append(_M_nfa._M_insert_accept()); | |
158 | _M_stack.push( | |
159 | _StateSeqT( | |
160 | _M_nfa, | |
161 | _M_nfa._M_insert_lookahead(__tmp._M_start, __neg))); | |
162 | } | |
7c812a2a TS |
163 | else |
164 | return false; | |
165 | return true; | |
6cb784b6 TS |
166 | } |
167 | ||
33fbbb76 | 168 | template<typename _FwdIter, typename _CharT, typename _TraitsT> |
6cb784b6 | 169 | void |
33fbbb76 | 170 | _Compiler<_FwdIter, _CharT, _TraitsT>:: |
6cb784b6 TS |
171 | _M_quantifier() |
172 | { | |
7b86458e TS |
173 | bool __neg = regex_constants::ECMAScript; |
174 | auto __init = [this, &__neg]() | |
6cb784b6 TS |
175 | { |
176 | if (_M_stack.empty()) | |
177 | __throw_regex_error(regex_constants::error_badrepeat); | |
7b86458e TS |
178 | __neg = __neg && _M_match_token(_ScannerT::_S_token_opt); |
179 | }; | |
180 | if (_M_match_token(_ScannerT::_S_token_closure0)) | |
181 | { | |
182 | __init(); | |
7c812a2a TS |
183 | auto __e = _M_pop(); |
184 | _StateSeqT __r(_M_nfa, _M_nfa._M_insert_alt(_S_invalid_state_id, | |
7b86458e | 185 | __e._M_start, __neg)); |
7c812a2a | 186 | __e._M_append(__r); |
6cb784b6 | 187 | _M_stack.push(__r); |
6cb784b6 | 188 | } |
7c812a2a | 189 | else if (_M_match_token(_ScannerT::_S_token_closure1)) |
6cb784b6 | 190 | { |
7b86458e | 191 | __init(); |
7c812a2a | 192 | auto __e = _M_pop(); |
7b86458e TS |
193 | __e._M_append(_M_nfa._M_insert_alt(_S_invalid_state_id, __e._M_start, |
194 | __neg)); | |
7c812a2a | 195 | _M_stack.push(__e); |
6cb784b6 | 196 | } |
7c812a2a | 197 | else if (_M_match_token(_ScannerT::_S_token_opt)) |
6cb784b6 | 198 | { |
7b86458e | 199 | __init(); |
7c812a2a TS |
200 | auto __e = _M_pop(); |
201 | auto __end = _M_nfa._M_insert_dummy(); | |
202 | _StateSeqT __r(_M_nfa, _M_nfa._M_insert_alt(_S_invalid_state_id, | |
7b86458e | 203 | __e._M_start, __neg)); |
7c812a2a TS |
204 | __e._M_append(__end); |
205 | __r._M_append(__end); | |
6cb784b6 | 206 | _M_stack.push(__r); |
6cb784b6 | 207 | } |
7c812a2a | 208 | else if (_M_match_token(_ScannerT::_S_token_interval_begin)) |
6cb784b6 | 209 | { |
7b86458e | 210 | __init(); |
6cb784b6 TS |
211 | if (!_M_match_token(_ScannerT::_S_token_dup_count)) |
212 | __throw_regex_error(regex_constants::error_badbrace); | |
7c812a2a TS |
213 | _StateSeqT __r(_M_pop()); |
214 | _StateSeqT __e(_M_nfa, _M_nfa._M_insert_dummy()); | |
6cb784b6 | 215 | int __min_rep = _M_cur_int_value(10); |
7c812a2a TS |
216 | // {3 |
217 | for (int __i = 0; __i < __min_rep; ++__i) | |
218 | __e._M_append(__r._M_clone()); | |
6cb784b6 | 219 | if (_M_match_token(_ScannerT::_S_token_comma)) |
7c812a2a | 220 | if (_M_match_token(_ScannerT::_S_token_dup_count)) // {3,7} |
6cb784b6 | 221 | { |
7b86458e TS |
222 | int __n = _M_cur_int_value(10) - __min_rep; |
223 | if (__n < 0) | |
224 | __throw_regex_error(regex_constants::error_badbrace); | |
225 | auto __end = _M_nfa._M_insert_dummy(); | |
b21abcee TS |
226 | // _M_alt is the "match more" branch, and _M_next is the |
227 | // "match less" one. Switch _M_alt and _M_next of all created | |
228 | // nodes. This is a hacking but IMO works well. | |
229 | std::stack<_StateIdT> __stack; | |
7b86458e TS |
230 | for (int __i = 0; __i < __n; ++__i) |
231 | { | |
7c812a2a | 232 | auto __tmp = __r._M_clone(); |
b21abcee TS |
233 | auto __alt = _M_nfa._M_insert_alt(__tmp._M_start, |
234 | __end, __neg); | |
235 | __stack.push(__alt); | |
236 | __e._M_append(_StateSeqT(_M_nfa, __alt, __tmp._M_end)); | |
7b86458e | 237 | } |
7c812a2a | 238 | __e._M_append(__end); |
b21abcee TS |
239 | while (!__stack.empty()) |
240 | { | |
241 | auto& __tmp = _M_nfa[__stack.top()]; | |
242 | __stack.pop(); | |
243 | swap(__tmp._M_next, __tmp._M_alt); | |
244 | } | |
6cb784b6 | 245 | } |
7c812a2a | 246 | else // {3,} |
6cb784b6 | 247 | { |
7c812a2a | 248 | auto __tmp = __r._M_clone(); |
7b86458e TS |
249 | _StateSeqT __s(_M_nfa, |
250 | _M_nfa._M_insert_alt(_S_invalid_state_id, | |
251 | __tmp._M_start, __neg)); | |
7c812a2a TS |
252 | __tmp._M_append(__s); |
253 | __e._M_append(__s); | |
6cb784b6 TS |
254 | } |
255 | if (!_M_match_token(_ScannerT::_S_token_interval_end)) | |
256 | __throw_regex_error(regex_constants::error_brace); | |
7c812a2a | 257 | _M_stack.push(__e); |
6cb784b6 TS |
258 | } |
259 | } | |
260 | ||
33fbbb76 | 261 | template<typename _FwdIter, typename _CharT, typename _TraitsT> |
6cb784b6 | 262 | bool |
33fbbb76 | 263 | _Compiler<_FwdIter, _CharT, _TraitsT>:: |
6cb784b6 TS |
264 | _M_atom() |
265 | { | |
266 | if (_M_match_token(_ScannerT::_S_token_anychar)) | |
7c812a2a TS |
267 | _M_stack.push(_StateSeqT(_M_nfa, |
268 | _M_nfa._M_insert_matcher | |
269 | (_AnyMatcher<_CharT, _TraitsT>(_M_traits)))); | |
270 | else if (_M_try_char()) | |
271 | _M_stack.push(_StateSeqT(_M_nfa, | |
272 | _M_nfa._M_insert_matcher | |
273 | (_CharMatcher<_CharT, _TraitsT>(_M_value[0], | |
274 | _M_traits, | |
275 | _M_flags)))); | |
276 | else if (_M_match_token(_ScannerT::_S_token_backref)) | |
277 | _M_stack.push(_StateSeqT(_M_nfa, _M_nfa. | |
278 | _M_insert_backref(_M_cur_int_value(10)))); | |
279 | else if (_M_match_token(_ScannerT::_S_token_quoted_class)) | |
6cb784b6 | 280 | { |
7c812a2a TS |
281 | _GLIBCXX_DEBUG_ASSERT(_M_value.size() == 1); |
282 | _BMatcherT __matcher(_M_ctype.is(_CtypeT::upper, _M_value[0]), | |
283 | _M_traits, _M_flags); | |
284 | __matcher._M_add_character_class(_M_value); | |
285 | _M_stack.push(_StateSeqT(_M_nfa, | |
286 | _M_nfa._M_insert_matcher(__matcher))); | |
6cb784b6 | 287 | } |
7c812a2a | 288 | else if (_M_match_token(_ScannerT::_S_token_subexpr_no_group_begin)) |
6cb784b6 | 289 | { |
7c812a2a TS |
290 | _StateSeqT __r(_M_nfa, _M_nfa._M_insert_dummy()); |
291 | this->_M_disjunction(); | |
292 | if (!_M_match_token(_ScannerT::_S_token_subexpr_end)) | |
293 | __throw_regex_error(regex_constants::error_paren); | |
294 | __r._M_append(_M_pop()); | |
295 | _M_stack.push(__r); | |
6cb784b6 | 296 | } |
7c812a2a | 297 | else if (_M_match_token(_ScannerT::_S_token_subexpr_begin)) |
6cb784b6 | 298 | { |
7c812a2a TS |
299 | int __mark = _M_nfa._M_sub_count(); |
300 | _StateSeqT __r(_M_nfa, _M_nfa._M_insert_subexpr_begin()); | |
6cb784b6 TS |
301 | this->_M_disjunction(); |
302 | if (!_M_match_token(_ScannerT::_S_token_subexpr_end)) | |
303 | __throw_regex_error(regex_constants::error_paren); | |
7c812a2a TS |
304 | __r._M_append(_M_pop()); |
305 | __r._M_append(_M_nfa._M_insert_subexpr_end()); | |
6cb784b6 | 306 | _M_stack.push(__r); |
6cb784b6 | 307 | } |
7c812a2a TS |
308 | else if (!_M_bracket_expression()) |
309 | return false; | |
310 | return true; | |
6cb784b6 TS |
311 | } |
312 | ||
33fbbb76 | 313 | template<typename _FwdIter, typename _CharT, typename _TraitsT> |
6cb784b6 | 314 | bool |
33fbbb76 | 315 | _Compiler<_FwdIter, _CharT, _TraitsT>:: |
6cb784b6 TS |
316 | _M_bracket_expression() |
317 | { | |
33fbbb76 TS |
318 | bool __neg = |
319 | _M_match_token(_ScannerT::_S_token_bracket_neg_begin); | |
320 | if (!(__neg || _M_match_token(_ScannerT::_S_token_bracket_begin))) | |
e280b6ff | 321 | return false; |
33fbbb76 | 322 | _BMatcherT __matcher(__neg, _M_traits, _M_flags); |
7c812a2a TS |
323 | while (!_M_match_token(_ScannerT::_S_token_bracket_end)) |
324 | _M_expression_term(__matcher); | |
325 | _M_stack.push(_StateSeqT(_M_nfa, _M_nfa._M_insert_matcher(__matcher))); | |
6cb784b6 TS |
326 | return true; |
327 | } | |
328 | ||
33fbbb76 | 329 | template<typename _FwdIter, typename _CharT, typename _TraitsT> |
6cb784b6 | 330 | void |
33fbbb76 | 331 | _Compiler<_FwdIter, _CharT, _TraitsT>:: |
6cb784b6 TS |
332 | _M_expression_term(_BMatcherT& __matcher) |
333 | { | |
334 | if (_M_match_token(_ScannerT::_S_token_collsymbol)) | |
7c812a2a TS |
335 | __matcher._M_add_collating_element(_M_value); |
336 | else if (_M_match_token(_ScannerT::_S_token_equiv_class_name)) | |
337 | __matcher._M_add_equivalence_class(_M_value); | |
338 | else if (_M_match_token(_ScannerT::_S_token_char_class_name)) | |
339 | __matcher._M_add_character_class(_M_value); | |
340 | else if (_M_try_char()) // [a | |
e280b6ff | 341 | { |
33fbbb76 TS |
342 | auto __ch = _M_value[0]; |
343 | if (_M_try_char()) | |
e280b6ff | 344 | { |
7c812a2a | 345 | if (_M_value[0] == '-') // [a- |
e280b6ff | 346 | { |
33fbbb76 TS |
347 | if (_M_try_char()) // [a-z] |
348 | { | |
349 | __matcher._M_make_range(__ch, _M_value[0]); | |
350 | return; | |
351 | } | |
352 | // If the dash is the last character in the bracket | |
353 | // expression, it is not special. | |
354 | if (_M_scanner._M_get_token() | |
355 | != _ScannerT::_S_token_bracket_end) | |
e280b6ff | 356 | __throw_regex_error(regex_constants::error_range); |
e280b6ff | 357 | } |
33fbbb76 | 358 | __matcher._M_add_char(_M_value[0]); |
e280b6ff | 359 | } |
33fbbb76 | 360 | __matcher._M_add_char(__ch); |
e280b6ff | 361 | } |
7c812a2a TS |
362 | else |
363 | __throw_regex_error(regex_constants::error_brack); | |
6cb784b6 TS |
364 | } |
365 | ||
33fbbb76 TS |
366 | template<typename _FwdIter, typename _CharT, typename _TraitsT> |
367 | bool | |
368 | _Compiler<_FwdIter, _CharT, _TraitsT>:: | |
369 | _M_try_char() | |
370 | { | |
371 | bool __is_char = false; | |
372 | if (_M_match_token(_ScannerT::_S_token_oct_num)) | |
373 | { | |
374 | __is_char = true; | |
375 | _M_value.assign(1, _M_cur_int_value(8)); | |
376 | } | |
377 | else if (_M_match_token(_ScannerT::_S_token_hex_num)) | |
378 | { | |
379 | __is_char = true; | |
380 | _M_value.assign(1, _M_cur_int_value(16)); | |
381 | } | |
382 | else if (_M_match_token(_ScannerT::_S_token_ord_char)) | |
383 | __is_char = true; | |
384 | return __is_char; | |
385 | } | |
386 | ||
387 | template<typename _FwdIter, typename _CharT, typename _TraitsT> | |
7c812a2a TS |
388 | bool |
389 | _Compiler<_FwdIter, _CharT, _TraitsT>:: | |
390 | _M_match_token(_TokenT token) | |
391 | { | |
392 | if (token == _M_scanner._M_get_token()) | |
393 | { | |
394 | _M_value = _M_scanner._M_get_value(); | |
395 | _M_scanner._M_advance(); | |
396 | return true; | |
397 | } | |
398 | return false; | |
399 | } | |
400 | ||
401 | template<typename _FwdIter, typename _CharT, typename _TraitsT> | |
6cb784b6 | 402 | int |
33fbbb76 | 403 | _Compiler<_FwdIter, _CharT, _TraitsT>:: |
6cb784b6 TS |
404 | _M_cur_int_value(int __radix) |
405 | { | |
406 | int __v = 0; | |
407 | for (typename _StringT::size_type __i = 0; | |
33fbbb76 TS |
408 | __i < _M_value.length(); ++__i) |
409 | __v =__v * __radix + _M_traits.value(_M_value[__i], __radix); | |
6cb784b6 TS |
410 | return __v; |
411 | } | |
412 | ||
413 | template<typename _CharT, typename _TraitsT> | |
414 | bool _BracketMatcher<_CharT, _TraitsT>:: | |
415 | operator()(_CharT __ch) const | |
416 | { | |
6cb784b6 | 417 | bool __ret = false; |
33fbbb76 | 418 | if (_M_traits.isctype(__ch, _M_class_set)) |
e280b6ff | 419 | __ret = true; |
e3509691 TS |
420 | else if (_M_char_set.count(_M_translate(__ch))) |
421 | __ret = true; | |
6cb784b6 | 422 | else |
e280b6ff | 423 | { |
e3509691 TS |
424 | _StringT __s = _M_get_str(_M_flags & regex_constants::collate |
425 | ? _M_translate(__ch) : __ch); | |
426 | for (auto& __it : _M_range_set) | |
427 | if (__it.first <= __s && __s <= __it.second) | |
e280b6ff TS |
428 | { |
429 | __ret = true; | |
430 | break; | |
431 | } | |
432 | } | |
6cb784b6 | 433 | if (_M_is_non_matching) |
33fbbb76 TS |
434 | return !__ret; |
435 | else | |
436 | return __ret; | |
6cb784b6 TS |
437 | } |
438 | ||
439 | _GLIBCXX_END_NAMESPACE_VERSION | |
440 | } // namespace __detail | |
441 | } // namespace |