]>
Commit | Line | Data |
---|---|---|
6adffd3f | 1 | // class template regex -*- C++ -*- |
2 | ||
3 | // Copyright (C) 2010 Free Software Foundation, Inc. | |
4 | // | |
5 | // This file is part of the GNU ISO C++ Library. This library is free | |
6 | // software; you can redistribute it and/or modify it under the | |
7 | // terms of the GNU General Public License as published by the | |
8 | // Free Software Foundation; either version 3, or (at your option) | |
9 | // any later version. | |
10 | ||
11 | // This library is distributed in the hope that it will be useful, | |
12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | // GNU General Public License for more details. | |
15 | ||
16 | // Under Section 7 of GPL version 3, you are granted additional | |
17 | // permissions described in the GCC Runtime Library Exception, version | |
18 | // 3.1, as published by the Free Software Foundation. | |
19 | ||
20 | // You should have received a copy of the GNU General Public License and | |
21 | // a copy of the GCC Runtime Library Exception along with this program; | |
22 | // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see | |
23 | // <http://www.gnu.org/licenses/>. | |
24 | ||
25 | /** | |
26 | * @file bits/regex_parser.h | |
27 | * This is an internal header file, included by other library headers. | |
28 | * You should not attempt to use it directly. | |
29 | */ | |
30 | ||
31 | namespace std | |
32 | { | |
33 | namespace __regex | |
34 | { | |
35 | struct _Scanner_base | |
36 | { | |
37 | typedef unsigned int _StateT; // FIXME: replace these constanst with constexpr | |
38 | ||
39 | static const _StateT _S_state_at_start = 1 << 0; | |
40 | static const _StateT _S_state_in_brace = 1 << 2; | |
41 | static const _StateT _S_state_in_bracket = 1 << 3; | |
42 | }; | |
43 | ||
44 | // | |
45 | // @brief Scans an input range for regex tokens. | |
46 | // | |
47 | // The %_Scanner class interprets the regular expression pattern in the input | |
48 | // range passed to its constructor as a sequence of parse tokens passed to | |
49 | // the regular expression compiler. The sequence of tokens provided depends | |
50 | // on the flag settings passed to the constructor: different regular | |
51 | // expression gramars will interpret the same input pattern in syntactically | |
52 | // different ways. | |
53 | // | |
54 | template<typename _InputIterator> | |
55 | class _Scanner: public _Scanner_base | |
56 | { | |
57 | public: | |
58 | typedef _InputIterator _IteratorT; | |
59 | typedef typename std::iterator_traits<_IteratorT>::value_type _CharT; | |
60 | typedef std::basic_string<_CharT> _StringT; | |
61 | typedef regex_constants::syntax_option_type _FlagT; | |
62 | typedef const std::ctype<_CharT> _CtypeT; | |
63 | ||
64 | // Token types returned from the scanner. | |
65 | enum _TokenT | |
66 | { | |
67 | _S_token_anychar, | |
68 | _S_token_backref, | |
69 | _S_token_bracket_begin, | |
70 | _S_token_bracket_end, | |
71 | _S_token_inverse_class, | |
72 | _S_token_char_class_name, | |
73 | _S_token_closure0, | |
74 | _S_token_closure1, | |
75 | _S_token_collelem_multi, | |
76 | _S_token_collelem_single, | |
77 | _S_token_collsymbol, | |
78 | _S_token_comma, | |
79 | _S_token_dash, | |
80 | _S_token_dup_count, | |
81 | _S_token_eof, | |
82 | _S_token_equiv_class_name, | |
83 | _S_token_interval_begin, | |
84 | _S_token_interval_end, | |
85 | _S_token_line_begin, | |
86 | _S_token_line_end, | |
87 | _S_token_opt, | |
88 | _S_token_or, | |
89 | _S_token_ord_char, | |
90 | _S_token_quoted_char, | |
91 | _S_token_subexpr_begin, | |
92 | _S_token_subexpr_end, | |
93 | _S_token_word_begin, | |
94 | _S_token_word_end, | |
95 | _S_token_unknown | |
96 | }; | |
97 | ||
98 | public: | |
99 | _Scanner(_IteratorT __begin, _IteratorT __end, _FlagT __flags, | |
100 | std::locale __loc) | |
101 | : _M_current(__begin) , _M_end(__end) , _M_flags(__flags), | |
102 | _M_ctype(std::use_facet<_CtypeT>(__loc)), _M_state(_S_state_at_start) | |
103 | { _M_advance(); } | |
104 | ||
105 | void | |
106 | _M_advance(); | |
107 | ||
108 | _TokenT | |
109 | _M_token() const | |
110 | { return _M_curToken; } | |
111 | ||
112 | const _StringT& | |
113 | _M_value() const | |
114 | { return _M_curValue; } | |
115 | ||
116 | #ifdef _GLIBCXX_DEBUG | |
117 | std::ostream& | |
118 | _M_print(std::ostream&); | |
119 | #endif | |
120 | ||
121 | private: | |
122 | void | |
123 | _M_eat_escape(); | |
124 | ||
125 | void | |
126 | _M_scan_in_brace(); | |
127 | ||
128 | void | |
129 | _M_scan_in_bracket(); | |
130 | ||
131 | void | |
132 | _M_eat_charclass(); | |
133 | ||
134 | void | |
135 | _M_eat_equivclass(); | |
136 | ||
137 | void | |
138 | _M_eat_collsymbol(); | |
139 | ||
140 | private: | |
141 | _IteratorT _M_current; | |
142 | _IteratorT _M_end; | |
143 | _FlagT _M_flags; | |
144 | _CtypeT& _M_ctype; | |
145 | _TokenT _M_curToken; | |
146 | _StringT _M_curValue; | |
147 | _StateT _M_state; | |
148 | }; | |
149 | ||
150 | template<typename _InputIterator> | |
151 | void | |
152 | _Scanner<_InputIterator>:: | |
153 | _M_advance() | |
154 | { | |
155 | if (_M_current == _M_end) | |
156 | { | |
157 | _M_curToken = _S_token_eof; | |
158 | return; | |
159 | } | |
160 | ||
161 | _CharT __c = *_M_current; | |
162 | if (_M_state & _S_state_in_bracket) | |
163 | { | |
164 | _M_scan_in_bracket(); | |
165 | return; | |
166 | } | |
167 | if (_M_state & _S_state_in_brace) | |
168 | { | |
169 | _M_scan_in_brace(); | |
170 | return; | |
171 | } | |
172 | else if (_M_state & _S_state_at_start && __c == _M_ctype.widen('^')) | |
173 | { | |
174 | _M_curToken = _S_token_line_begin; | |
175 | ++_M_current; | |
176 | return; | |
177 | } | |
178 | else if (__c == _M_ctype.widen('$')) | |
179 | { | |
180 | _M_curToken = _S_token_line_end; | |
181 | ++_M_current; | |
182 | return; | |
183 | } | |
184 | else if (__c == _M_ctype.widen('.')) | |
185 | { | |
186 | _M_curToken = _S_token_anychar; | |
187 | ++_M_current; | |
188 | return; | |
189 | } | |
190 | else if (__c == _M_ctype.widen('*')) | |
191 | { | |
192 | _M_curToken = _S_token_closure0; | |
193 | ++_M_current; | |
194 | return; | |
195 | } | |
196 | else if (__c == _M_ctype.widen('+')) | |
197 | { | |
198 | _M_curToken = _S_token_closure1; | |
199 | ++_M_current; | |
200 | return; | |
201 | } | |
202 | else if (__c == _M_ctype.widen('|')) | |
203 | { | |
204 | _M_curToken = _S_token_or; | |
205 | ++_M_current; | |
206 | return; | |
207 | } | |
208 | else if (__c == _M_ctype.widen('[')) | |
209 | { | |
210 | _M_curToken = _S_token_bracket_begin; | |
211 | _M_state |= (_S_state_in_bracket | _S_state_at_start); | |
212 | ++_M_current; | |
213 | return; | |
214 | } | |
215 | else if (__c == _M_ctype.widen('\\')) | |
216 | { | |
217 | _M_eat_escape(); | |
218 | return; | |
219 | } | |
220 | else if (!(_M_flags & (regex_constants::basic | regex_constants::grep))) | |
221 | { | |
222 | if (__c == _M_ctype.widen('(')) | |
223 | { | |
224 | _M_curToken = _S_token_subexpr_begin; | |
225 | ++_M_current; | |
226 | return; | |
227 | } | |
228 | else if (__c == _M_ctype.widen(')')) | |
229 | { | |
230 | _M_curToken = _S_token_subexpr_end; | |
231 | ++_M_current; | |
232 | return; | |
233 | } | |
234 | else if (__c == _M_ctype.widen('{')) | |
235 | { | |
236 | _M_curToken = _S_token_interval_begin; | |
237 | _M_state |= _S_state_in_brace; | |
238 | ++_M_current; | |
239 | return; | |
240 | } | |
241 | } | |
242 | ||
243 | _M_curToken = _S_token_ord_char; | |
244 | _M_curValue.assign(1, __c); | |
245 | ++_M_current; | |
246 | } | |
247 | ||
248 | ||
249 | template<typename _InputIterator> | |
250 | void | |
251 | _Scanner<_InputIterator>:: | |
252 | _M_scan_in_brace() | |
253 | { | |
254 | if (_M_ctype.is(_CtypeT::digit, *_M_current)) | |
255 | { | |
256 | _M_curToken = _S_token_dup_count; | |
257 | _M_curValue.assign(1, *_M_current); | |
258 | ++_M_current; | |
259 | while (_M_current != _M_end && _M_ctype.is(_CtypeT::digit, *_M_current)) | |
260 | { | |
261 | _M_curValue += *_M_current; | |
262 | ++_M_current; | |
263 | } | |
264 | return; | |
265 | } | |
266 | else if (*_M_current == _M_ctype.widen(',')) | |
267 | { | |
268 | _M_curToken = _S_token_comma; | |
269 | ++_M_current; | |
270 | return; | |
271 | } | |
272 | if (_M_flags & (regex_constants::basic | regex_constants::grep)) | |
273 | { | |
274 | if (*_M_current == _M_ctype.widen('\\')) | |
275 | { _M_eat_escape(); } | |
276 | } | |
277 | else | |
278 | { | |
279 | if (*_M_current == _M_ctype.widen('}')) | |
280 | { | |
281 | _M_curToken = _S_token_interval_end; | |
282 | _M_state &= ~_S_state_in_brace; | |
283 | ++_M_current; | |
284 | return; | |
285 | } | |
286 | } | |
287 | } | |
288 | ||
289 | template<typename _InputIterator> | |
290 | void | |
291 | _Scanner<_InputIterator>:: | |
292 | _M_scan_in_bracket() | |
293 | { | |
294 | if (_M_state & _S_state_at_start && *_M_current == _M_ctype.widen('^')) | |
295 | { | |
296 | _M_curToken = _S_token_inverse_class; | |
297 | _M_state &= ~_S_state_at_start; | |
298 | ++_M_current; | |
299 | return; | |
300 | } | |
301 | else if (*_M_current == _M_ctype.widen('[')) | |
302 | { | |
303 | ++_M_current; | |
304 | if (_M_current == _M_end) | |
305 | { | |
306 | _M_curToken = _S_token_eof; | |
307 | return; | |
308 | } | |
309 | ||
310 | if (*_M_current == _M_ctype.widen('.')) | |
311 | { | |
312 | _M_curToken = _S_token_collsymbol; | |
313 | _M_eat_collsymbol(); | |
314 | return; | |
315 | } | |
316 | else if (*_M_current == _M_ctype.widen(':')) | |
317 | { | |
318 | _M_curToken = _S_token_char_class_name; | |
319 | _M_eat_charclass(); | |
320 | return; | |
321 | } | |
322 | else if (*_M_current == _M_ctype.widen('=')) | |
323 | { | |
324 | _M_curToken = _S_token_equiv_class_name; | |
325 | _M_eat_equivclass(); | |
326 | return; | |
327 | } | |
328 | } | |
329 | else if (*_M_current == _M_ctype.widen('-')) | |
330 | { | |
331 | _M_curToken = _S_token_dash; | |
332 | ++_M_current; | |
333 | return; | |
334 | } | |
335 | else if (*_M_current == _M_ctype.widen(']')) | |
336 | { | |
337 | if (!(_M_flags & regex_constants::ECMAScript) | |
338 | || !(_M_state & _S_state_at_start)) | |
339 | { | |
340 | // special case: only if _not_ chr first after '[' or '[^' and if not | |
341 | // ECMAscript | |
342 | _M_curToken = _S_token_bracket_end; | |
343 | ++_M_current; | |
344 | return; | |
345 | } | |
346 | } | |
347 | _M_curToken = _S_token_collelem_single; | |
348 | _M_curValue.assign(1, *_M_current); | |
349 | ++_M_current; | |
350 | } | |
351 | ||
352 | template<typename _InputIterator> | |
353 | void | |
354 | _Scanner<_InputIterator>:: | |
355 | _M_eat_escape() | |
356 | { | |
357 | ++_M_current; | |
358 | if (_M_current == _M_end) | |
359 | { | |
360 | _M_curToken = _S_token_eof; | |
361 | return; | |
362 | } | |
363 | _CharT __c = *_M_current; | |
364 | ++_M_current; | |
365 | ||
366 | if (__c == _M_ctype.widen('(')) | |
367 | { | |
368 | if (!(_M_flags & (regex_constants::basic | regex_constants::grep))) | |
369 | { | |
370 | _M_curToken = _S_token_ord_char; | |
371 | _M_curValue.assign(1, __c); | |
372 | } | |
373 | else | |
374 | { | |
375 | _M_curToken = _S_token_subexpr_begin; | |
376 | } | |
377 | } | |
378 | else if (__c == _M_ctype.widen(')')) | |
379 | { | |
380 | if (!(_M_flags & (regex_constants::basic | regex_constants::grep))) | |
381 | { | |
382 | _M_curToken = _S_token_ord_char; | |
383 | _M_curValue.assign(1, __c); | |
384 | } | |
385 | else | |
386 | { | |
387 | _M_curToken = _S_token_subexpr_end; | |
388 | } | |
389 | } | |
390 | else if (__c == _M_ctype.widen('{')) | |
391 | { | |
392 | if (!(_M_flags & (regex_constants::basic | regex_constants::grep))) | |
393 | { | |
394 | _M_curToken = _S_token_ord_char; | |
395 | _M_curValue.assign(1, __c); | |
396 | } | |
397 | else | |
398 | { | |
399 | _M_curToken = _S_token_interval_begin; | |
400 | _M_state |= _S_state_in_brace; | |
401 | } | |
402 | } | |
403 | else if (__c == _M_ctype.widen('}')) | |
404 | { | |
405 | if (!(_M_flags & (regex_constants::basic | regex_constants::grep))) | |
406 | { | |
407 | _M_curToken = _S_token_ord_char; | |
408 | _M_curValue.assign(1, __c); | |
409 | } | |
410 | else | |
411 | { | |
412 | if (!(_M_state && _S_state_in_brace)) | |
413 | __throw_regex_error(regex_constants::error_badbrace); | |
414 | _M_state &= ~_S_state_in_brace; | |
415 | _M_curToken = _S_token_interval_end; | |
416 | } | |
417 | } | |
418 | else if (__c == _M_ctype.widen('x')) | |
419 | { | |
420 | ++_M_current; | |
421 | if (_M_current == _M_end) | |
422 | { | |
423 | _M_curToken = _S_token_eof; | |
424 | return; | |
425 | } | |
426 | if (_M_ctype.is(_CtypeT::digit, *_M_current)) | |
427 | { | |
428 | _M_curValue.assign(1, *_M_current); | |
429 | ++_M_current; | |
430 | if (_M_current == _M_end) | |
431 | { | |
432 | _M_curToken = _S_token_eof; | |
433 | return; | |
434 | } | |
435 | if (_M_ctype.is(_CtypeT::digit, *_M_current)) | |
436 | { | |
437 | _M_curValue += *_M_current; | |
438 | ++_M_current; | |
439 | return; | |
440 | } | |
441 | } | |
442 | } | |
443 | else if (__c == _M_ctype.widen('^') | |
444 | || __c == _M_ctype.widen('.') | |
445 | || __c == _M_ctype.widen('*') | |
446 | || __c == _M_ctype.widen('$') | |
447 | || __c == _M_ctype.widen('\\')) | |
448 | { | |
449 | _M_curToken = _S_token_ord_char; | |
450 | _M_curValue.assign(1, __c); | |
451 | } | |
452 | else if (_M_ctype.is(_CtypeT::digit, __c)) | |
453 | { | |
454 | _M_curToken = _S_token_backref; | |
455 | _M_curValue.assign(1, __c); | |
456 | } | |
457 | else | |
458 | { | |
459 | __throw_regex_error(regex_constants::error_escape); | |
460 | } | |
461 | } | |
462 | ||
463 | ||
464 | // Eats a character class or throwns an exception. | |
465 | // current point to ':' delimiter on entry, char after ']' on return | |
466 | template<typename _InputIterator> | |
467 | void | |
468 | _Scanner<_InputIterator>:: | |
469 | _M_eat_charclass() | |
470 | { | |
471 | ++_M_current; // skip ':' | |
472 | if (_M_current == _M_end) | |
473 | __throw_regex_error(regex_constants::error_ctype); | |
474 | for (_M_curValue.clear(); | |
475 | _M_current != _M_end && *_M_current != _M_ctype.widen(':'); | |
476 | ++_M_current) | |
477 | _M_curValue += *_M_current; | |
478 | if (_M_current == _M_end) | |
479 | __throw_regex_error(regex_constants::error_ctype); | |
480 | ++_M_current; // skip ':' | |
481 | if (*_M_current != _M_ctype.widen(']')) | |
482 | __throw_regex_error(regex_constants::error_ctype); | |
483 | ++_M_current; // skip ']' | |
484 | } | |
485 | ||
486 | ||
487 | template<typename _InputIterator> | |
488 | void | |
489 | _Scanner<_InputIterator>:: | |
490 | _M_eat_equivclass() | |
491 | { | |
492 | ++_M_current; // skip '=' | |
493 | if (_M_current == _M_end) | |
494 | __throw_regex_error(regex_constants::error_collate); | |
495 | for (_M_curValue.clear(); | |
496 | _M_current != _M_end && *_M_current != _M_ctype.widen('='); | |
497 | ++_M_current) | |
498 | _M_curValue += *_M_current; | |
499 | if (_M_current == _M_end) | |
500 | __throw_regex_error(regex_constants::error_collate); | |
501 | ++_M_current; // skip '=' | |
502 | if (*_M_current != _M_ctype.widen(']')) | |
503 | __throw_regex_error(regex_constants::error_collate); | |
504 | ++_M_current; // skip ']' | |
505 | } | |
506 | ||
507 | ||
508 | template<typename _InputIterator> | |
509 | void | |
510 | _Scanner<_InputIterator>:: | |
511 | _M_eat_collsymbol() | |
512 | { | |
513 | ++_M_current; // skip '.' | |
514 | if (_M_current == _M_end) | |
515 | __throw_regex_error(regex_constants::error_collate); | |
516 | for (_M_curValue.clear(); | |
517 | _M_current != _M_end && *_M_current != _M_ctype.widen('.'); | |
518 | ++_M_current) | |
519 | _M_curValue += *_M_current; | |
520 | if (_M_current == _M_end) | |
521 | __throw_regex_error(regex_constants::error_collate); | |
522 | ++_M_current; // skip '.' | |
523 | if (*_M_current != _M_ctype.widen(']')) | |
524 | __throw_regex_error(regex_constants::error_collate); | |
525 | ++_M_current; // skip ']' | |
526 | } | |
527 | ||
528 | #ifdef _GLIBCXX_DEBUG | |
529 | template<typename _InputIterator> | |
530 | std::ostream& | |
531 | _Scanner<_InputIterator>:: | |
532 | _M_print(std::ostream& ostr) | |
533 | { | |
534 | switch (_M_curToken) | |
535 | { | |
536 | case _S_token_anychar: | |
537 | ostr << "any-character\n"; | |
538 | break; | |
539 | case _S_token_backref: | |
540 | ostr << "backref\n"; | |
541 | break; | |
542 | case _S_token_bracket_begin: | |
543 | ostr << "bracket-begin\n"; | |
544 | break; | |
545 | case _S_token_bracket_end: | |
546 | ostr << "bracket-end\n"; | |
547 | break; | |
548 | case _S_token_char_class_name: | |
549 | ostr << "char-class-name \"" << _M_curValue << "\"\n"; | |
550 | break; | |
551 | case _S_token_closure0: | |
552 | ostr << "closure0\n"; | |
553 | break; | |
554 | case _S_token_closure1: | |
555 | ostr << "closure1\n"; | |
556 | break; | |
557 | case _S_token_collelem_multi: | |
558 | ostr << "coll-elem-multi \"" << _M_curValue << "\"\n"; | |
559 | break; | |
560 | case _S_token_collelem_single: | |
561 | ostr << "coll-elem-single \"" << _M_curValue << "\"\n"; | |
562 | break; | |
563 | case _S_token_collsymbol: | |
564 | ostr << "collsymbol \"" << _M_curValue << "\"\n"; | |
565 | break; | |
566 | case _S_token_comma: | |
567 | ostr << "comma\n"; | |
568 | break; | |
569 | case _S_token_dash: | |
570 | ostr << "dash\n"; | |
571 | break; | |
572 | case _S_token_dup_count: | |
573 | ostr << "dup count: " << _M_curValue << "\n"; | |
574 | break; | |
575 | case _S_token_eof: | |
576 | ostr << "EOF\n"; | |
577 | break; | |
578 | case _S_token_equiv_class_name: | |
579 | ostr << "equiv-class-name \"" << _M_curValue << "\"\n"; | |
580 | break; | |
581 | case _S_token_interval_begin: | |
582 | ostr << "interval begin\n"; | |
583 | break; | |
584 | case _S_token_interval_end: | |
585 | ostr << "interval end\n"; | |
586 | break; | |
587 | case _S_token_line_begin: | |
588 | ostr << "line begin\n"; | |
589 | break; | |
590 | case _S_token_line_end: | |
591 | ostr << "line end\n"; | |
592 | break; | |
593 | case _S_token_opt: | |
594 | ostr << "opt\n"; | |
595 | break; | |
596 | case _S_token_or: | |
597 | ostr << "or\n"; | |
598 | break; | |
599 | case _S_token_ord_char: | |
600 | ostr << "ordinary character: \"" << _M_value() << "\"\n"; | |
601 | break; | |
602 | case _S_token_quoted_char: | |
603 | ostr << "quoted char\n"; | |
604 | break; | |
605 | case _S_token_subexpr_begin: | |
606 | ostr << "subexpr begin\n"; | |
607 | break; | |
608 | case _S_token_subexpr_end: | |
609 | ostr << "subexpr end\n"; | |
610 | break; | |
611 | case _S_token_word_begin: | |
612 | ostr << "word begin\n"; | |
613 | break; | |
614 | case _S_token_word_end: | |
615 | ostr << "word end\n"; | |
616 | break; | |
617 | case _S_token_unknown: | |
618 | ostr << "-- unknown token --\n"; | |
619 | break; | |
620 | } | |
621 | return ostr; | |
622 | } | |
623 | #endif | |
624 | ||
625 | // Builds an NFA from an input iterator interval. | |
626 | template<typename _InIter, typename _TraitsT> | |
627 | class _Compiler | |
628 | { | |
629 | public: | |
630 | typedef _InIter _IterT; | |
631 | typedef typename std::iterator_traits<_InIter>::value_type _CharT; | |
632 | typedef std::basic_string<_CharT> _StringT; | |
633 | typedef regex_constants::syntax_option_type _FlagT; | |
634 | ||
635 | public: | |
636 | _Compiler(const _InIter& __b, const _InIter& __e, | |
637 | _TraitsT& __traits, _FlagT __flags); | |
638 | ||
639 | const _Nfa& | |
640 | _M_nfa() const | |
641 | { return _M_state_store; } | |
642 | ||
643 | private: | |
644 | typedef _Scanner<_InIter> _ScannerT; | |
645 | typedef typename _ScannerT::_TokenT _TokenT; | |
646 | typedef std::stack<_StateSeq, std::vector<_StateSeq> > _StackT; | |
647 | typedef _RangeMatcher<_InIter, _TraitsT> _RMatcherT; | |
648 | ||
649 | // accepts a specific token or returns false. | |
650 | bool | |
651 | _M_match_token(_TokenT token); | |
652 | ||
653 | void | |
654 | _M_disjunction(); | |
655 | ||
656 | bool | |
657 | _M_alternative(); | |
658 | ||
659 | bool | |
660 | _M_term(); | |
661 | ||
662 | bool | |
663 | _M_assertion(); | |
664 | ||
665 | bool | |
666 | _M_quantifier(); | |
667 | ||
668 | bool | |
669 | _M_atom(); | |
670 | ||
671 | bool | |
672 | _M_bracket_expression(); | |
673 | ||
674 | bool | |
675 | _M_bracket_list(_RMatcherT& __matcher); | |
676 | ||
677 | bool | |
678 | _M_follow_list(_RMatcherT& __matcher); | |
679 | ||
680 | bool | |
681 | _M_follow_list2(_RMatcherT& __matcher); | |
682 | ||
683 | bool | |
684 | _M_expression_term(_RMatcherT& __matcher); | |
685 | ||
686 | bool | |
687 | _M_range_expression(_RMatcherT& __matcher); | |
688 | ||
689 | bool | |
690 | _M_start_range(_RMatcherT& __matcher); | |
691 | ||
692 | bool | |
693 | _M_collating_symbol(_RMatcherT& __matcher); | |
694 | ||
695 | bool | |
696 | _M_equivalence_class(_RMatcherT& __matcher); | |
697 | ||
698 | bool | |
699 | _M_character_class(_RMatcherT& __matcher); | |
700 | ||
701 | int | |
702 | _M_cur_int_value(int __radix); | |
703 | ||
704 | private: | |
705 | _TraitsT& _M_traits; | |
706 | _ScannerT _M_scanner; | |
707 | _StringT _M_cur_value; | |
708 | _Nfa _M_state_store; | |
709 | _StackT _M_stack; | |
710 | }; | |
711 | ||
712 | template<typename _InIter, typename _TraitsT> | |
713 | _Compiler<_InIter, _TraitsT>:: | |
714 | _Compiler(const _InIter& __b, const _InIter& __e, _TraitsT& __traits, | |
715 | _Compiler<_InIter, _TraitsT>::_FlagT __flags) | |
716 | : _M_traits(__traits), _M_scanner(__b, __e, __flags, _M_traits.getloc()), | |
717 | _M_state_store(__flags) | |
718 | { | |
719 | using std::bind; | |
720 | using std::placeholders::_1; | |
721 | using std::placeholders::_2; | |
722 | typedef _StartTagger<_InIter, _TraitsT> _Start; | |
723 | typedef _EndTagger<_InIter, _TraitsT> _End; | |
724 | ||
725 | _StateSeq __r(_M_state_store, | |
726 | _M_state_store._M_insert_subexpr_begin( | |
727 | bind(_Start(0), _1, _2))); | |
728 | _M_disjunction(); | |
729 | if (!_M_stack.empty()) | |
730 | { | |
731 | __r._M_append(_M_stack.top()); | |
732 | _M_stack.pop(); | |
733 | } | |
734 | __r._M_append(_M_state_store._M_insert_subexpr_end(0, | |
735 | bind(_End(0), _1, _2))); | |
736 | __r._M_append(_M_state_store._M_insert_accept()); | |
737 | } | |
738 | ||
739 | template<typename _InIter, typename _TraitsT> | |
740 | bool | |
741 | _Compiler<_InIter, _TraitsT>:: | |
742 | _M_match_token(_Compiler<_InIter, _TraitsT>::_TokenT token) | |
743 | { | |
744 | if (token == _M_scanner._M_token()) | |
745 | { | |
746 | _M_cur_value = _M_scanner._M_value(); | |
747 | _M_scanner._M_advance(); | |
748 | return true; | |
749 | } | |
750 | return false; | |
751 | } | |
752 | ||
753 | template<typename _InIter, typename _TraitsT> | |
754 | void | |
755 | _Compiler<_InIter, _TraitsT>:: | |
756 | _M_disjunction() | |
757 | { | |
758 | this->_M_alternative(); | |
759 | if (_M_match_token(_ScannerT::_S_token_or)) | |
760 | { | |
761 | _StateSeq __alt1 = _M_stack.top(); _M_stack.pop(); | |
762 | this->_M_disjunction(); | |
763 | _StateSeq __alt2 = _M_stack.top(); _M_stack.pop(); | |
764 | _M_stack.push(_StateSeq(__alt1, __alt2)); | |
765 | } | |
766 | } | |
767 | ||
768 | template<typename _InIter, typename _TraitsT> | |
769 | bool | |
770 | _Compiler<_InIter, _TraitsT>:: | |
771 | _M_alternative() | |
772 | { | |
773 | if (this->_M_term()) | |
774 | { | |
775 | _StateSeq __re = _M_stack.top(); _M_stack.pop(); | |
776 | this->_M_alternative(); | |
777 | if (!_M_stack.empty()) | |
778 | { | |
779 | __re._M_append(_M_stack.top()); | |
780 | _M_stack.pop(); | |
781 | } | |
782 | _M_stack.push(__re); | |
783 | return true; | |
784 | } | |
785 | return false; | |
786 | } | |
787 | ||
788 | template<typename _InIter, typename _TraitsT> | |
789 | bool | |
790 | _Compiler<_InIter, _TraitsT>:: | |
791 | _M_term() | |
792 | { | |
793 | if (this->_M_assertion()) | |
794 | return true; | |
795 | if (this->_M_atom()) | |
796 | { | |
797 | this->_M_quantifier(); | |
798 | return true; | |
799 | } | |
800 | return false; | |
801 | } | |
802 | ||
803 | template<typename _InIter, typename _TraitsT> | |
804 | bool | |
805 | _Compiler<_InIter, _TraitsT>:: | |
806 | _M_assertion() | |
807 | { | |
808 | if (_M_match_token(_ScannerT::_S_token_line_begin)) | |
809 | { | |
810 | // __m.push(_Matcher::_S_opcode_line_begin); | |
811 | return true; | |
812 | } | |
813 | if (_M_match_token(_ScannerT::_S_token_line_end)) | |
814 | { | |
815 | // __m.push(_Matcher::_S_opcode_line_end); | |
816 | return true; | |
817 | } | |
818 | if (_M_match_token(_ScannerT::_S_token_word_begin)) | |
819 | { | |
820 | // __m.push(_Matcher::_S_opcode_word_begin); | |
821 | return true; | |
822 | } | |
823 | if (_M_match_token(_ScannerT::_S_token_word_end)) | |
824 | { | |
825 | // __m.push(_Matcher::_S_opcode_word_end); | |
826 | return true; | |
827 | } | |
828 | return false; | |
829 | } | |
830 | ||
831 | template<typename _InIter, typename _TraitsT> | |
832 | bool | |
833 | _Compiler<_InIter, _TraitsT>:: | |
834 | _M_quantifier() | |
835 | { | |
836 | if (_M_match_token(_ScannerT::_S_token_closure0)) | |
837 | { | |
838 | if (_M_stack.empty()) | |
839 | __throw_regex_error(regex_constants::error_badrepeat); | |
840 | _StateSeq __r(_M_stack.top(), -1); | |
841 | __r._M_append(__r._M_front()); | |
842 | _M_stack.pop(); | |
843 | _M_stack.push(__r); | |
844 | return true; | |
845 | } | |
846 | if (_M_match_token(_ScannerT::_S_token_closure1)) | |
847 | { | |
848 | if (_M_stack.empty()) | |
849 | __throw_regex_error(regex_constants::error_badrepeat); | |
850 | _StateSeq __r(_M_state_store, | |
851 | _M_state_store._M_insert_alt(_S_invalid_state_id, | |
852 | _M_stack.top()._M_front())); | |
853 | _M_stack.top()._M_append(__r); | |
854 | return true; | |
855 | } | |
856 | if (_M_match_token(_ScannerT::_S_token_opt)) | |
857 | { | |
858 | if (_M_stack.empty()) | |
859 | __throw_regex_error(regex_constants::error_badrepeat); | |
860 | _StateSeq __r(_M_stack.top(), -1); | |
861 | _M_stack.pop(); | |
862 | _M_stack.push(__r); | |
863 | return true; | |
864 | } | |
865 | if (_M_match_token(_ScannerT::_S_token_interval_begin)) | |
866 | { | |
867 | if (_M_stack.empty()) | |
868 | __throw_regex_error(regex_constants::error_badrepeat); | |
869 | if (!_M_match_token(_ScannerT::_S_token_dup_count)) | |
870 | __throw_regex_error(regex_constants::error_badbrace); | |
871 | _StateSeq __r(_M_stack.top()); | |
872 | int __min_rep = _M_cur_int_value(10); | |
873 | for (int __i = 1; __i < __min_rep; ++__i) | |
874 | _M_stack.top()._M_append(__r._M_clone()); | |
875 | if (_M_match_token(_ScannerT::_S_token_comma)) | |
876 | if (_M_match_token(_ScannerT::_S_token_dup_count)) | |
877 | { | |
878 | int __n = _M_cur_int_value(10) - __min_rep; | |
879 | if (__n < 0) | |
880 | __throw_regex_error(regex_constants::error_badbrace); | |
881 | for (int __i = 0; __i < __n; ++__i) | |
882 | { | |
883 | _StateSeq __r(_M_state_store, | |
884 | _M_state_store._M_insert_alt(_S_invalid_state_id, | |
885 | _M_stack.top()._M_front())); | |
886 | _M_stack.top()._M_append(__r); | |
887 | } | |
888 | } | |
889 | else | |
890 | { | |
891 | _StateSeq __r(_M_stack.top(), -1); | |
892 | __r._M_push_back(__r._M_front()); | |
893 | _M_stack.pop(); | |
894 | _M_stack.push(__r); | |
895 | } | |
896 | if (!_M_match_token(_ScannerT::_S_token_interval_end)) | |
897 | __throw_regex_error(regex_constants::error_brace); | |
898 | return true; | |
899 | } | |
900 | return false; | |
901 | } | |
902 | ||
903 | template<typename _InIter, typename _TraitsT> | |
904 | bool | |
905 | _Compiler<_InIter, _TraitsT>:: | |
906 | _M_atom() | |
907 | { | |
908 | using std::bind; | |
909 | using std::placeholders::_1; | |
910 | using std::placeholders::_2; | |
911 | typedef _CharMatcher<_InIter, _TraitsT> _CMatcher; | |
912 | typedef _StartTagger<_InIter, _TraitsT> _Start; | |
913 | typedef _EndTagger<_InIter, _TraitsT> _End; | |
914 | ||
915 | if (_M_match_token(_ScannerT::_S_token_anychar)) | |
916 | { | |
917 | _M_stack.push(_StateSeq(_M_state_store, | |
918 | _M_state_store._M_insert_matcher(bind(_AnyMatcher, _1)))); | |
919 | return true; | |
920 | } | |
921 | if (_M_match_token(_ScannerT::_S_token_ord_char)) | |
922 | { | |
923 | _M_stack.push(_StateSeq(_M_state_store, | |
924 | _M_state_store._M_insert_matcher( | |
925 | bind(_CMatcher(_M_cur_value[0], _M_traits), _1)))); | |
926 | return true; | |
927 | } | |
928 | if (_M_match_token(_ScannerT::_S_token_quoted_char)) | |
929 | { | |
930 | // note that in the ECMA grammar, this case covers backrefs. | |
931 | _M_stack.push(_StateSeq(_M_state_store, | |
932 | _M_state_store._M_insert_matcher( | |
933 | bind(_CMatcher(_M_cur_value[0], _M_traits), _1)))); | |
934 | return true; | |
935 | } | |
936 | if (_M_match_token(_ScannerT::_S_token_backref)) | |
937 | { | |
938 | // __m.push(_Matcher::_S_opcode_ordchar, _M_cur_value); | |
939 | return true; | |
940 | } | |
941 | if (_M_match_token(_ScannerT::_S_token_subexpr_begin)) | |
942 | { | |
943 | int __mark = _M_state_store._M_sub_count(); | |
944 | _StateSeq __r(_M_state_store, | |
945 | _M_state_store._M_insert_subexpr_begin(bind(_Start(__mark), _1, _2))); | |
946 | this->_M_disjunction(); | |
947 | if (!_M_match_token(_ScannerT::_S_token_subexpr_end)) | |
948 | __throw_regex_error(regex_constants::error_paren); | |
949 | if (!_M_stack.empty()) | |
950 | { | |
951 | __r._M_append(_M_stack.top()); | |
952 | _M_stack.pop(); | |
953 | } | |
954 | __r._M_append(_M_state_store._M_insert_subexpr_end(__mark, | |
955 | bind(_End(__mark), _1, _2))); | |
956 | _M_stack.push(__r); | |
957 | return true; | |
958 | } | |
959 | return _M_bracket_expression(); | |
960 | } | |
961 | ||
962 | template<typename _InIter, typename _TraitsT> | |
963 | bool | |
964 | _Compiler<_InIter, _TraitsT>:: | |
965 | _M_bracket_expression() | |
966 | { | |
967 | using std::bind; | |
968 | using std::placeholders::_1; | |
969 | if (_M_match_token(_ScannerT::_S_token_bracket_begin)) | |
970 | { | |
971 | _RMatcherT __matcher(_M_match_token(_ScannerT::_S_token_line_begin), | |
972 | _M_traits); | |
973 | if (!_M_bracket_list(__matcher) | |
974 | || !_M_match_token(_ScannerT::_S_token_bracket_end)) | |
975 | __throw_regex_error(regex_constants::error_brack); | |
976 | _M_stack.push(_StateSeq(_M_state_store, | |
977 | _M_state_store._M_insert_matcher( | |
978 | bind(__matcher, _1)))); | |
979 | return true; | |
980 | } | |
981 | return false; | |
982 | } | |
983 | ||
984 | // If the dash is the last character in the bracket expression, it is not | |
985 | // special. | |
986 | template<typename _InIter, typename _TraitsT> | |
987 | bool | |
988 | _Compiler<_InIter, _TraitsT>:: | |
989 | _M_bracket_list(_RMatcherT& __matcher) | |
990 | { | |
991 | if (_M_follow_list(__matcher)) | |
992 | { | |
993 | if (_M_match_token(_ScannerT::_S_token_dash)) | |
994 | __matcher._M_add_char(_M_cur_value[0]); | |
995 | return true; | |
996 | } | |
997 | return false; | |
998 | } | |
999 | ||
1000 | template<typename _InIter, typename _TraitsT> | |
1001 | bool | |
1002 | _Compiler<_InIter, _TraitsT>:: | |
1003 | _M_follow_list(_RMatcherT& __matcher) | |
1004 | { return _M_expression_term(__matcher) && _M_follow_list2(__matcher); } | |
1005 | ||
1006 | template<typename _InIter, typename _TraitsT> | |
1007 | bool | |
1008 | _Compiler<_InIter, _TraitsT>:: | |
1009 | _M_follow_list2(_RMatcherT& __matcher) | |
1010 | { | |
1011 | if (_M_expression_term(__matcher)) | |
1012 | return _M_follow_list2(__matcher); | |
1013 | return true; | |
1014 | } | |
1015 | ||
1016 | template<typename _InIter, typename _TraitsT> | |
1017 | bool | |
1018 | _Compiler<_InIter, _TraitsT>:: | |
1019 | _M_expression_term(_RMatcherT& __matcher) | |
1020 | { | |
1021 | return _M_collating_symbol(__matcher) | |
1022 | || _M_character_class(__matcher) | |
1023 | || _M_equivalence_class(__matcher) | |
1024 | || (_M_start_range(__matcher) && _M_range_expression(__matcher)); | |
1025 | } | |
1026 | ||
1027 | template<typename _InIter, typename _TraitsT> | |
1028 | bool | |
1029 | _Compiler<_InIter, _TraitsT>:: | |
1030 | _M_range_expression(_RMatcherT& __matcher) | |
1031 | { | |
1032 | if (!_M_collating_symbol(__matcher)) | |
1033 | if (!_M_match_token(_ScannerT::_S_token_dash)) | |
1034 | __throw_regex_error(regex_constants::error_range); | |
1035 | __matcher._M_make_range(); | |
1036 | return true; | |
1037 | } | |
1038 | ||
1039 | template<typename _InIter, typename _TraitsT> | |
1040 | bool | |
1041 | _Compiler<_InIter, _TraitsT>:: | |
1042 | _M_start_range(_RMatcherT& __matcher) | |
1043 | { return _M_match_token(_ScannerT::_S_token_dash); } | |
1044 | ||
1045 | template<typename _InIter, typename _TraitsT> | |
1046 | bool | |
1047 | _Compiler<_InIter, _TraitsT>:: | |
1048 | _M_collating_symbol(_RMatcherT& __matcher) | |
1049 | { | |
1050 | if (_M_match_token(_ScannerT::_S_token_collelem_single)) | |
1051 | { | |
1052 | __matcher._M_add_char(_M_cur_value[0]); | |
1053 | return true; | |
1054 | } | |
1055 | if (_M_match_token(_ScannerT::_S_token_collsymbol)) | |
1056 | { | |
1057 | __matcher._M_add_collating_element(_M_cur_value); | |
1058 | return true; | |
1059 | } | |
1060 | return false; | |
1061 | } | |
1062 | ||
1063 | template<typename _InIter, typename _TraitsT> | |
1064 | bool | |
1065 | _Compiler<_InIter, _TraitsT>:: | |
1066 | _M_equivalence_class(_RMatcherT& __matcher) | |
1067 | { | |
1068 | if (_M_match_token(_ScannerT::_S_token_equiv_class_name)) | |
1069 | { | |
1070 | __matcher._M_add_equivalence_class(_M_cur_value); | |
1071 | return true; | |
1072 | } | |
1073 | return false; | |
1074 | } | |
1075 | ||
1076 | template<typename _InIter, typename _TraitsT> | |
1077 | bool | |
1078 | _Compiler<_InIter, _TraitsT>:: | |
1079 | _M_character_class(_RMatcherT& __matcher) | |
1080 | { | |
1081 | if (_M_match_token(_ScannerT::_S_token_char_class_name)) | |
1082 | { | |
1083 | __matcher._M_add_character_class(_M_cur_value); | |
1084 | return true; | |
1085 | } | |
1086 | return false; | |
1087 | } | |
1088 | ||
1089 | template<typename _InIter, typename _TraitsT> | |
1090 | int | |
1091 | _Compiler<_InIter, _TraitsT>:: | |
1092 | _M_cur_int_value(int __radix) | |
1093 | { | |
1094 | int __v = 0; | |
1095 | for (typename _StringT::size_type __i = 0; | |
1096 | __i < _M_cur_value.length(); ++__i) | |
1097 | __v =__v * __radix + _M_traits.value(_M_cur_value[__i], __radix); | |
1098 | return __v; | |
1099 | } | |
1100 | ||
1101 | template<typename _InIter, typename _TraitsT> | |
1102 | _AutomatonPtr | |
1103 | __compile(const _InIter& __b, const _InIter& __e, _TraitsT& __t, | |
1104 | regex_constants::syntax_option_type __f) | |
1105 | { return _AutomatonPtr(new _Nfa(_Compiler<_InIter, _TraitsT>(__b, __e, __t, | |
1106 | __f)._M_nfa())); } | |
1107 | ||
1108 | } // namespace __regex | |
1109 | } // namespace std | |
1110 | ||
1111 | /* vim: set ts=8 sw=2 sts=2: */ |