]>
Commit | Line | Data |
---|---|---|
849cab7b SW |
1 | // class template regex -*- C++ -*- |
2 | ||
3 | // Copyright (C) 2010 Free Software Foundation, Inc. | |
4 | // | |
5 | // This file is part of the GNU ISO C++ Library. This library is free | |
6 | // software; you can redistribute it and/or modify it under the | |
7 | // terms of the GNU General Public License as published by the | |
8 | // Free Software Foundation; either version 3, or (at your option) | |
9 | // any later version. | |
10 | ||
11 | // This library is distributed in the hope that it will be useful, | |
12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | // GNU General Public License for more details. | |
15 | ||
16 | // Under Section 7 of GPL version 3, you are granted additional | |
17 | // permissions described in the GCC Runtime Library Exception, version | |
18 | // 3.1, as published by the Free Software Foundation. | |
19 | ||
20 | // You should have received a copy of the GNU General Public License and | |
21 | // a copy of the GCC Runtime Library Exception along with this program; | |
22 | // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see | |
23 | // <http://www.gnu.org/licenses/>. | |
24 | ||
25 | /** | |
f910786b BK |
26 | * @file bits/regex_compiler.h |
27 | * This is an internal header file, included by other library headers. | |
28 | * Do not attempt to use it directly. @headername{regex} | |
849cab7b SW |
29 | */ |
30 | ||
12ffa228 BK |
31 | namespace std _GLIBCXX_VISIBILITY(default) |
32 | { | |
33 | _GLIBCXX_BEGIN_NAMESPACE_VERSION | |
53dc5044 | 34 | |
849cab7b SW |
35 | namespace __regex |
36 | { | |
37 | struct _Scanner_base | |
38 | { | |
d860c842 PC |
39 | // FIXME: replace these constanst with constexpr |
40 | typedef unsigned int _StateT; | |
849cab7b SW |
41 | |
42 | static const _StateT _S_state_at_start = 1 << 0; | |
43 | static const _StateT _S_state_in_brace = 1 << 2; | |
44 | static const _StateT _S_state_in_bracket = 1 << 3; | |
45 | }; | |
46 | ||
47 | // | |
48 | // @brief Scans an input range for regex tokens. | |
49 | // | |
50 | // The %_Scanner class interprets the regular expression pattern in the input | |
51 | // range passed to its constructor as a sequence of parse tokens passed to | |
52 | // the regular expression compiler. The sequence of tokens provided depends | |
53 | // on the flag settings passed to the constructor: different regular | |
54 | // expression gramars will interpret the same input pattern in syntactically | |
55 | // different ways. | |
56 | // | |
57 | template<typename _InputIterator> | |
58 | class _Scanner: public _Scanner_base | |
59 | { | |
60 | public: | |
61 | typedef _InputIterator _IteratorT; | |
62 | typedef typename std::iterator_traits<_IteratorT>::value_type _CharT; | |
63 | typedef std::basic_string<_CharT> _StringT; | |
64 | typedef regex_constants::syntax_option_type _FlagT; | |
65 | typedef const std::ctype<_CharT> _CtypeT; | |
66 | ||
67 | // Token types returned from the scanner. | |
68 | enum _TokenT | |
69 | { | |
70 | _S_token_anychar, | |
71 | _S_token_backref, | |
72 | _S_token_bracket_begin, | |
73 | _S_token_bracket_end, | |
74 | _S_token_inverse_class, | |
75 | _S_token_char_class_name, | |
76 | _S_token_closure0, | |
77 | _S_token_closure1, | |
78 | _S_token_collelem_multi, | |
79 | _S_token_collelem_single, | |
80 | _S_token_collsymbol, | |
81 | _S_token_comma, | |
82 | _S_token_dash, | |
83 | _S_token_dup_count, | |
84 | _S_token_eof, | |
85 | _S_token_equiv_class_name, | |
86 | _S_token_interval_begin, | |
87 | _S_token_interval_end, | |
88 | _S_token_line_begin, | |
89 | _S_token_line_end, | |
90 | _S_token_opt, | |
91 | _S_token_or, | |
92 | _S_token_ord_char, | |
93 | _S_token_quoted_char, | |
94 | _S_token_subexpr_begin, | |
95 | _S_token_subexpr_end, | |
96 | _S_token_word_begin, | |
97 | _S_token_word_end, | |
98 | _S_token_unknown | |
99 | }; | |
100 | ||
101 | public: | |
102 | _Scanner(_IteratorT __begin, _IteratorT __end, _FlagT __flags, | |
103 | std::locale __loc) | |
104 | : _M_current(__begin) , _M_end(__end) , _M_flags(__flags), | |
105 | _M_ctype(std::use_facet<_CtypeT>(__loc)), _M_state(_S_state_at_start) | |
106 | { _M_advance(); } | |
107 | ||
108 | void | |
109 | _M_advance(); | |
110 | ||
111 | _TokenT | |
112 | _M_token() const | |
113 | { return _M_curToken; } | |
114 | ||
115 | const _StringT& | |
116 | _M_value() const | |
117 | { return _M_curValue; } | |
118 | ||
119 | #ifdef _GLIBCXX_DEBUG | |
120 | std::ostream& | |
121 | _M_print(std::ostream&); | |
122 | #endif | |
123 | ||
124 | private: | |
125 | void | |
126 | _M_eat_escape(); | |
127 | ||
128 | void | |
129 | _M_scan_in_brace(); | |
130 | ||
131 | void | |
132 | _M_scan_in_bracket(); | |
133 | ||
134 | void | |
135 | _M_eat_charclass(); | |
136 | ||
137 | void | |
138 | _M_eat_equivclass(); | |
139 | ||
140 | void | |
141 | _M_eat_collsymbol(); | |
142 | ||
143 | private: | |
144 | _IteratorT _M_current; | |
145 | _IteratorT _M_end; | |
146 | _FlagT _M_flags; | |
147 | _CtypeT& _M_ctype; | |
148 | _TokenT _M_curToken; | |
149 | _StringT _M_curValue; | |
150 | _StateT _M_state; | |
151 | }; | |
152 | ||
153 | template<typename _InputIterator> | |
154 | void | |
155 | _Scanner<_InputIterator>:: | |
156 | _M_advance() | |
157 | { | |
158 | if (_M_current == _M_end) | |
d860c842 PC |
159 | { |
160 | _M_curToken = _S_token_eof; | |
161 | return; | |
162 | } | |
849cab7b SW |
163 | |
164 | _CharT __c = *_M_current; | |
165 | if (_M_state & _S_state_in_bracket) | |
d860c842 PC |
166 | { |
167 | _M_scan_in_bracket(); | |
168 | return; | |
169 | } | |
849cab7b | 170 | if (_M_state & _S_state_in_brace) |
d860c842 PC |
171 | { |
172 | _M_scan_in_brace(); | |
173 | return; | |
174 | } | |
849cab7b | 175 | else if (_M_state & _S_state_at_start && __c == _M_ctype.widen('^')) |
d860c842 PC |
176 | { |
177 | _M_curToken = _S_token_line_begin; | |
178 | ++_M_current; | |
179 | return; | |
180 | } | |
849cab7b | 181 | else if (__c == _M_ctype.widen('$')) |
d860c842 PC |
182 | { |
183 | _M_curToken = _S_token_line_end; | |
184 | ++_M_current; | |
185 | return; | |
186 | } | |
849cab7b | 187 | else if (__c == _M_ctype.widen('.')) |
d860c842 PC |
188 | { |
189 | _M_curToken = _S_token_anychar; | |
190 | ++_M_current; | |
191 | return; | |
192 | } | |
849cab7b | 193 | else if (__c == _M_ctype.widen('*')) |
d860c842 PC |
194 | { |
195 | _M_curToken = _S_token_closure0; | |
196 | ++_M_current; | |
197 | return; | |
198 | } | |
849cab7b | 199 | else if (__c == _M_ctype.widen('+')) |
849cab7b | 200 | { |
d860c842 | 201 | _M_curToken = _S_token_closure1; |
849cab7b SW |
202 | ++_M_current; |
203 | return; | |
204 | } | |
d860c842 | 205 | else if (__c == _M_ctype.widen('|')) |
849cab7b | 206 | { |
d860c842 | 207 | _M_curToken = _S_token_or; |
849cab7b SW |
208 | ++_M_current; |
209 | return; | |
210 | } | |
d860c842 | 211 | else if (__c == _M_ctype.widen('[')) |
849cab7b | 212 | { |
d860c842 PC |
213 | _M_curToken = _S_token_bracket_begin; |
214 | _M_state |= (_S_state_in_bracket | _S_state_at_start); | |
849cab7b SW |
215 | ++_M_current; |
216 | return; | |
217 | } | |
d860c842 PC |
218 | else if (__c == _M_ctype.widen('\\')) |
219 | { | |
220 | _M_eat_escape(); | |
221 | return; | |
222 | } | |
223 | else if (!(_M_flags & (regex_constants::basic | regex_constants::grep))) | |
224 | { | |
225 | if (__c == _M_ctype.widen('(')) | |
226 | { | |
227 | _M_curToken = _S_token_subexpr_begin; | |
228 | ++_M_current; | |
229 | return; | |
230 | } | |
231 | else if (__c == _M_ctype.widen(')')) | |
232 | { | |
233 | _M_curToken = _S_token_subexpr_end; | |
234 | ++_M_current; | |
235 | return; | |
236 | } | |
237 | else if (__c == _M_ctype.widen('{')) | |
238 | { | |
239 | _M_curToken = _S_token_interval_begin; | |
240 | _M_state |= _S_state_in_brace; | |
241 | ++_M_current; | |
242 | return; | |
243 | } | |
244 | } | |
849cab7b SW |
245 | |
246 | _M_curToken = _S_token_ord_char; | |
247 | _M_curValue.assign(1, __c); | |
248 | ++_M_current; | |
249 | } | |
250 | ||
251 | ||
252 | template<typename _InputIterator> | |
253 | void | |
254 | _Scanner<_InputIterator>:: | |
255 | _M_scan_in_brace() | |
256 | { | |
257 | if (_M_ctype.is(_CtypeT::digit, *_M_current)) | |
849cab7b | 258 | { |
d860c842 PC |
259 | _M_curToken = _S_token_dup_count; |
260 | _M_curValue.assign(1, *_M_current); | |
849cab7b | 261 | ++_M_current; |
d860c842 PC |
262 | while (_M_current != _M_end |
263 | && _M_ctype.is(_CtypeT::digit, *_M_current)) | |
264 | { | |
265 | _M_curValue += *_M_current; | |
266 | ++_M_current; | |
267 | } | |
268 | return; | |
849cab7b | 269 | } |
849cab7b | 270 | else if (*_M_current == _M_ctype.widen(',')) |
849cab7b | 271 | { |
d860c842 | 272 | _M_curToken = _S_token_comma; |
849cab7b SW |
273 | ++_M_current; |
274 | return; | |
275 | } | |
d860c842 PC |
276 | if (_M_flags & (regex_constants::basic | regex_constants::grep)) |
277 | { | |
278 | if (*_M_current == _M_ctype.widen('\\')) | |
279 | _M_eat_escape(); | |
280 | } | |
281 | else | |
282 | { | |
283 | if (*_M_current == _M_ctype.widen('}')) | |
284 | { | |
285 | _M_curToken = _S_token_interval_end; | |
286 | _M_state &= ~_S_state_in_brace; | |
287 | ++_M_current; | |
288 | return; | |
289 | } | |
290 | } | |
849cab7b SW |
291 | } |
292 | ||
293 | template<typename _InputIterator> | |
294 | void | |
295 | _Scanner<_InputIterator>:: | |
296 | _M_scan_in_bracket() | |
297 | { | |
298 | if (_M_state & _S_state_at_start && *_M_current == _M_ctype.widen('^')) | |
849cab7b | 299 | { |
d860c842 PC |
300 | _M_curToken = _S_token_inverse_class; |
301 | _M_state &= ~_S_state_at_start; | |
302 | ++_M_current; | |
849cab7b SW |
303 | return; |
304 | } | |
d860c842 | 305 | else if (*_M_current == _M_ctype.widen('[')) |
849cab7b | 306 | { |
d860c842 PC |
307 | ++_M_current; |
308 | if (_M_current == _M_end) | |
309 | { | |
310 | _M_curToken = _S_token_eof; | |
311 | return; | |
312 | } | |
313 | ||
314 | if (*_M_current == _M_ctype.widen('.')) | |
315 | { | |
316 | _M_curToken = _S_token_collsymbol; | |
317 | _M_eat_collsymbol(); | |
318 | return; | |
319 | } | |
320 | else if (*_M_current == _M_ctype.widen(':')) | |
321 | { | |
322 | _M_curToken = _S_token_char_class_name; | |
323 | _M_eat_charclass(); | |
324 | return; | |
325 | } | |
326 | else if (*_M_current == _M_ctype.widen('=')) | |
327 | { | |
328 | _M_curToken = _S_token_equiv_class_name; | |
329 | _M_eat_equivclass(); | |
330 | return; | |
331 | } | |
849cab7b | 332 | } |
d860c842 | 333 | else if (*_M_current == _M_ctype.widen('-')) |
849cab7b | 334 | { |
d860c842 PC |
335 | _M_curToken = _S_token_dash; |
336 | ++_M_current; | |
849cab7b SW |
337 | return; |
338 | } | |
849cab7b | 339 | else if (*_M_current == _M_ctype.widen(']')) |
849cab7b | 340 | { |
d860c842 PC |
341 | if (!(_M_flags & regex_constants::ECMAScript) |
342 | || !(_M_state & _S_state_at_start)) | |
343 | { | |
344 | // special case: only if _not_ chr first after | |
345 | // '[' or '[^' and if not ECMAscript | |
346 | _M_curToken = _S_token_bracket_end; | |
347 | ++_M_current; | |
348 | return; | |
349 | } | |
849cab7b | 350 | } |
849cab7b SW |
351 | _M_curToken = _S_token_collelem_single; |
352 | _M_curValue.assign(1, *_M_current); | |
353 | ++_M_current; | |
354 | } | |
355 | ||
356 | template<typename _InputIterator> | |
357 | void | |
358 | _Scanner<_InputIterator>:: | |
359 | _M_eat_escape() | |
360 | { | |
361 | ++_M_current; | |
362 | if (_M_current == _M_end) | |
d860c842 PC |
363 | { |
364 | _M_curToken = _S_token_eof; | |
365 | return; | |
366 | } | |
849cab7b SW |
367 | _CharT __c = *_M_current; |
368 | ++_M_current; | |
369 | ||
370 | if (__c == _M_ctype.widen('(')) | |
849cab7b | 371 | { |
d860c842 PC |
372 | if (!(_M_flags & (regex_constants::basic | regex_constants::grep))) |
373 | { | |
374 | _M_curToken = _S_token_ord_char; | |
375 | _M_curValue.assign(1, __c); | |
376 | } | |
377 | else | |
378 | _M_curToken = _S_token_subexpr_begin; | |
849cab7b | 379 | } |
849cab7b | 380 | else if (__c == _M_ctype.widen(')')) |
849cab7b | 381 | { |
d860c842 PC |
382 | if (!(_M_flags & (regex_constants::basic | regex_constants::grep))) |
383 | { | |
384 | _M_curToken = _S_token_ord_char; | |
385 | _M_curValue.assign(1, __c); | |
386 | } | |
387 | else | |
388 | _M_curToken = _S_token_subexpr_end; | |
849cab7b | 389 | } |
849cab7b | 390 | else if (__c == _M_ctype.widen('{')) |
849cab7b | 391 | { |
d860c842 PC |
392 | if (!(_M_flags & (regex_constants::basic | regex_constants::grep))) |
393 | { | |
394 | _M_curToken = _S_token_ord_char; | |
395 | _M_curValue.assign(1, __c); | |
396 | } | |
397 | else | |
398 | { | |
399 | _M_curToken = _S_token_interval_begin; | |
400 | _M_state |= _S_state_in_brace; | |
401 | } | |
849cab7b | 402 | } |
849cab7b | 403 | else if (__c == _M_ctype.widen('}')) |
849cab7b | 404 | { |
d860c842 PC |
405 | if (!(_M_flags & (regex_constants::basic | regex_constants::grep))) |
406 | { | |
407 | _M_curToken = _S_token_ord_char; | |
408 | _M_curValue.assign(1, __c); | |
409 | } | |
410 | else | |
411 | { | |
412 | if (!(_M_state && _S_state_in_brace)) | |
413 | __throw_regex_error(regex_constants::error_badbrace); | |
414 | _M_state &= ~_S_state_in_brace; | |
415 | _M_curToken = _S_token_interval_end; | |
416 | } | |
849cab7b | 417 | } |
849cab7b | 418 | else if (__c == _M_ctype.widen('x')) |
849cab7b | 419 | { |
849cab7b SW |
420 | ++_M_current; |
421 | if (_M_current == _M_end) | |
d860c842 PC |
422 | { |
423 | _M_curToken = _S_token_eof; | |
424 | return; | |
425 | } | |
849cab7b | 426 | if (_M_ctype.is(_CtypeT::digit, *_M_current)) |
d860c842 PC |
427 | { |
428 | _M_curValue.assign(1, *_M_current); | |
429 | ++_M_current; | |
430 | if (_M_current == _M_end) | |
431 | { | |
432 | _M_curToken = _S_token_eof; | |
433 | return; | |
434 | } | |
435 | if (_M_ctype.is(_CtypeT::digit, *_M_current)) | |
436 | { | |
437 | _M_curValue += *_M_current; | |
438 | ++_M_current; | |
439 | return; | |
440 | } | |
441 | } | |
849cab7b | 442 | } |
849cab7b | 443 | else if (__c == _M_ctype.widen('^') |
d860c842 PC |
444 | || __c == _M_ctype.widen('.') |
445 | || __c == _M_ctype.widen('*') | |
446 | || __c == _M_ctype.widen('$') | |
447 | || __c == _M_ctype.widen('\\')) | |
448 | { | |
449 | _M_curToken = _S_token_ord_char; | |
450 | _M_curValue.assign(1, __c); | |
451 | } | |
849cab7b | 452 | else if (_M_ctype.is(_CtypeT::digit, __c)) |
d860c842 PC |
453 | { |
454 | _M_curToken = _S_token_backref; | |
455 | _M_curValue.assign(1, __c); | |
456 | } | |
849cab7b | 457 | else |
849cab7b | 458 | __throw_regex_error(regex_constants::error_escape); |
849cab7b SW |
459 | } |
460 | ||
461 | ||
462 | // Eats a character class or throwns an exception. | |
463 | // current point to ':' delimiter on entry, char after ']' on return | |
464 | template<typename _InputIterator> | |
465 | void | |
466 | _Scanner<_InputIterator>:: | |
467 | _M_eat_charclass() | |
468 | { | |
469 | ++_M_current; // skip ':' | |
470 | if (_M_current == _M_end) | |
471 | __throw_regex_error(regex_constants::error_ctype); | |
472 | for (_M_curValue.clear(); | |
473 | _M_current != _M_end && *_M_current != _M_ctype.widen(':'); | |
474 | ++_M_current) | |
475 | _M_curValue += *_M_current; | |
476 | if (_M_current == _M_end) | |
477 | __throw_regex_error(regex_constants::error_ctype); | |
478 | ++_M_current; // skip ':' | |
479 | if (*_M_current != _M_ctype.widen(']')) | |
480 | __throw_regex_error(regex_constants::error_ctype); | |
481 | ++_M_current; // skip ']' | |
482 | } | |
483 | ||
484 | ||
485 | template<typename _InputIterator> | |
486 | void | |
487 | _Scanner<_InputIterator>:: | |
488 | _M_eat_equivclass() | |
489 | { | |
490 | ++_M_current; // skip '=' | |
491 | if (_M_current == _M_end) | |
492 | __throw_regex_error(regex_constants::error_collate); | |
493 | for (_M_curValue.clear(); | |
494 | _M_current != _M_end && *_M_current != _M_ctype.widen('='); | |
495 | ++_M_current) | |
496 | _M_curValue += *_M_current; | |
497 | if (_M_current == _M_end) | |
498 | __throw_regex_error(regex_constants::error_collate); | |
499 | ++_M_current; // skip '=' | |
500 | if (*_M_current != _M_ctype.widen(']')) | |
501 | __throw_regex_error(regex_constants::error_collate); | |
502 | ++_M_current; // skip ']' | |
503 | } | |
504 | ||
505 | ||
506 | template<typename _InputIterator> | |
507 | void | |
508 | _Scanner<_InputIterator>:: | |
509 | _M_eat_collsymbol() | |
510 | { | |
511 | ++_M_current; // skip '.' | |
512 | if (_M_current == _M_end) | |
513 | __throw_regex_error(regex_constants::error_collate); | |
514 | for (_M_curValue.clear(); | |
515 | _M_current != _M_end && *_M_current != _M_ctype.widen('.'); | |
516 | ++_M_current) | |
517 | _M_curValue += *_M_current; | |
518 | if (_M_current == _M_end) | |
519 | __throw_regex_error(regex_constants::error_collate); | |
520 | ++_M_current; // skip '.' | |
521 | if (*_M_current != _M_ctype.widen(']')) | |
522 | __throw_regex_error(regex_constants::error_collate); | |
523 | ++_M_current; // skip ']' | |
524 | } | |
525 | ||
526 | #ifdef _GLIBCXX_DEBUG | |
527 | template<typename _InputIterator> | |
528 | std::ostream& | |
529 | _Scanner<_InputIterator>:: | |
530 | _M_print(std::ostream& ostr) | |
531 | { | |
532 | switch (_M_curToken) | |
533 | { | |
534 | case _S_token_anychar: | |
d860c842 PC |
535 | ostr << "any-character\n"; |
536 | break; | |
849cab7b | 537 | case _S_token_backref: |
d860c842 PC |
538 | ostr << "backref\n"; |
539 | break; | |
849cab7b | 540 | case _S_token_bracket_begin: |
d860c842 PC |
541 | ostr << "bracket-begin\n"; |
542 | break; | |
849cab7b | 543 | case _S_token_bracket_end: |
d860c842 PC |
544 | ostr << "bracket-end\n"; |
545 | break; | |
849cab7b | 546 | case _S_token_char_class_name: |
d860c842 PC |
547 | ostr << "char-class-name \"" << _M_curValue << "\"\n"; |
548 | break; | |
849cab7b | 549 | case _S_token_closure0: |
d860c842 PC |
550 | ostr << "closure0\n"; |
551 | break; | |
849cab7b | 552 | case _S_token_closure1: |
d860c842 PC |
553 | ostr << "closure1\n"; |
554 | break; | |
849cab7b | 555 | case _S_token_collelem_multi: |
d860c842 PC |
556 | ostr << "coll-elem-multi \"" << _M_curValue << "\"\n"; |
557 | break; | |
849cab7b | 558 | case _S_token_collelem_single: |
d860c842 PC |
559 | ostr << "coll-elem-single \"" << _M_curValue << "\"\n"; |
560 | break; | |
849cab7b | 561 | case _S_token_collsymbol: |
d860c842 PC |
562 | ostr << "collsymbol \"" << _M_curValue << "\"\n"; |
563 | break; | |
849cab7b | 564 | case _S_token_comma: |
d860c842 PC |
565 | ostr << "comma\n"; |
566 | break; | |
849cab7b | 567 | case _S_token_dash: |
d860c842 PC |
568 | ostr << "dash\n"; |
569 | break; | |
849cab7b | 570 | case _S_token_dup_count: |
d860c842 PC |
571 | ostr << "dup count: " << _M_curValue << "\n"; |
572 | break; | |
849cab7b | 573 | case _S_token_eof: |
d860c842 PC |
574 | ostr << "EOF\n"; |
575 | break; | |
849cab7b | 576 | case _S_token_equiv_class_name: |
d860c842 PC |
577 | ostr << "equiv-class-name \"" << _M_curValue << "\"\n"; |
578 | break; | |
849cab7b | 579 | case _S_token_interval_begin: |
d860c842 PC |
580 | ostr << "interval begin\n"; |
581 | break; | |
849cab7b | 582 | case _S_token_interval_end: |
d860c842 PC |
583 | ostr << "interval end\n"; |
584 | break; | |
849cab7b | 585 | case _S_token_line_begin: |
d860c842 PC |
586 | ostr << "line begin\n"; |
587 | break; | |
849cab7b | 588 | case _S_token_line_end: |
d860c842 PC |
589 | ostr << "line end\n"; |
590 | break; | |
849cab7b | 591 | case _S_token_opt: |
d860c842 PC |
592 | ostr << "opt\n"; |
593 | break; | |
849cab7b | 594 | case _S_token_or: |
d860c842 PC |
595 | ostr << "or\n"; |
596 | break; | |
849cab7b | 597 | case _S_token_ord_char: |
d860c842 PC |
598 | ostr << "ordinary character: \"" << _M_value() << "\"\n"; |
599 | break; | |
849cab7b | 600 | case _S_token_quoted_char: |
d860c842 PC |
601 | ostr << "quoted char\n"; |
602 | break; | |
849cab7b | 603 | case _S_token_subexpr_begin: |
d860c842 PC |
604 | ostr << "subexpr begin\n"; |
605 | break; | |
849cab7b | 606 | case _S_token_subexpr_end: |
d860c842 PC |
607 | ostr << "subexpr end\n"; |
608 | break; | |
849cab7b | 609 | case _S_token_word_begin: |
d860c842 PC |
610 | ostr << "word begin\n"; |
611 | break; | |
849cab7b | 612 | case _S_token_word_end: |
d860c842 PC |
613 | ostr << "word end\n"; |
614 | break; | |
849cab7b | 615 | case _S_token_unknown: |
d860c842 PC |
616 | ostr << "-- unknown token --\n"; |
617 | break; | |
849cab7b SW |
618 | } |
619 | return ostr; | |
620 | } | |
621 | #endif | |
622 | ||
623 | // Builds an NFA from an input iterator interval. | |
624 | template<typename _InIter, typename _TraitsT> | |
625 | class _Compiler | |
626 | { | |
627 | public: | |
628 | typedef _InIter _IterT; | |
629 | typedef typename std::iterator_traits<_InIter>::value_type _CharT; | |
630 | typedef std::basic_string<_CharT> _StringT; | |
631 | typedef regex_constants::syntax_option_type _FlagT; | |
632 | ||
633 | public: | |
634 | _Compiler(const _InIter& __b, const _InIter& __e, | |
635 | _TraitsT& __traits, _FlagT __flags); | |
636 | ||
637 | const _Nfa& | |
638 | _M_nfa() const | |
639 | { return _M_state_store; } | |
640 | ||
641 | private: | |
642 | typedef _Scanner<_InIter> _ScannerT; | |
643 | typedef typename _ScannerT::_TokenT _TokenT; | |
644 | typedef std::stack<_StateSeq, std::vector<_StateSeq> > _StackT; | |
645 | typedef _RangeMatcher<_InIter, _TraitsT> _RMatcherT; | |
646 | ||
647 | // accepts a specific token or returns false. | |
648 | bool | |
d860c842 | 649 | _M_match_token(_TokenT __token); |
849cab7b SW |
650 | |
651 | void | |
652 | _M_disjunction(); | |
653 | ||
654 | bool | |
655 | _M_alternative(); | |
656 | ||
657 | bool | |
658 | _M_term(); | |
659 | ||
660 | bool | |
661 | _M_assertion(); | |
662 | ||
663 | bool | |
664 | _M_quantifier(); | |
665 | ||
666 | bool | |
667 | _M_atom(); | |
668 | ||
669 | bool | |
670 | _M_bracket_expression(); | |
671 | ||
672 | bool | |
673 | _M_bracket_list(_RMatcherT& __matcher); | |
674 | ||
675 | bool | |
676 | _M_follow_list(_RMatcherT& __matcher); | |
677 | ||
678 | bool | |
679 | _M_follow_list2(_RMatcherT& __matcher); | |
680 | ||
681 | bool | |
682 | _M_expression_term(_RMatcherT& __matcher); | |
683 | ||
684 | bool | |
685 | _M_range_expression(_RMatcherT& __matcher); | |
686 | ||
687 | bool | |
688 | _M_start_range(_RMatcherT& __matcher); | |
689 | ||
690 | bool | |
691 | _M_collating_symbol(_RMatcherT& __matcher); | |
692 | ||
693 | bool | |
694 | _M_equivalence_class(_RMatcherT& __matcher); | |
695 | ||
696 | bool | |
697 | _M_character_class(_RMatcherT& __matcher); | |
698 | ||
699 | int | |
700 | _M_cur_int_value(int __radix); | |
701 | ||
702 | private: | |
703 | _TraitsT& _M_traits; | |
704 | _ScannerT _M_scanner; | |
705 | _StringT _M_cur_value; | |
706 | _Nfa _M_state_store; | |
707 | _StackT _M_stack; | |
708 | }; | |
709 | ||
710 | template<typename _InIter, typename _TraitsT> | |
711 | _Compiler<_InIter, _TraitsT>:: | |
712 | _Compiler(const _InIter& __b, const _InIter& __e, _TraitsT& __traits, | |
713 | _Compiler<_InIter, _TraitsT>::_FlagT __flags) | |
714 | : _M_traits(__traits), _M_scanner(__b, __e, __flags, _M_traits.getloc()), | |
715 | _M_state_store(__flags) | |
716 | { | |
717 | using std::bind; | |
718 | using std::placeholders::_1; | |
719 | using std::placeholders::_2; | |
720 | typedef _StartTagger<_InIter, _TraitsT> _Start; | |
721 | typedef _EndTagger<_InIter, _TraitsT> _End; | |
722 | ||
723 | _StateSeq __r(_M_state_store, | |
724 | _M_state_store._M_insert_subexpr_begin( | |
725 | bind(_Start(0), _1, _2))); | |
726 | _M_disjunction(); | |
727 | if (!_M_stack.empty()) | |
d860c842 PC |
728 | { |
729 | __r._M_append(_M_stack.top()); | |
730 | _M_stack.pop(); | |
731 | } | |
732 | __r._M_append(_M_state_store. | |
733 | _M_insert_subexpr_end(0, bind(_End(0), _1, _2))); | |
849cab7b SW |
734 | __r._M_append(_M_state_store._M_insert_accept()); |
735 | } | |
736 | ||
737 | template<typename _InIter, typename _TraitsT> | |
738 | bool | |
739 | _Compiler<_InIter, _TraitsT>:: | |
740 | _M_match_token(_Compiler<_InIter, _TraitsT>::_TokenT token) | |
741 | { | |
742 | if (token == _M_scanner._M_token()) | |
d860c842 PC |
743 | { |
744 | _M_cur_value = _M_scanner._M_value(); | |
745 | _M_scanner._M_advance(); | |
746 | return true; | |
747 | } | |
849cab7b SW |
748 | return false; |
749 | } | |
750 | ||
751 | template<typename _InIter, typename _TraitsT> | |
752 | void | |
753 | _Compiler<_InIter, _TraitsT>:: | |
754 | _M_disjunction() | |
755 | { | |
756 | this->_M_alternative(); | |
757 | if (_M_match_token(_ScannerT::_S_token_or)) | |
d860c842 PC |
758 | { |
759 | _StateSeq __alt1 = _M_stack.top(); _M_stack.pop(); | |
760 | this->_M_disjunction(); | |
761 | _StateSeq __alt2 = _M_stack.top(); _M_stack.pop(); | |
762 | _M_stack.push(_StateSeq(__alt1, __alt2)); | |
763 | } | |
849cab7b SW |
764 | } |
765 | ||
766 | template<typename _InIter, typename _TraitsT> | |
767 | bool | |
768 | _Compiler<_InIter, _TraitsT>:: | |
769 | _M_alternative() | |
770 | { | |
771 | if (this->_M_term()) | |
849cab7b | 772 | { |
d860c842 PC |
773 | _StateSeq __re = _M_stack.top(); _M_stack.pop(); |
774 | this->_M_alternative(); | |
775 | if (!_M_stack.empty()) | |
776 | { | |
777 | __re._M_append(_M_stack.top()); | |
778 | _M_stack.pop(); | |
779 | } | |
780 | _M_stack.push(__re); | |
781 | return true; | |
849cab7b | 782 | } |
849cab7b SW |
783 | return false; |
784 | } | |
785 | ||
786 | template<typename _InIter, typename _TraitsT> | |
787 | bool | |
788 | _Compiler<_InIter, _TraitsT>:: | |
789 | _M_term() | |
790 | { | |
791 | if (this->_M_assertion()) | |
792 | return true; | |
793 | if (this->_M_atom()) | |
d860c842 PC |
794 | { |
795 | this->_M_quantifier(); | |
796 | return true; | |
797 | } | |
849cab7b SW |
798 | return false; |
799 | } | |
800 | ||
801 | template<typename _InIter, typename _TraitsT> | |
802 | bool | |
803 | _Compiler<_InIter, _TraitsT>:: | |
804 | _M_assertion() | |
805 | { | |
806 | if (_M_match_token(_ScannerT::_S_token_line_begin)) | |
d860c842 PC |
807 | { |
808 | // __m.push(_Matcher::_S_opcode_line_begin); | |
809 | return true; | |
810 | } | |
849cab7b | 811 | if (_M_match_token(_ScannerT::_S_token_line_end)) |
d860c842 PC |
812 | { |
813 | // __m.push(_Matcher::_S_opcode_line_end); | |
814 | return true; | |
815 | } | |
849cab7b | 816 | if (_M_match_token(_ScannerT::_S_token_word_begin)) |
d860c842 PC |
817 | { |
818 | // __m.push(_Matcher::_S_opcode_word_begin); | |
819 | return true; | |
820 | } | |
849cab7b | 821 | if (_M_match_token(_ScannerT::_S_token_word_end)) |
d860c842 PC |
822 | { |
823 | // __m.push(_Matcher::_S_opcode_word_end); | |
824 | return true; | |
825 | } | |
849cab7b SW |
826 | return false; |
827 | } | |
828 | ||
829 | template<typename _InIter, typename _TraitsT> | |
830 | bool | |
831 | _Compiler<_InIter, _TraitsT>:: | |
832 | _M_quantifier() | |
833 | { | |
834 | if (_M_match_token(_ScannerT::_S_token_closure0)) | |
d860c842 PC |
835 | { |
836 | if (_M_stack.empty()) | |
837 | __throw_regex_error(regex_constants::error_badrepeat); | |
838 | _StateSeq __r(_M_stack.top(), -1); | |
839 | __r._M_append(__r._M_front()); | |
840 | _M_stack.pop(); | |
841 | _M_stack.push(__r); | |
842 | return true; | |
843 | } | |
849cab7b | 844 | if (_M_match_token(_ScannerT::_S_token_closure1)) |
d860c842 PC |
845 | { |
846 | if (_M_stack.empty()) | |
847 | __throw_regex_error(regex_constants::error_badrepeat); | |
848 | _StateSeq __r(_M_state_store, | |
849 | _M_state_store. | |
850 | _M_insert_alt(_S_invalid_state_id, | |
851 | _M_stack.top()._M_front())); | |
852 | _M_stack.top()._M_append(__r); | |
853 | return true; | |
854 | } | |
849cab7b | 855 | if (_M_match_token(_ScannerT::_S_token_opt)) |
d860c842 PC |
856 | { |
857 | if (_M_stack.empty()) | |
849cab7b | 858 | __throw_regex_error(regex_constants::error_badrepeat); |
d860c842 PC |
859 | _StateSeq __r(_M_stack.top(), -1); |
860 | _M_stack.pop(); | |
861 | _M_stack.push(__r); | |
862 | return true; | |
863 | } | |
849cab7b | 864 | if (_M_match_token(_ScannerT::_S_token_interval_begin)) |
d860c842 PC |
865 | { |
866 | if (_M_stack.empty()) | |
867 | __throw_regex_error(regex_constants::error_badrepeat); | |
868 | if (!_M_match_token(_ScannerT::_S_token_dup_count)) | |
869 | __throw_regex_error(regex_constants::error_badbrace); | |
870 | _StateSeq __r(_M_stack.top()); | |
871 | int __min_rep = _M_cur_int_value(10); | |
872 | for (int __i = 1; __i < __min_rep; ++__i) | |
873 | _M_stack.top()._M_append(__r._M_clone()); | |
874 | if (_M_match_token(_ScannerT::_S_token_comma)) | |
875 | if (_M_match_token(_ScannerT::_S_token_dup_count)) | |
876 | { | |
877 | int __n = _M_cur_int_value(10) - __min_rep; | |
878 | if (__n < 0) | |
879 | __throw_regex_error(regex_constants::error_badbrace); | |
880 | for (int __i = 0; __i < __n; ++__i) | |
881 | { | |
882 | _StateSeq __r(_M_state_store, | |
883 | _M_state_store. | |
884 | _M_insert_alt(_S_invalid_state_id, | |
885 | _M_stack.top()._M_front())); | |
886 | _M_stack.top()._M_append(__r); | |
887 | } | |
888 | } | |
889 | else | |
890 | { | |
891 | _StateSeq __r(_M_stack.top(), -1); | |
892 | __r._M_push_back(__r._M_front()); | |
893 | _M_stack.pop(); | |
894 | _M_stack.push(__r); | |
895 | } | |
896 | if (!_M_match_token(_ScannerT::_S_token_interval_end)) | |
897 | __throw_regex_error(regex_constants::error_brace); | |
898 | return true; | |
899 | } | |
849cab7b SW |
900 | return false; |
901 | } | |
902 | ||
903 | template<typename _InIter, typename _TraitsT> | |
904 | bool | |
905 | _Compiler<_InIter, _TraitsT>:: | |
906 | _M_atom() | |
907 | { | |
908 | using std::bind; | |
909 | using std::placeholders::_1; | |
910 | using std::placeholders::_2; | |
911 | typedef _CharMatcher<_InIter, _TraitsT> _CMatcher; | |
912 | typedef _StartTagger<_InIter, _TraitsT> _Start; | |
913 | typedef _EndTagger<_InIter, _TraitsT> _End; | |
914 | ||
915 | if (_M_match_token(_ScannerT::_S_token_anychar)) | |
d860c842 PC |
916 | { |
917 | _M_stack.push(_StateSeq(_M_state_store, | |
918 | _M_state_store. | |
919 | _M_insert_matcher(bind(_AnyMatcher, _1)))); | |
920 | return true; | |
921 | } | |
849cab7b | 922 | if (_M_match_token(_ScannerT::_S_token_ord_char)) |
d860c842 PC |
923 | { |
924 | _M_stack.push(_StateSeq | |
925 | (_M_state_store, _M_state_store. | |
926 | _M_insert_matcher | |
927 | (bind(_CMatcher(_M_cur_value[0], _M_traits), _1)))); | |
928 | return true; | |
929 | } | |
849cab7b | 930 | if (_M_match_token(_ScannerT::_S_token_quoted_char)) |
d860c842 PC |
931 | { |
932 | // note that in the ECMA grammar, this case covers backrefs. | |
933 | _M_stack.push(_StateSeq(_M_state_store, | |
934 | _M_state_store. | |
935 | _M_insert_matcher | |
936 | (bind(_CMatcher(_M_cur_value[0], _M_traits), | |
937 | _1)))); | |
938 | return true; | |
939 | } | |
849cab7b | 940 | if (_M_match_token(_ScannerT::_S_token_backref)) |
d860c842 PC |
941 | { |
942 | // __m.push(_Matcher::_S_opcode_ordchar, _M_cur_value); | |
943 | return true; | |
944 | } | |
849cab7b | 945 | if (_M_match_token(_ScannerT::_S_token_subexpr_begin)) |
849cab7b | 946 | { |
d860c842 PC |
947 | int __mark = _M_state_store._M_sub_count(); |
948 | _StateSeq __r(_M_state_store, | |
949 | _M_state_store. | |
950 | _M_insert_subexpr_begin(bind(_Start(__mark), _1, _2))); | |
951 | this->_M_disjunction(); | |
952 | if (!_M_match_token(_ScannerT::_S_token_subexpr_end)) | |
953 | __throw_regex_error(regex_constants::error_paren); | |
954 | if (!_M_stack.empty()) | |
955 | { | |
956 | __r._M_append(_M_stack.top()); | |
957 | _M_stack.pop(); | |
958 | } | |
959 | __r._M_append(_M_state_store._M_insert_subexpr_end | |
960 | (__mark, bind(_End(__mark), _1, _2))); | |
961 | _M_stack.push(__r); | |
962 | return true; | |
849cab7b | 963 | } |
849cab7b SW |
964 | return _M_bracket_expression(); |
965 | } | |
966 | ||
967 | template<typename _InIter, typename _TraitsT> | |
968 | bool | |
969 | _Compiler<_InIter, _TraitsT>:: | |
970 | _M_bracket_expression() | |
971 | { | |
972 | using std::bind; | |
973 | using std::placeholders::_1; | |
974 | if (_M_match_token(_ScannerT::_S_token_bracket_begin)) | |
d860c842 PC |
975 | { |
976 | _RMatcherT __matcher(_M_match_token(_ScannerT::_S_token_line_begin), | |
977 | _M_traits); | |
978 | if (!_M_bracket_list(__matcher) | |
979 | || !_M_match_token(_ScannerT::_S_token_bracket_end)) | |
980 | __throw_regex_error(regex_constants::error_brack); | |
981 | _M_stack.push(_StateSeq(_M_state_store, | |
982 | _M_state_store._M_insert_matcher | |
983 | (bind(__matcher, _1)))); | |
984 | return true; | |
985 | } | |
849cab7b SW |
986 | return false; |
987 | } | |
988 | ||
989 | // If the dash is the last character in the bracket expression, it is not | |
990 | // special. | |
991 | template<typename _InIter, typename _TraitsT> | |
992 | bool | |
993 | _Compiler<_InIter, _TraitsT>:: | |
994 | _M_bracket_list(_RMatcherT& __matcher) | |
995 | { | |
996 | if (_M_follow_list(__matcher)) | |
d860c842 PC |
997 | { |
998 | if (_M_match_token(_ScannerT::_S_token_dash)) | |
999 | __matcher._M_add_char(_M_cur_value[0]); | |
1000 | return true; | |
1001 | } | |
849cab7b SW |
1002 | return false; |
1003 | } | |
1004 | ||
1005 | template<typename _InIter, typename _TraitsT> | |
1006 | bool | |
1007 | _Compiler<_InIter, _TraitsT>:: | |
1008 | _M_follow_list(_RMatcherT& __matcher) | |
1009 | { return _M_expression_term(__matcher) && _M_follow_list2(__matcher); } | |
1010 | ||
1011 | template<typename _InIter, typename _TraitsT> | |
1012 | bool | |
1013 | _Compiler<_InIter, _TraitsT>:: | |
1014 | _M_follow_list2(_RMatcherT& __matcher) | |
1015 | { | |
1016 | if (_M_expression_term(__matcher)) | |
1017 | return _M_follow_list2(__matcher); | |
1018 | return true; | |
1019 | } | |
1020 | ||
1021 | template<typename _InIter, typename _TraitsT> | |
1022 | bool | |
1023 | _Compiler<_InIter, _TraitsT>:: | |
1024 | _M_expression_term(_RMatcherT& __matcher) | |
1025 | { | |
d860c842 PC |
1026 | return (_M_collating_symbol(__matcher) |
1027 | || _M_character_class(__matcher) | |
1028 | || _M_equivalence_class(__matcher) | |
1029 | || (_M_start_range(__matcher) | |
1030 | && _M_range_expression(__matcher))); | |
849cab7b SW |
1031 | } |
1032 | ||
1033 | template<typename _InIter, typename _TraitsT> | |
1034 | bool | |
1035 | _Compiler<_InIter, _TraitsT>:: | |
1036 | _M_range_expression(_RMatcherT& __matcher) | |
1037 | { | |
1038 | if (!_M_collating_symbol(__matcher)) | |
1039 | if (!_M_match_token(_ScannerT::_S_token_dash)) | |
1040 | __throw_regex_error(regex_constants::error_range); | |
1041 | __matcher._M_make_range(); | |
1042 | return true; | |
1043 | } | |
1044 | ||
1045 | template<typename _InIter, typename _TraitsT> | |
1046 | bool | |
1047 | _Compiler<_InIter, _TraitsT>:: | |
1048 | _M_start_range(_RMatcherT& __matcher) | |
1049 | { return _M_match_token(_ScannerT::_S_token_dash); } | |
1050 | ||
1051 | template<typename _InIter, typename _TraitsT> | |
1052 | bool | |
1053 | _Compiler<_InIter, _TraitsT>:: | |
1054 | _M_collating_symbol(_RMatcherT& __matcher) | |
1055 | { | |
1056 | if (_M_match_token(_ScannerT::_S_token_collelem_single)) | |
d860c842 PC |
1057 | { |
1058 | __matcher._M_add_char(_M_cur_value[0]); | |
1059 | return true; | |
1060 | } | |
849cab7b | 1061 | if (_M_match_token(_ScannerT::_S_token_collsymbol)) |
d860c842 PC |
1062 | { |
1063 | __matcher._M_add_collating_element(_M_cur_value); | |
1064 | return true; | |
1065 | } | |
849cab7b SW |
1066 | return false; |
1067 | } | |
1068 | ||
1069 | template<typename _InIter, typename _TraitsT> | |
1070 | bool | |
1071 | _Compiler<_InIter, _TraitsT>:: | |
1072 | _M_equivalence_class(_RMatcherT& __matcher) | |
1073 | { | |
1074 | if (_M_match_token(_ScannerT::_S_token_equiv_class_name)) | |
d860c842 PC |
1075 | { |
1076 | __matcher._M_add_equivalence_class(_M_cur_value); | |
1077 | return true; | |
1078 | } | |
849cab7b SW |
1079 | return false; |
1080 | } | |
1081 | ||
1082 | template<typename _InIter, typename _TraitsT> | |
1083 | bool | |
1084 | _Compiler<_InIter, _TraitsT>:: | |
1085 | _M_character_class(_RMatcherT& __matcher) | |
1086 | { | |
1087 | if (_M_match_token(_ScannerT::_S_token_char_class_name)) | |
d860c842 PC |
1088 | { |
1089 | __matcher._M_add_character_class(_M_cur_value); | |
1090 | return true; | |
1091 | } | |
849cab7b SW |
1092 | return false; |
1093 | } | |
1094 | ||
1095 | template<typename _InIter, typename _TraitsT> | |
1096 | int | |
1097 | _Compiler<_InIter, _TraitsT>:: | |
1098 | _M_cur_int_value(int __radix) | |
1099 | { | |
1100 | int __v = 0; | |
1101 | for (typename _StringT::size_type __i = 0; | |
1102 | __i < _M_cur_value.length(); ++__i) | |
1103 | __v =__v * __radix + _M_traits.value(_M_cur_value[__i], __radix); | |
1104 | return __v; | |
1105 | } | |
1106 | ||
1107 | template<typename _InIter, typename _TraitsT> | |
1108 | _AutomatonPtr | |
1109 | __compile(const _InIter& __b, const _InIter& __e, _TraitsT& __t, | |
1110 | regex_constants::syntax_option_type __f) | |
1111 | { return _AutomatonPtr(new _Nfa(_Compiler<_InIter, _TraitsT>(__b, __e, __t, | |
1112 | __f)._M_nfa())); } | |
1113 | ||
1114 | } // namespace __regex | |
53dc5044 | 1115 | |
12ffa228 BK |
1116 | _GLIBCXX_END_NAMESPACE_VERSION |
1117 | } // namespace | |
849cab7b SW |
1118 | |
1119 | /* vim: set ts=8 sw=2 sts=2: */ |