libstdc++-v3/include/bits/regex_scanner.tcc

   1 // class template regex -*- C++ -*-
   2
   3 // Copyright (C) 2013 Free Software Foundation, Inc.
   4 //
   5 // This file is part of the GNU ISO C++ Library.  This library is free
   6 // software; you can redistribute it and/or modify it under the
   7 // terms of the GNU General Public License as published by the
   8 // Free Software Foundation; either version 3, or (at your option)
   9 // any later version.
  10
  11 // This library is distributed in the hope that it will be useful,
  12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 // GNU General Public License for more details.
  15
  16 // Under Section 7 of GPL version 3, you are granted additional
  17 // permissions described in the GCC Runtime Library Exception, version
  18 // 3.1, as published by the Free Software Foundation.
  19
  20 // You should have received a copy of the GNU General Public License and
  21 // a copy of the GCC Runtime Library Exception along with this program;
  22 // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  23 // <http://www.gnu.org/licenses/>.
  24
  25 /**
  26  *  @file bits/regex_scanner.tcc
  27  *  This is an internal header file, included by other library headers.
  28  *  Do not attempt to use it directly. @headername{regex}
  29  */
  30
  31 // FIXME make comments doxygen format.
  32
  33 // N3376 specified 6 regex styles: ECMAScript, basic, extended, grep, egrep
  34 // and awk
  35 // 1) grep is basic except '\n' is treated as '|'
  36 // 2) egrep is extended except '\n' is treated as '|'
  37 // 3) awk is extended except special escaping rules, and there's no
  38 //    back-reference.
  39 //
  40 // References:
  41 //
  42 // ECMAScript: ECMA-262 15.10
  43 //
  44 // basic, extended:
  45 // http://pubs.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap09.html
  46 //
  47 // awk: http://pubs.opengroup.org/onlinepubs/000095399/utilities/awk.html
  48
  49 namespace std _GLIBCXX_VISIBILITY(default)
  50 {
  51 namespace __detail
  52 {
  53 _GLIBCXX_BEGIN_NAMESPACE_VERSION
  54
  55   template<typename _FwdIter>
  56     _Scanner<_FwdIter>::
  57     _Scanner(_FwdIter __begin, _FwdIter __end,
  58              _FlagT __flags, std::locale __loc)
  59     : _M_current(__begin) , _M_end(__end) , _M_flags(__flags),
  60       _M_ctype(std::use_facet<_CtypeT>(__loc)), _M_state(_S_state_normal),
  61       _M_at_bracket_start(false),
  62       _M_token_map
  63         {
  64           {'^', _S_token_line_begin},
  65           {'$', _S_token_line_end},
  66           {'.', _S_token_anychar},
  67           {'*', _S_token_closure0},
  68           {'+', _S_token_closure1},
  69           {'?', _S_token_opt},
  70           {'|', _S_token_or},
  71           // grep and egrep
  72           {'\n', _S_token_or},
  73         },
  74       _M_ecma_escape_map
  75         {
  76           {'0', '\0'},
  77           {'b', '\b'},
  78           {'f', '\f'},
  79           {'n', '\n'},
  80           {'r', '\r'},
  81           {'t', '\t'},
  82           {'v', '\v'},
  83         },
  84       _M_awk_escape_map
  85         {
  86           {'"', '"'},
  87           {'/', '/'},
  88           {'\\', '\\'},
  89           {'a', '\a'},
  90           {'b', '\b'},
  91           {'f', '\f'},
  92           {'n', '\n'},
  93           {'r', '\r'},
  94           {'t', '\t'},
  95           {'v', '\v'},
  96         },
  97       _M_escape_map(_M_is_ecma()
  98                     ? _M_ecma_escape_map
  99                     : _M_awk_escape_map),
 100       _M_ecma_spec_char
 101         {
 102           '^',
 103           '$',
 104           '\\',
 105           '.',
 106           '*',
 107           '+',
 108           '?',
 109           '(',
 110           ')',
 111           '[',
 112           ']',
 113           '{',
 114           '}',
 115           '|',
 116         },
 117       _M_basic_spec_char
 118         {
 119           '.',
 120           '[',
 121           '\\',
 122           '*',
 123           '^',
 124           '$',
 125         },
 126       _M_extended_spec_char
 127         {
 128           '.',
 129           '[',
 130           '\\',
 131           '(',
 132           ')',
 133           '*',
 134           '+',
 135           '?',
 136           '{',
 137           '|',
 138           '^',
 139           '$',
 140         },
 141       _M_eat_escape(_M_is_ecma()
 142                     ? &_Scanner::_M_eat_escape_ecma
 143                     : &_Scanner::_M_eat_escape_posix),
 144       _M_spec_char(_M_is_ecma()
 145                    ? _M_ecma_spec_char
 146                    : _M_is_basic()
 147                    ? _M_basic_spec_char
 148                    : _M_extended_spec_char)
 149     { _M_advance(); }
 150
 151   template<typename _FwdIter>
 152     void
 153     _Scanner<_FwdIter>::
 154     _M_advance()
 155     {
 156       if (_M_current == _M_end)
 157         {
 158           _M_token = _S_token_eof;
 159           return;
 160         }
 161
 162       if (_M_state == _S_state_normal)
 163         _M_scan_normal();
 164       else if (_M_state == _S_state_in_bracket)
 165         _M_scan_in_bracket();
 166       else if (_M_state == _S_state_in_brace)
 167         _M_scan_in_brace();
 168       else
 169         _GLIBCXX_DEBUG_ASSERT(false);
 170     }
 171
 172   // Differences between styles:
 173   // 1) "\(", "\)", "\{" in basic. It's not escaping.
 174   // 2) "(?:", "(?=", "(?!" in ECMAScript.
 175   template<typename _FwdIter>
 176     void
 177     _Scanner<_FwdIter>::
 178     _M_scan_normal()
 179     {
 180       auto __c = *_M_current++;
 181
 182       if (__c == '\\')
 183         {
 184           if (_M_current == _M_end)
 185             __throw_regex_error(regex_constants::error_escape);
 186
 187           if (!_M_is_basic()
 188               || (*_M_current != '('
 189                   && *_M_current != ')'
 190                   && *_M_current != '{'))
 191             {
 192               (this->*_M_eat_escape)();
 193               return;
 194             }
 195           __c = *_M_current++;
 196         }
 197       if (__c == '(')
 198         {
 199           if (_M_is_ecma() && *_M_current == '?')
 200             {
 201               if (++_M_current == _M_end)
 202                 __throw_regex_error(regex_constants::error_paren);
 203
 204               if (*_M_current == ':')
 205                 {
 206                   ++_M_current;
 207                   _M_token = _S_token_subexpr_no_group_begin;
 208                 }
 209               else if (*_M_current == '=')
 210                 {
 211                   ++_M_current;
 212                   _M_token = _S_token_subexpr_lookahead_begin;
 213                   _M_value.assign(1, 'p');
 214                 }
 215               else if (*_M_current == '!')
 216                 {
 217                   ++_M_current;
 218                   _M_token = _S_token_subexpr_lookahead_begin;
 219                   _M_value.assign(1, 'n');
 220                 }
 221               else
 222                 __throw_regex_error(regex_constants::error_paren);
 223             }
 224           else
 225             _M_token = _S_token_subexpr_begin;
 226         }
 227       else if (__c == ')')
 228         _M_token = _S_token_subexpr_end;
 229       else if (__c == '[')
 230         {
 231           _M_state = _S_state_in_bracket;
 232           _M_at_bracket_start = true;
 233           if (_M_current != _M_end && *_M_current == '^')
 234             {
 235               _M_token = _S_token_bracket_neg_begin;
 236               ++_M_current;
 237             }
 238           else
 239             _M_token = _S_token_bracket_begin;
 240         }
 241       else if (__c == '{')
 242         {
 243           _M_state = _S_state_in_brace;
 244           _M_token = _S_token_interval_begin;
 245         }
 246       else if (_M_spec_char.count(__c)
 247                && __c != ']'
 248                && __c != '}'
 249                || (_M_is_grep() && __c == '\n'))
 250         _M_token = _M_token_map.at(__c);
 251       else
 252         {
 253           _M_token = _S_token_ord_char;
 254           _M_value.assign(1, __c);
 255         }
 256     }
 257
 258   // Differences between styles:
 259   // 1) different semantics of "[]" and "[^]".
 260   // 2) Escaping in bracket expr.
 261   template<typename _FwdIter>
 262     void
 263     _Scanner<_FwdIter>::
 264     _M_scan_in_bracket()
 265     {
 266       if (_M_current == _M_end)
 267         __throw_regex_error(regex_constants::error_brack);
 268
 269       auto __c = *_M_current++;
 270
 271       if (__c == '[')
 272         {
 273           if (_M_current == _M_end)
 274             __throw_regex_error(regex_constants::error_brack);
 275
 276           if (*_M_current == '.')
 277             {
 278               _M_token = _S_token_collsymbol;
 279               _M_eat_class(*_M_current++);
 280             }
 281           else if (*_M_current == ':')
 282             {
 283               _M_token = _S_token_char_class_name;
 284               _M_eat_class(*_M_current++);
 285             }
 286           else if (*_M_current == '=')
 287             {
 288               _M_token = _S_token_equiv_class_name;
 289               _M_eat_class(*_M_current++);
 290             }
 291           else
 292             {
 293               _M_token = _S_token_ord_char;
 294               _M_value.assign(1, __c);
 295             }
 296         }
 297       // In POSIX, when encountering "[]" or "[^]", the ']' is interpreted
 298       // literally. So "[]]" or "[^]]" is valid regex. See the testcases
 299       // `*/empty_range.cc`.
 300       else if (__c == ']' && (_M_is_ecma() || !_M_at_bracket_start))
 301         {
 302           _M_token = _S_token_bracket_end;
 303           _M_state = _S_state_normal;
 304         }
 305       // ECMAScirpt and awk permmits escaping in bracket.
 306       else if (__c == '\\' && (_M_is_ecma() || _M_is_awk()))
 307         (this->*_M_eat_escape)();
 308       else
 309         {
 310           _M_token = _S_token_ord_char;
 311           _M_value.assign(1, __c);
 312         }
 313       _M_at_bracket_start = false;
 314     }
 315
 316   // Differences between styles:
 317   // 1) "\}" in basic style.
 318   template<typename _FwdIter>
 319     void
 320     _Scanner<_FwdIter>::
 321     _M_scan_in_brace()
 322     {
 323       if (_M_current == _M_end)
 324         __throw_regex_error(regex_constants::error_brace);
 325
 326       auto __c = *_M_current++;
 327
 328       if (_M_ctype.is(_CtypeT::digit, __c))
 329         {
 330           _M_token = _S_token_dup_count;
 331           _M_value.assign(1, __c);
 332           while (_M_current != _M_end
 333                  && _M_ctype.is(_CtypeT::digit, *_M_current))
 334             _M_value += *_M_current++;
 335         }
 336       else if (__c == ',')
 337         _M_token = _S_token_comma;
 338       // basic use \}.
 339       else if (_M_is_basic())
 340         {
 341           if (__c == '\\' && _M_current != _M_end && *_M_current == '}')
 342             {
 343               _M_state = _S_state_normal;
 344               _M_token = _S_token_interval_end;
 345               ++_M_current;
 346             }
 347           else
 348             __throw_regex_error(regex_constants::error_badbrace);
 349         }
 350       else if (__c == '}')
 351         {
 352           _M_state = _S_state_normal;
 353           _M_token = _S_token_interval_end;
 354         }
 355       else
 356         __throw_regex_error(regex_constants::error_badbrace);
 357     }
 358
 359   template<typename _FwdIter>
 360     void
 361     _Scanner<_FwdIter>::
 362     _M_eat_escape_ecma()
 363     {
 364       if (_M_current == _M_end)
 365         __throw_regex_error(regex_constants::error_escape);
 366
 367       auto __c = *_M_current++;
 368
 369       if (_M_escape_map.count(__c)
 370           && (__c != 'b' || _M_state == _S_state_in_bracket))
 371         {
 372           _M_token = _S_token_ord_char;
 373           _M_value.assign(1, _M_escape_map.at(__c));
 374         }
 375       else if (__c == 'b')
 376         {
 377           _M_token = _S_token_word_bound;
 378           _M_value.assign(1, 'p');
 379         }
 380       else if (__c == 'B')
 381         {
 382           _M_token = _S_token_word_bound;
 383           _M_value.assign(1, 'n');
 384         }
 385       // N3376 28.13
 386       else if (__c == 'd'
 387                || __c == 'D'
 388                || __c == 's'
 389                || __c == 'S'
 390                || __c == 'w'
 391                || __c == 'W')
 392         {
 393           _M_token = _S_token_quoted_class;
 394           _M_value.assign(1, __c);
 395         }
 396       else if (__c == 'c')
 397         {
 398           if (_M_current == _M_end)
 399             __throw_regex_error(regex_constants::error_escape);
 400           _M_token = _S_token_ord_char;
 401           _M_value.assign(1, *_M_current++);
 402         }
 403       else if (__c == 'x' || __c == 'u')
 404         {
 405           _M_value.erase();
 406           for (int i = 0; i < (__c == 'x' ? 2 : 4); i++)
 407             {
 408               if (_M_current == _M_end
 409                   || !_M_ctype.is(_CtypeT::xdigit, *_M_current))
 410                 __throw_regex_error(regex_constants::error_escape);
 411               _M_value += *_M_current++;
 412             }
 413           _M_token = _S_token_hex_num;
 414         }
 415       // ECMAScript recongnizes multi-digit back-references.
 416       else if (_M_ctype.is(_CtypeT::digit, __c))
 417         {
 418           _M_value.assign(1, __c);
 419           while (_M_current != _M_end
 420                  && _M_ctype.is(_CtypeT::digit, *_M_current))
 421             _M_value += *_M_current++;
 422           _M_token = _S_token_backref;
 423         }
 424       else
 425         {
 426           _M_token = _S_token_ord_char;
 427           _M_value.assign(1, __c);
 428         }
 429     }
 430
 431   // Differences between styles:
 432   // 1) Extended doesn't support backref, but basic does.
 433   template<typename _FwdIter>
 434     void
 435     _Scanner<_FwdIter>::
 436     _M_eat_escape_posix()
 437     {
 438       if (_M_current == _M_end)
 439         __throw_regex_error(regex_constants::error_escape);
 440
 441       auto __c = *_M_current;
 442
 443       if (_M_spec_char.count(__c))
 444         {
 445           _M_token = _S_token_ord_char;
 446           _M_value.assign(1, __c);
 447         }
 448       // We MUST judge awk before handling backrefs. There's no backref in awk.
 449       else if (_M_is_awk())
 450         {
 451           _M_eat_escape_awk();
 452           return;
 453         }
 454       else if (_M_is_basic() && _M_ctype.is(_CtypeT::digit, __c) && __c != '0')
 455         {
 456           _M_token = _S_token_backref;
 457           _M_value.assign(1, __c);
 458         }
 459       else
 460         __throw_regex_error(regex_constants::error_escape);
 461       ++_M_current;
 462     }
 463
 464   template<typename _FwdIter>
 465     void
 466     _Scanner<_FwdIter>::
 467     _M_eat_escape_awk()
 468     {
 469       auto __c = *_M_current++;
 470
 471       if (_M_escape_map.count(__c))
 472         {
 473           _M_token = _S_token_ord_char;
 474           _M_value.assign(1, _M_escape_map.at(__c));
 475         }
 476       // \ddd for oct representation
 477       else if (_M_ctype.is(_CtypeT::digit, __c)
 478                && __c != '8'
 479                && __c != '9')
 480         {
 481           _M_value.assign(1,  __c);
 482           for (int __i = 0;
 483                __i < 2
 484                && _M_current != _M_end
 485                && _M_ctype.is(_CtypeT::digit, *_M_current)
 486                && *_M_current != '8'
 487                && *_M_current != '9';
 488                __i++)
 489             _M_value += *_M_current++;
 490           _M_token = _S_token_oct_num;
 491           return;
 492         }
 493       else
 494         __throw_regex_error(regex_constants::error_escape);
 495     }
 496
 497   // Eats a character class or throwns an exception.
 498   // __ch cound be ':', '.' or '=', _M_current is the char after ']' when
 499   // returning.
 500   template<typename _FwdIter>
 501     void
 502     _Scanner<_FwdIter>::
 503     _M_eat_class(char __ch)
 504     {
 505       for (_M_value.clear(); _M_current != _M_end && *_M_current != __ch;)
 506         _M_value += *_M_current++;
 507       if (_M_current == _M_end
 508           || *_M_current++ != __ch
 509           || _M_current == _M_end // skip __ch
 510           || *_M_current++ != ']') // skip ']'
 511         if (__ch == ':')
 512           __throw_regex_error(regex_constants::error_ctype);
 513         else
 514           __throw_regex_error(regex_constants::error_collate);
 515     }
 516
 517 #ifdef _GLIBCXX_DEBUG
 518   template<typename _FwdIter>
 519     std::ostream&
 520     _Scanner<_FwdIter>::
 521     _M_print(std::ostream& ostr)
 522     {
 523       switch (_M_token)
 524       {
 525       case _S_token_anychar:
 526         ostr << "any-character\n";
 527         break;
 528       case _S_token_backref:
 529         ostr << "backref\n";
 530         break;
 531       case _S_token_bracket_begin:
 532         ostr << "bracket-begin\n";
 533         break;
 534       case _S_token_bracket_neg_begin:
 535         ostr << "bracket-neg-begin\n";
 536         break;
 537       case _S_token_bracket_end:
 538         ostr << "bracket-end\n";
 539         break;
 540       case _S_token_char_class_name:
 541         ostr << "char-class-name \"" << _M_value << "\"\n";
 542         break;
 543       case _S_token_closure0:
 544         ostr << "closure0\n";
 545         break;
 546       case _S_token_closure1:
 547         ostr << "closure1\n";
 548         break;
 549       case _S_token_collsymbol:
 550         ostr << "collsymbol \"" << _M_value << "\"\n";
 551         break;
 552       case _S_token_comma:
 553         ostr << "comma\n";
 554         break;
 555       case _S_token_dup_count:
 556         ostr << "dup count: " << _M_value << "\n";
 557         break;
 558       case _S_token_eof:
 559         ostr << "EOF\n";
 560         break;
 561       case _S_token_equiv_class_name:
 562         ostr << "equiv-class-name \"" << _M_value << "\"\n";
 563         break;
 564       case _S_token_interval_begin:
 565         ostr << "interval begin\n";
 566         break;
 567       case _S_token_interval_end:
 568         ostr << "interval end\n";
 569         break;
 570       case _S_token_line_begin:
 571         ostr << "line begin\n";
 572         break;
 573       case _S_token_line_end:
 574         ostr << "line end\n";
 575         break;
 576       case _S_token_opt:
 577         ostr << "opt\n";
 578         break;
 579       case _S_token_or:
 580         ostr << "or\n";
 581         break;
 582       case _S_token_ord_char:
 583         ostr << "ordinary character: \"" << _M_value << "\"\n";
 584         break;
 585       case _S_token_subexpr_begin:
 586         ostr << "subexpr begin\n";
 587         break;
 588       case _S_token_subexpr_no_group_begin:
 589         ostr << "no grouping subexpr begin\n";
 590         break;
 591       case _S_token_subexpr_lookahead_begin:
 592         ostr << "lookahead subexpr begin\n";
 593         break;
 594       case _S_token_subexpr_end:
 595         ostr << "subexpr end\n";
 596         break;
 597       case _S_token_unknown:
 598         ostr << "-- unknown token --\n";
 599         break;
 600       case _S_token_oct_num:
 601         ostr << "oct number " << _M_value << "\n";
 602         break;
 603       case _S_token_hex_num:
 604         ostr << "hex number " << _M_value << "\n";
 605         break;
 606       case _S_token_quoted_class:
 607         ostr << "quoted class " << "\\" << _M_value << "\n";
 608         break;
 609       default:
 610         _GLIBCXX_DEBUG_ASSERT(false);
 611       }
 612       return ostr;
 613     }
 614 #endif
 615
 616 _GLIBCXX_END_NAMESPACE_VERSION
 617 } // namespace __detail
 618 } // namespace