posix/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19    02111-1307 USA.  */
  20
  21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  22                                           int length, reg_syntax_t syntax);
  23 static void re_compile_fastmap_iter (regex_t *bufp,
  24                                      const re_dfastate_t *init_state,
  25                                      char *fastmap);
  26 static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
  27 static void init_word_char (re_dfa_t *dfa);
  28 #ifdef RE_ENABLE_I18N
  29 static void free_charset (re_charset_t *cset);
  30 #endif /* RE_ENABLE_I18N */
  31 static void free_workarea_compile (regex_t *preg);
  32 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  33 #ifdef RE_ENABLE_I18N
  34 static void optimize_utf8 (re_dfa_t *dfa);
  35 #endif
  36 static reg_errcode_t analyze (regex_t *preg);
  37 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  38 static reg_errcode_t preorder (bin_tree_t *root,
  39                                reg_errcode_t (fn (void *, bin_tree_t *)),
  40                                void *extra);
  41 static reg_errcode_t postorder (bin_tree_t *root,
  42                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  43                                 void *extra);
  44 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  45 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  46 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  47                                  bin_tree_t *node);
  48 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  49 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  50 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  51 static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
  52                                              int top_clone_node, int root_node,
  53                                              unsigned int constraint);
  54 static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx,
  55                                      unsigned int constraint);
  56 static int search_duplicated_node (re_dfa_t *dfa, int org_node,
  57                                    unsigned int constraint);
  58 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  59 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  60                                          int node, int root);
  61 static void calc_inveclosure (re_dfa_t *dfa);
  62 static int fetch_number (re_string_t *input, re_token_t *token,
  63                          reg_syntax_t syntax);
  64 static void fetch_token (re_token_t *result, re_string_t *input,
  65                          reg_syntax_t syntax);
  66 static int peek_token (re_token_t *token, re_string_t *input,
  67                         reg_syntax_t syntax);
  68 static int peek_token_bracket (re_token_t *token, re_string_t *input,
  69                                reg_syntax_t syntax);
  70 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  71                           reg_syntax_t syntax, reg_errcode_t *err);
  72 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  73                                   re_token_t *token, reg_syntax_t syntax,
  74                                   int nest, reg_errcode_t *err);
  75 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  76                                  re_token_t *token, reg_syntax_t syntax,
  77                                  int nest, reg_errcode_t *err);
  78 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  79                                      re_token_t *token, reg_syntax_t syntax,
  80                                      int nest, reg_errcode_t *err);
  81 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  82                                   re_token_t *token, reg_syntax_t syntax,
  83                                   int nest, reg_errcode_t *err);
  84 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  85                                  re_dfa_t *dfa, re_token_t *token,
  86                                  reg_syntax_t syntax, reg_errcode_t *err);
  87 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  88                                       re_token_t *token, reg_syntax_t syntax,
  89                                       reg_errcode_t *err);
  90 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  91                                             re_string_t *regexp,
  92                                             re_token_t *token, int token_len,
  93                                             re_dfa_t *dfa,
  94                                             reg_syntax_t syntax,
  95                                             int accept_hyphen);
  96 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  97                                           re_string_t *regexp,
  98                                           re_token_t *token);
  99 #ifndef _LIBC
 100 # ifdef RE_ENABLE_I18N
 101 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
 102                                       re_charset_t *mbcset, int *range_alloc,
 103                                       bracket_elem_t *start_elem,
 104                                       bracket_elem_t *end_elem);
 105 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
 106                                              re_charset_t *mbcset,
 107                                              int *coll_sym_alloc,
 108                                              const unsigned char *name);
 109 # else /* not RE_ENABLE_I18N */
 110 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
 111                                       bracket_elem_t *start_elem,
 112                                       bracket_elem_t *end_elem);
 113 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
 114                                              const unsigned char *name);
 115 # endif /* not RE_ENABLE_I18N */
 116 #endif /* not _LIBC */
 117 #ifdef RE_ENABLE_I18N
 118 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
 119                                         re_charset_t *mbcset,
 120                                         int *equiv_class_alloc,
 121                                         const unsigned char *name);
 122 static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
 123                                       re_bitset_ptr_t sbcset,
 124                                       re_charset_t *mbcset,
 125                                       int *char_class_alloc,
 126                                       const unsigned char *class_name,
 127                                       reg_syntax_t syntax);
 128 #else  /* not RE_ENABLE_I18N */
 129 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
 130                                         const unsigned char *name);
 131 static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
 132                                       re_bitset_ptr_t sbcset,
 133                                       const unsigned char *class_name,
 134                                       reg_syntax_t syntax);
 135 #endif /* not RE_ENABLE_I18N */
 136 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 137                                        unsigned RE_TRANSLATE_TYPE trans,
 138                                        const unsigned char *class_name,
 139                                        const unsigned char *extra,
 140                                        int non_match, reg_errcode_t *err);
 141 static bin_tree_t *create_tree (re_dfa_t *dfa,
 142                                 bin_tree_t *left, bin_tree_t *right,
 143                                 re_token_type_t type);
 144 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 145                                       bin_tree_t *left, bin_tree_t *right,
 146                                       const re_token_t *token);
 147 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 148 static void free_token (re_token_t *node);
 149 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 150 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 151 \f
 152 /* This table gives an error message for each of the error codes listed
 153    in regex.h.  Obviously the order here has to be same as there.
 154    POSIX doesn't require that we do anything for REG_NOERROR,
 155    but why not be nice?  */
 156
 157 const char __re_error_msgid[] attribute_hidden =
 158   {
 159 #define REG_NOERROR_IDX 0
 160     gettext_noop ("Success")    /* REG_NOERROR */
 161     "\0"
 162 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 163     gettext_noop ("No match")   /* REG_NOMATCH */
 164     "\0"
 165 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 166     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 167     "\0"
 168 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 169     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 170     "\0"
 171 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 172     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 173     "\0"
 174 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 175     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 176     "\0"
 177 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 178     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 179     "\0"
 180 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 181     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 182     "\0"
 183 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 184     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 185     "\0"
 186 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 187     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 188     "\0"
 189 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 190     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 191     "\0"
 192 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 193     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 194     "\0"
 195 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 196     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 197     "\0"
 198 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 199     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 200     "\0"
 201 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 202     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 203     "\0"
 204 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 205     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 206     "\0"
 207 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 208     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 209   };
 210
 211 const size_t __re_error_msgid_idx[] attribute_hidden =
 212   {
 213     REG_NOERROR_IDX,
 214     REG_NOMATCH_IDX,
 215     REG_BADPAT_IDX,
 216     REG_ECOLLATE_IDX,
 217     REG_ECTYPE_IDX,
 218     REG_EESCAPE_IDX,
 219     REG_ESUBREG_IDX,
 220     REG_EBRACK_IDX,
 221     REG_EPAREN_IDX,
 222     REG_EBRACE_IDX,
 223     REG_BADBR_IDX,
 224     REG_ERANGE_IDX,
 225     REG_ESPACE_IDX,
 226     REG_BADRPT_IDX,
 227     REG_EEND_IDX,
 228     REG_ESIZE_IDX,
 229     REG_ERPAREN_IDX
 230   };
 231 \f
 232 /* Entry points for GNU code.  */
 233
 234 /* re_compile_pattern is the GNU regular expression compiler: it
 235    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 236    Returns 0 if the pattern was valid, otherwise an error string.
 237
 238    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 239    are set in BUFP on entry.  */
 240
 241 const char *
 242 re_compile_pattern (pattern, length, bufp)
 243     const char *pattern;
 244     size_t length;
 245     struct re_pattern_buffer *bufp;
 246 {
 247   reg_errcode_t ret;
 248
 249   /* And GNU code determines whether or not to get register information
 250      by passing null for the REGS argument to re_match, etc., not by
 251      setting no_sub, unless RE_NO_SUB is set.  */
 252   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 253
 254   /* Match anchors at newline.  */
 255   bufp->newline_anchor = 1;
 256
 257   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 258
 259   if (!ret)
 260     return NULL;
 261   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 262 }
 263 #ifdef _LIBC
 264 weak_alias (__re_compile_pattern, re_compile_pattern)
 265 #endif
 266
 267 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 268    also be assigned to arbitrarily: each pattern buffer stores its own
 269    syntax, so it can be changed between regex compilations.  */
 270 /* This has no initializer because initialized variables in Emacs
 271    become read-only after dumping.  */
 272 reg_syntax_t re_syntax_options;
 273
 274
 275 /* Specify the precise syntax of regexps for compilation.  This provides
 276    for compatibility for various utilities which historically have
 277    different, incompatible syntaxes.
 278
 279    The argument SYNTAX is a bit mask comprised of the various bits
 280    defined in regex.h.  We return the old syntax.  */
 281
 282 reg_syntax_t
 283 re_set_syntax (syntax)
 284     reg_syntax_t syntax;
 285 {
 286   reg_syntax_t ret = re_syntax_options;
 287
 288   re_syntax_options = syntax;
 289   return ret;
 290 }
 291 #ifdef _LIBC
 292 weak_alias (__re_set_syntax, re_set_syntax)
 293 #endif
 294
 295 int
 296 re_compile_fastmap (bufp)
 297     struct re_pattern_buffer *bufp;
 298 {
 299   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 300   char *fastmap = bufp->fastmap;
 301
 302   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 303   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 304   if (dfa->init_state != dfa->init_state_word)
 305     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 306   if (dfa->init_state != dfa->init_state_nl)
 307     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 308   if (dfa->init_state != dfa->init_state_begbuf)
 309     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 310   bufp->fastmap_accurate = 1;
 311   return 0;
 312 }
 313 #ifdef _LIBC
 314 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 315 #endif
 316
 317 static inline void
 318 __attribute ((always_inline))
 319 re_set_fastmap (char *fastmap, int icase, int ch)
 320 {
 321   fastmap[ch] = 1;
 322   if (icase)
 323     fastmap[tolower (ch)] = 1;
 324 }
 325
 326 /* Helper function for re_compile_fastmap.
 327    Compile fastmap for the initial_state INIT_STATE.  */
 328
 329 static void
 330 re_compile_fastmap_iter (bufp, init_state, fastmap)
 331      regex_t *bufp;
 332      const re_dfastate_t *init_state;
 333      char *fastmap;
 334 {
 335   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 336   int node_cnt;
 337   int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 338   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 339     {
 340       int node = init_state->nodes.elems[node_cnt];
 341       re_token_type_t type = dfa->nodes[node].type;
 342
 343       if (type == CHARACTER)
 344         {
 345           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 346 #ifdef RE_ENABLE_I18N
 347           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 348             {
 349               unsigned char *buf = alloca (dfa->mb_cur_max), *p;
 350               wchar_t wc;
 351               mbstate_t state;
 352
 353               p = buf;
 354               *p++ = dfa->nodes[node].opr.c;
 355               while (++node < dfa->nodes_len
 356                      && dfa->nodes[node].type == CHARACTER
 357                      && dfa->nodes[node].mb_partial)
 358                 *p++ = dfa->nodes[node].opr.c;
 359               memset (&state, 0, sizeof (state));
 360               if (mbrtowc (&wc, (const char *) buf, p - buf,
 361                            &state) == p - buf
 362                   && __wcrtomb ((char *) buf, towlower (wc), &state) > 0)
 363                 re_set_fastmap (fastmap, 0, buf[0]);
 364             }
 365 #endif
 366         }
 367       else if (type == SIMPLE_BRACKET)
 368         {
 369           int i, j, ch;
 370           for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 371             for (j = 0; j < UINT_BITS; ++j, ++ch)
 372               if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
 373                 re_set_fastmap (fastmap, icase, ch);
 374         }
 375 #ifdef RE_ENABLE_I18N
 376       else if (type == COMPLEX_BRACKET)
 377         {
 378           int i;
 379           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 380           if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
 381               || cset->nranges || cset->nchar_classes)
 382             {
 383 # ifdef _LIBC
 384               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
 385                 {
 386                   /* In this case we want to catch the bytes which are
 387                      the first byte of any collation elements.
 388                      e.g. In da_DK, we want to catch 'a' since "aa"
 389                           is a valid collation element, and don't catch
 390                           'b' since 'b' is the only collation element
 391                           which starts from 'b'.  */
 392                   int j, ch;
 393                   const int32_t *table = (const int32_t *)
 394                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 395                   for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 396                     for (j = 0; j < UINT_BITS; ++j, ++ch)
 397                       if (table[ch] < 0)
 398                         re_set_fastmap (fastmap, icase, ch);
 399                 }
 400 # else
 401               if (dfa->mb_cur_max > 1)
 402                 for (i = 0; i < SBC_MAX; ++i)
 403                   if (__btowc (i) == WEOF)
 404                     re_set_fastmap (fastmap, icase, i);
 405 # endif /* not _LIBC */
 406             }
 407           for (i = 0; i < cset->nmbchars; ++i)
 408             {
 409               char buf[256];
 410               mbstate_t state;
 411               memset (&state, '\0', sizeof (state));
 412               __wcrtomb (buf, cset->mbchars[i], &state);
 413               re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 414               if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 415                 {
 416                   __wcrtomb (buf, towlower (cset->mbchars[i]), &state);
 417                   re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
 418                 }
 419             }
 420         }
 421 #endif /* RE_ENABLE_I18N */
 422       else if (type == OP_PERIOD
 423 #ifdef RE_ENABLE_I18N
 424                || type == OP_UTF8_PERIOD
 425 #endif /* RE_ENABLE_I18N */
 426                || type == END_OF_RE)
 427         {
 428           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 429           if (type == END_OF_RE)
 430             bufp->can_be_null = 1;
 431           return;
 432         }
 433     }
 434 }
 435 \f
 436 /* Entry point for POSIX code.  */
 437 /* regcomp takes a regular expression as a string and compiles it.
 438
 439    PREG is a regex_t *.  We do not expect any fields to be initialized,
 440    since POSIX says we shouldn't.  Thus, we set
 441
 442      `buffer' to the compiled pattern;
 443      `used' to the length of the compiled pattern;
 444      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 445        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 446        RE_SYNTAX_POSIX_BASIC;
 447      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 448      `fastmap' to an allocated space for the fastmap;
 449      `fastmap_accurate' to zero;
 450      `re_nsub' to the number of subexpressions in PATTERN.
 451
 452    PATTERN is the address of the pattern string.
 453
 454    CFLAGS is a series of bits which affect compilation.
 455
 456      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 457      use POSIX basic syntax.
 458
 459      If REG_NEWLINE is set, then . and [^...] don't match newline.
 460      Also, regexec will try a match beginning after every newline.
 461
 462      If REG_ICASE is set, then we considers upper- and lowercase
 463      versions of letters to be equivalent when matching.
 464
 465      If REG_NOSUB is set, then when PREG is passed to regexec, that
 466      routine will report only success or failure, and nothing about the
 467      registers.
 468
 469    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 470    the return codes and their meanings.)  */
 471
 472 int
 473 regcomp (preg, pattern, cflags)
 474     regex_t *__restrict preg;
 475     const char *__restrict pattern;
 476     int cflags;
 477 {
 478   reg_errcode_t ret;
 479   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 480                          : RE_SYNTAX_POSIX_BASIC);
 481
 482   preg->buffer = NULL;
 483   preg->allocated = 0;
 484   preg->used = 0;
 485
 486   /* Try to allocate space for the fastmap.  */
 487   preg->fastmap = re_malloc (char, SBC_MAX);
 488   if (BE (preg->fastmap == NULL, 0))
 489     return REG_ESPACE;
 490
 491   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 492
 493   /* If REG_NEWLINE is set, newlines are treated differently.  */
 494   if (cflags & REG_NEWLINE)
 495     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 496       syntax &= ~RE_DOT_NEWLINE;
 497       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 498       /* It also changes the matching behavior.  */
 499       preg->newline_anchor = 1;
 500     }
 501   else
 502     preg->newline_anchor = 0;
 503   preg->no_sub = !!(cflags & REG_NOSUB);
 504   preg->translate = NULL;
 505
 506   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 507
 508   /* POSIX doesn't distinguish between an unmatched open-group and an
 509      unmatched close-group: both are REG_EPAREN.  */
 510   if (ret == REG_ERPAREN)
 511     ret = REG_EPAREN;
 512
 513   /* We have already checked preg->fastmap != NULL.  */
 514   if (BE (ret == REG_NOERROR, 1))
 515     /* Compute the fastmap now, since regexec cannot modify the pattern
 516        buffer.  This function never fails in this implementation.  */
 517     (void) re_compile_fastmap (preg);
 518   else
 519     {
 520       /* Some error occurred while compiling the expression.  */
 521       re_free (preg->fastmap);
 522       preg->fastmap = NULL;
 523     }
 524
 525   return (int) ret;
 526 }
 527 #ifdef _LIBC
 528 weak_alias (__regcomp, regcomp)
 529 #endif
 530
 531 /* Returns a message corresponding to an error code, ERRCODE, returned
 532    from either regcomp or regexec.   We don't use PREG here.  */
 533
 534 size_t
 535 regerror (errcode, preg, errbuf, errbuf_size)
 536     int errcode;
 537     const regex_t *preg;
 538     char *errbuf;
 539     size_t errbuf_size;
 540 {
 541   const char *msg;
 542   size_t msg_size;
 543
 544   if (BE (errcode < 0
 545           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 546                                / sizeof (__re_error_msgid_idx[0])), 0))
 547     /* Only error codes returned by the rest of the code should be passed
 548        to this routine.  If we are given anything else, or if other regex
 549        code generates an invalid error code, then the program has a bug.
 550        Dump core so we can fix it.  */
 551     abort ();
 552
 553   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 554
 555   msg_size = strlen (msg) + 1; /* Includes the null.  */
 556
 557   if (BE (errbuf_size != 0, 1))
 558     {
 559       if (BE (msg_size > errbuf_size, 0))
 560         {
 561 #if defined HAVE_MEMPCPY || defined _LIBC
 562           *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
 563 #else
 564           memcpy (errbuf, msg, errbuf_size - 1);
 565           errbuf[errbuf_size - 1] = 0;
 566 #endif
 567         }
 568       else
 569         memcpy (errbuf, msg, msg_size);
 570     }
 571
 572   return msg_size;
 573 }
 574 #ifdef _LIBC
 575 weak_alias (__regerror, regerror)
 576 #endif
 577
 578
 579 #ifdef RE_ENABLE_I18N
 580 /* This static array is used for the map to single-byte characters when
 581    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 582    it the same all the time.  UTF-8 is the preferred encoding so this is
 583    a worthwhile optimization.  */
 584 static const bitset utf8_sb_map =
 585 {
 586   /* Set the first 128 bits.  */
 587 # if UINT_MAX == 0xffffffff
 588   0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
 589 # else
 590 #  error "Add case for new unsigned int size"
 591 # endif
 592 };
 593 #endif
 594
 595
 596 static void
 597 free_dfa_content (re_dfa_t *dfa)
 598 {
 599   int i, j;
 600
 601   if (dfa->nodes)
 602     for (i = 0; i < dfa->nodes_len; ++i)
 603       free_token (dfa->nodes + i);
 604   re_free (dfa->nexts);
 605   for (i = 0; i < dfa->nodes_len; ++i)
 606     {
 607       if (dfa->eclosures != NULL)
 608         re_node_set_free (dfa->eclosures + i);
 609       if (dfa->inveclosures != NULL)
 610         re_node_set_free (dfa->inveclosures + i);
 611       if (dfa->edests != NULL)
 612         re_node_set_free (dfa->edests + i);
 613     }
 614   re_free (dfa->edests);
 615   re_free (dfa->eclosures);
 616   re_free (dfa->inveclosures);
 617   re_free (dfa->nodes);
 618
 619   if (dfa->state_table)
 620     for (i = 0; i <= dfa->state_hash_mask; ++i)
 621       {
 622         struct re_state_table_entry *entry = dfa->state_table + i;
 623         for (j = 0; j < entry->num; ++j)
 624           {
 625             re_dfastate_t *state = entry->array[j];
 626             free_state (state);
 627           }
 628         re_free (entry->array);
 629       }
 630   re_free (dfa->state_table);
 631 #ifdef RE_ENABLE_I18N
 632   if (dfa->sb_char != utf8_sb_map)
 633     re_free (dfa->sb_char);
 634 #endif
 635   re_free (dfa->subexp_map);
 636 #ifdef DEBUG
 637   re_free (dfa->re_str);
 638 #endif
 639
 640   re_free (dfa);
 641 }
 642
 643
 644 /* Free dynamically allocated space used by PREG.  */
 645
 646 void
 647 regfree (preg)
 648     regex_t *preg;
 649 {
 650   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 651   if (BE (dfa != NULL, 1))
 652     free_dfa_content (dfa);
 653   preg->buffer = NULL;
 654   preg->allocated = 0;
 655
 656   re_free (preg->fastmap);
 657   preg->fastmap = NULL;
 658
 659   re_free (preg->translate);
 660   preg->translate = NULL;
 661 }
 662 #ifdef _LIBC
 663 weak_alias (__regfree, regfree)
 664 #endif
 665 \f
 666 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 667    them unless specifically requested.  */
 668
 669 #if defined _REGEX_RE_COMP || defined _LIBC
 670
 671 /* BSD has one and only one pattern buffer.  */
 672 static struct re_pattern_buffer re_comp_buf;
 673
 674 char *
 675 # ifdef _LIBC
 676 /* Make these definitions weak in libc, so POSIX programs can redefine
 677    these names if they don't use our functions, and still use
 678    regcomp/regexec above without link errors.  */
 679 weak_function
 680 # endif
 681 re_comp (s)
 682      const char *s;
 683 {
 684   reg_errcode_t ret;
 685   char *fastmap;
 686
 687   if (!s)
 688     {
 689       if (!re_comp_buf.buffer)
 690         return gettext ("No previous regular expression");
 691       return 0;
 692     }
 693
 694   if (re_comp_buf.buffer)
 695     {
 696       fastmap = re_comp_buf.fastmap;
 697       re_comp_buf.fastmap = NULL;
 698       __regfree (&re_comp_buf);
 699       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 700       re_comp_buf.fastmap = fastmap;
 701     }
 702
 703   if (re_comp_buf.fastmap == NULL)
 704     {
 705       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 706       if (re_comp_buf.fastmap == NULL)
 707         return (char *) gettext (__re_error_msgid
 708                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 709     }
 710
 711   /* Since `re_exec' always passes NULL for the `regs' argument, we
 712      don't need to initialize the pattern buffer fields which affect it.  */
 713
 714   /* Match anchors at newlines.  */
 715   re_comp_buf.newline_anchor = 1;
 716
 717   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 718
 719   if (!ret)
 720     return NULL;
 721
 722   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 723   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 724 }
 725
 726 #ifdef _LIBC
 727 libc_freeres_fn (free_mem)
 728 {
 729   __regfree (&re_comp_buf);
 730 }
 731 #endif
 732
 733 #endif /* _REGEX_RE_COMP */
 734 \f
 735 /* Internal entry point.
 736    Compile the regular expression PATTERN, whose length is LENGTH.
 737    SYNTAX indicate regular expression's syntax.  */
 738
 739 static reg_errcode_t
 740 re_compile_internal (preg, pattern, length, syntax)
 741      regex_t *preg;
 742      const char * pattern;
 743      int length;
 744      reg_syntax_t syntax;
 745 {
 746   reg_errcode_t err = REG_NOERROR;
 747   re_dfa_t *dfa;
 748   re_string_t regexp;
 749
 750   /* Initialize the pattern buffer.  */
 751   preg->fastmap_accurate = 0;
 752   preg->syntax = syntax;
 753   preg->not_bol = preg->not_eol = 0;
 754   preg->used = 0;
 755   preg->re_nsub = 0;
 756   preg->can_be_null = 0;
 757   preg->regs_allocated = REGS_UNALLOCATED;
 758
 759   /* Initialize the dfa.  */
 760   dfa = (re_dfa_t *) preg->buffer;
 761   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 762     {
 763       /* If zero allocated, but buffer is non-null, try to realloc
 764          enough space.  This loses if buffer's address is bogus, but
 765          that is the user's responsibility.  If ->buffer is NULL this
 766          is a simple allocation.  */
 767       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 768       if (dfa == NULL)
 769         return REG_ESPACE;
 770       preg->allocated = sizeof (re_dfa_t);
 771       preg->buffer = (unsigned char *) dfa;
 772     }
 773   preg->used = sizeof (re_dfa_t);
 774
 775   err = init_dfa (dfa, length);
 776   if (BE (err != REG_NOERROR, 0))
 777     {
 778       free_dfa_content (dfa);
 779       preg->buffer = NULL;
 780       preg->allocated = 0;
 781       return err;
 782     }
 783 #ifdef DEBUG
 784   dfa->re_str = re_malloc (char, length + 1);
 785   strncpy (dfa->re_str, pattern, length + 1);
 786 #endif
 787
 788   err = re_string_construct (&regexp, pattern, length, preg->translate,
 789                              syntax & RE_ICASE, dfa);
 790   if (BE (err != REG_NOERROR, 0))
 791     {
 792     re_compile_internal_free_return:
 793       free_workarea_compile (preg);
 794       re_string_destruct (&regexp);
 795       free_dfa_content (dfa);
 796       preg->buffer = NULL;
 797       preg->allocated = 0;
 798       return err;
 799     }
 800
 801   /* Parse the regular expression, and build a structure tree.  */
 802   preg->re_nsub = 0;
 803   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 804   if (BE (dfa->str_tree == NULL, 0))
 805     goto re_compile_internal_free_return;
 806
 807   /* Analyze the tree and create the nfa.  */
 808   err = analyze (preg);
 809   if (BE (err != REG_NOERROR, 0))
 810     goto re_compile_internal_free_return;
 811
 812 #ifdef RE_ENABLE_I18N
 813   /* If possible, do searching in single byte encoding to speed things up.  */
 814   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 815     optimize_utf8 (dfa);
 816 #endif
 817
 818   /* Then create the initial state of the dfa.  */
 819   err = create_initial_state (dfa);
 820
 821   /* Release work areas.  */
 822   free_workarea_compile (preg);
 823   re_string_destruct (&regexp);
 824
 825   if (BE (err != REG_NOERROR, 0))
 826     {
 827       free_dfa_content (dfa);
 828       preg->buffer = NULL;
 829       preg->allocated = 0;
 830     }
 831
 832   return err;
 833 }
 834
 835 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 836    as the initial length of some arrays.  */
 837
 838 static reg_errcode_t
 839 init_dfa (dfa, pat_len)
 840      re_dfa_t *dfa;
 841      int pat_len;
 842 {
 843   int table_size;
 844 #ifndef _LIBC
 845   char *codeset_name;
 846 #endif
 847
 848   memset (dfa, '\0', sizeof (re_dfa_t));
 849
 850   /* Force allocation of str_tree_storage the first time.  */
 851   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 852
 853   dfa->nodes_alloc = pat_len + 1;
 854   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 855
 856   dfa->states_alloc = pat_len + 1;
 857
 858   /*  table_size = 2 ^ ceil(log pat_len) */
 859   for (table_size = 1; table_size > 0; table_size <<= 1)
 860     if (table_size > pat_len)
 861       break;
 862
 863   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 864   dfa->state_hash_mask = table_size - 1;
 865
 866   dfa->mb_cur_max = MB_CUR_MAX;
 867 #ifdef _LIBC
 868   if (dfa->mb_cur_max == 6
 869       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 870     dfa->is_utf8 = 1;
 871   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 872                        != 0);
 873 #else
 874 # ifdef HAVE_LANGINFO_CODESET
 875   codeset_name = nl_langinfo (CODESET);
 876 # else
 877   codeset_name = getenv ("LC_ALL");
 878   if (codeset_name == NULL || codeset_name[0] == '\0')
 879     codeset_name = getenv ("LC_CTYPE");
 880   if (codeset_name == NULL || codeset_name[0] == '\0')
 881     codeset_name = getenv ("LANG");
 882   if (codeset_name == NULL)
 883     codeset_name = "";
 884   else if (strchr (codeset_name, '.') !=  NULL)
 885     codeset_name = strchr (codeset_name, '.') + 1;
 886 # endif
 887
 888   if (strcasecmp (codeset_name, "UTF-8") == 0
 889       || strcasecmp (codeset_name, "UTF8") == 0)
 890     dfa->is_utf8 = 1;
 891
 892   /* We check exhaustively in the loop below if this charset is a
 893      superset of ASCII.  */
 894   dfa->map_notascii = 0;
 895 #endif
 896
 897 #ifdef RE_ENABLE_I18N
 898   if (dfa->mb_cur_max > 1)
 899     {
 900       if (dfa->is_utf8)
 901         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 902       else
 903         {
 904           int i, j, ch;
 905
 906           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
 907           if (BE (dfa->sb_char == NULL, 0))
 908             return REG_ESPACE;
 909
 910           /* Clear all bits by, then set those corresponding to single
 911              byte chars.  */
 912           bitset_empty (dfa->sb_char);
 913
 914           for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 915             for (j = 0; j < UINT_BITS; ++j, ++ch)
 916               {
 917                 wchar_t wch = __btowc (ch);
 918                 if (wch != WEOF)
 919                   dfa->sb_char[i] |= 1 << j;
 920 # ifndef _LIBC
 921                 if (isascii (ch) && wch != (wchar_t) ch)
 922                   dfa->map_notascii = 1;
 923 # endif
 924               }
 925         }
 926     }
 927 #endif
 928
 929   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 930     return REG_ESPACE;
 931   return REG_NOERROR;
 932 }
 933
 934 /* Initialize WORD_CHAR table, which indicate which character is
 935    "word".  In this case "word" means that it is the word construction
 936    character used by some operators like "\<", "\>", etc.  */
 937
 938 static void
 939 init_word_char (dfa)
 940      re_dfa_t *dfa;
 941 {
 942   int i, j, ch;
 943   dfa->word_ops_used = 1;
 944   for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 945     for (j = 0; j < UINT_BITS; ++j, ++ch)
 946       if (isalnum (ch) || ch == '_')
 947         dfa->word_char[i] |= 1 << j;
 948 }
 949
 950 /* Free the work area which are only used while compiling.  */
 951
 952 static void
 953 free_workarea_compile (preg)
 954      regex_t *preg;
 955 {
 956   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 957   bin_tree_storage_t *storage, *next;
 958   for (storage = dfa->str_tree_storage; storage; storage = next)
 959     {
 960       next = storage->next;
 961       re_free (storage);
 962     }
 963   dfa->str_tree_storage = NULL;
 964   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 965   dfa->str_tree = NULL;
 966   re_free (dfa->org_indices);
 967   dfa->org_indices = NULL;
 968 }
 969
 970 /* Create initial states for all contexts.  */
 971
 972 static reg_errcode_t
 973 create_initial_state (dfa)
 974      re_dfa_t *dfa;
 975 {
 976   int first, i;
 977   reg_errcode_t err;
 978   re_node_set init_nodes;
 979
 980   /* Initial states have the epsilon closure of the node which is
 981      the first node of the regular expression.  */
 982   first = dfa->str_tree->first->node_idx;
 983   dfa->init_node = first;
 984   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
 985   if (BE (err != REG_NOERROR, 0))
 986     return err;
 987
 988   /* The back-references which are in initial states can epsilon transit,
 989      since in this case all of the subexpressions can be null.
 990      Then we add epsilon closures of the nodes which are the next nodes of
 991      the back-references.  */
 992   if (dfa->nbackref > 0)
 993     for (i = 0; i < init_nodes.nelem; ++i)
 994       {
 995         int node_idx = init_nodes.elems[i];
 996         re_token_type_t type = dfa->nodes[node_idx].type;
 997
 998         int clexp_idx;
 999         if (type != OP_BACK_REF)
1000           continue;
1001         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1002           {
1003             re_token_t *clexp_node;
1004             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1005             if (clexp_node->type == OP_CLOSE_SUBEXP
1006                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1007               break;
1008           }
1009         if (clexp_idx == init_nodes.nelem)
1010           continue;
1011
1012         if (type == OP_BACK_REF)
1013           {
1014             int dest_idx = dfa->edests[node_idx].elems[0];
1015             if (!re_node_set_contains (&init_nodes, dest_idx))
1016               {
1017                 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1018                 i = 0;
1019               }
1020           }
1021       }
1022
1023   /* It must be the first time to invoke acquire_state.  */
1024   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1025   /* We don't check ERR here, since the initial state must not be NULL.  */
1026   if (BE (dfa->init_state == NULL, 0))
1027     return err;
1028   if (dfa->init_state->has_constraint)
1029     {
1030       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1031                                                        CONTEXT_WORD);
1032       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1033                                                      CONTEXT_NEWLINE);
1034       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1035                                                          &init_nodes,
1036                                                          CONTEXT_NEWLINE
1037                                                          | CONTEXT_BEGBUF);
1038       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1039               || dfa->init_state_begbuf == NULL, 0))
1040         return err;
1041     }
1042   else
1043     dfa->init_state_word = dfa->init_state_nl
1044       = dfa->init_state_begbuf = dfa->init_state;
1045
1046   re_node_set_free (&init_nodes);
1047   return REG_NOERROR;
1048 }
1049 \f
1050 #ifdef RE_ENABLE_I18N
1051 /* If it is possible to do searching in single byte encoding instead of UTF-8
1052    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1053    DFA nodes where needed.  */
1054
1055 static void
1056 optimize_utf8 (dfa)
1057      re_dfa_t *dfa;
1058 {
1059   int node, i, mb_chars = 0, has_period = 0;
1060
1061   for (node = 0; node < dfa->nodes_len; ++node)
1062     switch (dfa->nodes[node].type)
1063       {
1064       case CHARACTER:
1065         if (dfa->nodes[node].opr.c >= 0x80)
1066           mb_chars = 1;
1067         break;
1068       case ANCHOR:
1069         switch (dfa->nodes[node].opr.idx)
1070           {
1071           case LINE_FIRST:
1072           case LINE_LAST:
1073           case BUF_FIRST:
1074           case BUF_LAST:
1075             break;
1076           default:
1077             /* Word anchors etc. cannot be handled.  */
1078             return;
1079           }
1080         break;
1081       case OP_PERIOD:
1082         has_period = 1;
1083         break;
1084       case OP_BACK_REF:
1085       case OP_ALT:
1086       case END_OF_RE:
1087       case OP_DUP_ASTERISK:
1088       case OP_OPEN_SUBEXP:
1089       case OP_CLOSE_SUBEXP:
1090         break;
1091       case COMPLEX_BRACKET:
1092         return;
1093       case SIMPLE_BRACKET:
1094         /* Just double check.  */
1095         for (i = 0x80 / UINT_BITS; i < BITSET_UINTS; ++i)
1096           if (dfa->nodes[node].opr.sbcset[i])
1097             return;
1098         break;
1099       default:
1100         abort ();
1101       }
1102
1103   if (mb_chars || has_period)
1104     for (node = 0; node < dfa->nodes_len; ++node)
1105       {
1106         if (dfa->nodes[node].type == CHARACTER
1107             && dfa->nodes[node].opr.c >= 0x80)
1108           dfa->nodes[node].mb_partial = 0;
1109         else if (dfa->nodes[node].type == OP_PERIOD)
1110           dfa->nodes[node].type = OP_UTF8_PERIOD;
1111       }
1112
1113   /* The search can be in single byte locale.  */
1114   dfa->mb_cur_max = 1;
1115   dfa->is_utf8 = 0;
1116   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1117 }
1118 #endif
1119 \f
1120 /* Analyze the structure tree, and calculate "first", "next", "edest",
1121    "eclosure", and "inveclosure".  */
1122
1123 static reg_errcode_t
1124 analyze (preg)
1125      regex_t *preg;
1126 {
1127   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1128   reg_errcode_t ret;
1129
1130   /* Allocate arrays.  */
1131   dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1132   dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1133   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1134   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1135   dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1136   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1137           || dfa->eclosures == NULL || dfa->inveclosures == NULL, 0))
1138     return REG_ESPACE;
1139
1140   dfa->subexp_map = re_malloc (int, preg->re_nsub);
1141   if (dfa->subexp_map != NULL)
1142     {
1143       int i;
1144       for (i = 0; i < preg->re_nsub; i++)
1145         dfa->subexp_map[i] = i;
1146       preorder (dfa->str_tree, optimize_subexps, dfa);
1147       for (i = 0; i < preg->re_nsub; i++)
1148         if (dfa->subexp_map[i] != i)
1149           break;
1150       if (i == preg->re_nsub)
1151         {
1152           free (dfa->subexp_map);
1153           dfa->subexp_map = NULL;
1154         }
1155     }
1156
1157   ret = postorder (dfa->str_tree, lower_subexps, preg);
1158   if (BE (ret != REG_NOERROR, 0))
1159     return ret;
1160   ret = postorder (dfa->str_tree, calc_first, dfa);
1161   if (BE (ret != REG_NOERROR, 0))
1162     return ret;
1163   preorder (dfa->str_tree, calc_next, dfa);
1164   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1165   if (BE (ret != REG_NOERROR, 0))
1166     return ret;
1167   ret = calc_eclosure (dfa);
1168   if (BE (ret != REG_NOERROR, 0))
1169     return ret;
1170   calc_inveclosure (dfa);
1171   return ret;
1172 }
1173
1174 /* Our parse trees are very unbalanced, so we cannot use a stack to
1175    implement parse tree visits.  Instead, we use parent pointers and
1176    some hairy code in these two functions.  */
1177 static reg_errcode_t
1178 postorder (root, fn, extra)
1179      bin_tree_t *root;
1180      reg_errcode_t (fn (void *, bin_tree_t *));
1181      void *extra;
1182 {
1183   bin_tree_t *node, *prev;
1184
1185   for (node = root; ; )
1186     {
1187       /* Descend down the tree, preferably to the left (or to the right
1188          if that's the only child).  */
1189       while (node->left || node->right)
1190         if (node->left)
1191           node = node->left;
1192         else
1193           node = node->right;
1194
1195       do
1196         {
1197           reg_errcode_t err = fn (extra, node);
1198           if (BE (err != REG_NOERROR, 0))
1199             return err;
1200           if (node->parent == NULL)
1201             return REG_NOERROR;
1202           prev = node;
1203           node = node->parent;
1204         }
1205       /* Go up while we have a node that is reached from the right.  */
1206       while (node->right == prev || node->right == NULL);
1207       node = node->right;
1208     }
1209 }
1210
1211 static reg_errcode_t
1212 preorder (root, fn, extra)
1213      bin_tree_t *root;
1214      reg_errcode_t (fn (void *, bin_tree_t *));
1215      void *extra;
1216 {
1217   bin_tree_t *node;
1218
1219   for (node = root; ; )
1220     {
1221       reg_errcode_t err = fn (extra, node);
1222       if (BE (err != REG_NOERROR, 0))
1223         return err;
1224
1225       /* Go to the left node, or up and to the right.  */
1226       if (node->left)
1227         node = node->left;
1228       else
1229         {
1230           bin_tree_t *prev = NULL;
1231           while (node->right == prev || node->right == NULL)
1232             {
1233               prev = node;
1234               node = node->parent;
1235               if (!node)
1236                 return REG_NOERROR;
1237             }
1238           node = node->right;
1239         }
1240     }
1241 }
1242
1243 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1244    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1245    backreferences as well.  Requires a preorder visit.  */
1246 static reg_errcode_t
1247 optimize_subexps (extra, node)
1248      void *extra;
1249      bin_tree_t *node;
1250 {
1251   re_dfa_t *dfa = (re_dfa_t *) extra;
1252
1253   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1254     {
1255       int idx = node->token.opr.idx;
1256       node->token.opr.idx = dfa->subexp_map[idx];
1257       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1258     }
1259
1260   else if (node->token.type == SUBEXP
1261            && node->left && node->left->token.type == SUBEXP)
1262     {
1263       int other_idx = node->left->token.opr.idx;
1264
1265       node->left = node->left->left;
1266       if (node->left)
1267         node->left->parent = node;
1268
1269       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1270       if (other_idx < 8 * sizeof (dfa->used_bkref_map))
1271         dfa->used_bkref_map &= ~(1 << other_idx);
1272     }
1273
1274   return REG_NOERROR;
1275 }
1276
1277 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1278    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1279 static reg_errcode_t
1280 lower_subexps (extra, node)
1281      void *extra;
1282      bin_tree_t *node;
1283 {
1284   regex_t *preg = (regex_t *) extra;
1285   reg_errcode_t err = REG_NOERROR;
1286
1287   if (node->left && node->left->token.type == SUBEXP)
1288     {
1289       node->left = lower_subexp (&err, preg, node->left);
1290       if (node->left)
1291         node->left->parent = node;
1292     }
1293   if (node->right && node->right->token.type == SUBEXP)
1294     {
1295       node->right = lower_subexp (&err, preg, node->right);
1296       if (node->right)
1297         node->right->parent = node;
1298     }
1299
1300   return err;
1301 }
1302
1303 static bin_tree_t *
1304 lower_subexp (err, preg, node)
1305      reg_errcode_t *err;
1306      regex_t *preg;
1307      bin_tree_t *node;
1308 {
1309   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1310   bin_tree_t *body = node->left;
1311   bin_tree_t *op, *cls, *tree1, *tree;
1312
1313   if (preg->no_sub
1314       && (node->token.opr.idx >= 8 * sizeof (dfa->used_bkref_map)
1315           || !(dfa->used_bkref_map & (1 << node->token.opr.idx))))
1316     return node->left;
1317
1318   /* Convert the SUBEXP node to the concatenation of an
1319      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1320   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1321   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1322   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1323   tree = create_tree (dfa, op, tree1, CONCAT);
1324   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1325     {
1326       *err = REG_ESPACE;
1327       return NULL;
1328     }
1329
1330   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1331   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1332   return tree;
1333 }
1334
1335 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1336    nodes.  Requires a postorder visit.  */
1337 static reg_errcode_t
1338 calc_first (extra, node)
1339      void *extra;
1340      bin_tree_t *node;
1341 {
1342   re_dfa_t *dfa = (re_dfa_t *) extra;
1343   if (node->token.type == CONCAT)
1344     {
1345       node->first = node->left->first;
1346       node->node_idx = node->left->node_idx;
1347     }
1348   else
1349     {
1350       node->first = node;
1351       node->node_idx = re_dfa_add_node (dfa, node->token);
1352       if (BE (node->node_idx == -1, 0))
1353         return REG_ESPACE;
1354     }
1355   return REG_NOERROR;
1356 }
1357
1358 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1359 static reg_errcode_t
1360 calc_next (extra, node)
1361      void *extra;
1362      bin_tree_t *node;
1363 {
1364   switch (node->token.type)
1365     {
1366     case OP_DUP_ASTERISK:
1367       node->left->next = node;
1368       break;
1369     case CONCAT:
1370       node->left->next = node->right->first;
1371       node->right->next = node->next;
1372       break;
1373     default:
1374       if (node->left)
1375         node->left->next = node->next;
1376       if (node->right)
1377         node->right->next = node->next;
1378       break;
1379     }
1380   return REG_NOERROR;
1381 }
1382
1383 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1384 static reg_errcode_t
1385 link_nfa_nodes (extra, node)
1386      void *extra;
1387      bin_tree_t *node;
1388 {
1389   re_dfa_t *dfa = (re_dfa_t *) extra;
1390   int idx = node->node_idx;
1391   reg_errcode_t err = REG_NOERROR;
1392
1393   switch (node->token.type)
1394     {
1395     case CONCAT:
1396       break;
1397
1398     case END_OF_RE:
1399       assert (node->next == NULL);
1400       break;
1401
1402     case OP_DUP_ASTERISK:
1403     case OP_ALT:
1404       {
1405         int left, right;
1406         dfa->has_plural_match = 1;
1407         if (node->left != NULL)
1408           left = node->left->first->node_idx;
1409         else
1410           left = node->next->node_idx;
1411         if (node->right != NULL)
1412           right = node->right->first->node_idx;
1413         else
1414           right = node->next->node_idx;
1415         assert (left > -1);
1416         assert (right > -1);
1417         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1418       }
1419       break;
1420
1421     case ANCHOR:
1422     case OP_OPEN_SUBEXP:
1423     case OP_CLOSE_SUBEXP:
1424       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1425       break;
1426
1427     case OP_BACK_REF:
1428       dfa->nexts[idx] = node->next->node_idx;
1429       if (node->token.type == OP_BACK_REF)
1430         re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1431       break;
1432
1433     default:
1434       assert (!IS_EPSILON_NODE (node->token.type));
1435       dfa->nexts[idx] = node->next->node_idx;
1436       break;
1437     }
1438
1439   return err;
1440 }
1441
1442 /* Duplicate the epsilon closure of the node ROOT_NODE.
1443    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1444    to their own constraint.  */
1445
1446 static reg_errcode_t
1447 duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
1448                         init_constraint)
1449      re_dfa_t *dfa;
1450      int top_org_node, top_clone_node, root_node;
1451      unsigned int init_constraint;
1452 {
1453   reg_errcode_t err;
1454   int org_node, clone_node, ret;
1455   unsigned int constraint = init_constraint;
1456   for (org_node = top_org_node, clone_node = top_clone_node;;)
1457     {
1458       int org_dest, clone_dest;
1459       if (dfa->nodes[org_node].type == OP_BACK_REF)
1460         {
1461           /* If the back reference epsilon-transit, its destination must
1462              also have the constraint.  Then duplicate the epsilon closure
1463              of the destination of the back reference, and store it in
1464              edests of the back reference.  */
1465           org_dest = dfa->nexts[org_node];
1466           re_node_set_empty (dfa->edests + clone_node);
1467           err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1468           if (BE (err != REG_NOERROR, 0))
1469             return err;
1470           dfa->nexts[clone_node] = dfa->nexts[org_node];
1471           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1472           if (BE (ret < 0, 0))
1473             return REG_ESPACE;
1474         }
1475       else if (dfa->edests[org_node].nelem == 0)
1476         {
1477           /* In case of the node can't epsilon-transit, don't duplicate the
1478              destination and store the original destination as the
1479              destination of the node.  */
1480           dfa->nexts[clone_node] = dfa->nexts[org_node];
1481           break;
1482         }
1483       else if (dfa->edests[org_node].nelem == 1)
1484         {
1485           /* In case of the node can epsilon-transit, and it has only one
1486              destination.  */
1487           org_dest = dfa->edests[org_node].elems[0];
1488           re_node_set_empty (dfa->edests + clone_node);
1489           if (dfa->nodes[org_node].type == ANCHOR)
1490             {
1491               /* In case of the node has another constraint, append it.  */
1492               if (org_node == root_node && clone_node != org_node)
1493                 {
1494                   /* ...but if the node is root_node itself, it means the
1495                      epsilon closure have a loop, then tie it to the
1496                      destination of the root_node.  */
1497                   ret = re_node_set_insert (dfa->edests + clone_node,
1498                                             org_dest);
1499                   if (BE (ret < 0, 0))
1500                     return REG_ESPACE;
1501                   break;
1502                 }
1503               constraint |= dfa->nodes[org_node].opr.ctx_type;
1504             }
1505           err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1506           if (BE (err != REG_NOERROR, 0))
1507             return err;
1508           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1509           if (BE (ret < 0, 0))
1510             return REG_ESPACE;
1511         }
1512       else /* dfa->edests[org_node].nelem == 2 */
1513         {
1514           /* In case of the node can epsilon-transit, and it has two
1515              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1516           org_dest = dfa->edests[org_node].elems[0];
1517           re_node_set_empty (dfa->edests + clone_node);
1518           /* Search for a duplicated node which satisfies the constraint.  */
1519           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1520           if (clone_dest == -1)
1521             {
1522               /* There are no such a duplicated node, create a new one.  */
1523               err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1524               if (BE (err != REG_NOERROR, 0))
1525                 return err;
1526               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1527               if (BE (ret < 0, 0))
1528                 return REG_ESPACE;
1529               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1530                                             root_node, constraint);
1531               if (BE (err != REG_NOERROR, 0))
1532                 return err;
1533             }
1534           else
1535             {
1536               /* There are a duplicated node which satisfy the constraint,
1537                  use it to avoid infinite loop.  */
1538               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1539               if (BE (ret < 0, 0))
1540                 return REG_ESPACE;
1541             }
1542
1543           org_dest = dfa->edests[org_node].elems[1];
1544           err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1545           if (BE (err != REG_NOERROR, 0))
1546             return err;
1547           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1548           if (BE (ret < 0, 0))
1549             return REG_ESPACE;
1550         }
1551       org_node = org_dest;
1552       clone_node = clone_dest;
1553     }
1554   return REG_NOERROR;
1555 }
1556
1557 /* Search for a node which is duplicated from the node ORG_NODE, and
1558    satisfies the constraint CONSTRAINT.  */
1559
1560 static int
1561 search_duplicated_node (dfa, org_node, constraint)
1562      re_dfa_t *dfa;
1563      int org_node;
1564      unsigned int constraint;
1565 {
1566   int idx;
1567   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1568     {
1569       if (org_node == dfa->org_indices[idx]
1570           && constraint == dfa->nodes[idx].constraint)
1571         return idx; /* Found.  */
1572     }
1573   return -1; /* Not found.  */
1574 }
1575
1576 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1577    The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1578    otherwise return the error code.  */
1579
1580 static reg_errcode_t
1581 duplicate_node (new_idx, dfa, org_idx, constraint)
1582      re_dfa_t *dfa;
1583      int *new_idx, org_idx;
1584      unsigned int constraint;
1585 {
1586   int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1587   if (BE (dup_idx == -1, 0))
1588     return REG_ESPACE;
1589   dfa->nodes[dup_idx].constraint = constraint;
1590   if (dfa->nodes[org_idx].type == ANCHOR)
1591     dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1592   dfa->nodes[dup_idx].duplicated = 1;
1593
1594   /* Store the index of the original node.  */
1595   dfa->org_indices[dup_idx] = org_idx;
1596   *new_idx = dup_idx;
1597   return REG_NOERROR;
1598 }
1599
1600 static void
1601 calc_inveclosure (dfa)
1602      re_dfa_t *dfa;
1603 {
1604   int src, idx, dest;
1605   for (src = 0; src < dfa->nodes_len; ++src)
1606     {
1607       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1608         {
1609           dest = dfa->eclosures[src].elems[idx];
1610           re_node_set_insert_last (dfa->inveclosures + dest, src);
1611         }
1612     }
1613 }
1614
1615 /* Calculate "eclosure" for all the node in DFA.  */
1616
1617 static reg_errcode_t
1618 calc_eclosure (dfa)
1619      re_dfa_t *dfa;
1620 {
1621   int node_idx, incomplete;
1622 #ifdef DEBUG
1623   assert (dfa->nodes_len > 0);
1624 #endif
1625   incomplete = 0;
1626   /* For each nodes, calculate epsilon closure.  */
1627   for (node_idx = 0; ; ++node_idx)
1628     {
1629       reg_errcode_t err;
1630       re_node_set eclosure_elem;
1631       if (node_idx == dfa->nodes_len)
1632         {
1633           if (!incomplete)
1634             break;
1635           incomplete = 0;
1636           node_idx = 0;
1637         }
1638
1639 #ifdef DEBUG
1640       assert (dfa->eclosures[node_idx].nelem != -1);
1641 #endif
1642
1643       /* If we have already calculated, skip it.  */
1644       if (dfa->eclosures[node_idx].nelem != 0)
1645         continue;
1646       /* Calculate epsilon closure of `node_idx'.  */
1647       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1648       if (BE (err != REG_NOERROR, 0))
1649         return err;
1650
1651       if (dfa->eclosures[node_idx].nelem == 0)
1652         {
1653           incomplete = 1;
1654           re_node_set_free (&eclosure_elem);
1655         }
1656     }
1657   return REG_NOERROR;
1658 }
1659
1660 /* Calculate epsilon closure of NODE.  */
1661
1662 static reg_errcode_t
1663 calc_eclosure_iter (new_set, dfa, node, root)
1664      re_node_set *new_set;
1665      re_dfa_t *dfa;
1666      int node, root;
1667 {
1668   reg_errcode_t err;
1669   unsigned int constraint;
1670   int i, incomplete;
1671   re_node_set eclosure;
1672   incomplete = 0;
1673   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1674   if (BE (err != REG_NOERROR, 0))
1675     return err;
1676
1677   /* This indicates that we are calculating this node now.
1678      We reference this value to avoid infinite loop.  */
1679   dfa->eclosures[node].nelem = -1;
1680
1681   constraint = ((dfa->nodes[node].type == ANCHOR)
1682                 ? dfa->nodes[node].opr.ctx_type : 0);
1683   /* If the current node has constraints, duplicate all nodes.
1684      Since they must inherit the constraints.  */
1685   if (constraint
1686       && dfa->edests[node].nelem
1687       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1688     {
1689       int org_node, cur_node;
1690       org_node = cur_node = node;
1691       err = duplicate_node_closure (dfa, node, node, node, constraint);
1692       if (BE (err != REG_NOERROR, 0))
1693         return err;
1694     }
1695
1696   /* Expand each epsilon destination nodes.  */
1697   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1698     for (i = 0; i < dfa->edests[node].nelem; ++i)
1699       {
1700         re_node_set eclosure_elem;
1701         int edest = dfa->edests[node].elems[i];
1702         /* If calculating the epsilon closure of `edest' is in progress,
1703            return intermediate result.  */
1704         if (dfa->eclosures[edest].nelem == -1)
1705           {
1706             incomplete = 1;
1707             continue;
1708           }
1709         /* If we haven't calculated the epsilon closure of `edest' yet,
1710            calculate now. Otherwise use calculated epsilon closure.  */
1711         if (dfa->eclosures[edest].nelem == 0)
1712           {
1713             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1714             if (BE (err != REG_NOERROR, 0))
1715               return err;
1716           }
1717         else
1718           eclosure_elem = dfa->eclosures[edest];
1719         /* Merge the epsilon closure of `edest'.  */
1720         re_node_set_merge (&eclosure, &eclosure_elem);
1721         /* If the epsilon closure of `edest' is incomplete,
1722            the epsilon closure of this node is also incomplete.  */
1723         if (dfa->eclosures[edest].nelem == 0)
1724           {
1725             incomplete = 1;
1726             re_node_set_free (&eclosure_elem);
1727           }
1728       }
1729
1730   /* Epsilon closures include itself.  */
1731   re_node_set_insert (&eclosure, node);
1732   if (incomplete && !root)
1733     dfa->eclosures[node].nelem = 0;
1734   else
1735     dfa->eclosures[node] = eclosure;
1736   *new_set = eclosure;
1737   return REG_NOERROR;
1738 }
1739 \f
1740 /* Functions for token which are used in the parser.  */
1741
1742 /* Fetch a token from INPUT.
1743    We must not use this function inside bracket expressions.  */
1744
1745 static void
1746 fetch_token (result, input, syntax)
1747      re_token_t *result;
1748      re_string_t *input;
1749      reg_syntax_t syntax;
1750 {
1751   re_string_skip_bytes (input, peek_token (result, input, syntax));
1752 }
1753
1754 /* Peek a token from INPUT, and return the length of the token.
1755    We must not use this function inside bracket expressions.  */
1756
1757 static int
1758 peek_token (token, input, syntax)
1759      re_token_t *token;
1760      re_string_t *input;
1761      reg_syntax_t syntax;
1762 {
1763   unsigned char c;
1764
1765   if (re_string_eoi (input))
1766     {
1767       token->type = END_OF_RE;
1768       return 0;
1769     }
1770
1771   c = re_string_peek_byte (input, 0);
1772   token->opr.c = c;
1773
1774   token->word_char = 0;
1775 #ifdef RE_ENABLE_I18N
1776   token->mb_partial = 0;
1777   if (input->mb_cur_max > 1 &&
1778       !re_string_first_byte (input, re_string_cur_idx (input)))
1779     {
1780       token->type = CHARACTER;
1781       token->mb_partial = 1;
1782       return 1;
1783     }
1784 #endif
1785   if (c == '\\')
1786     {
1787       unsigned char c2;
1788       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1789         {
1790           token->type = BACK_SLASH;
1791           return 1;
1792         }
1793
1794       c2 = re_string_peek_byte_case (input, 1);
1795       token->opr.c = c2;
1796       token->type = CHARACTER;
1797 #ifdef RE_ENABLE_I18N
1798       if (input->mb_cur_max > 1)
1799         {
1800           wint_t wc = re_string_wchar_at (input,
1801                                           re_string_cur_idx (input) + 1);
1802           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1803         }
1804       else
1805 #endif
1806         token->word_char = IS_WORD_CHAR (c2) != 0;
1807
1808       switch (c2)
1809         {
1810         case '|':
1811           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1812             token->type = OP_ALT;
1813           break;
1814         case '1': case '2': case '3': case '4': case '5':
1815         case '6': case '7': case '8': case '9':
1816           if (!(syntax & RE_NO_BK_REFS))
1817             {
1818               token->type = OP_BACK_REF;
1819               token->opr.idx = c2 - '1';
1820             }
1821           break;
1822         case '<':
1823           if (!(syntax & RE_NO_GNU_OPS))
1824             {
1825               token->type = ANCHOR;
1826               token->opr.ctx_type = WORD_FIRST;
1827             }
1828           break;
1829         case '>':
1830           if (!(syntax & RE_NO_GNU_OPS))
1831             {
1832               token->type = ANCHOR;
1833               token->opr.ctx_type = WORD_LAST;
1834             }
1835           break;
1836         case 'b':
1837           if (!(syntax & RE_NO_GNU_OPS))
1838             {
1839               token->type = ANCHOR;
1840               token->opr.ctx_type = WORD_DELIM;
1841             }
1842           break;
1843         case 'B':
1844           if (!(syntax & RE_NO_GNU_OPS))
1845             {
1846               token->type = ANCHOR;
1847               token->opr.ctx_type = NOT_WORD_DELIM;
1848             }
1849           break;
1850         case 'w':
1851           if (!(syntax & RE_NO_GNU_OPS))
1852             token->type = OP_WORD;
1853           break;
1854         case 'W':
1855           if (!(syntax & RE_NO_GNU_OPS))
1856             token->type = OP_NOTWORD;
1857           break;
1858         case 's':
1859           if (!(syntax & RE_NO_GNU_OPS))
1860             token->type = OP_SPACE;
1861           break;
1862         case 'S':
1863           if (!(syntax & RE_NO_GNU_OPS))
1864             token->type = OP_NOTSPACE;
1865           break;
1866         case '`':
1867           if (!(syntax & RE_NO_GNU_OPS))
1868             {
1869               token->type = ANCHOR;
1870               token->opr.ctx_type = BUF_FIRST;
1871             }
1872           break;
1873         case '\'':
1874           if (!(syntax & RE_NO_GNU_OPS))
1875             {
1876               token->type = ANCHOR;
1877               token->opr.ctx_type = BUF_LAST;
1878             }
1879           break;
1880         case '(':
1881           if (!(syntax & RE_NO_BK_PARENS))
1882             token->type = OP_OPEN_SUBEXP;
1883           break;
1884         case ')':
1885           if (!(syntax & RE_NO_BK_PARENS))
1886             token->type = OP_CLOSE_SUBEXP;
1887           break;
1888         case '+':
1889           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1890             token->type = OP_DUP_PLUS;
1891           break;
1892         case '?':
1893           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1894             token->type = OP_DUP_QUESTION;
1895           break;
1896         case '{':
1897           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1898             token->type = OP_OPEN_DUP_NUM;
1899           break;
1900         case '}':
1901           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1902             token->type = OP_CLOSE_DUP_NUM;
1903           break;
1904         default:
1905           break;
1906         }
1907       return 2;
1908     }
1909
1910   token->type = CHARACTER;
1911 #ifdef RE_ENABLE_I18N
1912   if (input->mb_cur_max > 1)
1913     {
1914       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1915       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1916     }
1917   else
1918 #endif
1919     token->word_char = IS_WORD_CHAR (token->opr.c);
1920
1921   switch (c)
1922     {
1923     case '\n':
1924       if (syntax & RE_NEWLINE_ALT)
1925         token->type = OP_ALT;
1926       break;
1927     case '|':
1928       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1929         token->type = OP_ALT;
1930       break;
1931     case '*':
1932       token->type = OP_DUP_ASTERISK;
1933       break;
1934     case '+':
1935       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1936         token->type = OP_DUP_PLUS;
1937       break;
1938     case '?':
1939       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1940         token->type = OP_DUP_QUESTION;
1941       break;
1942     case '{':
1943       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1944         token->type = OP_OPEN_DUP_NUM;
1945       break;
1946     case '}':
1947       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1948         token->type = OP_CLOSE_DUP_NUM;
1949       break;
1950     case '(':
1951       if (syntax & RE_NO_BK_PARENS)
1952         token->type = OP_OPEN_SUBEXP;
1953       break;
1954     case ')':
1955       if (syntax & RE_NO_BK_PARENS)
1956         token->type = OP_CLOSE_SUBEXP;
1957       break;
1958     case '[':
1959       token->type = OP_OPEN_BRACKET;
1960       break;
1961     case '.':
1962       token->type = OP_PERIOD;
1963       break;
1964     case '^':
1965       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1966           re_string_cur_idx (input) != 0)
1967         {
1968           char prev = re_string_peek_byte (input, -1);
1969           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1970             break;
1971         }
1972       token->type = ANCHOR;
1973       token->opr.ctx_type = LINE_FIRST;
1974       break;
1975     case '$':
1976       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1977           re_string_cur_idx (input) + 1 != re_string_length (input))
1978         {
1979           re_token_t next;
1980           re_string_skip_bytes (input, 1);
1981           peek_token (&next, input, syntax);
1982           re_string_skip_bytes (input, -1);
1983           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1984             break;
1985         }
1986       token->type = ANCHOR;
1987       token->opr.ctx_type = LINE_LAST;
1988       break;
1989     default:
1990       break;
1991     }
1992   return 1;
1993 }
1994
1995 /* Peek a token from INPUT, and return the length of the token.
1996    We must not use this function out of bracket expressions.  */
1997
1998 static int
1999 peek_token_bracket (token, input, syntax)
2000      re_token_t *token;
2001      re_string_t *input;
2002      reg_syntax_t syntax;
2003 {
2004   unsigned char c;
2005   if (re_string_eoi (input))
2006     {
2007       token->type = END_OF_RE;
2008       return 0;
2009     }
2010   c = re_string_peek_byte (input, 0);
2011   token->opr.c = c;
2012
2013 #ifdef RE_ENABLE_I18N
2014   if (input->mb_cur_max > 1 &&
2015       !re_string_first_byte (input, re_string_cur_idx (input)))
2016     {
2017       token->type = CHARACTER;
2018       return 1;
2019     }
2020 #endif /* RE_ENABLE_I18N */
2021
2022   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2023       && re_string_cur_idx (input) + 1 < re_string_length (input))
2024     {
2025       /* In this case, '\' escape a character.  */
2026       unsigned char c2;
2027       re_string_skip_bytes (input, 1);
2028       c2 = re_string_peek_byte (input, 0);
2029       token->opr.c = c2;
2030       token->type = CHARACTER;
2031       return 1;
2032     }
2033   if (c == '[') /* '[' is a special char in a bracket exps.  */
2034     {
2035       unsigned char c2;
2036       int token_len;
2037       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2038         c2 = re_string_peek_byte (input, 1);
2039       else
2040         c2 = 0;
2041       token->opr.c = c2;
2042       token_len = 2;
2043       switch (c2)
2044         {
2045         case '.':
2046           token->type = OP_OPEN_COLL_ELEM;
2047           break;
2048         case '=':
2049           token->type = OP_OPEN_EQUIV_CLASS;
2050           break;
2051         case ':':
2052           if (syntax & RE_CHAR_CLASSES)
2053             {
2054               token->type = OP_OPEN_CHAR_CLASS;
2055               break;
2056             }
2057           /* else fall through.  */
2058         default:
2059           token->type = CHARACTER;
2060           token->opr.c = c;
2061           token_len = 1;
2062           break;
2063         }
2064       return token_len;
2065     }
2066   switch (c)
2067     {
2068     case '-':
2069       token->type = OP_CHARSET_RANGE;
2070       break;
2071     case ']':
2072       token->type = OP_CLOSE_BRACKET;
2073       break;
2074     case '^':
2075       token->type = OP_NON_MATCH_LIST;
2076       break;
2077     default:
2078       token->type = CHARACTER;
2079     }
2080   return 1;
2081 }
2082 \f
2083 /* Functions for parser.  */
2084
2085 /* Entry point of the parser.
2086    Parse the regular expression REGEXP and return the structure tree.
2087    If an error is occured, ERR is set by error code, and return NULL.
2088    This function build the following tree, from regular expression <reg_exp>:
2089            CAT
2090            / \
2091           /   \
2092    <reg_exp>  EOR
2093
2094    CAT means concatenation.
2095    EOR means end of regular expression.  */
2096
2097 static bin_tree_t *
2098 parse (regexp, preg, syntax, err)
2099      re_string_t *regexp;
2100      regex_t *preg;
2101      reg_syntax_t syntax;
2102      reg_errcode_t *err;
2103 {
2104   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2105   bin_tree_t *tree, *eor, *root;
2106   re_token_t current_token;
2107   dfa->syntax = syntax;
2108   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2109   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2110   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2111     return NULL;
2112   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2113   if (tree != NULL)
2114     root = create_tree (dfa, tree, eor, CONCAT);
2115   else
2116     root = eor;
2117   if (BE (eor == NULL || root == NULL, 0))
2118     {
2119       *err = REG_ESPACE;
2120       return NULL;
2121     }
2122   return root;
2123 }
2124
2125 /* This function build the following tree, from regular expression
2126    <branch1>|<branch2>:
2127            ALT
2128            / \
2129           /   \
2130    <branch1> <branch2>
2131
2132    ALT means alternative, which represents the operator `|'.  */
2133
2134 static bin_tree_t *
2135 parse_reg_exp (regexp, preg, token, syntax, nest, err)
2136      re_string_t *regexp;
2137      regex_t *preg;
2138      re_token_t *token;
2139      reg_syntax_t syntax;
2140      int nest;
2141      reg_errcode_t *err;
2142 {
2143   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2144   bin_tree_t *tree, *branch = NULL;
2145   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2146   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2147     return NULL;
2148
2149   while (token->type == OP_ALT)
2150     {
2151       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2152       if (token->type != OP_ALT && token->type != END_OF_RE
2153           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2154         {
2155           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2156           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2157             return NULL;
2158         }
2159       else
2160         branch = NULL;
2161       tree = create_tree (dfa, tree, branch, OP_ALT);
2162       if (BE (tree == NULL, 0))
2163         {
2164           *err = REG_ESPACE;
2165           return NULL;
2166         }
2167     }
2168   return tree;
2169 }
2170
2171 /* This function build the following tree, from regular expression
2172    <exp1><exp2>:
2173         CAT
2174         / \
2175        /   \
2176    <exp1> <exp2>
2177
2178    CAT means concatenation.  */
2179
2180 static bin_tree_t *
2181 parse_branch (regexp, preg, token, syntax, nest, err)
2182      re_string_t *regexp;
2183      regex_t *preg;
2184      re_token_t *token;
2185      reg_syntax_t syntax;
2186      int nest;
2187      reg_errcode_t *err;
2188 {
2189   bin_tree_t *tree, *exp;
2190   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2191   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2192   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2193     return NULL;
2194
2195   while (token->type != OP_ALT && token->type != END_OF_RE
2196          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2197     {
2198       exp = parse_expression (regexp, preg, token, syntax, nest, err);
2199       if (BE (*err != REG_NOERROR && exp == NULL, 0))
2200         {
2201           return NULL;
2202         }
2203       if (tree != NULL && exp != NULL)
2204         {
2205           tree = create_tree (dfa, tree, exp, CONCAT);
2206           if (tree == NULL)
2207             {
2208               *err = REG_ESPACE;
2209               return NULL;
2210             }
2211         }
2212       else if (tree == NULL)
2213         tree = exp;
2214       /* Otherwise exp == NULL, we don't need to create new tree.  */
2215     }
2216   return tree;
2217 }
2218
2219 /* This function build the following tree, from regular expression a*:
2220          *
2221          |
2222          a
2223 */
2224
2225 static bin_tree_t *
2226 parse_expression (regexp, preg, token, syntax, nest, err)
2227      re_string_t *regexp;
2228      regex_t *preg;
2229      re_token_t *token;
2230      reg_syntax_t syntax;
2231      int nest;
2232      reg_errcode_t *err;
2233 {
2234   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2235   bin_tree_t *tree;
2236   switch (token->type)
2237     {
2238     case CHARACTER:
2239       tree = create_token_tree (dfa, NULL, NULL, token);
2240       if (BE (tree == NULL, 0))
2241         {
2242           *err = REG_ESPACE;
2243           return NULL;
2244         }
2245 #ifdef RE_ENABLE_I18N
2246       if (dfa->mb_cur_max > 1)
2247         {
2248           while (!re_string_eoi (regexp)
2249                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2250             {
2251               bin_tree_t *mbc_remain;
2252               fetch_token (token, regexp, syntax);
2253               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2254               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2255               if (BE (mbc_remain == NULL || tree == NULL, 0))
2256                 {
2257                   *err = REG_ESPACE;
2258                   return NULL;
2259                 }
2260             }
2261         }
2262 #endif
2263       break;
2264     case OP_OPEN_SUBEXP:
2265       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2266       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2267         return NULL;
2268       break;
2269     case OP_OPEN_BRACKET:
2270       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2271       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2272         return NULL;
2273       break;
2274     case OP_BACK_REF:
2275       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2276         {
2277           *err = REG_ESUBREG;
2278           return NULL;
2279         }
2280       dfa->used_bkref_map |= 1 << token->opr.idx;
2281       tree = create_token_tree (dfa, NULL, NULL, token);
2282       if (BE (tree == NULL, 0))
2283         {
2284           *err = REG_ESPACE;
2285           return NULL;
2286         }
2287       ++dfa->nbackref;
2288       dfa->has_mb_node = 1;
2289       break;
2290     case OP_OPEN_DUP_NUM:
2291       if (syntax & RE_CONTEXT_INVALID_DUP)
2292         {
2293           *err = REG_BADRPT;
2294           return NULL;
2295         }
2296       /* FALLTHROUGH */
2297     case OP_DUP_ASTERISK:
2298     case OP_DUP_PLUS:
2299     case OP_DUP_QUESTION:
2300       if (syntax & RE_CONTEXT_INVALID_OPS)
2301         {
2302           *err = REG_BADRPT;
2303           return NULL;
2304         }
2305       else if (syntax & RE_CONTEXT_INDEP_OPS)
2306         {
2307           fetch_token (token, regexp, syntax);
2308           return parse_expression (regexp, preg, token, syntax, nest, err);
2309         }
2310       /* else fall through  */
2311     case OP_CLOSE_SUBEXP:
2312       if ((token->type == OP_CLOSE_SUBEXP) &&
2313           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2314         {
2315           *err = REG_ERPAREN;
2316           return NULL;
2317         }
2318       /* else fall through  */
2319     case OP_CLOSE_DUP_NUM:
2320       /* We treat it as a normal character.  */
2321
2322       /* Then we can these characters as normal characters.  */
2323       token->type = CHARACTER;
2324       /* mb_partial and word_char bits should be initialized already
2325          by peek_token.  */
2326       tree = create_token_tree (dfa, NULL, NULL, token);
2327       if (BE (tree == NULL, 0))
2328         {
2329           *err = REG_ESPACE;
2330           return NULL;
2331         }
2332       break;
2333     case ANCHOR:
2334       if ((token->opr.ctx_type
2335            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2336           && dfa->word_ops_used == 0)
2337         init_word_char (dfa);
2338       if (token->opr.ctx_type == WORD_DELIM
2339           || token->opr.ctx_type == NOT_WORD_DELIM)
2340         {
2341           bin_tree_t *tree_first, *tree_last;
2342           if (token->opr.ctx_type == WORD_DELIM)
2343             {
2344               token->opr.ctx_type = WORD_FIRST;
2345               tree_first = create_token_tree (dfa, NULL, NULL, token);
2346               token->opr.ctx_type = WORD_LAST;
2347             }
2348           else
2349             {
2350               token->opr.ctx_type = INSIDE_WORD;
2351               tree_first = create_token_tree (dfa, NULL, NULL, token);
2352               token->opr.ctx_type = INSIDE_NOTWORD;
2353             }
2354           tree_last = create_token_tree (dfa, NULL, NULL, token);
2355           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2356           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2357             {
2358               *err = REG_ESPACE;
2359               return NULL;
2360             }
2361         }
2362       else
2363         {
2364           tree = create_token_tree (dfa, NULL, NULL, token);
2365           if (BE (tree == NULL, 0))
2366             {
2367               *err = REG_ESPACE;
2368               return NULL;
2369             }
2370         }
2371       /* We must return here, since ANCHORs can't be followed
2372          by repetition operators.
2373          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2374              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2375       fetch_token (token, regexp, syntax);
2376       return tree;
2377     case OP_PERIOD:
2378       tree = create_token_tree (dfa, NULL, NULL, token);
2379       if (BE (tree == NULL, 0))
2380         {
2381           *err = REG_ESPACE;
2382           return NULL;
2383         }
2384       if (dfa->mb_cur_max > 1)
2385         dfa->has_mb_node = 1;
2386       break;
2387     case OP_WORD:
2388     case OP_NOTWORD:
2389       tree = build_charclass_op (dfa, regexp->trans,
2390                                  (const unsigned char *) "alnum",
2391                                  (const unsigned char *) "_",
2392                                  token->type == OP_NOTWORD, err);
2393       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2394         return NULL;
2395       break;
2396     case OP_SPACE:
2397     case OP_NOTSPACE:
2398       tree = build_charclass_op (dfa, regexp->trans,
2399                                  (const unsigned char *) "space",
2400                                  (const unsigned char *) "",
2401                                  token->type == OP_NOTSPACE, err);
2402       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2403         return NULL;
2404       break;
2405     case OP_ALT:
2406     case END_OF_RE:
2407       return NULL;
2408     case BACK_SLASH:
2409       *err = REG_EESCAPE;
2410       return NULL;
2411     default:
2412       /* Must not happen?  */
2413 #ifdef DEBUG
2414       assert (0);
2415 #endif
2416       return NULL;
2417     }
2418   fetch_token (token, regexp, syntax);
2419
2420   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2421          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2422     {
2423       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2424       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2425         return NULL;
2426       /* In BRE consecutive duplications are not allowed.  */
2427       if ((syntax & RE_CONTEXT_INVALID_DUP)
2428           && (token->type == OP_DUP_ASTERISK
2429               || token->type == OP_OPEN_DUP_NUM))
2430         {
2431           *err = REG_BADRPT;
2432           return NULL;
2433         }
2434     }
2435
2436   return tree;
2437 }
2438
2439 /* This function build the following tree, from regular expression
2440    (<reg_exp>):
2441          SUBEXP
2442             |
2443         <reg_exp>
2444 */
2445
2446 static bin_tree_t *
2447 parse_sub_exp (regexp, preg, token, syntax, nest, err)
2448      re_string_t *regexp;
2449      regex_t *preg;
2450      re_token_t *token;
2451      reg_syntax_t syntax;
2452      int nest;
2453      reg_errcode_t *err;
2454 {
2455   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2456   bin_tree_t *tree;
2457   size_t cur_nsub;
2458   cur_nsub = preg->re_nsub++;
2459
2460   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2461
2462   /* The subexpression may be a null string.  */
2463   if (token->type == OP_CLOSE_SUBEXP)
2464     tree = NULL;
2465   else
2466     {
2467       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2468       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2469         *err = REG_EPAREN;
2470       if (BE (*err != REG_NOERROR, 0))
2471         return NULL;
2472     }
2473   dfa->completed_bkref_map |= 1 << cur_nsub;
2474
2475   tree = create_tree (dfa, tree, NULL, SUBEXP);
2476   if (BE (tree == NULL, 0))
2477     {
2478       *err = REG_ESPACE;
2479       return NULL;
2480     }
2481   tree->token.opr.idx = cur_nsub;
2482   return tree;
2483 }
2484
2485 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2486
2487 static bin_tree_t *
2488 parse_dup_op (elem, regexp, dfa, token, syntax, err)
2489      bin_tree_t *elem;
2490      re_string_t *regexp;
2491      re_dfa_t *dfa;
2492      re_token_t *token;
2493      reg_syntax_t syntax;
2494      reg_errcode_t *err;
2495 {
2496   bin_tree_t *tree = NULL, *old_tree = NULL;
2497   int i, start, end, start_idx = re_string_cur_idx (regexp);
2498   re_token_t start_token = *token;
2499
2500   if (token->type == OP_OPEN_DUP_NUM)
2501     {
2502       end = 0;
2503       start = fetch_number (regexp, token, syntax);
2504       if (start == -1)
2505         {
2506           if (token->type == CHARACTER && token->opr.c == ',')
2507             start = 0; /* We treat "{,m}" as "{0,m}".  */
2508           else
2509             {
2510               *err = REG_BADBR; /* <re>{} is invalid.  */
2511               return NULL;
2512             }
2513         }
2514       if (BE (start != -2, 1))
2515         {
2516           /* We treat "{n}" as "{n,n}".  */
2517           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2518                  : ((token->type == CHARACTER && token->opr.c == ',')
2519                     ? fetch_number (regexp, token, syntax) : -2));
2520         }
2521       if (BE (start == -2 || end == -2, 0))
2522         {
2523           /* Invalid sequence.  */
2524           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2525             {
2526               if (token->type == END_OF_RE)
2527                 *err = REG_EBRACE;
2528               else
2529                 *err = REG_BADBR;
2530
2531               return NULL;
2532             }
2533
2534           /* If the syntax bit is set, rollback.  */
2535           re_string_set_index (regexp, start_idx);
2536           *token = start_token;
2537           token->type = CHARACTER;
2538           /* mb_partial and word_char bits should be already initialized by
2539              peek_token.  */
2540           return elem;
2541         }
2542
2543       if (BE (end != -1 && start > end, 0))
2544         {
2545           /* First number greater than second.  */
2546           *err = REG_BADBR;
2547           return NULL;
2548         }
2549     }
2550   else
2551     {
2552       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2553       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2554     }
2555
2556   fetch_token (token, regexp, syntax);
2557
2558   if (BE (elem == NULL, 0))
2559     return NULL;
2560   if (BE (start == 0 && end == 0, 0))
2561     {
2562       postorder (elem, free_tree, NULL);
2563       return NULL;
2564     }
2565
2566   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2567   if (BE (start > 0, 0))
2568     {
2569       tree = elem;
2570       for (i = 2; i <= start; ++i)
2571         {
2572           elem = duplicate_tree (elem, dfa);
2573           tree = create_tree (dfa, tree, elem, CONCAT);
2574           if (BE (elem == NULL || tree == NULL, 0))
2575             goto parse_dup_op_espace;
2576         }
2577
2578       if (start == end)
2579         return tree;
2580
2581       /* Duplicate ELEM before it is marked optional.  */
2582       elem = duplicate_tree (elem, dfa);
2583       old_tree = tree;
2584     }
2585   else
2586     old_tree = NULL;
2587
2588   if (elem->token.type == SUBEXP)
2589     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2590
2591   tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2592   if (BE (tree == NULL, 0))
2593     goto parse_dup_op_espace;
2594
2595   /* This loop is actually executed only when end != -1,
2596      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2597      already created the start+1-th copy.  */
2598   for (i = start + 2; i <= end; ++i)
2599     {
2600       elem = duplicate_tree (elem, dfa);
2601       tree = create_tree (dfa, tree, elem, CONCAT);
2602       if (BE (elem == NULL || tree == NULL, 0))
2603         goto parse_dup_op_espace;
2604
2605       tree = create_tree (dfa, tree, NULL, OP_ALT);
2606       if (BE (tree == NULL, 0))
2607         goto parse_dup_op_espace;
2608     }
2609
2610   if (old_tree)
2611     tree = create_tree (dfa, old_tree, tree, CONCAT);
2612
2613   return tree;
2614
2615  parse_dup_op_espace:
2616   *err = REG_ESPACE;
2617   return NULL;
2618 }
2619
2620 /* Size of the names for collating symbol/equivalence_class/character_class.
2621    I'm not sure, but maybe enough.  */
2622 #define BRACKET_NAME_BUF_SIZE 32
2623
2624 #ifndef _LIBC
2625   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2626      Build the range expression which starts from START_ELEM, and ends
2627      at END_ELEM.  The result are written to MBCSET and SBCSET.
2628      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2629      mbcset->range_ends, is a pointer argument sinse we may
2630      update it.  */
2631
2632 static reg_errcode_t
2633 # ifdef RE_ENABLE_I18N
2634 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2635      re_charset_t *mbcset;
2636      int *range_alloc;
2637 # else /* not RE_ENABLE_I18N */
2638 build_range_exp (sbcset, start_elem, end_elem)
2639 # endif /* not RE_ENABLE_I18N */
2640      re_bitset_ptr_t sbcset;
2641      bracket_elem_t *start_elem, *end_elem;
2642 {
2643   unsigned int start_ch, end_ch;
2644   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2645   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2646           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2647           0))
2648     return REG_ERANGE;
2649
2650   /* We can handle no multi character collating elements without libc
2651      support.  */
2652   if (BE ((start_elem->type == COLL_SYM
2653            && strlen ((char *) start_elem->opr.name) > 1)
2654           || (end_elem->type == COLL_SYM
2655               && strlen ((char *) end_elem->opr.name) > 1), 0))
2656     return REG_ECOLLATE;
2657
2658 # ifdef RE_ENABLE_I18N
2659   {
2660     wchar_t wc, start_wc, end_wc;
2661     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2662
2663     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2664                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2665                    : 0));
2666     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2667               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2668                  : 0));
2669     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2670                 ? __btowc (start_ch) : start_elem->opr.wch);
2671     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2672               ? __btowc (end_ch) : end_elem->opr.wch);
2673     if (start_wc == WEOF || end_wc == WEOF)
2674       return REG_ECOLLATE;
2675     cmp_buf[0] = start_wc;
2676     cmp_buf[4] = end_wc;
2677     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2678       return REG_ERANGE;
2679
2680     /* Got valid collation sequence values, add them as a new entry.
2681        However, for !_LIBC we have no collation elements: if the
2682        character set is single byte, the single byte character set
2683        that we build below suffices.  parse_bracket_exp passes
2684        no MBCSET if dfa->mb_cur_max == 1.  */
2685     if (mbcset)
2686       {
2687         /* Check the space of the arrays.  */
2688         if (BE (*range_alloc == mbcset->nranges, 0))
2689           {
2690             /* There is not enough space, need realloc.  */
2691             wchar_t *new_array_start, *new_array_end;
2692             int new_nranges;
2693
2694             /* +1 in case of mbcset->nranges is 0.  */
2695             new_nranges = 2 * mbcset->nranges + 1;
2696             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2697                are NULL if *range_alloc == 0.  */
2698             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2699                                           new_nranges);
2700             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2701                                         new_nranges);
2702
2703             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2704               return REG_ESPACE;
2705
2706             mbcset->range_starts = new_array_start;
2707             mbcset->range_ends = new_array_end;
2708             *range_alloc = new_nranges;
2709           }
2710
2711         mbcset->range_starts[mbcset->nranges] = start_wc;
2712         mbcset->range_ends[mbcset->nranges++] = end_wc;
2713       }
2714
2715     /* Build the table for single byte characters.  */
2716     for (wc = 0; wc < SBC_MAX; ++wc)
2717       {
2718         cmp_buf[2] = wc;
2719         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2720             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2721           bitset_set (sbcset, wc);
2722       }
2723   }
2724 # else /* not RE_ENABLE_I18N */
2725   {
2726     unsigned int ch;
2727     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2728                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2729                    : 0));
2730     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2731               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2732                  : 0));
2733     if (start_ch > end_ch)
2734       return REG_ERANGE;
2735     /* Build the table for single byte characters.  */
2736     for (ch = 0; ch < SBC_MAX; ++ch)
2737       if (start_ch <= ch  && ch <= end_ch)
2738         bitset_set (sbcset, ch);
2739   }
2740 # endif /* not RE_ENABLE_I18N */
2741   return REG_NOERROR;
2742 }
2743 #endif /* not _LIBC */
2744
2745 #ifndef _LIBC
2746 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2747    Build the collating element which is represented by NAME.
2748    The result are written to MBCSET and SBCSET.
2749    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2750    pointer argument since we may update it.  */
2751
2752 static reg_errcode_t
2753 # ifdef RE_ENABLE_I18N
2754 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2755      re_charset_t *mbcset;
2756      int *coll_sym_alloc;
2757 # else /* not RE_ENABLE_I18N */
2758 build_collating_symbol (sbcset, name)
2759 # endif /* not RE_ENABLE_I18N */
2760      re_bitset_ptr_t sbcset;
2761      const unsigned char *name;
2762 {
2763   size_t name_len = strlen ((const char *) name);
2764   if (BE (name_len != 1, 0))
2765     return REG_ECOLLATE;
2766   else
2767     {
2768       bitset_set (sbcset, name[0]);
2769       return REG_NOERROR;
2770     }
2771 }
2772 #endif /* not _LIBC */
2773
2774 /* This function parse bracket expression like "[abc]", "[a-c]",
2775    "[[.a-a.]]" etc.  */
2776
2777 static bin_tree_t *
2778 parse_bracket_exp (regexp, dfa, token, syntax, err)
2779      re_string_t *regexp;
2780      re_dfa_t *dfa;
2781      re_token_t *token;
2782      reg_syntax_t syntax;
2783      reg_errcode_t *err;
2784 {
2785 #ifdef _LIBC
2786   const unsigned char *collseqmb;
2787   const char *collseqwc;
2788   uint32_t nrules;
2789   int32_t table_size;
2790   const int32_t *symb_table;
2791   const unsigned char *extra;
2792
2793   /* Local function for parse_bracket_exp used in _LIBC environement.
2794      Seek the collating symbol entry correspondings to NAME.
2795      Return the index of the symbol in the SYMB_TABLE.  */
2796
2797   auto inline int32_t
2798   __attribute ((always_inline))
2799   seek_collating_symbol_entry (name, name_len)
2800          const unsigned char *name;
2801          size_t name_len;
2802     {
2803       int32_t hash = elem_hash ((const char *) name, name_len);
2804       int32_t elem = hash % table_size;
2805       int32_t second = hash % (table_size - 2);
2806       while (symb_table[2 * elem] != 0)
2807         {
2808           /* First compare the hashing value.  */
2809           if (symb_table[2 * elem] == hash
2810               /* Compare the length of the name.  */
2811               && name_len == extra[symb_table[2 * elem + 1]]
2812               /* Compare the name.  */
2813               && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2814                          name_len) == 0)
2815             {
2816               /* Yep, this is the entry.  */
2817               break;
2818             }
2819
2820           /* Next entry.  */
2821           elem += second;
2822         }
2823       return elem;
2824     }
2825
2826   /* Local function for parse_bracket_exp used in _LIBC environement.
2827      Look up the collation sequence value of BR_ELEM.
2828      Return the value if succeeded, UINT_MAX otherwise.  */
2829
2830   auto inline unsigned int
2831   __attribute ((always_inline))
2832   lookup_collation_sequence_value (br_elem)
2833          bracket_elem_t *br_elem;
2834     {
2835       if (br_elem->type == SB_CHAR)
2836         {
2837           /*
2838           if (MB_CUR_MAX == 1)
2839           */
2840           if (nrules == 0)
2841             return collseqmb[br_elem->opr.ch];
2842           else
2843             {
2844               wint_t wc = __btowc (br_elem->opr.ch);
2845               return __collseq_table_lookup (collseqwc, wc);
2846             }
2847         }
2848       else if (br_elem->type == MB_CHAR)
2849         {
2850           return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2851         }
2852       else if (br_elem->type == COLL_SYM)
2853         {
2854           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2855           if (nrules != 0)
2856             {
2857               int32_t elem, idx;
2858               elem = seek_collating_symbol_entry (br_elem->opr.name,
2859                                                   sym_name_len);
2860               if (symb_table[2 * elem] != 0)
2861                 {
2862                   /* We found the entry.  */
2863                   idx = symb_table[2 * elem + 1];
2864                   /* Skip the name of collating element name.  */
2865                   idx += 1 + extra[idx];
2866                   /* Skip the byte sequence of the collating element.  */
2867                   idx += 1 + extra[idx];
2868                   /* Adjust for the alignment.  */
2869                   idx = (idx + 3) & ~3;
2870                   /* Skip the multibyte collation sequence value.  */
2871                   idx += sizeof (unsigned int);
2872                   /* Skip the wide char sequence of the collating element.  */
2873                   idx += sizeof (unsigned int) *
2874                     (1 + *(unsigned int *) (extra + idx));
2875                   /* Return the collation sequence value.  */
2876                   return *(unsigned int *) (extra + idx);
2877                 }
2878               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2879                 {
2880                   /* No valid character.  Match it as a single byte
2881                      character.  */
2882                   return collseqmb[br_elem->opr.name[0]];
2883                 }
2884             }
2885           else if (sym_name_len == 1)
2886             return collseqmb[br_elem->opr.name[0]];
2887         }
2888       return UINT_MAX;
2889     }
2890
2891   /* Local function for parse_bracket_exp used in _LIBC environement.
2892      Build the range expression which starts from START_ELEM, and ends
2893      at END_ELEM.  The result are written to MBCSET and SBCSET.
2894      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2895      mbcset->range_ends, is a pointer argument sinse we may
2896      update it.  */
2897
2898   auto inline reg_errcode_t
2899   __attribute ((always_inline))
2900   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2901          re_charset_t *mbcset;
2902          int *range_alloc;
2903          re_bitset_ptr_t sbcset;
2904          bracket_elem_t *start_elem, *end_elem;
2905     {
2906       unsigned int ch;
2907       uint32_t start_collseq;
2908       uint32_t end_collseq;
2909
2910       /* Equivalence Classes and Character Classes can't be a range
2911          start/end.  */
2912       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2913               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2914               0))
2915         return REG_ERANGE;
2916
2917       start_collseq = lookup_collation_sequence_value (start_elem);
2918       end_collseq = lookup_collation_sequence_value (end_elem);
2919       /* Check start/end collation sequence values.  */
2920       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2921         return REG_ECOLLATE;
2922       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2923         return REG_ERANGE;
2924
2925       /* Got valid collation sequence values, add them as a new entry.
2926          However, if we have no collation elements, and the character set
2927          is single byte, the single byte character set that we
2928          build below suffices. */
2929       if (nrules > 0 || dfa->mb_cur_max > 1)
2930         {
2931           /* Check the space of the arrays.  */
2932           if (BE (*range_alloc == mbcset->nranges, 0))
2933             {
2934               /* There is not enough space, need realloc.  */
2935               uint32_t *new_array_start;
2936               uint32_t *new_array_end;
2937               int new_nranges;
2938
2939               /* +1 in case of mbcset->nranges is 0.  */
2940               new_nranges = 2 * mbcset->nranges + 1;
2941               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2942                                             new_nranges);
2943               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2944                                           new_nranges);
2945
2946               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2947                 return REG_ESPACE;
2948
2949               mbcset->range_starts = new_array_start;
2950               mbcset->range_ends = new_array_end;
2951               *range_alloc = new_nranges;
2952             }
2953
2954           mbcset->range_starts[mbcset->nranges] = start_collseq;
2955           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2956         }
2957
2958       /* Build the table for single byte characters.  */
2959       for (ch = 0; ch < SBC_MAX; ch++)
2960         {
2961           uint32_t ch_collseq;
2962           /*
2963           if (MB_CUR_MAX == 1)
2964           */
2965           if (nrules == 0)
2966             ch_collseq = collseqmb[ch];
2967           else
2968             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2969           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2970             bitset_set (sbcset, ch);
2971         }
2972       return REG_NOERROR;
2973     }
2974
2975   /* Local function for parse_bracket_exp used in _LIBC environement.
2976      Build the collating element which is represented by NAME.
2977      The result are written to MBCSET and SBCSET.
2978      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2979      pointer argument sinse we may update it.  */
2980
2981   auto inline reg_errcode_t
2982   __attribute ((always_inline))
2983   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2984          re_charset_t *mbcset;
2985          int *coll_sym_alloc;
2986          re_bitset_ptr_t sbcset;
2987          const unsigned char *name;
2988     {
2989       int32_t elem, idx;
2990       size_t name_len = strlen ((const char *) name);
2991       if (nrules != 0)
2992         {
2993           elem = seek_collating_symbol_entry (name, name_len);
2994           if (symb_table[2 * elem] != 0)
2995             {
2996               /* We found the entry.  */
2997               idx = symb_table[2 * elem + 1];
2998               /* Skip the name of collating element name.  */
2999               idx += 1 + extra[idx];
3000             }
3001           else if (symb_table[2 * elem] == 0 && name_len == 1)
3002             {
3003               /* No valid character, treat it as a normal
3004                  character.  */
3005               bitset_set (sbcset, name[0]);
3006               return REG_NOERROR;
3007             }
3008           else
3009             return REG_ECOLLATE;
3010
3011           /* Got valid collation sequence, add it as a new entry.  */
3012           /* Check the space of the arrays.  */
3013           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3014             {
3015               /* Not enough, realloc it.  */
3016               /* +1 in case of mbcset->ncoll_syms is 0.  */
3017               int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3018               /* Use realloc since mbcset->coll_syms is NULL
3019                  if *alloc == 0.  */
3020               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3021                                                    new_coll_sym_alloc);
3022               if (BE (new_coll_syms == NULL, 0))
3023                 return REG_ESPACE;
3024               mbcset->coll_syms = new_coll_syms;
3025               *coll_sym_alloc = new_coll_sym_alloc;
3026             }
3027           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3028           return REG_NOERROR;
3029         }
3030       else
3031         {
3032           if (BE (name_len != 1, 0))
3033             return REG_ECOLLATE;
3034           else
3035             {
3036               bitset_set (sbcset, name[0]);
3037               return REG_NOERROR;
3038             }
3039         }
3040     }
3041 #endif
3042
3043   re_token_t br_token;
3044   re_bitset_ptr_t sbcset;
3045 #ifdef RE_ENABLE_I18N
3046   re_charset_t *mbcset;
3047   int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3048   int equiv_class_alloc = 0, char_class_alloc = 0;
3049 #endif /* not RE_ENABLE_I18N */
3050   int non_match = 0;
3051   bin_tree_t *work_tree;
3052   int token_len;
3053   int first_round = 1;
3054 #ifdef _LIBC
3055   collseqmb = (const unsigned char *)
3056     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3057   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3058   if (nrules)
3059     {
3060       /*
3061       if (MB_CUR_MAX > 1)
3062       */
3063         collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3064       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3065       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3066                                                   _NL_COLLATE_SYMB_TABLEMB);
3067       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3068                                                    _NL_COLLATE_SYMB_EXTRAMB);
3069     }
3070 #endif
3071   sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3072 #ifdef RE_ENABLE_I18N
3073   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3074 #endif /* RE_ENABLE_I18N */
3075 #ifdef RE_ENABLE_I18N
3076   if (BE (sbcset == NULL || mbcset == NULL, 0))
3077 #else
3078   if (BE (sbcset == NULL, 0))
3079 #endif /* RE_ENABLE_I18N */
3080     {
3081       *err = REG_ESPACE;
3082       return NULL;
3083     }
3084
3085   token_len = peek_token_bracket (token, regexp, syntax);
3086   if (BE (token->type == END_OF_RE, 0))
3087     {
3088       *err = REG_BADPAT;
3089       goto parse_bracket_exp_free_return;
3090     }
3091   if (token->type == OP_NON_MATCH_LIST)
3092     {
3093 #ifdef RE_ENABLE_I18N
3094       mbcset->non_match = 1;
3095 #endif /* not RE_ENABLE_I18N */
3096       non_match = 1;
3097       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3098         bitset_set (sbcset, '\0');
3099       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3100       token_len = peek_token_bracket (token, regexp, syntax);
3101       if (BE (token->type == END_OF_RE, 0))
3102         {
3103           *err = REG_BADPAT;
3104           goto parse_bracket_exp_free_return;
3105         }
3106     }
3107
3108   /* We treat the first ']' as a normal character.  */
3109   if (token->type == OP_CLOSE_BRACKET)
3110     token->type = CHARACTER;
3111
3112   while (1)
3113     {
3114       bracket_elem_t start_elem, end_elem;
3115       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3116       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3117       reg_errcode_t ret;
3118       int token_len2 = 0, is_range_exp = 0;
3119       re_token_t token2;
3120
3121       start_elem.opr.name = start_name_buf;
3122       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3123                                    syntax, first_round);
3124       if (BE (ret != REG_NOERROR, 0))
3125         {
3126           *err = ret;
3127           goto parse_bracket_exp_free_return;
3128         }
3129       first_round = 0;
3130
3131       /* Get information about the next token.  We need it in any case.  */
3132       token_len = peek_token_bracket (token, regexp, syntax);
3133
3134       /* Do not check for ranges if we know they are not allowed.  */
3135       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3136         {
3137           if (BE (token->type == END_OF_RE, 0))
3138             {
3139               *err = REG_EBRACK;
3140               goto parse_bracket_exp_free_return;
3141             }
3142           if (token->type == OP_CHARSET_RANGE)
3143             {
3144               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3145               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3146               if (BE (token2.type == END_OF_RE, 0))
3147                 {
3148                   *err = REG_EBRACK;
3149                   goto parse_bracket_exp_free_return;
3150                 }
3151               if (token2.type == OP_CLOSE_BRACKET)
3152                 {
3153                   /* We treat the last '-' as a normal character.  */
3154                   re_string_skip_bytes (regexp, -token_len);
3155                   token->type = CHARACTER;
3156                 }
3157               else
3158                 is_range_exp = 1;
3159             }
3160         }
3161
3162       if (is_range_exp == 1)
3163         {
3164           end_elem.opr.name = end_name_buf;
3165           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3166                                        dfa, syntax, 1);
3167           if (BE (ret != REG_NOERROR, 0))
3168             {
3169               *err = ret;
3170               goto parse_bracket_exp_free_return;
3171             }
3172
3173           token_len = peek_token_bracket (token, regexp, syntax);
3174
3175 #ifdef _LIBC
3176           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3177                                   &start_elem, &end_elem);
3178 #else
3179 # ifdef RE_ENABLE_I18N
3180           *err = build_range_exp (sbcset,
3181                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3182                                   &range_alloc, &start_elem, &end_elem);
3183 # else
3184           *err = build_range_exp (sbcset, &start_elem, &end_elem);
3185 # endif
3186 #endif /* RE_ENABLE_I18N */
3187           if (BE (*err != REG_NOERROR, 0))
3188             goto parse_bracket_exp_free_return;
3189         }
3190       else
3191         {
3192           switch (start_elem.type)
3193             {
3194             case SB_CHAR:
3195               bitset_set (sbcset, start_elem.opr.ch);
3196               break;
3197 #ifdef RE_ENABLE_I18N
3198             case MB_CHAR:
3199               /* Check whether the array has enough space.  */
3200               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3201                 {
3202                   wchar_t *new_mbchars;
3203                   /* Not enough, realloc it.  */
3204                   /* +1 in case of mbcset->nmbchars is 0.  */
3205                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3206                   /* Use realloc since array is NULL if *alloc == 0.  */
3207                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3208                                             mbchar_alloc);
3209                   if (BE (new_mbchars == NULL, 0))
3210                     goto parse_bracket_exp_espace;
3211                   mbcset->mbchars = new_mbchars;
3212                 }
3213               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3214               break;
3215 #endif /* RE_ENABLE_I18N */
3216             case EQUIV_CLASS:
3217               *err = build_equiv_class (sbcset,
3218 #ifdef RE_ENABLE_I18N
3219                                         mbcset, &equiv_class_alloc,
3220 #endif /* RE_ENABLE_I18N */
3221                                         start_elem.opr.name);
3222               if (BE (*err != REG_NOERROR, 0))
3223                 goto parse_bracket_exp_free_return;
3224               break;
3225             case COLL_SYM:
3226               *err = build_collating_symbol (sbcset,
3227 #ifdef RE_ENABLE_I18N
3228                                              mbcset, &coll_sym_alloc,
3229 #endif /* RE_ENABLE_I18N */
3230                                              start_elem.opr.name);
3231               if (BE (*err != REG_NOERROR, 0))
3232                 goto parse_bracket_exp_free_return;
3233               break;
3234             case CHAR_CLASS:
3235               *err = build_charclass (regexp->trans, sbcset,
3236 #ifdef RE_ENABLE_I18N
3237                                       mbcset, &char_class_alloc,
3238 #endif /* RE_ENABLE_I18N */
3239                                       start_elem.opr.name, syntax);
3240               if (BE (*err != REG_NOERROR, 0))
3241                goto parse_bracket_exp_free_return;
3242               break;
3243             default:
3244               assert (0);
3245               break;
3246             }
3247         }
3248       if (BE (token->type == END_OF_RE, 0))
3249         {
3250           *err = REG_EBRACK;
3251           goto parse_bracket_exp_free_return;
3252         }
3253       if (token->type == OP_CLOSE_BRACKET)
3254         break;
3255     }
3256
3257   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3258
3259   /* If it is non-matching list.  */
3260   if (non_match)
3261     bitset_not (sbcset);
3262
3263 #ifdef RE_ENABLE_I18N
3264   /* Ensure only single byte characters are set.  */
3265   if (dfa->mb_cur_max > 1)
3266     bitset_mask (sbcset, dfa->sb_char);
3267
3268   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3269       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3270                                                      || mbcset->non_match)))
3271     {
3272       bin_tree_t *mbc_tree;
3273       int sbc_idx;
3274       /* Build a tree for complex bracket.  */
3275       dfa->has_mb_node = 1;
3276       br_token.type = COMPLEX_BRACKET;
3277       br_token.opr.mbcset = mbcset;
3278       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3279       if (BE (mbc_tree == NULL, 0))
3280         goto parse_bracket_exp_espace;
3281       for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx)
3282         if (sbcset[sbc_idx])
3283           break;
3284       /* If there are no bits set in sbcset, there is no point
3285          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3286       if (sbc_idx < BITSET_UINTS)
3287         {
3288           /* Build a tree for simple bracket.  */
3289           br_token.type = SIMPLE_BRACKET;
3290           br_token.opr.sbcset = sbcset;
3291           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3292           if (BE (work_tree == NULL, 0))
3293             goto parse_bracket_exp_espace;
3294
3295           /* Then join them by ALT node.  */
3296           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3297           if (BE (work_tree == NULL, 0))
3298             goto parse_bracket_exp_espace;
3299         }
3300       else
3301         {
3302           re_free (sbcset);
3303           work_tree = mbc_tree;
3304         }
3305     }
3306   else
3307     {
3308       /* Build a tree for simple bracket.  */
3309       br_token.type = SIMPLE_BRACKET;
3310       br_token.opr.sbcset = sbcset;
3311       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3312       if (BE (work_tree == NULL, 0))
3313         goto parse_bracket_exp_espace;
3314
3315       free_charset (mbcset);
3316     }
3317 #endif /* not RE_ENABLE_I18N */
3318   return work_tree;
3319
3320  parse_bracket_exp_espace:
3321   *err = REG_ESPACE;
3322  parse_bracket_exp_free_return:
3323   re_free (sbcset);
3324 #ifdef RE_ENABLE_I18N
3325   free_charset (mbcset);
3326 #endif /* RE_ENABLE_I18N */
3327   return NULL;
3328 }
3329
3330 /* Parse an element in the bracket expression.  */
3331
3332 static reg_errcode_t
3333 parse_bracket_element (elem, regexp, token, token_len, dfa, syntax,
3334                        accept_hyphen)
3335      bracket_elem_t *elem;
3336      re_string_t *regexp;
3337      re_token_t *token;
3338      int token_len;
3339      re_dfa_t *dfa;
3340      reg_syntax_t syntax;
3341      int accept_hyphen;
3342 {
3343 #ifdef RE_ENABLE_I18N
3344   int cur_char_size;
3345   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3346   if (cur_char_size > 1)
3347     {
3348       elem->type = MB_CHAR;
3349       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3350       re_string_skip_bytes (regexp, cur_char_size);
3351       return REG_NOERROR;
3352     }
3353 #endif /* RE_ENABLE_I18N */
3354   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3355   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3356       || token->type == OP_OPEN_EQUIV_CLASS)
3357     return parse_bracket_symbol (elem, regexp, token);
3358   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3359     {
3360       /* A '-' must only appear as anything but a range indicator before
3361          the closing bracket.  Everything else is an error.  */
3362       re_token_t token2;
3363       (void) peek_token_bracket (&token2, regexp, syntax);
3364       if (token2.type != OP_CLOSE_BRACKET)
3365         /* The actual error value is not standardized since this whole
3366            case is undefined.  But ERANGE makes good sense.  */
3367         return REG_ERANGE;
3368     }
3369   elem->type = SB_CHAR;
3370   elem->opr.ch = token->opr.c;
3371   return REG_NOERROR;
3372 }
3373
3374 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3375    such as [:<character_class>:], [.<collating_element>.], and
3376    [=<equivalent_class>=].  */
3377
3378 static reg_errcode_t
3379 parse_bracket_symbol (elem, regexp, token)
3380      bracket_elem_t *elem;
3381      re_string_t *regexp;
3382      re_token_t *token;
3383 {
3384   unsigned char ch, delim = token->opr.c;
3385   int i = 0;
3386   if (re_string_eoi(regexp))
3387     return REG_EBRACK;
3388   for (;; ++i)
3389     {
3390       if (i >= BRACKET_NAME_BUF_SIZE)
3391         return REG_EBRACK;
3392       if (token->type == OP_OPEN_CHAR_CLASS)
3393         ch = re_string_fetch_byte_case (regexp);
3394       else
3395         ch = re_string_fetch_byte (regexp);
3396       if (re_string_eoi(regexp))
3397         return REG_EBRACK;
3398       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3399         break;
3400       elem->opr.name[i] = ch;
3401     }
3402   re_string_skip_bytes (regexp, 1);
3403   elem->opr.name[i] = '\0';
3404   switch (token->type)
3405     {
3406     case OP_OPEN_COLL_ELEM:
3407       elem->type = COLL_SYM;
3408       break;
3409     case OP_OPEN_EQUIV_CLASS:
3410       elem->type = EQUIV_CLASS;
3411       break;
3412     case OP_OPEN_CHAR_CLASS:
3413       elem->type = CHAR_CLASS;
3414       break;
3415     default:
3416       break;
3417     }
3418   return REG_NOERROR;
3419 }
3420
3421   /* Helper function for parse_bracket_exp.
3422      Build the equivalence class which is represented by NAME.
3423      The result are written to MBCSET and SBCSET.
3424      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3425      is a pointer argument sinse we may update it.  */
3426
3427 static reg_errcode_t
3428 #ifdef RE_ENABLE_I18N
3429 build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3430      re_charset_t *mbcset;
3431      int *equiv_class_alloc;
3432 #else /* not RE_ENABLE_I18N */
3433 build_equiv_class (sbcset, name)
3434 #endif /* not RE_ENABLE_I18N */
3435      re_bitset_ptr_t sbcset;
3436      const unsigned char *name;
3437 {
3438 #if defined _LIBC
3439   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3440   if (nrules != 0)
3441     {
3442       const int32_t *table, *indirect;
3443       const unsigned char *weights, *extra, *cp;
3444       unsigned char char_buf[2];
3445       int32_t idx1, idx2;
3446       unsigned int ch;
3447       size_t len;
3448       /* This #include defines a local function!  */
3449 # include <locale/weight.h>
3450       /* Calculate the index for equivalence class.  */
3451       cp = name;
3452       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3453       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3454                                                _NL_COLLATE_WEIGHTMB);
3455       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3456                                                    _NL_COLLATE_EXTRAMB);
3457       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3458                                                 _NL_COLLATE_INDIRECTMB);
3459       idx1 = findidx (&cp);
3460       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3461         /* This isn't a valid character.  */
3462         return REG_ECOLLATE;
3463
3464       /* Build single byte matcing table for this equivalence class.  */
3465       char_buf[1] = (unsigned char) '\0';
3466       len = weights[idx1];
3467       for (ch = 0; ch < SBC_MAX; ++ch)
3468         {
3469           char_buf[0] = ch;
3470           cp = char_buf;
3471           idx2 = findidx (&cp);
3472 /*
3473           idx2 = table[ch];
3474 */
3475           if (idx2 == 0)
3476             /* This isn't a valid character.  */
3477             continue;
3478           if (len == weights[idx2])
3479             {
3480               int cnt = 0;
3481               while (cnt <= len &&
3482                      weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3483                 ++cnt;
3484
3485               if (cnt > len)
3486                 bitset_set (sbcset, ch);
3487             }
3488         }
3489       /* Check whether the array has enough space.  */
3490       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3491         {
3492           /* Not enough, realloc it.  */
3493           /* +1 in case of mbcset->nequiv_classes is 0.  */
3494           int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3495           /* Use realloc since the array is NULL if *alloc == 0.  */
3496           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3497                                                    int32_t,
3498                                                    new_equiv_class_alloc);
3499           if (BE (new_equiv_classes == NULL, 0))
3500             return REG_ESPACE;
3501           mbcset->equiv_classes = new_equiv_classes;
3502           *equiv_class_alloc = new_equiv_class_alloc;
3503         }
3504       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3505     }
3506   else
3507 #endif /* _LIBC */
3508     {
3509       if (BE (strlen ((const char *) name) != 1, 0))
3510         return REG_ECOLLATE;
3511       bitset_set (sbcset, *name);
3512     }
3513   return REG_NOERROR;
3514 }
3515
3516   /* Helper function for parse_bracket_exp.
3517      Build the character class which is represented by NAME.
3518      The result are written to MBCSET and SBCSET.
3519      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3520      is a pointer argument sinse we may update it.  */
3521
3522 static reg_errcode_t
3523 #ifdef RE_ENABLE_I18N
3524 build_charclass (trans, sbcset, mbcset, char_class_alloc, class_name, syntax)
3525      re_charset_t *mbcset;
3526      int *char_class_alloc;
3527 #else /* not RE_ENABLE_I18N */
3528 build_charclass (trans, sbcset, class_name, syntax)
3529 #endif /* not RE_ENABLE_I18N */
3530      unsigned RE_TRANSLATE_TYPE trans;
3531      re_bitset_ptr_t sbcset;
3532      const unsigned char *class_name;
3533      reg_syntax_t syntax;
3534 {
3535   int i;
3536   const char *name = (const char *) class_name;
3537
3538   /* In case of REG_ICASE "upper" and "lower" match the both of
3539      upper and lower cases.  */
3540   if ((syntax & RE_ICASE)
3541       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3542     name = "alpha";
3543
3544 #ifdef RE_ENABLE_I18N
3545   /* Check the space of the arrays.  */
3546   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3547     {
3548       /* Not enough, realloc it.  */
3549       /* +1 in case of mbcset->nchar_classes is 0.  */
3550       int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3551       /* Use realloc since array is NULL if *alloc == 0.  */
3552       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3553                                                new_char_class_alloc);
3554       if (BE (new_char_classes == NULL, 0))
3555         return REG_ESPACE;
3556       mbcset->char_classes = new_char_classes;
3557       *char_class_alloc = new_char_class_alloc;
3558     }
3559   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3560 #endif /* RE_ENABLE_I18N */
3561
3562 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3563     for (i = 0; i < SBC_MAX; ++i)               \
3564       {                                         \
3565         if (ctype_func (i))                     \
3566           {                                     \
3567             int ch = trans ? trans[i] : i;      \
3568             bitset_set (sbcset, ch);            \
3569           }                                     \
3570       }
3571
3572   if (strcmp (name, "alnum") == 0)
3573     BUILD_CHARCLASS_LOOP (isalnum)
3574   else if (strcmp (name, "cntrl") == 0)
3575     BUILD_CHARCLASS_LOOP (iscntrl)
3576   else if (strcmp (name, "lower") == 0)
3577     BUILD_CHARCLASS_LOOP (islower)
3578   else if (strcmp (name, "space") == 0)
3579     BUILD_CHARCLASS_LOOP (isspace)
3580   else if (strcmp (name, "alpha") == 0)
3581     BUILD_CHARCLASS_LOOP (isalpha)
3582   else if (strcmp (name, "digit") == 0)
3583     BUILD_CHARCLASS_LOOP (isdigit)
3584   else if (strcmp (name, "print") == 0)
3585     BUILD_CHARCLASS_LOOP (isprint)
3586   else if (strcmp (name, "upper") == 0)
3587     BUILD_CHARCLASS_LOOP (isupper)
3588   else if (strcmp (name, "blank") == 0)
3589     BUILD_CHARCLASS_LOOP (isblank)
3590   else if (strcmp (name, "graph") == 0)
3591     BUILD_CHARCLASS_LOOP (isgraph)
3592   else if (strcmp (name, "punct") == 0)
3593     BUILD_CHARCLASS_LOOP (ispunct)
3594   else if (strcmp (name, "xdigit") == 0)
3595     BUILD_CHARCLASS_LOOP (isxdigit)
3596   else
3597     return REG_ECTYPE;
3598
3599   return REG_NOERROR;
3600 }
3601
3602 static bin_tree_t *
3603 build_charclass_op (dfa, trans, class_name, extra, non_match, err)
3604      re_dfa_t *dfa;
3605      unsigned RE_TRANSLATE_TYPE trans;
3606      const unsigned char *class_name;
3607      const unsigned char *extra;
3608      int non_match;
3609      reg_errcode_t *err;
3610 {
3611   re_bitset_ptr_t sbcset;
3612 #ifdef RE_ENABLE_I18N
3613   re_charset_t *mbcset;
3614   int alloc = 0;
3615 #endif /* not RE_ENABLE_I18N */
3616   reg_errcode_t ret;
3617   re_token_t br_token;
3618   bin_tree_t *tree;
3619
3620   sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3621 #ifdef RE_ENABLE_I18N
3622   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3623 #endif /* RE_ENABLE_I18N */
3624
3625 #ifdef RE_ENABLE_I18N
3626   if (BE (sbcset == NULL || mbcset == NULL, 0))
3627 #else /* not RE_ENABLE_I18N */
3628   if (BE (sbcset == NULL, 0))
3629 #endif /* not RE_ENABLE_I18N */
3630     {
3631       *err = REG_ESPACE;
3632       return NULL;
3633     }
3634
3635   if (non_match)
3636     {
3637 #ifdef RE_ENABLE_I18N
3638       /*
3639       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3640         bitset_set(cset->sbcset, '\0');
3641       */
3642       mbcset->non_match = 1;
3643 #endif /* not RE_ENABLE_I18N */
3644     }
3645
3646   /* We don't care the syntax in this case.  */
3647   ret = build_charclass (trans, sbcset,
3648 #ifdef RE_ENABLE_I18N
3649                          mbcset, &alloc,
3650 #endif /* RE_ENABLE_I18N */
3651                          class_name, 0);
3652
3653   if (BE (ret != REG_NOERROR, 0))
3654     {
3655       re_free (sbcset);
3656 #ifdef RE_ENABLE_I18N
3657       free_charset (mbcset);
3658 #endif /* RE_ENABLE_I18N */
3659       *err = ret;
3660       return NULL;
3661     }
3662   /* \w match '_' also.  */
3663   for (; *extra; extra++)
3664     bitset_set (sbcset, *extra);
3665
3666   /* If it is non-matching list.  */
3667   if (non_match)
3668     bitset_not (sbcset);
3669
3670 #ifdef RE_ENABLE_I18N
3671   /* Ensure only single byte characters are set.  */
3672   if (dfa->mb_cur_max > 1)
3673     bitset_mask (sbcset, dfa->sb_char);
3674 #endif
3675
3676   /* Build a tree for simple bracket.  */
3677   br_token.type = SIMPLE_BRACKET;
3678   br_token.opr.sbcset = sbcset;
3679   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3680   if (BE (tree == NULL, 0))
3681     goto build_word_op_espace;
3682
3683 #ifdef RE_ENABLE_I18N
3684   if (dfa->mb_cur_max > 1)
3685     {
3686       bin_tree_t *mbc_tree;
3687       /* Build a tree for complex bracket.  */
3688       br_token.type = COMPLEX_BRACKET;
3689       br_token.opr.mbcset = mbcset;
3690       dfa->has_mb_node = 1;
3691       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3692       if (BE (mbc_tree == NULL, 0))
3693         goto build_word_op_espace;
3694       /* Then join them by ALT node.  */
3695       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3696       if (BE (mbc_tree != NULL, 1))
3697         return tree;
3698     }
3699   else
3700     {
3701       free_charset (mbcset);
3702       return tree;
3703     }
3704 #else /* not RE_ENABLE_I18N */
3705   return tree;
3706 #endif /* not RE_ENABLE_I18N */
3707
3708  build_word_op_espace:
3709   re_free (sbcset);
3710 #ifdef RE_ENABLE_I18N
3711   free_charset (mbcset);
3712 #endif /* RE_ENABLE_I18N */
3713   *err = REG_ESPACE;
3714   return NULL;
3715 }
3716
3717 /* This is intended for the expressions like "a{1,3}".
3718    Fetch a number from `input', and return the number.
3719    Return -1, if the number field is empty like "{,1}".
3720    Return -2, If an error is occured.  */
3721
3722 static int
3723 fetch_number (input, token, syntax)
3724      re_string_t *input;
3725      re_token_t *token;
3726      reg_syntax_t syntax;
3727 {
3728   int num = -1;
3729   unsigned char c;
3730   while (1)
3731     {
3732       fetch_token (token, input, syntax);
3733       c = token->opr.c;
3734       if (BE (token->type == END_OF_RE, 0))
3735         return -2;
3736       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3737         break;
3738       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3739              ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3740       num = (num > RE_DUP_MAX) ? -2 : num;
3741     }
3742   return num;
3743 }
3744 \f
3745 #ifdef RE_ENABLE_I18N
3746 static void
3747 free_charset (re_charset_t *cset)
3748 {
3749   re_free (cset->mbchars);
3750 # ifdef _LIBC
3751   re_free (cset->coll_syms);
3752   re_free (cset->equiv_classes);
3753   re_free (cset->range_starts);
3754   re_free (cset->range_ends);
3755 # endif
3756   re_free (cset->char_classes);
3757   re_free (cset);
3758 }
3759 #endif /* RE_ENABLE_I18N */
3760 \f
3761 /* Functions for binary tree operation.  */
3762
3763 /* Create a tree node.  */
3764
3765 static bin_tree_t *
3766 create_tree (dfa, left, right, type)
3767      re_dfa_t *dfa;
3768      bin_tree_t *left;
3769      bin_tree_t *right;
3770      re_token_type_t type;
3771 {
3772   re_token_t t;
3773   t.type = type;
3774   return create_token_tree (dfa, left, right, &t);
3775 }
3776
3777 static bin_tree_t *
3778 create_token_tree (dfa, left, right, token)
3779      re_dfa_t *dfa;
3780      bin_tree_t *left;
3781      bin_tree_t *right;
3782      const re_token_t *token;
3783 {
3784   bin_tree_t *tree;
3785   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3786     {
3787       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3788
3789       if (storage == NULL)
3790         return NULL;
3791       storage->next = dfa->str_tree_storage;
3792       dfa->str_tree_storage = storage;
3793       dfa->str_tree_storage_idx = 0;
3794     }
3795   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3796
3797   tree->parent = NULL;
3798   tree->left = left;
3799   tree->right = right;
3800   tree->token = *token;
3801   tree->token.duplicated = 0;
3802   tree->token.opt_subexp = 0;
3803   tree->first = NULL;
3804   tree->next = NULL;
3805   tree->node_idx = -1;
3806
3807   if (left != NULL)
3808     left->parent = tree;
3809   if (right != NULL)
3810     right->parent = tree;
3811   return tree;
3812 }
3813
3814 /* Mark the tree SRC as an optional subexpression.
3815    To be called from preorder or postorder.  */
3816
3817 static reg_errcode_t
3818 mark_opt_subexp (extra, node)
3819      void *extra;
3820      bin_tree_t *node;
3821 {
3822   int idx = (int) (long) extra;
3823   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3824     node->token.opt_subexp = 1;
3825
3826   return REG_NOERROR;
3827 }
3828
3829 /* Free the allocated memory inside NODE. */
3830
3831 static void
3832 free_token (re_token_t *node)
3833 {
3834 #ifdef RE_ENABLE_I18N
3835   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3836     free_charset (node->opr.mbcset);
3837   else
3838 #endif /* RE_ENABLE_I18N */
3839     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3840       re_free (node->opr.sbcset);
3841 }
3842
3843 /* Worker function for tree walking.  Free the allocated memory inside NODE
3844    and its children. */
3845
3846 static reg_errcode_t
3847 free_tree (void *extra, bin_tree_t *node)
3848 {
3849   free_token (&node->token);
3850   return REG_NOERROR;
3851 }
3852
3853
3854 /* Duplicate the node SRC, and return new node.  This is a preorder
3855    visit similar to the one implemented by the generic visitor, but
3856    we need more infrastructure to maintain two parallel trees --- so,
3857    it's easier to duplicate.  */
3858
3859 static bin_tree_t *
3860 duplicate_tree (root, dfa)
3861      const bin_tree_t *root;
3862      re_dfa_t *dfa;
3863 {
3864   const bin_tree_t *node;
3865   bin_tree_t *dup_root;
3866   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3867
3868   for (node = root; ; )
3869     {
3870       /* Create a new tree and link it back to the current parent.  */
3871       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3872       if (*p_new == NULL)
3873         return NULL;
3874       (*p_new)->parent = dup_node;
3875       (*p_new)->token.duplicated = 1;
3876       dup_node = *p_new;
3877
3878       /* Go to the left node, or up and to the right.  */
3879       if (node->left)
3880         {
3881           node = node->left;
3882           p_new = &dup_node->left;
3883         }
3884       else
3885         {
3886           const bin_tree_t *prev = NULL;
3887           while (node->right == prev || node->right == NULL)
3888             {
3889               prev = node;
3890               node = node->parent;
3891               dup_node = dup_node->parent;
3892               if (!node)
3893                 return dup_root;
3894             }
3895           node = node->right;
3896           p_new = &dup_node->right;
3897         }
3898     }
3899 }