posix/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19    02111-1307 USA.  */
  20
  21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  22                                           int length, reg_syntax_t syntax);
  23 static void re_compile_fastmap_iter (regex_t *bufp,
  24                                      const re_dfastate_t *init_state,
  25                                      char *fastmap);
  26 static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
  27 static reg_errcode_t init_word_char (re_dfa_t *dfa);
  28 #ifdef RE_ENABLE_I18N
  29 static void free_charset (re_charset_t *cset);
  30 #endif /* RE_ENABLE_I18N */
  31 static void free_workarea_compile (regex_t *preg);
  32 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  33 #ifdef RE_ENABLE_I18N
  34 static void optimize_utf8 (re_dfa_t *dfa);
  35 #endif
  36 static reg_errcode_t analyze (re_dfa_t *dfa);
  37 static reg_errcode_t analyze_tree (re_dfa_t *dfa, bin_tree_t *node);
  38 static void calc_first (re_dfa_t *dfa, bin_tree_t *node);
  39 static void calc_next (re_dfa_t *dfa, bin_tree_t *node);
  40 static void calc_epsdest (re_dfa_t *dfa, bin_tree_t *node);
  41 static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
  42                                              int top_clone_node, int root_node,
  43                                              unsigned int constraint);
  44 static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx,
  45                                      unsigned int constraint);
  46 static int search_duplicated_node (re_dfa_t *dfa, int org_node,
  47                                    unsigned int constraint);
  48 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  49 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  50                                          int node, int root);
  51 static void calc_inveclosure (re_dfa_t *dfa);
  52 static int fetch_number (re_string_t *input, re_token_t *token,
  53                          reg_syntax_t syntax);
  54 static re_token_t fetch_token (re_string_t *input, reg_syntax_t syntax);
  55 static int peek_token (re_token_t *token, re_string_t *input,
  56                         reg_syntax_t syntax);
  57 static int peek_token_bracket (re_token_t *token, re_string_t *input,
  58                                reg_syntax_t syntax);
  59 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  60                           reg_syntax_t syntax, reg_errcode_t *err);
  61 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  62                                   re_token_t *token, reg_syntax_t syntax,
  63                                   int nest, reg_errcode_t *err);
  64 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  65                                  re_token_t *token, reg_syntax_t syntax,
  66                                  int nest, reg_errcode_t *err);
  67 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  68                                      re_token_t *token, reg_syntax_t syntax,
  69                                      int nest, reg_errcode_t *err);
  70 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  71                                   re_token_t *token, reg_syntax_t syntax,
  72                                   int nest, reg_errcode_t *err);
  73 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  74                                  re_dfa_t *dfa, re_token_t *token,
  75                                  reg_syntax_t syntax, reg_errcode_t *err);
  76 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  77                                       re_token_t *token, reg_syntax_t syntax,
  78                                       reg_errcode_t *err);
  79 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  80                                             re_string_t *regexp,
  81                                             re_token_t *token, int token_len,
  82                                             re_dfa_t *dfa,
  83                                             reg_syntax_t syntax,
  84                                             int accept_hyphen);
  85 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  86                                           re_string_t *regexp,
  87                                           re_token_t *token);
  88 #ifndef _LIBC
  89 # ifdef RE_ENABLE_I18N
  90 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
  91                                       re_charset_t *mbcset, int *range_alloc,
  92                                       bracket_elem_t *start_elem,
  93                                       bracket_elem_t *end_elem);
  94 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
  95                                              re_charset_t *mbcset,
  96                                              int *coll_sym_alloc,
  97                                              const unsigned char *name);
  98 # else /* not RE_ENABLE_I18N */
  99 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
 100                                       bracket_elem_t *start_elem,
 101                                       bracket_elem_t *end_elem);
 102 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
 103                                              const unsigned char *name);
 104 # endif /* not RE_ENABLE_I18N */
 105 #endif /* not _LIBC */
 106 #ifdef RE_ENABLE_I18N
 107 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
 108                                         re_charset_t *mbcset,
 109                                         int *equiv_class_alloc,
 110                                         const unsigned char *name);
 111 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 112                                       re_bitset_ptr_t sbcset,
 113                                       re_charset_t *mbcset,
 114                                       int *char_class_alloc,
 115                                       const unsigned char *class_name,
 116                                       reg_syntax_t syntax);
 117 #else  /* not RE_ENABLE_I18N */
 118 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
 119                                         const unsigned char *name);
 120 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 121                                       re_bitset_ptr_t sbcset,
 122                                       const unsigned char *class_name,
 123                                       reg_syntax_t syntax);
 124 #endif /* not RE_ENABLE_I18N */
 125 static bin_tree_t *build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
 126                                        const unsigned char *class_name,
 127                                        const unsigned char *extra, int not,
 128                                        reg_errcode_t *err);
 129 static void free_bin_tree (bin_tree_t *tree);
 130 static bin_tree_t *create_tree (bin_tree_t *left, bin_tree_t *right,
 131                                 re_token_type_t type, int index);
 132 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 133 \f
 134 /* This table gives an error message for each of the error codes listed
 135    in regex.h.  Obviously the order here has to be same as there.
 136    POSIX doesn't require that we do anything for REG_NOERROR,
 137    but why not be nice?  */
 138
 139 const char __re_error_msgid[] attribute_hidden =
 140   {
 141 #define REG_NOERROR_IDX 0
 142     gettext_noop ("Success")    /* REG_NOERROR */
 143     "\0"
 144 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 145     gettext_noop ("No match")   /* REG_NOMATCH */
 146     "\0"
 147 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 148     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 149     "\0"
 150 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 151     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 152     "\0"
 153 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 154     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 155     "\0"
 156 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 157     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 158     "\0"
 159 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 160     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 161     "\0"
 162 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 163     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 164     "\0"
 165 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 166     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 167     "\0"
 168 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 169     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 170     "\0"
 171 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 172     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 173     "\0"
 174 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 175     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 176     "\0"
 177 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 178     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 179     "\0"
 180 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 181     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 182     "\0"
 183 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 184     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 185     "\0"
 186 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 187     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 188     "\0"
 189 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 190     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 191   };
 192
 193 const size_t __re_error_msgid_idx[] attribute_hidden =
 194   {
 195     REG_NOERROR_IDX,
 196     REG_NOMATCH_IDX,
 197     REG_BADPAT_IDX,
 198     REG_ECOLLATE_IDX,
 199     REG_ECTYPE_IDX,
 200     REG_EESCAPE_IDX,
 201     REG_ESUBREG_IDX,
 202     REG_EBRACK_IDX,
 203     REG_EPAREN_IDX,
 204     REG_EBRACE_IDX,
 205     REG_BADBR_IDX,
 206     REG_ERANGE_IDX,
 207     REG_ESPACE_IDX,
 208     REG_BADRPT_IDX,
 209     REG_EEND_IDX,
 210     REG_ESIZE_IDX,
 211     REG_ERPAREN_IDX
 212   };
 213 \f
 214 /* Entry points for GNU code.  */
 215
 216 /* re_compile_pattern is the GNU regular expression compiler: it
 217    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 218    Returns 0 if the pattern was valid, otherwise an error string.
 219
 220    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 221    are set in BUFP on entry.  */
 222
 223 const char *
 224 re_compile_pattern (pattern, length, bufp)
 225     const char *pattern;
 226     size_t length;
 227     struct re_pattern_buffer *bufp;
 228 {
 229   reg_errcode_t ret;
 230
 231   /* And GNU code determines whether or not to get register information
 232      by passing null for the REGS argument to re_match, etc., not by
 233      setting no_sub.  */
 234   bufp->no_sub = 0;
 235
 236   /* Match anchors at newline.  */
 237   bufp->newline_anchor = 1;
 238
 239   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 240
 241   if (!ret)
 242     return NULL;
 243   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 244 }
 245 #ifdef _LIBC
 246 weak_alias (__re_compile_pattern, re_compile_pattern)
 247 #endif
 248
 249 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 250    also be assigned to arbitrarily: each pattern buffer stores its own
 251    syntax, so it can be changed between regex compilations.  */
 252 /* This has no initializer because initialized variables in Emacs
 253    become read-only after dumping.  */
 254 reg_syntax_t re_syntax_options;
 255
 256
 257 /* Specify the precise syntax of regexps for compilation.  This provides
 258    for compatibility for various utilities which historically have
 259    different, incompatible syntaxes.
 260
 261    The argument SYNTAX is a bit mask comprised of the various bits
 262    defined in regex.h.  We return the old syntax.  */
 263
 264 reg_syntax_t
 265 re_set_syntax (syntax)
 266     reg_syntax_t syntax;
 267 {
 268   reg_syntax_t ret = re_syntax_options;
 269
 270   re_syntax_options = syntax;
 271   return ret;
 272 }
 273 #ifdef _LIBC
 274 weak_alias (__re_set_syntax, re_set_syntax)
 275 #endif
 276
 277 int
 278 re_compile_fastmap (bufp)
 279     struct re_pattern_buffer *bufp;
 280 {
 281   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 282   char *fastmap = bufp->fastmap;
 283
 284   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 285   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 286   if (dfa->init_state != dfa->init_state_word)
 287     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 288   if (dfa->init_state != dfa->init_state_nl)
 289     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 290   if (dfa->init_state != dfa->init_state_begbuf)
 291     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 292   bufp->fastmap_accurate = 1;
 293   return 0;
 294 }
 295 #ifdef _LIBC
 296 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 297 #endif
 298
 299 static inline void
 300 __attribute ((always_inline))
 301 re_set_fastmap (char *fastmap, int icase, int ch)
 302 {
 303   fastmap[ch] = 1;
 304   if (icase)
 305     fastmap[tolower (ch)] = 1;
 306 }
 307
 308 /* Helper function for re_compile_fastmap.
 309    Compile fastmap for the initial_state INIT_STATE.  */
 310
 311 static void
 312 re_compile_fastmap_iter (bufp, init_state, fastmap)
 313      regex_t *bufp;
 314      const re_dfastate_t *init_state;
 315      char *fastmap;
 316 {
 317   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 318   int node_cnt;
 319   int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 320   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 321     {
 322       int node = init_state->nodes.elems[node_cnt];
 323       re_token_type_t type = dfa->nodes[node].type;
 324
 325       if (type == CHARACTER)
 326         {
 327           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 328 #ifdef RE_ENABLE_I18N
 329           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 330             {
 331               unsigned char *buf = alloca (dfa->mb_cur_max), *p;
 332               wchar_t wc;
 333               mbstate_t state;
 334
 335               p = buf;
 336               *p++ = dfa->nodes[node].opr.c;
 337               while (++node < dfa->nodes_len
 338                      && dfa->nodes[node].type == CHARACTER
 339                      && dfa->nodes[node].mb_partial)
 340                 *p++ = dfa->nodes[node].opr.c;
 341               memset (&state, 0, sizeof (state));
 342               if (mbrtowc (&wc, (const char *) buf, p - buf,
 343                            &state) == p - buf
 344                   && __wcrtomb ((char *) buf, towlower (wc), &state) > 0)
 345                 re_set_fastmap (fastmap, 0, buf[0]);
 346             }
 347 #endif
 348         }
 349       else if (type == SIMPLE_BRACKET)
 350         {
 351           int i, j, ch;
 352           for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 353             for (j = 0; j < UINT_BITS; ++j, ++ch)
 354               if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
 355                 re_set_fastmap (fastmap, icase, ch);
 356         }
 357 #ifdef RE_ENABLE_I18N
 358       else if (type == COMPLEX_BRACKET)
 359         {
 360           int i;
 361           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 362           if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
 363               || cset->nranges || cset->nchar_classes)
 364             {
 365 # ifdef _LIBC
 366               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
 367                 {
 368                   /* In this case we want to catch the bytes which are
 369                      the first byte of any collation elements.
 370                      e.g. In da_DK, we want to catch 'a' since "aa"
 371                           is a valid collation element, and don't catch
 372                           'b' since 'b' is the only collation element
 373                           which starts from 'b'.  */
 374                   int j, ch;
 375                   const int32_t *table = (const int32_t *)
 376                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 377                   for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 378                     for (j = 0; j < UINT_BITS; ++j, ++ch)
 379                       if (table[ch] < 0)
 380                         re_set_fastmap (fastmap, icase, ch);
 381                 }
 382 # else
 383               if (dfa->mb_cur_max > 1)
 384                 for (i = 0; i < SBC_MAX; ++i)
 385                   if (__btowc (i) == WEOF)
 386                     re_set_fastmap (fastmap, icase, i);
 387 # endif /* not _LIBC */
 388             }
 389           for (i = 0; i < cset->nmbchars; ++i)
 390             {
 391               char buf[256];
 392               mbstate_t state;
 393               memset (&state, '\0', sizeof (state));
 394               __wcrtomb (buf, cset->mbchars[i], &state);
 395               re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 396               if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 397                 {
 398                   __wcrtomb (buf, towlower (cset->mbchars[i]), &state);
 399                   re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
 400                 }
 401             }
 402         }
 403 #endif /* RE_ENABLE_I18N */
 404       else if (type == END_OF_RE || type == OP_PERIOD)
 405         {
 406           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 407           if (type == END_OF_RE)
 408             bufp->can_be_null = 1;
 409           return;
 410         }
 411     }
 412 }
 413 \f
 414 /* Entry point for POSIX code.  */
 415 /* regcomp takes a regular expression as a string and compiles it.
 416
 417    PREG is a regex_t *.  We do not expect any fields to be initialized,
 418    since POSIX says we shouldn't.  Thus, we set
 419
 420      `buffer' to the compiled pattern;
 421      `used' to the length of the compiled pattern;
 422      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 423        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 424        RE_SYNTAX_POSIX_BASIC;
 425      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 426      `fastmap' to an allocated space for the fastmap;
 427      `fastmap_accurate' to zero;
 428      `re_nsub' to the number of subexpressions in PATTERN.
 429
 430    PATTERN is the address of the pattern string.
 431
 432    CFLAGS is a series of bits which affect compilation.
 433
 434      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 435      use POSIX basic syntax.
 436
 437      If REG_NEWLINE is set, then . and [^...] don't match newline.
 438      Also, regexec will try a match beginning after every newline.
 439
 440      If REG_ICASE is set, then we considers upper- and lowercase
 441      versions of letters to be equivalent when matching.
 442
 443      If REG_NOSUB is set, then when PREG is passed to regexec, that
 444      routine will report only success or failure, and nothing about the
 445      registers.
 446
 447    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 448    the return codes and their meanings.)  */
 449
 450 int
 451 regcomp (preg, pattern, cflags)
 452     regex_t *__restrict preg;
 453     const char *__restrict pattern;
 454     int cflags;
 455 {
 456   reg_errcode_t ret;
 457   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 458                          : RE_SYNTAX_POSIX_BASIC);
 459
 460   preg->buffer = NULL;
 461   preg->allocated = 0;
 462   preg->used = 0;
 463
 464   /* Try to allocate space for the fastmap.  */
 465   preg->fastmap = re_malloc (char, SBC_MAX);
 466   if (BE (preg->fastmap == NULL, 0))
 467     return REG_ESPACE;
 468
 469   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 470
 471   /* If REG_NEWLINE is set, newlines are treated differently.  */
 472   if (cflags & REG_NEWLINE)
 473     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 474       syntax &= ~RE_DOT_NEWLINE;
 475       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 476       /* It also changes the matching behavior.  */
 477       preg->newline_anchor = 1;
 478     }
 479   else
 480     preg->newline_anchor = 0;
 481   preg->no_sub = !!(cflags & REG_NOSUB);
 482   preg->translate = NULL;
 483
 484   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 485
 486   /* POSIX doesn't distinguish between an unmatched open-group and an
 487      unmatched close-group: both are REG_EPAREN.  */
 488   if (ret == REG_ERPAREN)
 489     ret = REG_EPAREN;
 490
 491   /* We have already checked preg->fastmap != NULL.  */
 492   if (BE (ret == REG_NOERROR, 1))
 493     /* Compute the fastmap now, since regexec cannot modify the pattern
 494        buffer.  This function nevers fails in this implementation.  */
 495     (void) re_compile_fastmap (preg);
 496   else
 497     {
 498       /* Some error occurred while compiling the expression.  */
 499       re_free (preg->fastmap);
 500       preg->fastmap = NULL;
 501     }
 502
 503   return (int) ret;
 504 }
 505 #ifdef _LIBC
 506 weak_alias (__regcomp, regcomp)
 507 #endif
 508
 509 /* Returns a message corresponding to an error code, ERRCODE, returned
 510    from either regcomp or regexec.   We don't use PREG here.  */
 511
 512 size_t
 513 regerror (errcode, preg, errbuf, errbuf_size)
 514     int errcode;
 515     const regex_t *preg;
 516     char *errbuf;
 517     size_t errbuf_size;
 518 {
 519   const char *msg;
 520   size_t msg_size;
 521
 522   if (BE (errcode < 0
 523           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 524                                / sizeof (__re_error_msgid_idx[0])), 0))
 525     /* Only error codes returned by the rest of the code should be passed
 526        to this routine.  If we are given anything else, or if other regex
 527        code generates an invalid error code, then the program has a bug.
 528        Dump core so we can fix it.  */
 529     abort ();
 530
 531   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 532
 533   msg_size = strlen (msg) + 1; /* Includes the null.  */
 534
 535   if (BE (errbuf_size != 0, 1))
 536     {
 537       if (BE (msg_size > errbuf_size, 0))
 538         {
 539 #if defined HAVE_MEMPCPY || defined _LIBC
 540           *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
 541 #else
 542           memcpy (errbuf, msg, errbuf_size - 1);
 543           errbuf[errbuf_size - 1] = 0;
 544 #endif
 545         }
 546       else
 547         memcpy (errbuf, msg, msg_size);
 548     }
 549
 550   return msg_size;
 551 }
 552 #ifdef _LIBC
 553 weak_alias (__regerror, regerror)
 554 #endif
 555
 556
 557 static void
 558 free_dfa_content (re_dfa_t *dfa)
 559 {
 560   int i, j;
 561
 562   re_free (dfa->subexps);
 563
 564   for (i = 0; i < dfa->nodes_len; ++i)
 565     {
 566       re_token_t *node = dfa->nodes + i;
 567 #ifdef RE_ENABLE_I18N
 568       if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
 569         free_charset (node->opr.mbcset);
 570       else
 571 #endif /* RE_ENABLE_I18N */
 572         if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
 573           re_free (node->opr.sbcset);
 574     }
 575   re_free (dfa->nexts);
 576   for (i = 0; i < dfa->nodes_len; ++i)
 577     {
 578       if (dfa->eclosures != NULL)
 579         re_node_set_free (dfa->eclosures + i);
 580       if (dfa->inveclosures != NULL)
 581         re_node_set_free (dfa->inveclosures + i);
 582       if (dfa->edests != NULL)
 583         re_node_set_free (dfa->edests + i);
 584     }
 585   re_free (dfa->edests);
 586   re_free (dfa->eclosures);
 587   re_free (dfa->inveclosures);
 588   re_free (dfa->nodes);
 589
 590   for (i = 0; i <= dfa->state_hash_mask; ++i)
 591     {
 592       struct re_state_table_entry *entry = dfa->state_table + i;
 593       for (j = 0; j < entry->num; ++j)
 594         {
 595           re_dfastate_t *state = entry->array[j];
 596           free_state (state);
 597         }
 598       re_free (entry->array);
 599     }
 600   re_free (dfa->state_table);
 601
 602   if (dfa->word_char != NULL)
 603     re_free (dfa->word_char);
 604 #ifdef DEBUG
 605   re_free (dfa->re_str);
 606 #endif
 607
 608   re_free (dfa);
 609 }
 610
 611
 612 /* Free dynamically allocated space used by PREG.  */
 613
 614 void
 615 regfree (preg)
 616     regex_t *preg;
 617 {
 618   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 619   if (BE (dfa != NULL, 1))
 620     free_dfa_content (dfa);
 621
 622   re_free (preg->fastmap);
 623 }
 624 #ifdef _LIBC
 625 weak_alias (__regfree, regfree)
 626 #endif
 627 \f
 628 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 629    them unless specifically requested.  */
 630
 631 #if defined _REGEX_RE_COMP || defined _LIBC
 632
 633 /* BSD has one and only one pattern buffer.  */
 634 static struct re_pattern_buffer re_comp_buf;
 635
 636 char *
 637 # ifdef _LIBC
 638 /* Make these definitions weak in libc, so POSIX programs can redefine
 639    these names if they don't use our functions, and still use
 640    regcomp/regexec above without link errors.  */
 641 weak_function
 642 # endif
 643 re_comp (s)
 644      const char *s;
 645 {
 646   reg_errcode_t ret;
 647   char *fastmap;
 648
 649   if (!s)
 650     {
 651       if (!re_comp_buf.buffer)
 652         return gettext ("No previous regular expression");
 653       return 0;
 654     }
 655
 656   if (re_comp_buf.buffer)
 657     {
 658       fastmap = re_comp_buf.fastmap;
 659       re_comp_buf.fastmap = NULL;
 660       __regfree (&re_comp_buf);
 661       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 662       re_comp_buf.fastmap = fastmap;
 663     }
 664
 665   if (re_comp_buf.fastmap == NULL)
 666     {
 667       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 668       if (re_comp_buf.fastmap == NULL)
 669         return (char *) gettext (__re_error_msgid
 670                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 671     }
 672
 673   /* Since `re_exec' always passes NULL for the `regs' argument, we
 674      don't need to initialize the pattern buffer fields which affect it.  */
 675
 676   /* Match anchors at newlines.  */
 677   re_comp_buf.newline_anchor = 1;
 678
 679   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 680
 681   if (!ret)
 682     return NULL;
 683
 684   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 685   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 686 }
 687
 688 #ifdef _LIBC
 689 libc_freeres_fn (free_mem)
 690 {
 691   __regfree (&re_comp_buf);
 692 }
 693 #endif
 694
 695 #endif /* _REGEX_RE_COMP */
 696 \f
 697 /* Internal entry point.
 698    Compile the regular expression PATTERN, whose length is LENGTH.
 699    SYNTAX indicate regular expression's syntax.  */
 700
 701 static reg_errcode_t
 702 re_compile_internal (preg, pattern, length, syntax)
 703      regex_t *preg;
 704      const char * pattern;
 705      int length;
 706      reg_syntax_t syntax;
 707 {
 708   reg_errcode_t err = REG_NOERROR;
 709   re_dfa_t *dfa;
 710   re_string_t regexp;
 711
 712   /* Initialize the pattern buffer.  */
 713   preg->fastmap_accurate = 0;
 714   preg->syntax = syntax;
 715   preg->not_bol = preg->not_eol = 0;
 716   preg->used = 0;
 717   preg->re_nsub = 0;
 718   preg->can_be_null = 0;
 719   preg->regs_allocated = REGS_UNALLOCATED;
 720
 721   /* Initialize the dfa.  */
 722   dfa = (re_dfa_t *) preg->buffer;
 723   if (preg->allocated < sizeof (re_dfa_t))
 724     {
 725       /* If zero allocated, but buffer is non-null, try to realloc
 726          enough space.  This loses if buffer's address is bogus, but
 727          that is the user's responsibility.  If ->buffer is NULL this
 728          is a simple allocation.  */
 729       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 730       if (dfa == NULL)
 731         return REG_ESPACE;
 732       preg->allocated = sizeof (re_dfa_t);
 733     }
 734   preg->buffer = (unsigned char *) dfa;
 735   preg->used = sizeof (re_dfa_t);
 736
 737   err = init_dfa (dfa, length);
 738   if (BE (err != REG_NOERROR, 0))
 739     {
 740       re_free (dfa);
 741       preg->buffer = NULL;
 742       preg->allocated = 0;
 743       return err;
 744     }
 745 #ifdef DEBUG
 746   dfa->re_str = re_malloc (char, length + 1);
 747   strncpy (dfa->re_str, pattern, length + 1);
 748 #endif
 749
 750   err = re_string_construct (&regexp, pattern, length, preg->translate,
 751                              syntax & RE_ICASE, dfa->mb_cur_max,
 752                              dfa->is_utf8);
 753   if (BE (err != REG_NOERROR, 0))
 754     {
 755       re_free (dfa);
 756       preg->buffer = NULL;
 757       preg->allocated = 0;
 758       return err;
 759     }
 760
 761   /* Parse the regular expression, and build a structure tree.  */
 762   preg->re_nsub = 0;
 763   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 764   if (BE (dfa->str_tree == NULL, 0))
 765     goto re_compile_internal_free_return;
 766
 767 #ifdef RE_ENABLE_I18N
 768   /* If possible, do searching in single byte encoding to speed things up.  */
 769   if (dfa->is_utf8 && !(syntax & RE_ICASE))
 770     optimize_utf8 (dfa);
 771 #endif
 772
 773   /* Analyze the tree and collect information which is necessary to
 774      create the dfa.  */
 775   err = analyze (dfa);
 776   if (BE (err != REG_NOERROR, 0))
 777     goto re_compile_internal_free_return;
 778
 779   /* Then create the initial state of the dfa.  */
 780   err = create_initial_state (dfa);
 781
 782   /* Release work areas.  */
 783   free_workarea_compile (preg);
 784   re_string_destruct (&regexp);
 785
 786   if (BE (err != REG_NOERROR, 0))
 787     {
 788     re_compile_internal_free_return:
 789       free_dfa_content (dfa);
 790       preg->buffer = NULL;
 791       preg->allocated = 0;
 792     }
 793
 794   return err;
 795 }
 796
 797 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 798    as the initial length of some arrays.  */
 799
 800 static reg_errcode_t
 801 init_dfa (dfa, pat_len)
 802      re_dfa_t *dfa;
 803      int pat_len;
 804 {
 805   int table_size;
 806
 807   memset (dfa, '\0', sizeof (re_dfa_t));
 808
 809   dfa->nodes_alloc = pat_len + 1;
 810   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 811
 812   dfa->states_alloc = pat_len + 1;
 813
 814   /*  table_size = 2 ^ ceil(log pat_len) */
 815   for (table_size = 1; table_size > 0; table_size <<= 1)
 816     if (table_size > pat_len)
 817       break;
 818
 819   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 820   dfa->state_hash_mask = table_size - 1;
 821
 822   dfa->subexps_alloc = 1;
 823   dfa->subexps = re_malloc (re_subexp_t, dfa->subexps_alloc);
 824   dfa->word_char = NULL;
 825
 826   dfa->mb_cur_max = MB_CUR_MAX;
 827 #ifdef _LIBC
 828   if (dfa->mb_cur_max > 1
 829       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 830     dfa->is_utf8 = 1;
 831 #endif
 832
 833   if (BE (dfa->nodes == NULL || dfa->state_table == NULL
 834           || dfa->subexps == NULL, 0))
 835     {
 836       /* We don't bother to free anything which was allocated.  Very
 837          soon the process will go down anyway.  */
 838       dfa->subexps = NULL;
 839       dfa->state_table = NULL;
 840       dfa->nodes = NULL;
 841       return REG_ESPACE;
 842     }
 843   return REG_NOERROR;
 844 }
 845
 846 /* Initialize WORD_CHAR table, which indicate which character is
 847    "word".  In this case "word" means that it is the word construction
 848    character used by some operators like "\<", "\>", etc.  */
 849
 850 static reg_errcode_t
 851 init_word_char (dfa)
 852      re_dfa_t *dfa;
 853 {
 854   int i, j, ch;
 855   dfa->word_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
 856   if (BE (dfa->word_char == NULL, 0))
 857     return REG_ESPACE;
 858   for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 859     for (j = 0; j < UINT_BITS; ++j, ++ch)
 860       if (isalnum (ch) || ch == '_')
 861         dfa->word_char[i] |= 1 << j;
 862   return REG_NOERROR;
 863 }
 864
 865 /* Free the work area which are only used while compiling.  */
 866
 867 static void
 868 free_workarea_compile (preg)
 869      regex_t *preg;
 870 {
 871   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 872   free_bin_tree (dfa->str_tree);
 873   dfa->str_tree = NULL;
 874   re_free (dfa->org_indices);
 875   dfa->org_indices = NULL;
 876 }
 877
 878 /* Create initial states for all contexts.  */
 879
 880 static reg_errcode_t
 881 create_initial_state (dfa)
 882      re_dfa_t *dfa;
 883 {
 884   int first, i;
 885   reg_errcode_t err;
 886   re_node_set init_nodes;
 887
 888   /* Initial states have the epsilon closure of the node which is
 889      the first node of the regular expression.  */
 890   first = dfa->str_tree->first;
 891   dfa->init_node = first;
 892   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
 893   if (BE (err != REG_NOERROR, 0))
 894     return err;
 895
 896   /* The back-references which are in initial states can epsilon transit,
 897      since in this case all of the subexpressions can be null.
 898      Then we add epsilon closures of the nodes which are the next nodes of
 899      the back-references.  */
 900   if (dfa->nbackref > 0)
 901     for (i = 0; i < init_nodes.nelem; ++i)
 902       {
 903         int node_idx = init_nodes.elems[i];
 904         re_token_type_t type = dfa->nodes[node_idx].type;
 905
 906         int clexp_idx;
 907         if (type != OP_BACK_REF)
 908           continue;
 909         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
 910           {
 911             re_token_t *clexp_node;
 912             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
 913             if (clexp_node->type == OP_CLOSE_SUBEXP
 914                 && clexp_node->opr.idx + 1 == dfa->nodes[node_idx].opr.idx)
 915               break;
 916           }
 917         if (clexp_idx == init_nodes.nelem)
 918           continue;
 919
 920         if (type == OP_BACK_REF)
 921           {
 922             int dest_idx = dfa->edests[node_idx].elems[0];
 923             if (!re_node_set_contains (&init_nodes, dest_idx))
 924               {
 925                 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
 926                 i = 0;
 927               }
 928           }
 929       }
 930
 931   /* It must be the first time to invoke acquire_state.  */
 932   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
 933   /* We don't check ERR here, since the initial state must not be NULL.  */
 934   if (BE (dfa->init_state == NULL, 0))
 935     return err;
 936   if (dfa->init_state->has_constraint)
 937     {
 938       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
 939                                                        CONTEXT_WORD);
 940       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
 941                                                      CONTEXT_NEWLINE);
 942       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
 943                                                          &init_nodes,
 944                                                          CONTEXT_NEWLINE
 945                                                          | CONTEXT_BEGBUF);
 946       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
 947               || dfa->init_state_begbuf == NULL, 0))
 948         return err;
 949     }
 950   else
 951     dfa->init_state_word = dfa->init_state_nl
 952       = dfa->init_state_begbuf = dfa->init_state;
 953
 954   re_node_set_free (&init_nodes);
 955   return REG_NOERROR;
 956 }
 957 \f
 958 #ifdef RE_ENABLE_I18N
 959 /* If it is possible to do searching in single byte encoding instead of UTF-8
 960    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
 961    DFA nodes where needed.  */
 962
 963 static void
 964 optimize_utf8 (dfa)
 965      re_dfa_t *dfa;
 966 {
 967   int node;
 968
 969   for (node = 0; node < dfa->nodes_len; ++node)
 970     switch (dfa->nodes[node].type)
 971       {
 972       case CHARACTER:
 973         /* Chars >= 0x80 are optimizable in some cases (e.g. when not
 974            followed by DUP operator, not in bracket etc.).
 975            For now punt on them all.  */
 976         if (dfa->nodes[node].opr.c >= 0x80)
 977           return;
 978         break;
 979       case ANCHOR:
 980         switch (dfa->nodes[node].opr.idx)
 981           {
 982           case LINE_FIRST:
 983           case LINE_LAST:
 984           case BUF_FIRST:
 985           case BUF_LAST:
 986             break;
 987           default:
 988             /* Word anchors etc. cannot be handled.  */
 989             return;
 990           }
 991         break;
 992       case OP_BACK_REF:
 993       case OP_ALT:
 994       case END_OF_RE:
 995       case BACK_SLASH:
 996       case OP_DUP_ASTERISK:
 997       case OP_DUP_QUESTION:
 998       case OP_DUP_PLUS:
 999       case OP_OPEN_SUBEXP:
1000       case OP_CLOSE_SUBEXP:
1001         break;
1002       default:
1003         return;
1004       }
1005
1006   /* The search can be in single byte locale.  */
1007   dfa->mb_cur_max = 1;
1008   dfa->is_utf8 = 0;
1009   dfa->has_mb_node = dfa->nbackref > 0;
1010 }
1011 #endif
1012 \f
1013 /* Analyze the structure tree, and calculate "first", "next", "edest",
1014    "eclosure", and "inveclosure".  */
1015
1016 static reg_errcode_t
1017 analyze (dfa)
1018      re_dfa_t *dfa;
1019 {
1020   int i;
1021   reg_errcode_t ret;
1022
1023   /* Allocate arrays.  */
1024   dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1025   dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1026   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1027   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1028   dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1029   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1030           || dfa->eclosures == NULL || dfa->inveclosures == NULL, 0))
1031     return REG_ESPACE;
1032   /* Initialize them.  */
1033   for (i = 0; i < dfa->nodes_len; ++i)
1034     {
1035       dfa->nexts[i] = -1;
1036       re_node_set_init_empty (dfa->edests + i);
1037       re_node_set_init_empty (dfa->eclosures + i);
1038       re_node_set_init_empty (dfa->inveclosures + i);
1039     }
1040
1041   ret = analyze_tree (dfa, dfa->str_tree);
1042   if (BE (ret == REG_NOERROR, 1))
1043     {
1044       ret = calc_eclosure (dfa);
1045       if (ret == REG_NOERROR)
1046         calc_inveclosure (dfa);
1047     }
1048   return ret;
1049 }
1050
1051 /* Helper functions for analyze.
1052    This function calculate "first", "next", and "edest" for the subtree
1053    whose root is NODE.  */
1054
1055 static reg_errcode_t
1056 analyze_tree (dfa, node)
1057      re_dfa_t *dfa;
1058      bin_tree_t *node;
1059 {
1060   reg_errcode_t ret;
1061   if (node->first == -1)
1062     calc_first (dfa, node);
1063   if (node->next == -1)
1064     calc_next (dfa, node);
1065   if (node->eclosure.nelem == 0)
1066     calc_epsdest (dfa, node);
1067   /* Calculate "first" etc. for the left child.  */
1068   if (node->left != NULL)
1069     {
1070       ret = analyze_tree (dfa, node->left);
1071       if (BE (ret != REG_NOERROR, 0))
1072         return ret;
1073     }
1074   /* Calculate "first" etc. for the right child.  */
1075   if (node->right != NULL)
1076     {
1077       ret = analyze_tree (dfa, node->right);
1078       if (BE (ret != REG_NOERROR, 0))
1079         return ret;
1080     }
1081   return REG_NOERROR;
1082 }
1083
1084 /* Calculate "first" for the node NODE.  */
1085 static void
1086 calc_first (dfa, node)
1087      re_dfa_t *dfa;
1088      bin_tree_t *node;
1089 {
1090   int idx, type;
1091   idx = node->node_idx;
1092   type = (node->type == 0) ? dfa->nodes[idx].type : node->type;
1093
1094   switch (type)
1095     {
1096 #ifdef DEBUG
1097     case OP_OPEN_BRACKET:
1098     case OP_CLOSE_BRACKET:
1099     case OP_OPEN_DUP_NUM:
1100     case OP_CLOSE_DUP_NUM:
1101     case OP_NON_MATCH_LIST:
1102     case OP_OPEN_COLL_ELEM:
1103     case OP_CLOSE_COLL_ELEM:
1104     case OP_OPEN_EQUIV_CLASS:
1105     case OP_CLOSE_EQUIV_CLASS:
1106     case OP_OPEN_CHAR_CLASS:
1107     case OP_CLOSE_CHAR_CLASS:
1108       /* These must not be appeared here.  */
1109       assert (0);
1110 #endif
1111     case END_OF_RE:
1112     case CHARACTER:
1113     case OP_PERIOD:
1114     case OP_DUP_ASTERISK:
1115     case OP_DUP_QUESTION:
1116 #ifdef RE_ENABLE_I18N
1117     case COMPLEX_BRACKET:
1118 #endif /* RE_ENABLE_I18N */
1119     case SIMPLE_BRACKET:
1120     case OP_BACK_REF:
1121     case ANCHOR:
1122     case OP_OPEN_SUBEXP:
1123     case OP_CLOSE_SUBEXP:
1124       node->first = idx;
1125       break;
1126     case OP_DUP_PLUS:
1127 #ifdef DEBUG
1128       assert (node->left != NULL);
1129 #endif
1130       if (node->left->first == -1)
1131         calc_first (dfa, node->left);
1132       node->first = node->left->first;
1133       break;
1134     case OP_ALT:
1135       node->first = idx;
1136       break;
1137       /* else fall through */
1138     default:
1139 #ifdef DEBUG
1140       assert (node->left != NULL);
1141 #endif
1142       if (node->left->first == -1)
1143         calc_first (dfa, node->left);
1144       node->first = node->left->first;
1145       break;
1146     }
1147 }
1148
1149 /* Calculate "next" for the node NODE.  */
1150
1151 static void
1152 calc_next (dfa, node)
1153      re_dfa_t *dfa;
1154      bin_tree_t *node;
1155 {
1156   int idx, type;
1157   bin_tree_t *parent = node->parent;
1158   if (parent == NULL)
1159     {
1160       node->next = -1;
1161       idx = node->node_idx;
1162       if (node->type == 0)
1163         dfa->nexts[idx] = node->next;
1164       return;
1165     }
1166
1167   idx = parent->node_idx;
1168   type = (parent->type == 0) ? dfa->nodes[idx].type : parent->type;
1169
1170   switch (type)
1171     {
1172     case OP_DUP_ASTERISK:
1173     case OP_DUP_PLUS:
1174       node->next = idx;
1175       break;
1176     case CONCAT:
1177       if (parent->left == node)
1178         {
1179           if (parent->right->first == -1)
1180             calc_first (dfa, parent->right);
1181           node->next = parent->right->first;
1182           break;
1183         }
1184       /* else fall through */
1185     default:
1186       if (parent->next == -1)
1187         calc_next (dfa, parent);
1188       node->next = parent->next;
1189       break;
1190     }
1191   idx = node->node_idx;
1192   if (node->type == 0)
1193     dfa->nexts[idx] = node->next;
1194 }
1195
1196 /* Calculate "edest" for the node NODE.  */
1197
1198 static void
1199 calc_epsdest (dfa, node)
1200      re_dfa_t *dfa;
1201      bin_tree_t *node;
1202 {
1203   int idx;
1204   idx = node->node_idx;
1205   if (node->type == 0)
1206     {
1207       if (dfa->nodes[idx].type == OP_DUP_ASTERISK
1208           || dfa->nodes[idx].type == OP_DUP_PLUS
1209           || dfa->nodes[idx].type == OP_DUP_QUESTION)
1210         {
1211           if (node->left->first == -1)
1212             calc_first (dfa, node->left);
1213           if (node->next == -1)
1214             calc_next (dfa, node);
1215           re_node_set_init_2 (dfa->edests + idx, node->left->first,
1216                               node->next);
1217         }
1218       else if (dfa->nodes[idx].type == OP_ALT)
1219         {
1220           int left, right;
1221           if (node->left != NULL)
1222             {
1223               if (node->left->first == -1)
1224                 calc_first (dfa, node->left);
1225               left = node->left->first;
1226             }
1227           else
1228             {
1229               if (node->next == -1)
1230                 calc_next (dfa, node);
1231               left = node->next;
1232             }
1233           if (node->right != NULL)
1234             {
1235               if (node->right->first == -1)
1236                 calc_first (dfa, node->right);
1237               right = node->right->first;
1238             }
1239           else
1240             {
1241               if (node->next == -1)
1242                 calc_next (dfa, node);
1243               right = node->next;
1244             }
1245           re_node_set_init_2 (dfa->edests + idx, left, right);
1246         }
1247       else if (dfa->nodes[idx].type == ANCHOR
1248                || dfa->nodes[idx].type == OP_OPEN_SUBEXP
1249                || dfa->nodes[idx].type == OP_CLOSE_SUBEXP
1250                || dfa->nodes[idx].type == OP_BACK_REF)
1251         re_node_set_init_1 (dfa->edests + idx, node->next);
1252       else
1253         assert (!IS_EPSILON_NODE (dfa->nodes[idx].type));
1254     }
1255 }
1256
1257 /* Duplicate the epsilon closure of the node ROOT_NODE.
1258    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1259    to their own constraint.  */
1260
1261 static reg_errcode_t
1262 duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
1263                         init_constraint)
1264      re_dfa_t *dfa;
1265      int top_org_node, top_clone_node, root_node;
1266      unsigned int init_constraint;
1267 {
1268   reg_errcode_t err;
1269   int org_node, clone_node, ret;
1270   unsigned int constraint = init_constraint;
1271   for (org_node = top_org_node, clone_node = top_clone_node;;)
1272     {
1273       int org_dest, clone_dest;
1274       if (dfa->nodes[org_node].type == OP_BACK_REF)
1275         {
1276           /* If the back reference epsilon-transit, its destination must
1277              also have the constraint.  Then duplicate the epsilon closure
1278              of the destination of the back reference, and store it in
1279              edests of the back reference.  */
1280           org_dest = dfa->nexts[org_node];
1281           re_node_set_empty (dfa->edests + clone_node);
1282           err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1283           if (BE (err != REG_NOERROR, 0))
1284             return err;
1285           dfa->nexts[clone_node] = dfa->nexts[org_node];
1286           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1287           if (BE (ret < 0, 0))
1288             return REG_ESPACE;
1289         }
1290       else if (dfa->edests[org_node].nelem == 0)
1291         {
1292           /* In case of the node can't epsilon-transit, don't duplicate the
1293              destination and store the original destination as the
1294              destination of the node.  */
1295           dfa->nexts[clone_node] = dfa->nexts[org_node];
1296           break;
1297         }
1298       else if (dfa->edests[org_node].nelem == 1)
1299         {
1300           /* In case of the node can epsilon-transit, and it has only one
1301              destination.  */
1302           org_dest = dfa->edests[org_node].elems[0];
1303           re_node_set_empty (dfa->edests + clone_node);
1304           if (dfa->nodes[org_node].type == ANCHOR)
1305             {
1306               /* In case of the node has another constraint, append it.  */
1307               if (org_node == root_node && clone_node != org_node)
1308                 {
1309                   /* ...but if the node is root_node itself, it means the
1310                      epsilon closure have a loop, then tie it to the
1311                      destination of the root_node.  */
1312                   ret = re_node_set_insert (dfa->edests + clone_node,
1313                                             org_dest);
1314                   if (BE (ret < 0, 0))
1315                     return REG_ESPACE;
1316                   break;
1317                 }
1318               constraint |= dfa->nodes[org_node].opr.ctx_type;
1319             }
1320           err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1321           if (BE (err != REG_NOERROR, 0))
1322             return err;
1323           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1324           if (BE (ret < 0, 0))
1325             return REG_ESPACE;
1326         }
1327       else /* dfa->edests[org_node].nelem == 2 */
1328         {
1329           /* In case of the node can epsilon-transit, and it has two
1330              destinations. E.g. '|', '*', '+', '?'.   */
1331           org_dest = dfa->edests[org_node].elems[0];
1332           re_node_set_empty (dfa->edests + clone_node);
1333           /* Search for a duplicated node which satisfies the constraint.  */
1334           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1335           if (clone_dest == -1)
1336             {
1337               /* There are no such a duplicated node, create a new one.  */
1338               err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1339               if (BE (err != REG_NOERROR, 0))
1340                 return err;
1341               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1342               if (BE (ret < 0, 0))
1343                 return REG_ESPACE;
1344               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1345                                             root_node, constraint);
1346               if (BE (err != REG_NOERROR, 0))
1347                 return err;
1348             }
1349           else
1350             {
1351               /* There are a duplicated node which satisfy the constraint,
1352                  use it to avoid infinite loop.  */
1353               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1354               if (BE (ret < 0, 0))
1355                 return REG_ESPACE;
1356             }
1357
1358           org_dest = dfa->edests[org_node].elems[1];
1359           err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1360           if (BE (err != REG_NOERROR, 0))
1361             return err;
1362           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1363           if (BE (ret < 0, 0))
1364             return REG_ESPACE;
1365         }
1366       org_node = org_dest;
1367       clone_node = clone_dest;
1368     }
1369   return REG_NOERROR;
1370 }
1371
1372 /* Search for a node which is duplicated from the node ORG_NODE, and
1373    satisfies the constraint CONSTRAINT.  */
1374
1375 static int
1376 search_duplicated_node (dfa, org_node, constraint)
1377      re_dfa_t *dfa;
1378      int org_node;
1379      unsigned int constraint;
1380 {
1381   int idx;
1382   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1383     {
1384       if (org_node == dfa->org_indices[idx]
1385           && constraint == dfa->nodes[idx].constraint)
1386         return idx; /* Found.  */
1387     }
1388   return -1; /* Not found.  */
1389 }
1390
1391 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1392    The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1393    otherwise return the error code.  */
1394
1395 static reg_errcode_t
1396 duplicate_node (new_idx, dfa, org_idx, constraint)
1397      re_dfa_t *dfa;
1398      int *new_idx, org_idx;
1399      unsigned int constraint;
1400 {
1401   re_token_t dup;
1402   int dup_idx;
1403
1404   dup = dfa->nodes[org_idx];
1405   dup_idx = re_dfa_add_node (dfa, dup, 1);
1406   if (BE (dup_idx == -1, 0))
1407     return REG_ESPACE;
1408   dfa->nodes[dup_idx].constraint = constraint;
1409   if (dfa->nodes[org_idx].type == ANCHOR)
1410     dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1411   dfa->nodes[dup_idx].duplicated = 1;
1412   re_node_set_init_empty (dfa->edests + dup_idx);
1413   re_node_set_init_empty (dfa->eclosures + dup_idx);
1414   re_node_set_init_empty (dfa->inveclosures + dup_idx);
1415
1416   /* Store the index of the original node.  */
1417   dfa->org_indices[dup_idx] = org_idx;
1418   *new_idx = dup_idx;
1419   return REG_NOERROR;
1420 }
1421
1422 static void
1423 calc_inveclosure (dfa)
1424      re_dfa_t *dfa;
1425 {
1426   int src, idx, dest;
1427   for (src = 0; src < dfa->nodes_len; ++src)
1428     {
1429       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1430         {
1431           dest = dfa->eclosures[src].elems[idx];
1432           re_node_set_insert (dfa->inveclosures + dest, src);
1433         }
1434     }
1435 }
1436
1437 /* Calculate "eclosure" for all the node in DFA.  */
1438
1439 static reg_errcode_t
1440 calc_eclosure (dfa)
1441      re_dfa_t *dfa;
1442 {
1443   int node_idx, incomplete;
1444 #ifdef DEBUG
1445   assert (dfa->nodes_len > 0);
1446 #endif
1447   incomplete = 0;
1448   /* For each nodes, calculate epsilon closure.  */
1449   for (node_idx = 0; ; ++node_idx)
1450     {
1451       reg_errcode_t err;
1452       re_node_set eclosure_elem;
1453       if (node_idx == dfa->nodes_len)
1454         {
1455           if (!incomplete)
1456             break;
1457           incomplete = 0;
1458           node_idx = 0;
1459         }
1460
1461 #ifdef DEBUG
1462       assert (dfa->eclosures[node_idx].nelem != -1);
1463 #endif
1464       /* If we have already calculated, skip it.  */
1465       if (dfa->eclosures[node_idx].nelem != 0)
1466         continue;
1467       /* Calculate epsilon closure of `node_idx'.  */
1468       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1469       if (BE (err != REG_NOERROR, 0))
1470         return err;
1471
1472       if (dfa->eclosures[node_idx].nelem == 0)
1473         {
1474           incomplete = 1;
1475           re_node_set_free (&eclosure_elem);
1476         }
1477     }
1478   return REG_NOERROR;
1479 }
1480
1481 /* Calculate epsilon closure of NODE.  */
1482
1483 static reg_errcode_t
1484 calc_eclosure_iter (new_set, dfa, node, root)
1485      re_node_set *new_set;
1486      re_dfa_t *dfa;
1487      int node, root;
1488 {
1489   reg_errcode_t err;
1490   unsigned int constraint;
1491   int i, incomplete;
1492   re_node_set eclosure;
1493   incomplete = 0;
1494   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1495   if (BE (err != REG_NOERROR, 0))
1496     return err;
1497
1498   /* This indicates that we are calculating this node now.
1499      We reference this value to avoid infinite loop.  */
1500   dfa->eclosures[node].nelem = -1;
1501
1502   constraint = ((dfa->nodes[node].type == ANCHOR)
1503                 ? dfa->nodes[node].opr.ctx_type : 0);
1504   /* If the current node has constraints, duplicate all nodes.
1505      Since they must inherit the constraints.  */
1506   if (constraint && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1507     {
1508       int org_node, cur_node;
1509       org_node = cur_node = node;
1510       err = duplicate_node_closure (dfa, node, node, node, constraint);
1511       if (BE (err != REG_NOERROR, 0))
1512         return err;
1513     }
1514
1515   /* Expand each epsilon destination nodes.  */
1516   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1517     for (i = 0; i < dfa->edests[node].nelem; ++i)
1518       {
1519         re_node_set eclosure_elem;
1520         int edest = dfa->edests[node].elems[i];
1521         /* If calculating the epsilon closure of `edest' is in progress,
1522            return intermediate result.  */
1523         if (dfa->eclosures[edest].nelem == -1)
1524           {
1525             incomplete = 1;
1526             continue;
1527           }
1528         /* If we haven't calculated the epsilon closure of `edest' yet,
1529            calculate now. Otherwise use calculated epsilon closure.  */
1530         if (dfa->eclosures[edest].nelem == 0)
1531           {
1532             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1533             if (BE (err != REG_NOERROR, 0))
1534               return err;
1535           }
1536         else
1537           eclosure_elem = dfa->eclosures[edest];
1538         /* Merge the epsilon closure of `edest'.  */
1539         re_node_set_merge (&eclosure, &eclosure_elem);
1540         /* If the epsilon closure of `edest' is incomplete,
1541            the epsilon closure of this node is also incomplete.  */
1542         if (dfa->eclosures[edest].nelem == 0)
1543           {
1544             incomplete = 1;
1545             re_node_set_free (&eclosure_elem);
1546           }
1547       }
1548
1549   /* Epsilon closures include itself.  */
1550   re_node_set_insert (&eclosure, node);
1551   if (incomplete && !root)
1552     dfa->eclosures[node].nelem = 0;
1553   else
1554     dfa->eclosures[node] = eclosure;
1555   *new_set = eclosure;
1556   return REG_NOERROR;
1557 }
1558 \f
1559 /* Functions for token which are used in the parser.  */
1560
1561 /* Fetch a token from INPUT.
1562    We must not use this function inside bracket expressions.  */
1563
1564 static re_token_t
1565 fetch_token (input, syntax)
1566      re_string_t *input;
1567      reg_syntax_t syntax;
1568 {
1569   re_token_t token;
1570   int consumed_byte;
1571   consumed_byte = peek_token (&token, input, syntax);
1572   re_string_skip_bytes (input, consumed_byte);
1573   return token;
1574 }
1575
1576 /* Peek a token from INPUT, and return the length of the token.
1577    We must not use this function inside bracket expressions.  */
1578
1579 static int
1580 peek_token (token, input, syntax)
1581      re_token_t *token;
1582      re_string_t *input;
1583      reg_syntax_t syntax;
1584 {
1585   unsigned char c;
1586
1587   if (re_string_eoi (input))
1588     {
1589       token->type = END_OF_RE;
1590       return 0;
1591     }
1592
1593   c = re_string_peek_byte (input, 0);
1594   token->opr.c = c;
1595
1596 #ifdef RE_ENABLE_I18N
1597   token->mb_partial = 0;
1598   if (input->mb_cur_max > 1 &&
1599       !re_string_first_byte (input, re_string_cur_idx (input)))
1600     {
1601       token->type = CHARACTER;
1602       token->mb_partial = 1;
1603       return 1;
1604     }
1605 #endif
1606   if (c == '\\')
1607     {
1608       unsigned char c2;
1609       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1610         {
1611           token->type = BACK_SLASH;
1612           return 1;
1613         }
1614
1615       c2 = re_string_peek_byte_case (input, 1);
1616       token->opr.c = c2;
1617       token->type = CHARACTER;
1618       switch (c2)
1619         {
1620         case '|':
1621           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1622             token->type = OP_ALT;
1623           break;
1624         case '1': case '2': case '3': case '4': case '5':
1625         case '6': case '7': case '8': case '9':
1626           if (!(syntax & RE_NO_BK_REFS))
1627             {
1628               token->type = OP_BACK_REF;
1629               token->opr.idx = c2 - '0';
1630             }
1631           break;
1632         case '<':
1633           if (!(syntax & RE_NO_GNU_OPS))
1634             {
1635               token->type = ANCHOR;
1636               token->opr.idx = WORD_FIRST;
1637             }
1638           break;
1639         case '>':
1640           if (!(syntax & RE_NO_GNU_OPS))
1641             {
1642               token->type = ANCHOR;
1643               token->opr.idx = WORD_LAST;
1644             }
1645           break;
1646         case 'b':
1647           if (!(syntax & RE_NO_GNU_OPS))
1648             {
1649               token->type = ANCHOR;
1650               token->opr.idx = WORD_DELIM;
1651             }
1652           break;
1653         case 'B':
1654           if (!(syntax & RE_NO_GNU_OPS))
1655             {
1656               token->type = ANCHOR;
1657               token->opr.idx = INSIDE_WORD;
1658             }
1659           break;
1660         case 'w':
1661           if (!(syntax & RE_NO_GNU_OPS))
1662             token->type = OP_WORD;
1663           break;
1664         case 'W':
1665           if (!(syntax & RE_NO_GNU_OPS))
1666             token->type = OP_NOTWORD;
1667           break;
1668         case 's':
1669           if (!(syntax & RE_NO_GNU_OPS))
1670             token->type = OP_SPACE;
1671           break;
1672         case 'S':
1673           if (!(syntax & RE_NO_GNU_OPS))
1674             token->type = OP_NOTSPACE;
1675           break;
1676         case '`':
1677           if (!(syntax & RE_NO_GNU_OPS))
1678             {
1679               token->type = ANCHOR;
1680               token->opr.idx = BUF_FIRST;
1681             }
1682           break;
1683         case '\'':
1684           if (!(syntax & RE_NO_GNU_OPS))
1685             {
1686               token->type = ANCHOR;
1687               token->opr.idx = BUF_LAST;
1688             }
1689           break;
1690         case '(':
1691           if (!(syntax & RE_NO_BK_PARENS))
1692             token->type = OP_OPEN_SUBEXP;
1693           break;
1694         case ')':
1695           if (!(syntax & RE_NO_BK_PARENS))
1696             token->type = OP_CLOSE_SUBEXP;
1697           break;
1698         case '+':
1699           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1700             token->type = OP_DUP_PLUS;
1701           break;
1702         case '?':
1703           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1704             token->type = OP_DUP_QUESTION;
1705           break;
1706         case '{':
1707           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1708             token->type = OP_OPEN_DUP_NUM;
1709           break;
1710         case '}':
1711           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1712             token->type = OP_CLOSE_DUP_NUM;
1713           break;
1714         default:
1715           break;
1716         }
1717       return 2;
1718     }
1719
1720   token->type = CHARACTER;
1721   switch (c)
1722     {
1723     case '\n':
1724       if (syntax & RE_NEWLINE_ALT)
1725         token->type = OP_ALT;
1726       break;
1727     case '|':
1728       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1729         token->type = OP_ALT;
1730       break;
1731     case '*':
1732       token->type = OP_DUP_ASTERISK;
1733       break;
1734     case '+':
1735       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1736         token->type = OP_DUP_PLUS;
1737       break;
1738     case '?':
1739       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1740         token->type = OP_DUP_QUESTION;
1741       break;
1742     case '{':
1743       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1744         token->type = OP_OPEN_DUP_NUM;
1745       break;
1746     case '}':
1747       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1748         token->type = OP_CLOSE_DUP_NUM;
1749       break;
1750     case '(':
1751       if (syntax & RE_NO_BK_PARENS)
1752         token->type = OP_OPEN_SUBEXP;
1753       break;
1754     case ')':
1755       if (syntax & RE_NO_BK_PARENS)
1756         token->type = OP_CLOSE_SUBEXP;
1757       break;
1758     case '[':
1759       token->type = OP_OPEN_BRACKET;
1760       break;
1761     case '.':
1762       token->type = OP_PERIOD;
1763       break;
1764     case '^':
1765       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1766           re_string_cur_idx (input) != 0)
1767         {
1768           char prev = re_string_peek_byte (input, -1);
1769           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1770             break;
1771         }
1772       token->type = ANCHOR;
1773       token->opr.idx = LINE_FIRST;
1774       break;
1775     case '$':
1776       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1777           re_string_cur_idx (input) + 1 != re_string_length (input))
1778         {
1779           re_token_t next;
1780           re_string_skip_bytes (input, 1);
1781           peek_token (&next, input, syntax);
1782           re_string_skip_bytes (input, -1);
1783           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1784             break;
1785         }
1786       token->type = ANCHOR;
1787       token->opr.idx = LINE_LAST;
1788       break;
1789     default:
1790       break;
1791     }
1792   return 1;
1793 }
1794
1795 /* Peek a token from INPUT, and return the length of the token.
1796    We must not use this function out of bracket expressions.  */
1797
1798 static int
1799 peek_token_bracket (token, input, syntax)
1800      re_token_t *token;
1801      re_string_t *input;
1802      reg_syntax_t syntax;
1803 {
1804   unsigned char c;
1805   if (re_string_eoi (input))
1806     {
1807       token->type = END_OF_RE;
1808       return 0;
1809     }
1810   c = re_string_peek_byte (input, 0);
1811   token->opr.c = c;
1812
1813 #ifdef RE_ENABLE_I18N
1814   if (input->mb_cur_max > 1 &&
1815       !re_string_first_byte (input, re_string_cur_idx (input)))
1816     {
1817       token->type = CHARACTER;
1818       return 1;
1819     }
1820 #endif /* RE_ENABLE_I18N */
1821
1822   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS))
1823     {
1824       /* In this case, '\' escape a character.  */
1825       unsigned char c2;
1826       re_string_skip_bytes (input, 1);
1827       c2 = re_string_peek_byte (input, 0);
1828       token->opr.c = c2;
1829       token->type = CHARACTER;
1830       return 1;
1831     }
1832   if (c == '[') /* '[' is a special char in a bracket exps.  */
1833     {
1834       unsigned char c2;
1835       int token_len;
1836       c2 = re_string_peek_byte (input, 1);
1837       token->opr.c = c2;
1838       token_len = 2;
1839       switch (c2)
1840         {
1841         case '.':
1842           token->type = OP_OPEN_COLL_ELEM;
1843           break;
1844         case '=':
1845           token->type = OP_OPEN_EQUIV_CLASS;
1846           break;
1847         case ':':
1848           if (syntax & RE_CHAR_CLASSES)
1849             {
1850               token->type = OP_OPEN_CHAR_CLASS;
1851               break;
1852             }
1853           /* else fall through.  */
1854         default:
1855           token->type = CHARACTER;
1856           token->opr.c = c;
1857           token_len = 1;
1858           break;
1859         }
1860       return token_len;
1861     }
1862   switch (c)
1863     {
1864     case '-':
1865       token->type = OP_CHARSET_RANGE;
1866       break;
1867     case ']':
1868       token->type = OP_CLOSE_BRACKET;
1869       break;
1870     case '^':
1871       token->type = OP_NON_MATCH_LIST;
1872       break;
1873     default:
1874       token->type = CHARACTER;
1875     }
1876   return 1;
1877 }
1878 \f
1879 /* Functions for parser.  */
1880
1881 /* Entry point of the parser.
1882    Parse the regular expression REGEXP and return the structure tree.
1883    If an error is occured, ERR is set by error code, and return NULL.
1884    This function build the following tree, from regular expression <reg_exp>:
1885            CAT
1886            / \
1887           /   \
1888    <reg_exp>  EOR
1889
1890    CAT means concatenation.
1891    EOR means end of regular expression.  */
1892
1893 static bin_tree_t *
1894 parse (regexp, preg, syntax, err)
1895      re_string_t *regexp;
1896      regex_t *preg;
1897      reg_syntax_t syntax;
1898      reg_errcode_t *err;
1899 {
1900   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1901   bin_tree_t *tree, *eor, *root;
1902   re_token_t current_token;
1903   int new_idx;
1904   current_token = fetch_token (regexp, syntax | RE_CARET_ANCHORS_HERE);
1905   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
1906   if (BE (*err != REG_NOERROR && tree == NULL, 0))
1907     return NULL;
1908   new_idx = re_dfa_add_node (dfa, current_token, 0);
1909   eor = create_tree (NULL, NULL, 0, new_idx);
1910   if (tree != NULL)
1911     root = create_tree (tree, eor, CONCAT, 0);
1912   else
1913     root = eor;
1914   if (BE (new_idx == -1 || eor == NULL || root == NULL, 0))
1915     {
1916       *err = REG_ESPACE;
1917       return NULL;
1918     }
1919   return root;
1920 }
1921
1922 /* This function build the following tree, from regular expression
1923    <branch1>|<branch2>:
1924            ALT
1925            / \
1926           /   \
1927    <branch1> <branch2>
1928
1929    ALT means alternative, which represents the operator `|'.  */
1930
1931 static bin_tree_t *
1932 parse_reg_exp (regexp, preg, token, syntax, nest, err)
1933      re_string_t *regexp;
1934      regex_t *preg;
1935      re_token_t *token;
1936      reg_syntax_t syntax;
1937      int nest;
1938      reg_errcode_t *err;
1939 {
1940   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1941   bin_tree_t *tree, *branch = NULL;
1942   int new_idx;
1943   tree = parse_branch (regexp, preg, token, syntax, nest, err);
1944   if (BE (*err != REG_NOERROR && tree == NULL, 0))
1945     return NULL;
1946
1947   while (token->type == OP_ALT)
1948     {
1949       re_token_t alt_token = *token;
1950       new_idx = re_dfa_add_node (dfa, alt_token, 0);
1951       *token = fetch_token (regexp, syntax | RE_CARET_ANCHORS_HERE);
1952       if (token->type != OP_ALT && token->type != END_OF_RE
1953           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
1954         {
1955           branch = parse_branch (regexp, preg, token, syntax, nest, err);
1956           if (BE (*err != REG_NOERROR && branch == NULL, 0))
1957             {
1958               free_bin_tree (tree);
1959               return NULL;
1960             }
1961         }
1962       else
1963         branch = NULL;
1964       tree = create_tree (tree, branch, 0, new_idx);
1965       if (BE (new_idx == -1 || tree == NULL, 0))
1966         {
1967           *err = REG_ESPACE;
1968           return NULL;
1969         }
1970       dfa->has_plural_match = 1;
1971     }
1972   return tree;
1973 }
1974
1975 /* This function build the following tree, from regular expression
1976    <exp1><exp2>:
1977         CAT
1978         / \
1979        /   \
1980    <exp1> <exp2>
1981
1982    CAT means concatenation.  */
1983
1984 static bin_tree_t *
1985 parse_branch (regexp, preg, token, syntax, nest, err)
1986      re_string_t *regexp;
1987      regex_t *preg;
1988      re_token_t *token;
1989      reg_syntax_t syntax;
1990      int nest;
1991      reg_errcode_t *err;
1992 {
1993   bin_tree_t *tree, *exp;
1994   tree = parse_expression (regexp, preg, token, syntax, nest, err);
1995   if (BE (*err != REG_NOERROR && tree == NULL, 0))
1996     return NULL;
1997
1998   while (token->type != OP_ALT && token->type != END_OF_RE
1999          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2000     {
2001       exp = parse_expression (regexp, preg, token, syntax, nest, err);
2002       if (BE (*err != REG_NOERROR && exp == NULL, 0))
2003         {
2004           free_bin_tree (tree);
2005           return NULL;
2006         }
2007       if (tree != NULL && exp != NULL)
2008         {
2009           tree = create_tree (tree, exp, CONCAT, 0);
2010           if (tree == NULL)
2011             {
2012               *err = REG_ESPACE;
2013               return NULL;
2014             }
2015         }
2016       else if (tree == NULL)
2017         tree = exp;
2018       /* Otherwise exp == NULL, we don't need to create new tree.  */
2019     }
2020   return tree;
2021 }
2022
2023 /* This function build the following tree, from regular expression a*:
2024          *
2025          |
2026          a
2027 */
2028
2029 static bin_tree_t *
2030 parse_expression (regexp, preg, token, syntax, nest, err)
2031      re_string_t *regexp;
2032      regex_t *preg;
2033      re_token_t *token;
2034      reg_syntax_t syntax;
2035      int nest;
2036      reg_errcode_t *err;
2037 {
2038   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2039   bin_tree_t *tree;
2040   int new_idx;
2041   switch (token->type)
2042     {
2043     case CHARACTER:
2044       new_idx = re_dfa_add_node (dfa, *token, 0);
2045       tree = create_tree (NULL, NULL, 0, new_idx);
2046       if (BE (new_idx == -1 || tree == NULL, 0))
2047         {
2048           *err = REG_ESPACE;
2049           return NULL;
2050         }
2051 #ifdef RE_ENABLE_I18N
2052       if (dfa->mb_cur_max > 1)
2053         {
2054           while (!re_string_eoi (regexp)
2055                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2056             {
2057               bin_tree_t *mbc_remain;
2058               *token = fetch_token (regexp, syntax);
2059               new_idx = re_dfa_add_node (dfa, *token, 0);
2060               mbc_remain = create_tree (NULL, NULL, 0, new_idx);
2061               tree = create_tree (tree, mbc_remain, CONCAT, 0);
2062               if (BE (new_idx == -1 || mbc_remain == NULL || tree == NULL, 0))
2063                 {
2064                   *err = REG_ESPACE;
2065                   return NULL;
2066                 }
2067             }
2068         }
2069 #endif
2070       break;
2071     case OP_OPEN_SUBEXP:
2072       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2073       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2074         return NULL;
2075       break;
2076     case OP_OPEN_BRACKET:
2077       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2078       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2079         return NULL;
2080       break;
2081     case OP_BACK_REF:
2082       if (BE (preg->re_nsub < token->opr.idx
2083               || dfa->subexps[token->opr.idx - 1].end == -1, 0))
2084         {
2085           *err = REG_ESUBREG;
2086           return NULL;
2087         }
2088       dfa->used_bkref_map |= 1 << (token->opr.idx - 1);
2089       new_idx = re_dfa_add_node (dfa, *token, 0);
2090       tree = create_tree (NULL, NULL, 0, new_idx);
2091       if (BE (new_idx == -1 || tree == NULL, 0))
2092         {
2093           *err = REG_ESPACE;
2094           return NULL;
2095         }
2096       ++dfa->nbackref;
2097       dfa->has_mb_node = 1;
2098       break;
2099     case OP_OPEN_DUP_NUM:
2100       if (syntax & RE_CONTEXT_INVALID_DUP)
2101         {
2102           *err = REG_BADRPT;
2103           return NULL;
2104         }
2105       /* FALLTHROUGH */
2106     case OP_DUP_ASTERISK:
2107     case OP_DUP_PLUS:
2108     case OP_DUP_QUESTION:
2109       if (syntax & RE_CONTEXT_INVALID_OPS)
2110         {
2111           *err = REG_BADRPT;
2112           return NULL;
2113         }
2114       else if (syntax & RE_CONTEXT_INDEP_OPS)
2115         {
2116           *token = fetch_token (regexp, syntax);
2117           return parse_expression (regexp, preg, token, syntax, nest, err);
2118         }
2119       /* else fall through  */
2120     case OP_CLOSE_SUBEXP:
2121       if ((token->type == OP_CLOSE_SUBEXP) &&
2122           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2123         {
2124           *err = REG_ERPAREN;
2125           return NULL;
2126         }
2127       /* else fall through  */
2128     case OP_CLOSE_DUP_NUM:
2129       /* We treat it as a normal character.  */
2130
2131       /* Then we can these characters as normal characters.  */
2132       token->type = CHARACTER;
2133       new_idx = re_dfa_add_node (dfa, *token, 0);
2134       tree = create_tree (NULL, NULL, 0, new_idx);
2135       if (BE (new_idx == -1 || tree == NULL, 0))
2136         {
2137           *err = REG_ESPACE;
2138           return NULL;
2139         }
2140       break;
2141     case ANCHOR:
2142       if (dfa->word_char == NULL)
2143         {
2144           *err = init_word_char (dfa);
2145           if (BE (*err != REG_NOERROR, 0))
2146             return NULL;
2147         }
2148       if (token->opr.ctx_type == WORD_DELIM)
2149         {
2150           bin_tree_t *tree_first, *tree_last;
2151           int idx_first, idx_last;
2152           token->opr.ctx_type = WORD_FIRST;
2153           idx_first = re_dfa_add_node (dfa, *token, 0);
2154           tree_first = create_tree (NULL, NULL, 0, idx_first);
2155           token->opr.ctx_type = WORD_LAST;
2156           idx_last = re_dfa_add_node (dfa, *token, 0);
2157           tree_last = create_tree (NULL, NULL, 0, idx_last);
2158           token->type = OP_ALT;
2159           new_idx = re_dfa_add_node (dfa, *token, 0);
2160           tree = create_tree (tree_first, tree_last, 0, new_idx);
2161           if (BE (idx_first == -1 || idx_last == -1 || new_idx == -1
2162                   || tree_first == NULL || tree_last == NULL
2163                   || tree == NULL, 0))
2164             {
2165               *err = REG_ESPACE;
2166               return NULL;
2167             }
2168         }
2169       else
2170         {
2171           new_idx = re_dfa_add_node (dfa, *token, 0);
2172           tree = create_tree (NULL, NULL, 0, new_idx);
2173           if (BE (new_idx == -1 || tree == NULL, 0))
2174             {
2175               *err = REG_ESPACE;
2176               return NULL;
2177             }
2178         }
2179       /* We must return here, since ANCHORs can't be followed
2180          by repetition operators.
2181          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2182              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2183       *token = fetch_token (regexp, syntax);
2184       return tree;
2185     case OP_PERIOD:
2186       new_idx = re_dfa_add_node (dfa, *token, 0);
2187       tree = create_tree (NULL, NULL, 0, new_idx);
2188       if (BE (new_idx == -1 || tree == NULL, 0))
2189         {
2190           *err = REG_ESPACE;
2191           return NULL;
2192         }
2193       if (dfa->mb_cur_max > 1)
2194         dfa->has_mb_node = 1;
2195       break;
2196     case OP_WORD:
2197       tree = build_charclass_op (dfa, regexp->trans, "alnum", "_", 0, err);
2198       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2199         return NULL;
2200       break;
2201     case OP_NOTWORD:
2202       tree = build_charclass_op (dfa, regexp->trans, "alnum", "_", 1, err);
2203       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2204         return NULL;
2205       break;
2206     case OP_SPACE:
2207       tree = build_charclass_op (dfa, regexp->trans, "space", "", 0, err);
2208       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2209         return NULL;
2210       break;
2211     case OP_NOTSPACE:
2212       tree = build_charclass_op (dfa, regexp->trans, "space", "", 1, err);
2213       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2214         return NULL;
2215       break;
2216     case OP_ALT:
2217     case END_OF_RE:
2218       return NULL;
2219     case BACK_SLASH:
2220       *err = REG_EESCAPE;
2221       return NULL;
2222     default:
2223       /* Must not happen?  */
2224 #ifdef DEBUG
2225       assert (0);
2226 #endif
2227       return NULL;
2228     }
2229   *token = fetch_token (regexp, syntax);
2230
2231   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2232          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2233     {
2234       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2235       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2236         return NULL;
2237       /* In BRE consecutive duplications are not allowed.  */
2238       if ((syntax & RE_CONTEXT_INVALID_DUP)
2239           && (token->type == OP_DUP_ASTERISK
2240               || token->type == OP_OPEN_DUP_NUM))
2241         {
2242           *err = REG_BADRPT;
2243           return NULL;
2244         }
2245       dfa->has_plural_match = 1;
2246     }
2247
2248   return tree;
2249 }
2250
2251 /* This function build the following tree, from regular expression
2252    (<reg_exp>):
2253          SUBEXP
2254             |
2255         <reg_exp>
2256 */
2257
2258 static bin_tree_t *
2259 parse_sub_exp (regexp, preg, token, syntax, nest, err)
2260      re_string_t *regexp;
2261      regex_t *preg;
2262      re_token_t *token;
2263      reg_syntax_t syntax;
2264      int nest;
2265      reg_errcode_t *err;
2266 {
2267   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2268   bin_tree_t *tree, *left_par, *right_par;
2269   size_t cur_nsub;
2270   int new_idx;
2271   cur_nsub = preg->re_nsub++;
2272   if (dfa->subexps_alloc < preg->re_nsub)
2273     {
2274       re_subexp_t *new_array;
2275       dfa->subexps_alloc *= 2;
2276       new_array = re_realloc (dfa->subexps, re_subexp_t, dfa->subexps_alloc);
2277       if (BE (new_array == NULL, 0))
2278         {
2279           dfa->subexps_alloc /= 2;
2280           *err = REG_ESPACE;
2281           return NULL;
2282         }
2283       dfa->subexps = new_array;
2284     }
2285   dfa->subexps[cur_nsub].start = dfa->nodes_len;
2286   dfa->subexps[cur_nsub].end = -1;
2287
2288   new_idx = re_dfa_add_node (dfa, *token, 0);
2289   left_par = create_tree (NULL, NULL, 0, new_idx);
2290   if (BE (new_idx == -1 || left_par == NULL, 0))
2291     {
2292       *err = REG_ESPACE;
2293       return NULL;
2294     }
2295   dfa->nodes[new_idx].opr.idx = cur_nsub;
2296   *token = fetch_token (regexp, syntax | RE_CARET_ANCHORS_HERE);
2297
2298   /* The subexpression may be a null string.  */
2299   if (token->type == OP_CLOSE_SUBEXP)
2300     tree = NULL;
2301   else
2302     {
2303       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2304       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2305         return NULL;
2306     }
2307   if (BE (token->type != OP_CLOSE_SUBEXP, 0))
2308     {
2309       free_bin_tree (tree);
2310       *err = REG_EPAREN;
2311       return NULL;
2312     }
2313   new_idx = re_dfa_add_node (dfa, *token, 0);
2314   dfa->subexps[cur_nsub].end = dfa->nodes_len;
2315   right_par = create_tree (NULL, NULL, 0, new_idx);
2316   tree = ((tree == NULL) ? right_par
2317           : create_tree (tree, right_par, CONCAT, 0));
2318   tree = create_tree (left_par, tree, CONCAT, 0);
2319   if (BE (new_idx == -1 || right_par == NULL || tree == NULL, 0))
2320     {
2321       *err = REG_ESPACE;
2322       return NULL;
2323     }
2324   dfa->nodes[new_idx].opr.idx = cur_nsub;
2325
2326   return tree;
2327 }
2328
2329 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2330
2331 static bin_tree_t *
2332 parse_dup_op (dup_elem, regexp, dfa, token, syntax, err)
2333      bin_tree_t *dup_elem;
2334      re_string_t *regexp;
2335      re_dfa_t *dfa;
2336      re_token_t *token;
2337      reg_syntax_t syntax;
2338      reg_errcode_t *err;
2339 {
2340   re_token_t dup_token;
2341   bin_tree_t *tree = dup_elem, *work_tree;
2342   int new_idx, start_idx = re_string_cur_idx (regexp);
2343   re_token_t start_token = *token;
2344   if (token->type == OP_OPEN_DUP_NUM)
2345     {
2346       int i;
2347       int end = 0;
2348       int start = fetch_number (regexp, token, syntax);
2349       bin_tree_t *elem;
2350       if (start == -1)
2351         {
2352           if (token->type == CHARACTER && token->opr.c == ',')
2353             start = 0; /* We treat "{,m}" as "{0,m}".  */
2354           else
2355             {
2356               *err = REG_BADBR; /* <re>{} is invalid.  */
2357               return NULL;
2358             }
2359         }
2360       if (BE (start != -2, 1))
2361         {
2362           /* We treat "{n}" as "{n,n}".  */
2363           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2364                  : ((token->type == CHARACTER && token->opr.c == ',')
2365                     ? fetch_number (regexp, token, syntax) : -2));
2366         }
2367       if (BE (start == -2 || end == -2, 0))
2368         {
2369           /* Invalid sequence.  */
2370           if (token->type == OP_CLOSE_DUP_NUM)
2371             goto parse_dup_op_invalid_interval;
2372           else
2373             goto parse_dup_op_ebrace;
2374         }
2375       if (BE (start == 0 && end == 0, 0))
2376         {
2377           /* We treat "<re>{0}" and "<re>{0,0}" as null string.  */
2378           *token = fetch_token (regexp, syntax);
2379           free_bin_tree (dup_elem);
2380           return NULL;
2381         }
2382
2383       /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2384       elem = tree;
2385       for (i = 0; i < start; ++i)
2386         if (i != 0)
2387           {
2388             work_tree = duplicate_tree (elem, dfa);
2389             tree = create_tree (tree, work_tree, CONCAT, 0);
2390             if (BE (work_tree == NULL || tree == NULL, 0))
2391               goto parse_dup_op_espace;
2392           }
2393
2394       if (end == -1)
2395         {
2396           /* We treat "<re>{0,}" as "<re>*".  */
2397           dup_token.type = OP_DUP_ASTERISK;
2398           if (start > 0)
2399             {
2400               elem = duplicate_tree (elem, dfa);
2401               new_idx = re_dfa_add_node (dfa, dup_token, 0);
2402               work_tree = create_tree (elem, NULL, 0, new_idx);
2403               tree = create_tree (tree, work_tree, CONCAT, 0);
2404               if (BE (elem == NULL || new_idx == -1 || work_tree == NULL
2405                       || tree == NULL, 0))
2406                 goto parse_dup_op_espace;
2407             }
2408           else
2409             {
2410               new_idx = re_dfa_add_node (dfa, dup_token, 0);
2411               tree = create_tree (elem, NULL, 0, new_idx);
2412               if (BE (new_idx == -1 || tree == NULL, 0))
2413                 goto parse_dup_op_espace;
2414             }
2415         }
2416       else if (BE (start > end, 0))
2417         {
2418           /* First  number greater than first.  */
2419           *err = REG_BADBR;
2420           return NULL;
2421         }
2422       else if (end - start > 0)
2423         {
2424           /* Then extract "<re>{0,m}" to "<re>?<re>?...<re>?".  */
2425           dup_token.type = OP_DUP_QUESTION;
2426           if (start > 0)
2427             {
2428               elem = duplicate_tree (elem, dfa);
2429               new_idx = re_dfa_add_node (dfa, dup_token, 0);
2430               elem = create_tree (elem, NULL, 0, new_idx);
2431               tree = create_tree (tree, elem, CONCAT, 0);
2432               if (BE (elem == NULL || new_idx == -1 || tree == NULL, 0))
2433                 goto parse_dup_op_espace;
2434             }
2435           else
2436             {
2437               new_idx = re_dfa_add_node (dfa, dup_token, 0);
2438               tree = elem = create_tree (elem, NULL, 0, new_idx);
2439               if (BE (new_idx == -1 || tree == NULL, 0))
2440                 goto parse_dup_op_espace;
2441             }
2442           for (i = 1; i < end - start; ++i)
2443             {
2444               work_tree = duplicate_tree (elem, dfa);
2445               tree = create_tree (tree, work_tree, CONCAT, 0);
2446               if (BE (work_tree == NULL || tree == NULL, 0))
2447                 {
2448                   *err = REG_ESPACE;
2449                   return NULL;
2450                 }
2451             }
2452         }
2453     }
2454   else
2455     {
2456       new_idx = re_dfa_add_node (dfa, *token, 0);
2457       tree = create_tree (tree, NULL, 0, new_idx);
2458       if (BE (new_idx == -1 || tree == NULL, 0))
2459         {
2460           *err = REG_ESPACE;
2461           return NULL;
2462         }
2463     }
2464   *token = fetch_token (regexp, syntax);
2465   return tree;
2466
2467  parse_dup_op_espace:
2468   free_bin_tree (tree);
2469   *err = REG_ESPACE;
2470   return NULL;
2471
2472  parse_dup_op_ebrace:
2473   if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2474     {
2475       *err = REG_EBRACE;
2476       return NULL;
2477     }
2478   goto parse_dup_op_rollback;
2479  parse_dup_op_invalid_interval:
2480   if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2481     {
2482       *err = REG_BADBR;
2483       return NULL;
2484     }
2485  parse_dup_op_rollback:
2486   re_string_set_index (regexp, start_idx);
2487   *token = start_token;
2488   token->type = CHARACTER;
2489   return dup_elem;
2490 }
2491
2492 /* Size of the names for collating symbol/equivalence_class/character_class.
2493    I'm not sure, but maybe enough.  */
2494 #define BRACKET_NAME_BUF_SIZE 32
2495
2496 #ifndef _LIBC
2497   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2498      Build the range expression which starts from START_ELEM, and ends
2499      at END_ELEM.  The result are written to MBCSET and SBCSET.
2500      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2501      mbcset->range_ends, is a pointer argument sinse we may
2502      update it.  */
2503
2504 static reg_errcode_t
2505 # ifdef RE_ENABLE_I18N
2506 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2507      re_charset_t *mbcset;
2508      int *range_alloc;
2509 # else /* not RE_ENABLE_I18N */
2510 build_range_exp (sbcset, start_elem, end_elem)
2511 # endif /* not RE_ENABLE_I18N */
2512      re_bitset_ptr_t sbcset;
2513      bracket_elem_t *start_elem, *end_elem;
2514 {
2515   unsigned int start_ch, end_ch;
2516   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2517   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2518           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2519           0))
2520     return REG_ERANGE;
2521
2522   /* We can handle no multi character collating elements without libc
2523      support.  */
2524   if (BE ((start_elem->type == COLL_SYM
2525            && strlen ((char *) start_elem->opr.name) > 1)
2526           || (end_elem->type == COLL_SYM
2527               && strlen ((char *) end_elem->opr.name) > 1), 0))
2528     return REG_ECOLLATE;
2529
2530 # ifdef RE_ENABLE_I18N
2531   {
2532     wchar_t wc, start_wc, end_wc;
2533     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2534
2535     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2536                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2537                    : 0));
2538     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2539               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2540                  : 0));
2541     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2542                 ? __btowc (start_ch) : start_elem->opr.wch);
2543     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2544               ? __btowc (end_ch) : end_elem->opr.wch);
2545     cmp_buf[0] = start_wc;
2546     cmp_buf[4] = end_wc;
2547     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2548       return REG_ERANGE;
2549
2550     /* Check the space of the arrays.  */
2551     if (*range_alloc == mbcset->nranges)
2552       {
2553         /* There are not enough space, need realloc.  */
2554         wchar_t *new_array_start, *new_array_end;
2555         int new_nranges;
2556
2557         /* +1 in case of mbcset->nranges is 0.  */
2558         new_nranges = 2 * mbcset->nranges + 1;
2559         /* Use realloc since mbcset->range_starts and mbcset->range_ends
2560            are NULL if *range_alloc == 0.  */
2561         new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2562                                       new_nranges);
2563         new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2564                                     new_nranges);
2565
2566         if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2567           return REG_ESPACE;
2568
2569         mbcset->range_starts = new_array_start;
2570         mbcset->range_ends = new_array_end;
2571         *range_alloc = new_nranges;
2572       }
2573
2574     mbcset->range_starts[mbcset->nranges] = start_wc;
2575     mbcset->range_ends[mbcset->nranges++] = end_wc;
2576
2577     /* Build the table for single byte characters.  */
2578     for (wc = 0; wc <= SBC_MAX; ++wc)
2579       {
2580         cmp_buf[2] = wc;
2581         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2582             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2583           bitset_set (sbcset, wc);
2584       }
2585   }
2586 # else /* not RE_ENABLE_I18N */
2587   {
2588     unsigned int ch;
2589     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2590                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2591                    : 0));
2592     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2593               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2594                  : 0));
2595     if (start_ch > end_ch)
2596       return REG_ERANGE;
2597     /* Build the table for single byte characters.  */
2598     for (ch = 0; ch <= SBC_MAX; ++ch)
2599       if (start_ch <= ch  && ch <= end_ch)
2600         bitset_set (sbcset, ch);
2601   }
2602 # endif /* not RE_ENABLE_I18N */
2603   return REG_NOERROR;
2604 }
2605 #endif /* not _LIBC */
2606
2607 #ifndef _LIBC
2608 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2609    Build the collating element which is represented by NAME.
2610    The result are written to MBCSET and SBCSET.
2611    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2612    pointer argument since we may update it.  */
2613
2614 static reg_errcode_t
2615 # ifdef RE_ENABLE_I18N
2616 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2617      re_charset_t *mbcset;
2618      int *coll_sym_alloc;
2619 # else /* not RE_ENABLE_I18N */
2620 build_collating_symbol (sbcset, name)
2621 # endif /* not RE_ENABLE_I18N */
2622      re_bitset_ptr_t sbcset;
2623      const unsigned char *name;
2624 {
2625   size_t name_len = strlen ((const char *) name);
2626   if (BE (name_len != 1, 0))
2627     return REG_ECOLLATE;
2628   else
2629     {
2630       bitset_set (sbcset, name[0]);
2631       return REG_NOERROR;
2632     }
2633 }
2634 #endif /* not _LIBC */
2635
2636 /* This function parse bracket expression like "[abc]", "[a-c]",
2637    "[[.a-a.]]" etc.  */
2638
2639 static bin_tree_t *
2640 parse_bracket_exp (regexp, dfa, token, syntax, err)
2641      re_string_t *regexp;
2642      re_dfa_t *dfa;
2643      re_token_t *token;
2644      reg_syntax_t syntax;
2645      reg_errcode_t *err;
2646 {
2647 #ifdef _LIBC
2648   const unsigned char *collseqmb;
2649   const char *collseqwc;
2650   uint32_t nrules;
2651   int32_t table_size;
2652   const int32_t *symb_table;
2653   const unsigned char *extra;
2654
2655   /* Local function for parse_bracket_exp used in _LIBC environement.
2656      Seek the collating symbol entry correspondings to NAME.
2657      Return the index of the symbol in the SYMB_TABLE.  */
2658
2659   static inline int32_t
2660   __attribute ((always_inline))
2661   seek_collating_symbol_entry (name, name_len)
2662          const unsigned char *name;
2663          size_t name_len;
2664     {
2665       int32_t hash = elem_hash ((const char *) name, name_len);
2666       int32_t elem = hash % table_size;
2667       int32_t second = hash % (table_size - 2);
2668       while (symb_table[2 * elem] != 0)
2669         {
2670           /* First compare the hashing value.  */
2671           if (symb_table[2 * elem] == hash
2672               /* Compare the length of the name.  */
2673               && name_len == extra[symb_table[2 * elem + 1]]
2674               /* Compare the name.  */
2675               && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2676                          name_len) == 0)
2677             {
2678               /* Yep, this is the entry.  */
2679               break;
2680             }
2681
2682           /* Next entry.  */
2683           elem += second;
2684         }
2685       return elem;
2686     }
2687
2688   /* Local function for parse_bracket_exp used in _LIBC environement.
2689      Look up the collation sequence value of BR_ELEM.
2690      Return the value if succeeded, UINT_MAX otherwise.  */
2691
2692   static inline unsigned int
2693   __attribute ((always_inline))
2694   lookup_collation_sequence_value (br_elem)
2695          bracket_elem_t *br_elem;
2696     {
2697       if (br_elem->type == SB_CHAR)
2698         {
2699           /*
2700           if (MB_CUR_MAX == 1)
2701           */
2702           if (nrules == 0)
2703             return collseqmb[br_elem->opr.ch];
2704           else
2705             {
2706               wint_t wc = __btowc (br_elem->opr.ch);
2707               return __collseq_table_lookup (collseqwc, wc);
2708             }
2709         }
2710       else if (br_elem->type == MB_CHAR)
2711         {
2712           return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2713         }
2714       else if (br_elem->type == COLL_SYM)
2715         {
2716           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2717           if (nrules != 0)
2718             {
2719               int32_t elem, idx;
2720               elem = seek_collating_symbol_entry (br_elem->opr.name,
2721                                                   sym_name_len);
2722               if (symb_table[2 * elem] != 0)
2723                 {
2724                   /* We found the entry.  */
2725                   idx = symb_table[2 * elem + 1];
2726                   /* Skip the name of collating element name.  */
2727                   idx += 1 + extra[idx];
2728                   /* Skip the byte sequence of the collating element.  */
2729                   idx += 1 + extra[idx];
2730                   /* Adjust for the alignment.  */
2731                   idx = (idx + 3) & ~3;
2732                   /* Skip the multibyte collation sequence value.  */
2733                   idx += sizeof (unsigned int);
2734                   /* Skip the wide char sequence of the collating element.  */
2735                   idx += sizeof (unsigned int) *
2736                     (1 + *(unsigned int *) (extra + idx));
2737                   /* Return the collation sequence value.  */
2738                   return *(unsigned int *) (extra + idx);
2739                 }
2740               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2741                 {
2742                   /* No valid character.  Match it as a single byte
2743                      character.  */
2744                   return collseqmb[br_elem->opr.name[0]];
2745                 }
2746             }
2747           else if (sym_name_len == 1)
2748             return collseqmb[br_elem->opr.name[0]];
2749         }
2750       return UINT_MAX;
2751     }
2752
2753   /* Local function for parse_bracket_exp used in _LIBC environement.
2754      Build the range expression which starts from START_ELEM, and ends
2755      at END_ELEM.  The result are written to MBCSET and SBCSET.
2756      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2757      mbcset->range_ends, is a pointer argument sinse we may
2758      update it.  */
2759
2760   static inline reg_errcode_t
2761   __attribute ((always_inline))
2762 # ifdef RE_ENABLE_I18N
2763   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2764          re_charset_t *mbcset;
2765          int *range_alloc;
2766 # else /* not RE_ENABLE_I18N */
2767   build_range_exp (sbcset, start_elem, end_elem)
2768 # endif /* not RE_ENABLE_I18N */
2769          re_bitset_ptr_t sbcset;
2770          bracket_elem_t *start_elem, *end_elem;
2771     {
2772       unsigned int ch;
2773       uint32_t start_collseq;
2774       uint32_t end_collseq;
2775
2776 # ifdef RE_ENABLE_I18N
2777       /* Check the space of the arrays.  */
2778       if (*range_alloc == mbcset->nranges)
2779         {
2780           /* There are not enough space, need realloc.  */
2781           uint32_t *new_array_start;
2782           uint32_t *new_array_end;
2783           int new_nranges;
2784
2785           /* +1 in case of mbcset->nranges is 0.  */
2786           new_nranges = 2 * mbcset->nranges + 1;
2787           /* Use realloc since mbcset->range_starts and mbcset->range_ends
2788              are NULL if *range_alloc == 0.  */
2789           new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2790                                         new_nranges);
2791           new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2792                                       new_nranges);
2793
2794           if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2795             return REG_ESPACE;
2796
2797           mbcset->range_starts = new_array_start;
2798           mbcset->range_ends = new_array_end;
2799           *range_alloc = new_nranges;
2800         }
2801 # endif /* RE_ENABLE_I18N */
2802
2803       /* Equivalence Classes and Character Classes can't be a range
2804          start/end.  */
2805       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2806               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2807               0))
2808         return REG_ERANGE;
2809
2810       start_collseq = lookup_collation_sequence_value (start_elem);
2811       end_collseq = lookup_collation_sequence_value (end_elem);
2812       /* Check start/end collation sequence values.  */
2813       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2814         return REG_ECOLLATE;
2815       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2816         return REG_ERANGE;
2817
2818 # ifdef RE_ENABLE_I18N
2819       /* Got valid collation sequence values, add them as a new entry.  */
2820       mbcset->range_starts[mbcset->nranges] = start_collseq;
2821       mbcset->range_ends[mbcset->nranges++] = end_collseq;
2822 # endif /* RE_ENABLE_I18N */
2823
2824       /* Build the table for single byte characters.  */
2825       for (ch = 0; ch <= SBC_MAX; ch++)
2826         {
2827           uint32_t ch_collseq;
2828           /*
2829           if (MB_CUR_MAX == 1)
2830           */
2831           if (nrules == 0)
2832             ch_collseq = collseqmb[ch];
2833           else
2834             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2835           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2836             bitset_set (sbcset, ch);
2837         }
2838       return REG_NOERROR;
2839     }
2840
2841   /* Local function for parse_bracket_exp used in _LIBC environement.
2842      Build the collating element which is represented by NAME.
2843      The result are written to MBCSET and SBCSET.
2844      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2845      pointer argument sinse we may update it.  */
2846
2847   static inline reg_errcode_t
2848   __attribute ((always_inline))
2849 # ifdef RE_ENABLE_I18N
2850   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2851          re_charset_t *mbcset;
2852          int *coll_sym_alloc;
2853 # else /* not RE_ENABLE_I18N */
2854   build_collating_symbol (sbcset, name)
2855 # endif /* not RE_ENABLE_I18N */
2856          re_bitset_ptr_t sbcset;
2857          const unsigned char *name;
2858     {
2859       int32_t elem, idx;
2860       size_t name_len = strlen ((const char *) name);
2861       if (nrules != 0)
2862         {
2863           elem = seek_collating_symbol_entry (name, name_len);
2864           if (symb_table[2 * elem] != 0)
2865             {
2866               /* We found the entry.  */
2867               idx = symb_table[2 * elem + 1];
2868               /* Skip the name of collating element name.  */
2869               idx += 1 + extra[idx];
2870             }
2871           else if (symb_table[2 * elem] == 0 && name_len == 1)
2872             {
2873               /* No valid character, treat it as a normal
2874                  character.  */
2875               bitset_set (sbcset, name[0]);
2876               return REG_NOERROR;
2877             }
2878           else
2879             return REG_ECOLLATE;
2880
2881 # ifdef RE_ENABLE_I18N
2882           /* Got valid collation sequence, add it as a new entry.  */
2883           /* Check the space of the arrays.  */
2884           if (*coll_sym_alloc == mbcset->ncoll_syms)
2885             {
2886               /* Not enough, realloc it.  */
2887               /* +1 in case of mbcset->ncoll_syms is 0.  */
2888               *coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2889               /* Use realloc since mbcset->coll_syms is NULL
2890                  if *alloc == 0.  */
2891               mbcset->coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2892                                               *coll_sym_alloc);
2893               if (BE (mbcset->coll_syms == NULL, 0))
2894                 return REG_ESPACE;
2895             }
2896           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2897 # endif /* RE_ENABLE_I18N */
2898           return REG_NOERROR;
2899         }
2900       else
2901         {
2902           if (BE (name_len != 1, 0))
2903             return REG_ECOLLATE;
2904           else
2905             {
2906               bitset_set (sbcset, name[0]);
2907               return REG_NOERROR;
2908             }
2909         }
2910     }
2911 #endif
2912
2913   re_token_t br_token;
2914   re_bitset_ptr_t sbcset;
2915 #ifdef RE_ENABLE_I18N
2916   re_charset_t *mbcset;
2917   int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2918   int equiv_class_alloc = 0, char_class_alloc = 0;
2919 #else /* not RE_ENABLE_I18N */
2920   int non_match = 0;
2921 #endif /* not RE_ENABLE_I18N */
2922   bin_tree_t *work_tree;
2923   int token_len, new_idx;
2924 #ifdef _LIBC
2925   collseqmb = (const unsigned char *)
2926     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2927   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2928   if (nrules)
2929     {
2930       /*
2931       if (MB_CUR_MAX > 1)
2932       */
2933         collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2934       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2935       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2936                                                   _NL_COLLATE_SYMB_TABLEMB);
2937       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2938                                                    _NL_COLLATE_SYMB_EXTRAMB);
2939     }
2940 #endif
2941   sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
2942 #ifdef RE_ENABLE_I18N
2943   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
2944 #endif /* RE_ENABLE_I18N */
2945 #ifdef RE_ENABLE_I18N
2946   if (BE (sbcset == NULL || mbcset == NULL, 0))
2947 #else
2948   if (BE (sbcset == NULL, 0))
2949 #endif /* RE_ENABLE_I18N */
2950     {
2951       *err = REG_ESPACE;
2952       return NULL;
2953     }
2954
2955   token_len = peek_token_bracket (token, regexp, syntax);
2956   if (BE (token->type == END_OF_RE, 0))
2957     {
2958       *err = REG_BADPAT;
2959       goto parse_bracket_exp_free_return;
2960     }
2961   if (token->type == OP_NON_MATCH_LIST)
2962     {
2963 #ifdef RE_ENABLE_I18N
2964       int i;
2965       mbcset->non_match = 1;
2966 #else /* not RE_ENABLE_I18N */
2967       non_match = 1;
2968 #endif /* not RE_ENABLE_I18N */
2969       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
2970         bitset_set (sbcset, '\0');
2971       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
2972       token_len = peek_token_bracket (token, regexp, syntax);
2973       if (BE (token->type == END_OF_RE, 0))
2974         {
2975           *err = REG_BADPAT;
2976           goto parse_bracket_exp_free_return;
2977         }
2978 #ifdef RE_ENABLE_I18N
2979       if (dfa->mb_cur_max > 1)
2980         for (i = 0; i < SBC_MAX; ++i)
2981           if (__btowc (i) == WEOF)
2982             bitset_set (sbcset, i);
2983 #endif /* RE_ENABLE_I18N */
2984     }
2985
2986   /* We treat the first ']' as a normal character.  */
2987   if (token->type == OP_CLOSE_BRACKET)
2988     token->type = CHARACTER;
2989
2990   int first_round = 1;
2991   while (1)
2992     {
2993       bracket_elem_t start_elem, end_elem;
2994       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
2995       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
2996       reg_errcode_t ret;
2997       int token_len2 = 0, is_range_exp = 0;
2998       re_token_t token2;
2999
3000       start_elem.opr.name = start_name_buf;
3001       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3002                                    syntax, first_round);
3003       if (BE (ret != REG_NOERROR, 0))
3004         {
3005           *err = ret;
3006           goto parse_bracket_exp_free_return;
3007         }
3008       first_round = 0;
3009
3010       /* Get information about the next token.  We need it in any case.  */
3011       token_len = peek_token_bracket (token, regexp, syntax);
3012
3013       /* Do not check for ranges if we know they are not allowed.  */
3014       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3015         {
3016           if (BE (token->type == END_OF_RE, 0))
3017             {
3018               *err = REG_EBRACK;
3019               goto parse_bracket_exp_free_return;
3020             }
3021           if (token->type == OP_CHARSET_RANGE)
3022             {
3023               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3024               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3025               if (BE (token2.type == END_OF_RE, 0))
3026                 {
3027                   *err = REG_EBRACK;
3028                   goto parse_bracket_exp_free_return;
3029                 }
3030               if (token2.type == OP_CLOSE_BRACKET)
3031                 {
3032                   /* We treat the last '-' as a normal character.  */
3033                   re_string_skip_bytes (regexp, -token_len);
3034                   token->type = CHARACTER;
3035                 }
3036               else
3037                 is_range_exp = 1;
3038             }
3039         }
3040
3041       if (is_range_exp == 1)
3042         {
3043           end_elem.opr.name = end_name_buf;
3044           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3045                                        dfa, syntax, 1);
3046           if (BE (ret != REG_NOERROR, 0))
3047             {
3048               *err = ret;
3049               goto parse_bracket_exp_free_return;
3050             }
3051
3052           token_len = peek_token_bracket (token, regexp, syntax);
3053
3054           *err = build_range_exp (sbcset,
3055 #ifdef RE_ENABLE_I18N
3056                                   mbcset, &range_alloc,
3057 #endif /* RE_ENABLE_I18N */
3058                                   &start_elem, &end_elem);
3059           if (BE (*err != REG_NOERROR, 0))
3060             goto parse_bracket_exp_free_return;
3061         }
3062       else
3063         {
3064           switch (start_elem.type)
3065             {
3066             case SB_CHAR:
3067               bitset_set (sbcset, start_elem.opr.ch);
3068               break;
3069 #ifdef RE_ENABLE_I18N
3070             case MB_CHAR:
3071               /* Check whether the array has enough space.  */
3072               if (mbchar_alloc == mbcset->nmbchars)
3073                 {
3074                   /* Not enough, realloc it.  */
3075                   /* +1 in case of mbcset->nmbchars is 0.  */
3076                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3077                   /* Use realloc since array is NULL if *alloc == 0.  */
3078                   mbcset->mbchars = re_realloc (mbcset->mbchars, wchar_t,
3079                                                 mbchar_alloc);
3080                   if (BE (mbcset->mbchars == NULL, 0))
3081                     goto parse_bracket_exp_espace;
3082                 }
3083               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3084               break;
3085 #endif /* RE_ENABLE_I18N */
3086             case EQUIV_CLASS:
3087               *err = build_equiv_class (sbcset,
3088 #ifdef RE_ENABLE_I18N
3089                                         mbcset, &equiv_class_alloc,
3090 #endif /* RE_ENABLE_I18N */
3091                                         start_elem.opr.name);
3092               if (BE (*err != REG_NOERROR, 0))
3093                 goto parse_bracket_exp_free_return;
3094               break;
3095             case COLL_SYM:
3096               *err = build_collating_symbol (sbcset,
3097 #ifdef RE_ENABLE_I18N
3098                                              mbcset, &coll_sym_alloc,
3099 #endif /* RE_ENABLE_I18N */
3100                                              start_elem.opr.name);
3101               if (BE (*err != REG_NOERROR, 0))
3102                 goto parse_bracket_exp_free_return;
3103               break;
3104             case CHAR_CLASS:
3105               *err = build_charclass (regexp->trans, sbcset,
3106 #ifdef RE_ENABLE_I18N
3107                                       mbcset, &char_class_alloc,
3108 #endif /* RE_ENABLE_I18N */
3109                                       start_elem.opr.name, syntax);
3110               if (BE (*err != REG_NOERROR, 0))
3111                goto parse_bracket_exp_free_return;
3112               break;
3113             default:
3114               assert (0);
3115               break;
3116             }
3117         }
3118       if (BE (token->type == END_OF_RE, 0))
3119         {
3120           *err = REG_EBRACK;
3121           goto parse_bracket_exp_free_return;
3122         }
3123       if (token->type == OP_CLOSE_BRACKET)
3124         break;
3125     }
3126
3127   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3128
3129   /* If it is non-matching list.  */
3130 #ifdef RE_ENABLE_I18N
3131   if (mbcset->non_match)
3132 #else /* not RE_ENABLE_I18N */
3133   if (non_match)
3134 #endif /* not RE_ENABLE_I18N */
3135     bitset_not (sbcset);
3136
3137   /* Build a tree for simple bracket.  */
3138   br_token.type = SIMPLE_BRACKET;
3139   br_token.opr.sbcset = sbcset;
3140   new_idx = re_dfa_add_node (dfa, br_token, 0);
3141   work_tree = create_tree (NULL, NULL, 0, new_idx);
3142   if (BE (new_idx == -1 || work_tree == NULL, 0))
3143     goto parse_bracket_exp_espace;
3144
3145 #ifdef RE_ENABLE_I18N
3146   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3147       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3148                                                      || mbcset->non_match)))
3149     {
3150       re_token_t alt_token;
3151       bin_tree_t *mbc_tree;
3152       /* Build a tree for complex bracket.  */
3153       br_token.type = COMPLEX_BRACKET;
3154       br_token.opr.mbcset = mbcset;
3155       dfa->has_mb_node = 1;
3156       new_idx = re_dfa_add_node (dfa, br_token, 0);
3157       mbc_tree = create_tree (NULL, NULL, 0, new_idx);
3158       if (BE (new_idx == -1 || mbc_tree == NULL, 0))
3159         goto parse_bracket_exp_espace;
3160       /* Then join them by ALT node.  */
3161       dfa->has_plural_match = 1;
3162       alt_token.type = OP_ALT;
3163       new_idx = re_dfa_add_node (dfa, alt_token, 0);
3164       work_tree = create_tree (work_tree, mbc_tree, 0, new_idx);
3165       if (BE (new_idx != -1 && mbc_tree != NULL, 1))
3166         return work_tree;
3167     }
3168   else
3169     {
3170       free_charset (mbcset);
3171       return work_tree;
3172     }
3173 #else /* not RE_ENABLE_I18N */
3174   return work_tree;
3175 #endif /* not RE_ENABLE_I18N */
3176
3177  parse_bracket_exp_espace:
3178   *err = REG_ESPACE;
3179  parse_bracket_exp_free_return:
3180   re_free (sbcset);
3181 #ifdef RE_ENABLE_I18N
3182   free_charset (mbcset);
3183 #endif /* RE_ENABLE_I18N */
3184   return NULL;
3185 }
3186
3187 /* Parse an element in the bracket expression.  */
3188
3189 static reg_errcode_t
3190 parse_bracket_element (elem, regexp, token, token_len, dfa, syntax,
3191                        accept_hyphen)
3192      bracket_elem_t *elem;
3193      re_string_t *regexp;
3194      re_token_t *token;
3195      int token_len;
3196      re_dfa_t *dfa;
3197      reg_syntax_t syntax;
3198      int accept_hyphen;
3199 {
3200 #ifdef RE_ENABLE_I18N
3201   int cur_char_size;
3202   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3203   if (cur_char_size > 1)
3204     {
3205       elem->type = MB_CHAR;
3206       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3207       re_string_skip_bytes (regexp, cur_char_size);
3208       return REG_NOERROR;
3209     }
3210 #endif /* RE_ENABLE_I18N */
3211   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3212   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3213       || token->type == OP_OPEN_EQUIV_CLASS)
3214     return parse_bracket_symbol (elem, regexp, token);
3215   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3216     {
3217       /* A '-' must only appear as anything but a range indicator before
3218          the closing bracket.  Everything else is an error.  */
3219       re_token_t token2;
3220       (void) peek_token_bracket (&token2, regexp, syntax);
3221       if (token2.type != OP_CLOSE_BRACKET)
3222         /* The actual error value is not standardized since this whole
3223            case is undefined.  But ERANGE makes good sense.  */
3224         return REG_ERANGE;
3225     }
3226   elem->type = SB_CHAR;
3227   elem->opr.ch = token->opr.c;
3228   return REG_NOERROR;
3229 }
3230
3231 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3232    such as [:<character_class>:], [.<collating_element>.], and
3233    [=<equivalent_class>=].  */
3234
3235 static reg_errcode_t
3236 parse_bracket_symbol (elem, regexp, token)
3237      bracket_elem_t *elem;
3238      re_string_t *regexp;
3239      re_token_t *token;
3240 {
3241   unsigned char ch, delim = token->opr.c;
3242   int i = 0;
3243   for (;; ++i)
3244     {
3245       if (re_string_eoi(regexp) || i >= BRACKET_NAME_BUF_SIZE)
3246         return REG_EBRACK;
3247       if (token->type == OP_OPEN_CHAR_CLASS)
3248         ch = re_string_fetch_byte_case (regexp);
3249       else
3250         ch = re_string_fetch_byte (regexp);
3251       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3252         break;
3253       elem->opr.name[i] = ch;
3254     }
3255   re_string_skip_bytes (regexp, 1);
3256   elem->opr.name[i] = '\0';
3257   switch (token->type)
3258     {
3259     case OP_OPEN_COLL_ELEM:
3260       elem->type = COLL_SYM;
3261       break;
3262     case OP_OPEN_EQUIV_CLASS:
3263       elem->type = EQUIV_CLASS;
3264       break;
3265     case OP_OPEN_CHAR_CLASS:
3266       elem->type = CHAR_CLASS;
3267       break;
3268     default:
3269       break;
3270     }
3271   return REG_NOERROR;
3272 }
3273
3274   /* Helper function for parse_bracket_exp.
3275      Build the equivalence class which is represented by NAME.
3276      The result are written to MBCSET and SBCSET.
3277      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3278      is a pointer argument sinse we may update it.  */
3279
3280 static reg_errcode_t
3281 #ifdef RE_ENABLE_I18N
3282 build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3283      re_charset_t *mbcset;
3284      int *equiv_class_alloc;
3285 #else /* not RE_ENABLE_I18N */
3286 build_equiv_class (sbcset, name)
3287 #endif /* not RE_ENABLE_I18N */
3288      re_bitset_ptr_t sbcset;
3289      const unsigned char *name;
3290 {
3291 #if defined _LIBC && defined RE_ENABLE_I18N
3292   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3293   if (nrules != 0)
3294     {
3295       const int32_t *table, *indirect;
3296       const unsigned char *weights, *extra, *cp;
3297       unsigned char char_buf[2];
3298       int32_t idx1, idx2;
3299       unsigned int ch;
3300       size_t len;
3301       /* This #include defines a local function!  */
3302 # include <locale/weight.h>
3303       /* Calculate the index for equivalence class.  */
3304       cp = name;
3305       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3306       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3307                                                _NL_COLLATE_WEIGHTMB);
3308       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3309                                                    _NL_COLLATE_EXTRAMB);
3310       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3311                                                 _NL_COLLATE_INDIRECTMB);
3312       idx1 = findidx (&cp);
3313       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3314         /* This isn't a valid character.  */
3315         return REG_ECOLLATE;
3316
3317       /* Build single byte matcing table for this equivalence class.  */
3318       char_buf[1] = (unsigned char) '\0';
3319       len = weights[idx1];
3320       for (ch = 0; ch < SBC_MAX; ++ch)
3321         {
3322           char_buf[0] = ch;
3323           cp = char_buf;
3324           idx2 = findidx (&cp);
3325 /*
3326           idx2 = table[ch];
3327 */
3328           if (idx2 == 0)
3329             /* This isn't a valid character.  */
3330             continue;
3331           if (len == weights[idx2])
3332             {
3333               int cnt = 0;
3334               while (cnt <= len &&
3335                      weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3336                 ++cnt;
3337
3338               if (cnt > len)
3339                 bitset_set (sbcset, ch);
3340             }
3341         }
3342       /* Check whether the array has enough space.  */
3343       if (*equiv_class_alloc == mbcset->nequiv_classes)
3344         {
3345           /* Not enough, realloc it.  */
3346           /* +1 in case of mbcset->nequiv_classes is 0.  */
3347           *equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3348           /* Use realloc since the array is NULL if *alloc == 0.  */
3349           mbcset->equiv_classes = re_realloc (mbcset->equiv_classes, int32_t,
3350                                               *equiv_class_alloc);
3351           if (BE (mbcset->equiv_classes == NULL, 0))
3352             return REG_ESPACE;
3353         }
3354       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3355     }
3356   else
3357 #endif /* _LIBC && RE_ENABLE_I18N */
3358     {
3359       if (BE (strlen ((const char *) name) != 1, 0))
3360         return REG_ECOLLATE;
3361       bitset_set (sbcset, *name);
3362     }
3363   return REG_NOERROR;
3364 }
3365
3366   /* Helper function for parse_bracket_exp.
3367      Build the character class which is represented by NAME.
3368      The result are written to MBCSET and SBCSET.
3369      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3370      is a pointer argument sinse we may update it.  */
3371
3372 static reg_errcode_t
3373 #ifdef RE_ENABLE_I18N
3374 build_charclass (trans, sbcset, mbcset, char_class_alloc, class_name, syntax)
3375      re_charset_t *mbcset;
3376      int *char_class_alloc;
3377 #else /* not RE_ENABLE_I18N */
3378 build_charclass (trans, sbcset, class_name, syntax)
3379 #endif /* not RE_ENABLE_I18N */
3380      RE_TRANSLATE_TYPE trans;
3381      re_bitset_ptr_t sbcset;
3382      const unsigned char *class_name;
3383      reg_syntax_t syntax;
3384 {
3385   int i;
3386   const char *name = (const char *) class_name;
3387
3388   /* In case of REG_ICASE "upper" and "lower" match the both of
3389      upper and lower cases.  */
3390   if ((syntax & RE_ICASE)
3391       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3392     name = "alpha";
3393
3394 #ifdef RE_ENABLE_I18N
3395   /* Check the space of the arrays.  */
3396   if (*char_class_alloc == mbcset->nchar_classes)
3397     {
3398       /* Not enough, realloc it.  */
3399       /* +1 in case of mbcset->nchar_classes is 0.  */
3400       *char_class_alloc = 2 * mbcset->nchar_classes + 1;
3401       /* Use realloc since array is NULL if *alloc == 0.  */
3402       mbcset->char_classes = re_realloc (mbcset->char_classes, wctype_t,
3403                                          *char_class_alloc);
3404       if (BE (mbcset->char_classes == NULL, 0))
3405         return REG_ESPACE;
3406     }
3407   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3408 #endif /* RE_ENABLE_I18N */
3409
3410 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3411     for (i = 0; i < SBC_MAX; ++i)               \
3412       {                                         \
3413         if (ctype_func (i))                     \
3414           {                                     \
3415             int ch = trans ? trans[i] : i;      \
3416             bitset_set (sbcset, ch);            \
3417           }                                     \
3418       }
3419
3420   if (strcmp (name, "alnum") == 0)
3421     BUILD_CHARCLASS_LOOP (isalnum)
3422   else if (strcmp (name, "cntrl") == 0)
3423     BUILD_CHARCLASS_LOOP (iscntrl)
3424   else if (strcmp (name, "lower") == 0)
3425     BUILD_CHARCLASS_LOOP (islower)
3426   else if (strcmp (name, "space") == 0)
3427     BUILD_CHARCLASS_LOOP (isspace)
3428   else if (strcmp (name, "alpha") == 0)
3429     BUILD_CHARCLASS_LOOP (isalpha)
3430   else if (strcmp (name, "digit") == 0)
3431     BUILD_CHARCLASS_LOOP (isdigit)
3432   else if (strcmp (name, "print") == 0)
3433     BUILD_CHARCLASS_LOOP (isprint)
3434   else if (strcmp (name, "upper") == 0)
3435     BUILD_CHARCLASS_LOOP (isupper)
3436   else if (strcmp (name, "blank") == 0)
3437     BUILD_CHARCLASS_LOOP (isblank)
3438   else if (strcmp (name, "graph") == 0)
3439     BUILD_CHARCLASS_LOOP (isgraph)
3440   else if (strcmp (name, "punct") == 0)
3441     BUILD_CHARCLASS_LOOP (ispunct)
3442   else if (strcmp (name, "xdigit") == 0)
3443     BUILD_CHARCLASS_LOOP (isxdigit)
3444   else
3445     return REG_ECTYPE;
3446
3447   return REG_NOERROR;
3448 }
3449
3450 static bin_tree_t *
3451 build_charclass_op (dfa, trans, class_name, extra, not, err)
3452      re_dfa_t *dfa;
3453      RE_TRANSLATE_TYPE trans;
3454      const unsigned char *class_name;
3455      const unsigned char *extra;
3456      int not;
3457      reg_errcode_t *err;
3458 {
3459   re_bitset_ptr_t sbcset;
3460 #ifdef RE_ENABLE_I18N
3461   re_charset_t *mbcset;
3462   int alloc = 0;
3463 #else /* not RE_ENABLE_I18N */
3464   int non_match = 0;
3465 #endif /* not RE_ENABLE_I18N */
3466   reg_errcode_t ret;
3467   re_token_t br_token;
3468   bin_tree_t *tree;
3469   int new_idx;
3470
3471   sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3472 #ifdef RE_ENABLE_I18N
3473   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3474 #endif /* RE_ENABLE_I18N */
3475
3476 #ifdef RE_ENABLE_I18N
3477   if (BE (sbcset == NULL || mbcset == NULL, 0))
3478 #else /* not RE_ENABLE_I18N */
3479   if (BE (sbcset == NULL, 0))
3480 #endif /* not RE_ENABLE_I18N */
3481     {
3482       *err = REG_ESPACE;
3483       return NULL;
3484     }
3485
3486   if (not)
3487     {
3488 #ifdef RE_ENABLE_I18N
3489       int i;
3490       /*
3491       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3492         bitset_set(cset->sbcset, '\0');
3493       */
3494       mbcset->non_match = 1;
3495       if (dfa->mb_cur_max > 1)
3496         for (i = 0; i < SBC_MAX; ++i)
3497           if (__btowc (i) == WEOF)
3498             bitset_set (sbcset, i);
3499 #else /* not RE_ENABLE_I18N */
3500       non_match = 1;
3501 #endif /* not RE_ENABLE_I18N */
3502     }
3503
3504   /* We don't care the syntax in this case.  */
3505   ret = build_charclass (trans, sbcset,
3506 #ifdef RE_ENABLE_I18N
3507                          mbcset, &alloc,
3508 #endif /* RE_ENABLE_I18N */
3509                          class_name, 0);
3510
3511   if (BE (ret != REG_NOERROR, 0))
3512     {
3513       re_free (sbcset);
3514 #ifdef RE_ENABLE_I18N
3515       free_charset (mbcset);
3516 #endif /* RE_ENABLE_I18N */
3517       *err = ret;
3518       return NULL;
3519     }
3520   /* \w match '_' also.  */
3521   for (; *extra; extra++)
3522     bitset_set (sbcset, *extra);
3523
3524   /* If it is non-matching list.  */
3525 #ifdef RE_ENABLE_I18N
3526   if (mbcset->non_match)
3527 #else /* not RE_ENABLE_I18N */
3528   if (non_match)
3529 #endif /* not RE_ENABLE_I18N */
3530     bitset_not (sbcset);
3531
3532   /* Build a tree for simple bracket.  */
3533   br_token.type = SIMPLE_BRACKET;
3534   br_token.opr.sbcset = sbcset;
3535   new_idx = re_dfa_add_node (dfa, br_token, 0);
3536   tree = create_tree (NULL, NULL, 0, new_idx);
3537   if (BE (new_idx == -1 || tree == NULL, 0))
3538     goto build_word_op_espace;
3539
3540 #ifdef RE_ENABLE_I18N
3541   if (dfa->mb_cur_max > 1)
3542     {
3543       re_token_t alt_token;
3544       bin_tree_t *mbc_tree;
3545       /* Build a tree for complex bracket.  */
3546       br_token.type = COMPLEX_BRACKET;
3547       br_token.opr.mbcset = mbcset;
3548       dfa->has_mb_node = 1;
3549       new_idx = re_dfa_add_node (dfa, br_token, 0);
3550       mbc_tree = create_tree (NULL, NULL, 0, new_idx);
3551       if (BE (new_idx == -1 || mbc_tree == NULL, 0))
3552         goto build_word_op_espace;
3553       /* Then join them by ALT node.  */
3554       alt_token.type = OP_ALT;
3555       new_idx = re_dfa_add_node (dfa, alt_token, 0);
3556       tree = create_tree (tree, mbc_tree, 0, new_idx);
3557       if (BE (new_idx != -1 && mbc_tree != NULL, 1))
3558         return tree;
3559     }
3560   else
3561     {
3562       free_charset (mbcset);
3563       return tree;
3564     }
3565 #else /* not RE_ENABLE_I18N */
3566   return tree;
3567 #endif /* not RE_ENABLE_I18N */
3568
3569  build_word_op_espace:
3570   re_free (sbcset);
3571 #ifdef RE_ENABLE_I18N
3572   free_charset (mbcset);
3573 #endif /* RE_ENABLE_I18N */
3574   *err = REG_ESPACE;
3575   return NULL;
3576 }
3577
3578 /* This is intended for the expressions like "a{1,3}".
3579    Fetch a number from `input', and return the number.
3580    Return -1, if the number field is empty like "{,1}".
3581    Return -2, If an error is occured.  */
3582
3583 static int
3584 fetch_number (input, token, syntax)
3585      re_string_t *input;
3586      re_token_t *token;
3587      reg_syntax_t syntax;
3588 {
3589   int num = -1;
3590   unsigned char c;
3591   while (1)
3592     {
3593       *token = fetch_token (input, syntax);
3594       c = token->opr.c;
3595       if (BE (token->type == END_OF_RE, 0))
3596         return -2;
3597       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3598         break;
3599       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3600              ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3601       num = (num > RE_DUP_MAX) ? -2 : num;
3602     }
3603   return num;
3604 }
3605 \f
3606 #ifdef RE_ENABLE_I18N
3607 static void
3608 free_charset (re_charset_t *cset)
3609 {
3610   re_free (cset->mbchars);
3611 # ifdef _LIBC
3612   re_free (cset->coll_syms);
3613   re_free (cset->equiv_classes);
3614   re_free (cset->range_starts);
3615   re_free (cset->range_ends);
3616 # endif
3617   re_free (cset->char_classes);
3618   re_free (cset);
3619 }
3620 #endif /* RE_ENABLE_I18N */
3621 \f
3622 /* Functions for binary tree operation.  */
3623
3624 /* Create a node of tree.
3625    Note: This function automatically free left and right if malloc fails.  */
3626
3627 static bin_tree_t *
3628 create_tree (left, right, type, index)
3629      bin_tree_t *left;
3630      bin_tree_t *right;
3631      re_token_type_t type;
3632      int index;
3633 {
3634   bin_tree_t *tree;
3635   tree = re_malloc (bin_tree_t, 1);
3636   if (BE (tree == NULL, 0))
3637     {
3638       free_bin_tree (left);
3639       free_bin_tree (right);
3640       return NULL;
3641     }
3642   tree->parent = NULL;
3643   tree->left = left;
3644   tree->right = right;
3645   tree->type = type;
3646   tree->node_idx = index;
3647   tree->first = -1;
3648   tree->next = -1;
3649   re_node_set_init_empty (&tree->eclosure);
3650
3651   if (left != NULL)
3652     left->parent = tree;
3653   if (right != NULL)
3654     right->parent = tree;
3655   return tree;
3656 }
3657
3658 /* Free the sub tree pointed by TREE.  */
3659
3660 static void
3661 free_bin_tree (tree)
3662      bin_tree_t *tree;
3663 {
3664   if (tree == NULL)
3665     return;
3666   /*re_node_set_free (&tree->eclosure);*/
3667   free_bin_tree (tree->left);
3668   free_bin_tree (tree->right);
3669   re_free (tree);
3670 }
3671
3672 /* Duplicate the node SRC, and return new node.  */
3673
3674 static bin_tree_t *
3675 duplicate_tree (src, dfa)
3676      const bin_tree_t *src;
3677      re_dfa_t *dfa;
3678 {
3679   bin_tree_t *left = NULL, *right = NULL, *new_tree;
3680   int new_node_idx;
3681   /* Since node indies must be according to Post-order of the tree,
3682      we must duplicate the left at first.  */
3683   if (src->left != NULL)
3684     {
3685       left = duplicate_tree (src->left, dfa);
3686       if (left == NULL)
3687         return NULL;
3688     }
3689
3690   /* Secondaly, duplicate the right.  */
3691   if (src->right != NULL)
3692     {
3693       right = duplicate_tree (src->right, dfa);
3694       if (right == NULL)
3695         {
3696           free_bin_tree (left);
3697           return NULL;
3698         }
3699     }
3700
3701   /* At last, duplicate itself.  */
3702   if (src->type == NON_TYPE)
3703     {
3704       new_node_idx = re_dfa_add_node (dfa, dfa->nodes[src->node_idx], 0);
3705       dfa->nodes[new_node_idx].duplicated = 1;
3706       if (BE (new_node_idx == -1, 0))
3707         {
3708           free_bin_tree (left);
3709           free_bin_tree (right);
3710           return NULL;
3711         }
3712     }
3713   else
3714     new_node_idx = src->type;
3715
3716   new_tree = create_tree (left, right, src->type, new_node_idx);
3717   if (BE (new_tree == NULL, 0))
3718     {
3719       free_bin_tree (left);
3720       free_bin_tree (right);
3721     }
3722   return new_tree;
3723 }