]> git.ipfire.org Git - thirdparty/glibc.git/blob - posix/regex.c
Update.
[thirdparty/glibc.git] / posix / regex.c
1 /* Extended regular expression matching and search library,
2 version 0.12.
3 (Implements POSIX draft P1003.2/D11.2, except for some of the
4 internationalization features.)
5 Copyright (C) 1993-1999, 2000 Free Software Foundation, Inc.
6
7 The GNU C Library is free software; you can redistribute it and/or
8 modify it under the terms of the GNU Library General Public License as
9 published by the Free Software Foundation; either version 2 of the
10 License, or (at your option) any later version.
11
12 The GNU C Library is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Library General Public License for more details.
16
17 You should have received a copy of the GNU Library General Public
18 License along with the GNU C Library; see the file COPYING.LIB. If not,
19 write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
21
22 /* AIX requires this to be the first thing in the file. */
23 #if defined _AIX && !defined REGEX_MALLOC
24 #pragma alloca
25 #endif
26
27 #undef _GNU_SOURCE
28 #define _GNU_SOURCE
29
30 #ifdef HAVE_CONFIG_H
31 # include <config.h>
32 #endif
33
34 #ifndef PARAMS
35 # if defined __GNUC__ || (defined __STDC__ && __STDC__)
36 # define PARAMS(args) args
37 # else
38 # define PARAMS(args) ()
39 # endif /* GCC. */
40 #endif /* Not PARAMS. */
41
42 #if defined STDC_HEADERS && !defined emacs
43 # include <stddef.h>
44 #else
45 /* We need this for `regex.h', and perhaps for the Emacs include files. */
46 # include <sys/types.h>
47 #endif
48
49 #define WIDE_CHAR_SUPPORT (HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC)
50
51 /* For platform which support the ISO C amendement 1 functionality we
52 support user defined character classes. */
53 #if defined _LIBC || WIDE_CHAR_SUPPORT
54 /* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */
55 # include <wchar.h>
56 # include <wctype.h>
57 #endif
58
59 #ifdef _LIBC
60 /* We have to keep the namespace clean. */
61 # define regfree(preg) __regfree (preg)
62 # define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef)
63 # define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags)
64 # define regerror(errcode, preg, errbuf, errbuf_size) \
65 __regerror(errcode, preg, errbuf, errbuf_size)
66 # define re_set_registers(bu, re, nu, st, en) \
67 __re_set_registers (bu, re, nu, st, en)
68 # define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \
69 __re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
70 # define re_match(bufp, string, size, pos, regs) \
71 __re_match (bufp, string, size, pos, regs)
72 # define re_search(bufp, string, size, startpos, range, regs) \
73 __re_search (bufp, string, size, startpos, range, regs)
74 # define re_compile_pattern(pattern, length, bufp) \
75 __re_compile_pattern (pattern, length, bufp)
76 # define re_set_syntax(syntax) __re_set_syntax (syntax)
77 # define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \
78 __re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop)
79 # define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)
80
81 # define btowc __btowc
82
83 /* We are also using some library internals. */
84 # include <locale/localeinfo.h>
85 # include <locale/elem-hash.h>
86 # include <langinfo.h>
87 #endif
88
89 /* This is for other GNU distributions with internationalized messages. */
90 #if HAVE_LIBINTL_H || defined _LIBC
91 # include <libintl.h>
92 # ifdef _LIBC
93 # undef gettext
94 # define gettext(msgid) __dcgettext ("libc", msgid, LC_MESSAGES)
95 # endif
96 #else
97 # define gettext(msgid) (msgid)
98 #endif
99
100 #ifndef gettext_noop
101 /* This define is so xgettext can find the internationalizable
102 strings. */
103 # define gettext_noop(String) String
104 #endif
105
106 /* The `emacs' switch turns on certain matching commands
107 that make sense only in Emacs. */
108 #ifdef emacs
109
110 # include "lisp.h"
111 # include "buffer.h"
112 # include "syntax.h"
113
114 #else /* not emacs */
115
116 /* If we are not linking with Emacs proper,
117 we can't use the relocating allocator
118 even if config.h says that we can. */
119 # undef REL_ALLOC
120
121 # if defined STDC_HEADERS || defined _LIBC
122 # include <stdlib.h>
123 # else
124 char *malloc ();
125 char *realloc ();
126 # endif
127
128 /* When used in Emacs's lib-src, we need to get bzero and bcopy somehow.
129 If nothing else has been done, use the method below. */
130 # ifdef INHIBIT_STRING_HEADER
131 # if !(defined HAVE_BZERO && defined HAVE_BCOPY)
132 # if !defined bzero && !defined bcopy
133 # undef INHIBIT_STRING_HEADER
134 # endif
135 # endif
136 # endif
137
138 /* This is the normal way of making sure we have a bcopy and a bzero.
139 This is used in most programs--a few other programs avoid this
140 by defining INHIBIT_STRING_HEADER. */
141 # ifndef INHIBIT_STRING_HEADER
142 # if defined HAVE_STRING_H || defined STDC_HEADERS || defined _LIBC
143 # include <string.h>
144 # ifndef bzero
145 # ifndef _LIBC
146 # define bzero(s, n) (memset (s, '\0', n), (s))
147 # else
148 # define bzero(s, n) __bzero (s, n)
149 # endif
150 # endif
151 # else
152 # include <strings.h>
153 # ifndef memcmp
154 # define memcmp(s1, s2, n) bcmp (s1, s2, n)
155 # endif
156 # ifndef memcpy
157 # define memcpy(d, s, n) (bcopy (s, d, n), (d))
158 # endif
159 # endif
160 # endif
161
162 /* Define the syntax stuff for \<, \>, etc. */
163
164 /* This must be nonzero for the wordchar and notwordchar pattern
165 commands in re_match_2. */
166 # ifndef Sword
167 # define Sword 1
168 # endif
169
170 # ifdef SWITCH_ENUM_BUG
171 # define SWITCH_ENUM_CAST(x) ((int)(x))
172 # else
173 # define SWITCH_ENUM_CAST(x) (x)
174 # endif
175
176 #endif /* not emacs */
177
178 #if defined _LIBC || HAVE_LIMITS_H
179 # include <limits.h>
180 #endif
181
182 #ifndef MB_LEN_MAX
183 # define MB_LEN_MAX 1
184 #endif
185 \f
186 /* Get the interface, including the syntax bits. */
187 #include <regex.h>
188
189 /* isalpha etc. are used for the character classes. */
190 #include <ctype.h>
191
192 /* Jim Meyering writes:
193
194 "... Some ctype macros are valid only for character codes that
195 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
196 using /bin/cc or gcc but without giving an ansi option). So, all
197 ctype uses should be through macros like ISPRINT... If
198 STDC_HEADERS is defined, then autoconf has verified that the ctype
199 macros don't need to be guarded with references to isascii. ...
200 Defining isascii to 1 should let any compiler worth its salt
201 eliminate the && through constant folding."
202 Solaris defines some of these symbols so we must undefine them first. */
203
204 #undef ISASCII
205 #if defined STDC_HEADERS || (!defined isascii && !defined HAVE_ISASCII)
206 # define ISASCII(c) 1
207 #else
208 # define ISASCII(c) isascii(c)
209 #endif
210
211 #ifdef isblank
212 # define ISBLANK(c) (ISASCII (c) && isblank (c))
213 #else
214 # define ISBLANK(c) ((c) == ' ' || (c) == '\t')
215 #endif
216 #ifdef isgraph
217 # define ISGRAPH(c) (ISASCII (c) && isgraph (c))
218 #else
219 # define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
220 #endif
221
222 #undef ISPRINT
223 #define ISPRINT(c) (ISASCII (c) && isprint (c))
224 #define ISDIGIT(c) (ISASCII (c) && isdigit (c))
225 #define ISALNUM(c) (ISASCII (c) && isalnum (c))
226 #define ISALPHA(c) (ISASCII (c) && isalpha (c))
227 #define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
228 #define ISLOWER(c) (ISASCII (c) && islower (c))
229 #define ISPUNCT(c) (ISASCII (c) && ispunct (c))
230 #define ISSPACE(c) (ISASCII (c) && isspace (c))
231 #define ISUPPER(c) (ISASCII (c) && isupper (c))
232 #define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
233
234 #ifdef _tolower
235 # define TOLOWER(c) _tolower(c)
236 #else
237 # define TOLOWER(c) tolower(c)
238 #endif
239
240 #ifndef NULL
241 # define NULL (void *)0
242 #endif
243
244 /* We remove any previous definition of `SIGN_EXTEND_CHAR',
245 since ours (we hope) works properly with all combinations of
246 machines, compilers, `char' and `unsigned char' argument types.
247 (Per Bothner suggested the basic approach.) */
248 #undef SIGN_EXTEND_CHAR
249 #if __STDC__
250 # define SIGN_EXTEND_CHAR(c) ((signed char) (c))
251 #else /* not __STDC__ */
252 /* As in Harbison and Steele. */
253 # define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
254 #endif
255 \f
256 #ifndef emacs
257 /* How many characters in the character set. */
258 # define CHAR_SET_SIZE 256
259
260 # ifdef SYNTAX_TABLE
261
262 extern char *re_syntax_table;
263
264 # else /* not SYNTAX_TABLE */
265
266 static char re_syntax_table[CHAR_SET_SIZE];
267
268 static void
269 init_syntax_once ()
270 {
271 register int c;
272 static int done = 0;
273
274 if (done)
275 return;
276 bzero (re_syntax_table, sizeof re_syntax_table);
277
278 for (c = 0; c < CHAR_SET_SIZE; ++c)
279 if (ISALNUM (c))
280 re_syntax_table[c] = Sword;
281
282 re_syntax_table['_'] = Sword;
283
284 done = 1;
285 }
286
287 # endif /* not SYNTAX_TABLE */
288
289 # define SYNTAX(c) re_syntax_table[(unsigned char) (c)]
290
291 #endif /* emacs */
292 \f
293 /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
294 use `alloca' instead of `malloc'. This is because using malloc in
295 re_search* or re_match* could cause memory leaks when C-g is used in
296 Emacs; also, malloc is slower and causes storage fragmentation. On
297 the other hand, malloc is more portable, and easier to debug.
298
299 Because we sometimes use alloca, some routines have to be macros,
300 not functions -- `alloca'-allocated space disappears at the end of the
301 function it is called in. */
302
303 #ifdef REGEX_MALLOC
304
305 # define REGEX_ALLOCATE malloc
306 # define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
307 # define REGEX_FREE free
308
309 #else /* not REGEX_MALLOC */
310
311 /* Emacs already defines alloca, sometimes. */
312 # ifndef alloca
313
314 /* Make alloca work the best possible way. */
315 # ifdef __GNUC__
316 # define alloca __builtin_alloca
317 # else /* not __GNUC__ */
318 # if HAVE_ALLOCA_H
319 # include <alloca.h>
320 # endif /* HAVE_ALLOCA_H */
321 # endif /* not __GNUC__ */
322
323 # endif /* not alloca */
324
325 # define REGEX_ALLOCATE alloca
326
327 /* Assumes a `char *destination' variable. */
328 # define REGEX_REALLOCATE(source, osize, nsize) \
329 (destination = (char *) alloca (nsize), \
330 memcpy (destination, source, osize))
331
332 /* No need to do anything to free, after alloca. */
333 # define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
334
335 #endif /* not REGEX_MALLOC */
336
337 /* Define how to allocate the failure stack. */
338
339 #if defined REL_ALLOC && defined REGEX_MALLOC
340
341 # define REGEX_ALLOCATE_STACK(size) \
342 r_alloc (&failure_stack_ptr, (size))
343 # define REGEX_REALLOCATE_STACK(source, osize, nsize) \
344 r_re_alloc (&failure_stack_ptr, (nsize))
345 # define REGEX_FREE_STACK(ptr) \
346 r_alloc_free (&failure_stack_ptr)
347
348 #else /* not using relocating allocator */
349
350 # ifdef REGEX_MALLOC
351
352 # define REGEX_ALLOCATE_STACK malloc
353 # define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
354 # define REGEX_FREE_STACK free
355
356 # else /* not REGEX_MALLOC */
357
358 # define REGEX_ALLOCATE_STACK alloca
359
360 # define REGEX_REALLOCATE_STACK(source, osize, nsize) \
361 REGEX_REALLOCATE (source, osize, nsize)
362 /* No need to explicitly free anything. */
363 # define REGEX_FREE_STACK(arg)
364
365 # endif /* not REGEX_MALLOC */
366 #endif /* not using relocating allocator */
367
368
369 /* True if `size1' is non-NULL and PTR is pointing anywhere inside
370 `string1' or just past its end. This works if PTR is NULL, which is
371 a good thing. */
372 #define FIRST_STRING_P(ptr) \
373 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
374
375 /* (Re)Allocate N items of type T using malloc, or fail. */
376 #define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
377 #define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
378 #define RETALLOC_IF(addr, n, t) \
379 if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
380 #define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
381
382 #define BYTEWIDTH 8 /* In bits. */
383
384 #define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
385
386 #undef MAX
387 #undef MIN
388 #define MAX(a, b) ((a) > (b) ? (a) : (b))
389 #define MIN(a, b) ((a) < (b) ? (a) : (b))
390
391 typedef char boolean;
392 #define false 0
393 #define true 1
394
395 static int re_match_2_internal PARAMS ((struct re_pattern_buffer *bufp,
396 const char *string1, int size1,
397 const char *string2, int size2,
398 int pos,
399 struct re_registers *regs,
400 int stop));
401 \f
402 /* These are the command codes that appear in compiled regular
403 expressions. Some opcodes are followed by argument bytes. A
404 command code can specify any interpretation whatsoever for its
405 arguments. Zero bytes may appear in the compiled regular expression. */
406
407 typedef enum
408 {
409 no_op = 0,
410
411 /* Succeed right away--no more backtracking. */
412 succeed,
413
414 /* Followed by one byte giving n, then by n literal bytes. */
415 exactn,
416
417 /* Matches any (more or less) character. */
418 anychar,
419
420 /* Matches any one char belonging to specified set. First
421 following byte is number of bitmap bytes. Then come bytes
422 for a bitmap saying which chars are in. Bits in each byte
423 are ordered low-bit-first. A character is in the set if its
424 bit is 1. A character too large to have a bit in the map is
425 automatically not in the set. */
426 charset,
427
428 /* Same parameters as charset, but match any character that is
429 not one of those specified. */
430 charset_not,
431
432 /* Start remembering the text that is matched, for storing in a
433 register. Followed by one byte with the register number, in
434 the range 0 to one less than the pattern buffer's re_nsub
435 field. Then followed by one byte with the number of groups
436 inner to this one. (This last has to be part of the
437 start_memory only because we need it in the on_failure_jump
438 of re_match_2.) */
439 start_memory,
440
441 /* Stop remembering the text that is matched and store it in a
442 memory register. Followed by one byte with the register
443 number, in the range 0 to one less than `re_nsub' in the
444 pattern buffer, and one byte with the number of inner groups,
445 just like `start_memory'. (We need the number of inner
446 groups here because we don't have any easy way of finding the
447 corresponding start_memory when we're at a stop_memory.) */
448 stop_memory,
449
450 /* Match a duplicate of something remembered. Followed by one
451 byte containing the register number. */
452 duplicate,
453
454 /* Fail unless at beginning of line. */
455 begline,
456
457 /* Fail unless at end of line. */
458 endline,
459
460 /* Succeeds if at beginning of buffer (if emacs) or at beginning
461 of string to be matched (if not). */
462 begbuf,
463
464 /* Analogously, for end of buffer/string. */
465 endbuf,
466
467 /* Followed by two byte relative address to which to jump. */
468 jump,
469
470 /* Same as jump, but marks the end of an alternative. */
471 jump_past_alt,
472
473 /* Followed by two-byte relative address of place to resume at
474 in case of failure. */
475 on_failure_jump,
476
477 /* Like on_failure_jump, but pushes a placeholder instead of the
478 current string position when executed. */
479 on_failure_keep_string_jump,
480
481 /* Throw away latest failure point and then jump to following
482 two-byte relative address. */
483 pop_failure_jump,
484
485 /* Change to pop_failure_jump if know won't have to backtrack to
486 match; otherwise change to jump. This is used to jump
487 back to the beginning of a repeat. If what follows this jump
488 clearly won't match what the repeat does, such that we can be
489 sure that there is no use backtracking out of repetitions
490 already matched, then we change it to a pop_failure_jump.
491 Followed by two-byte address. */
492 maybe_pop_jump,
493
494 /* Jump to following two-byte address, and push a dummy failure
495 point. This failure point will be thrown away if an attempt
496 is made to use it for a failure. A `+' construct makes this
497 before the first repeat. Also used as an intermediary kind
498 of jump when compiling an alternative. */
499 dummy_failure_jump,
500
501 /* Push a dummy failure point and continue. Used at the end of
502 alternatives. */
503 push_dummy_failure,
504
505 /* Followed by two-byte relative address and two-byte number n.
506 After matching N times, jump to the address upon failure. */
507 succeed_n,
508
509 /* Followed by two-byte relative address, and two-byte number n.
510 Jump to the address N times, then fail. */
511 jump_n,
512
513 /* Set the following two-byte relative address to the
514 subsequent two-byte number. The address *includes* the two
515 bytes of number. */
516 set_number_at,
517
518 wordchar, /* Matches any word-constituent character. */
519 notwordchar, /* Matches any char that is not a word-constituent. */
520
521 wordbeg, /* Succeeds if at word beginning. */
522 wordend, /* Succeeds if at word end. */
523
524 wordbound, /* Succeeds if at a word boundary. */
525 notwordbound /* Succeeds if not at a word boundary. */
526
527 #ifdef emacs
528 ,before_dot, /* Succeeds if before point. */
529 at_dot, /* Succeeds if at point. */
530 after_dot, /* Succeeds if after point. */
531
532 /* Matches any character whose syntax is specified. Followed by
533 a byte which contains a syntax code, e.g., Sword. */
534 syntaxspec,
535
536 /* Matches any character whose syntax is not that specified. */
537 notsyntaxspec
538 #endif /* emacs */
539 } re_opcode_t;
540 \f
541 /* Common operations on the compiled pattern. */
542
543 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */
544
545 #define STORE_NUMBER(destination, number) \
546 do { \
547 (destination)[0] = (number) & 0377; \
548 (destination)[1] = (number) >> 8; \
549 } while (0)
550
551 /* Same as STORE_NUMBER, except increment DESTINATION to
552 the byte after where the number is stored. Therefore, DESTINATION
553 must be an lvalue. */
554
555 #define STORE_NUMBER_AND_INCR(destination, number) \
556 do { \
557 STORE_NUMBER (destination, number); \
558 (destination) += 2; \
559 } while (0)
560
561 /* Put into DESTINATION a number stored in two contiguous bytes starting
562 at SOURCE. */
563
564 #define EXTRACT_NUMBER(destination, source) \
565 do { \
566 (destination) = *(source) & 0377; \
567 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \
568 } while (0)
569
570 #ifdef DEBUG
571 static void extract_number _RE_ARGS ((int *dest, unsigned char *source));
572 static void
573 extract_number (dest, source)
574 int *dest;
575 unsigned char *source;
576 {
577 int temp = SIGN_EXTEND_CHAR (*(source + 1));
578 *dest = *source & 0377;
579 *dest += temp << 8;
580 }
581
582 # ifndef EXTRACT_MACROS /* To debug the macros. */
583 # undef EXTRACT_NUMBER
584 # define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
585 # endif /* not EXTRACT_MACROS */
586
587 #endif /* DEBUG */
588
589 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
590 SOURCE must be an lvalue. */
591
592 #define EXTRACT_NUMBER_AND_INCR(destination, source) \
593 do { \
594 EXTRACT_NUMBER (destination, source); \
595 (source) += 2; \
596 } while (0)
597
598 #ifdef DEBUG
599 static void extract_number_and_incr _RE_ARGS ((int *destination,
600 unsigned char **source));
601 static void
602 extract_number_and_incr (destination, source)
603 int *destination;
604 unsigned char **source;
605 {
606 extract_number (destination, *source);
607 *source += 2;
608 }
609
610 # ifndef EXTRACT_MACROS
611 # undef EXTRACT_NUMBER_AND_INCR
612 # define EXTRACT_NUMBER_AND_INCR(dest, src) \
613 extract_number_and_incr (&dest, &src)
614 # endif /* not EXTRACT_MACROS */
615
616 #endif /* DEBUG */
617 \f
618 /* If DEBUG is defined, Regex prints many voluminous messages about what
619 it is doing (if the variable `debug' is nonzero). If linked with the
620 main program in `iregex.c', you can enter patterns and strings
621 interactively. And if linked with the main program in `main.c' and
622 the other test files, you can run the already-written tests. */
623
624 #ifdef DEBUG
625
626 /* We use standard I/O for debugging. */
627 # include <stdio.h>
628
629 /* It is useful to test things that ``must'' be true when debugging. */
630 # include <assert.h>
631
632 static int debug;
633
634 # define DEBUG_STATEMENT(e) e
635 # define DEBUG_PRINT1(x) if (debug) printf (x)
636 # define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2)
637 # define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3)
638 # define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4)
639 # define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
640 if (debug) print_partial_compiled_pattern (s, e)
641 # define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
642 if (debug) print_double_string (w, s1, sz1, s2, sz2)
643
644
645 /* Print the fastmap in human-readable form. */
646
647 void
648 print_fastmap (fastmap)
649 char *fastmap;
650 {
651 unsigned was_a_range = 0;
652 unsigned i = 0;
653
654 while (i < (1 << BYTEWIDTH))
655 {
656 if (fastmap[i++])
657 {
658 was_a_range = 0;
659 putchar (i - 1);
660 while (i < (1 << BYTEWIDTH) && fastmap[i])
661 {
662 was_a_range = 1;
663 i++;
664 }
665 if (was_a_range)
666 {
667 printf ("-");
668 putchar (i - 1);
669 }
670 }
671 }
672 putchar ('\n');
673 }
674
675
676 /* Print a compiled pattern string in human-readable form, starting at
677 the START pointer into it and ending just before the pointer END. */
678
679 void
680 print_partial_compiled_pattern (start, end)
681 unsigned char *start;
682 unsigned char *end;
683 {
684 int mcnt, mcnt2;
685 unsigned char *p1;
686 unsigned char *p = start;
687 unsigned char *pend = end;
688
689 if (start == NULL)
690 {
691 printf ("(null)\n");
692 return;
693 }
694
695 /* Loop over pattern commands. */
696 while (p < pend)
697 {
698 #ifdef _LIBC
699 printf ("%t:\t", p - start);
700 #else
701 printf ("%ld:\t", (long int) (p - start));
702 #endif
703
704 switch ((re_opcode_t) *p++)
705 {
706 case no_op:
707 printf ("/no_op");
708 break;
709
710 case exactn:
711 mcnt = *p++;
712 printf ("/exactn/%d", mcnt);
713 do
714 {
715 putchar ('/');
716 putchar (*p++);
717 }
718 while (--mcnt);
719 break;
720
721 case start_memory:
722 mcnt = *p++;
723 printf ("/start_memory/%d/%d", mcnt, *p++);
724 break;
725
726 case stop_memory:
727 mcnt = *p++;
728 printf ("/stop_memory/%d/%d", mcnt, *p++);
729 break;
730
731 case duplicate:
732 printf ("/duplicate/%d", *p++);
733 break;
734
735 case anychar:
736 printf ("/anychar");
737 break;
738
739 case charset:
740 case charset_not:
741 {
742 register int c, last = -100;
743 register int in_range = 0;
744
745 printf ("/charset [%s",
746 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
747
748 assert (p + *p < pend);
749
750 for (c = 0; c < 256; c++)
751 if (c / 8 < *p
752 && (p[1 + (c/8)] & (1 << (c % 8))))
753 {
754 /* Are we starting a range? */
755 if (last + 1 == c && ! in_range)
756 {
757 putchar ('-');
758 in_range = 1;
759 }
760 /* Have we broken a range? */
761 else if (last + 1 != c && in_range)
762 {
763 putchar (last);
764 in_range = 0;
765 }
766
767 if (! in_range)
768 putchar (c);
769
770 last = c;
771 }
772
773 if (in_range)
774 putchar (last);
775
776 putchar (']');
777
778 p += 1 + *p;
779 }
780 break;
781
782 case begline:
783 printf ("/begline");
784 break;
785
786 case endline:
787 printf ("/endline");
788 break;
789
790 case on_failure_jump:
791 extract_number_and_incr (&mcnt, &p);
792 #ifdef _LIBC
793 printf ("/on_failure_jump to %t", p + mcnt - start);
794 #else
795 printf ("/on_failure_jump to %ld", (long int) (p + mcnt - start));
796 #endif
797 break;
798
799 case on_failure_keep_string_jump:
800 extract_number_and_incr (&mcnt, &p);
801 #ifdef _LIBC
802 printf ("/on_failure_keep_string_jump to %t", p + mcnt - start);
803 #else
804 printf ("/on_failure_keep_string_jump to %ld",
805 (long int) (p + mcnt - start));
806 #endif
807 break;
808
809 case dummy_failure_jump:
810 extract_number_and_incr (&mcnt, &p);
811 #ifdef _LIBC
812 printf ("/dummy_failure_jump to %t", p + mcnt - start);
813 #else
814 printf ("/dummy_failure_jump to %ld", (long int) (p + mcnt - start));
815 #endif
816 break;
817
818 case push_dummy_failure:
819 printf ("/push_dummy_failure");
820 break;
821
822 case maybe_pop_jump:
823 extract_number_and_incr (&mcnt, &p);
824 #ifdef _LIBC
825 printf ("/maybe_pop_jump to %t", p + mcnt - start);
826 #else
827 printf ("/maybe_pop_jump to %ld", (long int) (p + mcnt - start));
828 #endif
829 break;
830
831 case pop_failure_jump:
832 extract_number_and_incr (&mcnt, &p);
833 #ifdef _LIBC
834 printf ("/pop_failure_jump to %t", p + mcnt - start);
835 #else
836 printf ("/pop_failure_jump to %ld", (long int) (p + mcnt - start));
837 #endif
838 break;
839
840 case jump_past_alt:
841 extract_number_and_incr (&mcnt, &p);
842 #ifdef _LIBC
843 printf ("/jump_past_alt to %t", p + mcnt - start);
844 #else
845 printf ("/jump_past_alt to %ld", (long int) (p + mcnt - start));
846 #endif
847 break;
848
849 case jump:
850 extract_number_and_incr (&mcnt, &p);
851 #ifdef _LIBC
852 printf ("/jump to %t", p + mcnt - start);
853 #else
854 printf ("/jump to %ld", (long int) (p + mcnt - start));
855 #endif
856 break;
857
858 case succeed_n:
859 extract_number_and_incr (&mcnt, &p);
860 p1 = p + mcnt;
861 extract_number_and_incr (&mcnt2, &p);
862 #ifdef _LIBC
863 printf ("/succeed_n to %t, %d times", p1 - start, mcnt2);
864 #else
865 printf ("/succeed_n to %ld, %d times",
866 (long int) (p1 - start), mcnt2);
867 #endif
868 break;
869
870 case jump_n:
871 extract_number_and_incr (&mcnt, &p);
872 p1 = p + mcnt;
873 extract_number_and_incr (&mcnt2, &p);
874 printf ("/jump_n to %d, %d times", p1 - start, mcnt2);
875 break;
876
877 case set_number_at:
878 extract_number_and_incr (&mcnt, &p);
879 p1 = p + mcnt;
880 extract_number_and_incr (&mcnt2, &p);
881 #ifdef _LIBC
882 printf ("/set_number_at location %t to %d", p1 - start, mcnt2);
883 #else
884 printf ("/set_number_at location %ld to %d",
885 (long int) (p1 - start), mcnt2);
886 #endif
887 break;
888
889 case wordbound:
890 printf ("/wordbound");
891 break;
892
893 case notwordbound:
894 printf ("/notwordbound");
895 break;
896
897 case wordbeg:
898 printf ("/wordbeg");
899 break;
900
901 case wordend:
902 printf ("/wordend");
903
904 # ifdef emacs
905 case before_dot:
906 printf ("/before_dot");
907 break;
908
909 case at_dot:
910 printf ("/at_dot");
911 break;
912
913 case after_dot:
914 printf ("/after_dot");
915 break;
916
917 case syntaxspec:
918 printf ("/syntaxspec");
919 mcnt = *p++;
920 printf ("/%d", mcnt);
921 break;
922
923 case notsyntaxspec:
924 printf ("/notsyntaxspec");
925 mcnt = *p++;
926 printf ("/%d", mcnt);
927 break;
928 # endif /* emacs */
929
930 case wordchar:
931 printf ("/wordchar");
932 break;
933
934 case notwordchar:
935 printf ("/notwordchar");
936 break;
937
938 case begbuf:
939 printf ("/begbuf");
940 break;
941
942 case endbuf:
943 printf ("/endbuf");
944 break;
945
946 default:
947 printf ("?%d", *(p-1));
948 }
949
950 putchar ('\n');
951 }
952
953 #ifdef _LIBC
954 printf ("%t:\tend of pattern.\n", p - start);
955 #else
956 printf ("%ld:\tend of pattern.\n", (long int) (p - start));
957 #endif
958 }
959
960
961 void
962 print_compiled_pattern (bufp)
963 struct re_pattern_buffer *bufp;
964 {
965 unsigned char *buffer = bufp->buffer;
966
967 print_partial_compiled_pattern (buffer, buffer + bufp->used);
968 printf ("%ld bytes used/%ld bytes allocated.\n",
969 bufp->used, bufp->allocated);
970
971 if (bufp->fastmap_accurate && bufp->fastmap)
972 {
973 printf ("fastmap: ");
974 print_fastmap (bufp->fastmap);
975 }
976
977 #ifdef _LIBC
978 printf ("re_nsub: %Zd\t", bufp->re_nsub);
979 #else
980 printf ("re_nsub: %ld\t", (long int) bufp->re_nsub);
981 #endif
982 printf ("regs_alloc: %d\t", bufp->regs_allocated);
983 printf ("can_be_null: %d\t", bufp->can_be_null);
984 printf ("newline_anchor: %d\n", bufp->newline_anchor);
985 printf ("no_sub: %d\t", bufp->no_sub);
986 printf ("not_bol: %d\t", bufp->not_bol);
987 printf ("not_eol: %d\t", bufp->not_eol);
988 printf ("syntax: %lx\n", bufp->syntax);
989 /* Perhaps we should print the translate table? */
990 }
991
992
993 void
994 print_double_string (where, string1, size1, string2, size2)
995 const char *where;
996 const char *string1;
997 const char *string2;
998 int size1;
999 int size2;
1000 {
1001 int this_char;
1002
1003 if (where == NULL)
1004 printf ("(null)");
1005 else
1006 {
1007 if (FIRST_STRING_P (where))
1008 {
1009 for (this_char = where - string1; this_char < size1; this_char++)
1010 putchar (string1[this_char]);
1011
1012 where = string2;
1013 }
1014
1015 for (this_char = where - string2; this_char < size2; this_char++)
1016 putchar (string2[this_char]);
1017 }
1018 }
1019
1020 void
1021 printchar (c)
1022 int c;
1023 {
1024 putc (c, stderr);
1025 }
1026
1027 #else /* not DEBUG */
1028
1029 # undef assert
1030 # define assert(e)
1031
1032 # define DEBUG_STATEMENT(e)
1033 # define DEBUG_PRINT1(x)
1034 # define DEBUG_PRINT2(x1, x2)
1035 # define DEBUG_PRINT3(x1, x2, x3)
1036 # define DEBUG_PRINT4(x1, x2, x3, x4)
1037 # define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
1038 # define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
1039
1040 #endif /* not DEBUG */
1041 \f
1042 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
1043 also be assigned to arbitrarily: each pattern buffer stores its own
1044 syntax, so it can be changed between regex compilations. */
1045 /* This has no initializer because initialized variables in Emacs
1046 become read-only after dumping. */
1047 reg_syntax_t re_syntax_options;
1048
1049
1050 /* Specify the precise syntax of regexps for compilation. This provides
1051 for compatibility for various utilities which historically have
1052 different, incompatible syntaxes.
1053
1054 The argument SYNTAX is a bit mask comprised of the various bits
1055 defined in regex.h. We return the old syntax. */
1056
1057 reg_syntax_t
1058 re_set_syntax (syntax)
1059 reg_syntax_t syntax;
1060 {
1061 reg_syntax_t ret = re_syntax_options;
1062
1063 re_syntax_options = syntax;
1064 #ifdef DEBUG
1065 if (syntax & RE_DEBUG)
1066 debug = 1;
1067 else if (debug) /* was on but now is not */
1068 debug = 0;
1069 #endif /* DEBUG */
1070 return ret;
1071 }
1072 #ifdef _LIBC
1073 weak_alias (__re_set_syntax, re_set_syntax)
1074 #endif
1075 \f
1076 /* This table gives an error message for each of the error codes listed
1077 in regex.h. Obviously the order here has to be same as there.
1078 POSIX doesn't require that we do anything for REG_NOERROR,
1079 but why not be nice? */
1080
1081 static const char re_error_msgid[] =
1082 {
1083 #define REG_NOERROR_IDX 0
1084 gettext_noop ("Success") /* REG_NOERROR */
1085 "\0"
1086 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
1087 gettext_noop ("No match") /* REG_NOMATCH */
1088 "\0"
1089 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
1090 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
1091 "\0"
1092 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
1093 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
1094 "\0"
1095 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
1096 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
1097 "\0"
1098 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
1099 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
1100 "\0"
1101 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
1102 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
1103 "\0"
1104 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
1105 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
1106 "\0"
1107 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
1108 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
1109 "\0"
1110 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
1111 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
1112 "\0"
1113 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
1114 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
1115 "\0"
1116 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
1117 gettext_noop ("Invalid range end") /* REG_ERANGE */
1118 "\0"
1119 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
1120 gettext_noop ("Memory exhausted") /* REG_ESPACE */
1121 "\0"
1122 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
1123 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
1124 "\0"
1125 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
1126 gettext_noop ("Premature end of regular expression") /* REG_EEND */
1127 "\0"
1128 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
1129 gettext_noop ("Regular expression too big") /* REG_ESIZE */
1130 "\0"
1131 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
1132 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
1133 };
1134
1135 static const size_t re_error_msgid_idx[] =
1136 {
1137 REG_NOERROR_IDX,
1138 REG_NOMATCH_IDX,
1139 REG_BADPAT_IDX,
1140 REG_ECOLLATE_IDX,
1141 REG_ECTYPE_IDX,
1142 REG_EESCAPE_IDX,
1143 REG_ESUBREG_IDX,
1144 REG_EBRACK_IDX,
1145 REG_EPAREN_IDX,
1146 REG_EBRACE_IDX,
1147 REG_BADBR_IDX,
1148 REG_ERANGE_IDX,
1149 REG_ESPACE_IDX,
1150 REG_BADRPT_IDX,
1151 REG_EEND_IDX,
1152 REG_ESIZE_IDX,
1153 REG_ERPAREN_IDX
1154 };
1155 \f
1156 /* Avoiding alloca during matching, to placate r_alloc. */
1157
1158 /* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1159 searching and matching functions should not call alloca. On some
1160 systems, alloca is implemented in terms of malloc, and if we're
1161 using the relocating allocator routines, then malloc could cause a
1162 relocation, which might (if the strings being searched are in the
1163 ralloc heap) shift the data out from underneath the regexp
1164 routines.
1165
1166 Here's another reason to avoid allocation: Emacs
1167 processes input from X in a signal handler; processing X input may
1168 call malloc; if input arrives while a matching routine is calling
1169 malloc, then we're scrod. But Emacs can't just block input while
1170 calling matching routines; then we don't notice interrupts when
1171 they come in. So, Emacs blocks input around all regexp calls
1172 except the matching calls, which it leaves unprotected, in the
1173 faith that they will not malloc. */
1174
1175 /* Normally, this is fine. */
1176 #define MATCH_MAY_ALLOCATE
1177
1178 /* When using GNU C, we are not REALLY using the C alloca, no matter
1179 what config.h may say. So don't take precautions for it. */
1180 #ifdef __GNUC__
1181 # undef C_ALLOCA
1182 #endif
1183
1184 /* The match routines may not allocate if (1) they would do it with malloc
1185 and (2) it's not safe for them to use malloc.
1186 Note that if REL_ALLOC is defined, matching would not use malloc for the
1187 failure stack, but we would still use it for the register vectors;
1188 so REL_ALLOC should not affect this. */
1189 #if (defined C_ALLOCA || defined REGEX_MALLOC) && defined emacs
1190 # undef MATCH_MAY_ALLOCATE
1191 #endif
1192
1193 \f
1194 /* Failure stack declarations and macros; both re_compile_fastmap and
1195 re_match_2 use a failure stack. These have to be macros because of
1196 REGEX_ALLOCATE_STACK. */
1197
1198
1199 /* Number of failure points for which to initially allocate space
1200 when matching. If this number is exceeded, we allocate more
1201 space, so it is not a hard limit. */
1202 #ifndef INIT_FAILURE_ALLOC
1203 # define INIT_FAILURE_ALLOC 5
1204 #endif
1205
1206 /* Roughly the maximum number of failure points on the stack. Would be
1207 exactly that if always used MAX_FAILURE_ITEMS items each time we failed.
1208 This is a variable only so users of regex can assign to it; we never
1209 change it ourselves. */
1210
1211 #ifdef INT_IS_16BIT
1212
1213 # if defined MATCH_MAY_ALLOCATE
1214 /* 4400 was enough to cause a crash on Alpha OSF/1,
1215 whose default stack limit is 2mb. */
1216 long int re_max_failures = 4000;
1217 # else
1218 long int re_max_failures = 2000;
1219 # endif
1220
1221 union fail_stack_elt
1222 {
1223 unsigned char *pointer;
1224 long int integer;
1225 };
1226
1227 typedef union fail_stack_elt fail_stack_elt_t;
1228
1229 typedef struct
1230 {
1231 fail_stack_elt_t *stack;
1232 unsigned long int size;
1233 unsigned long int avail; /* Offset of next open position. */
1234 } fail_stack_type;
1235
1236 #else /* not INT_IS_16BIT */
1237
1238 # if defined MATCH_MAY_ALLOCATE
1239 /* 4400 was enough to cause a crash on Alpha OSF/1,
1240 whose default stack limit is 2mb. */
1241 int re_max_failures = 4000;
1242 # else
1243 int re_max_failures = 2000;
1244 # endif
1245
1246 union fail_stack_elt
1247 {
1248 unsigned char *pointer;
1249 int integer;
1250 };
1251
1252 typedef union fail_stack_elt fail_stack_elt_t;
1253
1254 typedef struct
1255 {
1256 fail_stack_elt_t *stack;
1257 unsigned size;
1258 unsigned avail; /* Offset of next open position. */
1259 } fail_stack_type;
1260
1261 #endif /* INT_IS_16BIT */
1262
1263 #define FAIL_STACK_EMPTY() (fail_stack.avail == 0)
1264 #define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0)
1265 #define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
1266
1267
1268 /* Define macros to initialize and free the failure stack.
1269 Do `return -2' if the alloc fails. */
1270
1271 #ifdef MATCH_MAY_ALLOCATE
1272 # define INIT_FAIL_STACK() \
1273 do { \
1274 fail_stack.stack = (fail_stack_elt_t *) \
1275 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * sizeof (fail_stack_elt_t)); \
1276 \
1277 if (fail_stack.stack == NULL) \
1278 return -2; \
1279 \
1280 fail_stack.size = INIT_FAILURE_ALLOC; \
1281 fail_stack.avail = 0; \
1282 } while (0)
1283
1284 # define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack)
1285 #else
1286 # define INIT_FAIL_STACK() \
1287 do { \
1288 fail_stack.avail = 0; \
1289 } while (0)
1290
1291 # define RESET_FAIL_STACK()
1292 #endif
1293
1294
1295 /* Double the size of FAIL_STACK, up to approximately `re_max_failures' items.
1296
1297 Return 1 if succeeds, and 0 if either ran out of memory
1298 allocating space for it or it was already too large.
1299
1300 REGEX_REALLOCATE_STACK requires `destination' be declared. */
1301
1302 #define DOUBLE_FAIL_STACK(fail_stack) \
1303 ((fail_stack).size > (unsigned) (re_max_failures * MAX_FAILURE_ITEMS) \
1304 ? 0 \
1305 : ((fail_stack).stack = (fail_stack_elt_t *) \
1306 REGEX_REALLOCATE_STACK ((fail_stack).stack, \
1307 (fail_stack).size * sizeof (fail_stack_elt_t), \
1308 ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \
1309 \
1310 (fail_stack).stack == NULL \
1311 ? 0 \
1312 : ((fail_stack).size <<= 1, \
1313 1)))
1314
1315
1316 /* Push pointer POINTER on FAIL_STACK.
1317 Return 1 if was able to do so and 0 if ran out of memory allocating
1318 space to do so. */
1319 #define PUSH_PATTERN_OP(POINTER, FAIL_STACK) \
1320 ((FAIL_STACK_FULL () \
1321 && !DOUBLE_FAIL_STACK (FAIL_STACK)) \
1322 ? 0 \
1323 : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER, \
1324 1))
1325
1326 /* Push a pointer value onto the failure stack.
1327 Assumes the variable `fail_stack'. Probably should only
1328 be called from within `PUSH_FAILURE_POINT'. */
1329 #define PUSH_FAILURE_POINTER(item) \
1330 fail_stack.stack[fail_stack.avail++].pointer = (unsigned char *) (item)
1331
1332 /* This pushes an integer-valued item onto the failure stack.
1333 Assumes the variable `fail_stack'. Probably should only
1334 be called from within `PUSH_FAILURE_POINT'. */
1335 #define PUSH_FAILURE_INT(item) \
1336 fail_stack.stack[fail_stack.avail++].integer = (item)
1337
1338 /* Push a fail_stack_elt_t value onto the failure stack.
1339 Assumes the variable `fail_stack'. Probably should only
1340 be called from within `PUSH_FAILURE_POINT'. */
1341 #define PUSH_FAILURE_ELT(item) \
1342 fail_stack.stack[fail_stack.avail++] = (item)
1343
1344 /* These three POP... operations complement the three PUSH... operations.
1345 All assume that `fail_stack' is nonempty. */
1346 #define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
1347 #define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
1348 #define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]
1349
1350 /* Used to omit pushing failure point id's when we're not debugging. */
1351 #ifdef DEBUG
1352 # define DEBUG_PUSH PUSH_FAILURE_INT
1353 # define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_INT ()
1354 #else
1355 # define DEBUG_PUSH(item)
1356 # define DEBUG_POP(item_addr)
1357 #endif
1358
1359
1360 /* Push the information about the state we will need
1361 if we ever fail back to it.
1362
1363 Requires variables fail_stack, regstart, regend, reg_info, and
1364 num_regs_pushed be declared. DOUBLE_FAIL_STACK requires `destination'
1365 be declared.
1366
1367 Does `return FAILURE_CODE' if runs out of memory. */
1368
1369 #define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \
1370 do { \
1371 char *destination; \
1372 /* Must be int, so when we don't save any registers, the arithmetic \
1373 of 0 + -1 isn't done as unsigned. */ \
1374 /* Can't be int, since there is not a shred of a guarantee that int \
1375 is wide enough to hold a value of something to which pointer can \
1376 be assigned */ \
1377 active_reg_t this_reg; \
1378 \
1379 DEBUG_STATEMENT (failure_id++); \
1380 DEBUG_STATEMENT (nfailure_points_pushed++); \
1381 DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \
1382 DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail);\
1383 DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\
1384 \
1385 DEBUG_PRINT2 (" slots needed: %ld\n", NUM_FAILURE_ITEMS); \
1386 DEBUG_PRINT2 (" available: %d\n", REMAINING_AVAIL_SLOTS); \
1387 \
1388 /* Ensure we have enough space allocated for what we will push. */ \
1389 while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \
1390 { \
1391 if (!DOUBLE_FAIL_STACK (fail_stack)) \
1392 return failure_code; \
1393 \
1394 DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", \
1395 (fail_stack).size); \
1396 DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\
1397 } \
1398 \
1399 /* Push the info, starting with the registers. */ \
1400 DEBUG_PRINT1 ("\n"); \
1401 \
1402 if (1) \
1403 for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \
1404 this_reg++) \
1405 { \
1406 DEBUG_PRINT2 (" Pushing reg: %lu\n", this_reg); \
1407 DEBUG_STATEMENT (num_regs_pushed++); \
1408 \
1409 DEBUG_PRINT2 (" start: %p\n", regstart[this_reg]); \
1410 PUSH_FAILURE_POINTER (regstart[this_reg]); \
1411 \
1412 DEBUG_PRINT2 (" end: %p\n", regend[this_reg]); \
1413 PUSH_FAILURE_POINTER (regend[this_reg]); \
1414 \
1415 DEBUG_PRINT2 (" info: %p\n ", \
1416 reg_info[this_reg].word.pointer); \
1417 DEBUG_PRINT2 (" match_null=%d", \
1418 REG_MATCH_NULL_STRING_P (reg_info[this_reg])); \
1419 DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg])); \
1420 DEBUG_PRINT2 (" matched_something=%d", \
1421 MATCHED_SOMETHING (reg_info[this_reg])); \
1422 DEBUG_PRINT2 (" ever_matched=%d", \
1423 EVER_MATCHED_SOMETHING (reg_info[this_reg])); \
1424 DEBUG_PRINT1 ("\n"); \
1425 PUSH_FAILURE_ELT (reg_info[this_reg].word); \
1426 } \
1427 \
1428 DEBUG_PRINT2 (" Pushing low active reg: %ld\n", lowest_active_reg);\
1429 PUSH_FAILURE_INT (lowest_active_reg); \
1430 \
1431 DEBUG_PRINT2 (" Pushing high active reg: %ld\n", highest_active_reg);\
1432 PUSH_FAILURE_INT (highest_active_reg); \
1433 \
1434 DEBUG_PRINT2 (" Pushing pattern %p:\n", pattern_place); \
1435 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend); \
1436 PUSH_FAILURE_POINTER (pattern_place); \
1437 \
1438 DEBUG_PRINT2 (" Pushing string %p: `", string_place); \
1439 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, \
1440 size2); \
1441 DEBUG_PRINT1 ("'\n"); \
1442 PUSH_FAILURE_POINTER (string_place); \
1443 \
1444 DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id); \
1445 DEBUG_PUSH (failure_id); \
1446 } while (0)
1447
1448 /* This is the number of items that are pushed and popped on the stack
1449 for each register. */
1450 #define NUM_REG_ITEMS 3
1451
1452 /* Individual items aside from the registers. */
1453 #ifdef DEBUG
1454 # define NUM_NONREG_ITEMS 5 /* Includes failure point id. */
1455 #else
1456 # define NUM_NONREG_ITEMS 4
1457 #endif
1458
1459 /* We push at most this many items on the stack. */
1460 /* We used to use (num_regs - 1), which is the number of registers
1461 this regexp will save; but that was changed to 5
1462 to avoid stack overflow for a regexp with lots of parens. */
1463 #define MAX_FAILURE_ITEMS (5 * NUM_REG_ITEMS + NUM_NONREG_ITEMS)
1464
1465 /* We actually push this many items. */
1466 #define NUM_FAILURE_ITEMS \
1467 (((0 \
1468 ? 0 : highest_active_reg - lowest_active_reg + 1) \
1469 * NUM_REG_ITEMS) \
1470 + NUM_NONREG_ITEMS)
1471
1472 /* How many items can still be added to the stack without overflowing it. */
1473 #define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
1474
1475
1476 /* Pops what PUSH_FAIL_STACK pushes.
1477
1478 We restore into the parameters, all of which should be lvalues:
1479 STR -- the saved data position.
1480 PAT -- the saved pattern position.
1481 LOW_REG, HIGH_REG -- the highest and lowest active registers.
1482 REGSTART, REGEND -- arrays of string positions.
1483 REG_INFO -- array of information about each subexpression.
1484
1485 Also assumes the variables `fail_stack' and (if debugging), `bufp',
1486 `pend', `string1', `size1', `string2', and `size2'. */
1487
1488 #define POP_FAILURE_POINT(str, pat, low_reg, high_reg, regstart, regend, reg_info)\
1489 { \
1490 DEBUG_STATEMENT (unsigned failure_id;) \
1491 active_reg_t this_reg; \
1492 const unsigned char *string_temp; \
1493 \
1494 assert (!FAIL_STACK_EMPTY ()); \
1495 \
1496 /* Remove failure points and point to how many regs pushed. */ \
1497 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
1498 DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \
1499 DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \
1500 \
1501 assert (fail_stack.avail >= NUM_NONREG_ITEMS); \
1502 \
1503 DEBUG_POP (&failure_id); \
1504 DEBUG_PRINT2 (" Popping failure id: %u\n", failure_id); \
1505 \
1506 /* If the saved string location is NULL, it came from an \
1507 on_failure_keep_string_jump opcode, and we want to throw away the \
1508 saved NULL, thus retaining our current position in the string. */ \
1509 string_temp = POP_FAILURE_POINTER (); \
1510 if (string_temp != NULL) \
1511 str = (const char *) string_temp; \
1512 \
1513 DEBUG_PRINT2 (" Popping string %p: `", str); \
1514 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
1515 DEBUG_PRINT1 ("'\n"); \
1516 \
1517 pat = (unsigned char *) POP_FAILURE_POINTER (); \
1518 DEBUG_PRINT2 (" Popping pattern %p:\n", pat); \
1519 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
1520 \
1521 /* Restore register info. */ \
1522 high_reg = (active_reg_t) POP_FAILURE_INT (); \
1523 DEBUG_PRINT2 (" Popping high active reg: %ld\n", high_reg); \
1524 \
1525 low_reg = (active_reg_t) POP_FAILURE_INT (); \
1526 DEBUG_PRINT2 (" Popping low active reg: %ld\n", low_reg); \
1527 \
1528 if (1) \
1529 for (this_reg = high_reg; this_reg >= low_reg; this_reg--) \
1530 { \
1531 DEBUG_PRINT2 (" Popping reg: %ld\n", this_reg); \
1532 \
1533 reg_info[this_reg].word = POP_FAILURE_ELT (); \
1534 DEBUG_PRINT2 (" info: %p\n", \
1535 reg_info[this_reg].word.pointer); \
1536 \
1537 regend[this_reg] = (const char *) POP_FAILURE_POINTER (); \
1538 DEBUG_PRINT2 (" end: %p\n", regend[this_reg]); \
1539 \
1540 regstart[this_reg] = (const char *) POP_FAILURE_POINTER (); \
1541 DEBUG_PRINT2 (" start: %p\n", regstart[this_reg]); \
1542 } \
1543 else \
1544 { \
1545 for (this_reg = highest_active_reg; this_reg > high_reg; this_reg--) \
1546 { \
1547 reg_info[this_reg].word.integer = 0; \
1548 regend[this_reg] = 0; \
1549 regstart[this_reg] = 0; \
1550 } \
1551 highest_active_reg = high_reg; \
1552 } \
1553 \
1554 set_regs_matched_done = 0; \
1555 DEBUG_STATEMENT (nfailure_points_popped++); \
1556 } /* POP_FAILURE_POINT */
1557
1558
1559 \f
1560 /* Structure for per-register (a.k.a. per-group) information.
1561 Other register information, such as the
1562 starting and ending positions (which are addresses), and the list of
1563 inner groups (which is a bits list) are maintained in separate
1564 variables.
1565
1566 We are making a (strictly speaking) nonportable assumption here: that
1567 the compiler will pack our bit fields into something that fits into
1568 the type of `word', i.e., is something that fits into one item on the
1569 failure stack. */
1570
1571
1572 /* Declarations and macros for re_match_2. */
1573
1574 typedef union
1575 {
1576 fail_stack_elt_t word;
1577 struct
1578 {
1579 /* This field is one if this group can match the empty string,
1580 zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */
1581 #define MATCH_NULL_UNSET_VALUE 3
1582 unsigned match_null_string_p : 2;
1583 unsigned is_active : 1;
1584 unsigned matched_something : 1;
1585 unsigned ever_matched_something : 1;
1586 } bits;
1587 } register_info_type;
1588
1589 #define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p)
1590 #define IS_ACTIVE(R) ((R).bits.is_active)
1591 #define MATCHED_SOMETHING(R) ((R).bits.matched_something)
1592 #define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something)
1593
1594
1595 /* Call this when have matched a real character; it sets `matched' flags
1596 for the subexpressions which we are currently inside. Also records
1597 that those subexprs have matched. */
1598 #define SET_REGS_MATCHED() \
1599 do \
1600 { \
1601 if (!set_regs_matched_done) \
1602 { \
1603 active_reg_t r; \
1604 set_regs_matched_done = 1; \
1605 for (r = lowest_active_reg; r <= highest_active_reg; r++) \
1606 { \
1607 MATCHED_SOMETHING (reg_info[r]) \
1608 = EVER_MATCHED_SOMETHING (reg_info[r]) \
1609 = 1; \
1610 } \
1611 } \
1612 } \
1613 while (0)
1614
1615 /* Registers are set to a sentinel when they haven't yet matched. */
1616 static char reg_unset_dummy;
1617 #define REG_UNSET_VALUE (&reg_unset_dummy)
1618 #define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
1619 \f
1620 /* Subroutine declarations and macros for regex_compile. */
1621
1622 static reg_errcode_t regex_compile _RE_ARGS ((const char *pattern, size_t size,
1623 reg_syntax_t syntax,
1624 struct re_pattern_buffer *bufp));
1625 static void store_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc, int arg));
1626 static void store_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1627 int arg1, int arg2));
1628 static void insert_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1629 int arg, unsigned char *end));
1630 static void insert_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1631 int arg1, int arg2, unsigned char *end));
1632 static boolean at_begline_loc_p _RE_ARGS ((const char *pattern, const char *p,
1633 reg_syntax_t syntax));
1634 static boolean at_endline_loc_p _RE_ARGS ((const char *p, const char *pend,
1635 reg_syntax_t syntax));
1636 static reg_errcode_t compile_range _RE_ARGS ((unsigned int range_start,
1637 const char **p_ptr,
1638 const char *pend,
1639 char *translate,
1640 reg_syntax_t syntax,
1641 unsigned char *b));
1642
1643 /* Fetch the next character in the uncompiled pattern---translating it
1644 if necessary. Also cast from a signed character in the constant
1645 string passed to us by the user to an unsigned char that we can use
1646 as an array index (in, e.g., `translate'). */
1647 #ifndef PATFETCH
1648 # define PATFETCH(c) \
1649 do {if (p == pend) return REG_EEND; \
1650 c = (unsigned char) *p++; \
1651 if (translate) c = (unsigned char) translate[c]; \
1652 } while (0)
1653 #endif
1654
1655 /* Fetch the next character in the uncompiled pattern, with no
1656 translation. */
1657 #define PATFETCH_RAW(c) \
1658 do {if (p == pend) return REG_EEND; \
1659 c = (unsigned char) *p++; \
1660 } while (0)
1661
1662 /* Go backwards one character in the pattern. */
1663 #define PATUNFETCH p--
1664
1665
1666 /* If `translate' is non-null, return translate[D], else just D. We
1667 cast the subscript to translate because some data is declared as
1668 `char *', to avoid warnings when a string constant is passed. But
1669 when we use a character as a subscript we must make it unsigned. */
1670 #ifndef TRANSLATE
1671 # define TRANSLATE(d) \
1672 (translate ? (char) translate[(unsigned char) (d)] : (d))
1673 #endif
1674
1675
1676 /* Macros for outputting the compiled pattern into `buffer'. */
1677
1678 /* If the buffer isn't allocated when it comes in, use this. */
1679 #define INIT_BUF_SIZE 32
1680
1681 /* Make sure we have at least N more bytes of space in buffer. */
1682 #define GET_BUFFER_SPACE(n) \
1683 while ((unsigned long) (b - bufp->buffer + (n)) > bufp->allocated) \
1684 EXTEND_BUFFER ()
1685
1686 /* Make sure we have one more byte of buffer space and then add C to it. */
1687 #define BUF_PUSH(c) \
1688 do { \
1689 GET_BUFFER_SPACE (1); \
1690 *b++ = (unsigned char) (c); \
1691 } while (0)
1692
1693
1694 /* Ensure we have two more bytes of buffer space and then append C1 and C2. */
1695 #define BUF_PUSH_2(c1, c2) \
1696 do { \
1697 GET_BUFFER_SPACE (2); \
1698 *b++ = (unsigned char) (c1); \
1699 *b++ = (unsigned char) (c2); \
1700 } while (0)
1701
1702
1703 /* As with BUF_PUSH_2, except for three bytes. */
1704 #define BUF_PUSH_3(c1, c2, c3) \
1705 do { \
1706 GET_BUFFER_SPACE (3); \
1707 *b++ = (unsigned char) (c1); \
1708 *b++ = (unsigned char) (c2); \
1709 *b++ = (unsigned char) (c3); \
1710 } while (0)
1711
1712
1713 /* Store a jump with opcode OP at LOC to location TO. We store a
1714 relative address offset by the three bytes the jump itself occupies. */
1715 #define STORE_JUMP(op, loc, to) \
1716 store_op1 (op, loc, (int) ((to) - (loc) - 3))
1717
1718 /* Likewise, for a two-argument jump. */
1719 #define STORE_JUMP2(op, loc, to, arg) \
1720 store_op2 (op, loc, (int) ((to) - (loc) - 3), arg)
1721
1722 /* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */
1723 #define INSERT_JUMP(op, loc, to) \
1724 insert_op1 (op, loc, (int) ((to) - (loc) - 3), b)
1725
1726 /* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */
1727 #define INSERT_JUMP2(op, loc, to, arg) \
1728 insert_op2 (op, loc, (int) ((to) - (loc) - 3), arg, b)
1729
1730
1731 /* This is not an arbitrary limit: the arguments which represent offsets
1732 into the pattern are two bytes long. So if 2^16 bytes turns out to
1733 be too small, many things would have to change. */
1734 /* Any other compiler which, like MSC, has allocation limit below 2^16
1735 bytes will have to use approach similar to what was done below for
1736 MSC and drop MAX_BUF_SIZE a bit. Otherwise you may end up
1737 reallocating to 0 bytes. Such thing is not going to work too well.
1738 You have been warned!! */
1739 #if defined _MSC_VER && !defined WIN32
1740 /* Microsoft C 16-bit versions limit malloc to approx 65512 bytes.
1741 The REALLOC define eliminates a flurry of conversion warnings,
1742 but is not required. */
1743 # define MAX_BUF_SIZE 65500L
1744 # define REALLOC(p,s) realloc ((p), (size_t) (s))
1745 #else
1746 # define MAX_BUF_SIZE (1L << 16)
1747 # define REALLOC(p,s) realloc ((p), (s))
1748 #endif
1749
1750 /* Extend the buffer by twice its current size via realloc and
1751 reset the pointers that pointed into the old block to point to the
1752 correct places in the new one. If extending the buffer results in it
1753 being larger than MAX_BUF_SIZE, then flag memory exhausted. */
1754 #if __BOUNDED_POINTERS__
1755 # define SET_HIGH_BOUND(P) (__ptrhigh (P) = __ptrlow (P) + bufp->allocated)
1756 # define MOVE_BUFFER_POINTER(P) \
1757 (__ptrlow (P) += incr, SET_HIGH_BOUND (P), __ptrvalue (P) += incr)
1758 # define ELSE_EXTEND_BUFFER_HIGH_BOUND \
1759 else \
1760 { \
1761 SET_HIGH_BOUND (b); \
1762 SET_HIGH_BOUND (begalt); \
1763 if (fixup_alt_jump) \
1764 SET_HIGH_BOUND (fixup_alt_jump); \
1765 if (laststart) \
1766 SET_HIGH_BOUND (laststart); \
1767 if (pending_exact) \
1768 SET_HIGH_BOUND (pending_exact); \
1769 }
1770 #else
1771 # define MOVE_BUFFER_POINTER(P) (P) += incr
1772 # define ELSE_EXTEND_BUFFER_HIGH_BOUND
1773 #endif
1774 #define EXTEND_BUFFER() \
1775 do { \
1776 unsigned char *old_buffer = bufp->buffer; \
1777 if (bufp->allocated == MAX_BUF_SIZE) \
1778 return REG_ESIZE; \
1779 bufp->allocated <<= 1; \
1780 if (bufp->allocated > MAX_BUF_SIZE) \
1781 bufp->allocated = MAX_BUF_SIZE; \
1782 bufp->buffer = (unsigned char *) REALLOC (bufp->buffer, bufp->allocated);\
1783 if (bufp->buffer == NULL) \
1784 return REG_ESPACE; \
1785 /* If the buffer moved, move all the pointers into it. */ \
1786 if (old_buffer != bufp->buffer) \
1787 { \
1788 int incr = bufp->buffer - old_buffer; \
1789 MOVE_BUFFER_POINTER (b); \
1790 MOVE_BUFFER_POINTER (begalt); \
1791 if (fixup_alt_jump) \
1792 MOVE_BUFFER_POINTER (fixup_alt_jump); \
1793 if (laststart) \
1794 MOVE_BUFFER_POINTER (laststart); \
1795 if (pending_exact) \
1796 MOVE_BUFFER_POINTER (pending_exact); \
1797 } \
1798 ELSE_EXTEND_BUFFER_HIGH_BOUND \
1799 } while (0)
1800
1801
1802 /* Since we have one byte reserved for the register number argument to
1803 {start,stop}_memory, the maximum number of groups we can report
1804 things about is what fits in that byte. */
1805 #define MAX_REGNUM 255
1806
1807 /* But patterns can have more than `MAX_REGNUM' registers. We just
1808 ignore the excess. */
1809 typedef unsigned regnum_t;
1810
1811
1812 /* Macros for the compile stack. */
1813
1814 /* Since offsets can go either forwards or backwards, this type needs to
1815 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
1816 /* int may be not enough when sizeof(int) == 2. */
1817 typedef long pattern_offset_t;
1818
1819 typedef struct
1820 {
1821 pattern_offset_t begalt_offset;
1822 pattern_offset_t fixup_alt_jump;
1823 pattern_offset_t inner_group_offset;
1824 pattern_offset_t laststart_offset;
1825 regnum_t regnum;
1826 } compile_stack_elt_t;
1827
1828
1829 typedef struct
1830 {
1831 compile_stack_elt_t *stack;
1832 unsigned size;
1833 unsigned avail; /* Offset of next open position. */
1834 } compile_stack_type;
1835
1836
1837 #define INIT_COMPILE_STACK_SIZE 32
1838
1839 #define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1840 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1841
1842 /* The next available element. */
1843 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1844
1845
1846 /* Set the bit for character C in a list. */
1847 #define SET_LIST_BIT(c) \
1848 (b[((unsigned char) (c)) / BYTEWIDTH] \
1849 |= 1 << (((unsigned char) c) % BYTEWIDTH))
1850
1851
1852 /* Get the next unsigned number in the uncompiled pattern. */
1853 #define GET_UNSIGNED_NUMBER(num) \
1854 { if (p != pend) \
1855 { \
1856 PATFETCH (c); \
1857 while ('0' <= c && c <= '9') \
1858 { \
1859 if (num < 0) \
1860 num = 0; \
1861 num = num * 10 + c - '0'; \
1862 if (p == pend) \
1863 break; \
1864 PATFETCH (c); \
1865 } \
1866 } \
1867 }
1868
1869 #if defined _LIBC || WIDE_CHAR_SUPPORT
1870 /* The GNU C library provides support for user-defined character classes
1871 and the functions from ISO C amendement 1. */
1872 # ifdef CHARCLASS_NAME_MAX
1873 # define CHAR_CLASS_MAX_LENGTH CHARCLASS_NAME_MAX
1874 # else
1875 /* This shouldn't happen but some implementation might still have this
1876 problem. Use a reasonable default value. */
1877 # define CHAR_CLASS_MAX_LENGTH 256
1878 # endif
1879
1880 # ifdef _LIBC
1881 # define IS_CHAR_CLASS(string) __wctype (string)
1882 # else
1883 # define IS_CHAR_CLASS(string) wctype (string)
1884 # endif
1885 #else
1886 # define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */
1887
1888 # define IS_CHAR_CLASS(string) \
1889 (STREQ (string, "alpha") || STREQ (string, "upper") \
1890 || STREQ (string, "lower") || STREQ (string, "digit") \
1891 || STREQ (string, "alnum") || STREQ (string, "xdigit") \
1892 || STREQ (string, "space") || STREQ (string, "print") \
1893 || STREQ (string, "punct") || STREQ (string, "graph") \
1894 || STREQ (string, "cntrl") || STREQ (string, "blank"))
1895 #endif
1896 \f
1897 #ifndef MATCH_MAY_ALLOCATE
1898
1899 /* If we cannot allocate large objects within re_match_2_internal,
1900 we make the fail stack and register vectors global.
1901 The fail stack, we grow to the maximum size when a regexp
1902 is compiled.
1903 The register vectors, we adjust in size each time we
1904 compile a regexp, according to the number of registers it needs. */
1905
1906 static fail_stack_type fail_stack;
1907
1908 /* Size with which the following vectors are currently allocated.
1909 That is so we can make them bigger as needed,
1910 but never make them smaller. */
1911 static int regs_allocated_size;
1912
1913 static const char ** regstart, ** regend;
1914 static const char ** old_regstart, ** old_regend;
1915 static const char **best_regstart, **best_regend;
1916 static register_info_type *reg_info;
1917 static const char **reg_dummy;
1918 static register_info_type *reg_info_dummy;
1919
1920 /* Make the register vectors big enough for NUM_REGS registers,
1921 but don't make them smaller. */
1922
1923 static
1924 regex_grow_registers (num_regs)
1925 int num_regs;
1926 {
1927 if (num_regs > regs_allocated_size)
1928 {
1929 RETALLOC_IF (regstart, num_regs, const char *);
1930 RETALLOC_IF (regend, num_regs, const char *);
1931 RETALLOC_IF (old_regstart, num_regs, const char *);
1932 RETALLOC_IF (old_regend, num_regs, const char *);
1933 RETALLOC_IF (best_regstart, num_regs, const char *);
1934 RETALLOC_IF (best_regend, num_regs, const char *);
1935 RETALLOC_IF (reg_info, num_regs, register_info_type);
1936 RETALLOC_IF (reg_dummy, num_regs, const char *);
1937 RETALLOC_IF (reg_info_dummy, num_regs, register_info_type);
1938
1939 regs_allocated_size = num_regs;
1940 }
1941 }
1942
1943 #endif /* not MATCH_MAY_ALLOCATE */
1944 \f
1945 static boolean group_in_compile_stack _RE_ARGS ((compile_stack_type
1946 compile_stack,
1947 regnum_t regnum));
1948
1949 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
1950 Returns one of error codes defined in `regex.h', or zero for success.
1951
1952 Assumes the `allocated' (and perhaps `buffer') and `translate'
1953 fields are set in BUFP on entry.
1954
1955 If it succeeds, results are put in BUFP (if it returns an error, the
1956 contents of BUFP are undefined):
1957 `buffer' is the compiled pattern;
1958 `syntax' is set to SYNTAX;
1959 `used' is set to the length of the compiled pattern;
1960 `fastmap_accurate' is zero;
1961 `re_nsub' is the number of subexpressions in PATTERN;
1962 `not_bol' and `not_eol' are zero;
1963
1964 The `fastmap' and `newline_anchor' fields are neither
1965 examined nor set. */
1966
1967 /* Return, freeing storage we allocated. */
1968 #define FREE_STACK_RETURN(value) \
1969 return (free (compile_stack.stack), value)
1970
1971 static reg_errcode_t
1972 regex_compile (pattern, size, syntax, bufp)
1973 const char *pattern;
1974 size_t size;
1975 reg_syntax_t syntax;
1976 struct re_pattern_buffer *bufp;
1977 {
1978 /* We fetch characters from PATTERN here. Even though PATTERN is
1979 `char *' (i.e., signed), we declare these variables as unsigned, so
1980 they can be reliably used as array indices. */
1981 register unsigned char c, c1;
1982
1983 /* A random temporary spot in PATTERN. */
1984 const char *p1;
1985
1986 /* Points to the end of the buffer, where we should append. */
1987 register unsigned char *b;
1988
1989 /* Keeps track of unclosed groups. */
1990 compile_stack_type compile_stack;
1991
1992 /* Points to the current (ending) position in the pattern. */
1993 const char *p = pattern;
1994 const char *pend = pattern + size;
1995
1996 /* How to translate the characters in the pattern. */
1997 RE_TRANSLATE_TYPE translate = bufp->translate;
1998
1999 /* Address of the count-byte of the most recently inserted `exactn'
2000 command. This makes it possible to tell if a new exact-match
2001 character can be added to that command or if the character requires
2002 a new `exactn' command. */
2003 unsigned char *pending_exact = 0;
2004
2005 /* Address of start of the most recently finished expression.
2006 This tells, e.g., postfix * where to find the start of its
2007 operand. Reset at the beginning of groups and alternatives. */
2008 unsigned char *laststart = 0;
2009
2010 /* Address of beginning of regexp, or inside of last group. */
2011 unsigned char *begalt;
2012
2013 /* Place in the uncompiled pattern (i.e., the {) to
2014 which to go back if the interval is invalid. */
2015 const char *beg_interval;
2016
2017 /* Address of the place where a forward jump should go to the end of
2018 the containing expression. Each alternative of an `or' -- except the
2019 last -- ends with a forward jump of this sort. */
2020 unsigned char *fixup_alt_jump = 0;
2021
2022 /* Counts open-groups as they are encountered. Remembered for the
2023 matching close-group on the compile stack, so the same register
2024 number is put in the stop_memory as the start_memory. */
2025 regnum_t regnum = 0;
2026
2027 #ifdef DEBUG
2028 DEBUG_PRINT1 ("\nCompiling pattern: ");
2029 if (debug)
2030 {
2031 unsigned debug_count;
2032
2033 for (debug_count = 0; debug_count < size; debug_count++)
2034 putchar (pattern[debug_count]);
2035 putchar ('\n');
2036 }
2037 #endif /* DEBUG */
2038
2039 /* Initialize the compile stack. */
2040 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
2041 if (compile_stack.stack == NULL)
2042 return REG_ESPACE;
2043
2044 compile_stack.size = INIT_COMPILE_STACK_SIZE;
2045 compile_stack.avail = 0;
2046
2047 /* Initialize the pattern buffer. */
2048 bufp->syntax = syntax;
2049 bufp->fastmap_accurate = 0;
2050 bufp->not_bol = bufp->not_eol = 0;
2051
2052 /* Set `used' to zero, so that if we return an error, the pattern
2053 printer (for debugging) will think there's no pattern. We reset it
2054 at the end. */
2055 bufp->used = 0;
2056
2057 /* Always count groups, whether or not bufp->no_sub is set. */
2058 bufp->re_nsub = 0;
2059
2060 #if !defined emacs && !defined SYNTAX_TABLE
2061 /* Initialize the syntax table. */
2062 init_syntax_once ();
2063 #endif
2064
2065 if (bufp->allocated == 0)
2066 {
2067 if (bufp->buffer)
2068 { /* If zero allocated, but buffer is non-null, try to realloc
2069 enough space. This loses if buffer's address is bogus, but
2070 that is the user's responsibility. */
2071 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
2072 }
2073 else
2074 { /* Caller did not allocate a buffer. Do it for them. */
2075 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
2076 }
2077 if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE);
2078
2079 bufp->allocated = INIT_BUF_SIZE;
2080 }
2081
2082 begalt = b = bufp->buffer;
2083
2084 /* Loop through the uncompiled pattern until we're at the end. */
2085 while (p != pend)
2086 {
2087 PATFETCH (c);
2088
2089 switch (c)
2090 {
2091 case '^':
2092 {
2093 if ( /* If at start of pattern, it's an operator. */
2094 p == pattern + 1
2095 /* If context independent, it's an operator. */
2096 || syntax & RE_CONTEXT_INDEP_ANCHORS
2097 /* Otherwise, depends on what's come before. */
2098 || at_begline_loc_p (pattern, p, syntax))
2099 BUF_PUSH (begline);
2100 else
2101 goto normal_char;
2102 }
2103 break;
2104
2105
2106 case '$':
2107 {
2108 if ( /* If at end of pattern, it's an operator. */
2109 p == pend
2110 /* If context independent, it's an operator. */
2111 || syntax & RE_CONTEXT_INDEP_ANCHORS
2112 /* Otherwise, depends on what's next. */
2113 || at_endline_loc_p (p, pend, syntax))
2114 BUF_PUSH (endline);
2115 else
2116 goto normal_char;
2117 }
2118 break;
2119
2120
2121 case '+':
2122 case '?':
2123 if ((syntax & RE_BK_PLUS_QM)
2124 || (syntax & RE_LIMITED_OPS))
2125 goto normal_char;
2126 handle_plus:
2127 case '*':
2128 /* If there is no previous pattern... */
2129 if (!laststart)
2130 {
2131 if (syntax & RE_CONTEXT_INVALID_OPS)
2132 FREE_STACK_RETURN (REG_BADRPT);
2133 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
2134 goto normal_char;
2135 }
2136
2137 {
2138 /* Are we optimizing this jump? */
2139 boolean keep_string_p = false;
2140
2141 /* 1 means zero (many) matches is allowed. */
2142 char zero_times_ok = 0, many_times_ok = 0;
2143
2144 /* If there is a sequence of repetition chars, collapse it
2145 down to just one (the right one). We can't combine
2146 interval operators with these because of, e.g., `a{2}*',
2147 which should only match an even number of `a's. */
2148
2149 for (;;)
2150 {
2151 zero_times_ok |= c != '+';
2152 many_times_ok |= c != '?';
2153
2154 if (p == pend)
2155 break;
2156
2157 PATFETCH (c);
2158
2159 if (c == '*'
2160 || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?')))
2161 ;
2162
2163 else if (syntax & RE_BK_PLUS_QM && c == '\\')
2164 {
2165 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2166
2167 PATFETCH (c1);
2168 if (!(c1 == '+' || c1 == '?'))
2169 {
2170 PATUNFETCH;
2171 PATUNFETCH;
2172 break;
2173 }
2174
2175 c = c1;
2176 }
2177 else
2178 {
2179 PATUNFETCH;
2180 break;
2181 }
2182
2183 /* If we get here, we found another repeat character. */
2184 }
2185
2186 /* Star, etc. applied to an empty pattern is equivalent
2187 to an empty pattern. */
2188 if (!laststart)
2189 break;
2190
2191 /* Now we know whether or not zero matches is allowed
2192 and also whether or not two or more matches is allowed. */
2193 if (many_times_ok)
2194 { /* More than one repetition is allowed, so put in at the
2195 end a backward relative jump from `b' to before the next
2196 jump we're going to put in below (which jumps from
2197 laststart to after this jump).
2198
2199 But if we are at the `*' in the exact sequence `.*\n',
2200 insert an unconditional jump backwards to the .,
2201 instead of the beginning of the loop. This way we only
2202 push a failure point once, instead of every time
2203 through the loop. */
2204 assert (p - 1 > pattern);
2205
2206 /* Allocate the space for the jump. */
2207 GET_BUFFER_SPACE (3);
2208
2209 /* We know we are not at the first character of the pattern,
2210 because laststart was nonzero. And we've already
2211 incremented `p', by the way, to be the character after
2212 the `*'. Do we have to do something analogous here
2213 for null bytes, because of RE_DOT_NOT_NULL? */
2214 if (TRANSLATE (*(p - 2)) == TRANSLATE ('.')
2215 && zero_times_ok
2216 && p < pend && TRANSLATE (*p) == TRANSLATE ('\n')
2217 && !(syntax & RE_DOT_NEWLINE))
2218 { /* We have .*\n. */
2219 STORE_JUMP (jump, b, laststart);
2220 keep_string_p = true;
2221 }
2222 else
2223 /* Anything else. */
2224 STORE_JUMP (maybe_pop_jump, b, laststart - 3);
2225
2226 /* We've added more stuff to the buffer. */
2227 b += 3;
2228 }
2229
2230 /* On failure, jump from laststart to b + 3, which will be the
2231 end of the buffer after this jump is inserted. */
2232 GET_BUFFER_SPACE (3);
2233 INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump
2234 : on_failure_jump,
2235 laststart, b + 3);
2236 pending_exact = 0;
2237 b += 3;
2238
2239 if (!zero_times_ok)
2240 {
2241 /* At least one repetition is required, so insert a
2242 `dummy_failure_jump' before the initial
2243 `on_failure_jump' instruction of the loop. This
2244 effects a skip over that instruction the first time
2245 we hit that loop. */
2246 GET_BUFFER_SPACE (3);
2247 INSERT_JUMP (dummy_failure_jump, laststart, laststart + 6);
2248 b += 3;
2249 }
2250 }
2251 break;
2252
2253
2254 case '.':
2255 laststart = b;
2256 BUF_PUSH (anychar);
2257 break;
2258
2259
2260 case '[':
2261 {
2262 boolean had_char_class = false;
2263 unsigned int range_start = 0xffffffff;
2264
2265 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2266
2267 /* Ensure that we have enough space to push a charset: the
2268 opcode, the length count, and the bitset; 34 bytes in all. */
2269 GET_BUFFER_SPACE (34);
2270
2271 laststart = b;
2272
2273 /* We test `*p == '^' twice, instead of using an if
2274 statement, so we only need one BUF_PUSH. */
2275 BUF_PUSH (*p == '^' ? charset_not : charset);
2276 if (*p == '^')
2277 p++;
2278
2279 /* Remember the first position in the bracket expression. */
2280 p1 = p;
2281
2282 /* Push the number of bytes in the bitmap. */
2283 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
2284
2285 /* Clear the whole map. */
2286 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH);
2287
2288 /* charset_not matches newline according to a syntax bit. */
2289 if ((re_opcode_t) b[-2] == charset_not
2290 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2291 SET_LIST_BIT ('\n');
2292
2293 /* Read in characters and ranges, setting map bits. */
2294 for (;;)
2295 {
2296 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2297
2298 PATFETCH (c);
2299
2300 /* \ might escape characters inside [...] and [^...]. */
2301 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2302 {
2303 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2304
2305 PATFETCH (c1);
2306 SET_LIST_BIT (c1);
2307 range_start = c1;
2308 continue;
2309 }
2310
2311 /* Could be the end of the bracket expression. If it's
2312 not (i.e., when the bracket expression is `[]' so
2313 far), the ']' character bit gets set way below. */
2314 if (c == ']' && p != p1 + 1)
2315 break;
2316
2317 /* Look ahead to see if it's a range when the last thing
2318 was a character class. */
2319 if (had_char_class && c == '-' && *p != ']')
2320 FREE_STACK_RETURN (REG_ERANGE);
2321
2322 /* Look ahead to see if it's a range when the last thing
2323 was a character: if this is a hyphen not at the
2324 beginning or the end of a list, then it's the range
2325 operator. */
2326 if (c == '-'
2327 && !(p - 2 >= pattern && p[-2] == '[')
2328 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
2329 && *p != ']')
2330 {
2331 reg_errcode_t ret
2332 = compile_range (range_start, &p, pend, translate,
2333 syntax, b);
2334 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
2335 range_start = 0xffffffff;
2336 }
2337
2338 else if (p[0] == '-' && p[1] != ']')
2339 { /* This handles ranges made up of characters only. */
2340 reg_errcode_t ret;
2341
2342 /* Move past the `-'. */
2343 PATFETCH (c1);
2344
2345 ret = compile_range (c, &p, pend, translate, syntax, b);
2346 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
2347 range_start = 0xffffffff;
2348 }
2349
2350 /* See if we're at the beginning of a possible character
2351 class. */
2352
2353 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
2354 { /* Leave room for the null. */
2355 char str[CHAR_CLASS_MAX_LENGTH + 1];
2356
2357 PATFETCH (c);
2358 c1 = 0;
2359
2360 /* If pattern is `[[:'. */
2361 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2362
2363 for (;;)
2364 {
2365 PATFETCH (c);
2366 if ((c == ':' && *p == ']') || p == pend)
2367 break;
2368 if (c1 < CHAR_CLASS_MAX_LENGTH)
2369 str[c1++] = c;
2370 else
2371 /* This is in any case an invalid class name. */
2372 str[0] = '\0';
2373 }
2374 str[c1] = '\0';
2375
2376 /* If isn't a word bracketed by `[:' and `:]':
2377 undo the ending character, the letters, and leave
2378 the leading `:' and `[' (but set bits for them). */
2379 if (c == ':' && *p == ']')
2380 {
2381 #if defined _LIBC || WIDE_CHAR_SUPPORT
2382 boolean is_lower = STREQ (str, "lower");
2383 boolean is_upper = STREQ (str, "upper");
2384 wctype_t wt;
2385 int ch;
2386
2387 wt = IS_CHAR_CLASS (str);
2388 if (wt == 0)
2389 FREE_STACK_RETURN (REG_ECTYPE);
2390
2391 /* Throw away the ] at the end of the character
2392 class. */
2393 PATFETCH (c);
2394
2395 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2396
2397 for (ch = 0; ch < 1 << BYTEWIDTH; ++ch)
2398 {
2399 # ifdef _LIBC
2400 if (__iswctype (__btowc (ch), wt))
2401 SET_LIST_BIT (ch);
2402 # else
2403 if (iswctype (btowc (ch), wt))
2404 SET_LIST_BIT (ch);
2405 # endif
2406
2407 if (translate && (is_upper || is_lower)
2408 && (ISUPPER (ch) || ISLOWER (ch)))
2409 SET_LIST_BIT (ch);
2410 }
2411
2412 had_char_class = true;
2413 #else
2414 int ch;
2415 boolean is_alnum = STREQ (str, "alnum");
2416 boolean is_alpha = STREQ (str, "alpha");
2417 boolean is_blank = STREQ (str, "blank");
2418 boolean is_cntrl = STREQ (str, "cntrl");
2419 boolean is_digit = STREQ (str, "digit");
2420 boolean is_graph = STREQ (str, "graph");
2421 boolean is_lower = STREQ (str, "lower");
2422 boolean is_print = STREQ (str, "print");
2423 boolean is_punct = STREQ (str, "punct");
2424 boolean is_space = STREQ (str, "space");
2425 boolean is_upper = STREQ (str, "upper");
2426 boolean is_xdigit = STREQ (str, "xdigit");
2427
2428 if (!IS_CHAR_CLASS (str))
2429 FREE_STACK_RETURN (REG_ECTYPE);
2430
2431 /* Throw away the ] at the end of the character
2432 class. */
2433 PATFETCH (c);
2434
2435 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2436
2437 for (ch = 0; ch < 1 << BYTEWIDTH; ch++)
2438 {
2439 /* This was split into 3 if's to
2440 avoid an arbitrary limit in some compiler. */
2441 if ( (is_alnum && ISALNUM (ch))
2442 || (is_alpha && ISALPHA (ch))
2443 || (is_blank && ISBLANK (ch))
2444 || (is_cntrl && ISCNTRL (ch)))
2445 SET_LIST_BIT (ch);
2446 if ( (is_digit && ISDIGIT (ch))
2447 || (is_graph && ISGRAPH (ch))
2448 || (is_lower && ISLOWER (ch))
2449 || (is_print && ISPRINT (ch)))
2450 SET_LIST_BIT (ch);
2451 if ( (is_punct && ISPUNCT (ch))
2452 || (is_space && ISSPACE (ch))
2453 || (is_upper && ISUPPER (ch))
2454 || (is_xdigit && ISXDIGIT (ch)))
2455 SET_LIST_BIT (ch);
2456 if ( translate && (is_upper || is_lower)
2457 && (ISUPPER (ch) || ISLOWER (ch)))
2458 SET_LIST_BIT (ch);
2459 }
2460 had_char_class = true;
2461 #endif /* libc || wctype.h */
2462 }
2463 else
2464 {
2465 c1++;
2466 while (c1--)
2467 PATUNFETCH;
2468 SET_LIST_BIT ('[');
2469 SET_LIST_BIT (':');
2470 range_start = ':';
2471 had_char_class = false;
2472 }
2473 }
2474 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == '=')
2475 {
2476 unsigned char str[MB_LEN_MAX + 1];
2477 #ifdef _LIBC
2478 uint32_t nrules =
2479 _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2480 #endif
2481
2482 PATFETCH (c);
2483 c1 = 0;
2484
2485 /* If pattern is `[[='. */
2486 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2487
2488 for (;;)
2489 {
2490 PATFETCH (c);
2491 if ((c == '=' && *p == ']') || p == pend)
2492 break;
2493 if (c1 < MB_LEN_MAX)
2494 str[c1++] = c;
2495 else
2496 /* This is in any case an invalid class name. */
2497 str[0] = '\0';
2498 }
2499 str[c1] = '\0';
2500
2501 if (c == '=' && *p == ']' && str[0] != '\0')
2502 {
2503 /* If we have no collation data we use the default
2504 collation in which each character is in a class
2505 by itself. It also means that ASCII is the
2506 character set and therefore we cannot have character
2507 with more than one byte in the multibyte
2508 representation. */
2509 #ifdef _LIBC
2510 if (nrules == 0)
2511 #endif
2512 {
2513 if (c1 != 1)
2514 FREE_STACK_RETURN (REG_ECOLLATE);
2515
2516 /* Throw away the ] at the end of the equivalence
2517 class. */
2518 PATFETCH (c);
2519
2520 /* Set the bit for the character. */
2521 SET_LIST_BIT (str[0]);
2522 }
2523 #ifdef _LIBC
2524 else
2525 {
2526 /* Try to match the byte sequence in `str' against
2527 those known to the collate implementation.
2528 First find out whether the bytes in `str' are
2529 actually from exactly one character. */
2530 const int32_t *table;
2531 const unsigned char *weights;
2532 const unsigned char *extra;
2533 const int32_t *indirect;
2534 int32_t idx;
2535 const unsigned char *cp = str;
2536 int ch;
2537
2538 /* This #include defines a local function! */
2539 # include <locale/weight.h>
2540
2541 table = (const int32_t *)
2542 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
2543 weights = (const unsigned char *)
2544 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
2545 extra = (const unsigned char *)
2546 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
2547 indirect = (const int32_t *)
2548 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
2549
2550 idx = findidx (&cp);
2551 if (idx == 0 || cp < str + c1)
2552 /* This is no valid character. */
2553 FREE_STACK_RETURN (REG_ECOLLATE);
2554
2555 /* Throw away the ] at the end of the equivalence
2556 class. */
2557 PATFETCH (c);
2558
2559 /* Now we have to go throught the whole table
2560 and find all characters which have the same
2561 first level weight.
2562
2563 XXX Note that this is not entirely correct.
2564 we would have to match multibyte sequences
2565 but this is not possible with the current
2566 implementation. */
2567 for (ch = 1; ch < 256; ++ch)
2568 /* XXX This test would have to be changed if we
2569 would allow matching multibyte sequences. */
2570 if (table[ch] > 0)
2571 {
2572 int32_t idx2 = table[ch];
2573 size_t len = weights[idx2];
2574
2575 /* Test whether the lenghts match. */
2576 if (weights[idx] == len)
2577 {
2578 /* They do. New compare the bytes of
2579 the weight. */
2580 size_t cnt = 0;
2581
2582 while (cnt < len
2583 && (weights[idx + 1 + cnt]
2584 == weights[idx2 + 1 + cnt]))
2585 ++len;
2586
2587 if (cnt == len)
2588 /* They match. Mark the character as
2589 acceptable. */
2590 SET_LIST_BIT (ch);
2591 }
2592 }
2593 }
2594 #endif
2595 had_char_class = true;
2596 }
2597 else
2598 {
2599 c1++;
2600 while (c1--)
2601 PATUNFETCH;
2602 SET_LIST_BIT ('[');
2603 SET_LIST_BIT ('=');
2604 range_start = '=';
2605 had_char_class = false;
2606 }
2607 }
2608 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == '.')
2609 {
2610 unsigned char str[128]; /* Should be large enough. */
2611 #ifdef _LIBC
2612 uint32_t nrules =
2613 _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2614 #endif
2615
2616 PATFETCH (c);
2617 c1 = 0;
2618
2619 /* If pattern is `[[='. */
2620 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2621
2622 for (;;)
2623 {
2624 PATFETCH (c);
2625 if ((c == '.' && *p == ']') || p == pend)
2626 break;
2627 if (c1 < sizeof (str))
2628 str[c1++] = c;
2629 else
2630 /* This is in any case an invalid class name. */
2631 str[0] = '\0';
2632 }
2633 str[c1] = '\0';
2634
2635 if (c == '.' && *p == ']' && str[0] != '\0')
2636 {
2637 /* If we have no collation data we use the default
2638 collation in which each character is the name
2639 for its own class which contains only the one
2640 character. It also means that ASCII is the
2641 character set and therefore we cannot have character
2642 with more than one byte in the multibyte
2643 representation. */
2644 #ifdef _LIBC
2645 if (nrules == 0)
2646 #endif
2647 {
2648 if (c1 != 1)
2649 FREE_STACK_RETURN (REG_ECOLLATE);
2650
2651 /* Throw away the ] at the end of the equivalence
2652 class. */
2653 PATFETCH (c);
2654
2655 /* Set the bit for the character. */
2656 SET_LIST_BIT (str[0]);
2657 range_start = ((const unsigned char *) str)[0];
2658 }
2659 #ifdef _LIBC
2660 else
2661 {
2662 /* Try to match the byte sequence in `str' against
2663 those known to the collate implementation.
2664 First find out whether the bytes in `str' are
2665 actually from exactly one character. */
2666 int32_t table_size;
2667 const int32_t *symb_table;
2668 const unsigned char *extra;
2669 int32_t idx;
2670 int32_t elem;
2671 int32_t second;
2672 int32_t hash;
2673
2674 table_size =
2675 _NL_CURRENT_WORD (LC_COLLATE,
2676 _NL_COLLATE_SYMB_HASH_SIZEMB);
2677 symb_table = (const int32_t *)
2678 _NL_CURRENT (LC_COLLATE,
2679 _NL_COLLATE_SYMB_TABLEMB);
2680 extra = (const unsigned char *)
2681 _NL_CURRENT (LC_COLLATE,
2682 _NL_COLLATE_SYMB_EXTRAMB);
2683
2684 /* Locate the character in the hashing table. */
2685 hash = elem_hash (str, c1);
2686
2687 idx = 0;
2688 elem = hash % table_size;
2689 second = hash % (table_size - 2);
2690 while (symb_table[2 * elem] != 0)
2691 {
2692 /* First compare the hashing value. */
2693 if (symb_table[2 * elem] == hash
2694 && c1 == extra[symb_table[2 * elem + 1]]
2695 && memcmp (str,
2696 &extra[symb_table[2 * elem + 1]
2697 + 1],
2698 c1) == 0)
2699 {
2700 /* Yep, this is the entry. */
2701 idx = symb_table[2 * elem + 1];
2702 idx += 1 + extra[idx];
2703 break;
2704 }
2705
2706 /* Next entry. */
2707 elem += second;
2708 }
2709
2710 if (symb_table[2 * elem] == 0)
2711 /* This is no valid character. */
2712 FREE_STACK_RETURN (REG_ECOLLATE);
2713
2714 /* Throw away the ] at the end of the equivalence
2715 class. */
2716 PATFETCH (c);
2717
2718 /* Now add the multibyte character(s) we found
2719 to the accept list.
2720
2721 XXX Note that this is not entirely correct.
2722 we would have to match multibyte sequences
2723 but this is not possible with the current
2724 implementation. Also, we have to match
2725 collating symbols, which expand to more than
2726 one file, as a whole and not allow the
2727 individual bytes. */
2728 c1 = extra[idx++];
2729 if (c1 == 1)
2730 range_start = extra[idx];
2731 while (c1-- > 0)
2732 {
2733 SET_LIST_BIT (extra[idx]);
2734 ++idx;
2735 }
2736 }
2737 #endif
2738 had_char_class = false;
2739 }
2740 else
2741 {
2742 c1++;
2743 while (c1--)
2744 PATUNFETCH;
2745 SET_LIST_BIT ('[');
2746 SET_LIST_BIT ('.');
2747 range_start = '.';
2748 had_char_class = false;
2749 }
2750 }
2751 else
2752 {
2753 had_char_class = false;
2754 SET_LIST_BIT (c);
2755 range_start = c;
2756 }
2757 }
2758
2759 /* Discard any (non)matching list bytes that are all 0 at the
2760 end of the map. Decrease the map-length byte too. */
2761 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
2762 b[-1]--;
2763 b += b[-1];
2764 }
2765 break;
2766
2767
2768 case '(':
2769 if (syntax & RE_NO_BK_PARENS)
2770 goto handle_open;
2771 else
2772 goto normal_char;
2773
2774
2775 case ')':
2776 if (syntax & RE_NO_BK_PARENS)
2777 goto handle_close;
2778 else
2779 goto normal_char;
2780
2781
2782 case '\n':
2783 if (syntax & RE_NEWLINE_ALT)
2784 goto handle_alt;
2785 else
2786 goto normal_char;
2787
2788
2789 case '|':
2790 if (syntax & RE_NO_BK_VBAR)
2791 goto handle_alt;
2792 else
2793 goto normal_char;
2794
2795
2796 case '{':
2797 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
2798 goto handle_interval;
2799 else
2800 goto normal_char;
2801
2802
2803 case '\\':
2804 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2805
2806 /* Do not translate the character after the \, so that we can
2807 distinguish, e.g., \B from \b, even if we normally would
2808 translate, e.g., B to b. */
2809 PATFETCH_RAW (c);
2810
2811 switch (c)
2812 {
2813 case '(':
2814 if (syntax & RE_NO_BK_PARENS)
2815 goto normal_backslash;
2816
2817 handle_open:
2818 bufp->re_nsub++;
2819 regnum++;
2820
2821 if (COMPILE_STACK_FULL)
2822 {
2823 RETALLOC (compile_stack.stack, compile_stack.size << 1,
2824 compile_stack_elt_t);
2825 if (compile_stack.stack == NULL) return REG_ESPACE;
2826
2827 compile_stack.size <<= 1;
2828 }
2829
2830 /* These are the values to restore when we hit end of this
2831 group. They are all relative offsets, so that if the
2832 whole pattern moves because of realloc, they will still
2833 be valid. */
2834 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
2835 COMPILE_STACK_TOP.fixup_alt_jump
2836 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
2837 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
2838 COMPILE_STACK_TOP.regnum = regnum;
2839
2840 /* We will eventually replace the 0 with the number of
2841 groups inner to this one. But do not push a
2842 start_memory for groups beyond the last one we can
2843 represent in the compiled pattern. */
2844 if (regnum <= MAX_REGNUM)
2845 {
2846 COMPILE_STACK_TOP.inner_group_offset = b - bufp->buffer + 2;
2847 BUF_PUSH_3 (start_memory, regnum, 0);
2848 }
2849
2850 compile_stack.avail++;
2851
2852 fixup_alt_jump = 0;
2853 laststart = 0;
2854 begalt = b;
2855 /* If we've reached MAX_REGNUM groups, then this open
2856 won't actually generate any code, so we'll have to
2857 clear pending_exact explicitly. */
2858 pending_exact = 0;
2859 break;
2860
2861
2862 case ')':
2863 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
2864
2865 if (COMPILE_STACK_EMPTY)
2866 {
2867 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
2868 goto normal_backslash;
2869 else
2870 FREE_STACK_RETURN (REG_ERPAREN);
2871 }
2872
2873 handle_close:
2874 if (fixup_alt_jump)
2875 { /* Push a dummy failure point at the end of the
2876 alternative for a possible future
2877 `pop_failure_jump' to pop. See comments at
2878 `push_dummy_failure' in `re_match_2'. */
2879 BUF_PUSH (push_dummy_failure);
2880
2881 /* We allocated space for this jump when we assigned
2882 to `fixup_alt_jump', in the `handle_alt' case below. */
2883 STORE_JUMP (jump_past_alt, fixup_alt_jump, b - 1);
2884 }
2885
2886 /* See similar code for backslashed left paren above. */
2887 if (COMPILE_STACK_EMPTY)
2888 {
2889 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
2890 goto normal_char;
2891 else
2892 FREE_STACK_RETURN (REG_ERPAREN);
2893 }
2894
2895 /* Since we just checked for an empty stack above, this
2896 ``can't happen''. */
2897 assert (compile_stack.avail != 0);
2898 {
2899 /* We don't just want to restore into `regnum', because
2900 later groups should continue to be numbered higher,
2901 as in `(ab)c(de)' -- the second group is #2. */
2902 regnum_t this_group_regnum;
2903
2904 compile_stack.avail--;
2905 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
2906 fixup_alt_jump
2907 = COMPILE_STACK_TOP.fixup_alt_jump
2908 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
2909 : 0;
2910 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
2911 this_group_regnum = COMPILE_STACK_TOP.regnum;
2912 /* If we've reached MAX_REGNUM groups, then this open
2913 won't actually generate any code, so we'll have to
2914 clear pending_exact explicitly. */
2915 pending_exact = 0;
2916
2917 /* We're at the end of the group, so now we know how many
2918 groups were inside this one. */
2919 if (this_group_regnum <= MAX_REGNUM)
2920 {
2921 unsigned char *inner_group_loc
2922 = bufp->buffer + COMPILE_STACK_TOP.inner_group_offset;
2923
2924 *inner_group_loc = regnum - this_group_regnum;
2925 BUF_PUSH_3 (stop_memory, this_group_regnum,
2926 regnum - this_group_regnum);
2927 }
2928 }
2929 break;
2930
2931
2932 case '|': /* `\|'. */
2933 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
2934 goto normal_backslash;
2935 handle_alt:
2936 if (syntax & RE_LIMITED_OPS)
2937 goto normal_char;
2938
2939 /* Insert before the previous alternative a jump which
2940 jumps to this alternative if the former fails. */
2941 GET_BUFFER_SPACE (3);
2942 INSERT_JUMP (on_failure_jump, begalt, b + 6);
2943 pending_exact = 0;
2944 b += 3;
2945
2946 /* The alternative before this one has a jump after it
2947 which gets executed if it gets matched. Adjust that
2948 jump so it will jump to this alternative's analogous
2949 jump (put in below, which in turn will jump to the next
2950 (if any) alternative's such jump, etc.). The last such
2951 jump jumps to the correct final destination. A picture:
2952 _____ _____
2953 | | | |
2954 | v | v
2955 a | b | c
2956
2957 If we are at `b', then fixup_alt_jump right now points to a
2958 three-byte space after `a'. We'll put in the jump, set
2959 fixup_alt_jump to right after `b', and leave behind three
2960 bytes which we'll fill in when we get to after `c'. */
2961
2962 if (fixup_alt_jump)
2963 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
2964
2965 /* Mark and leave space for a jump after this alternative,
2966 to be filled in later either by next alternative or
2967 when know we're at the end of a series of alternatives. */
2968 fixup_alt_jump = b;
2969 GET_BUFFER_SPACE (3);
2970 b += 3;
2971
2972 laststart = 0;
2973 begalt = b;
2974 break;
2975
2976
2977 case '{':
2978 /* If \{ is a literal. */
2979 if (!(syntax & RE_INTERVALS)
2980 /* If we're at `\{' and it's not the open-interval
2981 operator. */
2982 || (syntax & RE_NO_BK_BRACES))
2983 goto normal_backslash;
2984
2985 handle_interval:
2986 {
2987 /* If got here, then the syntax allows intervals. */
2988
2989 /* At least (most) this many matches must be made. */
2990 int lower_bound = -1, upper_bound = -1;
2991
2992 beg_interval = p - 1;
2993
2994 if (p == pend)
2995 {
2996 if (!(syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
2997 goto unfetch_interval;
2998 else
2999 FREE_STACK_RETURN (REG_EBRACE);
3000 }
3001
3002 GET_UNSIGNED_NUMBER (lower_bound);
3003
3004 if (c == ',')
3005 {
3006 GET_UNSIGNED_NUMBER (upper_bound);
3007 if ((!(syntax & RE_NO_BK_BRACES) && c != '\\')
3008 || ((syntax & RE_NO_BK_BRACES) && c != '}'))
3009 FREE_STACK_RETURN (REG_BADBR);
3010
3011 if (upper_bound < 0)
3012 upper_bound = RE_DUP_MAX;
3013 }
3014 else
3015 /* Interval such as `{1}' => match exactly once. */
3016 upper_bound = lower_bound;
3017
3018 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
3019 || lower_bound > upper_bound)
3020 {
3021 if (!(syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
3022 goto unfetch_interval;
3023 else
3024 FREE_STACK_RETURN (REG_BADBR);
3025 }
3026
3027 if (!(syntax & RE_NO_BK_BRACES))
3028 {
3029 if (c != '\\') FREE_STACK_RETURN (REG_EBRACE);
3030
3031 PATFETCH (c);
3032 }
3033
3034 if (c != '}')
3035 {
3036 if (!(syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
3037 goto unfetch_interval;
3038 else
3039 FREE_STACK_RETURN (REG_BADBR);
3040 }
3041
3042 /* We just parsed a valid interval. */
3043
3044 /* If it's invalid to have no preceding re. */
3045 if (!laststart)
3046 {
3047 if (syntax & RE_CONTEXT_INVALID_OPS)
3048 FREE_STACK_RETURN (REG_BADRPT);
3049 else if (syntax & RE_CONTEXT_INDEP_OPS)
3050 laststart = b;
3051 else
3052 goto unfetch_interval;
3053 }
3054
3055 /* If the upper bound is zero, don't want to succeed at
3056 all; jump from `laststart' to `b + 3', which will be
3057 the end of the buffer after we insert the jump. */
3058 if (upper_bound == 0)
3059 {
3060 GET_BUFFER_SPACE (3);
3061 INSERT_JUMP (jump, laststart, b + 3);
3062 b += 3;
3063 }
3064
3065 /* Otherwise, we have a nontrivial interval. When
3066 we're all done, the pattern will look like:
3067 set_number_at <jump count> <upper bound>
3068 set_number_at <succeed_n count> <lower bound>
3069 succeed_n <after jump addr> <succeed_n count>
3070 <body of loop>
3071 jump_n <succeed_n addr> <jump count>
3072 (The upper bound and `jump_n' are omitted if
3073 `upper_bound' is 1, though.) */
3074 else
3075 { /* If the upper bound is > 1, we need to insert
3076 more at the end of the loop. */
3077 unsigned nbytes = 10 + (upper_bound > 1) * 10;
3078
3079 GET_BUFFER_SPACE (nbytes);
3080
3081 /* Initialize lower bound of the `succeed_n', even
3082 though it will be set during matching by its
3083 attendant `set_number_at' (inserted next),
3084 because `re_compile_fastmap' needs to know.
3085 Jump to the `jump_n' we might insert below. */
3086 INSERT_JUMP2 (succeed_n, laststart,
3087 b + 5 + (upper_bound > 1) * 5,
3088 lower_bound);
3089 b += 5;
3090
3091 /* Code to initialize the lower bound. Insert
3092 before the `succeed_n'. The `5' is the last two
3093 bytes of this `set_number_at', plus 3 bytes of
3094 the following `succeed_n'. */
3095 insert_op2 (set_number_at, laststart, 5, lower_bound, b);
3096 b += 5;
3097
3098 if (upper_bound > 1)
3099 { /* More than one repetition is allowed, so
3100 append a backward jump to the `succeed_n'
3101 that starts this interval.
3102
3103 When we've reached this during matching,
3104 we'll have matched the interval once, so
3105 jump back only `upper_bound - 1' times. */
3106 STORE_JUMP2 (jump_n, b, laststart + 5,
3107 upper_bound - 1);
3108 b += 5;
3109
3110 /* The location we want to set is the second
3111 parameter of the `jump_n'; that is `b-2' as
3112 an absolute address. `laststart' will be
3113 the `set_number_at' we're about to insert;
3114 `laststart+3' the number to set, the source
3115 for the relative address. But we are
3116 inserting into the middle of the pattern --
3117 so everything is getting moved up by 5.
3118 Conclusion: (b - 2) - (laststart + 3) + 5,
3119 i.e., b - laststart.
3120
3121 We insert this at the beginning of the loop
3122 so that if we fail during matching, we'll
3123 reinitialize the bounds. */
3124 insert_op2 (set_number_at, laststart, b - laststart,
3125 upper_bound - 1, b);
3126 b += 5;
3127 }
3128 }
3129 pending_exact = 0;
3130 beg_interval = NULL;
3131 }
3132 break;
3133
3134 unfetch_interval:
3135 /* If an invalid interval, match the characters as literals. */
3136 assert (beg_interval);
3137 p = beg_interval;
3138 beg_interval = NULL;
3139
3140 /* normal_char and normal_backslash need `c'. */
3141 PATFETCH (c);
3142
3143 if (!(syntax & RE_NO_BK_BRACES))
3144 {
3145 if (p > pattern && p[-1] == '\\')
3146 goto normal_backslash;
3147 }
3148 goto normal_char;
3149
3150 #ifdef emacs
3151 /* There is no way to specify the before_dot and after_dot
3152 operators. rms says this is ok. --karl */
3153 case '=':
3154 BUF_PUSH (at_dot);
3155 break;
3156
3157 case 's':
3158 laststart = b;
3159 PATFETCH (c);
3160 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
3161 break;
3162
3163 case 'S':
3164 laststart = b;
3165 PATFETCH (c);
3166 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
3167 break;
3168 #endif /* emacs */
3169
3170
3171 case 'w':
3172 if (syntax & RE_NO_GNU_OPS)
3173 goto normal_char;
3174 laststart = b;
3175 BUF_PUSH (wordchar);
3176 break;
3177
3178
3179 case 'W':
3180 if (syntax & RE_NO_GNU_OPS)
3181 goto normal_char;
3182 laststart = b;
3183 BUF_PUSH (notwordchar);
3184 break;
3185
3186
3187 case '<':
3188 if (syntax & RE_NO_GNU_OPS)
3189 goto normal_char;
3190 BUF_PUSH (wordbeg);
3191 break;
3192
3193 case '>':
3194 if (syntax & RE_NO_GNU_OPS)
3195 goto normal_char;
3196 BUF_PUSH (wordend);
3197 break;
3198
3199 case 'b':
3200 if (syntax & RE_NO_GNU_OPS)
3201 goto normal_char;
3202 BUF_PUSH (wordbound);
3203 break;
3204
3205 case 'B':
3206 if (syntax & RE_NO_GNU_OPS)
3207 goto normal_char;
3208 BUF_PUSH (notwordbound);
3209 break;
3210
3211 case '`':
3212 if (syntax & RE_NO_GNU_OPS)
3213 goto normal_char;
3214 BUF_PUSH (begbuf);
3215 break;
3216
3217 case '\'':
3218 if (syntax & RE_NO_GNU_OPS)
3219 goto normal_char;
3220 BUF_PUSH (endbuf);
3221 break;
3222
3223 case '1': case '2': case '3': case '4': case '5':
3224 case '6': case '7': case '8': case '9':
3225 if (syntax & RE_NO_BK_REFS)
3226 goto normal_char;
3227
3228 c1 = c - '0';
3229
3230 if (c1 > regnum)
3231 FREE_STACK_RETURN (REG_ESUBREG);
3232
3233 /* Can't back reference to a subexpression if inside of it. */
3234 if (group_in_compile_stack (compile_stack, (regnum_t) c1))
3235 goto normal_char;
3236
3237 laststart = b;
3238 BUF_PUSH_2 (duplicate, c1);
3239 break;
3240
3241
3242 case '+':
3243 case '?':
3244 if (syntax & RE_BK_PLUS_QM)
3245 goto handle_plus;
3246 else
3247 goto normal_backslash;
3248
3249 default:
3250 normal_backslash:
3251 /* You might think it would be useful for \ to mean
3252 not to translate; but if we don't translate it
3253 it will never match anything. */
3254 c = TRANSLATE (c);
3255 goto normal_char;
3256 }
3257 break;
3258
3259
3260 default:
3261 /* Expects the character in `c'. */
3262 normal_char:
3263 /* If no exactn currently being built. */
3264 if (!pending_exact
3265
3266 /* If last exactn not at current position. */
3267 || pending_exact + *pending_exact + 1 != b
3268
3269 /* We have only one byte following the exactn for the count. */
3270 || *pending_exact == (1 << BYTEWIDTH) - 1
3271
3272 /* If followed by a repetition operator. */
3273 || *p == '*' || *p == '^'
3274 || ((syntax & RE_BK_PLUS_QM)
3275 ? *p == '\\' && (p[1] == '+' || p[1] == '?')
3276 : (*p == '+' || *p == '?'))
3277 || ((syntax & RE_INTERVALS)
3278 && ((syntax & RE_NO_BK_BRACES)
3279 ? *p == '{'
3280 : (p[0] == '\\' && p[1] == '{'))))
3281 {
3282 /* Start building a new exactn. */
3283
3284 laststart = b;
3285
3286 BUF_PUSH_2 (exactn, 0);
3287 pending_exact = b - 1;
3288 }
3289
3290 BUF_PUSH (c);
3291 (*pending_exact)++;
3292 break;
3293 } /* switch (c) */
3294 } /* while p != pend */
3295
3296
3297 /* Through the pattern now. */
3298
3299 if (fixup_alt_jump)
3300 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
3301
3302 if (!COMPILE_STACK_EMPTY)
3303 FREE_STACK_RETURN (REG_EPAREN);
3304
3305 /* If we don't want backtracking, force success
3306 the first time we reach the end of the compiled pattern. */
3307 if (syntax & RE_NO_POSIX_BACKTRACKING)
3308 BUF_PUSH (succeed);
3309
3310 free (compile_stack.stack);
3311
3312 /* We have succeeded; set the length of the buffer. */
3313 bufp->used = b - bufp->buffer;
3314
3315 #ifdef DEBUG
3316 if (debug)
3317 {
3318 DEBUG_PRINT1 ("\nCompiled pattern: \n");
3319 print_compiled_pattern (bufp);
3320 }
3321 #endif /* DEBUG */
3322
3323 #ifndef MATCH_MAY_ALLOCATE
3324 /* Initialize the failure stack to the largest possible stack. This
3325 isn't necessary unless we're trying to avoid calling alloca in
3326 the search and match routines. */
3327 {
3328 int num_regs = bufp->re_nsub + 1;
3329
3330 /* Since DOUBLE_FAIL_STACK refuses to double only if the current size
3331 is strictly greater than re_max_failures, the largest possible stack
3332 is 2 * re_max_failures failure points. */
3333 if (fail_stack.size < (2 * re_max_failures * MAX_FAILURE_ITEMS))
3334 {
3335 fail_stack.size = (2 * re_max_failures * MAX_FAILURE_ITEMS);
3336
3337 # ifdef emacs
3338 if (! fail_stack.stack)
3339 fail_stack.stack
3340 = (fail_stack_elt_t *) xmalloc (fail_stack.size
3341 * sizeof (fail_stack_elt_t));
3342 else
3343 fail_stack.stack
3344 = (fail_stack_elt_t *) xrealloc (fail_stack.stack,
3345 (fail_stack.size
3346 * sizeof (fail_stack_elt_t)));
3347 # else /* not emacs */
3348 if (! fail_stack.stack)
3349 fail_stack.stack
3350 = (fail_stack_elt_t *) malloc (fail_stack.size
3351 * sizeof (fail_stack_elt_t));
3352 else
3353 fail_stack.stack
3354 = (fail_stack_elt_t *) realloc (fail_stack.stack,
3355 (fail_stack.size
3356 * sizeof (fail_stack_elt_t)));
3357 # endif /* not emacs */
3358 }
3359
3360 regex_grow_registers (num_regs);
3361 }
3362 #endif /* not MATCH_MAY_ALLOCATE */
3363
3364 return REG_NOERROR;
3365 } /* regex_compile */
3366 \f
3367 /* Subroutines for `regex_compile'. */
3368
3369 /* Store OP at LOC followed by two-byte integer parameter ARG. */
3370
3371 static void
3372 store_op1 (op, loc, arg)
3373 re_opcode_t op;
3374 unsigned char *loc;
3375 int arg;
3376 {
3377 *loc = (unsigned char) op;
3378 STORE_NUMBER (loc + 1, arg);
3379 }
3380
3381
3382 /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
3383
3384 static void
3385 store_op2 (op, loc, arg1, arg2)
3386 re_opcode_t op;
3387 unsigned char *loc;
3388 int arg1, arg2;
3389 {
3390 *loc = (unsigned char) op;
3391 STORE_NUMBER (loc + 1, arg1);
3392 STORE_NUMBER (loc + 3, arg2);
3393 }
3394
3395
3396 /* Copy the bytes from LOC to END to open up three bytes of space at LOC
3397 for OP followed by two-byte integer parameter ARG. */
3398
3399 static void
3400 insert_op1 (op, loc, arg, end)
3401 re_opcode_t op;
3402 unsigned char *loc;
3403 int arg;
3404 unsigned char *end;
3405 {
3406 register unsigned char *pfrom = end;
3407 register unsigned char *pto = end + 3;
3408
3409 while (pfrom != loc)
3410 *--pto = *--pfrom;
3411
3412 store_op1 (op, loc, arg);
3413 }
3414
3415
3416 /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
3417
3418 static void
3419 insert_op2 (op, loc, arg1, arg2, end)
3420 re_opcode_t op;
3421 unsigned char *loc;
3422 int arg1, arg2;
3423 unsigned char *end;
3424 {
3425 register unsigned char *pfrom = end;
3426 register unsigned char *pto = end + 5;
3427
3428 while (pfrom != loc)
3429 *--pto = *--pfrom;
3430
3431 store_op2 (op, loc, arg1, arg2);
3432 }
3433
3434
3435 /* P points to just after a ^ in PATTERN. Return true if that ^ comes
3436 after an alternative or a begin-subexpression. We assume there is at
3437 least one character before the ^. */
3438
3439 static boolean
3440 at_begline_loc_p (pattern, p, syntax)
3441 const char *pattern, *p;
3442 reg_syntax_t syntax;
3443 {
3444 const char *prev = p - 2;
3445 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\';
3446
3447 return
3448 /* After a subexpression? */
3449 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
3450 /* After an alternative? */
3451 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash));
3452 }
3453
3454
3455 /* The dual of at_begline_loc_p. This one is for $. We assume there is
3456 at least one character after the $, i.e., `P < PEND'. */
3457
3458 static boolean
3459 at_endline_loc_p (p, pend, syntax)
3460 const char *p, *pend;
3461 reg_syntax_t syntax;
3462 {
3463 const char *next = p;
3464 boolean next_backslash = *next == '\\';
3465 const char *next_next = p + 1 < pend ? p + 1 : 0;
3466
3467 return
3468 /* Before a subexpression? */
3469 (syntax & RE_NO_BK_PARENS ? *next == ')'
3470 : next_backslash && next_next && *next_next == ')')
3471 /* Before an alternative? */
3472 || (syntax & RE_NO_BK_VBAR ? *next == '|'
3473 : next_backslash && next_next && *next_next == '|');
3474 }
3475
3476
3477 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and
3478 false if it's not. */
3479
3480 static boolean
3481 group_in_compile_stack (compile_stack, regnum)
3482 compile_stack_type compile_stack;
3483 regnum_t regnum;
3484 {
3485 int this_element;
3486
3487 for (this_element = compile_stack.avail - 1;
3488 this_element >= 0;
3489 this_element--)
3490 if (compile_stack.stack[this_element].regnum == regnum)
3491 return true;
3492
3493 return false;
3494 }
3495
3496
3497 /* Read the ending character of a range (in a bracket expression) from the
3498 uncompiled pattern *P_PTR (which ends at PEND). We assume the
3499 starting character is in `P[-2]'. (`P[-1]' is the character `-'.)
3500 Then we set the translation of all bits between the starting and
3501 ending characters (inclusive) in the compiled pattern B.
3502
3503 Return an error code.
3504
3505 We use these short variable names so we can use the same macros as
3506 `regex_compile' itself. */
3507
3508 static reg_errcode_t
3509 compile_range (range_start_char, p_ptr, pend, translate, syntax, b)
3510 unsigned int range_start_char;
3511 const char **p_ptr, *pend;
3512 RE_TRANSLATE_TYPE translate;
3513 reg_syntax_t syntax;
3514 unsigned char *b;
3515 {
3516 unsigned this_char;
3517 const char *p = *p_ptr;
3518 reg_errcode_t ret;
3519 #if _LIBC
3520 const unsigned char *collseq;
3521 unsigned int start_colseq;
3522 unsigned int end_colseq;
3523 #else
3524 unsigned end_char;
3525 #endif
3526
3527 if (p == pend)
3528 return REG_ERANGE;
3529
3530 /* Have to increment the pointer into the pattern string, so the
3531 caller isn't still at the ending character. */
3532 (*p_ptr)++;
3533
3534 /* Report an error if the range is empty and the syntax prohibits this. */
3535 ret = syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
3536
3537 #if _LIBC
3538 collseq = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3539 _NL_COLLATE_COLLSEQMB);
3540
3541 start_colseq = collseq[(unsigned char) TRANSLATE (range_start_char)];
3542 end_colseq = collseq[(unsigned char) TRANSLATE (p[0])];
3543 for (this_char = 0; this_char <= (unsigned char) -1; ++this_char)
3544 {
3545 unsigned int this_colseq = collseq[(unsigned char) TRANSLATE (this_char)];
3546
3547 if (start_colseq <= this_colseq && this_colseq <= end_colseq)
3548 {
3549 SET_LIST_BIT (TRANSLATE (this_char));
3550 ret = REG_NOERROR;
3551 }
3552 }
3553 #else
3554 /* Here we see why `this_char' has to be larger than an `unsigned
3555 char' -- we would otherwise go into an infinite loop, since all
3556 characters <= 0xff. */
3557 range_start_char = TRANSLATE (range_start_char);
3558 end_char = TRANSLATE (p[0]);
3559 for (this_char = range_start_char; this_char <= end_char; ++this_char)
3560 {
3561 SET_LIST_BIT (TRANSLATE (this_char));
3562 ret = REG_NOERROR;
3563 }
3564 #endif
3565
3566 return ret;
3567 }
3568 \f
3569 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
3570 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
3571 characters can start a string that matches the pattern. This fastmap
3572 is used by re_search to skip quickly over impossible starting points.
3573
3574 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
3575 area as BUFP->fastmap.
3576
3577 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
3578 the pattern buffer.
3579
3580 Returns 0 if we succeed, -2 if an internal error. */
3581
3582 int
3583 re_compile_fastmap (bufp)
3584 struct re_pattern_buffer *bufp;
3585 {
3586 int j, k;
3587 #ifdef MATCH_MAY_ALLOCATE
3588 fail_stack_type fail_stack;
3589 #endif
3590 #ifndef REGEX_MALLOC
3591 char *destination;
3592 #endif
3593
3594 register char *fastmap = bufp->fastmap;
3595 unsigned char *pattern = bufp->buffer;
3596 unsigned char *p = pattern;
3597 register unsigned char *pend = pattern + bufp->used;
3598
3599 #ifdef REL_ALLOC
3600 /* This holds the pointer to the failure stack, when
3601 it is allocated relocatably. */
3602 fail_stack_elt_t *failure_stack_ptr;
3603 #endif
3604
3605 /* Assume that each path through the pattern can be null until
3606 proven otherwise. We set this false at the bottom of switch
3607 statement, to which we get only if a particular path doesn't
3608 match the empty string. */
3609 boolean path_can_be_null = true;
3610
3611 /* We aren't doing a `succeed_n' to begin with. */
3612 boolean succeed_n_p = false;
3613
3614 assert (fastmap != NULL && p != NULL);
3615
3616 INIT_FAIL_STACK ();
3617 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */
3618 bufp->fastmap_accurate = 1; /* It will be when we're done. */
3619 bufp->can_be_null = 0;
3620
3621 while (1)
3622 {
3623 if (p == pend || *p == succeed)
3624 {
3625 /* We have reached the (effective) end of pattern. */
3626 if (!FAIL_STACK_EMPTY ())
3627 {
3628 bufp->can_be_null |= path_can_be_null;
3629
3630 /* Reset for next path. */
3631 path_can_be_null = true;
3632
3633 p = fail_stack.stack[--fail_stack.avail].pointer;
3634
3635 continue;
3636 }
3637 else
3638 break;
3639 }
3640
3641 /* We should never be about to go beyond the end of the pattern. */
3642 assert (p < pend);
3643
3644 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
3645 {
3646
3647 /* I guess the idea here is to simply not bother with a fastmap
3648 if a backreference is used, since it's too hard to figure out
3649 the fastmap for the corresponding group. Setting
3650 `can_be_null' stops `re_search_2' from using the fastmap, so
3651 that is all we do. */
3652 case duplicate:
3653 bufp->can_be_null = 1;
3654 goto done;
3655
3656
3657 /* Following are the cases which match a character. These end
3658 with `break'. */
3659
3660 case exactn:
3661 fastmap[p[1]] = 1;
3662 break;
3663
3664
3665 case charset:
3666 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
3667 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
3668 fastmap[j] = 1;
3669 break;
3670
3671
3672 case charset_not:
3673 /* Chars beyond end of map must be allowed. */
3674 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
3675 fastmap[j] = 1;
3676
3677 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
3678 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
3679 fastmap[j] = 1;
3680 break;
3681
3682
3683 case wordchar:
3684 for (j = 0; j < (1 << BYTEWIDTH); j++)
3685 if (SYNTAX (j) == Sword)
3686 fastmap[j] = 1;
3687 break;
3688
3689
3690 case notwordchar:
3691 for (j = 0; j < (1 << BYTEWIDTH); j++)
3692 if (SYNTAX (j) != Sword)
3693 fastmap[j] = 1;
3694 break;
3695
3696
3697 case anychar:
3698 {
3699 int fastmap_newline = fastmap['\n'];
3700
3701 /* `.' matches anything ... */
3702 for (j = 0; j < (1 << BYTEWIDTH); j++)
3703 fastmap[j] = 1;
3704
3705 /* ... except perhaps newline. */
3706 if (!(bufp->syntax & RE_DOT_NEWLINE))
3707 fastmap['\n'] = fastmap_newline;
3708
3709 /* Return if we have already set `can_be_null'; if we have,
3710 then the fastmap is irrelevant. Something's wrong here. */
3711 else if (bufp->can_be_null)
3712 goto done;
3713
3714 /* Otherwise, have to check alternative paths. */
3715 break;
3716 }
3717
3718 #ifdef emacs
3719 case syntaxspec:
3720 k = *p++;
3721 for (j = 0; j < (1 << BYTEWIDTH); j++)
3722 if (SYNTAX (j) == (enum syntaxcode) k)
3723 fastmap[j] = 1;
3724 break;
3725
3726
3727 case notsyntaxspec:
3728 k = *p++;
3729 for (j = 0; j < (1 << BYTEWIDTH); j++)
3730 if (SYNTAX (j) != (enum syntaxcode) k)
3731 fastmap[j] = 1;
3732 break;
3733
3734
3735 /* All cases after this match the empty string. These end with
3736 `continue'. */
3737
3738
3739 case before_dot:
3740 case at_dot:
3741 case after_dot:
3742 continue;
3743 #endif /* emacs */
3744
3745
3746 case no_op:
3747 case begline:
3748 case endline:
3749 case begbuf:
3750 case endbuf:
3751 case wordbound:
3752 case notwordbound:
3753 case wordbeg:
3754 case wordend:
3755 case push_dummy_failure:
3756 continue;
3757
3758
3759 case jump_n:
3760 case pop_failure_jump:
3761 case maybe_pop_jump:
3762 case jump:
3763 case jump_past_alt:
3764 case dummy_failure_jump:
3765 EXTRACT_NUMBER_AND_INCR (j, p);
3766 p += j;
3767 if (j > 0)
3768 continue;
3769
3770 /* Jump backward implies we just went through the body of a
3771 loop and matched nothing. Opcode jumped to should be
3772 `on_failure_jump' or `succeed_n'. Just treat it like an
3773 ordinary jump. For a * loop, it has pushed its failure
3774 point already; if so, discard that as redundant. */
3775 if ((re_opcode_t) *p != on_failure_jump
3776 && (re_opcode_t) *p != succeed_n)
3777 continue;
3778
3779 p++;
3780 EXTRACT_NUMBER_AND_INCR (j, p);
3781 p += j;
3782
3783 /* If what's on the stack is where we are now, pop it. */
3784 if (!FAIL_STACK_EMPTY ()
3785 && fail_stack.stack[fail_stack.avail - 1].pointer == p)
3786 fail_stack.avail--;
3787
3788 continue;
3789
3790
3791 case on_failure_jump:
3792 case on_failure_keep_string_jump:
3793 handle_on_failure_jump:
3794 EXTRACT_NUMBER_AND_INCR (j, p);
3795
3796 /* For some patterns, e.g., `(a?)?', `p+j' here points to the
3797 end of the pattern. We don't want to push such a point,
3798 since when we restore it above, entering the switch will
3799 increment `p' past the end of the pattern. We don't need
3800 to push such a point since we obviously won't find any more
3801 fastmap entries beyond `pend'. Such a pattern can match
3802 the null string, though. */
3803 if (p + j < pend)
3804 {
3805 if (!PUSH_PATTERN_OP (p + j, fail_stack))
3806 {
3807 RESET_FAIL_STACK ();
3808 return -2;
3809 }
3810 }
3811 else
3812 bufp->can_be_null = 1;
3813
3814 if (succeed_n_p)
3815 {
3816 EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */
3817 succeed_n_p = false;
3818 }
3819
3820 continue;
3821
3822
3823 case succeed_n:
3824 /* Get to the number of times to succeed. */
3825 p += 2;
3826
3827 /* Increment p past the n for when k != 0. */
3828 EXTRACT_NUMBER_AND_INCR (k, p);
3829 if (k == 0)
3830 {
3831 p -= 4;
3832 succeed_n_p = true; /* Spaghetti code alert. */
3833 goto handle_on_failure_jump;
3834 }
3835 continue;
3836
3837
3838 case set_number_at:
3839 p += 4;
3840 continue;
3841
3842
3843 case start_memory:
3844 case stop_memory:
3845 p += 2;
3846 continue;
3847
3848
3849 default:
3850 abort (); /* We have listed all the cases. */
3851 } /* switch *p++ */
3852
3853 /* Getting here means we have found the possible starting
3854 characters for one path of the pattern -- and that the empty
3855 string does not match. We need not follow this path further.
3856 Instead, look at the next alternative (remembered on the
3857 stack), or quit if no more. The test at the top of the loop
3858 does these things. */
3859 path_can_be_null = false;
3860 p = pend;
3861 } /* while p */
3862
3863 /* Set `can_be_null' for the last path (also the first path, if the
3864 pattern is empty). */
3865 bufp->can_be_null |= path_can_be_null;
3866
3867 done:
3868 RESET_FAIL_STACK ();
3869 return 0;
3870 } /* re_compile_fastmap */
3871 #ifdef _LIBC
3872 weak_alias (__re_compile_fastmap, re_compile_fastmap)
3873 #endif
3874 \f
3875 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
3876 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
3877 this memory for recording register information. STARTS and ENDS
3878 must be allocated using the malloc library routine, and must each
3879 be at least NUM_REGS * sizeof (regoff_t) bytes long.
3880
3881 If NUM_REGS == 0, then subsequent matches should allocate their own
3882 register data.
3883
3884 Unless this function is called, the first search or match using
3885 PATTERN_BUFFER will allocate its own register data, without
3886 freeing the old data. */
3887
3888 void
3889 re_set_registers (bufp, regs, num_regs, starts, ends)
3890 struct re_pattern_buffer *bufp;
3891 struct re_registers *regs;
3892 unsigned num_regs;
3893 regoff_t *starts, *ends;
3894 {
3895 if (num_regs)
3896 {
3897 bufp->regs_allocated = REGS_REALLOCATE;
3898 regs->num_regs = num_regs;
3899 regs->start = starts;
3900 regs->end = ends;
3901 }
3902 else
3903 {
3904 bufp->regs_allocated = REGS_UNALLOCATED;
3905 regs->num_regs = 0;
3906 regs->start = regs->end = (regoff_t *) 0;
3907 }
3908 }
3909 #ifdef _LIBC
3910 weak_alias (__re_set_registers, re_set_registers)
3911 #endif
3912 \f
3913 /* Searching routines. */
3914
3915 /* Like re_search_2, below, but only one string is specified, and
3916 doesn't let you say where to stop matching. */
3917
3918 int
3919 re_search (bufp, string, size, startpos, range, regs)
3920 struct re_pattern_buffer *bufp;
3921 const char *string;
3922 int size, startpos, range;
3923 struct re_registers *regs;
3924 {
3925 return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
3926 regs, size);
3927 }
3928 #ifdef _LIBC
3929 weak_alias (__re_search, re_search)
3930 #endif
3931
3932
3933 /* Using the compiled pattern in BUFP->buffer, first tries to match the
3934 virtual concatenation of STRING1 and STRING2, starting first at index
3935 STARTPOS, then at STARTPOS + 1, and so on.
3936
3937 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
3938
3939 RANGE is how far to scan while trying to match. RANGE = 0 means try
3940 only at STARTPOS; in general, the last start tried is STARTPOS +
3941 RANGE.
3942
3943 In REGS, return the indices of the virtual concatenation of STRING1
3944 and STRING2 that matched the entire BUFP->buffer and its contained
3945 subexpressions.
3946
3947 Do not consider matching one past the index STOP in the virtual
3948 concatenation of STRING1 and STRING2.
3949
3950 We return either the position in the strings at which the match was
3951 found, -1 if no match, or -2 if error (such as failure
3952 stack overflow). */
3953
3954 int
3955 re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop)
3956 struct re_pattern_buffer *bufp;
3957 const char *string1, *string2;
3958 int size1, size2;
3959 int startpos;
3960 int range;
3961 struct re_registers *regs;
3962 int stop;
3963 {
3964 int val;
3965 register char *fastmap = bufp->fastmap;
3966 register RE_TRANSLATE_TYPE translate = bufp->translate;
3967 int total_size = size1 + size2;
3968 int endpos = startpos + range;
3969
3970 /* Check for out-of-range STARTPOS. */
3971 if (startpos < 0 || startpos > total_size)
3972 return -1;
3973
3974 /* Fix up RANGE if it might eventually take us outside
3975 the virtual concatenation of STRING1 and STRING2.
3976 Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE. */
3977 if (endpos < 0)
3978 range = 0 - startpos;
3979 else if (endpos > total_size)
3980 range = total_size - startpos;
3981
3982 /* If the search isn't to be a backwards one, don't waste time in a
3983 search for a pattern that must be anchored. */
3984 if (bufp->used > 0 && range > 0
3985 && ((re_opcode_t) bufp->buffer[0] == begbuf
3986 /* `begline' is like `begbuf' if it cannot match at newlines. */
3987 || ((re_opcode_t) bufp->buffer[0] == begline
3988 && !bufp->newline_anchor)))
3989 {
3990 if (startpos > 0)
3991 return -1;
3992 else
3993 range = 1;
3994 }
3995
3996 #ifdef emacs
3997 /* In a forward search for something that starts with \=.
3998 don't keep searching past point. */
3999 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
4000 {
4001 range = PT - startpos;
4002 if (range <= 0)
4003 return -1;
4004 }
4005 #endif /* emacs */
4006
4007 /* Update the fastmap now if not correct already. */
4008 if (fastmap && !bufp->fastmap_accurate)
4009 if (re_compile_fastmap (bufp) == -2)
4010 return -2;
4011
4012 /* Loop through the string, looking for a place to start matching. */
4013 for (;;)
4014 {
4015 /* If a fastmap is supplied, skip quickly over characters that
4016 cannot be the start of a match. If the pattern can match the
4017 null string, however, we don't need to skip characters; we want
4018 the first null string. */
4019 if (fastmap && startpos < total_size && !bufp->can_be_null)
4020 {
4021 if (range > 0) /* Searching forwards. */
4022 {
4023 register const char *d;
4024 register int lim = 0;
4025 int irange = range;
4026
4027 if (startpos < size1 && startpos + range >= size1)
4028 lim = range - (size1 - startpos);
4029
4030 d = (startpos >= size1 ? string2 - size1 : string1) + startpos;
4031
4032 /* Written out as an if-else to avoid testing `translate'
4033 inside the loop. */
4034 if (translate)
4035 while (range > lim
4036 && !fastmap[(unsigned char)
4037 translate[(unsigned char) *d++]])
4038 range--;
4039 else
4040 while (range > lim && !fastmap[(unsigned char) *d++])
4041 range--;
4042
4043 startpos += irange - range;
4044 }
4045 else /* Searching backwards. */
4046 {
4047 register char c = (size1 == 0 || startpos >= size1
4048 ? string2[startpos - size1]
4049 : string1[startpos]);
4050
4051 if (!fastmap[(unsigned char) TRANSLATE (c)])
4052 goto advance;
4053 }
4054 }
4055
4056 /* If can't match the null string, and that's all we have left, fail. */
4057 if (range >= 0 && startpos == total_size && fastmap
4058 && !bufp->can_be_null)
4059 return -1;
4060
4061 val = re_match_2_internal (bufp, string1, size1, string2, size2,
4062 startpos, regs, stop);
4063 #ifndef REGEX_MALLOC
4064 # ifdef C_ALLOCA
4065 alloca (0);
4066 # endif
4067 #endif
4068
4069 if (val >= 0)
4070 return startpos;
4071
4072 if (val == -2)
4073 return -2;
4074
4075 advance:
4076 if (!range)
4077 break;
4078 else if (range > 0)
4079 {
4080 range--;
4081 startpos++;
4082 }
4083 else
4084 {
4085 range++;
4086 startpos--;
4087 }
4088 }
4089 return -1;
4090 } /* re_search_2 */
4091 #ifdef _LIBC
4092 weak_alias (__re_search_2, re_search_2)
4093 #endif
4094 \f
4095 /* This converts PTR, a pointer into one of the search strings `string1'
4096 and `string2' into an offset from the beginning of that string. */
4097 #define POINTER_TO_OFFSET(ptr) \
4098 (FIRST_STRING_P (ptr) \
4099 ? ((regoff_t) ((ptr) - string1)) \
4100 : ((regoff_t) ((ptr) - string2 + size1)))
4101
4102 /* Macros for dealing with the split strings in re_match_2. */
4103
4104 #define MATCHING_IN_FIRST_STRING (dend == end_match_1)
4105
4106 /* Call before fetching a character with *d. This switches over to
4107 string2 if necessary. */
4108 #define PREFETCH() \
4109 while (d == dend) \
4110 { \
4111 /* End of string2 => fail. */ \
4112 if (dend == end_match_2) \
4113 goto fail; \
4114 /* End of string1 => advance to string2. */ \
4115 d = string2; \
4116 dend = end_match_2; \
4117 }
4118
4119
4120 /* Test if at very beginning or at very end of the virtual concatenation
4121 of `string1' and `string2'. If only one string, it's `string2'. */
4122 #define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
4123 #define AT_STRINGS_END(d) ((d) == end2)
4124
4125
4126 /* Test if D points to a character which is word-constituent. We have
4127 two special cases to check for: if past the end of string1, look at
4128 the first character in string2; and if before the beginning of
4129 string2, look at the last character in string1. */
4130 #define WORDCHAR_P(d) \
4131 (SYNTAX ((d) == end1 ? *string2 \
4132 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \
4133 == Sword)
4134
4135 /* Disabled due to a compiler bug -- see comment at case wordbound */
4136 #if 0
4137 /* Test if the character before D and the one at D differ with respect
4138 to being word-constituent. */
4139 #define AT_WORD_BOUNDARY(d) \
4140 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
4141 || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
4142 #endif
4143
4144 /* Free everything we malloc. */
4145 #ifdef MATCH_MAY_ALLOCATE
4146 # define FREE_VAR(var) if (var) REGEX_FREE (var); var = NULL
4147 # define FREE_VARIABLES() \
4148 do { \
4149 REGEX_FREE_STACK (fail_stack.stack); \
4150 FREE_VAR (regstart); \
4151 FREE_VAR (regend); \
4152 FREE_VAR (old_regstart); \
4153 FREE_VAR (old_regend); \
4154 FREE_VAR (best_regstart); \
4155 FREE_VAR (best_regend); \
4156 FREE_VAR (reg_info); \
4157 FREE_VAR (reg_dummy); \
4158 FREE_VAR (reg_info_dummy); \
4159 } while (0)
4160 #else
4161 # define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */
4162 #endif /* not MATCH_MAY_ALLOCATE */
4163
4164 /* These values must meet several constraints. They must not be valid
4165 register values; since we have a limit of 255 registers (because
4166 we use only one byte in the pattern for the register number), we can
4167 use numbers larger than 255. They must differ by 1, because of
4168 NUM_FAILURE_ITEMS above. And the value for the lowest register must
4169 be larger than the value for the highest register, so we do not try
4170 to actually save any registers when none are active. */
4171 #define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH)
4172 #define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1)
4173 \f
4174 /* Matching routines. */
4175
4176 #ifndef emacs /* Emacs never uses this. */
4177 /* re_match is like re_match_2 except it takes only a single string. */
4178
4179 int
4180 re_match (bufp, string, size, pos, regs)
4181 struct re_pattern_buffer *bufp;
4182 const char *string;
4183 int size, pos;
4184 struct re_registers *regs;
4185 {
4186 int result = re_match_2_internal (bufp, NULL, 0, string, size,
4187 pos, regs, size);
4188 # ifndef REGEX_MALLOC
4189 # ifdef C_ALLOCA
4190 alloca (0);
4191 # endif
4192 # endif
4193 return result;
4194 }
4195 # ifdef _LIBC
4196 weak_alias (__re_match, re_match)
4197 # endif
4198 #endif /* not emacs */
4199
4200 static boolean group_match_null_string_p _RE_ARGS ((unsigned char **p,
4201 unsigned char *end,
4202 register_info_type *reg_info));
4203 static boolean alt_match_null_string_p _RE_ARGS ((unsigned char *p,
4204 unsigned char *end,
4205 register_info_type *reg_info));
4206 static boolean common_op_match_null_string_p _RE_ARGS ((unsigned char **p,
4207 unsigned char *end,
4208 register_info_type *reg_info));
4209 static int bcmp_translate _RE_ARGS ((const char *s1, const char *s2,
4210 int len, char *translate));
4211
4212 /* re_match_2 matches the compiled pattern in BUFP against the
4213 the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
4214 and SIZE2, respectively). We start matching at POS, and stop
4215 matching at STOP.
4216
4217 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we
4218 store offsets for the substring each group matched in REGS. See the
4219 documentation for exactly how many groups we fill.
4220
4221 We return -1 if no match, -2 if an internal error (such as the
4222 failure stack overflowing). Otherwise, we return the length of the
4223 matched substring. */
4224
4225 int
4226 re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
4227 struct re_pattern_buffer *bufp;
4228 const char *string1, *string2;
4229 int size1, size2;
4230 int pos;
4231 struct re_registers *regs;
4232 int stop;
4233 {
4234 int result = re_match_2_internal (bufp, string1, size1, string2, size2,
4235 pos, regs, stop);
4236 #ifndef REGEX_MALLOC
4237 # ifdef C_ALLOCA
4238 alloca (0);
4239 # endif
4240 #endif
4241 return result;
4242 }
4243 #ifdef _LIBC
4244 weak_alias (__re_match_2, re_match_2)
4245 #endif
4246
4247 /* This is a separate function so that we can force an alloca cleanup
4248 afterwards. */
4249 static int
4250 re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
4251 struct re_pattern_buffer *bufp;
4252 const char *string1, *string2;
4253 int size1, size2;
4254 int pos;
4255 struct re_registers *regs;
4256 int stop;
4257 {
4258 /* General temporaries. */
4259 int mcnt;
4260 unsigned char *p1;
4261
4262 /* Just past the end of the corresponding string. */
4263 const char *end1, *end2;
4264
4265 /* Pointers into string1 and string2, just past the last characters in
4266 each to consider matching. */
4267 const char *end_match_1, *end_match_2;
4268
4269 /* Where we are in the data, and the end of the current string. */
4270 const char *d, *dend;
4271
4272 /* Where we are in the pattern, and the end of the pattern. */
4273 unsigned char *p = bufp->buffer;
4274 register unsigned char *pend = p + bufp->used;
4275
4276 /* Mark the opcode just after a start_memory, so we can test for an
4277 empty subpattern when we get to the stop_memory. */
4278 unsigned char *just_past_start_mem = 0;
4279
4280 /* We use this to map every character in the string. */
4281 RE_TRANSLATE_TYPE translate = bufp->translate;
4282
4283 /* Failure point stack. Each place that can handle a failure further
4284 down the line pushes a failure point on this stack. It consists of
4285 restart, regend, and reg_info for all registers corresponding to
4286 the subexpressions we're currently inside, plus the number of such
4287 registers, and, finally, two char *'s. The first char * is where
4288 to resume scanning the pattern; the second one is where to resume
4289 scanning the strings. If the latter is zero, the failure point is
4290 a ``dummy''; if a failure happens and the failure point is a dummy,
4291 it gets discarded and the next next one is tried. */
4292 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
4293 fail_stack_type fail_stack;
4294 #endif
4295 #ifdef DEBUG
4296 static unsigned failure_id;
4297 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
4298 #endif
4299
4300 #ifdef REL_ALLOC
4301 /* This holds the pointer to the failure stack, when
4302 it is allocated relocatably. */
4303 fail_stack_elt_t *failure_stack_ptr;
4304 #endif
4305
4306 /* We fill all the registers internally, independent of what we
4307 return, for use in backreferences. The number here includes
4308 an element for register zero. */
4309 size_t num_regs = bufp->re_nsub + 1;
4310
4311 /* The currently active registers. */
4312 active_reg_t lowest_active_reg = NO_LOWEST_ACTIVE_REG;
4313 active_reg_t highest_active_reg = NO_HIGHEST_ACTIVE_REG;
4314
4315 /* Information on the contents of registers. These are pointers into
4316 the input strings; they record just what was matched (on this
4317 attempt) by a subexpression part of the pattern, that is, the
4318 regnum-th regstart pointer points to where in the pattern we began
4319 matching and the regnum-th regend points to right after where we
4320 stopped matching the regnum-th subexpression. (The zeroth register
4321 keeps track of what the whole pattern matches.) */
4322 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4323 const char **regstart, **regend;
4324 #endif
4325
4326 /* If a group that's operated upon by a repetition operator fails to
4327 match anything, then the register for its start will need to be
4328 restored because it will have been set to wherever in the string we
4329 are when we last see its open-group operator. Similarly for a
4330 register's end. */
4331 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4332 const char **old_regstart, **old_regend;
4333 #endif
4334
4335 /* The is_active field of reg_info helps us keep track of which (possibly
4336 nested) subexpressions we are currently in. The matched_something
4337 field of reg_info[reg_num] helps us tell whether or not we have
4338 matched any of the pattern so far this time through the reg_num-th
4339 subexpression. These two fields get reset each time through any
4340 loop their register is in. */
4341 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
4342 register_info_type *reg_info;
4343 #endif
4344
4345 /* The following record the register info as found in the above
4346 variables when we find a match better than any we've seen before.
4347 This happens as we backtrack through the failure points, which in
4348 turn happens only if we have not yet matched the entire string. */
4349 unsigned best_regs_set = false;
4350 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4351 const char **best_regstart, **best_regend;
4352 #endif
4353
4354 /* Logically, this is `best_regend[0]'. But we don't want to have to
4355 allocate space for that if we're not allocating space for anything
4356 else (see below). Also, we never need info about register 0 for
4357 any of the other register vectors, and it seems rather a kludge to
4358 treat `best_regend' differently than the rest. So we keep track of
4359 the end of the best match so far in a separate variable. We
4360 initialize this to NULL so that when we backtrack the first time
4361 and need to test it, it's not garbage. */
4362 const char *match_end = NULL;
4363
4364 /* This helps SET_REGS_MATCHED avoid doing redundant work. */
4365 int set_regs_matched_done = 0;
4366
4367 /* Used when we pop values we don't care about. */
4368 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4369 const char **reg_dummy;
4370 register_info_type *reg_info_dummy;
4371 #endif
4372
4373 #ifdef DEBUG
4374 /* Counts the total number of registers pushed. */
4375 unsigned num_regs_pushed = 0;
4376 #endif
4377
4378 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
4379
4380 INIT_FAIL_STACK ();
4381
4382 #ifdef MATCH_MAY_ALLOCATE
4383 /* Do not bother to initialize all the register variables if there are
4384 no groups in the pattern, as it takes a fair amount of time. If
4385 there are groups, we include space for register 0 (the whole
4386 pattern), even though we never use it, since it simplifies the
4387 array indexing. We should fix this. */
4388 if (bufp->re_nsub)
4389 {
4390 regstart = REGEX_TALLOC (num_regs, const char *);
4391 regend = REGEX_TALLOC (num_regs, const char *);
4392 old_regstart = REGEX_TALLOC (num_regs, const char *);
4393 old_regend = REGEX_TALLOC (num_regs, const char *);
4394 best_regstart = REGEX_TALLOC (num_regs, const char *);
4395 best_regend = REGEX_TALLOC (num_regs, const char *);
4396 reg_info = REGEX_TALLOC (num_regs, register_info_type);
4397 reg_dummy = REGEX_TALLOC (num_regs, const char *);
4398 reg_info_dummy = REGEX_TALLOC (num_regs, register_info_type);
4399
4400 if (!(regstart && regend && old_regstart && old_regend && reg_info
4401 && best_regstart && best_regend && reg_dummy && reg_info_dummy))
4402 {
4403 FREE_VARIABLES ();
4404 return -2;
4405 }
4406 }
4407 else
4408 {
4409 /* We must initialize all our variables to NULL, so that
4410 `FREE_VARIABLES' doesn't try to free them. */
4411 regstart = regend = old_regstart = old_regend = best_regstart
4412 = best_regend = reg_dummy = NULL;
4413 reg_info = reg_info_dummy = (register_info_type *) NULL;
4414 }
4415 #endif /* MATCH_MAY_ALLOCATE */
4416
4417 /* The starting position is bogus. */
4418 if (pos < 0 || pos > size1 + size2)
4419 {
4420 FREE_VARIABLES ();
4421 return -1;
4422 }
4423
4424 /* Initialize subexpression text positions to -1 to mark ones that no
4425 start_memory/stop_memory has been seen for. Also initialize the
4426 register information struct. */
4427 for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++)
4428 {
4429 regstart[mcnt] = regend[mcnt]
4430 = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE;
4431
4432 REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE;
4433 IS_ACTIVE (reg_info[mcnt]) = 0;
4434 MATCHED_SOMETHING (reg_info[mcnt]) = 0;
4435 EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0;
4436 }
4437
4438 /* We move `string1' into `string2' if the latter's empty -- but not if
4439 `string1' is null. */
4440 if (size2 == 0 && string1 != NULL)
4441 {
4442 string2 = string1;
4443 size2 = size1;
4444 string1 = 0;
4445 size1 = 0;
4446 }
4447 end1 = string1 + size1;
4448 end2 = string2 + size2;
4449
4450 /* Compute where to stop matching, within the two strings. */
4451 if (stop <= size1)
4452 {
4453 end_match_1 = string1 + stop;
4454 end_match_2 = string2;
4455 }
4456 else
4457 {
4458 end_match_1 = end1;
4459 end_match_2 = string2 + stop - size1;
4460 }
4461
4462 /* `p' scans through the pattern as `d' scans through the data.
4463 `dend' is the end of the input string that `d' points within. `d'
4464 is advanced into the following input string whenever necessary, but
4465 this happens before fetching; therefore, at the beginning of the
4466 loop, `d' can be pointing at the end of a string, but it cannot
4467 equal `string2'. */
4468 if (size1 > 0 && pos <= size1)
4469 {
4470 d = string1 + pos;
4471 dend = end_match_1;
4472 }
4473 else
4474 {
4475 d = string2 + pos - size1;
4476 dend = end_match_2;
4477 }
4478
4479 DEBUG_PRINT1 ("The compiled pattern is:\n");
4480 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
4481 DEBUG_PRINT1 ("The string to match is: `");
4482 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
4483 DEBUG_PRINT1 ("'\n");
4484
4485 /* This loops over pattern commands. It exits by returning from the
4486 function if the match is complete, or it drops through if the match
4487 fails at this starting point in the input data. */
4488 for (;;)
4489 {
4490 #ifdef _LIBC
4491 DEBUG_PRINT2 ("\n%p: ", p);
4492 #else
4493 DEBUG_PRINT2 ("\n0x%x: ", p);
4494 #endif
4495
4496 if (p == pend)
4497 { /* End of pattern means we might have succeeded. */
4498 DEBUG_PRINT1 ("end of pattern ... ");
4499
4500 /* If we haven't matched the entire string, and we want the
4501 longest match, try backtracking. */
4502 if (d != end_match_2)
4503 {
4504 /* 1 if this match ends in the same string (string1 or string2)
4505 as the best previous match. */
4506 boolean same_str_p = (FIRST_STRING_P (match_end)
4507 == MATCHING_IN_FIRST_STRING);
4508 /* 1 if this match is the best seen so far. */
4509 boolean best_match_p;
4510
4511 /* AIX compiler got confused when this was combined
4512 with the previous declaration. */
4513 if (same_str_p)
4514 best_match_p = d > match_end;
4515 else
4516 best_match_p = !MATCHING_IN_FIRST_STRING;
4517
4518 DEBUG_PRINT1 ("backtracking.\n");
4519
4520 if (!FAIL_STACK_EMPTY ())
4521 { /* More failure points to try. */
4522
4523 /* If exceeds best match so far, save it. */
4524 if (!best_regs_set || best_match_p)
4525 {
4526 best_regs_set = true;
4527 match_end = d;
4528
4529 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
4530
4531 for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++)
4532 {
4533 best_regstart[mcnt] = regstart[mcnt];
4534 best_regend[mcnt] = regend[mcnt];
4535 }
4536 }
4537 goto fail;
4538 }
4539
4540 /* If no failure points, don't restore garbage. And if
4541 last match is real best match, don't restore second
4542 best one. */
4543 else if (best_regs_set && !best_match_p)
4544 {
4545 restore_best_regs:
4546 /* Restore best match. It may happen that `dend ==
4547 end_match_1' while the restored d is in string2.
4548 For example, the pattern `x.*y.*z' against the
4549 strings `x-' and `y-z-', if the two strings are
4550 not consecutive in memory. */
4551 DEBUG_PRINT1 ("Restoring best registers.\n");
4552
4553 d = match_end;
4554 dend = ((d >= string1 && d <= end1)
4555 ? end_match_1 : end_match_2);
4556
4557 for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++)
4558 {
4559 regstart[mcnt] = best_regstart[mcnt];
4560 regend[mcnt] = best_regend[mcnt];
4561 }
4562 }
4563 } /* d != end_match_2 */
4564
4565 succeed_label:
4566 DEBUG_PRINT1 ("Accepting match.\n");
4567
4568 /* If caller wants register contents data back, do it. */
4569 if (regs && !bufp->no_sub)
4570 {
4571 /* Have the register data arrays been allocated? */
4572 if (bufp->regs_allocated == REGS_UNALLOCATED)
4573 { /* No. So allocate them with malloc. We need one
4574 extra element beyond `num_regs' for the `-1' marker
4575 GNU code uses. */
4576 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
4577 regs->start = TALLOC (regs->num_regs, regoff_t);
4578 regs->end = TALLOC (regs->num_regs, regoff_t);
4579 if (regs->start == NULL || regs->end == NULL)
4580 {
4581 FREE_VARIABLES ();
4582 return -2;
4583 }
4584 bufp->regs_allocated = REGS_REALLOCATE;
4585 }
4586 else if (bufp->regs_allocated == REGS_REALLOCATE)
4587 { /* Yes. If we need more elements than were already
4588 allocated, reallocate them. If we need fewer, just
4589 leave it alone. */
4590 if (regs->num_regs < num_regs + 1)
4591 {
4592 regs->num_regs = num_regs + 1;
4593 RETALLOC (regs->start, regs->num_regs, regoff_t);
4594 RETALLOC (regs->end, regs->num_regs, regoff_t);
4595 if (regs->start == NULL || regs->end == NULL)
4596 {
4597 FREE_VARIABLES ();
4598 return -2;
4599 }
4600 }
4601 }
4602 else
4603 {
4604 /* These braces fend off a "empty body in an else-statement"
4605 warning under GCC when assert expands to nothing. */
4606 assert (bufp->regs_allocated == REGS_FIXED);
4607 }
4608
4609 /* Convert the pointer data in `regstart' and `regend' to
4610 indices. Register zero has to be set differently,
4611 since we haven't kept track of any info for it. */
4612 if (regs->num_regs > 0)
4613 {
4614 regs->start[0] = pos;
4615 regs->end[0] = (MATCHING_IN_FIRST_STRING
4616 ? ((regoff_t) (d - string1))
4617 : ((regoff_t) (d - string2 + size1)));
4618 }
4619
4620 /* Go through the first `min (num_regs, regs->num_regs)'
4621 registers, since that is all we initialized. */
4622 for (mcnt = 1; (unsigned) mcnt < MIN (num_regs, regs->num_regs);
4623 mcnt++)
4624 {
4625 if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt]))
4626 regs->start[mcnt] = regs->end[mcnt] = -1;
4627 else
4628 {
4629 regs->start[mcnt]
4630 = (regoff_t) POINTER_TO_OFFSET (regstart[mcnt]);
4631 regs->end[mcnt]
4632 = (regoff_t) POINTER_TO_OFFSET (regend[mcnt]);
4633 }
4634 }
4635
4636 /* If the regs structure we return has more elements than
4637 were in the pattern, set the extra elements to -1. If
4638 we (re)allocated the registers, this is the case,
4639 because we always allocate enough to have at least one
4640 -1 at the end. */
4641 for (mcnt = num_regs; (unsigned) mcnt < regs->num_regs; mcnt++)
4642 regs->start[mcnt] = regs->end[mcnt] = -1;
4643 } /* regs && !bufp->no_sub */
4644
4645 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
4646 nfailure_points_pushed, nfailure_points_popped,
4647 nfailure_points_pushed - nfailure_points_popped);
4648 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
4649
4650 mcnt = d - pos - (MATCHING_IN_FIRST_STRING
4651 ? string1
4652 : string2 - size1);
4653
4654 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
4655
4656 FREE_VARIABLES ();
4657 return mcnt;
4658 }
4659
4660 /* Otherwise match next pattern command. */
4661 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
4662 {
4663 /* Ignore these. Used to ignore the n of succeed_n's which
4664 currently have n == 0. */
4665 case no_op:
4666 DEBUG_PRINT1 ("EXECUTING no_op.\n");
4667 break;
4668
4669 case succeed:
4670 DEBUG_PRINT1 ("EXECUTING succeed.\n");
4671 goto succeed_label;
4672
4673 /* Match the next n pattern characters exactly. The following
4674 byte in the pattern defines n, and the n bytes after that
4675 are the characters to match. */
4676 case exactn:
4677 mcnt = *p++;
4678 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
4679
4680 /* This is written out as an if-else so we don't waste time
4681 testing `translate' inside the loop. */
4682 if (translate)
4683 {
4684 do
4685 {
4686 PREFETCH ();
4687 if ((unsigned char) translate[(unsigned char) *d++]
4688 != (unsigned char) *p++)
4689 goto fail;
4690 }
4691 while (--mcnt);
4692 }
4693 else
4694 {
4695 do
4696 {
4697 PREFETCH ();
4698 if (*d++ != (char) *p++) goto fail;
4699 }
4700 while (--mcnt);
4701 }
4702 SET_REGS_MATCHED ();
4703 break;
4704
4705
4706 /* Match any character except possibly a newline or a null. */
4707 case anychar:
4708 DEBUG_PRINT1 ("EXECUTING anychar.\n");
4709
4710 PREFETCH ();
4711
4712 if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n')
4713 || (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000'))
4714 goto fail;
4715
4716 SET_REGS_MATCHED ();
4717 DEBUG_PRINT2 (" Matched `%d'.\n", *d);
4718 d++;
4719 break;
4720
4721
4722 case charset:
4723 case charset_not:
4724 {
4725 register unsigned char c;
4726 boolean not = (re_opcode_t) *(p - 1) == charset_not;
4727
4728 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
4729
4730 PREFETCH ();
4731 c = TRANSLATE (*d); /* The character to match. */
4732
4733 /* Cast to `unsigned' instead of `unsigned char' in case the
4734 bit list is a full 32 bytes long. */
4735 if (c < (unsigned) (*p * BYTEWIDTH)
4736 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4737 not = !not;
4738
4739 p += 1 + *p;
4740
4741 if (!not) goto fail;
4742
4743 SET_REGS_MATCHED ();
4744 d++;
4745 break;
4746 }
4747
4748
4749 /* The beginning of a group is represented by start_memory.
4750 The arguments are the register number in the next byte, and the
4751 number of groups inner to this one in the next. The text
4752 matched within the group is recorded (in the internal
4753 registers data structure) under the register number. */
4754 case start_memory:
4755 DEBUG_PRINT3 ("EXECUTING start_memory %d (%d):\n", *p, p[1]);
4756
4757 /* Find out if this group can match the empty string. */
4758 p1 = p; /* To send to group_match_null_string_p. */
4759
4760 if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE)
4761 REG_MATCH_NULL_STRING_P (reg_info[*p])
4762 = group_match_null_string_p (&p1, pend, reg_info);
4763
4764 /* Save the position in the string where we were the last time
4765 we were at this open-group operator in case the group is
4766 operated upon by a repetition operator, e.g., with `(a*)*b'
4767 against `ab'; then we want to ignore where we are now in
4768 the string in case this attempt to match fails. */
4769 old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
4770 ? REG_UNSET (regstart[*p]) ? d : regstart[*p]
4771 : regstart[*p];
4772 DEBUG_PRINT2 (" old_regstart: %d\n",
4773 POINTER_TO_OFFSET (old_regstart[*p]));
4774
4775 regstart[*p] = d;
4776 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
4777
4778 IS_ACTIVE (reg_info[*p]) = 1;
4779 MATCHED_SOMETHING (reg_info[*p]) = 0;
4780
4781 /* Clear this whenever we change the register activity status. */
4782 set_regs_matched_done = 0;
4783
4784 /* This is the new highest active register. */
4785 highest_active_reg = *p;
4786
4787 /* If nothing was active before, this is the new lowest active
4788 register. */
4789 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
4790 lowest_active_reg = *p;
4791
4792 /* Move past the register number and inner group count. */
4793 p += 2;
4794 just_past_start_mem = p;
4795
4796 break;
4797
4798
4799 /* The stop_memory opcode represents the end of a group. Its
4800 arguments are the same as start_memory's: the register
4801 number, and the number of inner groups. */
4802 case stop_memory:
4803 DEBUG_PRINT3 ("EXECUTING stop_memory %d (%d):\n", *p, p[1]);
4804
4805 /* We need to save the string position the last time we were at
4806 this close-group operator in case the group is operated
4807 upon by a repetition operator, e.g., with `((a*)*(b*)*)*'
4808 against `aba'; then we want to ignore where we are now in
4809 the string in case this attempt to match fails. */
4810 old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
4811 ? REG_UNSET (regend[*p]) ? d : regend[*p]
4812 : regend[*p];
4813 DEBUG_PRINT2 (" old_regend: %d\n",
4814 POINTER_TO_OFFSET (old_regend[*p]));
4815
4816 regend[*p] = d;
4817 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
4818
4819 /* This register isn't active anymore. */
4820 IS_ACTIVE (reg_info[*p]) = 0;
4821
4822 /* Clear this whenever we change the register activity status. */
4823 set_regs_matched_done = 0;
4824
4825 /* If this was the only register active, nothing is active
4826 anymore. */
4827 if (lowest_active_reg == highest_active_reg)
4828 {
4829 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
4830 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
4831 }
4832 else
4833 { /* We must scan for the new highest active register, since
4834 it isn't necessarily one less than now: consider
4835 (a(b)c(d(e)f)g). When group 3 ends, after the f), the
4836 new highest active register is 1. */
4837 unsigned char r = *p - 1;
4838 while (r > 0 && !IS_ACTIVE (reg_info[r]))
4839 r--;
4840
4841 /* If we end up at register zero, that means that we saved
4842 the registers as the result of an `on_failure_jump', not
4843 a `start_memory', and we jumped to past the innermost
4844 `stop_memory'. For example, in ((.)*) we save
4845 registers 1 and 2 as a result of the *, but when we pop
4846 back to the second ), we are at the stop_memory 1.
4847 Thus, nothing is active. */
4848 if (r == 0)
4849 {
4850 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
4851 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
4852 }
4853 else
4854 highest_active_reg = r;
4855 }
4856
4857 /* If just failed to match something this time around with a
4858 group that's operated on by a repetition operator, try to
4859 force exit from the ``loop'', and restore the register
4860 information for this group that we had before trying this
4861 last match. */
4862 if ((!MATCHED_SOMETHING (reg_info[*p])
4863 || just_past_start_mem == p - 1)
4864 && (p + 2) < pend)
4865 {
4866 boolean is_a_jump_n = false;
4867
4868 p1 = p + 2;
4869 mcnt = 0;
4870 switch ((re_opcode_t) *p1++)
4871 {
4872 case jump_n:
4873 is_a_jump_n = true;
4874 case pop_failure_jump:
4875 case maybe_pop_jump:
4876 case jump:
4877 case dummy_failure_jump:
4878 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4879 if (is_a_jump_n)
4880 p1 += 2;
4881 break;
4882
4883 default:
4884 /* do nothing */ ;
4885 }
4886 p1 += mcnt;
4887
4888 /* If the next operation is a jump backwards in the pattern
4889 to an on_failure_jump right before the start_memory
4890 corresponding to this stop_memory, exit from the loop
4891 by forcing a failure after pushing on the stack the
4892 on_failure_jump's jump in the pattern, and d. */
4893 if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump
4894 && (re_opcode_t) p1[3] == start_memory && p1[4] == *p)
4895 {
4896 /* If this group ever matched anything, then restore
4897 what its registers were before trying this last
4898 failed match, e.g., with `(a*)*b' against `ab' for
4899 regstart[1], and, e.g., with `((a*)*(b*)*)*'
4900 against `aba' for regend[3].
4901
4902 Also restore the registers for inner groups for,
4903 e.g., `((a*)(b*))*' against `aba' (register 3 would
4904 otherwise get trashed). */
4905
4906 if (EVER_MATCHED_SOMETHING (reg_info[*p]))
4907 {
4908 unsigned r;
4909
4910 EVER_MATCHED_SOMETHING (reg_info[*p]) = 0;
4911
4912 /* Restore this and inner groups' (if any) registers. */
4913 for (r = *p; r < (unsigned) *p + (unsigned) *(p + 1);
4914 r++)
4915 {
4916 regstart[r] = old_regstart[r];
4917
4918 /* xx why this test? */
4919 if (old_regend[r] >= regstart[r])
4920 regend[r] = old_regend[r];
4921 }
4922 }
4923 p1++;
4924 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4925 PUSH_FAILURE_POINT (p1 + mcnt, d, -2);
4926
4927 goto fail;
4928 }
4929 }
4930
4931 /* Move past the register number and the inner group count. */
4932 p += 2;
4933 break;
4934
4935
4936 /* \<digit> has been turned into a `duplicate' command which is
4937 followed by the numeric value of <digit> as the register number. */
4938 case duplicate:
4939 {
4940 register const char *d2, *dend2;
4941 int regno = *p++; /* Get which register to match against. */
4942 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
4943
4944 /* Can't back reference a group which we've never matched. */
4945 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
4946 goto fail;
4947
4948 /* Where in input to try to start matching. */
4949 d2 = regstart[regno];
4950
4951 /* Where to stop matching; if both the place to start and
4952 the place to stop matching are in the same string, then
4953 set to the place to stop, otherwise, for now have to use
4954 the end of the first string. */
4955
4956 dend2 = ((FIRST_STRING_P (regstart[regno])
4957 == FIRST_STRING_P (regend[regno]))
4958 ? regend[regno] : end_match_1);
4959 for (;;)
4960 {
4961 /* If necessary, advance to next segment in register
4962 contents. */
4963 while (d2 == dend2)
4964 {
4965 if (dend2 == end_match_2) break;
4966 if (dend2 == regend[regno]) break;
4967
4968 /* End of string1 => advance to string2. */
4969 d2 = string2;
4970 dend2 = regend[regno];
4971 }
4972 /* At end of register contents => success */
4973 if (d2 == dend2) break;
4974
4975 /* If necessary, advance to next segment in data. */
4976 PREFETCH ();
4977
4978 /* How many characters left in this segment to match. */
4979 mcnt = dend - d;
4980
4981 /* Want how many consecutive characters we can match in
4982 one shot, so, if necessary, adjust the count. */
4983 if (mcnt > dend2 - d2)
4984 mcnt = dend2 - d2;
4985
4986 /* Compare that many; failure if mismatch, else move
4987 past them. */
4988 if (translate
4989 ? bcmp_translate (d, d2, mcnt, translate)
4990 : memcmp (d, d2, mcnt))
4991 goto fail;
4992 d += mcnt, d2 += mcnt;
4993
4994 /* Do this because we've match some characters. */
4995 SET_REGS_MATCHED ();
4996 }
4997 }
4998 break;
4999
5000
5001 /* begline matches the empty string at the beginning of the string
5002 (unless `not_bol' is set in `bufp'), and, if
5003 `newline_anchor' is set, after newlines. */
5004 case begline:
5005 DEBUG_PRINT1 ("EXECUTING begline.\n");
5006
5007 if (AT_STRINGS_BEG (d))
5008 {
5009 if (!bufp->not_bol) break;
5010 }
5011 else if (d[-1] == '\n' && bufp->newline_anchor)
5012 {
5013 break;
5014 }
5015 /* In all other cases, we fail. */
5016 goto fail;
5017
5018
5019 /* endline is the dual of begline. */
5020 case endline:
5021 DEBUG_PRINT1 ("EXECUTING endline.\n");
5022
5023 if (AT_STRINGS_END (d))
5024 {
5025 if (!bufp->not_eol) break;
5026 }
5027
5028 /* We have to ``prefetch'' the next character. */
5029 else if ((d == end1 ? *string2 : *d) == '\n'
5030 && bufp->newline_anchor)
5031 {
5032 break;
5033 }
5034 goto fail;
5035
5036
5037 /* Match at the very beginning of the data. */
5038 case begbuf:
5039 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
5040 if (AT_STRINGS_BEG (d))
5041 break;
5042 goto fail;
5043
5044
5045 /* Match at the very end of the data. */
5046 case endbuf:
5047 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
5048 if (AT_STRINGS_END (d))
5049 break;
5050 goto fail;
5051
5052
5053 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
5054 pushes NULL as the value for the string on the stack. Then
5055 `pop_failure_point' will keep the current value for the
5056 string, instead of restoring it. To see why, consider
5057 matching `foo\nbar' against `.*\n'. The .* matches the foo;
5058 then the . fails against the \n. But the next thing we want
5059 to do is match the \n against the \n; if we restored the
5060 string value, we would be back at the foo.
5061
5062 Because this is used only in specific cases, we don't need to
5063 check all the things that `on_failure_jump' does, to make
5064 sure the right things get saved on the stack. Hence we don't
5065 share its code. The only reason to push anything on the
5066 stack at all is that otherwise we would have to change
5067 `anychar's code to do something besides goto fail in this
5068 case; that seems worse than this. */
5069 case on_failure_keep_string_jump:
5070 DEBUG_PRINT1 ("EXECUTING on_failure_keep_string_jump");
5071
5072 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5073 #ifdef _LIBC
5074 DEBUG_PRINT3 (" %d (to %p):\n", mcnt, p + mcnt);
5075 #else
5076 DEBUG_PRINT3 (" %d (to 0x%x):\n", mcnt, p + mcnt);
5077 #endif
5078
5079 PUSH_FAILURE_POINT (p + mcnt, NULL, -2);
5080 break;
5081
5082
5083 /* Uses of on_failure_jump:
5084
5085 Each alternative starts with an on_failure_jump that points
5086 to the beginning of the next alternative. Each alternative
5087 except the last ends with a jump that in effect jumps past
5088 the rest of the alternatives. (They really jump to the
5089 ending jump of the following alternative, because tensioning
5090 these jumps is a hassle.)
5091
5092 Repeats start with an on_failure_jump that points past both
5093 the repetition text and either the following jump or
5094 pop_failure_jump back to this on_failure_jump. */
5095 case on_failure_jump:
5096 on_failure:
5097 DEBUG_PRINT1 ("EXECUTING on_failure_jump");
5098
5099 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5100 #ifdef _LIBC
5101 DEBUG_PRINT3 (" %d (to %p)", mcnt, p + mcnt);
5102 #else
5103 DEBUG_PRINT3 (" %d (to 0x%x)", mcnt, p + mcnt);
5104 #endif
5105
5106 /* If this on_failure_jump comes right before a group (i.e.,
5107 the original * applied to a group), save the information
5108 for that group and all inner ones, so that if we fail back
5109 to this point, the group's information will be correct.
5110 For example, in \(a*\)*\1, we need the preceding group,
5111 and in \(zz\(a*\)b*\)\2, we need the inner group. */
5112
5113 /* We can't use `p' to check ahead because we push
5114 a failure point to `p + mcnt' after we do this. */
5115 p1 = p;
5116
5117 /* We need to skip no_op's before we look for the
5118 start_memory in case this on_failure_jump is happening as
5119 the result of a completed succeed_n, as in \(a\)\{1,3\}b\1
5120 against aba. */
5121 while (p1 < pend && (re_opcode_t) *p1 == no_op)
5122 p1++;
5123
5124 if (p1 < pend && (re_opcode_t) *p1 == start_memory)
5125 {
5126 /* We have a new highest active register now. This will
5127 get reset at the start_memory we are about to get to,
5128 but we will have saved all the registers relevant to
5129 this repetition op, as described above. */
5130 highest_active_reg = *(p1 + 1) + *(p1 + 2);
5131 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
5132 lowest_active_reg = *(p1 + 1);
5133 }
5134
5135 DEBUG_PRINT1 (":\n");
5136 PUSH_FAILURE_POINT (p + mcnt, d, -2);
5137 break;
5138
5139
5140 /* A smart repeat ends with `maybe_pop_jump'.
5141 We change it to either `pop_failure_jump' or `jump'. */
5142 case maybe_pop_jump:
5143 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5144 DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt);
5145 {
5146 register unsigned char *p2 = p;
5147
5148 /* Compare the beginning of the repeat with what in the
5149 pattern follows its end. If we can establish that there
5150 is nothing that they would both match, i.e., that we
5151 would have to backtrack because of (as in, e.g., `a*a')
5152 then we can change to pop_failure_jump, because we'll
5153 never have to backtrack.
5154
5155 This is not true in the case of alternatives: in
5156 `(a|ab)*' we do need to backtrack to the `ab' alternative
5157 (e.g., if the string was `ab'). But instead of trying to
5158 detect that here, the alternative has put on a dummy
5159 failure point which is what we will end up popping. */
5160
5161 /* Skip over open/close-group commands.
5162 If what follows this loop is a ...+ construct,
5163 look at what begins its body, since we will have to
5164 match at least one of that. */
5165 while (1)
5166 {
5167 if (p2 + 2 < pend
5168 && ((re_opcode_t) *p2 == stop_memory
5169 || (re_opcode_t) *p2 == start_memory))
5170 p2 += 3;
5171 else if (p2 + 6 < pend
5172 && (re_opcode_t) *p2 == dummy_failure_jump)
5173 p2 += 6;
5174 else
5175 break;
5176 }
5177
5178 p1 = p + mcnt;
5179 /* p1[0] ... p1[2] are the `on_failure_jump' corresponding
5180 to the `maybe_finalize_jump' of this case. Examine what
5181 follows. */
5182
5183 /* If we're at the end of the pattern, we can change. */
5184 if (p2 == pend)
5185 {
5186 /* Consider what happens when matching ":\(.*\)"
5187 against ":/". I don't really understand this code
5188 yet. */
5189 p[-3] = (unsigned char) pop_failure_jump;
5190 DEBUG_PRINT1
5191 (" End of pattern: change to `pop_failure_jump'.\n");
5192 }
5193
5194 else if ((re_opcode_t) *p2 == exactn
5195 || (bufp->newline_anchor && (re_opcode_t) *p2 == endline))
5196 {
5197 register unsigned char c
5198 = *p2 == (unsigned char) endline ? '\n' : p2[2];
5199
5200 if ((re_opcode_t) p1[3] == exactn && p1[5] != c)
5201 {
5202 p[-3] = (unsigned char) pop_failure_jump;
5203 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
5204 c, p1[5]);
5205 }
5206
5207 else if ((re_opcode_t) p1[3] == charset
5208 || (re_opcode_t) p1[3] == charset_not)
5209 {
5210 int not = (re_opcode_t) p1[3] == charset_not;
5211
5212 if (c < (unsigned char) (p1[4] * BYTEWIDTH)
5213 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
5214 not = !not;
5215
5216 /* `not' is equal to 1 if c would match, which means
5217 that we can't change to pop_failure_jump. */
5218 if (!not)
5219 {
5220 p[-3] = (unsigned char) pop_failure_jump;
5221 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
5222 }
5223 }
5224 }
5225 else if ((re_opcode_t) *p2 == charset)
5226 {
5227 /* We win if the first character of the loop is not part
5228 of the charset. */
5229 if ((re_opcode_t) p1[3] == exactn
5230 && ! ((int) p2[1] * BYTEWIDTH > (int) p1[5]
5231 && (p2[2 + p1[5] / BYTEWIDTH]
5232 & (1 << (p1[5] % BYTEWIDTH)))))
5233 {
5234 p[-3] = (unsigned char) pop_failure_jump;
5235 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
5236 }
5237
5238 else if ((re_opcode_t) p1[3] == charset_not)
5239 {
5240 int idx;
5241 /* We win if the charset_not inside the loop
5242 lists every character listed in the charset after. */
5243 for (idx = 0; idx < (int) p2[1]; idx++)
5244 if (! (p2[2 + idx] == 0
5245 || (idx < (int) p1[4]
5246 && ((p2[2 + idx] & ~ p1[5 + idx]) == 0))))
5247 break;
5248
5249 if (idx == p2[1])
5250 {
5251 p[-3] = (unsigned char) pop_failure_jump;
5252 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
5253 }
5254 }
5255 else if ((re_opcode_t) p1[3] == charset)
5256 {
5257 int idx;
5258 /* We win if the charset inside the loop
5259 has no overlap with the one after the loop. */
5260 for (idx = 0;
5261 idx < (int) p2[1] && idx < (int) p1[4];
5262 idx++)
5263 if ((p2[2 + idx] & p1[5 + idx]) != 0)
5264 break;
5265
5266 if (idx == p2[1] || idx == p1[4])
5267 {
5268 p[-3] = (unsigned char) pop_failure_jump;
5269 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
5270 }
5271 }
5272 }
5273 }
5274 p -= 2; /* Point at relative address again. */
5275 if ((re_opcode_t) p[-1] != pop_failure_jump)
5276 {
5277 p[-1] = (unsigned char) jump;
5278 DEBUG_PRINT1 (" Match => jump.\n");
5279 goto unconditional_jump;
5280 }
5281 /* Note fall through. */
5282
5283
5284 /* The end of a simple repeat has a pop_failure_jump back to
5285 its matching on_failure_jump, where the latter will push a
5286 failure point. The pop_failure_jump takes off failure
5287 points put on by this pop_failure_jump's matching
5288 on_failure_jump; we got through the pattern to here from the
5289 matching on_failure_jump, so didn't fail. */
5290 case pop_failure_jump:
5291 {
5292 /* We need to pass separate storage for the lowest and
5293 highest registers, even though we don't care about the
5294 actual values. Otherwise, we will restore only one
5295 register from the stack, since lowest will == highest in
5296 `pop_failure_point'. */
5297 active_reg_t dummy_low_reg, dummy_high_reg;
5298 unsigned char *pdummy;
5299 const char *sdummy;
5300
5301 DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n");
5302 POP_FAILURE_POINT (sdummy, pdummy,
5303 dummy_low_reg, dummy_high_reg,
5304 reg_dummy, reg_dummy, reg_info_dummy);
5305 }
5306 /* Note fall through. */
5307
5308 unconditional_jump:
5309 #ifdef _LIBC
5310 DEBUG_PRINT2 ("\n%p: ", p);
5311 #else
5312 DEBUG_PRINT2 ("\n0x%x: ", p);
5313 #endif
5314 /* Note fall through. */
5315
5316 /* Unconditionally jump (without popping any failure points). */
5317 case jump:
5318 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
5319 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
5320 p += mcnt; /* Do the jump. */
5321 #ifdef _LIBC
5322 DEBUG_PRINT2 ("(to %p).\n", p);
5323 #else
5324 DEBUG_PRINT2 ("(to 0x%x).\n", p);
5325 #endif
5326 break;
5327
5328
5329 /* We need this opcode so we can detect where alternatives end
5330 in `group_match_null_string_p' et al. */
5331 case jump_past_alt:
5332 DEBUG_PRINT1 ("EXECUTING jump_past_alt.\n");
5333 goto unconditional_jump;
5334
5335
5336 /* Normally, the on_failure_jump pushes a failure point, which
5337 then gets popped at pop_failure_jump. We will end up at
5338 pop_failure_jump, also, and with a pattern of, say, `a+', we
5339 are skipping over the on_failure_jump, so we have to push
5340 something meaningless for pop_failure_jump to pop. */
5341 case dummy_failure_jump:
5342 DEBUG_PRINT1 ("EXECUTING dummy_failure_jump.\n");
5343 /* It doesn't matter what we push for the string here. What
5344 the code at `fail' tests is the value for the pattern. */
5345 PUSH_FAILURE_POINT (NULL, NULL, -2);
5346 goto unconditional_jump;
5347
5348
5349 /* At the end of an alternative, we need to push a dummy failure
5350 point in case we are followed by a `pop_failure_jump', because
5351 we don't want the failure point for the alternative to be
5352 popped. For example, matching `(a|ab)*' against `aab'
5353 requires that we match the `ab' alternative. */
5354 case push_dummy_failure:
5355 DEBUG_PRINT1 ("EXECUTING push_dummy_failure.\n");
5356 /* See comments just above at `dummy_failure_jump' about the
5357 two zeroes. */
5358 PUSH_FAILURE_POINT (NULL, NULL, -2);
5359 break;
5360
5361 /* Have to succeed matching what follows at least n times.
5362 After that, handle like `on_failure_jump'. */
5363 case succeed_n:
5364 EXTRACT_NUMBER (mcnt, p + 2);
5365 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
5366
5367 assert (mcnt >= 0);
5368 /* Originally, this is how many times we HAVE to succeed. */
5369 if (mcnt > 0)
5370 {
5371 mcnt--;
5372 p += 2;
5373 STORE_NUMBER_AND_INCR (p, mcnt);
5374 #ifdef _LIBC
5375 DEBUG_PRINT3 (" Setting %p to %d.\n", p - 2, mcnt);
5376 #else
5377 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p - 2, mcnt);
5378 #endif
5379 }
5380 else if (mcnt == 0)
5381 {
5382 #ifdef _LIBC
5383 DEBUG_PRINT2 (" Setting two bytes from %p to no_op.\n", p+2);
5384 #else
5385 DEBUG_PRINT2 (" Setting two bytes from 0x%x to no_op.\n", p+2);
5386 #endif
5387 p[2] = (unsigned char) no_op;
5388 p[3] = (unsigned char) no_op;
5389 goto on_failure;
5390 }
5391 break;
5392
5393 case jump_n:
5394 EXTRACT_NUMBER (mcnt, p + 2);
5395 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
5396
5397 /* Originally, this is how many times we CAN jump. */
5398 if (mcnt)
5399 {
5400 mcnt--;
5401 STORE_NUMBER (p + 2, mcnt);
5402 #ifdef _LIBC
5403 DEBUG_PRINT3 (" Setting %p to %d.\n", p + 2, mcnt);
5404 #else
5405 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p + 2, mcnt);
5406 #endif
5407 goto unconditional_jump;
5408 }
5409 /* If don't have to jump any more, skip over the rest of command. */
5410 else
5411 p += 4;
5412 break;
5413
5414 case set_number_at:
5415 {
5416 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
5417
5418 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5419 p1 = p + mcnt;
5420 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5421 #ifdef _LIBC
5422 DEBUG_PRINT3 (" Setting %p to %d.\n", p1, mcnt);
5423 #else
5424 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p1, mcnt);
5425 #endif
5426 STORE_NUMBER (p1, mcnt);
5427 break;
5428 }
5429
5430 #if 0
5431 /* The DEC Alpha C compiler 3.x generates incorrect code for the
5432 test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of
5433 AT_WORD_BOUNDARY, so this code is disabled. Expanding the
5434 macro and introducing temporary variables works around the bug. */
5435
5436 case wordbound:
5437 DEBUG_PRINT1 ("EXECUTING wordbound.\n");
5438 if (AT_WORD_BOUNDARY (d))
5439 break;
5440 goto fail;
5441
5442 case notwordbound:
5443 DEBUG_PRINT1 ("EXECUTING notwordbound.\n");
5444 if (AT_WORD_BOUNDARY (d))
5445 goto fail;
5446 break;
5447 #else
5448 case wordbound:
5449 {
5450 boolean prevchar, thischar;
5451
5452 DEBUG_PRINT1 ("EXECUTING wordbound.\n");
5453 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
5454 break;
5455
5456 prevchar = WORDCHAR_P (d - 1);
5457 thischar = WORDCHAR_P (d);
5458 if (prevchar != thischar)
5459 break;
5460 goto fail;
5461 }
5462
5463 case notwordbound:
5464 {
5465 boolean prevchar, thischar;
5466
5467 DEBUG_PRINT1 ("EXECUTING notwordbound.\n");
5468 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
5469 goto fail;
5470
5471 prevchar = WORDCHAR_P (d - 1);
5472 thischar = WORDCHAR_P (d);
5473 if (prevchar != thischar)
5474 goto fail;
5475 break;
5476 }
5477 #endif
5478
5479 case wordbeg:
5480 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
5481 if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1)))
5482 break;
5483 goto fail;
5484
5485 case wordend:
5486 DEBUG_PRINT1 ("EXECUTING wordend.\n");
5487 if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1)
5488 && (!WORDCHAR_P (d) || AT_STRINGS_END (d)))
5489 break;
5490 goto fail;
5491
5492 #ifdef emacs
5493 case before_dot:
5494 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
5495 if (PTR_CHAR_POS ((unsigned char *) d) >= point)
5496 goto fail;
5497 break;
5498
5499 case at_dot:
5500 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
5501 if (PTR_CHAR_POS ((unsigned char *) d) != point)
5502 goto fail;
5503 break;
5504
5505 case after_dot:
5506 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
5507 if (PTR_CHAR_POS ((unsigned char *) d) <= point)
5508 goto fail;
5509 break;
5510
5511 case syntaxspec:
5512 DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt);
5513 mcnt = *p++;
5514 goto matchsyntax;
5515
5516 case wordchar:
5517 DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n");
5518 mcnt = (int) Sword;
5519 matchsyntax:
5520 PREFETCH ();
5521 /* Can't use *d++ here; SYNTAX may be an unsafe macro. */
5522 d++;
5523 if (SYNTAX (d[-1]) != (enum syntaxcode) mcnt)
5524 goto fail;
5525 SET_REGS_MATCHED ();
5526 break;
5527
5528 case notsyntaxspec:
5529 DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt);
5530 mcnt = *p++;
5531 goto matchnotsyntax;
5532
5533 case notwordchar:
5534 DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n");
5535 mcnt = (int) Sword;
5536 matchnotsyntax:
5537 PREFETCH ();
5538 /* Can't use *d++ here; SYNTAX may be an unsafe macro. */
5539 d++;
5540 if (SYNTAX (d[-1]) == (enum syntaxcode) mcnt)
5541 goto fail;
5542 SET_REGS_MATCHED ();
5543 break;
5544
5545 #else /* not emacs */
5546 case wordchar:
5547 DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n");
5548 PREFETCH ();
5549 if (!WORDCHAR_P (d))
5550 goto fail;
5551 SET_REGS_MATCHED ();
5552 d++;
5553 break;
5554
5555 case notwordchar:
5556 DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n");
5557 PREFETCH ();
5558 if (WORDCHAR_P (d))
5559 goto fail;
5560 SET_REGS_MATCHED ();
5561 d++;
5562 break;
5563 #endif /* not emacs */
5564
5565 default:
5566 abort ();
5567 }
5568 continue; /* Successfully executed one pattern command; keep going. */
5569
5570
5571 /* We goto here if a matching operation fails. */
5572 fail:
5573 if (!FAIL_STACK_EMPTY ())
5574 { /* A restart point is known. Restore to that state. */
5575 DEBUG_PRINT1 ("\nFAIL:\n");
5576 POP_FAILURE_POINT (d, p,
5577 lowest_active_reg, highest_active_reg,
5578 regstart, regend, reg_info);
5579
5580 /* If this failure point is a dummy, try the next one. */
5581 if (!p)
5582 goto fail;
5583
5584 /* If we failed to the end of the pattern, don't examine *p. */
5585 assert (p <= pend);
5586 if (p < pend)
5587 {
5588 boolean is_a_jump_n = false;
5589
5590 /* If failed to a backwards jump that's part of a repetition
5591 loop, need to pop this failure point and use the next one. */
5592 switch ((re_opcode_t) *p)
5593 {
5594 case jump_n:
5595 is_a_jump_n = true;
5596 case maybe_pop_jump:
5597 case pop_failure_jump:
5598 case jump:
5599 p1 = p + 1;
5600 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5601 p1 += mcnt;
5602
5603 if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n)
5604 || (!is_a_jump_n
5605 && (re_opcode_t) *p1 == on_failure_jump))
5606 goto fail;
5607 break;
5608 default:
5609 /* do nothing */ ;
5610 }
5611 }
5612
5613 if (d >= string1 && d <= end1)
5614 dend = end_match_1;
5615 }
5616 else
5617 break; /* Matching at this starting point really fails. */
5618 } /* for (;;) */
5619
5620 if (best_regs_set)
5621 goto restore_best_regs;
5622
5623 FREE_VARIABLES ();
5624
5625 return -1; /* Failure to match. */
5626 } /* re_match_2 */
5627 \f
5628 /* Subroutine definitions for re_match_2. */
5629
5630
5631 /* We are passed P pointing to a register number after a start_memory.
5632
5633 Return true if the pattern up to the corresponding stop_memory can
5634 match the empty string, and false otherwise.
5635
5636 If we find the matching stop_memory, sets P to point to one past its number.
5637 Otherwise, sets P to an undefined byte less than or equal to END.
5638
5639 We don't handle duplicates properly (yet). */
5640
5641 static boolean
5642 group_match_null_string_p (p, end, reg_info)
5643 unsigned char **p, *end;
5644 register_info_type *reg_info;
5645 {
5646 int mcnt;
5647 /* Point to after the args to the start_memory. */
5648 unsigned char *p1 = *p + 2;
5649
5650 while (p1 < end)
5651 {
5652 /* Skip over opcodes that can match nothing, and return true or
5653 false, as appropriate, when we get to one that can't, or to the
5654 matching stop_memory. */
5655
5656 switch ((re_opcode_t) *p1)
5657 {
5658 /* Could be either a loop or a series of alternatives. */
5659 case on_failure_jump:
5660 p1++;
5661 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5662
5663 /* If the next operation is not a jump backwards in the
5664 pattern. */
5665
5666 if (mcnt >= 0)
5667 {
5668 /* Go through the on_failure_jumps of the alternatives,
5669 seeing if any of the alternatives cannot match nothing.
5670 The last alternative starts with only a jump,
5671 whereas the rest start with on_failure_jump and end
5672 with a jump, e.g., here is the pattern for `a|b|c':
5673
5674 /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6
5675 /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3
5676 /exactn/1/c
5677
5678 So, we have to first go through the first (n-1)
5679 alternatives and then deal with the last one separately. */
5680
5681
5682 /* Deal with the first (n-1) alternatives, which start
5683 with an on_failure_jump (see above) that jumps to right
5684 past a jump_past_alt. */
5685
5686 while ((re_opcode_t) p1[mcnt-3] == jump_past_alt)
5687 {
5688 /* `mcnt' holds how many bytes long the alternative
5689 is, including the ending `jump_past_alt' and
5690 its number. */
5691
5692 if (!alt_match_null_string_p (p1, p1 + mcnt - 3,
5693 reg_info))
5694 return false;
5695
5696 /* Move to right after this alternative, including the
5697 jump_past_alt. */
5698 p1 += mcnt;
5699
5700 /* Break if it's the beginning of an n-th alternative
5701 that doesn't begin with an on_failure_jump. */
5702 if ((re_opcode_t) *p1 != on_failure_jump)
5703 break;
5704
5705 /* Still have to check that it's not an n-th
5706 alternative that starts with an on_failure_jump. */
5707 p1++;
5708 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5709 if ((re_opcode_t) p1[mcnt-3] != jump_past_alt)
5710 {
5711 /* Get to the beginning of the n-th alternative. */
5712 p1 -= 3;
5713 break;
5714 }
5715 }
5716
5717 /* Deal with the last alternative: go back and get number
5718 of the `jump_past_alt' just before it. `mcnt' contains
5719 the length of the alternative. */
5720 EXTRACT_NUMBER (mcnt, p1 - 2);
5721
5722 if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info))
5723 return false;
5724
5725 p1 += mcnt; /* Get past the n-th alternative. */
5726 } /* if mcnt > 0 */
5727 break;
5728
5729
5730 case stop_memory:
5731 assert (p1[1] == **p);
5732 *p = p1 + 2;
5733 return true;
5734
5735
5736 default:
5737 if (!common_op_match_null_string_p (&p1, end, reg_info))
5738 return false;
5739 }
5740 } /* while p1 < end */
5741
5742 return false;
5743 } /* group_match_null_string_p */
5744
5745
5746 /* Similar to group_match_null_string_p, but doesn't deal with alternatives:
5747 It expects P to be the first byte of a single alternative and END one
5748 byte past the last. The alternative can contain groups. */
5749
5750 static boolean
5751 alt_match_null_string_p (p, end, reg_info)
5752 unsigned char *p, *end;
5753 register_info_type *reg_info;
5754 {
5755 int mcnt;
5756 unsigned char *p1 = p;
5757
5758 while (p1 < end)
5759 {
5760 /* Skip over opcodes that can match nothing, and break when we get
5761 to one that can't. */
5762
5763 switch ((re_opcode_t) *p1)
5764 {
5765 /* It's a loop. */
5766 case on_failure_jump:
5767 p1++;
5768 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5769 p1 += mcnt;
5770 break;
5771
5772 default:
5773 if (!common_op_match_null_string_p (&p1, end, reg_info))
5774 return false;
5775 }
5776 } /* while p1 < end */
5777
5778 return true;
5779 } /* alt_match_null_string_p */
5780
5781
5782 /* Deals with the ops common to group_match_null_string_p and
5783 alt_match_null_string_p.
5784
5785 Sets P to one after the op and its arguments, if any. */
5786
5787 static boolean
5788 common_op_match_null_string_p (p, end, reg_info)
5789 unsigned char **p, *end;
5790 register_info_type *reg_info;
5791 {
5792 int mcnt;
5793 boolean ret;
5794 int reg_no;
5795 unsigned char *p1 = *p;
5796
5797 switch ((re_opcode_t) *p1++)
5798 {
5799 case no_op:
5800 case begline:
5801 case endline:
5802 case begbuf:
5803 case endbuf:
5804 case wordbeg:
5805 case wordend:
5806 case wordbound:
5807 case notwordbound:
5808 #ifdef emacs
5809 case before_dot:
5810 case at_dot:
5811 case after_dot:
5812 #endif
5813 break;
5814
5815 case start_memory:
5816 reg_no = *p1;
5817 assert (reg_no > 0 && reg_no <= MAX_REGNUM);
5818 ret = group_match_null_string_p (&p1, end, reg_info);
5819
5820 /* Have to set this here in case we're checking a group which
5821 contains a group and a back reference to it. */
5822
5823 if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE)
5824 REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret;
5825
5826 if (!ret)
5827 return false;
5828 break;
5829
5830 /* If this is an optimized succeed_n for zero times, make the jump. */
5831 case jump:
5832 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5833 if (mcnt >= 0)
5834 p1 += mcnt;
5835 else
5836 return false;
5837 break;
5838
5839 case succeed_n:
5840 /* Get to the number of times to succeed. */
5841 p1 += 2;
5842 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5843
5844 if (mcnt == 0)
5845 {
5846 p1 -= 4;
5847 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5848 p1 += mcnt;
5849 }
5850 else
5851 return false;
5852 break;
5853
5854 case duplicate:
5855 if (!REG_MATCH_NULL_STRING_P (reg_info[*p1]))
5856 return false;
5857 break;
5858
5859 case set_number_at:
5860 p1 += 4;
5861
5862 default:
5863 /* All other opcodes mean we cannot match the empty string. */
5864 return false;
5865 }
5866
5867 *p = p1;
5868 return true;
5869 } /* common_op_match_null_string_p */
5870
5871
5872 /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
5873 bytes; nonzero otherwise. */
5874
5875 static int
5876 bcmp_translate (s1, s2, len, translate)
5877 const char *s1, *s2;
5878 register int len;
5879 RE_TRANSLATE_TYPE translate;
5880 {
5881 register const unsigned char *p1 = (const unsigned char *) s1;
5882 register const unsigned char *p2 = (const unsigned char *) s2;
5883 while (len)
5884 {
5885 if (translate[*p1++] != translate[*p2++]) return 1;
5886 len--;
5887 }
5888 return 0;
5889 }
5890 \f
5891 /* Entry points for GNU code. */
5892
5893 /* re_compile_pattern is the GNU regular expression compiler: it
5894 compiles PATTERN (of length SIZE) and puts the result in BUFP.
5895 Returns 0 if the pattern was valid, otherwise an error string.
5896
5897 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
5898 are set in BUFP on entry.
5899
5900 We call regex_compile to do the actual compilation. */
5901
5902 const char *
5903 re_compile_pattern (pattern, length, bufp)
5904 const char *pattern;
5905 size_t length;
5906 struct re_pattern_buffer *bufp;
5907 {
5908 reg_errcode_t ret;
5909
5910 /* GNU code is written to assume at least RE_NREGS registers will be set
5911 (and at least one extra will be -1). */
5912 bufp->regs_allocated = REGS_UNALLOCATED;
5913
5914 /* And GNU code determines whether or not to get register information
5915 by passing null for the REGS argument to re_match, etc., not by
5916 setting no_sub. */
5917 bufp->no_sub = 0;
5918
5919 /* Match anchors at newline. */
5920 bufp->newline_anchor = 1;
5921
5922 ret = regex_compile (pattern, length, re_syntax_options, bufp);
5923
5924 if (!ret)
5925 return NULL;
5926 return gettext (re_error_msgid + re_error_msgid_idx[(int) ret]);
5927 }
5928 #ifdef _LIBC
5929 weak_alias (__re_compile_pattern, re_compile_pattern)
5930 #endif
5931 \f
5932 /* Entry points compatible with 4.2 BSD regex library. We don't define
5933 them unless specifically requested. */
5934
5935 #if defined _REGEX_RE_COMP || defined _LIBC
5936
5937 /* BSD has one and only one pattern buffer. */
5938 static struct re_pattern_buffer re_comp_buf;
5939
5940 char *
5941 #ifdef _LIBC
5942 /* Make these definitions weak in libc, so POSIX programs can redefine
5943 these names if they don't use our functions, and still use
5944 regcomp/regexec below without link errors. */
5945 weak_function
5946 #endif
5947 re_comp (s)
5948 const char *s;
5949 {
5950 reg_errcode_t ret;
5951
5952 if (!s)
5953 {
5954 if (!re_comp_buf.buffer)
5955 return gettext ("No previous regular expression");
5956 return 0;
5957 }
5958
5959 if (!re_comp_buf.buffer)
5960 {
5961 re_comp_buf.buffer = (unsigned char *) malloc (200);
5962 if (re_comp_buf.buffer == NULL)
5963 return (char *) gettext (re_error_msgid
5964 + re_error_msgid_idx[(int) REG_ESPACE]);
5965 re_comp_buf.allocated = 200;
5966
5967 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
5968 if (re_comp_buf.fastmap == NULL)
5969 return (char *) gettext (re_error_msgid
5970 + re_error_msgid_idx[(int) REG_ESPACE]);
5971 }
5972
5973 /* Since `re_exec' always passes NULL for the `regs' argument, we
5974 don't need to initialize the pattern buffer fields which affect it. */
5975
5976 /* Match anchors at newlines. */
5977 re_comp_buf.newline_anchor = 1;
5978
5979 ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
5980
5981 if (!ret)
5982 return NULL;
5983
5984 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
5985 return (char *) gettext (re_error_msgid + re_error_msgid_idx[(int) ret]);
5986 }
5987
5988
5989 int
5990 #ifdef _LIBC
5991 weak_function
5992 #endif
5993 re_exec (s)
5994 const char *s;
5995 {
5996 const int len = strlen (s);
5997 return
5998 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
5999 }
6000
6001 #endif /* _REGEX_RE_COMP */
6002 \f
6003 /* POSIX.2 functions. Don't define these for Emacs. */
6004
6005 #ifndef emacs
6006
6007 /* regcomp takes a regular expression as a string and compiles it.
6008
6009 PREG is a regex_t *. We do not expect any fields to be initialized,
6010 since POSIX says we shouldn't. Thus, we set
6011
6012 `buffer' to the compiled pattern;
6013 `used' to the length of the compiled pattern;
6014 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
6015 REG_EXTENDED bit in CFLAGS is set; otherwise, to
6016 RE_SYNTAX_POSIX_BASIC;
6017 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
6018 `fastmap' to an allocated space for the fastmap;
6019 `fastmap_accurate' to zero;
6020 `re_nsub' to the number of subexpressions in PATTERN.
6021
6022 PATTERN is the address of the pattern string.
6023
6024 CFLAGS is a series of bits which affect compilation.
6025
6026 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
6027 use POSIX basic syntax.
6028
6029 If REG_NEWLINE is set, then . and [^...] don't match newline.
6030 Also, regexec will try a match beginning after every newline.
6031
6032 If REG_ICASE is set, then we considers upper- and lowercase
6033 versions of letters to be equivalent when matching.
6034
6035 If REG_NOSUB is set, then when PREG is passed to regexec, that
6036 routine will report only success or failure, and nothing about the
6037 registers.
6038
6039 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
6040 the return codes and their meanings.) */
6041
6042 int
6043 regcomp (preg, pattern, cflags)
6044 regex_t *preg;
6045 const char *pattern;
6046 int cflags;
6047 {
6048 reg_errcode_t ret;
6049 reg_syntax_t syntax
6050 = (cflags & REG_EXTENDED) ?
6051 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
6052
6053 /* regex_compile will allocate the space for the compiled pattern. */
6054 preg->buffer = 0;
6055 preg->allocated = 0;
6056 preg->used = 0;
6057
6058 /* Try to allocate space for the fastmap. */
6059 preg->fastmap = (char *) malloc (1 << BYTEWIDTH);
6060
6061 if (cflags & REG_ICASE)
6062 {
6063 unsigned i;
6064
6065 preg->translate
6066 = (RE_TRANSLATE_TYPE) malloc (CHAR_SET_SIZE
6067 * sizeof (*(RE_TRANSLATE_TYPE)0));
6068 if (preg->translate == NULL)
6069 return (int) REG_ESPACE;
6070
6071 /* Map uppercase characters to corresponding lowercase ones. */
6072 for (i = 0; i < CHAR_SET_SIZE; i++)
6073 preg->translate[i] = ISUPPER (i) ? TOLOWER (i) : i;
6074 }
6075 else
6076 preg->translate = NULL;
6077
6078 /* If REG_NEWLINE is set, newlines are treated differently. */
6079 if (cflags & REG_NEWLINE)
6080 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
6081 syntax &= ~RE_DOT_NEWLINE;
6082 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
6083 /* It also changes the matching behavior. */
6084 preg->newline_anchor = 1;
6085 }
6086 else
6087 preg->newline_anchor = 0;
6088
6089 preg->no_sub = !!(cflags & REG_NOSUB);
6090
6091 /* POSIX says a null character in the pattern terminates it, so we
6092 can use strlen here in compiling the pattern. */
6093 ret = regex_compile (pattern, strlen (pattern), syntax, preg);
6094
6095 /* POSIX doesn't distinguish between an unmatched open-group and an
6096 unmatched close-group: both are REG_EPAREN. */
6097 if (ret == REG_ERPAREN) ret = REG_EPAREN;
6098
6099 if (ret == REG_NOERROR && preg->fastmap)
6100 {
6101 /* Compute the fastmap now, since regexec cannot modify the pattern
6102 buffer. */
6103 if (re_compile_fastmap (preg) == -2)
6104 {
6105 /* Some error occurred while computing the fastmap, just forget
6106 about it. */
6107 free (preg->fastmap);
6108 preg->fastmap = NULL;
6109 }
6110 }
6111
6112 return (int) ret;
6113 }
6114 #ifdef _LIBC
6115 weak_alias (__regcomp, regcomp)
6116 #endif
6117
6118
6119 /* regexec searches for a given pattern, specified by PREG, in the
6120 string STRING.
6121
6122 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
6123 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
6124 least NMATCH elements, and we set them to the offsets of the
6125 corresponding matched substrings.
6126
6127 EFLAGS specifies `execution flags' which affect matching: if
6128 REG_NOTBOL is set, then ^ does not match at the beginning of the
6129 string; if REG_NOTEOL is set, then $ does not match at the end.
6130
6131 We return 0 if we find a match and REG_NOMATCH if not. */
6132
6133 int
6134 regexec (preg, string, nmatch, pmatch, eflags)
6135 const regex_t *preg;
6136 const char *string;
6137 size_t nmatch;
6138 regmatch_t pmatch[];
6139 int eflags;
6140 {
6141 int ret;
6142 struct re_registers regs;
6143 regex_t private_preg;
6144 int len = strlen (string);
6145 boolean want_reg_info = !preg->no_sub && nmatch > 0;
6146
6147 private_preg = *preg;
6148
6149 private_preg.not_bol = !!(eflags & REG_NOTBOL);
6150 private_preg.not_eol = !!(eflags & REG_NOTEOL);
6151
6152 /* The user has told us exactly how many registers to return
6153 information about, via `nmatch'. We have to pass that on to the
6154 matching routines. */
6155 private_preg.regs_allocated = REGS_FIXED;
6156
6157 if (want_reg_info)
6158 {
6159 regs.num_regs = nmatch;
6160 regs.start = TALLOC (nmatch * 2, regoff_t);
6161 if (regs.start == NULL)
6162 return (int) REG_NOMATCH;
6163 regs.end = regs.start + nmatch;
6164 }
6165
6166 /* Perform the searching operation. */
6167 ret = re_search (&private_preg, string, len,
6168 /* start: */ 0, /* range: */ len,
6169 want_reg_info ? &regs : (struct re_registers *) 0);
6170
6171 /* Copy the register information to the POSIX structure. */
6172 if (want_reg_info)
6173 {
6174 if (ret >= 0)
6175 {
6176 unsigned r;
6177
6178 for (r = 0; r < nmatch; r++)
6179 {
6180 pmatch[r].rm_so = regs.start[r];
6181 pmatch[r].rm_eo = regs.end[r];
6182 }
6183 }
6184
6185 /* If we needed the temporary register info, free the space now. */
6186 free (regs.start);
6187 }
6188
6189 /* We want zero return to mean success, unlike `re_search'. */
6190 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
6191 }
6192 #ifdef _LIBC
6193 weak_alias (__regexec, regexec)
6194 #endif
6195
6196
6197 /* Returns a message corresponding to an error code, ERRCODE, returned
6198 from either regcomp or regexec. We don't use PREG here. */
6199
6200 size_t
6201 regerror (errcode, preg, errbuf, errbuf_size)
6202 int errcode;
6203 const regex_t *preg;
6204 char *errbuf;
6205 size_t errbuf_size;
6206 {
6207 const char *msg;
6208 size_t msg_size;
6209
6210 if (errcode < 0
6211 || errcode >= (int) (sizeof (re_error_msgid_idx)
6212 / sizeof (re_error_msgid_idx[0])))
6213 /* Only error codes returned by the rest of the code should be passed
6214 to this routine. If we are given anything else, or if other regex
6215 code generates an invalid error code, then the program has a bug.
6216 Dump core so we can fix it. */
6217 abort ();
6218
6219 msg = gettext (re_error_msgid + re_error_msgid_idx[errcode]);
6220
6221 msg_size = strlen (msg) + 1; /* Includes the null. */
6222
6223 if (errbuf_size != 0)
6224 {
6225 if (msg_size > errbuf_size)
6226 {
6227 #if defined HAVE_MEMPCPY || defined _LIBC
6228 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
6229 #else
6230 memcpy (errbuf, msg, errbuf_size - 1);
6231 errbuf[errbuf_size - 1] = 0;
6232 #endif
6233 }
6234 else
6235 memcpy (errbuf, msg, msg_size);
6236 }
6237
6238 return msg_size;
6239 }
6240 #ifdef _LIBC
6241 weak_alias (__regerror, regerror)
6242 #endif
6243
6244
6245 /* Free dynamically allocated space used by PREG. */
6246
6247 void
6248 regfree (preg)
6249 regex_t *preg;
6250 {
6251 if (preg->buffer != NULL)
6252 free (preg->buffer);
6253 preg->buffer = NULL;
6254
6255 preg->allocated = 0;
6256 preg->used = 0;
6257
6258 if (preg->fastmap != NULL)
6259 free (preg->fastmap);
6260 preg->fastmap = NULL;
6261 preg->fastmap_accurate = 0;
6262
6263 if (preg->translate != NULL)
6264 free (preg->translate);
6265 preg->translate = NULL;
6266 }
6267 #ifdef _LIBC
6268 weak_alias (__regfree, regfree)
6269 #endif
6270
6271 #endif /* not emacs */