]>
Commit | Line | Data |
---|---|---|
dd714b8a MT |
1 | Submitted by: Alexander E. Patrakov |
2 | Date: 2005-08-13 | |
3 | Initial Package Version: 2.5.1a | |
4 | Upstream Status: Partially accepted, partially rejected, but required for LSB >= 2.0 certification | |
5 | Origin: RedHat | |
6 | Description: Various fixes from RedHat. Individual patches: | |
7 | ||
8 | grep-2.5.1-fgrep.patch | |
9 | grep-2.5.1-bracket.patch | |
10 | grep-2.5-i18n.patch | |
11 | grep-2.5.1-oi.patch | |
12 | grep-2.5.1-manpage.patch | |
13 | grep-2.5.1-color.patch | |
14 | grep-2.5.1-icolor.patch | |
15 | grep-2.5.1-egf-speedup.patch | |
16 | grep-2.5.1-dfa-optional.patch | |
17 | grep-2.5.1-tests.patch | |
18 | grep-2.5.1-w.patch | |
19 | ||
20 | Testcases: | |
21 | ||
22 | -fgrep: ???, but required for other patches | |
23 | -bracket: echo "[" | LANG=en_US.UTF-8 grep "[[:space:]]" | |
24 | -i18n: many fixes for multibyte locale support, required for LSB. | |
25 | -oi: echo xxYYzz | LANG=C grep -i -o yy | |
26 | -manpage: typo | |
27 | -color: restore the background color correctly | |
28 | -icolor: ??? echo 'spam foo SPAM FOO' | grep -i --color spam | |
29 | (but that's also fixed by -oi. Is this patch just a cleanup?) | |
30 | -egf-speedup: without this, grep is as slow as a snail in UTF-8 locales. | |
31 | -dfa-optional: disables dfa in multibyte locales by default. | |
32 | -w: (echo 'foo';echo 'fo') > /tmp/testfile && grep -F -w fo /tmp/testfile | |
33 | ||
34 | diff -urN grep-2.5.1a.orig/doc/grep.1 grep-2.5.1a/doc/grep.1 | |
35 | --- grep-2.5.1a.orig/doc/grep.1 2004-11-12 16:26:37.000000000 +0500 | |
36 | +++ grep-2.5.1a/doc/grep.1 2005-10-23 09:49:43.000000000 +0600 | |
37 | @@ -191,6 +191,7 @@ | |
38 | .I PATTERN | |
39 | as a list of fixed strings, separated by newlines, | |
40 | any of which is to be matched. | |
41 | +.TP | |
42 | .BR \-P ", " \-\^\-perl-regexp | |
43 | Interpret | |
44 | .I PATTERN | |
45 | @@ -302,7 +303,7 @@ | |
46 | This is especially useful for tools like zgrep, e.g. | |
47 | .B "gzip -cd foo.gz |grep --label=foo something" | |
48 | .TP | |
49 | -.BR \-\^\-line-buffering | |
50 | +.BR \-\^\-line-buffered | |
51 | Use line buffering, it can be a performance penality. | |
52 | .TP | |
53 | .BR \-q ", " \-\^\-quiet ", " \-\^\-silent | |
54 | diff -urN grep-2.5.1a.orig/lib/posix/regex.h grep-2.5.1a/lib/posix/regex.h | |
55 | --- grep-2.5.1a.orig/lib/posix/regex.h 2001-04-02 23:56:50.000000000 +0600 | |
56 | +++ grep-2.5.1a/lib/posix/regex.h 2005-10-23 09:49:31.000000000 +0600 | |
57 | @@ -109,6 +109,10 @@ | |
58 | If not set, \{, \}, {, and } are literals. */ | |
59 | #define RE_INTERVALS (RE_HAT_LISTS_NOT_NEWLINE << 1) | |
60 | ||
61 | +/* If this bit is set, then ignore case when matching. | |
62 | + If not set, then case is significant. */ | |
63 | +#define RE_ICASE (RE_INVALID_INTERVAL_ORD << 1) | |
64 | + | |
65 | /* If this bit is set, +, ? and | aren't recognized as operators. | |
66 | If not set, they are. */ | |
67 | #define RE_LIMITED_OPS (RE_INTERVALS << 1) | |
68 | diff -urN grep-2.5.1a.orig/src/dfa.c grep-2.5.1a/src/dfa.c | |
69 | --- grep-2.5.1a.orig/src/dfa.c 2001-09-26 22:57:55.000000000 +0600 | |
70 | +++ grep-2.5.1a/src/dfa.c 2005-10-23 09:49:17.000000000 +0600 | |
71 | @@ -414,7 +414,7 @@ | |
72 | ||
73 | /* This function fetch a wide character, and update cur_mb_len, | |
74 | used only if the current locale is a multibyte environment. */ | |
75 | -static wchar_t | |
76 | +static wint_t | |
77 | fetch_wc (char const *eoferr) | |
78 | { | |
79 | wchar_t wc; | |
80 | @@ -423,7 +423,7 @@ | |
81 | if (eoferr != 0) | |
82 | dfaerror (eoferr); | |
83 | else | |
84 | - return -1; | |
85 | + return WEOF; | |
86 | } | |
87 | ||
88 | cur_mb_len = mbrtowc(&wc, lexptr, lexleft, &mbs); | |
89 | @@ -459,7 +459,7 @@ | |
90 | static void | |
91 | parse_bracket_exp_mb () | |
92 | { | |
93 | - wchar_t wc, wc1, wc2; | |
94 | + wint_t wc, wc1, wc2; | |
95 | ||
96 | /* Work area to build a mb_char_classes. */ | |
97 | struct mb_char_classes *work_mbc; | |
98 | @@ -496,7 +496,7 @@ | |
99 | work_mbc->invert = 0; | |
100 | do | |
101 | { | |
102 | - wc1 = -1; /* mark wc1 is not initialized". */ | |
103 | + wc1 = WEOF; /* Mark wc1 as not yet initialized. */ | |
104 | ||
105 | /* Note that if we're looking at some other [:...:] construct, | |
106 | we just treat it as a bunch of ordinary characters. We can do | |
107 | @@ -586,7 +586,7 @@ | |
108 | work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem; | |
109 | } | |
110 | } | |
111 | - wc = -1; | |
112 | + wc1 = wc = WEOF; | |
113 | } | |
114 | else | |
115 | /* We treat '[' as a normal character here. */ | |
116 | @@ -600,7 +600,7 @@ | |
117 | wc = fetch_wc(("Unbalanced [")); | |
118 | } | |
119 | ||
120 | - if (wc1 == -1) | |
121 | + if (wc1 == WEOF) | |
122 | wc1 = fetch_wc(_("Unbalanced [")); | |
123 | ||
124 | if (wc1 == L'-') | |
125 | @@ -630,17 +630,17 @@ | |
126 | } | |
127 | REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t, | |
128 | range_sts_al, work_mbc->nranges + 1); | |
129 | - work_mbc->range_sts[work_mbc->nranges] = wc; | |
130 | + work_mbc->range_sts[work_mbc->nranges] = (wchar_t)wc; | |
131 | REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t, | |
132 | range_ends_al, work_mbc->nranges + 1); | |
133 | - work_mbc->range_ends[work_mbc->nranges++] = wc2; | |
134 | + work_mbc->range_ends[work_mbc->nranges++] = (wchar_t)wc2; | |
135 | } | |
136 | - else if (wc != -1) | |
137 | + else if (wc != WEOF) | |
138 | /* build normal characters. */ | |
139 | { | |
140 | REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al, | |
141 | work_mbc->nchars + 1); | |
142 | - work_mbc->chars[work_mbc->nchars++] = wc; | |
143 | + work_mbc->chars[work_mbc->nchars++] = (wchar_t)wc; | |
144 | } | |
145 | } | |
146 | while ((wc = wc1) != L']'); | |
147 | @@ -2552,6 +2552,8 @@ | |
148 | } | |
149 | ||
150 | /* match with a character? */ | |
151 | + if (case_fold) | |
152 | + wc = towlower (wc); | |
153 | for (i = 0; i<work_mbc->nchars; i++) | |
154 | { | |
155 | if (wc == work_mbc->chars[i]) | |
156 | diff -urN grep-2.5.1a.orig/src/grep.c grep-2.5.1a/src/grep.c | |
157 | --- grep-2.5.1a.orig/src/grep.c 2004-11-12 16:25:35.000000000 +0500 | |
158 | +++ grep-2.5.1a/src/grep.c 2005-10-23 09:50:06.000000000 +0600 | |
159 | @@ -30,6 +30,12 @@ | |
160 | # include <sys/time.h> | |
161 | # include <sys/resource.h> | |
162 | #endif | |
163 | +#if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC | |
164 | +/* We can handle multibyte string. */ | |
165 | +# define MBS_SUPPORT | |
166 | +# include <wchar.h> | |
167 | +# include <wctype.h> | |
168 | +#endif | |
169 | #include <stdio.h> | |
170 | #include "system.h" | |
171 | #include "getopt.h" | |
172 | @@ -558,33 +564,6 @@ | |
173 | { | |
174 | size_t match_size; | |
175 | size_t match_offset; | |
176 | - if(match_icase) | |
177 | - { | |
178 | - /* Yuck, this is tricky */ | |
179 | - char *buf = (char*) xmalloc (lim - beg); | |
180 | - char *ibeg = buf; | |
181 | - char *ilim = ibeg + (lim - beg); | |
182 | - int i; | |
183 | - for (i = 0; i < lim - beg; i++) | |
184 | - ibeg[i] = tolower (beg[i]); | |
185 | - while ((match_offset = (*execute) (ibeg, ilim-ibeg, &match_size, 1)) | |
186 | - != (size_t) -1) | |
187 | - { | |
188 | - char const *b = beg + match_offset; | |
189 | - if (b == lim) | |
190 | - break; | |
191 | - fwrite (beg, sizeof (char), match_offset, stdout); | |
192 | - printf ("\33[%sm", grep_color); | |
193 | - fwrite (b, sizeof (char), match_size, stdout); | |
194 | - fputs ("\33[00m", stdout); | |
195 | - beg = b + match_size; | |
196 | - ibeg = ibeg + match_offset + match_size; | |
197 | - } | |
198 | - fwrite (beg, 1, lim - beg, stdout); | |
199 | - free (buf); | |
200 | - lastout = lim; | |
201 | - return; | |
202 | - } | |
203 | while (lim-beg && (match_offset = (*execute) (beg, lim - beg, &match_size, 1)) | |
204 | != (size_t) -1) | |
205 | { | |
206 | @@ -601,6 +580,7 @@ | |
207 | fputs ("\33[00m", stdout); | |
208 | beg = b + match_size; | |
209 | } | |
210 | + fputs ("\33[K", stdout); | |
211 | } | |
212 | fwrite (beg, 1, lim - beg, stdout); | |
213 | if (ferror (stdout)) | |
214 | @@ -1697,6 +1677,37 @@ | |
215 | if (!install_matcher (matcher) && !install_matcher ("default")) | |
216 | abort (); | |
217 | ||
218 | +#ifdef MBS_SUPPORT | |
219 | + if (MB_CUR_MAX != 1 && match_icase) | |
220 | + { | |
221 | + wchar_t wc; | |
222 | + mbstate_t cur_state, prev_state; | |
223 | + int i, len = strlen(keys); | |
224 | + | |
225 | + memset(&cur_state, 0, sizeof(mbstate_t)); | |
226 | + for (i = 0; i <= len ;) | |
227 | + { | |
228 | + size_t mbclen; | |
229 | + mbclen = mbrtowc(&wc, keys + i, len - i, &cur_state); | |
230 | + if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0) | |
231 | + { | |
232 | + /* An invalid sequence, or a truncated multibyte character. | |
233 | + We treat it as a singlebyte character. */ | |
234 | + mbclen = 1; | |
235 | + } | |
236 | + else | |
237 | + { | |
238 | + if (iswupper((wint_t)wc)) | |
239 | + { | |
240 | + wc = towlower((wint_t)wc); | |
241 | + wcrtomb(keys + i, wc, &cur_state); | |
242 | + } | |
243 | + } | |
244 | + i += mbclen; | |
245 | + } | |
246 | + } | |
247 | +#endif /* MBS_SUPPORT */ | |
248 | + | |
249 | (*compile)(keys, keycc); | |
250 | ||
251 | if ((argc - optind > 1 && !no_filenames) || with_filenames) | |
252 | diff -urN grep-2.5.1a.orig/src/search.c grep-2.5.1a/src/search.c | |
253 | --- grep-2.5.1a.orig/src/search.c 2001-04-19 09:42:14.000000000 +0600 | |
254 | +++ grep-2.5.1a/src/search.c 2005-10-23 09:51:25.000000000 +0600 | |
255 | @@ -18,9 +18,13 @@ | |
256 | ||
257 | /* Written August 1992 by Mike Haertel. */ | |
258 | ||
259 | +#ifndef _GNU_SOURCE | |
260 | +# define _GNU_SOURCE 1 | |
261 | +#endif | |
262 | #ifdef HAVE_CONFIG_H | |
263 | # include <config.h> | |
264 | #endif | |
265 | +#include <assert.h> | |
266 | #include <sys/types.h> | |
267 | #if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC | |
268 | /* We can handle multibyte string. */ | |
269 | @@ -31,7 +35,7 @@ | |
270 | ||
271 | #include "system.h" | |
272 | #include "grep.h" | |
273 | -#include "regex.h" | |
274 | +#include <regex.h> | |
275 | #include "dfa.h" | |
276 | #include "kwset.h" | |
277 | #include "error.h" | |
278 | @@ -39,6 +43,9 @@ | |
279 | #ifdef HAVE_LIBPCRE | |
280 | # include <pcre.h> | |
281 | #endif | |
282 | +#ifdef HAVE_LANGINFO_CODESET | |
283 | +# include <langinfo.h> | |
284 | +#endif | |
285 | ||
286 | #define NCHAR (UCHAR_MAX + 1) | |
287 | ||
288 | @@ -70,9 +77,10 @@ | |
289 | call the regexp matcher at all. */ | |
290 | static int kwset_exact_matches; | |
291 | ||
292 | -#if defined(MBS_SUPPORT) | |
293 | -static char* check_multibyte_string PARAMS ((char const *buf, size_t size)); | |
294 | -#endif | |
295 | +/* UTF-8 encoding allows some optimizations that we can't otherwise | |
296 | + assume in a multibyte encoding. */ | |
297 | +static int using_utf8; | |
298 | + | |
299 | static void kwsinit PARAMS ((void)); | |
300 | static void kwsmusts PARAMS ((void)); | |
301 | static void Gcompile PARAMS ((char const *, size_t)); | |
302 | @@ -84,6 +92,15 @@ | |
303 | static size_t Pexecute PARAMS ((char const *, size_t, size_t *, int)); | |
304 | ||
305 | void | |
306 | +check_utf8 (void) | |
307 | +{ | |
308 | +#ifdef HAVE_LANGINFO_CODESET | |
309 | + if (strcmp (nl_langinfo (CODESET), "UTF-8") == 0) | |
310 | + using_utf8 = 1; | |
311 | +#endif | |
312 | +} | |
313 | + | |
314 | +void | |
315 | dfaerror (char const *mesg) | |
316 | { | |
317 | error (2, 0, mesg); | |
318 | @@ -141,38 +158,6 @@ | |
319 | } | |
320 | } | |
321 | ||
322 | -#ifdef MBS_SUPPORT | |
323 | -/* This function allocate the array which correspond to "buf". | |
324 | - Then this check multibyte string and mark on the positions which | |
325 | - are not singlebyte character nor the first byte of a multibyte | |
326 | - character. Caller must free the array. */ | |
327 | -static char* | |
328 | -check_multibyte_string(char const *buf, size_t size) | |
329 | -{ | |
330 | - char *mb_properties = malloc(size); | |
331 | - mbstate_t cur_state; | |
332 | - int i; | |
333 | - memset(&cur_state, 0, sizeof(mbstate_t)); | |
334 | - memset(mb_properties, 0, sizeof(char)*size); | |
335 | - for (i = 0; i < size ;) | |
336 | - { | |
337 | - size_t mbclen; | |
338 | - mbclen = mbrlen(buf + i, size - i, &cur_state); | |
339 | - | |
340 | - if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0) | |
341 | - { | |
342 | - /* An invalid sequence, or a truncated multibyte character. | |
343 | - We treat it as a singlebyte character. */ | |
344 | - mbclen = 1; | |
345 | - } | |
346 | - mb_properties[i] = mbclen; | |
347 | - i += mbclen; | |
348 | - } | |
349 | - | |
350 | - return mb_properties; | |
351 | -} | |
352 | -#endif | |
353 | - | |
354 | static void | |
355 | Gcompile (char const *pattern, size_t size) | |
356 | { | |
357 | @@ -181,7 +166,8 @@ | |
358 | size_t total = size; | |
359 | char const *motif = pattern; | |
360 | ||
361 | - re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE); | |
362 | + check_utf8 (); | |
363 | + re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE | (match_icase ? RE_ICASE : 0)); | |
364 | dfasyntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE, match_icase, eolbyte); | |
365 | ||
366 | /* For GNU regex compiler we have to pass the patterns separately to detect | |
367 | @@ -233,7 +219,7 @@ | |
368 | static char const line_end[] = "\\)$"; | |
369 | static char const word_beg[] = "\\(^\\|[^[:alnum:]_]\\)\\("; | |
370 | static char const word_end[] = "\\)\\([^[:alnum:]_]\\|$\\)"; | |
371 | - char *n = malloc (sizeof word_beg - 1 + size + sizeof word_end); | |
372 | + char *n = xmalloc (sizeof word_beg - 1 + size + sizeof word_end); | |
373 | size_t i; | |
374 | strcpy (n, match_lines ? line_beg : word_beg); | |
375 | i = strlen (n); | |
376 | @@ -257,14 +243,15 @@ | |
377 | size_t total = size; | |
378 | char const *motif = pattern; | |
379 | ||
380 | + check_utf8 (); | |
381 | if (strcmp (matcher, "awk") == 0) | |
382 | { | |
383 | - re_set_syntax (RE_SYNTAX_AWK); | |
384 | + re_set_syntax (RE_SYNTAX_AWK | (match_icase ? RE_ICASE : 0)); | |
385 | dfasyntax (RE_SYNTAX_AWK, match_icase, eolbyte); | |
386 | } | |
387 | else | |
388 | { | |
389 | - re_set_syntax (RE_SYNTAX_POSIX_EGREP); | |
390 | + re_set_syntax (RE_SYNTAX_POSIX_EGREP | (match_icase ? RE_ICASE : 0)); | |
391 | dfasyntax (RE_SYNTAX_POSIX_EGREP, match_icase, eolbyte); | |
392 | } | |
393 | ||
394 | @@ -316,7 +303,7 @@ | |
395 | static char const line_end[] = ")$"; | |
396 | static char const word_beg[] = "(^|[^[:alnum:]_])("; | |
397 | static char const word_end[] = ")([^[:alnum:]_]|$)"; | |
398 | - char *n = malloc (sizeof word_beg - 1 + size + sizeof word_end); | |
399 | + char *n = xmalloc (sizeof word_beg - 1 + size + sizeof word_end); | |
400 | size_t i; | |
401 | strcpy (n, match_lines ? line_beg : word_beg); | |
402 | i = strlen(n); | |
403 | @@ -339,15 +326,35 @@ | |
404 | char eol = eolbyte; | |
405 | int backref, start, len; | |
406 | struct kwsmatch kwsm; | |
407 | - size_t i; | |
408 | + size_t i, ret_val; | |
409 | + static int use_dfa; | |
410 | + static int use_dfa_checked = 0; | |
411 | #ifdef MBS_SUPPORT | |
412 | - char *mb_properties = NULL; | |
413 | + const char *last_char = NULL; | |
414 | + int mb_cur_max = MB_CUR_MAX; | |
415 | + mbstate_t mbs; | |
416 | + memset (&mbs, '\0', sizeof (mbstate_t)); | |
417 | #endif /* MBS_SUPPORT */ | |
418 | ||
419 | + if (!use_dfa_checked) | |
420 | + { | |
421 | + char *grep_use_dfa = getenv ("GREP_USE_DFA"); | |
422 | + if (!grep_use_dfa) | |
423 | + { | |
424 | #ifdef MBS_SUPPORT | |
425 | - if (MB_CUR_MAX > 1 && kwset) | |
426 | - mb_properties = check_multibyte_string(buf, size); | |
427 | + /* Turn off DFA when processing multibyte input. */ | |
428 | + use_dfa = (MB_CUR_MAX == 1); | |
429 | +#else | |
430 | + use_dfa = 1; | |
431 | #endif /* MBS_SUPPORT */ | |
432 | + } | |
433 | + else | |
434 | + { | |
435 | + use_dfa = atoi (grep_use_dfa); | |
436 | + } | |
437 | + | |
438 | + use_dfa_checked = 1; | |
439 | + } | |
440 | ||
441 | buflim = buf + size; | |
442 | ||
443 | @@ -358,47 +365,124 @@ | |
444 | if (kwset) | |
445 | { | |
446 | /* Find a possible match using the KWset matcher. */ | |
447 | - size_t offset = kwsexec (kwset, beg, buflim - beg, &kwsm); | |
448 | +#ifdef MBS_SUPPORT | |
449 | + size_t bytes_left = 0; | |
450 | +#endif /* MBS_SUPPORT */ | |
451 | + size_t offset; | |
452 | +#ifdef MBS_SUPPORT | |
453 | + /* kwsexec doesn't work with match_icase and multibyte input. */ | |
454 | + if (match_icase && mb_cur_max > 1) | |
455 | + /* Avoid kwset */ | |
456 | + offset = 0; | |
457 | + else | |
458 | +#endif /* MBS_SUPPORT */ | |
459 | + offset = kwsexec (kwset, beg, buflim - beg, &kwsm); | |
460 | if (offset == (size_t) -1) | |
461 | - { | |
462 | + goto failure; | |
463 | #ifdef MBS_SUPPORT | |
464 | - if (MB_CUR_MAX > 1) | |
465 | - free(mb_properties); | |
466 | -#endif | |
467 | - return (size_t)-1; | |
468 | + if (mb_cur_max > 1 && !using_utf8) | |
469 | + { | |
470 | + bytes_left = offset; | |
471 | + while (bytes_left) | |
472 | + { | |
473 | + size_t mlen = mbrlen (beg, bytes_left, &mbs); | |
474 | + | |
475 | + last_char = beg; | |
476 | + if (mlen == (size_t) -1 || mlen == 0) | |
477 | + { | |
478 | + /* Invalid sequence or NUL: treat as a single byte. */ | |
479 | + memset (&mbs, '\0', sizeof (mbstate_t)); | |
480 | + beg++; | |
481 | + bytes_left--; | |
482 | + continue; | |
483 | + } | |
484 | + | |
485 | + if (mlen == (size_t) -2) | |
486 | + /* Offset points inside multibyte character: | |
487 | + * no good. */ | |
488 | + break; | |
489 | + | |
490 | + beg += mlen; | |
491 | + bytes_left -= mlen; | |
492 | + } | |
493 | } | |
494 | + else | |
495 | +#endif /* MBS_SUPPORT */ | |
496 | beg += offset; | |
497 | /* Narrow down to the line containing the candidate, and | |
498 | run it through DFA. */ | |
499 | end = memchr(beg, eol, buflim - beg); | |
500 | end++; | |
501 | #ifdef MBS_SUPPORT | |
502 | - if (MB_CUR_MAX > 1 && mb_properties[beg - buf] == 0) | |
503 | + if (mb_cur_max > 1 && bytes_left) | |
504 | continue; | |
505 | -#endif | |
506 | +#endif /* MBS_SUPPORT */ | |
507 | while (beg > buf && beg[-1] != eol) | |
508 | --beg; | |
509 | - if (kwsm.index < kwset_exact_matches) | |
510 | - goto success; | |
511 | - if (dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1) | |
512 | + if ( | |
513 | +#ifdef MBS_SUPPORT | |
514 | + !(match_icase && mb_cur_max > 1) && | |
515 | +#endif /* MBS_SUPPORT */ | |
516 | + (kwsm.index < kwset_exact_matches)) | |
517 | + goto success_in_beg_and_end; | |
518 | + if (use_dfa && | |
519 | + dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1) | |
520 | continue; | |
521 | } | |
522 | else | |
523 | { | |
524 | /* No good fixed strings; start with DFA. */ | |
525 | - size_t offset = dfaexec (&dfa, beg, buflim - beg, &backref); | |
526 | +#ifdef MBS_SUPPORT | |
527 | + size_t bytes_left = 0; | |
528 | +#endif /* MBS_SUPPORT */ | |
529 | + size_t offset = 0; | |
530 | + if (use_dfa) | |
531 | + offset = dfaexec (&dfa, beg, buflim - beg, &backref); | |
532 | if (offset == (size_t) -1) | |
533 | break; | |
534 | /* Narrow down to the line we've found. */ | |
535 | +#ifdef MBS_SUPPORT | |
536 | + if (mb_cur_max > 1 && !using_utf8) | |
537 | + { | |
538 | + bytes_left = offset; | |
539 | + while (bytes_left) | |
540 | + { | |
541 | + size_t mlen = mbrlen (beg, bytes_left, &mbs); | |
542 | + | |
543 | + last_char = beg; | |
544 | + if (mlen == (size_t) -1 || mlen == 0) | |
545 | + { | |
546 | + /* Invalid sequence or NUL: treat as a single byte. */ | |
547 | + memset (&mbs, '\0', sizeof (mbstate_t)); | |
548 | + beg++; | |
549 | + bytes_left--; | |
550 | + continue; | |
551 | + } | |
552 | + | |
553 | + if (mlen == (size_t) -2) | |
554 | + /* Offset points inside multibyte character: | |
555 | + * no good. */ | |
556 | + break; | |
557 | + | |
558 | + beg += mlen; | |
559 | + bytes_left -= mlen; | |
560 | + } | |
561 | + } | |
562 | + else | |
563 | +#endif /* MBS_SUPPORT */ | |
564 | beg += offset; | |
565 | end = memchr (beg, eol, buflim - beg); | |
566 | end++; | |
567 | +#ifdef MBS_SUPPORT | |
568 | + if (mb_cur_max > 1 && bytes_left) | |
569 | + continue; | |
570 | +#endif /* MBS_SUPPORT */ | |
571 | while (beg > buf && beg[-1] != eol) | |
572 | --beg; | |
573 | } | |
574 | /* Successful, no backreferences encountered! */ | |
575 | - if (!backref) | |
576 | - goto success; | |
577 | + if (use_dfa && !backref) | |
578 | + goto success_in_beg_and_end; | |
579 | } | |
580 | else | |
581 | end = beg + size; | |
582 | @@ -413,14 +497,11 @@ | |
583 | end - beg - 1, &(patterns[i].regs)))) | |
584 | { | |
585 | len = patterns[i].regs.end[0] - start; | |
586 | - if (exact) | |
587 | - { | |
588 | - *match_size = len; | |
589 | - return start; | |
590 | - } | |
591 | + if (exact && !match_words) | |
592 | + goto success_in_start_and_len; | |
593 | if ((!match_lines && !match_words) | |
594 | || (match_lines && len == end - beg - 1)) | |
595 | - goto success; | |
596 | + goto success_in_beg_and_end; | |
597 | /* If -w, check if the match aligns with word boundaries. | |
598 | We do this iteratively because: | |
599 | (a) the line may contain more than one occurence of the | |
600 | @@ -431,10 +512,84 @@ | |
601 | if (match_words) | |
602 | while (start >= 0) | |
603 | { | |
604 | - if ((start == 0 || !WCHAR ((unsigned char) beg[start - 1])) | |
605 | - && (len == end - beg - 1 | |
606 | - || !WCHAR ((unsigned char) beg[start + len]))) | |
607 | - goto success; | |
608 | + int lword_match = 0; | |
609 | + if (start == 0) | |
610 | + lword_match = 1; | |
611 | + else | |
612 | + { | |
613 | + assert (start > 0); | |
614 | +#ifdef MBS_SUPPORT | |
615 | + if (mb_cur_max > 1) | |
616 | + { | |
617 | + const char *s; | |
618 | + int mr; | |
619 | + wchar_t pwc; | |
620 | + | |
621 | + if (using_utf8) | |
622 | + { | |
623 | + s = beg + start - 1; | |
624 | + while (s > buf | |
625 | + && (unsigned char) *s >= 0x80 | |
626 | + && (unsigned char) *s <= 0xbf) | |
627 | + --s; | |
628 | + } | |
629 | + else | |
630 | + s = last_char; | |
631 | + mr = mbtowc (&pwc, s, beg + start - s); | |
632 | + if (mr <= 0) | |
633 | + { | |
634 | + memset (&mbs, '\0', sizeof (mbstate_t)); | |
635 | + lword_match = 1; | |
636 | + } | |
637 | + else if (!(iswalnum (pwc) || pwc == L'_') | |
638 | + && mr == (int) (beg + start - s)) | |
639 | + lword_match = 1; | |
640 | + } | |
641 | + else | |
642 | +#endif /* MBS_SUPPORT */ | |
643 | + if (!WCHAR ((unsigned char) beg[start - 1])) | |
644 | + lword_match = 1; | |
645 | + } | |
646 | + | |
647 | + if (lword_match) | |
648 | + { | |
649 | + int rword_match = 0; | |
650 | + if (start + len == end - beg - 1) | |
651 | + rword_match = 1; | |
652 | + else | |
653 | + { | |
654 | +#ifdef MBS_SUPPORT | |
655 | + if (mb_cur_max > 1) | |
656 | + { | |
657 | + wchar_t nwc; | |
658 | + int mr; | |
659 | + | |
660 | + mr = mbtowc (&nwc, beg + start + len, | |
661 | + end - beg - start - len - 1); | |
662 | + if (mr <= 0) | |
663 | + { | |
664 | + memset (&mbs, '\0', sizeof (mbstate_t)); | |
665 | + rword_match = 1; | |
666 | + } | |
667 | + else if (!iswalnum (nwc) && nwc != L'_') | |
668 | + rword_match = 1; | |
669 | + } | |
670 | + else | |
671 | +#endif /* MBS_SUPPORT */ | |
672 | + if (!WCHAR ((unsigned char) beg[start + len])) | |
673 | + rword_match = 1; | |
674 | + } | |
675 | + | |
676 | + if (rword_match) | |
677 | + { | |
678 | + if (!exact) | |
679 | + /* Returns the whole line. */ | |
680 | + goto success_in_beg_and_end; | |
681 | + else | |
682 | + /* Returns just this word match. */ | |
683 | + goto success_in_start_and_len; | |
684 | + } | |
685 | + } | |
686 | if (len > 0) | |
687 | { | |
688 | /* Try a shorter length anchored at the same place. */ | |
689 | @@ -461,26 +616,154 @@ | |
690 | } | |
691 | } /* for Regex patterns. */ | |
692 | } /* for (beg = end ..) */ | |
693 | -#ifdef MBS_SUPPORT | |
694 | - if (MB_CUR_MAX > 1 && mb_properties) | |
695 | - free (mb_properties); | |
696 | -#endif /* MBS_SUPPORT */ | |
697 | + | |
698 | + failure: | |
699 | return (size_t) -1; | |
700 | ||
701 | - success: | |
702 | -#ifdef MBS_SUPPORT | |
703 | - if (MB_CUR_MAX > 1 && mb_properties) | |
704 | - free (mb_properties); | |
705 | -#endif /* MBS_SUPPORT */ | |
706 | - *match_size = end - beg; | |
707 | - return beg - buf; | |
708 | + success_in_beg_and_end: | |
709 | + len = end - beg; | |
710 | + start = beg - buf; | |
711 | + /* FALLTHROUGH */ | |
712 | + | |
713 | + success_in_start_and_len: | |
714 | + *match_size = len; | |
715 | + return start; | |
716 | } | |
717 | ||
718 | +#ifdef MBS_SUPPORT | |
719 | +static int f_i_multibyte; /* whether we're using the new -Fi MB method */ | |
720 | +static struct | |
721 | +{ | |
722 | + wchar_t **patterns; | |
723 | + size_t count, maxlen; | |
724 | + unsigned char *match; | |
725 | +} Fimb; | |
726 | +#endif | |
727 | + | |
728 | static void | |
729 | Fcompile (char const *pattern, size_t size) | |
730 | { | |
731 | + int mb_cur_max = MB_CUR_MAX; | |
732 | char const *beg, *lim, *err; | |
733 | ||
734 | + check_utf8 (); | |
735 | +#ifdef MBS_SUPPORT | |
736 | + /* Support -F -i for multibyte input (any locale with MB_CUR_MAX > 1). */ | |
737 | + if (match_icase && mb_cur_max > 1) | |
738 | + { | |
739 | + mbstate_t mbs; | |
740 | + wchar_t *wcpattern = xmalloc ((size + 1) * sizeof (wchar_t)); | |
741 | + const char *patternend = pattern; | |
742 | + size_t wcsize; | |
743 | + kwset_t fimb_kwset = NULL; | |
744 | + char *starts = NULL; | |
745 | + wchar_t *wcbeg, *wclim; | |
746 | + size_t allocated = 0; | |
747 | + | |
748 | + memset (&mbs, '\0', sizeof (mbs)); | |
749 | +# ifdef __GNU_LIBRARY__ | |
750 | + wcsize = mbsnrtowcs (wcpattern, &patternend, size, size, &mbs); | |
751 | + if (patternend != pattern + size) | |
752 | + wcsize = (size_t) -1; | |
753 | +# else | |
754 | + { | |
755 | + char *patterncopy = xmalloc (size + 1); | |
756 | + | |
757 | + memcpy (patterncopy, pattern, size); | |
758 | + patterncopy[size] = '\0'; | |
759 | + patternend = patterncopy; | |
760 | + wcsize = mbsrtowcs (wcpattern, &patternend, size, &mbs); | |
761 | + if (patternend != patterncopy + size) | |
762 | + wcsize = (size_t) -1; | |
763 | + free (patterncopy); | |
764 | + } | |
765 | +# endif | |
766 | + if (wcsize + 2 <= 2) | |
767 | + { | |
768 | +fimb_fail: | |
769 | + free (wcpattern); | |
770 | + free (starts); | |
771 | + if (fimb_kwset) | |
772 | + kwsfree (fimb_kwset); | |
773 | + free (Fimb.patterns); | |
774 | + Fimb.patterns = NULL; | |
775 | + } | |
776 | + else | |
777 | + { | |
778 | + if (!(fimb_kwset = kwsalloc (NULL))) | |
779 | + error (2, 0, _("memory exhausted")); | |
780 | + | |
781 | + starts = xmalloc (mb_cur_max * 3); | |
782 | + wcbeg = wcpattern; | |
783 | + do | |
784 | + { | |
785 | + int i; | |
786 | + size_t wclen; | |
787 | + | |
788 | + if (Fimb.count >= allocated) | |
789 | + { | |
790 | + if (allocated == 0) | |
791 | + allocated = 128; | |
792 | + else | |
793 | + allocated *= 2; | |
794 | + Fimb.patterns = xrealloc (Fimb.patterns, | |
795 | + sizeof (wchar_t *) * allocated); | |
796 | + } | |
797 | + Fimb.patterns[Fimb.count++] = wcbeg; | |
798 | + for (wclim = wcbeg; | |
799 | + wclim < wcpattern + wcsize && *wclim != L'\n'; ++wclim) | |
800 | + *wclim = towlower (*wclim); | |
801 | + *wclim = L'\0'; | |
802 | + wclen = wclim - wcbeg; | |
803 | + if (wclen > Fimb.maxlen) | |
804 | + Fimb.maxlen = wclen; | |
805 | + if (wclen > 3) | |
806 | + wclen = 3; | |
807 | + if (wclen == 0) | |
808 | + { | |
809 | + if ((err = kwsincr (fimb_kwset, "", 0)) != 0) | |
810 | + error (2, 0, err); | |
811 | + } | |
812 | + else | |
813 | + for (i = 0; i < (1 << wclen); i++) | |
814 | + { | |
815 | + char *p = starts; | |
816 | + int j, k; | |
817 | + | |
818 | + for (j = 0; j < wclen; ++j) | |
819 | + { | |
820 | + wchar_t wc = wcbeg[j]; | |
821 | + if (i & (1 << j)) | |
822 | + { | |
823 | + wc = towupper (wc); | |
824 | + if (wc == wcbeg[j]) | |
825 | + continue; | |
826 | + } | |
827 | + k = wctomb (p, wc); | |
828 | + if (k <= 0) | |
829 | + goto fimb_fail; | |
830 | + p += k; | |
831 | + } | |
832 | + if ((err = kwsincr (fimb_kwset, starts, p - starts)) != 0) | |
833 | + error (2, 0, err); | |
834 | + } | |
835 | + if (wclim < wcpattern + wcsize) | |
836 | + ++wclim; | |
837 | + wcbeg = wclim; | |
838 | + } | |
839 | + while (wcbeg < wcpattern + wcsize); | |
840 | + f_i_multibyte = 1; | |
841 | + kwset = fimb_kwset; | |
842 | + free (starts); | |
843 | + Fimb.match = xmalloc (Fimb.count); | |
844 | + if ((err = kwsprep (kwset)) != 0) | |
845 | + error (2, 0, err); | |
846 | + return; | |
847 | + } | |
848 | + } | |
849 | +#endif /* MBS_SUPPORT */ | |
850 | + | |
851 | + | |
852 | kwsinit (); | |
853 | beg = pattern; | |
854 | do | |
855 | @@ -499,6 +782,76 @@ | |
856 | error (2, 0, err); | |
857 | } | |
858 | ||
859 | +#ifdef MBS_SUPPORT | |
860 | +static int | |
861 | +Fimbexec (const char *buf, size_t size, size_t *plen, int exact) | |
862 | +{ | |
863 | + size_t len, letter, i; | |
864 | + int ret = -1; | |
865 | + mbstate_t mbs; | |
866 | + wchar_t wc; | |
867 | + int patterns_left; | |
868 | + | |
869 | + assert (match_icase && f_i_multibyte == 1); | |
870 | + assert (MB_CUR_MAX > 1); | |
871 | + | |
872 | + memset (&mbs, '\0', sizeof (mbs)); | |
873 | + memset (Fimb.match, '\1', Fimb.count); | |
874 | + letter = len = 0; | |
875 | + patterns_left = 1; | |
876 | + while (patterns_left && len <= size) | |
877 | + { | |
878 | + size_t c; | |
879 | + | |
880 | + patterns_left = 0; | |
881 | + if (len < size) | |
882 | + { | |
883 | + c = mbrtowc (&wc, buf + len, size - len, &mbs); | |
884 | + if (c + 2 <= 2) | |
885 | + return ret; | |
886 | + | |
887 | + wc = towlower (wc); | |
888 | + } | |
889 | + else | |
890 | + { | |
891 | + c = 1; | |
892 | + wc = L'\0'; | |
893 | + } | |
894 | + | |
895 | + for (i = 0; i < Fimb.count; i++) | |
896 | + { | |
897 | + if (Fimb.match[i]) | |
898 | + { | |
899 | + if (Fimb.patterns[i][letter] == L'\0') | |
900 | + { | |
901 | + /* Found a match. */ | |
902 | + *plen = len; | |
903 | + if (!exact && !match_words) | |
904 | + return 0; | |
905 | + else | |
906 | + { | |
907 | + /* For -w or exact look for longest match. */ | |
908 | + ret = 0; | |
909 | + Fimb.match[i] = '\0'; | |
910 | + continue; | |
911 | + } | |
912 | + } | |
913 | + | |
914 | + if (Fimb.patterns[i][letter] == wc) | |
915 | + patterns_left = 1; | |
916 | + else | |
917 | + Fimb.match[i] = '\0'; | |
918 | + } | |
919 | + } | |
920 | + | |
921 | + len += c; | |
922 | + letter++; | |
923 | + } | |
924 | + | |
925 | + return ret; | |
926 | +} | |
927 | +#endif /* MBS_SUPPORT */ | |
928 | + | |
929 | static size_t | |
930 | Fexecute (char const *buf, size_t size, size_t *match_size, int exact) | |
931 | { | |
932 | @@ -506,88 +859,268 @@ | |
933 | register size_t len; | |
934 | char eol = eolbyte; | |
935 | struct kwsmatch kwsmatch; | |
936 | + size_t ret_val; | |
937 | #ifdef MBS_SUPPORT | |
938 | - char *mb_properties; | |
939 | - if (MB_CUR_MAX > 1) | |
940 | - mb_properties = check_multibyte_string (buf, size); | |
941 | + int mb_cur_max = MB_CUR_MAX; | |
942 | + mbstate_t mbs; | |
943 | + memset (&mbs, '\0', sizeof (mbstate_t)); | |
944 | + const char *last_char = NULL; | |
945 | #endif /* MBS_SUPPORT */ | |
946 | ||
947 | for (beg = buf; beg <= buf + size; ++beg) | |
948 | { | |
949 | - size_t offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch); | |
950 | + size_t offset; | |
951 | + offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch); | |
952 | + | |
953 | if (offset == (size_t) -1) | |
954 | - { | |
955 | + goto failure; | |
956 | #ifdef MBS_SUPPORT | |
957 | - if (MB_CUR_MAX > 1) | |
958 | - free(mb_properties); | |
959 | -#endif /* MBS_SUPPORT */ | |
960 | - return offset; | |
961 | + if (mb_cur_max > 1 && !using_utf8) | |
962 | + { | |
963 | + size_t bytes_left = offset; | |
964 | + while (bytes_left) | |
965 | + { | |
966 | + size_t mlen = mbrlen (beg, bytes_left, &mbs); | |
967 | + | |
968 | + last_char = beg; | |
969 | + if (mlen == (size_t) -1 || mlen == 0) | |
970 | + { | |
971 | + /* Invalid sequence or NUL character: treat as single-byte. */ | |
972 | + memset (&mbs, '\0', sizeof (mbstate_t)); | |
973 | + beg++; | |
974 | + bytes_left--; | |
975 | + continue; | |
976 | + } | |
977 | + | |
978 | + if (mlen == (size_t) -2) | |
979 | + /* Offset points inside multibyte character: no good. */ | |
980 | + break; | |
981 | + | |
982 | + beg += mlen; | |
983 | + bytes_left -= mlen; | |
984 | + } | |
985 | + | |
986 | + if (bytes_left) | |
987 | + continue; | |
988 | } | |
989 | -#ifdef MBS_SUPPORT | |
990 | - if (MB_CUR_MAX > 1 && mb_properties[offset+beg-buf] == 0) | |
991 | - continue; /* It is a part of multibyte character. */ | |
992 | + else | |
993 | #endif /* MBS_SUPPORT */ | |
994 | beg += offset; | |
995 | - len = kwsmatch.size[0]; | |
996 | - if (exact) | |
997 | - { | |
998 | - *match_size = len; | |
999 | #ifdef MBS_SUPPORT | |
1000 | - if (MB_CUR_MAX > 1) | |
1001 | - free (mb_properties); | |
1002 | + /* For f_i_multibyte, the string at beg now matches first 3 chars of | |
1003 | + one of the search strings (less if there are shorter search strings). | |
1004 | + See if this is a real match. */ | |
1005 | + if (f_i_multibyte | |
1006 | + && Fimbexec (beg, buf + size - beg, &kwsmatch.size[0], exact)) | |
1007 | + goto next_char; | |
1008 | #endif /* MBS_SUPPORT */ | |
1009 | - return beg - buf; | |
1010 | - } | |
1011 | + len = kwsmatch.size[0]; | |
1012 | + if (exact && !match_words) | |
1013 | + goto success_in_beg_and_len; | |
1014 | if (match_lines) | |
1015 | { | |
1016 | if (beg > buf && beg[-1] != eol) | |
1017 | - continue; | |
1018 | + goto next_char; | |
1019 | if (beg + len < buf + size && beg[len] != eol) | |
1020 | - continue; | |
1021 | + goto next_char; | |
1022 | goto success; | |
1023 | } | |
1024 | else if (match_words) | |
1025 | - for (try = beg; len; ) | |
1026 | - { | |
1027 | - if (try > buf && WCHAR((unsigned char) try[-1])) | |
1028 | - break; | |
1029 | - if (try + len < buf + size && WCHAR((unsigned char) try[len])) | |
1030 | - { | |
1031 | - offset = kwsexec (kwset, beg, --len, &kwsmatch); | |
1032 | - if (offset == (size_t) -1) | |
1033 | - { | |
1034 | + { | |
1035 | + while (len) | |
1036 | + { | |
1037 | + int word_match = 0; | |
1038 | + if (beg > buf) | |
1039 | + { | |
1040 | #ifdef MBS_SUPPORT | |
1041 | - if (MB_CUR_MAX > 1) | |
1042 | - free (mb_properties); | |
1043 | + if (mb_cur_max > 1) | |
1044 | + { | |
1045 | + const char *s; | |
1046 | + int mr; | |
1047 | + wchar_t pwc; | |
1048 | + | |
1049 | + if (using_utf8) | |
1050 | + { | |
1051 | + s = beg - 1; | |
1052 | + while (s > buf | |
1053 | + && (unsigned char) *s >= 0x80 | |
1054 | + && (unsigned char) *s <= 0xbf) | |
1055 | + --s; | |
1056 | + } | |
1057 | + else | |
1058 | + s = last_char; | |
1059 | + mr = mbtowc (&pwc, s, beg - s); | |
1060 | + if (mr <= 0) | |
1061 | + memset (&mbs, '\0', sizeof (mbstate_t)); | |
1062 | + else if ((iswalnum (pwc) || pwc == L'_') | |
1063 | + && mr == (int) (beg - s)) | |
1064 | + goto next_char; | |
1065 | + } | |
1066 | + else | |
1067 | #endif /* MBS_SUPPORT */ | |
1068 | - return offset; | |
1069 | - } | |
1070 | - try = beg + offset; | |
1071 | - len = kwsmatch.size[0]; | |
1072 | - } | |
1073 | - else | |
1074 | - goto success; | |
1075 | - } | |
1076 | + if (WCHAR ((unsigned char) beg[-1])) | |
1077 | + goto next_char; | |
1078 | + } | |
1079 | +#ifdef MBS_SUPPORT | |
1080 | + if (mb_cur_max > 1) | |
1081 | + { | |
1082 | + wchar_t nwc; | |
1083 | + int mr; | |
1084 | + | |
1085 | + mr = mbtowc (&nwc, beg + len, buf + size - beg - len); | |
1086 | + if (mr <= 0) | |
1087 | + { | |
1088 | + memset (&mbs, '\0', sizeof (mbstate_t)); | |
1089 | + word_match = 1; | |
1090 | + } | |
1091 | + else if (!iswalnum (nwc) && nwc != L'_') | |
1092 | + word_match = 1; | |
1093 | + } | |
1094 | + else | |
1095 | +#endif /* MBS_SUPPORT */ | |
1096 | + if (beg + len >= buf + size || !WCHAR ((unsigned char) beg[len])) | |
1097 | + word_match = 1; | |
1098 | + if (word_match) | |
1099 | + { | |
1100 | + if (!exact) | |
1101 | + /* Returns the whole line now we know there's a word match. */ | |
1102 | + goto success; | |
1103 | + else | |
1104 | + /* Returns just this word match. */ | |
1105 | + goto success_in_beg_and_len; | |
1106 | + } | |
1107 | + if (len > 0) | |
1108 | + { | |
1109 | + /* Try a shorter length anchored at the same place. */ | |
1110 | + --len; | |
1111 | + offset = kwsexec (kwset, beg, len, &kwsmatch); | |
1112 | + | |
1113 | + if (offset == -1) | |
1114 | + goto next_char; /* Try a different anchor. */ | |
1115 | +#ifdef MBS_SUPPORT | |
1116 | + if (mb_cur_max > 1 && !using_utf8) | |
1117 | + { | |
1118 | + size_t bytes_left = offset; | |
1119 | + while (bytes_left) | |
1120 | + { | |
1121 | + size_t mlen = mbrlen (beg, bytes_left, &mbs); | |
1122 | + | |
1123 | + last_char = beg; | |
1124 | + if (mlen == (size_t) -1 || mlen == 0) | |
1125 | + { | |
1126 | + /* Invalid sequence or NUL character: treat as single-byte. */ | |
1127 | + memset (&mbs, '\0', sizeof (mbstate_t)); | |
1128 | + beg++; | |
1129 | + bytes_left--; | |
1130 | + continue; | |
1131 | + } | |
1132 | + | |
1133 | + if (mlen == (size_t) -2) | |
1134 | + { | |
1135 | + /* Offset points inside multibyte character: | |
1136 | + * no good. */ | |
1137 | + break; | |
1138 | + } | |
1139 | + | |
1140 | + beg += mlen; | |
1141 | + bytes_left -= mlen; | |
1142 | + } | |
1143 | + | |
1144 | + if (bytes_left) | |
1145 | + { | |
1146 | + memset (&mbs, '\0', sizeof (mbstate_t)); | |
1147 | + goto next_char; /* Try a different anchor. */ | |
1148 | + } | |
1149 | + } | |
1150 | + else | |
1151 | +#endif /* MBS_SUPPORT */ | |
1152 | + beg += offset; | |
1153 | +#ifdef MBS_SUPPORT | |
1154 | + /* The string at beg now matches first 3 chars of one of | |
1155 | + the search strings (less if there are shorter search | |
1156 | + strings). See if this is a real match. */ | |
1157 | + if (f_i_multibyte | |
1158 | + && Fimbexec (beg, len - offset, &kwsmatch.size[0], | |
1159 | + exact)) | |
1160 | + goto next_char; | |
1161 | +#endif /* MBS_SUPPORT */ | |
1162 | + len = kwsmatch.size[0]; | |
1163 | + } | |
1164 | + } | |
1165 | + } | |
1166 | else | |
1167 | goto success; | |
1168 | - } | |
1169 | - | |
1170 | +next_char:; | |
1171 | #ifdef MBS_SUPPORT | |
1172 | - if (MB_CUR_MAX > 1) | |
1173 | - free (mb_properties); | |
1174 | + /* Advance to next character. For MB_CUR_MAX == 1 case this is handled | |
1175 | + by ++beg above. */ | |
1176 | + if (mb_cur_max > 1) | |
1177 | + { | |
1178 | + if (using_utf8) | |
1179 | + { | |
1180 | + unsigned char c = *beg; | |
1181 | + if (c >= 0xc2) | |
1182 | + { | |
1183 | + if (c < 0xe0) | |
1184 | + ++beg; | |
1185 | + else if (c < 0xf0) | |
1186 | + beg += 2; | |
1187 | + else if (c < 0xf8) | |
1188 | + beg += 3; | |
1189 | + else if (c < 0xfc) | |
1190 | + beg += 4; | |
1191 | + else if (c < 0xfe) | |
1192 | + beg += 5; | |
1193 | + } | |
1194 | + } | |
1195 | + else | |
1196 | + { | |
1197 | + size_t l = mbrlen (beg, buf + size - beg, &mbs); | |
1198 | + | |
1199 | + last_char = beg; | |
1200 | + if (l + 2 >= 2) | |
1201 | + beg += l - 1; | |
1202 | + else | |
1203 | + memset (&mbs, '\0', sizeof (mbstate_t)); | |
1204 | + } | |
1205 | + } | |
1206 | #endif /* MBS_SUPPORT */ | |
1207 | + } | |
1208 | + | |
1209 | + failure: | |
1210 | return -1; | |
1211 | ||
1212 | success: | |
1213 | +#ifdef MBS_SUPPORT | |
1214 | + if (mb_cur_max > 1 && !using_utf8) | |
1215 | + { | |
1216 | + end = beg + len; | |
1217 | + while (end < buf + size) | |
1218 | + { | |
1219 | + size_t mlen = mbrlen (end, buf + size - end, &mbs); | |
1220 | + if (mlen == (size_t) -1 || mlen == (size_t) -2 || mlen == 0) | |
1221 | + { | |
1222 | + memset (&mbs, '\0', sizeof (mbstate_t)); | |
1223 | + mlen = 1; | |
1224 | + } | |
1225 | + if (mlen == 1 && *end == eol) | |
1226 | + break; | |
1227 | + | |
1228 | + end += mlen; | |
1229 | + } | |
1230 | + } | |
1231 | + else | |
1232 | +#endif /* MBS_SUPPORT */ | |
1233 | end = memchr (beg + len, eol, (buf + size) - (beg + len)); | |
1234 | + | |
1235 | end++; | |
1236 | while (buf < beg && beg[-1] != eol) | |
1237 | --beg; | |
1238 | - *match_size = end - beg; | |
1239 | -#ifdef MBS_SUPPORT | |
1240 | - if (MB_CUR_MAX > 1) | |
1241 | - free (mb_properties); | |
1242 | -#endif /* MBS_SUPPORT */ | |
1243 | + len = end - beg; | |
1244 | + /* FALLTHROUGH */ | |
1245 | + | |
1246 | + success_in_beg_and_len: | |
1247 | + *match_size = len; | |
1248 | return beg - buf; | |
1249 | } | |
1250 | ||
1251 | diff -urN grep-2.5.1a.orig/src/search.c.orig grep-2.5.1a/src/search.c.orig | |
1252 | --- grep-2.5.1a.orig/src/search.c.orig 1970-01-01 05:00:00.000000000 +0500 | |
1253 | +++ grep-2.5.1a/src/search.c.orig 2005-10-23 09:48:39.000000000 +0600 | |
1254 | @@ -0,0 +1,714 @@ | |
1255 | +/* search.c - searching subroutines using dfa, kwset and regex for grep. | |
1256 | + Copyright 1992, 1998, 2000 Free Software Foundation, Inc. | |
1257 | + | |
1258 | + This program is free software; you can redistribute it and/or modify | |
1259 | + it under the terms of the GNU General Public License as published by | |
1260 | + the Free Software Foundation; either version 2, or (at your option) | |
1261 | + any later version. | |
1262 | + | |
1263 | + This program is distributed in the hope that it will be useful, | |
1264 | + but WITHOUT ANY WARRANTY; without even the implied warranty of | |
1265 | + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
1266 | + GNU General Public License for more details. | |
1267 | + | |
1268 | + You should have received a copy of the GNU General Public License | |
1269 | + along with this program; if not, write to the Free Software | |
1270 | + Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA | |
1271 | + 02111-1307, USA. */ | |
1272 | + | |
1273 | +/* Written August 1992 by Mike Haertel. */ | |
1274 | + | |
1275 | +#ifdef HAVE_CONFIG_H | |
1276 | +# include <config.h> | |
1277 | +#endif | |
1278 | +#include <sys/types.h> | |
1279 | +#if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC | |
1280 | +/* We can handle multibyte string. */ | |
1281 | +# define MBS_SUPPORT | |
1282 | +# include <wchar.h> | |
1283 | +# include <wctype.h> | |
1284 | +#endif | |
1285 | + | |
1286 | +#include "system.h" | |
1287 | +#include "grep.h" | |
1288 | +#include "regex.h" | |
1289 | +#include "dfa.h" | |
1290 | +#include "kwset.h" | |
1291 | +#include "error.h" | |
1292 | +#include "xalloc.h" | |
1293 | +#ifdef HAVE_LIBPCRE | |
1294 | +# include <pcre.h> | |
1295 | +#endif | |
1296 | + | |
1297 | +#define NCHAR (UCHAR_MAX + 1) | |
1298 | + | |
1299 | +/* For -w, we also consider _ to be word constituent. */ | |
1300 | +#define WCHAR(C) (ISALNUM(C) || (C) == '_') | |
1301 | + | |
1302 | +/* DFA compiled regexp. */ | |
1303 | +static struct dfa dfa; | |
1304 | + | |
1305 | +/* The Regex compiled patterns. */ | |
1306 | +static struct patterns | |
1307 | +{ | |
1308 | + /* Regex compiled regexp. */ | |
1309 | + struct re_pattern_buffer regexbuf; | |
1310 | + struct re_registers regs; /* This is here on account of a BRAIN-DEAD | |
1311 | + Q@#%!# library interface in regex.c. */ | |
1312 | +} patterns0; | |
1313 | + | |
1314 | +struct patterns *patterns; | |
1315 | +size_t pcount; | |
1316 | + | |
1317 | +/* KWset compiled pattern. For Ecompile and Gcompile, we compile | |
1318 | + a list of strings, at least one of which is known to occur in | |
1319 | + any string matching the regexp. */ | |
1320 | +static kwset_t kwset; | |
1321 | + | |
1322 | +/* Number of compiled fixed strings known to exactly match the regexp. | |
1323 | + If kwsexec returns < kwset_exact_matches, then we don't need to | |
1324 | + call the regexp matcher at all. */ | |
1325 | +static int kwset_exact_matches; | |
1326 | + | |
1327 | +#if defined(MBS_SUPPORT) | |
1328 | +static char* check_multibyte_string PARAMS ((char const *buf, size_t size)); | |
1329 | +#endif | |
1330 | +static void kwsinit PARAMS ((void)); | |
1331 | +static void kwsmusts PARAMS ((void)); | |
1332 | +static void Gcompile PARAMS ((char const *, size_t)); | |
1333 | +static void Ecompile PARAMS ((char const *, size_t)); | |
1334 | +static size_t EGexecute PARAMS ((char const *, size_t, size_t *, int )); | |
1335 | +static void Fcompile PARAMS ((char const *, size_t)); | |
1336 | +static size_t Fexecute PARAMS ((char const *, size_t, size_t *, int)); | |
1337 | +static void Pcompile PARAMS ((char const *, size_t )); | |
1338 | +static size_t Pexecute PARAMS ((char const *, size_t, size_t *, int)); | |
1339 | + | |
1340 | +void | |
1341 | +dfaerror (char const *mesg) | |
1342 | +{ | |
1343 | + error (2, 0, mesg); | |
1344 | +} | |
1345 | + | |
1346 | +static void | |
1347 | +kwsinit (void) | |
1348 | +{ | |
1349 | + static char trans[NCHAR]; | |
1350 | + int i; | |
1351 | + | |
1352 | + if (match_icase) | |
1353 | + for (i = 0; i < NCHAR; ++i) | |
1354 | + trans[i] = TOLOWER (i); | |
1355 | + | |
1356 | + if (!(kwset = kwsalloc (match_icase ? trans : (char *) 0))) | |
1357 | + error (2, 0, _("memory exhausted")); | |
1358 | +} | |
1359 | + | |
1360 | +/* If the DFA turns out to have some set of fixed strings one of | |
1361 | + which must occur in the match, then we build a kwset matcher | |
1362 | + to find those strings, and thus quickly filter out impossible | |
1363 | + matches. */ | |
1364 | +static void | |
1365 | +kwsmusts (void) | |
1366 | +{ | |
1367 | + struct dfamust const *dm; | |
1368 | + char const *err; | |
1369 | + | |
1370 | + if (dfa.musts) | |
1371 | + { | |
1372 | + kwsinit (); | |
1373 | + /* First, we compile in the substrings known to be exact | |
1374 | + matches. The kwset matcher will return the index | |
1375 | + of the matching string that it chooses. */ | |
1376 | + for (dm = dfa.musts; dm; dm = dm->next) | |
1377 | + { | |
1378 | + if (!dm->exact) | |
1379 | + continue; | |
1380 | + ++kwset_exact_matches; | |
1381 | + if ((err = kwsincr (kwset, dm->must, strlen (dm->must))) != 0) | |
1382 | + error (2, 0, err); | |
1383 | + } | |
1384 | + /* Now, we compile the substrings that will require | |
1385 | + the use of the regexp matcher. */ | |
1386 | + for (dm = dfa.musts; dm; dm = dm->next) | |
1387 | + { | |
1388 | + if (dm->exact) | |
1389 | + continue; | |
1390 | + if ((err = kwsincr (kwset, dm->must, strlen (dm->must))) != 0) | |
1391 | + error (2, 0, err); | |
1392 | + } | |
1393 | + if ((err = kwsprep (kwset)) != 0) | |
1394 | + error (2, 0, err); | |
1395 | + } | |
1396 | +} | |
1397 | + | |
1398 | +#ifdef MBS_SUPPORT | |
1399 | +/* Allocate an array with one entry per byte of "buf", then scan "buf" | |
1400 | + as a multibyte string: the entry for the first byte of each character | |
1401 | + is set to that character's length in bytes; all other entries stay 0. | |
1402 | + Caller must free the array. */ | |
1403 | +static char* | |
1404 | +check_multibyte_string(char const *buf, size_t size) | |
1405 | +{ | |
1406 | + char *mb_properties = malloc(size); | |
1407 | + mbstate_t cur_state; | |
1408 | + int i; | |
1409 | + memset(&cur_state, 0, sizeof(mbstate_t)); | |
1410 | + memset(mb_properties, 0, sizeof(char)*size); | |
1411 | + for (i = 0; i < size ;) | |
1412 | + { | |
1413 | + size_t mbclen; | |
1414 | + mbclen = mbrlen(buf + i, size - i, &cur_state); | |
1415 | + | |
1416 | + if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0) | |
1417 | + { | |
1418 | + /* An invalid sequence, or a truncated multibyte character. | |
1419 | + We treat it as a singlebyte character. */ | |
1420 | + mbclen = 1; | |
1421 | + } | |
1422 | + mb_properties[i] = mbclen; | |
1423 | + i += mbclen; | |
1424 | + } | |
1425 | + | |
1426 | + return mb_properties; | |
1427 | +} | |
1428 | +#endif | |
1429 | + | |
1430 | +static void | |
1431 | +Gcompile (char const *pattern, size_t size) | |
1432 | +{ | |
1433 | + const char *err; | |
1434 | + char const *sep; | |
1435 | + size_t total = size; | |
1436 | + char const *motif = pattern; | |
1437 | + | |
1438 | + re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE); | |
1439 | + dfasyntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE, match_icase, eolbyte); | |
1440 | + | |
1441 | + /* For GNU regex compiler we have to pass the patterns separately to detect | |
1442 | + errors like "[\nallo\n]\n". The patterns here are "[", "allo" and "]" | |
1443 | + GNU regex should have raised a syntax error. The same for backref, where | |
1444 | + the backref should have been local to each pattern. */ | |
1445 | + do | |
1446 | + { | |
1447 | + size_t len; | |
1448 | + sep = memchr (motif, '\n', total); | |
1449 | + if (sep) | |
1450 | + { | |
1451 | + len = sep - motif; | |
1452 | + sep++; | |
1453 | + total -= (len + 1); | |
1454 | + } | |
1455 | + else | |
1456 | + { | |
1457 | + len = total; | |
1458 | + total = 0; | |
1459 | + } | |
1460 | + | |
1461 | + patterns = realloc (patterns, (pcount + 1) * sizeof (*patterns)); | |
1462 | + if (patterns == NULL) | |
1463 | + error (2, errno, _("memory exhausted")); | |
1464 | + | |
1465 | + patterns[pcount] = patterns0; | |
1466 | + | |
1467 | + if ((err = re_compile_pattern (motif, len, | |
1468 | + &(patterns[pcount].regexbuf))) != 0) | |
1469 | + error (2, 0, err); | |
1470 | + pcount++; | |
1471 | + | |
1472 | + motif = sep; | |
1473 | + } while (sep && total != 0); | |
1474 | + | |
1475 | + /* In the match_words and match_lines cases, we use a different pattern | |
1476 | + for the DFA matcher that will quickly throw out cases that won't work. | |
1477 | + Then if DFA succeeds we do some hairy stuff using the regex matcher | |
1478 | + to decide whether the match should really count. */ | |
1479 | + if (match_words || match_lines) | |
1480 | + { | |
1481 | + /* In the whole-word case, we use the pattern: | |
1482 | + \(^\|[^[:alnum:]_]\)\(userpattern\)\([^[:alnum:]_]|$\). | |
1483 | + In the whole-line case, we use the pattern: | |
1484 | + ^\(userpattern\)$. */ | |
1485 | + | |
1486 | + static char const line_beg[] = "^\\("; | |
1487 | + static char const line_end[] = "\\)$"; | |
1488 | + static char const word_beg[] = "\\(^\\|[^[:alnum:]_]\\)\\("; | |
1489 | + static char const word_end[] = "\\)\\([^[:alnum:]_]\\|$\\)"; | |
1490 | + char *n = malloc (sizeof word_beg - 1 + size + sizeof word_end); | |
1491 | + size_t i; | |
1492 | + strcpy (n, match_lines ? line_beg : word_beg); | |
1493 | + i = strlen (n); | |
1494 | + memcpy (n + i, pattern, size); | |
1495 | + i += size; | |
1496 | + strcpy (n + i, match_lines ? line_end : word_end); | |
1497 | + i += strlen (n + i); | |
1498 | + pattern = n; | |
1499 | + size = i; | |
1500 | + } | |
1501 | + | |
1502 | + dfacomp (pattern, size, &dfa, 1); | |
1503 | + kwsmusts (); | |
1504 | +} | |
1505 | + | |
1506 | +static void | |
1507 | +Ecompile (char const *pattern, size_t size) | |
1508 | +{ | |
1509 | + const char *err; | |
1510 | + const char *sep; | |
1511 | + size_t total = size; | |
1512 | + char const *motif = pattern; | |
1513 | + | |
1514 | + if (strcmp (matcher, "awk") == 0) | |
1515 | + { | |
1516 | + re_set_syntax (RE_SYNTAX_AWK); | |
1517 | + dfasyntax (RE_SYNTAX_AWK, match_icase, eolbyte); | |
1518 | + } | |
1519 | + else | |
1520 | + { | |
1521 | + re_set_syntax (RE_SYNTAX_POSIX_EGREP); | |
1522 | + dfasyntax (RE_SYNTAX_POSIX_EGREP, match_icase, eolbyte); | |
1523 | + } | |
1524 | + | |
1525 | + /* For GNU regex compiler we have to pass the patterns separately to detect | |
1526 | + errors like "[\nallo\n]\n". The patterns here are "[", "allo" and "]" | |
1527 | + GNU regex should have raised a syntax error. The same for backref, where | |
1528 | + the backref should have been local to each pattern. */ | |
1529 | + do | |
1530 | + { | |
1531 | + size_t len; | |
1532 | + sep = memchr (motif, '\n', total); | |
1533 | + if (sep) | |
1534 | + { | |
1535 | + len = sep - motif; | |
1536 | + sep++; | |
1537 | + total -= (len + 1); | |
1538 | + } | |
1539 | + else | |
1540 | + { | |
1541 | + len = total; | |
1542 | + total = 0; | |
1543 | + } | |
1544 | + | |
1545 | + patterns = realloc (patterns, (pcount + 1) * sizeof (*patterns)); | |
1546 | + if (patterns == NULL) | |
1547 | + error (2, errno, _("memory exhausted")); | |
1548 | + patterns[pcount] = patterns0; | |
1549 | + | |
1550 | + if ((err = re_compile_pattern (motif, len, | |
1551 | + &(patterns[pcount].regexbuf))) != 0) | |
1552 | + error (2, 0, err); | |
1553 | + pcount++; | |
1554 | + | |
1555 | + motif = sep; | |
1556 | + } while (sep && total != 0); | |
1557 | + | |
1558 | + /* In the match_words and match_lines cases, we use a different pattern | |
1559 | + for the DFA matcher that will quickly throw out cases that won't work. | |
1560 | + Then if DFA succeeds we do some hairy stuff using the regex matcher | |
1561 | + to decide whether the match should really count. */ | |
1562 | + if (match_words || match_lines) | |
1563 | + { | |
1564 | + /* In the whole-word case, we use the pattern: | |
1565 | + (^|[^[:alnum:]_])(userpattern)([^[:alnum:]_]|$). | |
1566 | + In the whole-line case, we use the pattern: | |
1567 | + ^(userpattern)$. */ | |
1568 | + | |
1569 | + static char const line_beg[] = "^("; | |
1570 | + static char const line_end[] = ")$"; | |
1571 | + static char const word_beg[] = "(^|[^[:alnum:]_])("; | |
1572 | + static char const word_end[] = ")([^[:alnum:]_]|$)"; | |
1573 | + char *n = malloc (sizeof word_beg - 1 + size + sizeof word_end); | |
1574 | + size_t i; | |
1575 | + strcpy (n, match_lines ? line_beg : word_beg); | |
1576 | + i = strlen(n); | |
1577 | + memcpy (n + i, pattern, size); | |
1578 | + i += size; | |
1579 | + strcpy (n + i, match_lines ? line_end : word_end); | |
1580 | + i += strlen (n + i); | |
1581 | + pattern = n; | |
1582 | + size = i; | |
1583 | + } | |
1584 | + | |
1585 | + dfacomp (pattern, size, &dfa, 1); | |
1586 | + kwsmusts (); | |
1587 | +} | |
1588 | + | |
1589 | +static size_t | |
1590 | +EGexecute (char const *buf, size_t size, size_t *match_size, int exact) | |
1591 | +{ | |
1592 | + register char const *buflim, *beg, *end; | |
1593 | + char eol = eolbyte; | |
1594 | + int backref, start, len; | |
1595 | + struct kwsmatch kwsm; | |
1596 | + size_t i; | |
1597 | +#ifdef MBS_SUPPORT | |
1598 | + char *mb_properties = NULL; | |
1599 | +#endif /* MBS_SUPPORT */ | |
1600 | + | |
1601 | +#ifdef MBS_SUPPORT | |
1602 | + if (MB_CUR_MAX > 1 && kwset) | |
1603 | + mb_properties = check_multibyte_string(buf, size); | |
1604 | +#endif /* MBS_SUPPORT */ | |
1605 | + | |
1606 | + buflim = buf + size; | |
1607 | + | |
1608 | + for (beg = end = buf; end < buflim; beg = end) | |
1609 | + { | |
1610 | + if (!exact) | |
1611 | + { | |
1612 | + if (kwset) | |
1613 | + { | |
1614 | + /* Find a possible match using the KWset matcher. */ | |
1615 | + size_t offset = kwsexec (kwset, beg, buflim - beg, &kwsm); | |
1616 | + if (offset == (size_t) -1) | |
1617 | + goto failure; | |
1618 | + beg += offset; | |
1619 | + /* Narrow down to the line containing the candidate, and | |
1620 | + run it through DFA. */ | |
1621 | + end = memchr(beg, eol, buflim - beg); | |
1622 | + end++; | |
1623 | +#ifdef MBS_SUPPORT | |
1624 | + if (MB_CUR_MAX > 1 && mb_properties[beg - buf] == 0) | |
1625 | + continue; | |
1626 | +#endif | |
1627 | + while (beg > buf && beg[-1] != eol) | |
1628 | + --beg; | |
1629 | + if (kwsm.index < kwset_exact_matches) | |
1630 | + goto success_in_beg_and_end; | |
1631 | + if (dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1) | |
1632 | + continue; | |
1633 | + } | |
1634 | + else | |
1635 | + { | |
1636 | + /* No good fixed strings; start with DFA. */ | |
1637 | + size_t offset = dfaexec (&dfa, beg, buflim - beg, &backref); | |
1638 | + if (offset == (size_t) -1) | |
1639 | + break; | |
1640 | + /* Narrow down to the line we've found. */ | |
1641 | + beg += offset; | |
1642 | + end = memchr (beg, eol, buflim - beg); | |
1643 | + end++; | |
1644 | + while (beg > buf && beg[-1] != eol) | |
1645 | + --beg; | |
1646 | + } | |
1647 | + /* Successful, no backreferences encountered! */ | |
1648 | + if (!backref) | |
1649 | + goto success_in_beg_and_end; | |
1650 | + } | |
1651 | + else | |
1652 | + end = beg + size; | |
1653 | + | |
1654 | + /* If we've made it to this point, this means DFA has seen | |
1655 | + a probable match, and we need to run it through Regex. */ | |
1656 | + for (i = 0; i < pcount; i++) | |
1657 | + { | |
1658 | + patterns[i].regexbuf.not_eol = 0; | |
1659 | + if (0 <= (start = re_search (&(patterns[i].regexbuf), beg, | |
1660 | + end - beg - 1, 0, | |
1661 | + end - beg - 1, &(patterns[i].regs)))) | |
1662 | + { | |
1663 | + len = patterns[i].regs.end[0] - start; | |
1664 | + if (exact && !match_words) | |
1665 | + goto success_in_start_and_len; | |
1666 | + if ((!match_lines && !match_words) | |
1667 | + || (match_lines && len == end - beg - 1)) | |
1668 | + goto success_in_beg_and_end; | |
1669 | + /* If -w, check if the match aligns with word boundaries. | |
1670 | + We do this iteratively because: | |
1671 | + (a) the line may contain more than one occurrence of the | |
1672 | + pattern, and | |
1673 | + (b) Several alternatives in the pattern might be valid at a | |
1674 | + given point, and we may need to consider a shorter one to | |
1675 | + find a word boundary. */ | |
1676 | + if (match_words) | |
1677 | + while (start >= 0) | |
1678 | + { | |
1679 | + if ((start == 0 || !WCHAR ((unsigned char) beg[start - 1])) | |
1680 | + && (len == end - beg - 1 | |
1681 | + || !WCHAR ((unsigned char) beg[start + len]))) | |
1682 | + goto success_in_beg_and_end; | |
1683 | + if (len > 0) | |
1684 | + { | |
1685 | + /* Try a shorter length anchored at the same place. */ | |
1686 | + --len; | |
1687 | + patterns[i].regexbuf.not_eol = 1; | |
1688 | + len = re_match (&(patterns[i].regexbuf), beg, | |
1689 | + start + len, start, | |
1690 | + &(patterns[i].regs)); | |
1691 | + } | |
1692 | + if (len <= 0) | |
1693 | + { | |
1694 | + /* Try looking further on. */ | |
1695 | + if (start == end - beg - 1) | |
1696 | + break; | |
1697 | + ++start; | |
1698 | + patterns[i].regexbuf.not_eol = 0; | |
1699 | + start = re_search (&(patterns[i].regexbuf), beg, | |
1700 | + end - beg - 1, | |
1701 | + start, end - beg - 1 - start, | |
1702 | + &(patterns[i].regs)); | |
1703 | + len = patterns[i].regs.end[0] - start; | |
1704 | + } | |
1705 | + } | |
1706 | + } | |
1707 | + } /* for Regex patterns. */ | |
1708 | + } /* for (beg = end ..) */ | |
1709 | + | |
1710 | + failure: | |
1711 | +#ifdef MBS_SUPPORT | |
1712 | + if (MB_CUR_MAX > 1 && mb_properties) | |
1713 | + free (mb_properties); | |
1714 | +#endif /* MBS_SUPPORT */ | |
1715 | + return (size_t) -1; | |
1716 | + | |
1717 | + success_in_beg_and_end: | |
1718 | + len = end - beg; | |
1719 | + start = beg - buf; | |
1720 | + /* FALLTHROUGH */ | |
1721 | + | |
1722 | + success_in_start_and_len: | |
1723 | +#ifdef MBS_SUPPORT | |
1724 | + if (MB_CUR_MAX > 1 && mb_properties) | |
1725 | + free (mb_properties); | |
1726 | +#endif /* MBS_SUPPORT */ | |
1727 | + *match_size = len; | |
1728 | + return start; | |
1729 | +} | |
1730 | + | |
1731 | +static void | |
1732 | +Fcompile (char const *pattern, size_t size) | |
1733 | +{ | |
1734 | + char const *beg, *lim, *err; | |
1735 | + | |
1736 | + kwsinit (); | |
1737 | + beg = pattern; | |
1738 | + do | |
1739 | + { | |
1740 | + for (lim = beg; lim < pattern + size && *lim != '\n'; ++lim) | |
1741 | + ; | |
1742 | + if ((err = kwsincr (kwset, beg, lim - beg)) != 0) | |
1743 | + error (2, 0, err); | |
1744 | + if (lim < pattern + size) | |
1745 | + ++lim; | |
1746 | + beg = lim; | |
1747 | + } | |
1748 | + while (beg < pattern + size); | |
1749 | + | |
1750 | + if ((err = kwsprep (kwset)) != 0) | |
1751 | + error (2, 0, err); | |
1752 | +} | |
1753 | + | |
1754 | +static size_t | |
1755 | +Fexecute (char const *buf, size_t size, size_t *match_size, int exact) | |
1756 | +{ | |
1757 | + register char const *beg, *try, *end; | |
1758 | + register size_t len; | |
1759 | + char eol = eolbyte; | |
1760 | + struct kwsmatch kwsmatch; | |
1761 | +#ifdef MBS_SUPPORT | |
1762 | + char *mb_properties; | |
1763 | + if (MB_CUR_MAX > 1) | |
1764 | + mb_properties = check_multibyte_string (buf, size); | |
1765 | +#endif /* MBS_SUPPORT */ | |
1766 | + | |
1767 | + for (beg = buf; beg <= buf + size; ++beg) | |
1768 | + { | |
1769 | + size_t offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch); | |
1770 | + if (offset == (size_t) -1) | |
1771 | + goto failure; | |
1772 | +#ifdef MBS_SUPPORT | |
1773 | + if (MB_CUR_MAX > 1 && mb_properties[offset+beg-buf] == 0) | |
1774 | + continue; /* It is a part of multibyte character. */ | |
1775 | +#endif /* MBS_SUPPORT */ | |
1776 | + beg += offset; | |
1777 | + len = kwsmatch.size[0]; | |
1778 | + if (exact && !match_words) | |
1779 | + goto success_in_beg_and_len; | |
1780 | + if (match_lines) | |
1781 | + { | |
1782 | + if (beg > buf && beg[-1] != eol) | |
1783 | + continue; | |
1784 | + if (beg + len < buf + size && beg[len] != eol) | |
1785 | + continue; | |
1786 | + goto success; | |
1787 | + } | |
1788 | + else if (match_words) | |
1789 | + for (try = beg; len; ) | |
1790 | + { | |
1791 | + if (try > buf && WCHAR((unsigned char) try[-1])) | |
1792 | + break; | |
1793 | + if (try + len < buf + size && WCHAR((unsigned char) try[len])) | |
1794 | + { | |
1795 | + offset = kwsexec (kwset, beg, --len, &kwsmatch); | |
1796 | + if (offset == (size_t) -1) | |
1797 | + { | |
1798 | +#ifdef MBS_SUPPORT | |
1799 | + if (MB_CUR_MAX > 1) | |
1800 | + free (mb_properties); | |
1801 | +#endif /* MBS_SUPPORT */ | |
1802 | + return offset; | |
1803 | + } | |
1804 | + try = beg + offset; | |
1805 | + len = kwsmatch.size[0]; | |
1806 | + } | |
1807 | + else | |
1808 | + goto success; | |
1809 | + } | |
1810 | + else | |
1811 | + goto success; | |
1812 | + } | |
1813 | + | |
1814 | + failure: | |
1815 | +#ifdef MBS_SUPPORT | |
1816 | + if (MB_CUR_MAX > 1) | |
1817 | + free (mb_properties); | |
1818 | +#endif /* MBS_SUPPORT */ | |
1819 | + return -1; | |
1820 | + | |
1821 | + success: | |
1822 | + end = memchr (beg + len, eol, (buf + size) - (beg + len)); | |
1823 | + end++; | |
1824 | + while (buf < beg && beg[-1] != eol) | |
1825 | + --beg; | |
1826 | + len = end - beg; | |
1827 | + /* FALLTHROUGH */ | |
1828 | + | |
1829 | + success_in_beg_and_len: | |
1830 | + *match_size = len; | |
1831 | +#ifdef MBS_SUPPORT | |
1832 | + if (MB_CUR_MAX > 1) | |
1833 | + free (mb_properties); | |
1834 | +#endif /* MBS_SUPPORT */ | |
1835 | + return beg - buf; | |
1836 | +} | |
1837 | + | |
1838 | +#if HAVE_LIBPCRE | |
1839 | +/* Compiled internal form of a Perl regular expression. */ | |
1840 | +static pcre *cre; | |
1841 | + | |
1842 | +/* Additional information about the pattern. */ | |
1843 | +static pcre_extra *extra; | |
1844 | +#endif | |
1845 | + | |
1846 | +static void | |
1847 | +Pcompile (char const *pattern, size_t size) | |
1848 | +{ | |
1849 | +#if !HAVE_LIBPCRE | |
1850 | + error (2, 0, _("The -P option is not supported")); | |
1851 | +#else | |
1852 | + int e; | |
1853 | + char const *ep; | |
1854 | + char *re = xmalloc (4 * size + 7); | |
1855 | + int flags = PCRE_MULTILINE | (match_icase ? PCRE_CASELESS : 0); | |
1856 | + char const *patlim = pattern + size; | |
1857 | + char *n = re; | |
1858 | + char const *p; | |
1859 | + char const *pnul; | |
1860 | + | |
1861 | + /* FIXME: Remove this restriction. */ | |
1862 | + if (eolbyte != '\n') | |
1863 | + error (2, 0, _("The -P and -z options cannot be combined")); | |
1864 | + | |
1865 | + *n = '\0'; | |
1866 | + if (match_lines) | |
1867 | + strcpy (n, "^("); | |
1868 | + if (match_words) | |
1869 | + strcpy (n, "\\b("); | |
1870 | + n += strlen (n); | |
1871 | + | |
1872 | + /* The PCRE interface doesn't allow NUL bytes in the pattern, so | |
1873 | + replace each NUL byte in the pattern with the four characters | |
1874 | + "\000", removing a preceding backslash if there are an odd | |
1875 | + number of backslashes before the NUL. | |
1876 | + | |
1877 | + FIXME: This method does not work with some multibyte character | |
1878 | + encodings, notably Shift-JIS, where a multibyte character can end | |
1879 | + in a backslash byte. */ | |
1880 | + for (p = pattern; (pnul = memchr (p, '\0', patlim - p)); p = pnul + 1) | |
1881 | + { | |
1882 | + memcpy (n, p, pnul - p); | |
1883 | + n += pnul - p; | |
1884 | + for (p = pnul; pattern < p && p[-1] == '\\'; p--) | |
1885 | + continue; | |
1886 | + n -= (pnul - p) & 1; | |
1887 | + strcpy (n, "\\000"); | |
1888 | + n += 4; | |
1889 | + } | |
1890 | + | |
1891 | + memcpy (n, p, patlim - p); | |
1892 | + n += patlim - p; | |
1893 | + *n = '\0'; | |
1894 | + if (match_words) | |
1895 | + strcpy (n, ")\\b"); | |
1896 | + if (match_lines) | |
1897 | + strcpy (n, ")$"); | |
1898 | + | |
1899 | + cre = pcre_compile (re, flags, &ep, &e, pcre_maketables ()); | |
1900 | + if (!cre) | |
1901 | + error (2, 0, ep); | |
1902 | + | |
1903 | + extra = pcre_study (cre, 0, &ep); | |
1904 | + if (ep) | |
1905 | + error (2, 0, ep); | |
1906 | + | |
1907 | + free (re); | |
1908 | +#endif | |
1909 | +} | |
1910 | +/* Search BUF (SIZE bytes) with the pattern compiled by Pcompile.  Return the offset of the match and store its length in *MATCH_SIZE, or return -1 if nothing matches; unless EXACT, the match is widened to the whole line(s) containing it. */ | |
1911 | +static size_t | |
1912 | +Pexecute (char const *buf, size_t size, size_t *match_size, int exact) | |
1913 | +{ | |
1914 | +#if !HAVE_LIBPCRE | |
1915 | + abort (); | |
1916 | + return -1; | |
1917 | +#else | |
1918 | + /* This array must have at least two elements; everything after that | |
1919 | + is just for performance improvement in pcre_exec. */ | |
1920 | + int sub[300]; | |
1921 | + | |
1922 | + int e = pcre_exec (cre, extra, buf, size, 0, 0, | |
1923 | + sub, sizeof sub / sizeof *sub); | |
1924 | + | |
1925 | + if (e <= 0) | |
1926 | + { | |
1927 | + switch (e) | |
1928 | + { | |
1929 | + case PCRE_ERROR_NOMATCH: | |
1930 | + return -1; | |
1931 | + | |
1932 | + case PCRE_ERROR_NOMEMORY: | |
1933 | + error (2, 0, _("Memory exhausted")); /* NOTE(review): presumably does not return; if it did, control would fall through to abort (). */ | |
1934 | + | |
1935 | + default: | |
1936 | + abort (); | |
1937 | + } | |
1938 | + } | |
1939 | + else | |
1940 | + { | |
1941 | + /* Narrow down to the line we've found. */ | |
1942 | + char const *beg = buf + sub[0]; | |
1943 | + char const *end = buf + sub[1]; | |
1944 | + char const *buflim = buf + size; | |
1945 | + char eol = eolbyte; | |
1946 | + if (!exact) | |
1947 | + { | |
1948 | + end = memchr (end, eol, buflim - end); | |
1949 | + end = end ? end + 1 : buflim; /* No EOL byte after the match: stop at the end of BUF instead of incrementing a null pointer. */ | |
1950 | + while (buf < beg && beg[-1] != eol) | |
1951 | + --beg; | |
1952 | + } | |
1953 | + | |
1954 | + *match_size = end - beg; | |
1955 | + return beg - buf; | |
1956 | + } | |
1957 | +#endif | |
1958 | +} | |
1959 | +/* Matcher table: each entry pairs a matcher name with its compile and execute functions.  The final entry has an empty name and null functions, presumably as the end-of-table marker. */ | |
1960 | +struct matcher const matchers[] = { | |
1961 | + { "default", Gcompile, EGexecute }, | |
1962 | + { "grep", Gcompile, EGexecute }, | |
1963 | + { "egrep", Ecompile, EGexecute }, | |
1964 | + { "awk", Ecompile, EGexecute }, | |
1965 | + { "fgrep", Fcompile, Fexecute }, | |
1966 | + { "perl", Pcompile, Pexecute }, | |
1967 | + { "", 0, 0 }, | |
1968 | +}; | |
1969 | diff -urN grep-2.5.1a.orig/tests/fmbtest.sh grep-2.5.1a/tests/fmbtest.sh | |
1970 | --- grep-2.5.1a.orig/tests/fmbtest.sh 1970-01-01 05:00:00.000000000 +0500 | |
1971 | +++ grep-2.5.1a/tests/fmbtest.sh 2005-10-23 09:51:12.000000000 +0600 | |
1972 | @@ -0,0 +1,111 @@ | |
1973 | +#!/bin/sh | |
1974 | +# Multibyte matching tests for grep -F, -G and -E, run under the cs_CZ.UTF-8 locale. | |
1975 | +: ${srcdir=.} | |
1976 | + | |
1977 | +# If the cs_CZ.UTF-8 locale doesn't work, skip this test silently by exiting with status 77. | |
1978 | +LC_ALL=cs_CZ.UTF-8 locale -k LC_CTYPE 2>/dev/null | ${GREP} -q charmap.*UTF-8 \ | |
1979 | + || exit 77 | |
1980 | + | |
1981 | +failures=0 | |
1982 | +# Input fixture: every line carries a two-digit tag that the tests below extract with sed. | |
1983 | +cat > csinput <<EOF | |
1984 | +01 Žluťoučká číše | |
1985 | +ČíŠE 02 | |
1986 | +03 Z číší Čiší cosi | |
1987 | +04 Čí | |
1988 | +Še 05 | |
1989 | +06 ČČČČČČČíšČÍŠčíš | |
1990 | +07 ČČČ ČČČČíšČÍŠčíšEEEE | |
1991 | +čAs 08 | |
1992 | +09Čapka | |
1993 | +10ČaSy se měnÍ | |
1994 | +ČÍšE11 | |
1995 | +Čas12 | |
1996 | +𝇕ČÍšE𝇓13 | |
1997 | +ŽČÍšE𝇓14 | |
1998 | +𝇕ČÍšEŽ15 | |
1999 | +ŽČÍšEŽ16 | |
2000 | +ČÍšE𝇓17 | |
2001 | +ČÍšEŽ18 | |
2002 | +19𝇕ČÍše | |
2003 | +20ŽČÍše | |
2004 | +EOF | |
2005 | +cat > cspatfile <<EOF | |
2006 | +ČÍšE | |
2007 | +Čas | |
2008 | +EOF | |
2009 | + | |
2010 | +for mode in F G E; do | |
2011 | +# Test 1 matches the patterns in cspatfile case-sensitively; tests 2-4 add -i (patterns from the file, then from -e) and -iw. | |
2012 | +test1="$(echo `LC_ALL=cs_CZ.UTF-8 ${GREP} -${mode} -f cspatfile csinput \ | |
2013 | + | LC_ALL=C sed 's/^.*\([0-9][0-9]\).*$/\1/'`)" | |
2014 | +if test "$test1" != "11 12 13 14 15 16 17 18"; then | |
2015 | + echo "Test #1 ${mode} failed: $test1" | |
2016 | + failures=1 | |
2017 | +fi | |
2018 | + | |
2019 | +test2="$(echo `LC_ALL=cs_CZ.UTF-8 ${GREP} -${mode}i -f cspatfile csinput \ | |
2020 | + | LC_ALL=C sed 's/^.*\([0-9][0-9]\).*$/\1/'`)" | |
2021 | +if test "$test2" != "01 02 07 08 10 11 12 13 14 15 16 17 18 19 20"; then | |
2022 | + echo "Test #2 ${mode} failed: $test2" | |
2023 | + failures=1 | |
2024 | +fi | |
2025 | + | |
2026 | +test3="$(echo `LC_ALL=cs_CZ.UTF-8 ${GREP} -${mode}i -e 'ČÍšE' -e 'Čas' csinput \ | |
2027 | + | LC_ALL=C sed 's/^.*\([0-9][0-9]\).*$/\1/'`)" | |
2028 | +if test "$test3" != "01 02 07 08 10 11 12 13 14 15 16 17 18 19 20"; then | |
2029 | + echo "Test #3 ${mode} failed: $test3" | |
2030 | + failures=1 | |
2031 | +fi | |
2032 | + | |
2033 | +test4="$(echo `LC_ALL=cs_CZ.UTF-8 ${GREP} -${mode}iw -f cspatfile csinput \ | |
2034 | + | LC_ALL=C sed 's/^.*\([0-9][0-9]\).*$/\1/'`)" | |
2035 | +if test "$test4" != "01 02 08 13 17 19"; then | |
2036 | + echo "Test #4 ${mode} failed: $test4" | |
2037 | + failures=1 | |
2038 | +fi | |
2039 | + | |
2040 | +done | |
2041 | + | |
2042 | +# Test that -F --color=always prefers longer matches: with patterns 'čiš' and 'čiší', the whole 'ČišÍ' must be highlighted. | |
2043 | +test5="`echo 'Cosi tu ČišÍ...' \ | |
2044 | + | LC_ALL=cs_CZ.UTF-8 ${GREP} --color=always -Fi -e 'čiš' -e 'čiší'`" | |
2045 | +if echo "$test5" | LC_ALL=C ${GREP} -q 'Cosi tu .*\[.*mČišÍ.*\[.*m\(.\[K\)\?\.\.\.'; then | |
2046 | + : | |
2047 | +else | |
2048 | + echo "Test #5 F failed: $test5" | |
2049 | + failures=1 | |
2050 | +fi | |
2051 | + | |
2052 | +for mode in G E; do | |
2053 | + | |
2054 | +# Test that -{G,E} --color=always prefers earlier pattern matches. | |
2055 | +test6="`echo 'Cosi tu ČišÍ...' \ | |
2056 | + | LC_ALL=cs_CZ.UTF-8 ${GREP} --color=always -${mode}i -e 'čiš' -e 'čiší'`" | |
2057 | +if echo "$test6" | LC_ALL=C ${GREP} -q 'Cosi tu .*\[.*mČiš.*\[.*m\(.\[K\)\?Í\.\.\.'; then | |
2058 | + : | |
2059 | +else | |
2060 | + echo "Test #6 ${mode} failed: $test6" | |
2061 | + failures=1 | |
2062 | +fi | |
2063 | + | |
2064 | +# Same check with the patterns swapped: the earlier pattern is now the longer one, so the whole 'ČišÍ' must be highlighted. | |
2065 | +test7="`echo 'Cosi tu ČišÍ...' \ | |
2066 | + | LC_ALL=cs_CZ.UTF-8 ${GREP} --color=always -${mode}i -e 'čiší' -e 'čiš'`" | |
2067 | +if echo "$test7" | LC_ALL=C ${GREP} -q 'Cosi tu .*\[.*mČišÍ.*\[.*m\(.\[K\)\?\.\.\.'; then | |
2068 | + : | |
2069 | +else | |
2070 | + echo "Test #7 ${mode} failed: $test7" | |
2071 | + failures=1 | |
2072 | +fi | |
2073 | +# Test -i with '.' and a bracket expression in the patterns. | |
2074 | +test8="$(echo `LC_ALL=cs_CZ.UTF-8 ${GREP} -${mode}i -e 'Č.šE' -e 'Č[a-f]s' csinput \ | |
2075 | + | LC_ALL=C sed 's/^.*\([0-9][0-9]\).*$/\1/'`)" | |
2076 | +if test "$test8" != "01 02 07 08 10 11 12 13 14 15 16 17 18 19 20"; then | |
2077 | + echo "Test #8 ${mode} failed: $test8" | |
2078 | + failures=1 | |
2079 | +fi | |
2080 | + | |
2081 | +done | |
2082 | + | |
2083 | +exit $failures | |
2084 | diff -urN grep-2.5.1a.orig/tests/Makefile.am grep-2.5.1a/tests/Makefile.am | |
2085 | --- grep-2.5.1a.orig/tests/Makefile.am 2001-03-07 09:11:27.000000000 +0500 | |
2086 | +++ grep-2.5.1a/tests/Makefile.am 2005-10-23 09:51:12.000000000 +0600 | |
2087 | @@ -3,7 +3,8 @@ | |
2088 | AWK=@AWK@ | |
2089 | ||
2090 | TESTS = warning.sh khadafy.sh spencer1.sh bre.sh ere.sh \ | |
2091 | - status.sh empty.sh options.sh backref.sh file.sh | |
2092 | + status.sh empty.sh options.sh backref.sh file.sh \ | |
2093 | + fmbtest.sh | |
2094 | EXTRA_DIST = $(TESTS) \ | |
2095 | khadafy.lines khadafy.regexp \ | |
2096 | spencer1.awk spencer1.tests \ | |
2097 | diff -urN grep-2.5.1a.orig/tests/Makefile.in grep-2.5.1a/tests/Makefile.in | |
2098 | --- grep-2.5.1a.orig/tests/Makefile.in 2002-03-26 21:09:36.000000000 +0500 | |
2099 | +++ grep-2.5.1a/tests/Makefile.in 2005-10-23 09:51:13.000000000 +0600 | |
2100 | @@ -97,7 +97,8 @@ | |
2101 | AWK = @AWK@ | |
2102 | ||
2103 | TESTS = warning.sh khadafy.sh spencer1.sh bre.sh ere.sh \ | |
2104 | - status.sh empty.sh options.sh backref.sh file.sh | |
2105 | + status.sh empty.sh options.sh backref.sh file.sh \ | |
2106 | + fmbtest.sh | |
2107 | ||
2108 | EXTRA_DIST = $(TESTS) \ | |
2109 | khadafy.lines khadafy.regexp \ |