]>
Commit | Line | Data |
---|---|---|
14744156 | 1 | /* Test for UTF-8 regular expression optimizations. |
04277e02 | 2 | Copyright (C) 2003-2019 Free Software Foundation, Inc. |
14744156 UD |
3 | This file is part of the GNU C Library. |
4 | Contributed by Jakub Jelinek <jakub@redhat.com>, 2003. | |
5 | ||
6 | The GNU C Library is free software; you can redistribute it and/or | |
7 | modify it under the terms of the GNU Lesser General Public | |
8 | License as published by the Free Software Foundation; either | |
9 | version 2.1 of the License, or (at your option) any later version. | |
10 | ||
11 | The GNU C Library is distributed in the hope that it will be useful, | |
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 | Lesser General Public License for more details. | |
15 | ||
16 | You should have received a copy of the GNU Lesser General Public | |
59ba27a6 PE |
17 | License along with the GNU C Library; if not, see |
18 | <http://www.gnu.org/licenses/>. */ | |
14744156 UD |
19 | |
20 | #include <sys/types.h> | |
21 | #include <mcheck.h> | |
22 | #include <regex.h> | |
23 | #include <stdio.h> | |
24 | #include <stdlib.h> | |
25 | #include <string.h> | |
26 | #include <locale.h> | |
27 | ||
28 | #define RE_NO_INTERNAL_PROTOTYPES 1 | |
29 | #include "regex_internal.h" | |
30 | ||
ad7f28c2 UD |
31 | #define BRE RE_SYNTAX_POSIX_BASIC |
32 | #define ERE RE_SYNTAX_POSIX_EXTENDED | |
33 | ||
14744156 UD |
34 | static struct |
35 | { | |
36 | int syntax; | |
37 | const char *pattern; | |
38 | const char *string; | |
39 | int res, optimize; | |
40 | } tests[] = { | |
41 | /* \xc3\x84 LATIN CAPITAL LETTER A WITH DIAERESIS | |
42 | \xc3\x96 LATIN CAPITAL LETTER O WITH DIAERESIS | |
43 | \xc3\xa4 LATIN SMALL LETTER A WITH DIAERESIS | |
44 | \xc3\xb6 LATIN SMALL LETTER O WITH DIAERESIS | |
45 | \xe2\x80\x94 EM DASH */ | |
46 | /* Should be optimized. */ | |
ad7f28c2 UD |
47 | {BRE, "foo", "b\xc3\xa4rfoob\xc3\xa4z", 4, 1}, |
48 | {BRE, "b\xc3\xa4z", "b\xc3\xa4rfoob\xc3\xa4z", 7, 1}, | |
49 | {BRE, "b\xc3\xa4*z", "b\xc3\xa4rfoob\xc3\xa4z", 7, 1}, | |
50 | {BRE, "b\xc3\xa4*z", "b\xc3\xa4rfoobz", 7, 1}, | |
51 | {BRE, "b\xc3\xa4\\+z", "b\xc3\xa4rfoob\xc3\xa4\xc3\xa4z", 7, 1}, | |
52 | {BRE, "b\xc3\xa4\\?z", "b\xc3\xa4rfoob\xc3\xa4z", 7, 1}, | |
53 | {BRE, "b\xc3\xa4\\{1,2\\}z", "b\xc3\xa4rfoob\xc3\xa4z", 7, 1}, | |
54 | {BRE, "^x\\|xy*z$", "\xc3\xb6xyyz", 2, 1}, | |
55 | {BRE, "^x\\\\y\\{6\\}z\\+", "x\\yyyyyyzz\xc3\xb6", 0, 1}, | |
56 | {BRE, "^x\\\\y\\{2,36\\}z\\+", "x\\yzz\xc3\xb6", -1, 1}, | |
57 | {BRE, "^x\\\\y\\{,3\\}z\\+", "x\\yyyzz\xc3\xb6", 0, 1}, | |
58 | {BRE, "^x\\|x\xc3\xa4*z$", "\xc3\xb6x\xc3\xa4\xc3\xa4z", 2, 1}, | |
59 | {BRE, "^x\\\\\xc3\x84\\{6\\}z\\+", | |
5f93cd52 | 60 | "x\\\xc3\x84\xc3\x84\xc3\x84\xc3\x84\xc3\x84\xc3\x84zz\xc3\xb6", 0, 1}, |
ad7f28c2 UD |
61 | {BRE, "^x\\\\\xc3\x84\\{2,36\\}z\\+", "x\\\xc3\x84zz\xc3\xb6", -1, 1}, |
62 | {BRE, "^x\\\\\xc3\x84\\{,3\\}z\\+", | |
5f93cd52 | 63 | "x\\\xc3\x84\xc3\x84\xc3\x84zz\xc3\xb6", 0, 1}, |
ad7f28c2 UD |
64 | {BRE, "x[C]y", "axCy", 1, 1}, |
65 | {BRE, "x[ABC]y", "axCy", 1, 1}, | |
66 | {BRE, "\\`x\\|z\\'", "x\xe2\x80\x94", 0, 1}, | |
67 | {BRE, "\\(xy\\)z\\1a\\1", "\xe2\x80\x94xyzxyaxy\xc3\x84", 3, 1}, | |
68 | {BRE, "xy\\?z", "\xc3\x84xz\xc3\xb6", 2, 1}, | |
69 | {BRE, "\\`\xc3\x84\\|z\\'", "\xc3\x84\xe2\x80\x94", 0, 1}, | |
70 | {BRE, "\\(x\xc3\x84\\)z\\1\x61\\1", | |
5f93cd52 | 71 | "\xe2\x80\x94x\xc3\x84zx\xc3\x84\x61x\xc3\x84\xc3\x96", 3, 1}, |
ad7f28c2 UD |
72 | {BRE, "x\xc3\x96\\?z", "\xc3\x84xz\xc3\xb6", 2, 1}, |
73 | {BRE, "x.y", "ax\xe2\x80\x94yz", 1, 1}, | |
74 | {BRE, "x.*z", "\xc3\x84xz", 2, 1}, | |
75 | {BRE, "x.*z", "\xc3\x84x\xe2\x80\x94z", 2, 1}, | |
76 | {BRE, "x.*z", "\xc3\x84x\xe2\x80\x94y\xf1\x90\x80\x90z", 2, 1}, | |
77 | {BRE, "x.*z", "\xc3\x84x\xe2\x80\x94\xc3\x94\xf1\x90\x80\x90z", 2, 1}, | |
78 | {BRE, "x.\\?z", "axz", 1, 1}, | |
79 | {BRE, "x.\\?z", "axyz", 1, 1}, | |
80 | {BRE, "x.\\?z", "ax\xc3\x84z", 1, 1}, | |
81 | {BRE, "x.\\?z", "ax\xe2\x80\x94z", 1, 1}, | |
82 | {BRE, "x.\\?z", "ax\xf0\x9d\x80\x80z", 1, 1}, | |
83 | {BRE, "x.\\?z", "ax\xf9\x81\x82\x83\x84z", 1, 1}, | |
84 | {BRE, "x.\\?z", "ax\xfd\xbf\xbf\xbf\xbf\xbfz", 1, 1}, | |
85 | {BRE, ".", "y", 0, 1}, | |
86 | {BRE, ".", "\xc3\x84", 0, 1}, | |
87 | {BRE, ".", "\xe2\x80\x94", 0, 1}, | |
88 | {BRE, ".", "\xf0\x9d\x80\x80", 0, 1}, | |
89 | {BRE, ".", "\xf9\x81\x82\x83\x84", 0, 1}, | |
90 | {BRE, ".", "\xfd\xbf\xbf\xbf\xbf\xbf", 0, 1}, | |
91 | {BRE, "x.\\?z", "axyyz", -1, 1}, | |
92 | {BRE, "x.\\?z", "ax\xc3\x84\xc3\x96z", -1, 1}, | |
93 | {BRE, "x.\\?z", "ax\xe2\x80\x94\xc3\xa4z", -1, 1}, | |
94 | {BRE, "x.\\?z", "ax\xf0\x9d\x80\x80yz", -1, 1}, | |
89635190 | 95 | {BRE, "x.\\?z", "ax\xf9\x81\x82\x83\x84\xf0\x9d\x80\x81z", -1, 1}, |
ad7f28c2 UD |
96 | {BRE, "x.\\?z", "ax\xfd\xbf\xbf\xbf\xbf\xbf\xc3\x96z", -1, 1}, |
97 | {BRE, "x.\\+z", "\xe2\x80\x94xz", -1, 1}, | |
98 | {BRE, "x.\\+z", "\xe2\x80\x94xyz", 3, 1}, | |
99 | {BRE, "x.\\+z", "\xe2\x80\x94x\xc3\x84y\xe2\x80\x94z", 3, 1}, | |
100 | {BRE, "x.\\+z", "\xe2\x80\x94x\xe2\x80\x94z", 3, 1}, | |
101 | {BRE, "x.\\+z", "\xe2\x80\x94x\xf0\x9d\x80\x80\xc3\x84z", 3, 1}, | |
102 | {BRE, "x.\\+z", "\xe2\x80\x94x.~\xe2\x80\x94\xf9\x81\x82\x83\x84z", 3, 1}, | |
103 | {BRE, "x.\\+z", "\xe2\x80\x94x\xfd\xbf\xbf\xbf\xbf\xbfz", 3, 1}, | |
104 | {BRE, "x.\\{1,2\\}z", "\xe2\x80\x94xz", -1, 1}, | |
105 | {BRE, "x.\\{1,2\\}z", "\xe2\x80\x94x\xc3\x96y\xc3\xa4z", -1, 1}, | |
106 | {BRE, "x.\\{1,2\\}z", "\xe2\x80\x94xyz", 3, 1}, | |
107 | {BRE, "x.\\{1,2\\}z", "\xe2\x80\x94x\xc3\x84\xe2\x80\x94z", 3, 1}, | |
108 | {BRE, "x.\\{1,2\\}z", "\xe2\x80\x94x\xe2\x80\x94z", 3, 1}, | |
109 | {BRE, "x.\\{1,2\\}z", "\xe2\x80\x94x\xf0\x9d\x80\x80\xc3\x84z", 3, 1}, | |
110 | {BRE, "x.\\{1,2\\}z", "\xe2\x80\x94x~\xe2\x80\x94z", 3, 1}, | |
111 | {BRE, "x.\\{1,2\\}z", "\xe2\x80\x94x\xfd\xbf\xbf\xbf\xbf\xbfz", 3, 1}, | |
112 | {BRE, "x\\(.w\\|\xc3\x86\\)\\?z", "axz", 1, 1}, | |
113 | {BRE, "x\\(.w\\|\xc3\x86\\)\\?z", "ax\xfd\xbf\xbf\xbf\xbf\xbfwz", 1, 1}, | |
114 | {BRE, "x\\(.w\\|\xc3\x86\\)\\?z", "ax\xc3\x86z", 1, 1}, | |
115 | {BRE, "x\\(.w\\|\xc3\x86\\)\\?z", "ax\xe2\x80\x96wz", 1, 1}, | |
116 | {ERE, "foo", "b\xc3\xa4rfoob\xc3\xa4z", 4, 1}, | |
117 | {ERE, "^x|xy*z$", "\xc3\xb6xyyz", 2, 1}, | |
118 | {ERE, "^x\\\\y{6}z+", "x\\yyyyyyzz\xc3\xb6", 0, 1}, | |
119 | {ERE, "^x\\\\y{2,36}z+", "x\\yzz\xc3\xb6", -1, 1}, | |
120 | {ERE, "^x\\\\y{,3}z+", "x\\yyyzz\xc3\xb6", 0, 1}, | |
121 | {ERE, "x[C]y", "axCy", 1, 1}, | |
122 | {ERE, "x[ABC]y", "axCy", 1, 1}, | |
123 | {ERE, "\\`x|z\\'", "x\xe2\x80\x94", 0, 1}, | |
124 | {ERE, "(xy)z\\1a\\1", "\xe2\x80\x94xyzxyaxy\xc3\x84", 3, 1}, | |
125 | {ERE, "xy?z", "\xc3\x84xz\xc3\xb6", 2, 1}, | |
126 | {ERE, "x.y", "ax\xe2\x80\x94yz", 1, 1}, | |
127 | {ERE, "x.*z", "\xc3\x84xz", 2, 1}, | |
128 | {ERE, "x.*z", "\xc3\x84x\xe2\x80\x94z", 2, 1}, | |
129 | {ERE, "x.*z", "\xc3\x84x\xe2\x80\x94y\xf1\x90\x80\x90z", 2, 1}, | |
130 | {ERE, "x.*z", "\xc3\x84x\xe2\x80\x94\xc3\x94\xf1\x90\x80\x90z", 2, 1}, | |
131 | {ERE, "x.?z", "axz", 1, 1}, | |
132 | {ERE, "x.?z", "axyz", 1, 1}, | |
133 | {ERE, "x.?z", "ax\xc3\x84z", 1, 1}, | |
134 | {ERE, "x.?z", "ax\xe2\x80\x94z", 1, 1}, | |
135 | {ERE, "x.?z", "ax\xf0\x9d\x80\x80z", 1, 1}, | |
136 | {ERE, "x.?z", "ax\xf9\x81\x82\x83\x84z", 1, 1}, | |
137 | {ERE, "x.?z", "ax\xfd\xbf\xbf\xbf\xbf\xbfz", 1, 1}, | |
138 | {ERE, "x.?z", "axyyz", -1, 1}, | |
139 | {ERE, "x.?z", "ax\xc3\x84\xc3\x96z", -1, 1}, | |
140 | {ERE, "x.?z", "ax\xe2\x80\x94\xc3\xa4z", -1, 1}, | |
141 | {ERE, "x.?z", "ax\xf0\x9d\x80\x80yz", -1, 1}, | |
89635190 | 142 | {ERE, "x.?z", "ax\xf9\x81\x82\x83\x84\xf0\x9d\x80\x81z", -1, 1}, |
ad7f28c2 UD |
143 | {ERE, "x.?z", "ax\xfd\xbf\xbf\xbf\xbf\xbf\xc3\x96z", -1, 1}, |
144 | {ERE, "x.+z", "\xe2\x80\x94xz", -1, 1}, | |
145 | {ERE, "x.+z", "\xe2\x80\x94xyz", 3, 1}, | |
146 | {ERE, "x.+z", "\xe2\x80\x94x\xc3\x84y\xe2\x80\x94z", 3, 1}, | |
147 | {ERE, "x.+z", "\xe2\x80\x94x\xe2\x80\x94z", 3, 1}, | |
148 | {ERE, "x.+z", "\xe2\x80\x94x\xf0\x9d\x80\x80\xc3\x84z", 3, 1}, | |
149 | {ERE, "x.+z", "\xe2\x80\x94x.~\xe2\x80\x94\xf9\x81\x82\x83\x84z", 3, 1}, | |
150 | {ERE, "x.+z", "\xe2\x80\x94x\xfd\xbf\xbf\xbf\xbf\xbfz", 3, 1}, | |
151 | {ERE, "x.{1,2}z", "\xe2\x80\x94xz", -1, 1}, | |
152 | {ERE, "x.{1,2}z", "\xe2\x80\x94x\xc3\x96y\xc3\xa4z", -1, 1}, | |
153 | {ERE, "x.{1,2}z", "\xe2\x80\x94xyz", 3, 1}, | |
154 | {ERE, "x.{1,2}z", "\xe2\x80\x94x\xc3\x84\xe2\x80\x94z", 3, 1}, | |
155 | {ERE, "x.{1,2}z", "\xe2\x80\x94x\xe2\x80\x94z", 3, 1}, | |
156 | {ERE, "x.{1,2}z", "\xe2\x80\x94x\xf0\x9d\x80\x80\xc3\x84z", 3, 1}, | |
157 | {ERE, "x.{1,2}z", "\xe2\x80\x94x~\xe2\x80\x94z", 3, 1}, | |
158 | {ERE, "x.{1,2}z", "\xe2\x80\x94x\xfd\xbf\xbf\xbf\xbf\xbfz", 3, 1}, | |
159 | {ERE, "x(.w|\xc3\x86)?z", "axz", 1, 1}, | |
160 | {ERE, "x(.w|\xc3\x86)?z", "ax\xfd\xbf\xbf\xbf\xbf\xbfwz", 1, 1}, | |
161 | {ERE, "x(.w|\xc3\x86)?z", "ax\xc3\x86z", 1, 1}, | |
162 | {ERE, "x(.w|\xc3\x86)?z", "ax\xe2\x80\x96wz", 1, 1}, | |
14744156 | 163 | /* Should not be optimized. */ |
ad7f28c2 UD |
164 | {BRE, "x[\xc3\x84\xc3\xa4]y", "ax\xc3\xa4y", 1, 0}, |
165 | {BRE, "x[A-Z,]y", "axCy", 1, 0}, | |
166 | {BRE, "x[^y]z", "ax\xe2\x80\x94z", 1, 0}, | |
167 | {BRE, "x[[:alnum:]]z", "ax\xc3\x96z", 1, 0}, | |
168 | {BRE, "x[[=A=]]z", "axAz", 1, 0}, | |
169 | {BRE, "x[[=\xc3\x84=]]z", "ax\xc3\x84z", 1, 0}, | |
170 | {BRE, "\\<g", "\xe2\x80\x94g", 3, 0}, | |
171 | {BRE, "\\bg\\b", "\xe2\x80\x94g", 3, 0}, | |
172 | {BRE, "\\Bg\\B", "\xc3\xa4g\xc3\xa4", 2, 0}, | |
173 | {BRE, "a\\wz", "a\xc3\x84z", 0, 0}, | |
174 | {BRE, "x\\Wz", "\xc3\x96x\xe2\x80\x94z", 2, 0}, | |
175 | {ERE, "x[\xc3\x84\xc3\xa4]y", "ax\xc3\xa4y", 1, 0}, | |
176 | {ERE, "x[A-Z,]y", "axCy", 1, 0}, | |
177 | {ERE, "x[^y]z", "ax\xe2\x80\x94z", 1, 0}, | |
178 | {ERE, "x[[:alnum:]]z", "ax\xc3\x96z", 1, 0}, | |
179 | {ERE, "x[[=A=]]z", "axAz", 1, 0}, | |
180 | {ERE, "x[[=\xc3\x84=]]z", "ax\xc3\x84z", 1, 0}, | |
181 | {ERE, "\\<g", "\xe2\x80\x94g", 3, 0}, | |
182 | {ERE, "\\bg\\b", "\xe2\x80\x94g", 3, 0}, | |
183 | {ERE, "\\Bg\\B", "\xc3\xa4g\xc3\xa4", 2, 0}, | |
184 | {ERE, "a\\wz", "a\xc3\x84z", 0, 0}, | |
185 | {ERE, "x\\Wz", "\xc3\x96x\xe2\x80\x94z", 2, 0}, | |
14744156 UD |
186 | }; |
187 | ||
188 | int | |
189 | main (void) | |
190 | { | |
191 | struct re_pattern_buffer regbuf; | |
192 | const char *err; | |
193 | size_t i; | |
194 | int ret = 0; | |
195 | ||
196 | mtrace (); | |
197 | ||
198 | setlocale (LC_ALL, "de_DE.UTF-8"); | |
199 | for (i = 0; i < sizeof (tests) / sizeof (tests[0]); ++i) | |
200 | { | |
201 | int res, optimized; | |
a8067e8f | 202 | |
14744156 UD |
203 | re_set_syntax (tests[i].syntax); |
204 | memset (&regbuf, '\0', sizeof (regbuf)); | |
205 | err = re_compile_pattern (tests[i].pattern, strlen (tests[i].pattern), | |
206 | &regbuf); | |
207 | if (err != NULL) | |
208 | { | |
209 | printf ("re_compile_pattern failed: %s\n", err); | |
210 | ret = 1; | |
211 | continue; | |
212 | } | |
213 | ||
214 | /* Check if re_search will be done as multi-byte or single-byte. */ | |
215 | optimized = ((re_dfa_t *) regbuf.buffer)->mb_cur_max == 1; | |
216 | if (optimized != tests[i].optimize) | |
217 | { | |
218 | printf ("pattern %zd %soptimized while it should%s be\n", | |
219 | i, optimized ? "" : "not ", tests[i].optimize ? "" : " not"); | |
220 | ret = 1; | |
221 | } | |
222 | ||
ad7f28c2 UD |
223 | int str_len = strlen (tests[i].string); |
224 | res = re_search (&regbuf, tests[i].string, str_len, 0, str_len, NULL); | |
14744156 UD |
225 | if (res != tests[i].res) |
226 | { | |
227 | printf ("re_search %zd failed: %d\n", i, res); | |
228 | ret = 1; | |
229 | regfree (&regbuf); | |
230 | continue; | |
231 | } | |
ad7f28c2 UD |
232 | |
233 | res = re_search (&regbuf, tests[i].string, str_len, str_len, -str_len, | |
234 | NULL); | |
235 | if (res != tests[i].res) | |
236 | { | |
237 | printf ("backward re_search %zd failed: %d\n", i, res); | |
238 | ret = 1; | |
239 | regfree (&regbuf); | |
240 | continue; | |
241 | } | |
14744156 | 242 | regfree (&regbuf); |
a8067e8f UD |
243 | |
244 | re_set_syntax (tests[i].syntax | RE_ICASE); | |
245 | memset (&regbuf, '\0', sizeof (regbuf)); | |
246 | err = re_compile_pattern (tests[i].pattern, strlen (tests[i].pattern), | |
247 | &regbuf); | |
248 | if (err != NULL) | |
249 | { | |
250 | printf ("re_compile_pattern failed: %s\n", err); | |
251 | ret = 1; | |
252 | continue; | |
253 | } | |
254 | ||
255 | /* Check if re_search will be done as multi-byte or single-byte. */ | |
256 | optimized = ((re_dfa_t *) regbuf.buffer)->mb_cur_max == 1; | |
257 | if (optimized) | |
258 | { | |
259 | printf ("pattern %zd optimized while it should not be when case insensitive\n", | |
260 | i); | |
261 | ret = 1; | |
262 | } | |
263 | ||
ad7f28c2 | 264 | res = re_search (&regbuf, tests[i].string, str_len, 0, str_len, NULL); |
a8067e8f UD |
265 | if (res != tests[i].res) |
266 | { | |
ad7f28c2 | 267 | printf ("ICASE re_search %zd failed: %d\n", i, res); |
a8067e8f UD |
268 | ret = 1; |
269 | regfree (&regbuf); | |
270 | continue; | |
271 | } | |
ad7f28c2 | 272 | |
ad7f28c2 UD |
273 | res = re_search (&regbuf, tests[i].string, str_len, str_len, -str_len, |
274 | NULL); | |
275 | if (res != tests[i].res) | |
276 | { | |
277 | printf ("ICASE backward re_search %zd failed: %d\n", i, res); | |
278 | ret = 1; | |
279 | regfree (&regbuf); | |
280 | continue; | |
97fd3a30 | 281 | } |
a8067e8f | 282 | regfree (&regbuf); |
14744156 UD |
283 | } |
284 | ||
285 | return ret; | |
286 | } |