]> git.ipfire.org Git - thirdparty/glibc.git/blame - string/str-two-way.h
Detect EOL on-the-fly in strstr, strcasestr and memmem.
[thirdparty/glibc.git] / string / str-two-way.h
CommitLineData
0caca71a 1/* Byte-wise substring search, using the Two-Way algorithm.
be75d758 2 Copyright (C) 2008-2012 Free Software Foundation, Inc.
0caca71a
UD
3 This file is part of the GNU C Library.
4 Written by Eric Blake <ebb9@byu.net>, 2008.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
59ba27a6
PE
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
0caca71a
UD
19
20/* Before including this file, you need to include <string.h> (and
21 <config.h> before that, if not part of libc), and define:
22 RESULT_TYPE A macro that expands to the return type.
23 AVAILABLE(h, h_l, j, n_l)
24 A macro that returns nonzero if there are
25 at least N_L bytes left starting at H[J].
26 H is 'unsigned char *', H_L, J, and N_L
27 are 'size_t'; H_L is an lvalue. For
28 NUL-terminated searches, H_L can be
29 modified each iteration to avoid having
30 to compute the end of H up front.
31
32 For case-insensitivity, you may optionally define:
33 CMP_FUNC(p1, p2, l) A macro that returns 0 iff the first L
34 characters of P1 and P2 are equal.
35 CANON_ELEMENT(c) A macro that canonicalizes an element right after
36 it has been fetched from one of the two strings.
37 The argument is an 'unsigned char'; the result
38 must be an 'unsigned char' as well.
39
40 This file undefines the macros documented above, and defines
41 LONG_NEEDLE_THRESHOLD.
42*/
43
44#include <limits.h>
45#include <stdint.h>
be75d758 46#include <sys/param.h> /* Defines MAX. */
0caca71a
UD
47
48/* We use the Two-Way string matching algorithm, which guarantees
49 linear complexity with constant space. Additionally, for long
50 needles, we also use a bad character shift table similar to the
51 Boyer-Moore algorithm to achieve improved (potentially sub-linear)
52 performance.
53
54 See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260
55 and http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm
56*/
57
58/* Point at which computing a bad-byte shift table is likely to be
59 worthwhile. Small needles should not compute a table, since it
60 adds (1 << CHAR_BIT) + NEEDLE_LEN computations of preparation for a
61 speedup no greater than a factor of NEEDLE_LEN. The larger the
62 needle, the better the potential performance gain. On the other
63 hand, on non-POSIX systems with CHAR_BIT larger than eight, the
64 memory required for the table is prohibitive. */
65#if CHAR_BIT < 10
66# define LONG_NEEDLE_THRESHOLD 32U
67#else
68# define LONG_NEEDLE_THRESHOLD SIZE_MAX
69#endif
70
0caca71a
UD
71#ifndef CANON_ELEMENT
72# define CANON_ELEMENT(c) c
73#endif
74#ifndef CMP_FUNC
75# define CMP_FUNC memcmp
76#endif
77
400726de
MK
78#ifndef AVAILABLE1
79# define AVAILABLE1(h, h_l, j, n_l) AVAILABLE (h, h_l, j, n_l)
80#endif
81#ifndef AVAILABLE2
82# define AVAILABLE2(h, h_l, j, n_l) (1)
83#endif
84#ifndef RET0_IF_0
85# define RET0_IF_0(a) /* nothing */
86#endif
87
0caca71a
UD
88/* Perform a critical factorization of NEEDLE, of length NEEDLE_LEN.
89 Return the index of the first byte in the right half, and set
90 *PERIOD to the global period of the right half.
91
92 The global period of a string is the smallest index (possibly its
93 length) at which all remaining bytes in the string are repetitions
94 of the prefix (the last repetition may be a subset of the prefix).
95
96 When NEEDLE is factored into two halves, a local period is the
97 length of the smallest word that shares a suffix with the left half
98 and shares a prefix with the right half. All factorizations of a
99 non-empty NEEDLE have a local period of at least 1 and no greater
100 than NEEDLE_LEN.
101
102 A critical factorization has the property that the local period
103 equals the global period. All strings have at least one critical
104 factorization with the left half smaller than the global period.
105
106 Given an ordered alphabet, a critical factorization can be computed
107 in linear time, with 2 * NEEDLE_LEN comparisons, by computing the
108 larger of two ordered maximal suffixes. The ordered maximal
109 suffixes are determined by lexicographic comparison of
110 periodicity. */
111static size_t
112critical_factorization (const unsigned char *needle, size_t needle_len,
113 size_t *period)
114{
115 /* Index of last byte of left half, or SIZE_MAX. */
116 size_t max_suffix, max_suffix_rev;
117 size_t j; /* Index into NEEDLE for current candidate suffix. */
118 size_t k; /* Offset into current period. */
119 size_t p; /* Intermediate period. */
120 unsigned char a, b; /* Current comparison bytes. */
121
122 /* Invariants:
123 0 <= j < NEEDLE_LEN - 1
124 -1 <= max_suffix{,_rev} < j (treating SIZE_MAX as if it were signed)
125 min(max_suffix, max_suffix_rev) < global period of NEEDLE
126 1 <= p <= global period of NEEDLE
127 p == global period of the substring NEEDLE[max_suffix{,_rev}+1...j]
128 1 <= k <= p
129 */
130
131 /* Perform lexicographic search. */
132 max_suffix = SIZE_MAX;
133 j = 0;
134 k = p = 1;
135 while (j + k < needle_len)
136 {
137 a = CANON_ELEMENT (needle[j + k]);
138 b = CANON_ELEMENT (needle[max_suffix + k]);
139 if (a < b)
140 {
141 /* Suffix is smaller, period is entire prefix so far. */
142 j += k;
143 k = 1;
144 p = j - max_suffix;
145 }
146 else if (a == b)
147 {
148 /* Advance through repetition of the current period. */
149 if (k != p)
150 ++k;
151 else
152 {
153 j += p;
154 k = 1;
155 }
156 }
157 else /* b < a */
158 {
159 /* Suffix is larger, start over from current location. */
160 max_suffix = j++;
161 k = p = 1;
162 }
163 }
164 *period = p;
165
166 /* Perform reverse lexicographic search. */
167 max_suffix_rev = SIZE_MAX;
168 j = 0;
169 k = p = 1;
170 while (j + k < needle_len)
171 {
172 a = CANON_ELEMENT (needle[j + k]);
173 b = CANON_ELEMENT (needle[max_suffix_rev + k]);
174 if (b < a)
175 {
176 /* Suffix is smaller, period is entire prefix so far. */
177 j += k;
178 k = 1;
179 p = j - max_suffix_rev;
180 }
181 else if (a == b)
182 {
183 /* Advance through repetition of the current period. */
184 if (k != p)
185 ++k;
186 else
187 {
188 j += p;
189 k = 1;
190 }
191 }
192 else /* a < b */
193 {
194 /* Suffix is larger, start over from current location. */
195 max_suffix_rev = j++;
196 k = p = 1;
197 }
198 }
199
200 /* Choose the longer suffix. Return the first byte of the right
201 half, rather than the last byte of the left half. */
202 if (max_suffix_rev + 1 < max_suffix + 1)
203 return max_suffix + 1;
204 *period = p;
205 return max_suffix_rev + 1;
206}
207
208/* Return the first location of non-empty NEEDLE within HAYSTACK, or
209 NULL. HAYSTACK_LEN is the minimum known length of HAYSTACK. This
210 method is optimized for NEEDLE_LEN < LONG_NEEDLE_THRESHOLD.
211 Performance is guaranteed to be linear, with an initialization cost
212 of 2 * NEEDLE_LEN comparisons.
213
214 If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at
215 most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching.
216 If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
217 HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching. */
218static RETURN_TYPE
219two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
220 const unsigned char *needle, size_t needle_len)
221{
222 size_t i; /* Index into current byte of NEEDLE. */
223 size_t j; /* Index into current window of HAYSTACK. */
224 size_t period; /* The period of the right half of needle. */
225 size_t suffix; /* The index of the right half of needle. */
226
227 /* Factor the needle into two halves, such that the left half is
228 smaller than the global period, and the right half is
229 periodic (with a period as large as NEEDLE_LEN - suffix). */
230 suffix = critical_factorization (needle, needle_len, &period);
231
232 /* Perform the search. Each iteration compares the right half
233 first. */
234 if (CMP_FUNC (needle, needle + period, suffix) == 0)
235 {
236 /* Entire needle is periodic; a mismatch can only advance by the
237 period, so use memory to avoid rescanning known occurrences
238 of the period. */
239 size_t memory = 0;
240 j = 0;
241 while (AVAILABLE (haystack, haystack_len, j, needle_len))
242 {
243 /* Scan for matches in right half. */
244 i = MAX (suffix, memory);
245 while (i < needle_len && (CANON_ELEMENT (needle[i])
246 == CANON_ELEMENT (haystack[i + j])))
247 ++i;
248 if (needle_len <= i)
249 {
250 /* Scan for matches in left half. */
251 i = suffix - 1;
252 while (memory < i + 1 && (CANON_ELEMENT (needle[i])
253 == CANON_ELEMENT (haystack[i + j])))
254 --i;
255 if (i + 1 < memory + 1)
256 return (RETURN_TYPE) (haystack + j);
257 /* No match, so remember how many repetitions of period
258 on the right half were scanned. */
259 j += period;
260 memory = needle_len - period;
261 }
262 else
263 {
264 j += i - suffix + 1;
265 memory = 0;
266 }
267 }
268 }
269 else
270 {
20a71f2c
MK
271 /* The comparison always starts from needle[suffix], so cache it
272 and use an optimized first-character loop. */
273 unsigned char needle_suffix = CANON_ELEMENT (needle[suffix]);
274
0caca71a
UD
275 /* The two halves of needle are distinct; no extra memory is
276 required, and any mismatch results in a maximal shift. */
277 period = MAX (suffix, needle_len - suffix) + 1;
278 j = 0;
400726de 279 while (AVAILABLE1 (haystack, haystack_len, j, needle_len))
0caca71a 280 {
400726de
MK
281 unsigned char haystack_char;
282
20a71f2c
MK
283 /* TODO: The first-character loop can be sped up by adapting
284 longword-at-a-time implementation of memchr/strchr. */
285 if (needle_suffix
400726de 286 != (haystack_char = CANON_ELEMENT (haystack[suffix + j])))
20a71f2c 287 {
400726de 288 RET0_IF_0 (haystack_char);
20a71f2c
MK
289 ++j;
290 continue;
291 }
292
0caca71a 293 /* Scan for matches in right half. */
20a71f2c 294 i = suffix + 1;
400726de
MK
295 while (i < needle_len)
296 {
297 if (CANON_ELEMENT (needle[i])
298 != (haystack_char = CANON_ELEMENT (haystack[i + j])))
299 {
300 RET0_IF_0 (haystack_char);
301 break;
302 }
303 ++i;
304 }
0caca71a
UD
305 if (needle_len <= i)
306 {
307 /* Scan for matches in left half. */
308 i = suffix - 1;
400726de
MK
309 while (i != SIZE_MAX)
310 {
311 if (CANON_ELEMENT (needle[i])
312 != (haystack_char = CANON_ELEMENT (haystack[i + j])))
313 {
314 RET0_IF_0 (haystack_char);
315 break;
316 }
317 --i;
318 }
0caca71a
UD
319 if (i == SIZE_MAX)
320 return (RETURN_TYPE) (haystack + j);
321 j += period;
322 }
323 else
324 j += i - suffix + 1;
400726de
MK
325
326 if (!AVAILABLE2 (haystack, haystack_len, j, needle_len))
327 break;
0caca71a
UD
328 }
329 }
400726de 330 ret0: __attribute__ ((unused))
0caca71a
UD
331 return NULL;
332}
333
334/* Return the first location of non-empty NEEDLE within HAYSTACK, or
335 NULL. HAYSTACK_LEN is the minimum known length of HAYSTACK. This
336 method is optimized for LONG_NEEDLE_THRESHOLD <= NEEDLE_LEN.
337 Performance is guaranteed to be linear, with an initialization cost
338 of 3 * NEEDLE_LEN + (1 << CHAR_BIT) operations.
339
340 If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at
341 most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching,
342 and sublinear performance O(HAYSTACK_LEN / NEEDLE_LEN) is possible.
343 If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
344 HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and
345 sublinear performance is not possible. */
346static RETURN_TYPE
347two_way_long_needle (const unsigned char *haystack, size_t haystack_len,
348 const unsigned char *needle, size_t needle_len)
349{
350 size_t i; /* Index into current byte of NEEDLE. */
351 size_t j; /* Index into current window of HAYSTACK. */
352 size_t period; /* The period of the right half of needle. */
353 size_t suffix; /* The index of the right half of needle. */
354 size_t shift_table[1U << CHAR_BIT]; /* See below. */
355
356 /* Factor the needle into two halves, such that the left half is
357 smaller than the global period, and the right half is
358 periodic (with a period as large as NEEDLE_LEN - suffix). */
359 suffix = critical_factorization (needle, needle_len, &period);
360
361 /* Populate shift_table. For each possible byte value c,
362 shift_table[c] is the distance from the last occurrence of c to
363 the end of NEEDLE, or NEEDLE_LEN if c is absent from the NEEDLE.
364 shift_table[NEEDLE[NEEDLE_LEN - 1]] contains the only 0. */
365 for (i = 0; i < 1U << CHAR_BIT; i++)
366 shift_table[i] = needle_len;
367 for (i = 0; i < needle_len; i++)
368 shift_table[CANON_ELEMENT (needle[i])] = needle_len - i - 1;
369
370 /* Perform the search. Each iteration compares the right half
371 first. */
372 if (CMP_FUNC (needle, needle + period, suffix) == 0)
373 {
374 /* Entire needle is periodic; a mismatch can only advance by the
375 period, so use memory to avoid rescanning known occurrences
376 of the period. */
377 size_t memory = 0;
378 size_t shift;
379 j = 0;
380 while (AVAILABLE (haystack, haystack_len, j, needle_len))
381 {
382 /* Check the last byte first; if it does not match, then
383 shift to the next possible match location. */
384 shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])];
385 if (0 < shift)
386 {
387 if (memory && shift < period)
388 {
389 /* Since needle is periodic, but the last period has
390 a byte out of place, there can be no match until
391 after the mismatch. */
392 shift = needle_len - period;
0caca71a 393 }
5fb308bc 394 memory = 0;
0caca71a
UD
395 j += shift;
396 continue;
397 }
398 /* Scan for matches in right half. The last byte has
399 already been matched, by virtue of the shift table. */
400 i = MAX (suffix, memory);
401 while (i < needle_len - 1 && (CANON_ELEMENT (needle[i])
402 == CANON_ELEMENT (haystack[i + j])))
403 ++i;
404 if (needle_len - 1 <= i)
405 {
406 /* Scan for matches in left half. */
407 i = suffix - 1;
408 while (memory < i + 1 && (CANON_ELEMENT (needle[i])
409 == CANON_ELEMENT (haystack[i + j])))
410 --i;
411 if (i + 1 < memory + 1)
412 return (RETURN_TYPE) (haystack + j);
413 /* No match, so remember how many repetitions of period
414 on the right half were scanned. */
415 j += period;
416 memory = needle_len - period;
417 }
418 else
419 {
420 j += i - suffix + 1;
421 memory = 0;
422 }
423 }
424 }
425 else
426 {
427 /* The two halves of needle are distinct; no extra memory is
428 required, and any mismatch results in a maximal shift. */
429 size_t shift;
430 period = MAX (suffix, needle_len - suffix) + 1;
431 j = 0;
432 while (AVAILABLE (haystack, haystack_len, j, needle_len))
433 {
434 /* Check the last byte first; if it does not match, then
435 shift to the next possible match location. */
436 shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])];
437 if (0 < shift)
438 {
439 j += shift;
440 continue;
441 }
442 /* Scan for matches in right half. The last byte has
443 already been matched, by virtue of the shift table. */
444 i = suffix;
445 while (i < needle_len - 1 && (CANON_ELEMENT (needle[i])
446 == CANON_ELEMENT (haystack[i + j])))
447 ++i;
448 if (needle_len - 1 <= i)
449 {
450 /* Scan for matches in left half. */
451 i = suffix - 1;
452 while (i != SIZE_MAX && (CANON_ELEMENT (needle[i])
453 == CANON_ELEMENT (haystack[i + j])))
454 --i;
455 if (i == SIZE_MAX)
456 return (RETURN_TYPE) (haystack + j);
457 j += period;
458 }
459 else
460 j += i - suffix + 1;
461 }
462 }
463 return NULL;
464}
465
466#undef AVAILABLE
400726de
MK
467#undef AVAILABLE1
468#undef AVAILABLE2
0caca71a
UD
469#undef CANON_ELEMENT
470#undef CMP_FUNC
400726de 471#undef RET0_IF_0
0caca71a 472#undef RETURN_TYPE