]> git.ipfire.org Git - thirdparty/git.git/blame - kwset.c
Add string search routines from GNU grep
[thirdparty/git.git] / kwset.c
CommitLineData
05f3dbba
FK
1/* kwset.c - search for any of a set of keywords.
2 Copyright 1989, 1998, 2000, 2005 Free Software Foundation, Inc.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
7 any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
17 02110-1301, USA. */
18
19/* Written August 1989 by Mike Haertel.
20 The author may be reached (Email) at the address mike@ai.mit.edu,
21 or (US mail) as Mike Haertel c/o Free Software Foundation. */
22
23/* The algorithm implemented by these routines bears a startling resemblence
24 to one discovered by Beate Commentz-Walter, although it is not identical.
25 See "A String Matching Algorithm Fast on the Average," Technical Report,
26 IBM-Germany, Scientific Center Heidelberg, Tiergartenstrasse 15, D-6900
27 Heidelberg, Germany. See also Aho, A.V., and M. Corasick, "Efficient
28 String Matching: An Aid to Bibliographic Search," CACM June 1975,
29 Vol. 18, No. 6, which describes the failure function used below. */
30
31#ifdef HAVE_CONFIG_H
32# include <config.h>
33#endif
34#include <sys/types.h>
35#include "system.h"
36#include "kwset.h"
37#include "obstack.h"
38
39#ifdef GREP
40extern char *xmalloc();
41# undef malloc
42# define malloc xmalloc
43#endif
44
45#define NCHAR (UCHAR_MAX + 1)
46#define obstack_chunk_alloc malloc
47#define obstack_chunk_free free
48
49#define U(c) ((unsigned char) (c))
50
51/* Balanced tree of edges and labels leaving a given trie node. */
52struct tree
53{
54 struct tree *llink; /* Left link; MUST be first field. */
55 struct tree *rlink; /* Right link (to larger labels). */
56 struct trie *trie; /* Trie node pointed to by this edge. */
57 unsigned char label; /* Label on this edge. */
58 char balance; /* Difference in depths of subtrees. */
59};
60
61/* Node of a trie representing a set of reversed keywords. */
62struct trie
63{
64 unsigned int accepting; /* Word index of accepted word, or zero. */
65 struct tree *links; /* Tree of edges leaving this node. */
66 struct trie *parent; /* Parent of this node. */
67 struct trie *next; /* List of all trie nodes in level order. */
68 struct trie *fail; /* Aho-Corasick failure function. */
69 int depth; /* Depth of this node from the root. */
70 int shift; /* Shift function for search failures. */
71 int maxshift; /* Max shift of self and descendents. */
72};
73
74/* Structure returned opaquely to the caller, containing everything. */
75struct kwset
76{
77 struct obstack obstack; /* Obstack for node allocation. */
78 int words; /* Number of words in the trie. */
79 struct trie *trie; /* The trie itself. */
80 int mind; /* Minimum depth of an accepting node. */
81 int maxd; /* Maximum depth of any node. */
82 unsigned char delta[NCHAR]; /* Delta table for rapid search. */
83 struct trie *next[NCHAR]; /* Table of children of the root. */
84 char *target; /* Target string if there's only one. */
85 int mind2; /* Used in Boyer-Moore search for one string. */
86 char const *trans; /* Character translation table. */
87};
88
89/* Allocate and initialize a keyword set object, returning an opaque
90 pointer to it. Return NULL if memory is not available. */
91kwset_t
92kwsalloc (char const *trans)
93{
94 struct kwset *kwset;
95
96 kwset = (struct kwset *) malloc(sizeof (struct kwset));
97 if (!kwset)
98 return NULL;
99
100 obstack_init(&kwset->obstack);
101 kwset->words = 0;
102 kwset->trie
103 = (struct trie *) obstack_alloc(&kwset->obstack, sizeof (struct trie));
104 if (!kwset->trie)
105 {
106 kwsfree((kwset_t) kwset);
107 return NULL;
108 }
109 kwset->trie->accepting = 0;
110 kwset->trie->links = NULL;
111 kwset->trie->parent = NULL;
112 kwset->trie->next = NULL;
113 kwset->trie->fail = NULL;
114 kwset->trie->depth = 0;
115 kwset->trie->shift = 0;
116 kwset->mind = INT_MAX;
117 kwset->maxd = -1;
118 kwset->target = NULL;
119 kwset->trans = trans;
120
121 return (kwset_t) kwset;
122}
123
124/* This upper bound is valid for CHAR_BIT >= 4 and
125 exact for CHAR_BIT in { 4..11, 13, 15, 17, 19 }. */
126#define DEPTH_SIZE (CHAR_BIT + CHAR_BIT/2)
127
128/* Add the given string to the contents of the keyword set. Return NULL
129 for success, an error message otherwise. */
130const char *
131kwsincr (kwset_t kws, char const *text, size_t len)
132{
133 struct kwset *kwset;
134 register struct trie *trie;
135 register unsigned char label;
136 register struct tree *link;
137 register int depth;
138 struct tree *links[DEPTH_SIZE];
139 enum { L, R } dirs[DEPTH_SIZE];
140 struct tree *t, *r, *l, *rl, *lr;
141
142 kwset = (struct kwset *) kws;
143 trie = kwset->trie;
144 text += len;
145
146 /* Descend the trie (built of reversed keywords) character-by-character,
147 installing new nodes when necessary. */
148 while (len--)
149 {
150 label = kwset->trans ? kwset->trans[U(*--text)] : *--text;
151
152 /* Descend the tree of outgoing links for this trie node,
153 looking for the current character and keeping track
154 of the path followed. */
155 link = trie->links;
156 links[0] = (struct tree *) &trie->links;
157 dirs[0] = L;
158 depth = 1;
159
160 while (link && label != link->label)
161 {
162 links[depth] = link;
163 if (label < link->label)
164 dirs[depth++] = L, link = link->llink;
165 else
166 dirs[depth++] = R, link = link->rlink;
167 }
168
169 /* The current character doesn't have an outgoing link at
170 this trie node, so build a new trie node and install
171 a link in the current trie node's tree. */
172 if (!link)
173 {
174 link = (struct tree *) obstack_alloc(&kwset->obstack,
175 sizeof (struct tree));
176 if (!link)
177 return _("memory exhausted");
178 link->llink = NULL;
179 link->rlink = NULL;
180 link->trie = (struct trie *) obstack_alloc(&kwset->obstack,
181 sizeof (struct trie));
182 if (!link->trie)
183 {
184 obstack_free(&kwset->obstack, link);
185 return _("memory exhausted");
186 }
187 link->trie->accepting = 0;
188 link->trie->links = NULL;
189 link->trie->parent = trie;
190 link->trie->next = NULL;
191 link->trie->fail = NULL;
192 link->trie->depth = trie->depth + 1;
193 link->trie->shift = 0;
194 link->label = label;
195 link->balance = 0;
196
197 /* Install the new tree node in its parent. */
198 if (dirs[--depth] == L)
199 links[depth]->llink = link;
200 else
201 links[depth]->rlink = link;
202
203 /* Back up the tree fixing the balance flags. */
204 while (depth && !links[depth]->balance)
205 {
206 if (dirs[depth] == L)
207 --links[depth]->balance;
208 else
209 ++links[depth]->balance;
210 --depth;
211 }
212
213 /* Rebalance the tree by pointer rotations if necessary. */
214 if (depth && ((dirs[depth] == L && --links[depth]->balance)
215 || (dirs[depth] == R && ++links[depth]->balance)))
216 {
217 switch (links[depth]->balance)
218 {
219 case (char) -2:
220 switch (dirs[depth + 1])
221 {
222 case L:
223 r = links[depth], t = r->llink, rl = t->rlink;
224 t->rlink = r, r->llink = rl;
225 t->balance = r->balance = 0;
226 break;
227 case R:
228 r = links[depth], l = r->llink, t = l->rlink;
229 rl = t->rlink, lr = t->llink;
230 t->llink = l, l->rlink = lr, t->rlink = r, r->llink = rl;
231 l->balance = t->balance != 1 ? 0 : -1;
232 r->balance = t->balance != (char) -1 ? 0 : 1;
233 t->balance = 0;
234 break;
235 default:
236 abort ();
237 }
238 break;
239 case 2:
240 switch (dirs[depth + 1])
241 {
242 case R:
243 l = links[depth], t = l->rlink, lr = t->llink;
244 t->llink = l, l->rlink = lr;
245 t->balance = l->balance = 0;
246 break;
247 case L:
248 l = links[depth], r = l->rlink, t = r->llink;
249 lr = t->llink, rl = t->rlink;
250 t->llink = l, l->rlink = lr, t->rlink = r, r->llink = rl;
251 l->balance = t->balance != 1 ? 0 : -1;
252 r->balance = t->balance != (char) -1 ? 0 : 1;
253 t->balance = 0;
254 break;
255 default:
256 abort ();
257 }
258 break;
259 default:
260 abort ();
261 }
262
263 if (dirs[depth - 1] == L)
264 links[depth - 1]->llink = t;
265 else
266 links[depth - 1]->rlink = t;
267 }
268 }
269
270 trie = link->trie;
271 }
272
273 /* Mark the node we finally reached as accepting, encoding the
274 index number of this word in the keyword set so far. */
275 if (!trie->accepting)
276 trie->accepting = 1 + 2 * kwset->words;
277 ++kwset->words;
278
279 /* Keep track of the longest and shortest string of the keyword set. */
280 if (trie->depth < kwset->mind)
281 kwset->mind = trie->depth;
282 if (trie->depth > kwset->maxd)
283 kwset->maxd = trie->depth;
284
285 return NULL;
286}
287
288/* Enqueue the trie nodes referenced from the given tree in the
289 given queue. */
290static void
291enqueue (struct tree *tree, struct trie **last)
292{
293 if (!tree)
294 return;
295 enqueue(tree->llink, last);
296 enqueue(tree->rlink, last);
297 (*last) = (*last)->next = tree->trie;
298}
299
300/* Compute the Aho-Corasick failure function for the trie nodes referenced
301 from the given tree, given the failure function for their parent as
302 well as a last resort failure node. */
303static void
304treefails (register struct tree const *tree, struct trie const *fail,
305 struct trie *recourse)
306{
307 register struct tree *link;
308
309 if (!tree)
310 return;
311
312 treefails(tree->llink, fail, recourse);
313 treefails(tree->rlink, fail, recourse);
314
315 /* Find, in the chain of fails going back to the root, the first
316 node that has a descendent on the current label. */
317 while (fail)
318 {
319 link = fail->links;
320 while (link && tree->label != link->label)
321 if (tree->label < link->label)
322 link = link->llink;
323 else
324 link = link->rlink;
325 if (link)
326 {
327 tree->trie->fail = link->trie;
328 return;
329 }
330 fail = fail->fail;
331 }
332
333 tree->trie->fail = recourse;
334}
335
336/* Set delta entries for the links of the given tree such that
337 the preexisting delta value is larger than the current depth. */
338static void
339treedelta (register struct tree const *tree,
340 register unsigned int depth,
341 unsigned char delta[])
342{
343 if (!tree)
344 return;
345 treedelta(tree->llink, depth, delta);
346 treedelta(tree->rlink, depth, delta);
347 if (depth < delta[tree->label])
348 delta[tree->label] = depth;
349}
350
351/* Return true if A has every label in B. */
352static int
353hasevery (register struct tree const *a, register struct tree const *b)
354{
355 if (!b)
356 return 1;
357 if (!hasevery(a, b->llink))
358 return 0;
359 if (!hasevery(a, b->rlink))
360 return 0;
361 while (a && b->label != a->label)
362 if (b->label < a->label)
363 a = a->llink;
364 else
365 a = a->rlink;
366 return !!a;
367}
368
369/* Compute a vector, indexed by character code, of the trie nodes
370 referenced from the given tree. */
371static void
372treenext (struct tree const *tree, struct trie *next[])
373{
374 if (!tree)
375 return;
376 treenext(tree->llink, next);
377 treenext(tree->rlink, next);
378 next[tree->label] = tree->trie;
379}
380
381/* Compute the shift for each trie node, as well as the delta
382 table and next cache for the given keyword set. */
383const char *
384kwsprep (kwset_t kws)
385{
386 register struct kwset *kwset;
387 register int i;
388 register struct trie *curr;
389 register char const *trans;
390 unsigned char delta[NCHAR];
391
392 kwset = (struct kwset *) kws;
393
394 /* Initial values for the delta table; will be changed later. The
395 delta entry for a given character is the smallest depth of any
396 node at which an outgoing edge is labeled by that character. */
397 memset(delta, kwset->mind < UCHAR_MAX ? kwset->mind : UCHAR_MAX, NCHAR);
398
399 /* Check if we can use the simple boyer-moore algorithm, instead
400 of the hairy commentz-walter algorithm. */
401 if (kwset->words == 1 && kwset->trans == NULL)
402 {
403 char c;
404
405 /* Looking for just one string. Extract it from the trie. */
406 kwset->target = obstack_alloc(&kwset->obstack, kwset->mind);
407 if (!kwset->target)
408 return _("memory exhausted");
409 for (i = kwset->mind - 1, curr = kwset->trie; i >= 0; --i)
410 {
411 kwset->target[i] = curr->links->label;
412 curr = curr->links->trie;
413 }
414 /* Build the Boyer Moore delta. Boy that's easy compared to CW. */
415 for (i = 0; i < kwset->mind; ++i)
416 delta[U(kwset->target[i])] = kwset->mind - (i + 1);
417 /* Find the minimal delta2 shift that we might make after
418 a backwards match has failed. */
419 c = kwset->target[kwset->mind - 1];
420 for (i = kwset->mind - 2; i >= 0; --i)
421 if (kwset->target[i] == c)
422 break;
423 kwset->mind2 = kwset->mind - (i + 1);
424 }
425 else
426 {
427 register struct trie *fail;
428 struct trie *last, *next[NCHAR];
429
430 /* Traverse the nodes of the trie in level order, simultaneously
431 computing the delta table, failure function, and shift function. */
432 for (curr = last = kwset->trie; curr; curr = curr->next)
433 {
434 /* Enqueue the immediate descendents in the level order queue. */
435 enqueue(curr->links, &last);
436
437 curr->shift = kwset->mind;
438 curr->maxshift = kwset->mind;
439
440 /* Update the delta table for the descendents of this node. */
441 treedelta(curr->links, curr->depth, delta);
442
443 /* Compute the failure function for the decendents of this node. */
444 treefails(curr->links, curr->fail, kwset->trie);
445
446 /* Update the shifts at each node in the current node's chain
447 of fails back to the root. */
448 for (fail = curr->fail; fail; fail = fail->fail)
449 {
450 /* If the current node has some outgoing edge that the fail
451 doesn't, then the shift at the fail should be no larger
452 than the difference of their depths. */
453 if (!hasevery(fail->links, curr->links))
454 if (curr->depth - fail->depth < fail->shift)
455 fail->shift = curr->depth - fail->depth;
456
457 /* If the current node is accepting then the shift at the
458 fail and its descendents should be no larger than the
459 difference of their depths. */
460 if (curr->accepting && fail->maxshift > curr->depth - fail->depth)
461 fail->maxshift = curr->depth - fail->depth;
462 }
463 }
464
465 /* Traverse the trie in level order again, fixing up all nodes whose
466 shift exceeds their inherited maxshift. */
467 for (curr = kwset->trie->next; curr; curr = curr->next)
468 {
469 if (curr->maxshift > curr->parent->maxshift)
470 curr->maxshift = curr->parent->maxshift;
471 if (curr->shift > curr->maxshift)
472 curr->shift = curr->maxshift;
473 }
474
475 /* Create a vector, indexed by character code, of the outgoing links
476 from the root node. */
477 for (i = 0; i < NCHAR; ++i)
478 next[i] = NULL;
479 treenext(kwset->trie->links, next);
480
481 if ((trans = kwset->trans) != NULL)
482 for (i = 0; i < NCHAR; ++i)
483 kwset->next[i] = next[U(trans[i])];
484 else
485 memcpy(kwset->next, next, NCHAR * sizeof(struct trie *));
486 }
487
488 /* Fix things up for any translation table. */
489 if ((trans = kwset->trans) != NULL)
490 for (i = 0; i < NCHAR; ++i)
491 kwset->delta[i] = delta[U(trans[i])];
492 else
493 memcpy(kwset->delta, delta, NCHAR);
494
495 return NULL;
496}
497
498/* Fast boyer-moore search. */
499static size_t
500bmexec (kwset_t kws, char const *text, size_t size)
501{
502 struct kwset const *kwset;
503 register unsigned char const *d1;
504 register char const *ep, *sp, *tp;
505 register int d, gc, i, len, md2;
506
507 kwset = (struct kwset const *) kws;
508 len = kwset->mind;
509
510 if (len == 0)
511 return 0;
512 if (len > size)
513 return -1;
514 if (len == 1)
515 {
516 tp = memchr (text, kwset->target[0], size);
517 return tp ? tp - text : -1;
518 }
519
520 d1 = kwset->delta;
521 sp = kwset->target + len;
522 gc = U(sp[-2]);
523 md2 = kwset->mind2;
524 tp = text + len;
525
526 /* Significance of 12: 1 (initial offset) + 10 (skip loop) + 1 (md2). */
527 if (size > 12 * len)
528 /* 11 is not a bug, the initial offset happens only once. */
529 for (ep = text + size - 11 * len;;)
530 {
531 while (tp <= ep)
532 {
533 d = d1[U(tp[-1])], tp += d;
534 d = d1[U(tp[-1])], tp += d;
535 if (d == 0)
536 goto found;
537 d = d1[U(tp[-1])], tp += d;
538 d = d1[U(tp[-1])], tp += d;
539 d = d1[U(tp[-1])], tp += d;
540 if (d == 0)
541 goto found;
542 d = d1[U(tp[-1])], tp += d;
543 d = d1[U(tp[-1])], tp += d;
544 d = d1[U(tp[-1])], tp += d;
545 if (d == 0)
546 goto found;
547 d = d1[U(tp[-1])], tp += d;
548 d = d1[U(tp[-1])], tp += d;
549 }
550 break;
551 found:
552 if (U(tp[-2]) == gc)
553 {
554 for (i = 3; i <= len && U(tp[-i]) == U(sp[-i]); ++i)
555 ;
556 if (i > len)
557 return tp - len - text;
558 }
559 tp += md2;
560 }
561
562 /* Now we have only a few characters left to search. We
563 carefully avoid ever producing an out-of-bounds pointer. */
564 ep = text + size;
565 d = d1[U(tp[-1])];
566 while (d <= ep - tp)
567 {
568 d = d1[U((tp += d)[-1])];
569 if (d != 0)
570 continue;
571 if (U(tp[-2]) == gc)
572 {
573 for (i = 3; i <= len && U(tp[-i]) == U(sp[-i]); ++i)
574 ;
575 if (i > len)
576 return tp - len - text;
577 }
578 d = md2;
579 }
580
581 return -1;
582}
583
584/* Hairy multiple string search. */
585static size_t
586cwexec (kwset_t kws, char const *text, size_t len, struct kwsmatch *kwsmatch)
587{
588 struct kwset const *kwset;
589 struct trie * const *next;
590 struct trie const *trie;
591 struct trie const *accept;
592 char const *beg, *lim, *mch, *lmch;
593 register unsigned char c;
594 register unsigned char const *delta;
595 register int d;
596 register char const *end, *qlim;
597 register struct tree const *tree;
598 register char const *trans;
599
600#ifdef lint
601 accept = NULL;
602#endif
603
604 /* Initialize register copies and look for easy ways out. */
605 kwset = (struct kwset *) kws;
606 if (len < kwset->mind)
607 return -1;
608 next = kwset->next;
609 delta = kwset->delta;
610 trans = kwset->trans;
611 lim = text + len;
612 end = text;
613 if ((d = kwset->mind) != 0)
614 mch = NULL;
615 else
616 {
617 mch = text, accept = kwset->trie;
618 goto match;
619 }
620
621 if (len >= 4 * kwset->mind)
622 qlim = lim - 4 * kwset->mind;
623 else
624 qlim = NULL;
625
626 while (lim - end >= d)
627 {
628 if (qlim && end <= qlim)
629 {
630 end += d - 1;
631 while ((d = delta[c = *end]) && end < qlim)
632 {
633 end += d;
634 end += delta[U(*end)];
635 end += delta[U(*end)];
636 }
637 ++end;
638 }
639 else
640 d = delta[c = (end += d)[-1]];
641 if (d)
642 continue;
643 beg = end - 1;
644 trie = next[c];
645 if (trie->accepting)
646 {
647 mch = beg;
648 accept = trie;
649 }
650 d = trie->shift;
651 while (beg > text)
652 {
653 c = trans ? trans[U(*--beg)] : *--beg;
654 tree = trie->links;
655 while (tree && c != tree->label)
656 if (c < tree->label)
657 tree = tree->llink;
658 else
659 tree = tree->rlink;
660 if (tree)
661 {
662 trie = tree->trie;
663 if (trie->accepting)
664 {
665 mch = beg;
666 accept = trie;
667 }
668 }
669 else
670 break;
671 d = trie->shift;
672 }
673 if (mch)
674 goto match;
675 }
676 return -1;
677
678 match:
679 /* Given a known match, find the longest possible match anchored
680 at or before its starting point. This is nearly a verbatim
681 copy of the preceding main search loops. */
682 if (lim - mch > kwset->maxd)
683 lim = mch + kwset->maxd;
684 lmch = 0;
685 d = 1;
686 while (lim - end >= d)
687 {
688 if ((d = delta[c = (end += d)[-1]]) != 0)
689 continue;
690 beg = end - 1;
691 if (!(trie = next[c]))
692 {
693 d = 1;
694 continue;
695 }
696 if (trie->accepting && beg <= mch)
697 {
698 lmch = beg;
699 accept = trie;
700 }
701 d = trie->shift;
702 while (beg > text)
703 {
704 c = trans ? trans[U(*--beg)] : *--beg;
705 tree = trie->links;
706 while (tree && c != tree->label)
707 if (c < tree->label)
708 tree = tree->llink;
709 else
710 tree = tree->rlink;
711 if (tree)
712 {
713 trie = tree->trie;
714 if (trie->accepting && beg <= mch)
715 {
716 lmch = beg;
717 accept = trie;
718 }
719 }
720 else
721 break;
722 d = trie->shift;
723 }
724 if (lmch)
725 {
726 mch = lmch;
727 goto match;
728 }
729 if (!d)
730 d = 1;
731 }
732
733 if (kwsmatch)
734 {
735 kwsmatch->index = accept->accepting / 2;
736 kwsmatch->offset[0] = mch - text;
737 kwsmatch->size[0] = accept->depth;
738 }
739 return mch - text;
740}
741
742/* Search through the given text for a match of any member of the
743 given keyword set. Return a pointer to the first character of
744 the matching substring, or NULL if no match is found. If FOUNDLEN
745 is non-NULL store in the referenced location the length of the
746 matching substring. Similarly, if FOUNDIDX is non-NULL, store
747 in the referenced location the index number of the particular
748 keyword matched. */
749size_t
750kwsexec (kwset_t kws, char const *text, size_t size,
751 struct kwsmatch *kwsmatch)
752{
753 struct kwset const *kwset = (struct kwset *) kws;
754 if (kwset->words == 1 && kwset->trans == NULL)
755 {
756 size_t ret = bmexec (kws, text, size);
757 if (kwsmatch != NULL && ret != (size_t) -1)
758 {
759 kwsmatch->index = 0;
760 kwsmatch->offset[0] = ret;
761 kwsmatch->size[0] = kwset->mind;
762 }
763 return ret;
764 }
765 else
766 return cwexec(kws, text, size, kwsmatch);
767}
768
769/* Free the components of the given keyword set. */
770void
771kwsfree (kwset_t kws)
772{
773 struct kwset *kwset;
774
775 kwset = (struct kwset *) kws;
776 obstack_free(&kwset->obstack, NULL);
777 free(kws);
778}