]> git.ipfire.org Git - thirdparty/git.git/blob - grep.c
grep/pcre2: use compile-time PCREv2 version test
[thirdparty/git.git] / grep.c
1 #include "cache.h"
2 #include "config.h"
3 #include "grep.h"
4 #include "object-store.h"
5 #include "userdiff.h"
6 #include "xdiff-interface.h"
7 #include "diff.h"
8 #include "diffcore.h"
9 #include "commit.h"
10 #include "quote.h"
11 #include "help.h"
12
/* Forward declarations of file-local helpers defined elsewhere in this file. */
static int grep_source_load(struct grep_source *gs);
static int grep_source_is_binary(struct grep_source *gs,
				 struct index_state *istate);
16
/*
 * Default output callback (installed in grep_defaults): writes the
 * `size` bytes starting at `buf` to standard output. `opt` is not used.
 */
static void std_output(struct grep_opt *opt, const void *buf, size_t size)
{
	FILE *out = stdout;

	fwrite(buf, size, 1, out);
}
21
/*
 * Template holding the default option values. grep_config() stores the
 * values it reads from the configuration here, and grep_init() copies
 * the whole structure into every grep_opt it initializes.
 */
static struct grep_opt grep_defaults = {
	.relative = 1,
	.pathname = 1,
	.max_depth = -1,
	.pattern_type_option = GREP_PATTERN_TYPE_UNSPECIFIED,
	/* Only the two match slots and the separator carry a color by default. */
	.colors = {
		[GREP_COLOR_CONTEXT] = "",
		[GREP_COLOR_FILENAME] = "",
		[GREP_COLOR_FUNCTION] = "",
		[GREP_COLOR_LINENO] = "",
		[GREP_COLOR_COLUMNNO] = "",
		[GREP_COLOR_MATCH_CONTEXT] = GIT_COLOR_BOLD_RED,
		[GREP_COLOR_MATCH_SELECTED] = GIT_COLOR_BOLD_RED,
		[GREP_COLOR_SELECTED] = "",
		[GREP_COLOR_SEP] = GIT_COLOR_CYAN,
	},
	.only_matching = 0,
	.color = -1,
	/* Results go to stdout unless a caller installs another callback. */
	.output = std_output,
};
42
#ifdef USE_LIBPCRE2
/* General context shared by all patterns; created lazily in grep_init(). */
static pcre2_general_context *pcre2_global_context;
/* Set to 1 to trace every allocation and release done on behalf of PCRE2. */
#define GREP_PCRE2_DEBUG_MALLOC 0

/*
 * Allocation callback handed to pcre2_general_context_create(): returns
 * `size` bytes obtained from malloc(). `memory_data` is unused.
 */
static void *pcre2_malloc(PCRE2_SIZE size, MAYBE_UNUSED void *memory_data)
{
	void *pointer = malloc(size);
#if GREP_PCRE2_DEBUG_MALLOC
	static int count = 1;
	/*
	 * PCRE2_SIZE is a size_t, which is wider than unsigned long on
	 * some platforms, so print it through uintmax_t.
	 */
	fprintf(stderr, "PCRE2:%p -> #%02d: alloc(%"PRIuMAX")\n", pointer,
		count++, (uintmax_t)size);
#endif
	return pointer;
}

/*
 * Release callback paired with pcre2_malloc(): hands `pointer` back to
 * free(). `memory_data` is unused.
 */
static void pcre2_free(void *pointer, MAYBE_UNUSED void *memory_data)
{
#if GREP_PCRE2_DEBUG_MALLOC
	static int count = 1;
	if (pointer)
		fprintf(stderr, "PCRE2:%p -> #%02d: free()\n", pointer, count++);
#endif
	free(pointer);
}
#endif
67
/*
 * Configuration slot names, indexed by GREP_COLOR_* value. grep_config()
 * looks up the part of a key that follows "color.grep." in this table.
 */
static const char *color_grep_slots[] = {
	[GREP_COLOR_CONTEXT] = "context",
	[GREP_COLOR_FILENAME] = "filename",
	[GREP_COLOR_FUNCTION] = "function",
	[GREP_COLOR_LINENO] = "lineNumber",
	[GREP_COLOR_COLUMNNO] = "column",
	[GREP_COLOR_MATCH_CONTEXT] = "matchContext",
	[GREP_COLOR_MATCH_SELECTED] = "matchSelected",
	[GREP_COLOR_SELECTED] = "selected",
	[GREP_COLOR_SEP] = "separator",
};
79
80 static int parse_pattern_type_arg(const char *opt, const char *arg)
81 {
82 if (!strcmp(arg, "default"))
83 return GREP_PATTERN_TYPE_UNSPECIFIED;
84 else if (!strcmp(arg, "basic"))
85 return GREP_PATTERN_TYPE_BRE;
86 else if (!strcmp(arg, "extended"))
87 return GREP_PATTERN_TYPE_ERE;
88 else if (!strcmp(arg, "fixed"))
89 return GREP_PATTERN_TYPE_FIXED;
90 else if (!strcmp(arg, "perl"))
91 return GREP_PATTERN_TYPE_PCRE;
92 die("bad %s argument: %s", opt, arg);
93 }
94
/*
 * NOTE(review): presumably generates the config-key listing helper for
 * color_grep_slots, with "match" (handled separately in grep_config())
 * as an additional name -- confirm against the macro's definition.
 */
define_list_config_array_extra(color_grep_slots, {"match"});
96
97 /*
98 * Read the configuration file once and store it in
99 * the grep_defaults template.
100 */
101 int grep_config(const char *var, const char *value, void *cb)
102 {
103 struct grep_opt *opt = &grep_defaults;
104 const char *slot;
105
106 if (userdiff_config(var, value) < 0)
107 return -1;
108
109 /*
110 * The instance of grep_opt that we set up here is copied by
111 * grep_init() to be used by each individual invocation.
112 * When populating a new field of this structure here, be
113 * sure to think about ownership -- e.g., you might need to
114 * override the shallow copy in grep_init() with a deep copy.
115 */
116
117 if (!strcmp(var, "grep.extendedregexp")) {
118 opt->extended_regexp_option = git_config_bool(var, value);
119 return 0;
120 }
121
122 if (!strcmp(var, "grep.patterntype")) {
123 opt->pattern_type_option = parse_pattern_type_arg(var, value);
124 return 0;
125 }
126
127 if (!strcmp(var, "grep.linenumber")) {
128 opt->linenum = git_config_bool(var, value);
129 return 0;
130 }
131 if (!strcmp(var, "grep.column")) {
132 opt->columnnum = git_config_bool(var, value);
133 return 0;
134 }
135
136 if (!strcmp(var, "grep.fullname")) {
137 opt->relative = !git_config_bool(var, value);
138 return 0;
139 }
140
141 if (!strcmp(var, "color.grep"))
142 opt->color = git_config_colorbool(var, value);
143 if (!strcmp(var, "color.grep.match")) {
144 if (grep_config("color.grep.matchcontext", value, cb) < 0)
145 return -1;
146 if (grep_config("color.grep.matchselected", value, cb) < 0)
147 return -1;
148 } else if (skip_prefix(var, "color.grep.", &slot)) {
149 int i = LOOKUP_CONFIG(color_grep_slots, slot);
150 char *color;
151
152 if (i < 0)
153 return -1;
154 color = opt->colors[i];
155 if (!value)
156 return config_error_nonbool(var);
157 return color_parse(value, color);
158 }
159 return 0;
160 }
161
162 /*
163 * Initialize one instance of grep_opt and copy the
164 * default values from the template we read the configuration
165 * information in an earlier call to git_config(grep_config).
166 *
167 * If using PCRE, make sure that the library is configured
168 * to use the same allocator as Git (e.g. nedmalloc on Windows).
169 *
170 * Any allocated memory needs to be released in grep_destroy().
171 */
172 void grep_init(struct grep_opt *opt, struct repository *repo, const char *prefix)
173 {
174 #if defined(USE_LIBPCRE2)
175 if (!pcre2_global_context)
176 pcre2_global_context = pcre2_general_context_create(
177 pcre2_malloc, pcre2_free, NULL);
178 #endif
179
180 *opt = grep_defaults;
181
182 opt->repo = repo;
183 opt->prefix = prefix;
184 opt->prefix_length = (prefix && *prefix) ? strlen(prefix) : 0;
185 opt->pattern_tail = &opt->pattern_list;
186 opt->header_tail = &opt->header_list;
187 }
188
/*
 * Release the process-wide state set up by grep_init(). With PCRE2 this
 * frees the shared general context.
 *
 * The pointer is cleared afterwards because grep_init() only creates a
 * context when it is NULL; leaving the stale value behind would make a
 * later grep_init() hand a freed context to PCRE2.
 */
void grep_destroy(void)
{
#ifdef USE_LIBPCRE2
	pcre2_general_context_free(pcre2_global_context);
	pcre2_global_context = NULL;
#endif
}
195
196 static void grep_set_pattern_type_option(enum grep_pattern_type pattern_type, struct grep_opt *opt)
197 {
198 /*
199 * When committing to the pattern type by setting the relevant
200 * fields in grep_opt it's generally not necessary to zero out
201 * the fields we're not choosing, since they won't have been
202 * set by anything. The extended_regexp_option field is the
203 * only exception to this.
204 *
205 * This is because in the process of parsing grep.patternType
206 * & grep.extendedRegexp we set opt->pattern_type_option and
207 * opt->extended_regexp_option, respectively. We then
208 * internally use opt->extended_regexp_option to see if we're
209 * compiling an ERE. It must be unset if that's not actually
210 * the case.
211 */
212 if (pattern_type != GREP_PATTERN_TYPE_ERE &&
213 opt->extended_regexp_option)
214 opt->extended_regexp_option = 0;
215
216 switch (pattern_type) {
217 case GREP_PATTERN_TYPE_UNSPECIFIED:
218 /* fall through */
219
220 case GREP_PATTERN_TYPE_BRE:
221 break;
222
223 case GREP_PATTERN_TYPE_ERE:
224 opt->extended_regexp_option = 1;
225 break;
226
227 case GREP_PATTERN_TYPE_FIXED:
228 opt->fixed = 1;
229 break;
230
231 case GREP_PATTERN_TYPE_PCRE:
232 opt->pcre2 = 1;
233 break;
234 }
235 }
236
237 void grep_commit_pattern_type(enum grep_pattern_type pattern_type, struct grep_opt *opt)
238 {
239 if (pattern_type != GREP_PATTERN_TYPE_UNSPECIFIED)
240 grep_set_pattern_type_option(pattern_type, opt);
241 else if (opt->pattern_type_option != GREP_PATTERN_TYPE_UNSPECIFIED)
242 grep_set_pattern_type_option(opt->pattern_type_option, opt);
243 else if (opt->extended_regexp_option)
244 /*
245 * This branch *must* happen after setting from the
246 * opt->pattern_type_option above, we don't want
247 * grep.extendedRegexp to override grep.patternType!
248 */
249 grep_set_pattern_type_option(GREP_PATTERN_TYPE_ERE, opt);
250 }
251
252 static struct grep_pat *create_grep_pat(const char *pat, size_t patlen,
253 const char *origin, int no,
254 enum grep_pat_token t,
255 enum grep_header_field field)
256 {
257 struct grep_pat *p = xcalloc(1, sizeof(*p));
258 p->pattern = xmemdupz(pat, patlen);
259 p->patternlen = patlen;
260 p->origin = origin;
261 p->no = no;
262 p->token = t;
263 p->field = field;
264 return p;
265 }
266
/*
 * Link `p` at the end of the list tracked by `*tail` and advance the
 * tail pointer past it.
 *
 * For pattern tokens, a pattern that contains newlines is split into
 * one grep_pat per line. Each pass of the loop finds the last newline
 * in p's pattern, turns the text after it into a new entry placed
 * directly behind `p`, and truncates `p` at that newline. This repeats
 * until `p` holds no newline, which leaves the lines in their original
 * order.
 */
static void do_append_grep_pat(struct grep_pat ***tail, struct grep_pat *p)
{
	**tail = p;
	*tail = &p->next;
	p->next = NULL;

	switch (p->token) {
	case GREP_PATTERN: /* atom */
	case GREP_PATTERN_HEAD:
	case GREP_PATTERN_BODY:
		for (;;) {
			struct grep_pat *new_pat;
			size_t len = 0;
			char *cp = p->pattern + p->patternlen, *nl = NULL;
			/*
			 * Walk backwards from the end; when a newline is
			 * found, `len` is the number of bytes from it to the
			 * end of the pattern, newline included.
			 */
			while (++len <= p->patternlen) {
				if (*(--cp) == '\n') {
					nl = cp;
					break;
				}
			}
			if (!nl)
				break;
			/* The text after the newline becomes its own entry. */
			new_pat = create_grep_pat(nl + 1, len - 1, p->origin,
						  p->no, p->token, p->field);
			new_pat->next = p->next;
			/* If p was the last entry, the new one is the tail now. */
			if (!p->next)
				*tail = &new_pat->next;
			p->next = new_pat;
			/* Cut p off at the newline. */
			*nl = '\0';
			p->patternlen -= len;
		}
		break;
	default:
		break;
	}
}
303
304 void append_header_grep_pattern(struct grep_opt *opt,
305 enum grep_header_field field, const char *pat)
306 {
307 struct grep_pat *p = create_grep_pat(pat, strlen(pat), "header", 0,
308 GREP_PATTERN_HEAD, field);
309 if (field == GREP_HEADER_REFLOG)
310 opt->use_reflog_filter = 1;
311 do_append_grep_pat(&opt->header_tail, p);
312 }
313
314 void append_grep_pattern(struct grep_opt *opt, const char *pat,
315 const char *origin, int no, enum grep_pat_token t)
316 {
317 append_grep_pat(opt, pat, strlen(pat), origin, no, t);
318 }
319
320 void append_grep_pat(struct grep_opt *opt, const char *pat, size_t patlen,
321 const char *origin, int no, enum grep_pat_token t)
322 {
323 struct grep_pat *p = create_grep_pat(pat, patlen, origin, no, t, 0);
324 do_append_grep_pat(&opt->pattern_tail, p);
325 }
326
327 struct grep_opt *grep_opt_dup(const struct grep_opt *opt)
328 {
329 struct grep_pat *pat;
330 struct grep_opt *ret = xmalloc(sizeof(struct grep_opt));
331 *ret = *opt;
332
333 ret->pattern_list = NULL;
334 ret->pattern_tail = &ret->pattern_list;
335
336 for(pat = opt->pattern_list; pat != NULL; pat = pat->next)
337 {
338 if(pat->token == GREP_PATTERN_HEAD)
339 append_header_grep_pattern(ret, pat->field,
340 pat->pattern);
341 else
342 append_grep_pat(ret, pat->pattern, pat->patternlen,
343 pat->origin, pat->no, pat->token);
344 }
345
346 return ret;
347 }
348
349 static NORETURN void compile_regexp_failed(const struct grep_pat *p,
350 const char *error)
351 {
352 char where[1024];
353
354 if (p->no)
355 xsnprintf(where, sizeof(where), "In '%s' at %d, ", p->origin, p->no);
356 else if (p->origin)
357 xsnprintf(where, sizeof(where), "%s, ", p->origin);
358 else
359 where[0] = 0;
360
361 die("%s'%s': %s", where, p->pattern, error);
362 }
363
/*
 * Return 1 if none of the `len` bytes at `s` is a regex special
 * character (per is_regex_special()), 0 otherwise.
 */
static int is_fixed(const char *s, size_t len)
{
	const char *cursor = s;
	const char *end = s + len;

	while (cursor < end) {
		if (is_regex_special(*cursor))
			return 0;
		cursor++;
	}

	return 1;
}
375
376 #ifdef USE_LIBPCRE2
377 static void compile_pcre2_pattern(struct grep_pat *p, const struct grep_opt *opt)
378 {
379 int error;
380 PCRE2_UCHAR errbuf[256];
381 PCRE2_SIZE erroffset;
382 int options = PCRE2_MULTILINE;
383 int jitret;
384 int patinforet;
385 size_t jitsizearg;
386
387 /* pcre2_global_context is initialized in grep_init */
388 if (opt->ignore_case) {
389 if (!opt->ignore_locale && has_non_ascii(p->pattern)) {
390 if (!pcre2_global_context)
391 BUG("pcre2_global_context uninitialized");
392 p->pcre2_tables = pcre2_maketables(pcre2_global_context);
393 p->pcre2_compile_context = pcre2_compile_context_create(NULL);
394 pcre2_set_character_tables(p->pcre2_compile_context,
395 p->pcre2_tables);
396 }
397 options |= PCRE2_CASELESS;
398 }
399 if (!opt->ignore_locale && is_utf8_locale() && has_non_ascii(p->pattern) &&
400 !(!opt->ignore_case && (p->fixed || p->is_fixed)))
401 options |= (PCRE2_UTF | PCRE2_MATCH_INVALID_UTF);
402
403 #ifdef GIT_PCRE2_VERSION_10_36_OR_HIGHER
404 /* Work around https://bugs.exim.org/show_bug.cgi?id=2642 fixed in 10.36 */
405 if (PCRE2_MATCH_INVALID_UTF && options & (PCRE2_UTF | PCRE2_CASELESS))
406 options |= PCRE2_NO_START_OPTIMIZE;
407 #endif
408
409 p->pcre2_pattern = pcre2_compile((PCRE2_SPTR)p->pattern,
410 p->patternlen, options, &error, &erroffset,
411 p->pcre2_compile_context);
412
413 if (p->pcre2_pattern) {
414 p->pcre2_match_data = pcre2_match_data_create_from_pattern(p->pcre2_pattern, NULL);
415 if (!p->pcre2_match_data)
416 die("Couldn't allocate PCRE2 match data");
417 } else {
418 pcre2_get_error_message(error, errbuf, sizeof(errbuf));
419 compile_regexp_failed(p, (const char *)&errbuf);
420 }
421
422 pcre2_config(PCRE2_CONFIG_JIT, &p->pcre2_jit_on);
423 if (p->pcre2_jit_on) {
424 jitret = pcre2_jit_compile(p->pcre2_pattern, PCRE2_JIT_COMPLETE);
425 if (jitret)
426 die("Couldn't JIT the PCRE2 pattern '%s', got '%d'\n", p->pattern, jitret);
427
428 /*
429 * The pcre2_config(PCRE2_CONFIG_JIT, ...) call just
430 * tells us whether the library itself supports JIT,
431 * but to see whether we're going to be actually using
432 * JIT we need to extract PCRE2_INFO_JITSIZE from the
433 * pattern *after* we do pcre2_jit_compile() above.
434 *
435 * This is because if the pattern contains the
436 * (*NO_JIT) verb (see pcre2syntax(3))
437 * pcre2_jit_compile() will exit early with 0. If we
438 * then proceed to call pcre2_jit_match() further down
439 * the line instead of pcre2_match() we'll either
440 * segfault (pre PCRE 10.31) or run into a fatal error
441 * (post PCRE2 10.31)
442 */
443 patinforet = pcre2_pattern_info(p->pcre2_pattern, PCRE2_INFO_JITSIZE, &jitsizearg);
444 if (patinforet)
445 BUG("pcre2_pattern_info() failed: %d", patinforet);
446 if (jitsizearg == 0) {
447 p->pcre2_jit_on = 0;
448 return;
449 }
450 }
451 }
452
453 static int pcre2match(struct grep_pat *p, const char *line, const char *eol,
454 regmatch_t *match, int eflags)
455 {
456 int ret, flags = 0;
457 PCRE2_SIZE *ovector;
458 PCRE2_UCHAR errbuf[256];
459
460 if (eflags & REG_NOTBOL)
461 flags |= PCRE2_NOTBOL;
462
463 if (p->pcre2_jit_on)
464 ret = pcre2_jit_match(p->pcre2_pattern, (unsigned char *)line,
465 eol - line, 0, flags, p->pcre2_match_data,
466 NULL);
467 else
468 ret = pcre2_match(p->pcre2_pattern, (unsigned char *)line,
469 eol - line, 0, flags, p->pcre2_match_data,
470 NULL);
471
472 if (ret < 0 && ret != PCRE2_ERROR_NOMATCH) {
473 pcre2_get_error_message(ret, errbuf, sizeof(errbuf));
474 die("%s failed with error code %d: %s",
475 (p->pcre2_jit_on ? "pcre2_jit_match" : "pcre2_match"), ret,
476 errbuf);
477 }
478 if (ret > 0) {
479 ovector = pcre2_get_ovector_pointer(p->pcre2_match_data);
480 ret = 0;
481 match->rm_so = (int)ovector[0];
482 match->rm_eo = (int)ovector[1];
483 }
484
485 return ret;
486 }
487
488 static void free_pcre2_pattern(struct grep_pat *p)
489 {
490 pcre2_compile_context_free(p->pcre2_compile_context);
491 pcre2_code_free(p->pcre2_pattern);
492 pcre2_match_data_free(p->pcre2_match_data);
493 free((void *)p->pcre2_tables);
494 }
495 #else /* !USE_LIBPCRE2 */
/* Built without PCRE2: asking for a Perl-compatible pattern is fatal. */
static void compile_pcre2_pattern(struct grep_pat *p, const struct grep_opt *opt)
{
	die("cannot use Perl-compatible regexes when not compiled with USE_LIBPCRE");
}

/*
 * Built without PCRE2: always returns 1. patmatch() negates the return
 * value, so this reads as "no match".
 */
static int pcre2match(struct grep_pat *p, const char *line, const char *eol,
		      regmatch_t *match, int eflags)
{
	return 1;
}

/* Built without PCRE2: there is nothing to release. */
static void free_pcre2_pattern(struct grep_pat *p)
{
}
510
511 static void compile_fixed_regexp(struct grep_pat *p, struct grep_opt *opt)
512 {
513 struct strbuf sb = STRBUF_INIT;
514 int err;
515 int regflags = 0;
516
517 basic_regex_quote_buf(&sb, p->pattern);
518 if (opt->ignore_case)
519 regflags |= REG_ICASE;
520 err = regcomp(&p->regexp, sb.buf, regflags);
521 strbuf_release(&sb);
522 if (err) {
523 char errbuf[1024];
524 regerror(err, &p->regexp, errbuf, sizeof(errbuf));
525 compile_regexp_failed(p, errbuf);
526 }
527 }
528 #endif /* !USE_LIBPCRE2 */
529
530 static void compile_regexp(struct grep_pat *p, struct grep_opt *opt)
531 {
532 int err;
533 int regflags = REG_NEWLINE;
534
535 p->word_regexp = opt->word_regexp;
536 p->ignore_case = opt->ignore_case;
537 p->fixed = opt->fixed;
538
539 if (memchr(p->pattern, 0, p->patternlen) && !opt->pcre2)
540 die(_("given pattern contains NULL byte (via -f <file>). This is only supported with -P under PCRE v2"));
541
542 p->is_fixed = is_fixed(p->pattern, p->patternlen);
543 #ifdef USE_LIBPCRE2
544 if (!p->fixed && !p->is_fixed) {
545 const char *no_jit = "(*NO_JIT)";
546 const int no_jit_len = strlen(no_jit);
547 if (starts_with(p->pattern, no_jit) &&
548 is_fixed(p->pattern + no_jit_len,
549 p->patternlen - no_jit_len))
550 p->is_fixed = 1;
551 }
552 #endif
553 if (p->fixed || p->is_fixed) {
554 #ifdef USE_LIBPCRE2
555 if (p->is_fixed) {
556 compile_pcre2_pattern(p, opt);
557 } else {
558 /*
559 * E.g. t7811-grep-open.sh relies on the
560 * pattern being restored.
561 */
562 char *old_pattern = p->pattern;
563 size_t old_patternlen = p->patternlen;
564 struct strbuf sb = STRBUF_INIT;
565
566 /*
567 * There is the PCRE2_LITERAL flag, but it's
568 * only in PCRE v2 10.30 and later. Needing to
569 * ifdef our way around that and dealing with
570 * it + PCRE2_MULTILINE being an error is more
571 * complex than just quoting this ourselves.
572 */
573 strbuf_add(&sb, "\\Q", 2);
574 strbuf_add(&sb, p->pattern, p->patternlen);
575 strbuf_add(&sb, "\\E", 2);
576
577 p->pattern = sb.buf;
578 p->patternlen = sb.len;
579 compile_pcre2_pattern(p, opt);
580 p->pattern = old_pattern;
581 p->patternlen = old_patternlen;
582 strbuf_release(&sb);
583 }
584 #else /* !USE_LIBPCRE2 */
585 compile_fixed_regexp(p, opt);
586 #endif /* !USE_LIBPCRE2 */
587 return;
588 }
589
590 if (opt->pcre2) {
591 compile_pcre2_pattern(p, opt);
592 return;
593 }
594
595 if (p->ignore_case)
596 regflags |= REG_ICASE;
597 if (opt->extended_regexp_option)
598 regflags |= REG_EXTENDED;
599 err = regcomp(&p->regexp, p->pattern, regflags);
600 if (err) {
601 char errbuf[1024];
602 regerror(err, &p->regexp, errbuf, 1024);
603 compile_regexp_failed(p, errbuf);
604 }
605 }
606
607 static struct grep_expr *compile_pattern_or(struct grep_pat **);
608 static struct grep_expr *compile_pattern_atom(struct grep_pat **list)
609 {
610 struct grep_pat *p;
611 struct grep_expr *x;
612
613 p = *list;
614 if (!p)
615 return NULL;
616 switch (p->token) {
617 case GREP_PATTERN: /* atom */
618 case GREP_PATTERN_HEAD:
619 case GREP_PATTERN_BODY:
620 x = xcalloc(1, sizeof (struct grep_expr));
621 x->node = GREP_NODE_ATOM;
622 x->u.atom = p;
623 *list = p->next;
624 return x;
625 case GREP_OPEN_PAREN:
626 *list = p->next;
627 x = compile_pattern_or(list);
628 if (!*list || (*list)->token != GREP_CLOSE_PAREN)
629 die("unmatched parenthesis");
630 *list = (*list)->next;
631 return x;
632 default:
633 return NULL;
634 }
635 }
636
637 static struct grep_expr *compile_pattern_not(struct grep_pat **list)
638 {
639 struct grep_pat *p;
640 struct grep_expr *x;
641
642 p = *list;
643 if (!p)
644 return NULL;
645 switch (p->token) {
646 case GREP_NOT:
647 if (!p->next)
648 die("--not not followed by pattern expression");
649 *list = p->next;
650 x = xcalloc(1, sizeof (struct grep_expr));
651 x->node = GREP_NODE_NOT;
652 x->u.unary = compile_pattern_not(list);
653 if (!x->u.unary)
654 die("--not followed by non pattern expression");
655 return x;
656 default:
657 return compile_pattern_atom(list);
658 }
659 }
660
661 static struct grep_expr *compile_pattern_and(struct grep_pat **list)
662 {
663 struct grep_pat *p;
664 struct grep_expr *x, *y, *z;
665
666 x = compile_pattern_not(list);
667 p = *list;
668 if (p && p->token == GREP_AND) {
669 if (!p->next)
670 die("--and not followed by pattern expression");
671 *list = p->next;
672 y = compile_pattern_and(list);
673 if (!y)
674 die("--and not followed by pattern expression");
675 z = xcalloc(1, sizeof (struct grep_expr));
676 z->node = GREP_NODE_AND;
677 z->u.binary.left = x;
678 z->u.binary.right = y;
679 return z;
680 }
681 return x;
682 }
683
684 static struct grep_expr *compile_pattern_or(struct grep_pat **list)
685 {
686 struct grep_pat *p;
687 struct grep_expr *x, *y, *z;
688
689 x = compile_pattern_and(list);
690 p = *list;
691 if (x && p && p->token != GREP_CLOSE_PAREN) {
692 y = compile_pattern_or(list);
693 if (!y)
694 die("not a pattern expression %s", p->pattern);
695 z = xcalloc(1, sizeof (struct grep_expr));
696 z->node = GREP_NODE_OR;
697 z->u.binary.left = x;
698 z->u.binary.right = y;
699 return z;
700 }
701 return x;
702 }
703
/*
 * Entry point of the expression parser: a complete expression is an
 * OR-sequence, so this simply delegates to compile_pattern_or().
 */
static struct grep_expr *compile_pattern_expr(struct grep_pat **list)
{
	struct grep_expr *expr = compile_pattern_or(list);

	return expr;
}
708
709 static struct grep_expr *grep_true_expr(void)
710 {
711 struct grep_expr *z = xcalloc(1, sizeof(*z));
712 z->node = GREP_NODE_TRUE;
713 return z;
714 }
715
716 static struct grep_expr *grep_or_expr(struct grep_expr *left, struct grep_expr *right)
717 {
718 struct grep_expr *z = xcalloc(1, sizeof(*z));
719 z->node = GREP_NODE_OR;
720 z->u.binary.left = left;
721 z->u.binary.right = right;
722 return z;
723 }
724
725 static struct grep_expr *prep_header_patterns(struct grep_opt *opt)
726 {
727 struct grep_pat *p;
728 struct grep_expr *header_expr;
729 struct grep_expr *(header_group[GREP_HEADER_FIELD_MAX]);
730 enum grep_header_field fld;
731
732 if (!opt->header_list)
733 return NULL;
734
735 for (p = opt->header_list; p; p = p->next) {
736 if (p->token != GREP_PATTERN_HEAD)
737 BUG("a non-header pattern in grep header list.");
738 if (p->field < GREP_HEADER_FIELD_MIN ||
739 GREP_HEADER_FIELD_MAX <= p->field)
740 BUG("unknown header field %d", p->field);
741 compile_regexp(p, opt);
742 }
743
744 for (fld = 0; fld < GREP_HEADER_FIELD_MAX; fld++)
745 header_group[fld] = NULL;
746
747 for (p = opt->header_list; p; p = p->next) {
748 struct grep_expr *h;
749 struct grep_pat *pp = p;
750
751 h = compile_pattern_atom(&pp);
752 if (!h || pp != p->next)
753 BUG("malformed header expr");
754 if (!header_group[p->field]) {
755 header_group[p->field] = h;
756 continue;
757 }
758 header_group[p->field] = grep_or_expr(h, header_group[p->field]);
759 }
760
761 header_expr = NULL;
762
763 for (fld = 0; fld < GREP_HEADER_FIELD_MAX; fld++) {
764 if (!header_group[fld])
765 continue;
766 if (!header_expr)
767 header_expr = grep_true_expr();
768 header_expr = grep_or_expr(header_group[fld], header_expr);
769 }
770 return header_expr;
771 }
772
773 static struct grep_expr *grep_splice_or(struct grep_expr *x, struct grep_expr *y)
774 {
775 struct grep_expr *z = x;
776
777 while (x) {
778 assert(x->node == GREP_NODE_OR);
779 if (x->u.binary.right &&
780 x->u.binary.right->node == GREP_NODE_TRUE) {
781 x->u.binary.right = y;
782 break;
783 }
784 x = x->u.binary.right;
785 }
786 return z;
787 }
788
789 void compile_grep_patterns(struct grep_opt *opt)
790 {
791 struct grep_pat *p;
792 struct grep_expr *header_expr = prep_header_patterns(opt);
793
794 for (p = opt->pattern_list; p; p = p->next) {
795 switch (p->token) {
796 case GREP_PATTERN: /* atom */
797 case GREP_PATTERN_HEAD:
798 case GREP_PATTERN_BODY:
799 compile_regexp(p, opt);
800 break;
801 default:
802 opt->extended = 1;
803 break;
804 }
805 }
806
807 if (opt->all_match || header_expr)
808 opt->extended = 1;
809 else if (!opt->extended)
810 return;
811
812 p = opt->pattern_list;
813 if (p)
814 opt->pattern_expression = compile_pattern_expr(&p);
815 if (p)
816 die("incomplete pattern expression: %s", p->pattern);
817
818 if (!header_expr)
819 return;
820
821 if (!opt->pattern_expression)
822 opt->pattern_expression = header_expr;
823 else if (opt->all_match)
824 opt->pattern_expression = grep_splice_or(header_expr,
825 opt->pattern_expression);
826 else
827 opt->pattern_expression = grep_or_expr(opt->pattern_expression,
828 header_expr);
829 opt->all_match = 1;
830 }
831
832 static void free_pattern_expr(struct grep_expr *x)
833 {
834 switch (x->node) {
835 case GREP_NODE_TRUE:
836 case GREP_NODE_ATOM:
837 break;
838 case GREP_NODE_NOT:
839 free_pattern_expr(x->u.unary);
840 break;
841 case GREP_NODE_AND:
842 case GREP_NODE_OR:
843 free_pattern_expr(x->u.binary.left);
844 free_pattern_expr(x->u.binary.right);
845 break;
846 }
847 free(x);
848 }
849
850 void free_grep_patterns(struct grep_opt *opt)
851 {
852 struct grep_pat *p, *n;
853
854 for (p = opt->pattern_list; p; p = n) {
855 n = p->next;
856 switch (p->token) {
857 case GREP_PATTERN: /* atom */
858 case GREP_PATTERN_HEAD:
859 case GREP_PATTERN_BODY:
860 if (p->pcre2_pattern)
861 free_pcre2_pattern(p);
862 else
863 regfree(&p->regexp);
864 free(p->pattern);
865 break;
866 default:
867 break;
868 }
869 free(p);
870 }
871
872 if (!opt->extended)
873 return;
874 free_pattern_expr(opt->pattern_expression);
875 }
876
/*
 * Find the end of the line starting at `cp`, looking at no more than
 * `*left` bytes. Returns a pointer to the newline, or to the byte just
 * past the examined range if there is none, and reduces `*left` by the
 * number of bytes skipped.
 */
static char *end_of_line(char *cp, unsigned long *left)
{
	unsigned long avail = *left;
	char *newline = memchr(cp, '\n', avail);

	if (!newline) {
		*left = 0;
		return cp + avail;
	}

	*left = avail - (unsigned long)(newline - cp);
	return newline;
}
887
/* Return 1 if `ch` is alphanumeric or an underscore, 0 otherwise. */
static int word_char(char ch)
{
	if (ch == '_')
		return 1;
	return isalnum(ch) ? 1 : 0;
}
892
893 static void output_color(struct grep_opt *opt, const void *data, size_t size,
894 const char *color)
895 {
896 if (want_color(opt->color) && color && color[0]) {
897 opt->output(opt, color, strlen(color));
898 opt->output(opt, data, size);
899 opt->output(opt, GIT_COLOR_RESET, strlen(GIT_COLOR_RESET));
900 } else
901 opt->output(opt, data, size);
902 }
903
904 static void output_sep(struct grep_opt *opt, char sign)
905 {
906 if (opt->null_following_name)
907 opt->output(opt, "\0", 1);
908 else
909 output_color(opt, &sign, 1, opt->colors[GREP_COLOR_SEP]);
910 }
911
912 static void show_name(struct grep_opt *opt, const char *name)
913 {
914 output_color(opt, name, strlen(name), opt->colors[GREP_COLOR_FILENAME]);
915 opt->output(opt, opt->null_following_name ? "\0" : "\n", 1);
916 }
917
918 static int patmatch(struct grep_pat *p, char *line, char *eol,
919 regmatch_t *match, int eflags)
920 {
921 int hit;
922
923 if (p->pcre2_pattern)
924 hit = !pcre2match(p, line, eol, match, eflags);
925 else
926 hit = !regexec_buf(&p->regexp, line, eol - line, 1, match,
927 eflags);
928
929 return hit;
930 }
931
/*
 * Cut the line [bol, *eol_p) off right after its last '>', searching
 * backwards from the end (the byte at `bol` itself is not examined).
 *
 * On success the byte following the '>' is overwritten with NUL,
 * `*eol_p` is moved to that position, and the overwritten byte is
 * returned so that the caller can put it back. Returns 0 and changes
 * nothing when no '>' is found.
 */
static int strip_timestamp(char *bol, char **eol_p)
{
	char *scan;

	for (scan = *eol_p - 1; bol < scan; scan--) {
		int saved;

		if (*scan != '>')
			continue;

		scan++;
		*eol_p = scan;
		saved = *scan;
		*scan = '\0';
		return saved;
	}

	return 0;
}
947
/*
 * Header line prefixes together with their lengths. match_one_pattern()
 * indexes this table with the `field` value of a header pattern, so the
 * entries have to follow the order of enum grep_header_field.
 */
static struct {
	const char *field;
	size_t len;
} header_field[] = {
	{ "author ", 7 },
	{ "committer ", 10 },
	{ "reflog ", 7 },
};
956
/*
 * Try pattern `p` against the line [bol, eol) in context `ctx`.
 *
 * Returns non-zero on a hit, in which case pmatch[0] holds the bounds
 * of the match as offsets from the original `bol`.
 *
 * Header patterns only apply in GREP_CONTEXT_HEAD, and body patterns
 * only outside of it. A header pattern additionally requires the line
 * to begin with its field name and is matched against the text after
 * that name; for author and committer lines the text following the last
 * '>' is hidden for the duration of the match.
 *
 * With p->word_regexp set, a match only counts when it is non-empty and
 * delimited by non-word characters (or the line boundaries); further
 * candidates later on the line are tried until one qualifies.
 */
static int match_one_pattern(struct grep_pat *p, char *bol, char *eol,
			     enum grep_context ctx,
			     regmatch_t *pmatch, int eflags)
{
	int hit = 0;
	int saved_ch = 0;
	const char *start = bol;

	/* Head patterns belong to the head context, body patterns to the rest. */
	if ((p->token != GREP_PATTERN) &&
	    ((p->token == GREP_PATTERN_HEAD) != (ctx == GREP_CONTEXT_HEAD)))
		return 0;

	if (p->token == GREP_PATTERN_HEAD) {
		const char *field;
		size_t len;
		assert(p->field < ARRAY_SIZE(header_field));
		field = header_field[p->field].field;
		len = header_field[p->field].len;
		if (strncmp(bol, field, len))
			return 0;
		bol += len;
		switch (p->field) {
		case GREP_HEADER_AUTHOR:
		case GREP_HEADER_COMMITTER:
			/*
			 * Temporarily end the line after the last '>';
			 * saved_ch is the byte to restore afterwards.
			 */
			saved_ch = strip_timestamp(bol, &eol);
			break;
		default:
			break;
		}
	}

 again:
	hit = patmatch(p, bol, eol, pmatch, eflags);

	if (hit && p->word_regexp) {
		if ((pmatch[0].rm_so < 0) ||
		    (eol - bol) < pmatch[0].rm_so ||
		    (pmatch[0].rm_eo < 0) ||
		    (eol - bol) < pmatch[0].rm_eo)
			die("regexp returned nonsense");

		/* Match beginning must be either beginning of the
		 * line, or at word boundary (i.e. the last char must
		 * not be a word char).  Similarly, match end must be
		 * either end of the line, or at word boundary
		 * (i.e. the next char must not be a word char).
		 */
		if ( ((pmatch[0].rm_so == 0) ||
		      !word_char(bol[pmatch[0].rm_so-1])) &&
		     ((pmatch[0].rm_eo == (eol-bol)) ||
		      !word_char(bol[pmatch[0].rm_eo])) )
			;
		else
			hit = 0;

		/* Words consist of at least one character. */
		if (pmatch->rm_so == pmatch->rm_eo)
			hit = 0;

		if (!hit && pmatch[0].rm_so + bol + 1 < eol) {
			/* There could be more than one match on the
			 * line, and the first match might not be
			 * strict word match.  But later ones could be!
			 * Forward to the next possible start, i.e. the
			 * next position following a non-word char.
			 */
			bol = pmatch[0].rm_so + bol + 1;
			while (word_char(bol[-1]) && bol < eol)
				bol++;
			/* We are no longer at the real start of the line. */
			eflags |= REG_NOTBOL;
			if (bol < eol)
				goto again;
		}
	}
	/* Put back the byte that strip_timestamp() overwrote. */
	if (p->token == GREP_PATTERN_HEAD && saved_ch)
		*eol = saved_ch;
	if (hit) {
		/* Report offsets relative to the original start of the line. */
		pmatch[0].rm_so += bol - start;
		pmatch[0].rm_eo += bol - start;
	}
	return hit;
}
1039
/*
 * Evaluate the expression tree `x` against the line [bol, eol) in
 * context `ctx`. Returns non-zero when the expression matches; a NULL
 * node or an unknown node type is fatal.
 *
 * `*col` is lowered to the smallest match offset found among the atoms
 * evaluated; `icol` takes the place of `col` below a NOT node. With
 * `collect_hits`, matching nodes have their `hit` flag set as a side
 * effect.
 */
static int match_expr_eval(struct grep_opt *opt, struct grep_expr *x, char *bol,
			   char *eol, enum grep_context ctx, ssize_t *col,
			   ssize_t *icol, int collect_hits)
{
	int h = 0;

	if (!x)
		die("Not a valid grep expression");
	switch (x->node) {
	case GREP_NODE_TRUE:
		h = 1;
		break;
	case GREP_NODE_ATOM:
		{
			regmatch_t tmp;
			h = match_one_pattern(x->u.atom, bol, eol, ctx,
					      &tmp, 0);
			/* remember the earliest match start seen so far */
			if (h && (*col < 0 || tmp.rm_so < *col))
				*col = tmp.rm_so;
		}
		break;
	case GREP_NODE_NOT:
		/*
		 * Upon visiting a GREP_NODE_NOT, col and icol become swapped.
		 */
		h = !match_expr_eval(opt, x->u.unary, bol, eol, ctx, icol, col,
				     0);
		break;
	case GREP_NODE_AND:
		h = match_expr_eval(opt, x->u.binary.left, bol, eol, ctx, col,
				    icol, 0);
		if (h || opt->columnnum) {
			/*
			 * Don't short-circuit AND when given --column, since a
			 * NOT earlier in the tree may turn this into an OR. In
			 * this case, see the below comment.
			 */
			h &= match_expr_eval(opt, x->u.binary.right, bol, eol,
					     ctx, col, icol, 0);
		}
		break;
	case GREP_NODE_OR:
		if (!(collect_hits || opt->columnnum)) {
			/*
			 * Don't short-circuit OR when given --column (or
			 * collecting hits) to ensure we don't skip a later
			 * child that would produce an earlier match.
			 */
			return (match_expr_eval(opt, x->u.binary.left, bol, eol,
						ctx, col, icol, 0) ||
				match_expr_eval(opt, x->u.binary.right, bol,
						eol, ctx, col, icol, 0));
		}
		h = match_expr_eval(opt, x->u.binary.left, bol, eol, ctx, col,
				    icol, 0);
		/*
		 * The left child was evaluated without collect_hits, so
		 * record its hit here; the flag is passed down the right
		 * side only.
		 */
		if (collect_hits)
			x->u.binary.left->hit |= h;
		h |= match_expr_eval(opt, x->u.binary.right, bol, eol, ctx, col,
				     icol, collect_hits);
		break;
	default:
		die("Unexpected node type (internal error) %d", x->node);
	}
	if (collect_hits)
		x->hit |= h;
	return h;
}
1107
1108 static int match_expr(struct grep_opt *opt, char *bol, char *eol,
1109 enum grep_context ctx, ssize_t *col,
1110 ssize_t *icol, int collect_hits)
1111 {
1112 struct grep_expr *x = opt->pattern_expression;
1113 return match_expr_eval(opt, x, bol, eol, ctx, col, icol, collect_hits);
1114 }
1115
1116 static int match_line(struct grep_opt *opt, char *bol, char *eol,
1117 ssize_t *col, ssize_t *icol,
1118 enum grep_context ctx, int collect_hits)
1119 {
1120 struct grep_pat *p;
1121 int hit = 0;
1122
1123 if (opt->extended)
1124 return match_expr(opt, bol, eol, ctx, col, icol,
1125 collect_hits);
1126
1127 /* we do not call with collect_hits without being extended */
1128 for (p = opt->pattern_list; p; p = p->next) {
1129 regmatch_t tmp;
1130 if (match_one_pattern(p, bol, eol, ctx, &tmp, 0)) {
1131 hit |= 1;
1132 if (!opt->columnnum) {
1133 /*
1134 * Without --column, any single match on a line
1135 * is enough to know that it needs to be
1136 * printed. With --column, scan _all_ patterns
1137 * to find the earliest.
1138 */
1139 break;
1140 }
1141 if (*col < 0 || tmp.rm_so < *col)
1142 *col = tmp.rm_so;
1143 }
1144 }
1145 return hit;
1146 }
1147
1148 static int match_next_pattern(struct grep_pat *p, char *bol, char *eol,
1149 enum grep_context ctx,
1150 regmatch_t *pmatch, int eflags)
1151 {
1152 regmatch_t match;
1153
1154 if (!match_one_pattern(p, bol, eol, ctx, &match, eflags))
1155 return 0;
1156 if (match.rm_so < 0 || match.rm_eo < 0)
1157 return 0;
1158 if (pmatch->rm_so >= 0 && pmatch->rm_eo >= 0) {
1159 if (match.rm_so > pmatch->rm_so)
1160 return 1;
1161 if (match.rm_so == pmatch->rm_so && match.rm_eo < pmatch->rm_eo)
1162 return 1;
1163 }
1164 pmatch->rm_so = match.rm_so;
1165 pmatch->rm_eo = match.rm_eo;
1166 return 1;
1167 }
1168
1169 static int next_match(struct grep_opt *opt, char *bol, char *eol,
1170 enum grep_context ctx, regmatch_t *pmatch, int eflags)
1171 {
1172 struct grep_pat *p;
1173 int hit = 0;
1174
1175 pmatch->rm_so = pmatch->rm_eo = -1;
1176 if (bol < eol) {
1177 for (p = opt->pattern_list; p; p = p->next) {
1178 switch (p->token) {
1179 case GREP_PATTERN: /* atom */
1180 case GREP_PATTERN_HEAD:
1181 case GREP_PATTERN_BODY:
1182 hit |= match_next_pattern(p, bol, eol, ctx,
1183 pmatch, eflags);
1184 break;
1185 default:
1186 break;
1187 }
1188 }
1189 }
1190 return hit;
1191 }
1192
1193 static void show_line_header(struct grep_opt *opt, const char *name,
1194 unsigned lno, ssize_t cno, char sign)
1195 {
1196 if (opt->heading && opt->last_shown == 0) {
1197 output_color(opt, name, strlen(name), opt->colors[GREP_COLOR_FILENAME]);
1198 opt->output(opt, "\n", 1);
1199 }
1200 opt->last_shown = lno;
1201
1202 if (!opt->heading && opt->pathname) {
1203 output_color(opt, name, strlen(name), opt->colors[GREP_COLOR_FILENAME]);
1204 output_sep(opt, sign);
1205 }
1206 if (opt->linenum) {
1207 char buf[32];
1208 xsnprintf(buf, sizeof(buf), "%d", lno);
1209 output_color(opt, buf, strlen(buf), opt->colors[GREP_COLOR_LINENO]);
1210 output_sep(opt, sign);
1211 }
1212 /*
1213 * Treat 'cno' as the 1-indexed offset from the start of a non-context
1214 * line to its first match. Otherwise, 'cno' is 0 indicating that we are
1215 * being called with a context line.
1216 */
1217 if (opt->columnnum && cno) {
1218 char buf[32];
1219 xsnprintf(buf, sizeof(buf), "%"PRIuMAX, (uintmax_t)cno);
1220 output_color(opt, buf, strlen(buf), opt->colors[GREP_COLOR_COLUMNNO]);
1221 output_sep(opt, sign);
1222 }
1223 }
1224
1225 static void show_line(struct grep_opt *opt, char *bol, char *eol,
1226 const char *name, unsigned lno, ssize_t cno, char sign)
1227 {
1228 int rest = eol - bol;
1229 const char *match_color = NULL;
1230 const char *line_color = NULL;
1231
1232 if (opt->file_break && opt->last_shown == 0) {
1233 if (opt->show_hunk_mark)
1234 opt->output(opt, "\n", 1);
1235 } else if (opt->pre_context || opt->post_context || opt->funcbody) {
1236 if (opt->last_shown == 0) {
1237 if (opt->show_hunk_mark) {
1238 output_color(opt, "--", 2, opt->colors[GREP_COLOR_SEP]);
1239 opt->output(opt, "\n", 1);
1240 }
1241 } else if (lno > opt->last_shown + 1) {
1242 output_color(opt, "--", 2, opt->colors[GREP_COLOR_SEP]);
1243 opt->output(opt, "\n", 1);
1244 }
1245 }
1246 if (!opt->only_matching) {
1247 /*
1248 * In case the line we're being called with contains more than
1249 * one match, leave printing each header to the loop below.
1250 */
1251 show_line_header(opt, name, lno, cno, sign);
1252 }
1253 if (opt->color || opt->only_matching) {
1254 regmatch_t match;
1255 enum grep_context ctx = GREP_CONTEXT_BODY;
1256 int ch = *eol;
1257 int eflags = 0;
1258
1259 if (opt->color) {
1260 if (sign == ':')
1261 match_color = opt->colors[GREP_COLOR_MATCH_SELECTED];
1262 else
1263 match_color = opt->colors[GREP_COLOR_MATCH_CONTEXT];
1264 if (sign == ':')
1265 line_color = opt->colors[GREP_COLOR_SELECTED];
1266 else if (sign == '-')
1267 line_color = opt->colors[GREP_COLOR_CONTEXT];
1268 else if (sign == '=')
1269 line_color = opt->colors[GREP_COLOR_FUNCTION];
1270 }
1271 *eol = '\0';
1272 while (next_match(opt, bol, eol, ctx, &match, eflags)) {
1273 if (match.rm_so == match.rm_eo)
1274 break;
1275
1276 if (opt->only_matching)
1277 show_line_header(opt, name, lno, cno, sign);
1278 else
1279 output_color(opt, bol, match.rm_so, line_color);
1280 output_color(opt, bol + match.rm_so,
1281 match.rm_eo - match.rm_so, match_color);
1282 if (opt->only_matching)
1283 opt->output(opt, "\n", 1);
1284 bol += match.rm_eo;
1285 cno += match.rm_eo;
1286 rest -= match.rm_eo;
1287 eflags = REG_NOTBOL;
1288 }
1289 *eol = ch;
1290 }
1291 if (!opt->only_matching) {
1292 output_color(opt, bol, rest, line_color);
1293 opt->output(opt, "\n", 1);
1294 }
1295 }
1296
/*
 * When non-zero, grep_attr_lock() and grep_attr_unlock() actually
 * take and release grep_attr_mutex; when zero they do nothing.
 */
int grep_use_locks;

/*
 * This lock protects access to the gitattributes machinery, which is
 * not thread-safe.
 */
pthread_mutex_t grep_attr_mutex;
1304
1305 static inline void grep_attr_lock(void)
1306 {
1307 if (grep_use_locks)
1308 pthread_mutex_lock(&grep_attr_mutex);
1309 }
1310
1311 static inline void grep_attr_unlock(void)
1312 {
1313 if (grep_use_locks)
1314 pthread_mutex_unlock(&grep_attr_mutex);
1315 }
1316
1317 static int match_funcname(struct grep_opt *opt, struct grep_source *gs, char *bol, char *eol)
1318 {
1319 xdemitconf_t *xecfg = opt->priv;
1320 if (xecfg && !xecfg->find_func) {
1321 grep_source_load_driver(gs, opt->repo->index);
1322 if (gs->driver->funcname.pattern) {
1323 const struct userdiff_funcname *pe = &gs->driver->funcname;
1324 xdiff_set_find_func(xecfg, pe->pattern, pe->cflags);
1325 } else {
1326 xecfg = opt->priv = NULL;
1327 }
1328 }
1329
1330 if (xecfg) {
1331 char buf[1];
1332 return xecfg->find_func(bol, eol - bol, buf, 1,
1333 xecfg->find_func_priv) >= 0;
1334 }
1335
1336 if (bol == eol)
1337 return 0;
1338 if (isalpha(*bol) || *bol == '_' || *bol == '$')
1339 return 1;
1340 return 0;
1341 }
1342
1343 static void show_funcname_line(struct grep_opt *opt, struct grep_source *gs,
1344 char *bol, unsigned lno)
1345 {
1346 while (bol > gs->buf) {
1347 char *eol = --bol;
1348
1349 while (bol > gs->buf && bol[-1] != '\n')
1350 bol--;
1351 lno--;
1352
1353 if (lno <= opt->last_shown)
1354 break;
1355
1356 if (match_funcname(opt, gs, bol, eol)) {
1357 show_line(opt, bol, eol, gs->name, lno, 0, '=');
1358 break;
1359 }
1360 }
1361 }
1362
1363 static int is_empty_line(const char *bol, const char *eol);
1364
1365 static void show_pre_context(struct grep_opt *opt, struct grep_source *gs,
1366 char *bol, char *end, unsigned lno)
1367 {
1368 unsigned cur = lno, from = 1, funcname_lno = 0, orig_from;
1369 int funcname_needed = !!opt->funcname, comment_needed = 0;
1370
1371 if (opt->pre_context < lno)
1372 from = lno - opt->pre_context;
1373 if (from <= opt->last_shown)
1374 from = opt->last_shown + 1;
1375 orig_from = from;
1376 if (opt->funcbody) {
1377 if (match_funcname(opt, gs, bol, end))
1378 comment_needed = 1;
1379 else
1380 funcname_needed = 1;
1381 from = opt->last_shown + 1;
1382 }
1383
1384 /* Rewind. */
1385 while (bol > gs->buf && cur > from) {
1386 char *next_bol = bol;
1387 char *eol = --bol;
1388
1389 while (bol > gs->buf && bol[-1] != '\n')
1390 bol--;
1391 cur--;
1392 if (comment_needed && (is_empty_line(bol, eol) ||
1393 match_funcname(opt, gs, bol, eol))) {
1394 comment_needed = 0;
1395 from = orig_from;
1396 if (cur < from) {
1397 cur++;
1398 bol = next_bol;
1399 break;
1400 }
1401 }
1402 if (funcname_needed && match_funcname(opt, gs, bol, eol)) {
1403 funcname_lno = cur;
1404 funcname_needed = 0;
1405 if (opt->funcbody)
1406 comment_needed = 1;
1407 else
1408 from = orig_from;
1409 }
1410 }
1411
1412 /* We need to look even further back to find a function signature. */
1413 if (opt->funcname && funcname_needed)
1414 show_funcname_line(opt, gs, bol, cur);
1415
1416 /* Back forward. */
1417 while (cur < lno) {
1418 char *eol = bol, sign = (cur == funcname_lno) ? '=' : '-';
1419
1420 while (*eol != '\n')
1421 eol++;
1422 show_line(opt, bol, eol, gs->name, cur, 0, sign);
1423 bol = eol + 1;
1424 cur++;
1425 }
1426 }
1427
1428 static int should_lookahead(struct grep_opt *opt)
1429 {
1430 struct grep_pat *p;
1431
1432 if (opt->extended)
1433 return 0; /* punt for too complex stuff */
1434 if (opt->invert)
1435 return 0;
1436 for (p = opt->pattern_list; p; p = p->next) {
1437 if (p->token != GREP_PATTERN)
1438 return 0; /* punt for "header only" and stuff */
1439 }
1440 return 1;
1441 }
1442
1443 static int look_ahead(struct grep_opt *opt,
1444 unsigned long *left_p,
1445 unsigned *lno_p,
1446 char **bol_p)
1447 {
1448 unsigned lno = *lno_p;
1449 char *bol = *bol_p;
1450 struct grep_pat *p;
1451 char *sp, *last_bol;
1452 regoff_t earliest = -1;
1453
1454 for (p = opt->pattern_list; p; p = p->next) {
1455 int hit;
1456 regmatch_t m;
1457
1458 hit = patmatch(p, bol, bol + *left_p, &m, 0);
1459 if (!hit || m.rm_so < 0 || m.rm_eo < 0)
1460 continue;
1461 if (earliest < 0 || m.rm_so < earliest)
1462 earliest = m.rm_so;
1463 }
1464
1465 if (earliest < 0) {
1466 *bol_p = bol + *left_p;
1467 *left_p = 0;
1468 return 1;
1469 }
1470 for (sp = bol + earliest; bol < sp && sp[-1] != '\n'; sp--)
1471 ; /* find the beginning of the line */
1472 last_bol = sp;
1473
1474 for (sp = bol; sp < last_bol; sp++) {
1475 if (*sp == '\n')
1476 lno++;
1477 }
1478 *left_p -= last_bol - bol;
1479 *bol_p = last_bol;
1480 *lno_p = lno;
1481 return 0;
1482 }
1483
1484 static int fill_textconv_grep(struct repository *r,
1485 struct userdiff_driver *driver,
1486 struct grep_source *gs)
1487 {
1488 struct diff_filespec *df;
1489 char *buf;
1490 size_t size;
1491
1492 if (!driver || !driver->textconv)
1493 return grep_source_load(gs);
1494
1495 /*
1496 * The textconv interface is intimately tied to diff_filespecs, so we
1497 * have to pretend to be one. If we could unify the grep_source
1498 * and diff_filespec structs, this mess could just go away.
1499 */
1500 df = alloc_filespec(gs->path);
1501 switch (gs->type) {
1502 case GREP_SOURCE_OID:
1503 fill_filespec(df, gs->identifier, 1, 0100644);
1504 break;
1505 case GREP_SOURCE_FILE:
1506 fill_filespec(df, &null_oid, 0, 0100644);
1507 break;
1508 default:
1509 BUG("attempt to textconv something without a path?");
1510 }
1511
1512 /*
1513 * fill_textconv is not remotely thread-safe; it modifies the global
1514 * diff tempfile structure, writes to the_repo's odb and might
1515 * internally call thread-unsafe functions such as the
1516 * prepare_packed_git() lazy-initializator. Because of the last two, we
1517 * must ensure mutual exclusion between this call and the object reading
1518 * API, thus we use obj_read_lock() here.
1519 *
1520 * TODO: allowing text conversion to run in parallel with object
1521 * reading operations might increase performance in the multithreaded
1522 * non-worktreee git-grep with --textconv.
1523 */
1524 obj_read_lock();
1525 size = fill_textconv(r, driver, df, &buf);
1526 obj_read_unlock();
1527 free_filespec(df);
1528
1529 /*
1530 * The normal fill_textconv usage by the diff machinery would just keep
1531 * the textconv'd buf separate from the diff_filespec. But much of the
1532 * grep code passes around a grep_source and assumes that its "buf"
1533 * pointer is the beginning of the thing we are searching. So let's
1534 * install our textconv'd version into the grep_source, taking care not
1535 * to leak any existing buffer.
1536 */
1537 grep_source_clear_data(gs);
1538 gs->buf = buf;
1539 gs->size = size;
1540
1541 return 0;
1542 }
1543
/*
 * Return non-zero if [bol, eol) contains nothing but whitespace; an
 * empty range counts as an empty line.
 */
static int is_empty_line(const char *bol, const char *eol)
{
	const char *p;

	for (p = bol; p < eol; p++) {
		if (!isspace(*p))
			break;
	}
	return p == eol;
}
1550
1551 static int grep_source_1(struct grep_opt *opt, struct grep_source *gs, int collect_hits)
1552 {
1553 char *bol;
1554 char *peek_bol = NULL;
1555 unsigned long left;
1556 unsigned lno = 1;
1557 unsigned last_hit = 0;
1558 int binary_match_only = 0;
1559 unsigned count = 0;
1560 int try_lookahead = 0;
1561 int show_function = 0;
1562 struct userdiff_driver *textconv = NULL;
1563 enum grep_context ctx = GREP_CONTEXT_HEAD;
1564 xdemitconf_t xecfg;
1565
1566 if (!opt->status_only && gs->name == NULL)
1567 BUG("grep call which could print a name requires "
1568 "grep_source.name be non-NULL");
1569
1570 if (!opt->output)
1571 opt->output = std_output;
1572
1573 if (opt->pre_context || opt->post_context || opt->file_break ||
1574 opt->funcbody) {
1575 /* Show hunk marks, except for the first file. */
1576 if (opt->last_shown)
1577 opt->show_hunk_mark = 1;
1578 /*
1579 * If we're using threads then we can't easily identify
1580 * the first file. Always put hunk marks in that case
1581 * and skip the very first one later in work_done().
1582 */
1583 if (opt->output != std_output)
1584 opt->show_hunk_mark = 1;
1585 }
1586 opt->last_shown = 0;
1587
1588 if (opt->allow_textconv) {
1589 grep_source_load_driver(gs, opt->repo->index);
1590 /*
1591 * We might set up the shared textconv cache data here, which
1592 * is not thread-safe. Also, get_oid_with_context() and
1593 * parse_object() might be internally called. As they are not
1594 * currently thread-safe and might be racy with object reading,
1595 * obj_read_lock() must be called.
1596 */
1597 grep_attr_lock();
1598 obj_read_lock();
1599 textconv = userdiff_get_textconv(opt->repo, gs->driver);
1600 obj_read_unlock();
1601 grep_attr_unlock();
1602 }
1603
1604 /*
1605 * We know the result of a textconv is text, so we only have to care
1606 * about binary handling if we are not using it.
1607 */
1608 if (!textconv) {
1609 switch (opt->binary) {
1610 case GREP_BINARY_DEFAULT:
1611 if (grep_source_is_binary(gs, opt->repo->index))
1612 binary_match_only = 1;
1613 break;
1614 case GREP_BINARY_NOMATCH:
1615 if (grep_source_is_binary(gs, opt->repo->index))
1616 return 0; /* Assume unmatch */
1617 break;
1618 case GREP_BINARY_TEXT:
1619 break;
1620 default:
1621 BUG("unknown binary handling mode");
1622 }
1623 }
1624
1625 memset(&xecfg, 0, sizeof(xecfg));
1626 opt->priv = &xecfg;
1627
1628 try_lookahead = should_lookahead(opt);
1629
1630 if (fill_textconv_grep(opt->repo, textconv, gs) < 0)
1631 return 0;
1632
1633 bol = gs->buf;
1634 left = gs->size;
1635 while (left) {
1636 char *eol, ch;
1637 int hit;
1638 ssize_t cno;
1639 ssize_t col = -1, icol = -1;
1640
1641 /*
1642 * look_ahead() skips quickly to the line that possibly
1643 * has the next hit; don't call it if we need to do
1644 * something more than just skipping the current line
1645 * in response to an unmatch for the current line. E.g.
1646 * inside a post-context window, we will show the current
1647 * line as a context around the previous hit when it
1648 * doesn't hit.
1649 */
1650 if (try_lookahead
1651 && !(last_hit
1652 && (show_function ||
1653 lno <= last_hit + opt->post_context))
1654 && look_ahead(opt, &left, &lno, &bol))
1655 break;
1656 eol = end_of_line(bol, &left);
1657 ch = *eol;
1658 *eol = 0;
1659
1660 if ((ctx == GREP_CONTEXT_HEAD) && (eol == bol))
1661 ctx = GREP_CONTEXT_BODY;
1662
1663 hit = match_line(opt, bol, eol, &col, &icol, ctx, collect_hits);
1664 *eol = ch;
1665
1666 if (collect_hits)
1667 goto next_line;
1668
1669 /* "grep -v -e foo -e bla" should list lines
1670 * that do not have either, so inversion should
1671 * be done outside.
1672 */
1673 if (opt->invert)
1674 hit = !hit;
1675 if (opt->unmatch_name_only) {
1676 if (hit)
1677 return 0;
1678 goto next_line;
1679 }
1680 if (hit) {
1681 count++;
1682 if (opt->status_only)
1683 return 1;
1684 if (opt->name_only) {
1685 show_name(opt, gs->name);
1686 return 1;
1687 }
1688 if (opt->count)
1689 goto next_line;
1690 if (binary_match_only) {
1691 opt->output(opt, "Binary file ", 12);
1692 output_color(opt, gs->name, strlen(gs->name),
1693 opt->colors[GREP_COLOR_FILENAME]);
1694 opt->output(opt, " matches\n", 9);
1695 return 1;
1696 }
1697 /* Hit at this line. If we haven't shown the
1698 * pre-context lines, we would need to show them.
1699 */
1700 if (opt->pre_context || opt->funcbody)
1701 show_pre_context(opt, gs, bol, eol, lno);
1702 else if (opt->funcname)
1703 show_funcname_line(opt, gs, bol, lno);
1704 cno = opt->invert ? icol : col;
1705 if (cno < 0) {
1706 /*
1707 * A negative cno indicates that there was no
1708 * match on the line. We are thus inverted and
1709 * being asked to show all lines that _don't_
1710 * match a given expression. Therefore, set cno
1711 * to 0 to suggest the whole line matches.
1712 */
1713 cno = 0;
1714 }
1715 show_line(opt, bol, eol, gs->name, lno, cno + 1, ':');
1716 last_hit = lno;
1717 if (opt->funcbody)
1718 show_function = 1;
1719 goto next_line;
1720 }
1721 if (show_function && (!peek_bol || peek_bol < bol)) {
1722 unsigned long peek_left = left;
1723 char *peek_eol = eol;
1724
1725 /*
1726 * Trailing empty lines are not interesting.
1727 * Peek past them to see if they belong to the
1728 * body of the current function.
1729 */
1730 peek_bol = bol;
1731 while (is_empty_line(peek_bol, peek_eol)) {
1732 peek_bol = peek_eol + 1;
1733 peek_eol = end_of_line(peek_bol, &peek_left);
1734 }
1735
1736 if (match_funcname(opt, gs, peek_bol, peek_eol))
1737 show_function = 0;
1738 }
1739 if (show_function ||
1740 (last_hit && lno <= last_hit + opt->post_context)) {
1741 /* If the last hit is within the post context,
1742 * we need to show this line.
1743 */
1744 show_line(opt, bol, eol, gs->name, lno, col + 1, '-');
1745 }
1746
1747 next_line:
1748 bol = eol + 1;
1749 if (!left)
1750 break;
1751 left--;
1752 lno++;
1753 }
1754
1755 if (collect_hits)
1756 return 0;
1757
1758 if (opt->status_only)
1759 return opt->unmatch_name_only;
1760 if (opt->unmatch_name_only) {
1761 /* We did not see any hit, so we want to show this */
1762 show_name(opt, gs->name);
1763 return 1;
1764 }
1765
1766 xdiff_clear_find_func(&xecfg);
1767 opt->priv = NULL;
1768
1769 /* NEEDSWORK:
1770 * The real "grep -c foo *.c" gives many "bar.c:0" lines,
1771 * which feels mostly useless but sometimes useful. Maybe
1772 * make it another option? For now suppress them.
1773 */
1774 if (opt->count && count) {
1775 char buf[32];
1776 if (opt->pathname) {
1777 output_color(opt, gs->name, strlen(gs->name),
1778 opt->colors[GREP_COLOR_FILENAME]);
1779 output_sep(opt, ':');
1780 }
1781 xsnprintf(buf, sizeof(buf), "%u\n", count);
1782 opt->output(opt, buf, strlen(buf));
1783 return 1;
1784 }
1785 return !!last_hit;
1786 }
1787
1788 static void clr_hit_marker(struct grep_expr *x)
1789 {
1790 /* All-hit markers are meaningful only at the very top level
1791 * OR node.
1792 */
1793 while (1) {
1794 x->hit = 0;
1795 if (x->node != GREP_NODE_OR)
1796 return;
1797 x->u.binary.left->hit = 0;
1798 x = x->u.binary.right;
1799 }
1800 }
1801
1802 static int chk_hit_marker(struct grep_expr *x)
1803 {
1804 /* Top level nodes have hit markers. See if they all are hits */
1805 while (1) {
1806 if (x->node != GREP_NODE_OR)
1807 return x->hit;
1808 if (!x->u.binary.left->hit)
1809 return 0;
1810 x = x->u.binary.right;
1811 }
1812 }
1813
1814 int grep_source(struct grep_opt *opt, struct grep_source *gs)
1815 {
1816 /*
1817 * we do not have to do the two-pass grep when we do not check
1818 * buffer-wide "all-match".
1819 */
1820 if (!opt->all_match)
1821 return grep_source_1(opt, gs, 0);
1822
1823 /* Otherwise the toplevel "or" terms hit a bit differently.
1824 * We first clear hit markers from them.
1825 */
1826 clr_hit_marker(opt->pattern_expression);
1827 grep_source_1(opt, gs, 1);
1828
1829 if (!chk_hit_marker(opt->pattern_expression))
1830 return 0;
1831
1832 return grep_source_1(opt, gs, 0);
1833 }
1834
1835 int grep_buffer(struct grep_opt *opt, char *buf, unsigned long size)
1836 {
1837 struct grep_source gs;
1838 int r;
1839
1840 grep_source_init(&gs, GREP_SOURCE_BUF, NULL, NULL, NULL);
1841 gs.buf = buf;
1842 gs.size = size;
1843
1844 r = grep_source(opt, &gs);
1845
1846 grep_source_clear(&gs);
1847 return r;
1848 }
1849
1850 void grep_source_init(struct grep_source *gs, enum grep_source_type type,
1851 const char *name, const char *path,
1852 const void *identifier)
1853 {
1854 gs->type = type;
1855 gs->name = xstrdup_or_null(name);
1856 gs->path = xstrdup_or_null(path);
1857 gs->buf = NULL;
1858 gs->size = 0;
1859 gs->driver = NULL;
1860
1861 switch (type) {
1862 case GREP_SOURCE_FILE:
1863 gs->identifier = xstrdup(identifier);
1864 break;
1865 case GREP_SOURCE_OID:
1866 gs->identifier = oiddup(identifier);
1867 break;
1868 case GREP_SOURCE_BUF:
1869 gs->identifier = NULL;
1870 break;
1871 }
1872 }
1873
1874 void grep_source_clear(struct grep_source *gs)
1875 {
1876 FREE_AND_NULL(gs->name);
1877 FREE_AND_NULL(gs->path);
1878 FREE_AND_NULL(gs->identifier);
1879 grep_source_clear_data(gs);
1880 }
1881
1882 void grep_source_clear_data(struct grep_source *gs)
1883 {
1884 switch (gs->type) {
1885 case GREP_SOURCE_FILE:
1886 case GREP_SOURCE_OID:
1887 FREE_AND_NULL(gs->buf);
1888 gs->size = 0;
1889 break;
1890 case GREP_SOURCE_BUF:
1891 /* leave user-provided buf intact */
1892 break;
1893 }
1894 }
1895
1896 static int grep_source_load_oid(struct grep_source *gs)
1897 {
1898 enum object_type type;
1899
1900 gs->buf = read_object_file(gs->identifier, &type, &gs->size);
1901 if (!gs->buf)
1902 return error(_("'%s': unable to read %s"),
1903 gs->name,
1904 oid_to_hex(gs->identifier));
1905 return 0;
1906 }
1907
1908 static int grep_source_load_file(struct grep_source *gs)
1909 {
1910 const char *filename = gs->identifier;
1911 struct stat st;
1912 char *data;
1913 size_t size;
1914 int i;
1915
1916 if (lstat(filename, &st) < 0) {
1917 err_ret:
1918 if (errno != ENOENT)
1919 error_errno(_("failed to stat '%s'"), filename);
1920 return -1;
1921 }
1922 if (!S_ISREG(st.st_mode))
1923 return -1;
1924 size = xsize_t(st.st_size);
1925 i = open(filename, O_RDONLY);
1926 if (i < 0)
1927 goto err_ret;
1928 data = xmallocz(size);
1929 if (st.st_size != read_in_full(i, data, size)) {
1930 error_errno(_("'%s': short read"), filename);
1931 close(i);
1932 free(data);
1933 return -1;
1934 }
1935 close(i);
1936
1937 gs->buf = data;
1938 gs->size = size;
1939 return 0;
1940 }
1941
1942 static int grep_source_load(struct grep_source *gs)
1943 {
1944 if (gs->buf)
1945 return 0;
1946
1947 switch (gs->type) {
1948 case GREP_SOURCE_FILE:
1949 return grep_source_load_file(gs);
1950 case GREP_SOURCE_OID:
1951 return grep_source_load_oid(gs);
1952 case GREP_SOURCE_BUF:
1953 return gs->buf ? 0 : -1;
1954 }
1955 BUG("invalid grep_source type to load");
1956 }
1957
1958 void grep_source_load_driver(struct grep_source *gs,
1959 struct index_state *istate)
1960 {
1961 if (gs->driver)
1962 return;
1963
1964 grep_attr_lock();
1965 if (gs->path)
1966 gs->driver = userdiff_find_by_path(istate, gs->path);
1967 if (!gs->driver)
1968 gs->driver = userdiff_find_by_name("default");
1969 grep_attr_unlock();
1970 }
1971
1972 static int grep_source_is_binary(struct grep_source *gs,
1973 struct index_state *istate)
1974 {
1975 grep_source_load_driver(gs, istate);
1976 if (gs->driver->binary != -1)
1977 return gs->driver->binary;
1978
1979 if (!grep_source_load(gs))
1980 return buffer_is_binary(gs->buf, gs->size);
1981
1982 return 0;
1983 }