git.ipfire.org Git - thirdparty/git.git/commitdiff
grep: don't use PCRE2?_UTF8 with "log --encoding=<non-utf8>"
author: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Thu, 27 Jun 2019 23:39:05 +0000 (01:39 +0200)
committer: Junio C Hamano <gitster@pobox.com>
Fri, 28 Jun 2019 16:11:09 +0000 (09:11 -0700)
Fix a bug introduced in 18547aacf5 ("grep/pcre: support utf-8",
2016-06-25) that was missed due to a blind spot in our tests, as
discussed in the previous commit. I then blindly copied the same bug
in 94da9193a6 ("grep: add support for PCRE v2", 2017-06-01) when
adding the PCRE v2 code.

We should not tell PCRE that we're processing UTF-8 just because we're
dealing with non-ASCII. In the case of e.g. "log --encoding=<...>"
under is_utf8_locale() the haystack might be in ISO-8859-1, and the
needle might be in a non-UTF-8 encoding.

Maybe we should be more strict here and die earlier? Should we also be
converting the needle to the encoding in question, and failing if it's
not a string that's valid in that encoding? Maybe.

But for now matching this as non-UTF8 at least has some hope of
producing sensible results, since we know that our default heuristic
of assuming the text to be matched is in the user locale encoding
isn't true when we've explicitly encoded it to be in a different
encoding.

Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
grep.c
grep.h
revision.c
t/t4210-log-i18n.sh

diff --git a/grep.c b/grep.c
index f7c3a5803e8ea0bf2c609e9eeec6764217641cfc..1de4ab49c0c255c68252a6d586327be5a40fdbaf 100644 (file)
--- a/grep.c
+++ b/grep.c
@@ -388,11 +388,11 @@ static void compile_pcre1_regexp(struct grep_pat *p, const struct grep_opt *opt)
        int options = PCRE_MULTILINE;
 
        if (opt->ignore_case) {
-               if (has_non_ascii(p->pattern))
+               if (!opt->ignore_locale && has_non_ascii(p->pattern))
                        p->pcre1_tables = pcre_maketables();
                options |= PCRE_CASELESS;
        }
-       if (is_utf8_locale() && has_non_ascii(p->pattern))
+       if (!opt->ignore_locale && is_utf8_locale() && has_non_ascii(p->pattern))
                options |= PCRE_UTF8;
 
        p->pcre1_regexp = pcre_compile(p->pattern, options, &error, &erroffset,
@@ -498,14 +498,14 @@ static void compile_pcre2_pattern(struct grep_pat *p, const struct grep_opt *opt
        p->pcre2_compile_context = NULL;
 
        if (opt->ignore_case) {
-               if (has_non_ascii(p->pattern)) {
+               if (!opt->ignore_locale && has_non_ascii(p->pattern)) {
                        character_tables = pcre2_maketables(NULL);
                        p->pcre2_compile_context = pcre2_compile_context_create(NULL);
                        pcre2_set_character_tables(p->pcre2_compile_context, character_tables);
                }
                options |= PCRE2_CASELESS;
        }
-       if (is_utf8_locale() && has_non_ascii(p->pattern))
+       if (!opt->ignore_locale && is_utf8_locale() && has_non_ascii(p->pattern))
                options |= PCRE2_UTF;
 
        p->pcre2_pattern = pcre2_compile((PCRE2_SPTR)p->pattern,
diff --git a/grep.h b/grep.h
index 1875880f37990f981865b0fab332cc6c6c4e5971..4bb8a79d93187bc2a624a17014e79734620047ea 100644 (file)
--- a/grep.h
+++ b/grep.h
@@ -173,6 +173,7 @@ struct grep_opt {
        int funcbody;
        int extended_regexp_option;
        int pattern_type_option;
+       int ignore_locale;
        char colors[NR_GREP_COLORS][COLOR_MAXLEN];
        unsigned pre_context;
        unsigned post_context;
diff --git a/revision.c b/revision.c
index 621feb9df716400f32d016e1d36fc368b6a884fb..a842fb158af309aca4826c4bbb98cacb0b269afa 100644 (file)
--- a/revision.c
+++ b/revision.c
@@ -28,6 +28,7 @@
 #include "commit-graph.h"
 #include "prio-queue.h"
 #include "hashmap.h"
+#include "utf8.h"
 
 volatile show_early_output_fn_t show_early_output;
 
@@ -2655,6 +2656,8 @@ int setup_revisions(int argc, const char **argv, struct rev_info *revs, struct s
 
        grep_commit_pattern_type(GREP_PATTERN_TYPE_UNSPECIFIED,
                                 &revs->grep_filter);
+       if (!is_encoding_utf8(get_log_output_encoding()))
+               revs->grep_filter.ignore_locale = 1;
        compile_grep_patterns(&revs->grep_filter);
 
        if (revs->reverse && revs->reflog_info)
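diff --git a/t/t4210-log-i18n.sh b/t/t4210-log-i18n.sh
index 86d22c1d4cf6a1a573c5a791c9f2dafde75e98d6..515bcb7ce1268550eb367e0055434d5ef673b2d3 100755 (executable)
--- a/t/t4210-log-i18n.sh
+++ b/t/t4210-log-i18n.sh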
@@ -59,10 +59,8 @@ test_expect_success 'log --grep does not find non-reencoded values (latin1)' '
 for engine in fixed basic extended perl
 do
        prereq=
-       result=success
        if test $engine = "perl"
        then
-               result=failure
                prereq="PCRE"
        else
                prereq=""
@@ -72,7 +70,7 @@ do
        then
            force_regex=.*
        fi
-       test_expect_$result GETTEXT_LOCALE,$prereq "-c grep.patternType=$engine log --grep does not find non-reencoded values (latin1 + locale)" "
+       test_expect_success GETTEXT_LOCALE,$prereq "-c grep.patternType=$engine log --grep does not find non-reencoded values (latin1 + locale)" "
                cat >expect <<-\EOF &&
                latin1
                utf8
@@ -86,7 +84,7 @@ do
                test_must_be_empty actual
        "
 
-       test_expect_$result GETTEXT_LOCALE,$prereq "-c grep.patternType=$engine log --grep does not die on invalid UTF-8 value (latin1 + locale + invalid needle)" "
+       test_expect_success GETTEXT_LOCALE,$prereq "-c grep.patternType=$engine log --grep does not die on invalid UTF-8 value (latin1 + locale + invalid needle)" "
                LC_ALL=\"$is_IS_locale\" git -c grep.patternType=$engine log --encoding=ISO-8859-1 --format=%s --grep=\"$force_regex$invalid_e\" >actual &&
                test_must_be_empty actual
        "