git.ipfire.org Git - thirdparty/git.git/commitdiff
grep/pcre2: fix an edge case concerning ascii patterns and UTF-8 data
author Hamza Mahfooz <someguy@effective-light.com>
Fri, 15 Oct 2021 16:13:56 +0000 (12:13 -0400)
committer Junio C Hamano <gitster@pobox.com>
Fri, 15 Oct 2021 19:45:39 +0000 (12:45 -0700)
If we attempt to grep non-ASCII log message text with an ASCII pattern, we
run into the following issue:

    $ git log --color --author='.var.*Bjar' -1 origin/master | grep ^Author
    grep: (standard input): binary file matches

So, to fix this, teach the grep code to use PCRE2_UTF, as long as the log
output is encoded in UTF-8.

Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Hamza Mahfooz <someguy@effective-light.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
grep.c
t/t7812-grep-icase-non-ascii.sh

diff --git a/grep.c b/grep.c
index fe847a0111a209279656c5a14318d2fa196df2ed..f6e113e9f0fd52bc40c08613919eade24020d8da 100644 (file)
--- a/grep.c
+++ b/grep.c
@@ -382,8 +382,10 @@ static void compile_pcre2_pattern(struct grep_pat *p, const struct grep_opt *opt
                }
                options |= PCRE2_CASELESS;
        }
-       if (!opt->ignore_locale && is_utf8_locale() && has_non_ascii(p->pattern) &&
-           !(!opt->ignore_case && (p->fixed || p->is_fixed)))
+       if ((!opt->ignore_locale && !has_non_ascii(p->pattern)) ||
+           (!opt->ignore_locale && is_utf8_locale() &&
+            has_non_ascii(p->pattern) && !(!opt->ignore_case &&
+                                           (p->fixed || p->is_fixed))))
                options |= (PCRE2_UTF | PCRE2_MATCH_INVALID_UTF);
 
 #ifdef GIT_PCRE2_VERSION_10_36_OR_HIGHER
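diff --git a/t/t7812-grep-icase-non-ascii.sh b/t/t7812-grep-icase-non-ascii.sh
index e5d1e4ea6862694b0392415807d37ef4d3efc71b..22487d90fdc68e30b24144ac0899d60df0ea3fb0 100755 (executable)
--- a/t/t7812-grep-icase-non-ascii.sh
+++ b/t/t7812-grep-icase-non-ascii.sh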
@@ -53,6 +53,54 @@ test_expect_success REGEX_LOCALE 'pickaxe -i on non-ascii' '
        test_cmp expected actual
 '
 
+test_expect_success GETTEXT_LOCALE,PCRE 'log --author with an ascii pattern on UTF-8 data' '
+       cat >expected <<-\EOF &&
+       Author: <BOLD;RED>À Ú Thor<RESET> <author@example.com>
+       EOF
+       test_write_lines "forth" >file4 &&
+       git add file4 &&
+       git commit --author="À Ú Thor <author@example.com>" -m sécond &&
+       git log -1 --color=always --perl-regexp --author=".*Thor" >log &&
+       grep Author log >actual.raw &&
+       test_decode_color <actual.raw >actual &&
+       test_cmp expected actual
+'
+
+test_expect_success GETTEXT_LOCALE,PCRE 'log --committer with an ascii pattern on ISO-8859-1 data' '
+       cat >expected <<-\EOF &&
+       Commit:     Ç<BOLD;RED> O Mîtter <committer@example.com><RESET>
+       EOF
+       test_write_lines "fifth" >file5 &&
+       git add file5 &&
+       GIT_COMMITTER_NAME="Ç O Mîtter" &&
+       GIT_COMMITTER_EMAIL="committer@example.com" &&
+       git -c i18n.commitEncoding=latin1 commit -m thïrd &&
+       git -c i18n.logOutputEncoding=latin1 log -1 --pretty=fuller --color=always --perl-regexp --committer=" O.*" >log &&
+       grep Commit: log >actual.raw &&
+       test_decode_color <actual.raw >actual &&
+       test_cmp expected actual
+'
+
+test_expect_success GETTEXT_LOCALE,PCRE 'log --grep with an ascii pattern on UTF-8 data' '
+       cat >expected <<-\EOF &&
+           sé<BOLD;RED>con<RESET>d
+       EOF
+       git log -1 --color=always --perl-regexp --grep="con" >log &&
+       grep con log >actual.raw &&
+       test_decode_color <actual.raw >actual &&
+       test_cmp expected actual
+'
+
+test_expect_success GETTEXT_LOCALE,PCRE 'log --grep with an ascii pattern on ISO-8859-1 data' '
+       cat >expected <<-\EOF &&
+           <BOLD;RED>thïrd<RESET>
+       EOF
+       git -c i18n.logOutputEncoding=latin1 log -1 --color=always --perl-regexp --grep="th.*rd" >log &&
+       grep "th.*rd" log >actual.raw &&
+       test_decode_color <actual.raw >actual &&
+       test_cmp expected actual
+'
+
 test_expect_success GETTEXT_LOCALE,LIBPCRE2 'PCRE v2: setup invalid UTF-8 data' '
        printf "\\200\\n" >invalid-0x80 &&
        echo "ævar" >expected &&