userdiff: support regexec(3) with multi-byte support

author René Scharfe <l.s.r@web.de>

Thu, 6 Apr 2023 20:19:11 +0000 (22:19 +0200)

committer Junio C Hamano <gitster@pobox.com>

Fri, 7 Apr 2023 14:38:09 +0000 (07:38 -0700)
author René Scharfe <l.s.r@web.de>
Thu, 6 Apr 2023 20:19:11 +0000 (22:19 +0200)
committer Junio C Hamano <gitster@pobox.com>
Fri, 7 Apr 2023 14:38:09 +0000 (07:38 -0700)
diff --git a/t/t4034-diff-words.sh b/t/t4034-diff-words.sh

index 15764ee9ac8ab7984e57473a121be595c2e9b0cc..74586f3813c6f44955a5407e0d3382febd9e4e58 100755 (executable)
--- a/t/t4034-diff-words.sh
+++ b/t/t4034-diff-words.sh
@@ -69,6 +69,10 @@ test_language_driver () {
                 echo "* diff='"$lang"'" >.gitattributes &&
                 word_diff --color-words
         '
+       test_expect_success "diff driver '$lang' in Islandic" '
+               LANG=is_IS.UTF-8 LANGUAGE=is LC_ALL="$is_IS_locale" \
+               word_diff --color-words
+       '
  }
  
  test_expect_success setup '
diff --git a/userdiff.c b/userdiff.c

index e25356a06124dae49a486884ccdd2872cb429a29..8bb7b7a4bacee5a66a22c289784cbb0088920854 100644 (file)
--- a/userdiff.c
+++ b/userdiff.c
@@ -15,6 +15,7 @@ static int drivers_alloc;
                 .cflags = REG_EXTENDED, \
         }, \
         .word_regex = wrx "|[^[:space:]]|[\xc0-\xff][\x80-\xbf]+", \
+       .word_regex_multi_byte = wrx "|[^[:space:]]", \
  }
  #define IPATTERN(lang, rx, wrx) { \
         .name = lang, \
@@ -24,6 +25,7 @@ static int drivers_alloc;
                 .cflags = REG_EXTENDED | REG_ICASE, \
         }, \
         .word_regex = wrx "|[^[:space:]]|[\xc0-\xff][\x80-\xbf]+", \
+       .word_regex_multi_byte = wrx "|[^[:space:]]", \
  }
  
  /*
@@ -292,7 +294,7 @@ PATTERNS("scheme",
          /* All other words should be delimited by spaces or parentheses */
          "|([^][)(}{[ \t])+"),
  PATTERNS("tex", "^(\\\\((sub)*section|chapter|part)\\*{0,1}\\{.*)$",
-        "\\\\[a-zA-Z@]+|\\\\.|[a-zA-Z0-9\x80-\xff]+"),
+        "\\\\[a-zA-Z@]+|\\\\.|([a-zA-Z0-9]|[^\x01-\x7f])+"),
  { "default", NULL, -1, { NULL, 0 } },
  };
  #undef PATTERNS
@@ -328,6 +330,25 @@ static int userdiff_find_by_namelen_cb(struct userdiff_driver *driver,
         return 0;
  }
  
+static int regexec_supports_multi_byte_chars(void) /* returns 1 if regexec(3) matches a UTF-8 multi-byte character as a single unit, else 0 */
+{
+       static const char not_space[] = "[^[:space:]]"; /* probe pattern: exactly one non-space character */
+       static const char utf8_multi_byte_char[] = "\xc2\xa3"; /* a two-byte UTF-8 sequence (U+00A3) used as the probe subject */
+       regex_t re;
+       regmatch_t match;
+       static int result = -1; /* cached answer; -1 means the probe has not run yet */
+
+       if (result != -1)
+               return result; /* the probe runs at most once; later calls reuse the cached value */
+       if (regcomp(&re, not_space, REG_EXTENDED))
+               BUG("invalid regular expression: %s", not_space); /* the pattern is a fixed literal; a compile failure is reported via BUG() */
+       result = !regexec(&re, utf8_multi_byte_char, 1, &match, 0) && /* regexec() returns 0 on a match */
+               match.rm_so == 0 &&
+               match.rm_eo == strlen(utf8_multi_byte_char); /* the single-character match must span both bytes */
+       regfree(&re);
+       return result;
+}
+
  static struct userdiff_driver *userdiff_find_by_namelen(const char *name, size_t len)
  {
         struct find_by_namelen_data udcbdata = {
@@ -401,7 +422,13 @@ int userdiff_config(const char *k, const char *v)
  struct userdiff_driver *userdiff_find_by_name(const char *name) /* looks up a driver by its NUL-terminated name and settles which word regex it uses */
  {
         int len = strlen(name);
-       return userdiff_find_by_namelen(name, len);
+       struct userdiff_driver *driver = userdiff_find_by_namelen(name, len);
+       if (driver && driver->word_regex_multi_byte) { /* a multi-byte variant is still pending for this driver */
+               if (regexec_supports_multi_byte_chars())
+                       driver->word_regex = driver->word_regex_multi_byte; /* regexec(3) handles multi-byte chars, so switch to the multi-byte variant */
+               driver->word_regex_multi_byte = NULL; /* cleared either way, so this choice is made only once per driver */
+       }
+       return driver;
  }
  
  struct userdiff_driver *userdiff_find_by_path(struct index_state *istate,
diff --git a/userdiff.h b/userdiff.h

index aee91bc77e6d592021d8597da7fce0123c779a32..b09974f6b22cf11239e8b6a98640b27129a9f82e 100644 (file)
--- a/userdiff.h
+++ b/userdiff.h
@@ -17,6 +17,7 @@ struct userdiff_driver {
         int binary;
         struct userdiff_funcname funcname;
         const char *word_regex;
+       const char *word_regex_multi_byte;
         const char *textconv;
         struct notes_cache *textconv_cache;
         int textconv_want_cache;
author	René Scharfe <l.s.r@web.de>
	Thu, 6 Apr 2023 20:19:11 +0000 (22:19 +0200)
committer	Junio C Hamano <gitster@pobox.com>
	Fri, 7 Apr 2023 14:38:09 +0000 (07:38 -0700)
t/t4034-diff-words.sh		patch | blob | blame | history
userdiff.c		patch | blob | blame | history
userdiff.h		patch | blob | blame | history