patch 9.1.0296: regexp: engines do not handle case-folding well

author Christian Brabandt <cb@256bit.org>

Tue, 9 Apr 2024 20:53:19 +0000 (22:53 +0200)

committer Christian Brabandt <cb@256bit.org>

Tue, 9 Apr 2024 20:53:19 +0000 (22:53 +0200)
author Christian Brabandt <cb@256bit.org>
Tue, 9 Apr 2024 20:53:19 +0000 (22:53 +0200)
committer Christian Brabandt <cb@256bit.org>
Tue, 9 Apr 2024 20:53:19 +0000 (22:53 +0200)
diff --git a/src/mbyte.c b/src/mbyte.c

index d6fb7ecc761d2800138bd70d5f66cd7d71ce4875..3be75099f10e6269718a3c20830fe978700e3bcd 100644 (file)
--- a/src/mbyte.c
+++ b/src/mbyte.c
@@ -3800,6 +3800,15 @@ utf_strnicmp(
   * Returns zero if s1 and s2 are equal (ignoring case), the difference between
   * two characters otherwise.
   */
+    int
+mb_strnicmp2(char_u *s1, char_u *s2, int n1, int n2)
+{
+    if (n1 == n2 || !enc_utf8)
+       return mb_strnicmp(s1, s2, n1);
+    else
+       return utf_strnicmp(s1, s2, n1, n2);
+}
+
      int
  mb_strnicmp(char_u *s1, char_u *s2, size_t nn)
  {
diff --git a/src/proto/mbyte.pro b/src/proto/mbyte.pro

index 7883b3b4c7331093ea3b3618fed492151eef9794..c49f7e7072a68e3db3c7a8b8d61ee1773879d42e 100644 (file)
--- a/src/proto/mbyte.pro
+++ b/src/proto/mbyte.pro
@@ -48,6 +48,7 @@ int utf_islower(int a);
  int utf_tolower(int a);
  int utf_isupper(int a);
  int mb_strnicmp(char_u *s1, char_u *s2, size_t nn);
+int mb_strnicmp2(char_u *s1, char_u *s2, int n1, int n2);
  void show_utf8(void);
  int latin_head_off(char_u *base, char_u *p);
  int dbcs_screen_head_off(char_u *base, char_u *p);
diff --git a/src/regexp.c b/src/regexp.c

index 4373ae0cfaf41f6e875db7990471e16fded85bc0..4e85ebc29e728059112bab54daf8149023b5cdf9 100644 (file)
--- a/src/regexp.c
+++ b/src/regexp.c
@@ -1606,7 +1606,9 @@ mb_decompose(int c, int *c1, int *c2, int *c3)
  /*
   * Compare two strings, ignore case if rex.reg_ic set.
   * Return 0 if strings match, non-zero otherwise.
- * Correct the length "*n" when composing characters are ignored.
+ * Correct the length "*n" when composing characters are ignored
+ * or for utf8 when both utf codepoints are considered equal because of
+ * case-folding but have different length (e.g. 's' and 'ſ')
   */
      static int
  cstrncmp(char_u *s1, char_u *s2, int *n)
@@ -1615,6 +1617,13 @@ cstrncmp(char_u *s1, char_u *s2, int *n)
  
      if (!rex.reg_ic)
         result = STRNCMP(s1, s2, *n);
+    else if (enc_utf8)
+    {
+       int l2 = mb_ptr2len(s2);
+       result = MB_STRNICMP2(s1, s2, *n, l2);
+       if (result == 0 && l2 < *n)
+           *n = l2;
+    }
      else
         result = MB_STRNICMP(s1, s2, *n);
  
diff --git a/src/regexp_bt.c b/src/regexp_bt.c

index 5d9450d8712b3f284ca6c23e341ce142e07a58a8..2a03fec579c82feb9e0e897e76650a422b522c38 100644 (file)
--- a/src/regexp_bt.c
+++ b/src/regexp_bt.c
@@ -3816,6 +3816,14 @@ regmatch(
                         }
                     }
                 }
+               else if (enc_utf8)
+               {
+                   if (cstrncmp(opnd, rex.input, &len) != 0)
+                   {
+                       status = RA_NOMATCH;
+                       break;
+                   }
+               }
                 else
                     for (i = 0; i < len; ++i)
                         if (opnd[i] != rex.input[i])
diff --git a/src/regexp_nfa.c b/src/regexp_nfa.c

index 5e4fadd028bdcae30bc109a5c6d9f0315e291183..451720a09dd88c192d19a9104bf42c69dea620fc 100644 (file)
--- a/src/regexp_nfa.c
+++ b/src/regexp_nfa.c
@@ -5666,7 +5666,12 @@ find_match_text(colnr_T *startcol, int regstart, char_u *match_text)
      for (;;)
      {
         match = TRUE;
-       len2 = MB_CHAR2LEN(regstart); // skip regstart
+       // skip regstart
+       len2 = MB_CHAR2LEN(regstart);
+       if (enc_utf8 && len2 > 1 && MB_CHAR2LEN(PTR2CHAR(rex.line + col)) != len2)
+           // because of case-folding of the previously matched text, we may need
+           // to skip fewer bytes than mb_char2len(regstart)
+           len2 = mb_char2len(utf_fold(regstart));
         for (len1 = 0; match_text[len1] != NUL; len1 += MB_CHAR2LEN(c1))
         {
             c1 = PTR2CHAR(match_text + len1);
@@ -7503,7 +7508,7 @@ nfa_regexec_both(
  
         // If match_text is set it contains the full text that must match.
         // Nothing else to try. Doesn't handle combining chars well.
-       if (prog->match_text != NULL && !rex.reg_icombine)
+       if (prog->match_text != NULL && *prog->match_text != NUL && !rex.reg_icombine)
         {
             retval = find_match_text(&col, prog->regstart, prog->match_text);
             if (REG_MULTI)
diff --git a/src/testdir/test_regexp_utf8.vim b/src/testdir/test_regexp_utf8.vim

index 6669dee57e4cc7af650ed2d5bb4ba297c6da5d11..9980e5b7f5505e47142b5825522f6f882bdb4305 100644 (file)
--- a/src/testdir/test_regexp_utf8.vim
+++ b/src/testdir/test_regexp_utf8.vim
@@ -587,4 +587,32 @@ func Test_combining_chars_in_collection()
    bw!
  endfunc
  
+func Test_search_multibyte_match_ascii()
+  new
+  " Match single 'ſ' and 's'
+  call setline(1,  'das abc heraus abc ſich abc ſind')
+  for i in range(0, 2)
+    exe "set re="..i
+    let ic_match = matchbufline('%', '\c\%u17f', 1, '$')->mapnew({idx, val -> val.text})
+    let noic_match = matchbufline('%', '\C\%u17f', 1, '$')->mapnew({idx, val -> val.text})
+    call assert_equal(['s', 's', 'ſ','ſ'], ic_match, "Ignorecase Regex-engine: " .. &re)
+    call assert_equal(['ſ','ſ'], noic_match, "No-Ignorecase Regex-engine: " .. &re)
+  endfor
+  " Match several 'ſſ' and 'ss'
+  call setline(1,  'das abc herauss abc ſſich abc ſind')
+  for i in range(0, 2)
+    exe "set re="..i
+    let ic_match = matchbufline('%', '\c\%u17f\%u17f', 1, '$')->mapnew({idx, val -> val.text})
+    let noic_match = matchbufline('%', '\C\%u17f\%u17f', 1, '$')->mapnew({idx, val -> val.text})
+    let ic_match2 = matchbufline('%', '\c\%u17f\+', 1, '$')->mapnew({idx, val -> val.text})
+    let noic_match2 = matchbufline('%', '\C\%u17f\+', 1, '$')->mapnew({idx, val -> val.text})
+
+    call assert_equal(['ss', 'ſſ'], ic_match, "Ignorecase Regex-engine: " .. &re)
+    call assert_equal(['ſſ'], noic_match, "No-Ignorecase Regex-engine: " .. &re)
+    call assert_equal(['s', 'ss', 'ſſ', 'ſ'], ic_match2, "Ignorecase Regex-engine: " .. &re)
+    call assert_equal(['ſſ','ſ'], noic_match2, "No-Ignorecase Regex-engine: " .. &re)
+  endfor
+  bw!
+endfunc
+
  " vim: shiftwidth=2 sts=2 expandtab
diff --git a/src/version.c b/src/version.c

index 2c6e7d02fdcbaaf4f3cc21200578217c7f961d5a..c63b141aa9399140318a292a91c4fcab639c8a46 100644 (file)
--- a/src/version.c
+++ b/src/version.c
@@ -704,6 +704,8 @@ static char *(features[]) =
  
  static int included_patches[] =
  {   /* Add new patch number below this line */
+/**/
+    296,
  /**/
      295,
  /**/
diff --git a/src/vim.h b/src/vim.h

index 4507674fcde6efb73c2ab9455f0f4ab969120d3c..33d592038d1f2dead21e883bf142566e69ca4661 100644 (file)
--- a/src/vim.h
+++ b/src/vim.h
@@ -1751,6 +1751,7 @@ void *vim_memset(void *, int, size_t);
  
  # define MB_STRICMP(d, s)      mb_strnicmp((char_u *)(d), (char_u *)(s), (int)MAXCOL)
  # define MB_STRNICMP(d, s, n)  mb_strnicmp((char_u *)(d), (char_u *)(s), (int)(n))
+# define MB_STRNICMP2(d, s, n1, n2)    mb_strnicmp2((char_u *)(d), (char_u *)(s), (int)(n1), (int)(n2))
  
  #define STRCAT(d, s)       strcat((char *)(d), (char *)(s))
  #define STRNCAT(d, s, n)    strncat((char *)(d), (char *)(s), (size_t)(n))
author	Christian Brabandt <cb@256bit.org>
	Tue, 9 Apr 2024 20:53:19 +0000 (22:53 +0200)
committer	Christian Brabandt <cb@256bit.org>
	Tue, 9 Apr 2024 20:53:19 +0000 (22:53 +0200)
src/mbyte.c		patch | blob | blame | history
src/proto/mbyte.pro		patch | blob | blame | history
src/regexp.c		patch | blob | blame | history
src/regexp_bt.c		patch | blob | blame | history
src/regexp_nfa.c		patch | blob | blame | history
src/testdir/test_regexp_utf8.vim		patch | blob | blame | history
src/version.c		patch | blob | blame | history
src/vim.h		patch | blob | blame | history