patch 9.1.0645: regex: wrong match when searching multi-byte char case-insensitive

author Christian Brabandt <cb@256bit.org>

Tue, 30 Jul 2024 18:39:18 +0000 (20:39 +0200)

committer Christian Brabandt <cb@256bit.org>

Tue, 30 Jul 2024 18:39:18 +0000 (20:39 +0200)
author Christian Brabandt <cb@256bit.org>
Tue, 30 Jul 2024 18:39:18 +0000 (20:39 +0200)
committer Christian Brabandt <cb@256bit.org>
Tue, 30 Jul 2024 18:39:18 +0000 (20:39 +0200)
diff --git a/runtime/doc/version9.txt b/runtime/doc/version9.txt

index e8e9194dc540f726def29c4195cf13a226a63168..73e54a1cf3551eb5915e496823376ea1b68de3b9 100644 (file)
--- a/runtime/doc/version9.txt
+++ b/runtime/doc/version9.txt
@@ -1,4 +1,4 @@
-*version9.txt*  For Vim version 9.1.  Last change: 2024 Jul 28
+*version9.txt*  For Vim version 9.1.  Last change: 2024 Jul 30
  
  
                   VIM REFERENCE MANUAL    by Bram Moolenaar
@@ -41590,6 +41590,8 @@ Changed~
    behaviour/inconsistency (see |d-special| and |cw|).
  - allow to specify additional attributes in the completion menu (allows to
    mark deprecated attributes from LSP server) |complete-items|
+- the regex engines now correctly match multi-byte characters when ignoring
+  case (and apply proper case folding)
  
                                                         *added-9.2*
  Added ~
diff --git a/src/mbyte.c b/src/mbyte.c

index a68ba7be3db83ff992c9b426389143722c6d24a7..d8c47acdd161c7cedbe38c3ea087b1d23fc34e7b 100644 (file)
--- a/src/mbyte.c
+++ b/src/mbyte.c
@@ -3800,6 +3800,15 @@ utf_strnicmp(
   * Returns zero if s1 and s2 are equal (ignoring case), the difference between
   * two characters otherwise.
   */
+    int
+mb_strnicmp2(char_u *s1, char_u *s2, size_t n1, size_t n2)
+{   // Compare s1 (length n1) with s2 (length n2); which helper runs depends on enc_utf8 and on n1 == n2.
+    if (n1 == n2 || !enc_utf8)
+       return mb_strnicmp(s1, s2, n1);  // same length, or not UTF-8: one length is enough (n2 unused)
+    else
+       return utf_strnicmp(s1, s2, n1, n2);  // UTF-8 and lengths differ: hand over both lengths
+}
+
      int
  mb_strnicmp(char_u *s1, char_u *s2, size_t nn)
  {
diff --git a/src/proto/mbyte.pro b/src/proto/mbyte.pro

index c57c94c8aedb432dd1c830d7113408df3df7a292..bb976e3bf812b65835689ee5f2c07bb7cc389c1a 100644 (file)
--- a/src/proto/mbyte.pro
+++ b/src/proto/mbyte.pro
@@ -48,6 +48,7 @@ int utf_islower(int a);
  int utf_tolower(int a);
  int utf_isupper(int a);
  int mb_strnicmp(char_u *s1, char_u *s2, size_t nn);
+int mb_strnicmp2(char_u *s1, char_u *s2, size_t n1, size_t n2);
  void show_utf8(void);
  int latin_head_off(char_u *base, char_u *p);
  int dbcs_screen_head_off(char_u *base, char_u *p);
diff --git a/src/regexp.c b/src/regexp.c

index ff201d9ffee18aeef10b8746d4d0c0331f7490e7..a1b080e7d9d0487fb2fd10cfd58570f5a3b312b9 100644 (file)
--- a/src/regexp.c
+++ b/src/regexp.c
@@ -1729,7 +1729,9 @@ mb_decompose(int c, int *c1, int *c2, int *c3)
  /*
   * Compare two strings, ignore case if rex.reg_ic set.
   * Return 0 if strings match, non-zero otherwise.
- * Correct the length "*n" when composing characters are ignored.
+ * Correct the length "*n" when composing characters are ignored, or, for
+ * UTF-8, when two code points are considered equal by case folding but
+ * have different byte lengths (e.g. 's' and 'ſ').
   */
      static int
  cstrncmp(char_u *s1, char_u *s2, int *n)
@@ -1738,6 +1740,29 @@ cstrncmp(char_u *s1, char_u *s2, int *n)
  
      if (!rex.reg_ic)
         result = STRNCMP(s1, s2, *n);
+    else if (enc_utf8)
+    {
+       char_u *p = s1;
+       size_t n2 = 0;
+       int n1 = *n;
+       // count the number of characters for byte-length of s1
+       while (n1 > 0 && *p != NUL)
+       {
+           n1 -= mb_ptr2len(s1);
+           MB_PTR_ADV(p);
+           n2++;
+       }
+       // count the number of bytes to advance the same number of chars for s2
+       p = s2;
+       while (n2-- > 0 && *p != NUL)
+           MB_PTR_ADV(p);
+
+       n2 = p - s2;
+
+       result = MB_STRNICMP2(s1, s2, *n, n2);
+       if (result == 0 && (int)n2 < *n)
+           *n = n2;
+    }
      else
         result = MB_STRNICMP(s1, s2, *n);
  
@@ -1787,7 +1812,7 @@ cstrncmp(char_u *s1, char_u *s2, int *n)
  cstrchr(char_u *s, int c)
  {
      char_u     *p;
-    int                cc;
+    int                cc, lc;
  
      if (!rex.reg_ic || (!enc_utf8 && mb_char2len(c) > 1))
         return vim_strchr(s, c);
@@ -1796,26 +1821,35 @@ cstrchr(char_u *s, int c)
      // faster (esp. when using MS Visual C++!).
      // For UTF-8 need to use folded case.
      if (enc_utf8 && c > 0x80)
+    {
         cc = utf_fold(c);
+       lc = cc;
+    }
      else
-        if (MB_ISUPPER(c))
-       cc = MB_TOLOWER(c);
-    else if (MB_ISLOWER(c))
-       cc = MB_TOUPPER(c);
-    else
-       return vim_strchr(s, c);
+       if (MB_ISUPPER(c))
+       {
+           cc = MB_TOLOWER(c);
+           lc = cc;
+       }
+       else if (MB_ISLOWER(c))
+       {
+           cc = MB_TOUPPER(c);
+           lc = c;
+       }
+       else
+           return vim_strchr(s, c);
  
      if (has_mbyte)
      {
         for (p = s; *p != NUL; p += (*mb_ptr2len)(p))
         {
-           if (enc_utf8 && c > 0x80)
+           int uc = utf_ptr2char(p);
+           if (enc_utf8 && (c > 0x80 || uc > 0x80))
             {
-               int uc = utf_ptr2char(p);
-
                 // Do not match an illegal byte.  E.g. 0xff matches 0xc3 0xbf,
                 // not 0xff.
-               if ((uc < 0x80 || uc != *p) && utf_fold(uc) == cc)
+               // compare with lower case of the character
+               if ((uc < 0x80 || uc != *p) && utf_fold(uc) == lc)
                     return p;
             }
             else if (*p == c || *p == cc)
diff --git a/src/regexp_bt.c b/src/regexp_bt.c

index 5452dda0f6ef5e937f5852380be8c5b71290b162..16dac730de977ed4721ea5b6333a6b0e46419cd8 100644 (file)
--- a/src/regexp_bt.c
+++ b/src/regexp_bt.c
@@ -3823,6 +3823,14 @@ regmatch(
                         }
                     }
                 }
+               else if (enc_utf8)
+               {
+                   if (cstrncmp(opnd, rex.input, &len) != 0)
+                   {
+                       status = RA_NOMATCH;
+                       break;
+                   }
+               }
                 else
                     for (i = 0; i < len; ++i)
                         if (opnd[i] != rex.input[i])
diff --git a/src/regexp_nfa.c b/src/regexp_nfa.c

index 4f07a21d5dd8c16c98e048ca57a3cc73ec5b5e6d..6db4134628710fa2a45e4fe80377cdabda575b8b 100644 (file)
--- a/src/regexp_nfa.c
+++ b/src/regexp_nfa.c
@@ -5666,7 +5666,12 @@ find_match_text(colnr_T *startcol, int regstart, char_u *match_text)
      for (;;)
      {
         match = TRUE;
-       len2 = MB_CHAR2LEN(regstart); // skip regstart
+       // skip regstart
+       len2 = MB_CHAR2LEN(regstart);
+       if (enc_utf8 && len2 > 1 && MB_CHAR2LEN(PTR2CHAR(rex.line + col)) != len2)
+           // because of case-folding of the previously matched text, we may need
+           // to skip fewer bytes than mb_char2len(regstart)
+           len2 = mb_char2len(utf_fold(regstart));
         for (len1 = 0; match_text[len1] != NUL; len1 += MB_CHAR2LEN(c1))
         {
             c1 = PTR2CHAR(match_text + len1);
@@ -7502,7 +7507,7 @@ nfa_regexec_both(
  
         // If match_text is set it contains the full text that must match.
         // Nothing else to try. Doesn't handle combining chars well.
-       if (prog->match_text != NULL && !rex.reg_icombine)
+       if (prog->match_text != NULL && *prog->match_text != NUL && !rex.reg_icombine)
         {
             retval = find_match_text(&col, prog->regstart, prog->match_text);
             if (REG_MULTI)
diff --git a/src/testdir/test_regexp_utf8.vim b/src/testdir/test_regexp_utf8.vim

index bc705441e7a1a49eb034e7f14262ddacee144048..51c0984adbf323d49e49fbaeb4a8897bbf86a0c8 100644 (file)
--- a/src/testdir/test_regexp_utf8.vim
+++ b/src/testdir/test_regexp_utf8.vim
@@ -587,4 +587,36 @@ func Test_combining_chars_in_collection()
    bw!
  endfunc
  
+func Test_search_multibyte_match_ascii()
+  new
+  " With \c a single 'ſ' (U+017F) must match both 'ſ' and 's'; with \C only 'ſ'
+  call setline(1,  'das abc heraus abc ſich abc ſind')
+  for i in range(0, 2)
+    exe "set re="..i
+    let ic_match = matchbufline('%', '\c\%u17f', 1, '$')->mapnew({idx, val -> val.text})
+    let noic_match = matchbufline('%', '\C\%u17f', 1, '$')->mapnew({idx, val -> val.text})
+    call assert_equal(['s', 's', 'ſ','ſ'], ic_match, "Ignorecase Regex-engine: " .. &re)
+    call assert_equal(['ſ','ſ'], noic_match, "No-Ignorecase Regex-engine: " .. &re)
+  endfor
+  " Same for sequences: 'ſſ', 'ſ\+' and '[ſ]\+' must also match runs of 's' with \c
+  call setline(1,  'das abc herauss abc ſſich abc ſind')
+  for i in range(0, 2)
+    exe "set re="..i
+    let ic_match = matchbufline('%', '\c\%u17f\%u17f', 1, '$')->mapnew({idx, val -> val.text})
+    let noic_match = matchbufline('%', '\C\%u17f\%u17f', 1, '$')->mapnew({idx, val -> val.text})
+    let ic_match2 = matchbufline('%', '\c\%u17f\+', 1, '$')->mapnew({idx, val -> val.text})
+    let noic_match2 = matchbufline('%', '\C\%u17f\+', 1, '$')->mapnew({idx, val -> val.text})
+    let ic_match3 = matchbufline('%', '\c[\u17f]\+', 1, '$')->mapnew({idx, val -> val.text})
+    let noic_match3 = matchbufline('%', '\C[\u17f]\+', 1, '$')->mapnew({idx, val -> val.text})
+
+    call assert_equal(['ss', 'ſſ'], ic_match, "Ignorecase Regex-engine: " .. &re)
+    call assert_equal(['ſſ'], noic_match, "No-Ignorecase Regex-engine: " .. &re)
+    call assert_equal(['s', 'ss', 'ſſ', 'ſ'], ic_match2, "Ignorecase Regex-engine: " .. &re)
+    call assert_equal(['ſſ','ſ'], noic_match2, "No-Ignorecase Regex-engine: " .. &re)
+    call assert_equal(['s', 'ss', 'ſſ', 'ſ'], ic_match3, "Ignorecase Collection Regex-engine: " .. &re)
+    call assert_equal(['ſſ','ſ'], noic_match3, "No-Ignorecase Collection Regex-engine: " .. &re)
+  endfor
+  bw!
+endfunc
+
  " vim: shiftwidth=2 sts=2 expandtab
diff --git a/src/version.c b/src/version.c

index a2662861e362d59e08d88993f54e1d13208cfa48..f19e9415e9b74096b8f2a434e852ee2c6c3277a2 100644 (file)
--- a/src/version.c
+++ b/src/version.c
@@ -704,6 +704,8 @@ static char *(features[]) =
  
  static int included_patches[] =
  {   /* Add new patch number below this line */
+/**/
+    645,
  /**/
      644,
  /**/
diff --git a/src/vim.h b/src/vim.h

index c022f2e7f4025a6c35687e5397feacb135c8d6c9..9c1434cc63cbfdb85ca3f2f9ae86e7dac15d866f 100644 (file)
--- a/src/vim.h
+++ b/src/vim.h
@@ -1769,6 +1769,7 @@ void *vim_memset(void *, int, size_t);
  
  # define MB_STRICMP(d, s)      mb_strnicmp((char_u *)(d), (char_u *)(s), (int)MAXCOL)
  # define MB_STRNICMP(d, s, n)  mb_strnicmp((char_u *)(d), (char_u *)(s), (int)(n))
+# define MB_STRNICMP2(d, s, n1, n2)    mb_strnicmp2((char_u *)(d), (char_u *)(s), (n1), (n2))
  
  #define STRCAT(d, s)       strcat((char *)(d), (char *)(s))
  #define STRNCAT(d, s, n)    strncat((char *)(d), (char *)(s), (size_t)(n))
author	Christian Brabandt <cb@256bit.org>
	Tue, 30 Jul 2024 18:39:18 +0000 (20:39 +0200)
committer	Christian Brabandt <cb@256bit.org>
	Tue, 30 Jul 2024 18:39:18 +0000 (20:39 +0200)
runtime/doc/version9.txt		patch \| blob \| blame \| history
src/mbyte.c		patch \| blob \| blame \| history
src/proto/mbyte.pro		patch \| blob \| blame \| history
src/regexp.c		patch \| blob \| blame \| history
src/regexp_bt.c		patch \| blob \| blame \| history
src/regexp_nfa.c		patch \| blob \| blame \| history
src/testdir/test_regexp_utf8.vim		patch \| blob \| blame \| history
src/version.c		patch \| blob \| blame \| history
src/vim.h		patch \| blob \| blame \| history