patch 9.1.0011: regexp cannot match combining chars in collection

author    Christian Brabandt <cb@256bit.org>
          Thu, 4 Jan 2024 21:54:08 +0000 (22:54 +0100)
committer Christian Brabandt <cb@256bit.org>
          Thu, 4 Jan 2024 21:54:08 +0000 (22:54 +0100)

diff --git a/src/regexp.c b/src/regexp.c

index a64672856c3c4470f9e9e630786c7eac4df94def..c3bc4966c7538f4837af77a4b85483ebf2d66ebd 100644 (file)
--- a/src/regexp.c
+++ b/src/regexp.c
@@ -2686,7 +2686,10 @@ static regengine_T bt_regengine =
      bt_regcomp,
      bt_regfree,
      bt_regexec_nl,
-    bt_regexec_multi,
+    bt_regexec_multi
+#ifdef DEBUG
+    ,(char_u *)""
+#endif
  };
  
  #include "regexp_nfa.c"
@@ -2696,7 +2699,10 @@ static regengine_T nfa_regengine =
      nfa_regcomp,
      nfa_regfree,
      nfa_regexec_nl,
-    nfa_regexec_multi,
+    nfa_regexec_multi
+#ifdef DEBUG
+    ,(char_u *)""
+#endif
  };
  
  // Which regexp engine to use? Needed for vim_regcomp().
diff --git a/src/regexp.h b/src/regexp.h

index d6c8f48c7b93390fe9db076d95fcce649ecaa83c..1ff2e1b6efae6fa1bf1c565014bb65868bb44a7e 100644 (file)
--- a/src/regexp.h
+++ b/src/regexp.h
@@ -178,7 +178,9 @@ struct regengine
      int                (*regexec_nl)(regmatch_T *, char_u *, colnr_T, int);
      // bt_regexec_mult or nfa_regexec_mult
      long       (*regexec_multi)(regmmatch_T *, win_T *, buf_T *, linenr_T, colnr_T, int *);
-    //char_u   *expr;
+#ifdef DEBUG
+    char_u     *expr;
+#endif
  };
  
  // Flags used by vim_regsub() and vim_regsub_both()
diff --git a/src/regexp_bt.c b/src/regexp_bt.c

index 522cf37e2dfb1fe0eeabf362772cbeee8acfe35d..198946e0dcb63b8c0eed10d63a6a5752059e33fe 100644 (file)
--- a/src/regexp_bt.c
+++ b/src/regexp_bt.c
@@ -3743,13 +3743,38 @@ regmatch(
  
           case ANYOF:
           case ANYBUT:
-           if (c == NUL)
-               status = RA_NOMATCH;
-           else if ((cstrchr(OPERAND(scan), c) == NULL) == (op == ANYOF))
-               status = RA_NOMATCH;
-           else
-               ADVANCE_REGINPUT();
-           break;
+           {
+               char_u  *q = OPERAND(scan);
+
+               if (c == NUL)
+                   status = RA_NOMATCH;
+               else if ((cstrchr(q, c) == NULL) == (op == ANYOF))
+                   status = RA_NOMATCH;
+               else
+               {
+                   // Check following combining characters
+                   int len = 0;
+                   int i;
+
+                   if (enc_utf8)
+                       len = utfc_ptr2len(q) - utf_ptr2len(q);
+
+                   MB_CPTR_ADV(rex.input);
+                   MB_CPTR_ADV(q);
+
+                   if (!enc_utf8 || len == 0)
+                       break;
+
+                   for (i = 0; i < len; ++i)
+                       if (q[i] != rex.input[i])
+                       {
+                           status = RA_NOMATCH;
+                           break;
+                       }
+                   rex.input += len;
+               }
+               break;
+           }
  
           case MULTIBYTECODE:
             if (has_mbyte)
diff --git a/src/regexp_nfa.c b/src/regexp_nfa.c

index d724d527b6d23bd6fb74b62d65df236c23b707eb..ff54348905e7482ab6ce56e3c83405d09c709de4 100644 (file)
--- a/src/regexp_nfa.c
+++ b/src/regexp_nfa.c
@@ -1764,6 +1764,7 @@ collection:
             endp = skip_anyof(p);
             if (*endp == ']')
             {
+               int plen;
                 /*
                  * Try to reverse engineer character classes. For example,
                  * recognize that [0-9] stands for \d and [A-Za-z_] for \h,
@@ -2033,13 +2034,43 @@ collection:
                         else
                         {
                             if (got_coll_char == TRUE && startc == 0)
+                           {
                                 EMIT(0x0a);
+                               EMIT(NFA_CONCAT);
+                           }
                             else
+                           {
                                 EMIT(startc);
-                           EMIT(NFA_CONCAT);
+                               if (!(enc_utf8 && (utf_ptr2len(regparse) != (plen = utfc_ptr2len(regparse)))))
+                               {
+                                   EMIT(NFA_CONCAT);
+                               }
+                           }
                         }
                     }
  
+                   if (enc_utf8 && (utf_ptr2len(regparse) != (plen = utfc_ptr2len(regparse))))
+                   {
+                       int i = utf_ptr2len(regparse);
+
+                       c = utf_ptr2char(regparse + i);
+
+                       // Add composing characters
+                       for (;;)
+                       {
+                           if (c == 0)
+                               // \x00 is translated to \x0a, start at \x01.
+                               EMIT(1);
+                           else
+                               EMIT(c);
+                           EMIT(NFA_CONCAT);
+                           if ((i += utf_char2len(c)) >= plen)
+                               break;
+                           c = utf_ptr2char(regparse + i);
+                       }
+                       EMIT(NFA_COMPOSING);
+                       EMIT(NFA_CONCAT);
+                   }
                     MB_PTR_ADV(regparse);
                 } // while (p < endp)
  
@@ -6418,6 +6449,84 @@ nfa_regmatch(
                 result_if_matched = (t->state->c == NFA_START_COLL);
                 for (;;)
                 {
+                   if (state->c == NFA_COMPOSING)
+                   {
+                       int         mc = curc;
+                       int         len = 0;
+                       nfa_state_T *end;
+                       nfa_state_T *sta;
+                       int         cchars[MAX_MCO];
+                       int         ccount = 0;
+                       int         j;
+
+                       sta = t->state->out->out;
+                       len = 0;
+                       if (utf_iscomposing(sta->c))
+                       {
+                           // Only match composing character(s), ignore base
+                           // character.  Used for ".{composing}" and "{composing}"
+                           // (no preceding character).
+                           len += mb_char2len(mc);
+                       }
+                       if (rex.reg_icombine && len == 0)
+                       {
+                           // If \Z was present, then ignore composing characters.
+                           // When ignoring the base character this always matches.
+                           if (sta->c != curc)
+                               result = FAIL;
+                           else
+                               result = OK;
+                           while (sta->c != NFA_END_COMPOSING)
+                               sta = sta->out;
+                       }
+                       // Check base character matches first, unless ignored.
+                       else if (len > 0 || mc == sta->c)
+//                     if (len > 0 || mc == sta->c)
+                       {
+                           if (len == 0)
+                           {
+                               len += mb_char2len(mc);
+                               sta = sta->out;
+                           }
+
+                           // We don't care about the order of composing characters.
+                           // Get them into cchars[] first.
+                           while (len < clen)
+                           {
+                               mc = mb_ptr2char(rex.input + len);
+                               cchars[ccount++] = mc;
+                               len += mb_char2len(mc);
+                               if (ccount == MAX_MCO)
+                                   break;
+                           }
+
+                           // Check that each composing char in the pattern matches a
+                           // composing char in the text.  We do not check if all
+                           // composing chars are matched.
+                           result = OK;
+                           while (sta->c != NFA_END_COMPOSING)
+                           {
+                               for (j = 0; j < ccount; ++j)
+                                   if (cchars[j] == sta->c)
+                                       break;
+                               if (j == ccount)
+                               {
+                                   result = FAIL;
+                                   break;
+                               }
+                               sta = sta->out;
+                           }
+                       }
+                       else
+                           result = FAIL;
+
+                       if (t->state->out->out1->c == NFA_END_COMPOSING)
+                       {
+                           end = t->state->out->out1;
+                           ADD_STATE_IF_MATCH(end);
+                       }
+                       break;
+                   }
                     if (state->c == NFA_END_COLL)
                     {
                         result = !result_if_matched;
diff --git a/src/testdir/test_regexp_utf8.vim b/src/testdir/test_regexp_utf8.vim

index b591aedbb7c02687efebbd6b44265f26a8852e41..6669dee57e4cc7af650ed2d5bb4ba297c6da5d11 100644 (file)
--- a/src/testdir/test_regexp_utf8.vim
+++ b/src/testdir/test_regexp_utf8.vim
@@ -575,5 +575,16 @@ func Test_match_too_complicated()
    set regexpengine=0
  endfunc
  
+func Test_combining_chars_in_collection()
+  new
+  for i in range(0,2)
+    exe "set re=".i
+    put =['ɔ̃', 'ɔ',  '̃  ã', 'abcd']
+    :%s/[ɔ̃]//
+    call assert_equal(['', '', 'ɔ', '̃  ã', 'abcd'], getline(1,'$'))
+    %d
+  endfor
+  bw!
+endfunc
  
  " vim: shiftwidth=2 sts=2 expandtab
diff --git a/src/version.c b/src/version.c

index c31fbf63587a61c15af0d8c6cbb3e1fa9773b84b..d45181d9d76f312a496d185f84b77ffbcdcf6604 100644 (file)
--- a/src/version.c
+++ b/src/version.c
@@ -704,6 +704,8 @@ static char *(features[]) =
  
  static int included_patches[] =
  {   /* Add new patch number below this line */
+/**/
+    11,
  /**/
      10,
  /**/
Files changed by this patch:
  src/regexp.c
  src/regexp.h
  src/regexp_bt.c
  src/regexp_nfa.c
  src/testdir/test_regexp_utf8.vim
  src/version.c